1 |
99 |
ahitrov |
package Contenido::Parser; |
2 |
|
|
|
3 |
|
|
use strict; |
4 |
|
|
use warnings; |
5 |
|
|
use locale; |
6 |
|
|
|
7 |
|
|
use Encode; |
8 |
|
|
use URI; |
9 |
|
|
use Data::Dumper; |
10 |
|
|
use Contenido::Globals; |
11 |
|
|
use LWP::UserAgent; |
12 |
|
|
use Contenido::File::Scheme::FILE; |
13 |
|
|
use Contenido::Parser::Util; |
14 |
|
|
|
15 |
112 |
ahitrov |
sub new { |
16 |
|
|
my ($proto) = @_; |
17 |
|
|
my $class = ref($proto) || $proto; |
18 |
|
|
my $self = {}; |
19 |
|
|
bless $self, $class; |
20 |
|
|
|
21 |
|
|
return $self; |
22 |
|
|
} |
23 |
|
|
|
24 |
|
|
|
25 |
99 |
ahitrov |
sub fetch { |
26 |
|
|
my ($self, $input, %opts) = @_; |
27 |
|
|
|
28 |
|
|
my ($fh, $content); |
29 |
136 |
ahitrov |
my $timeout = delete $opts{timeout} || 10; |
30 |
99 |
ahitrov |
my $encoding = delete $opts{encoding}; |
31 |
211 |
ahitrov |
my $user_agent = delete $opts{user_agent}; |
32 |
99 |
ahitrov |
if (not ref $input) { |
33 |
|
|
no strict "refs"; |
34 |
|
|
my $scheme = uc(scheme($input)); |
35 |
|
|
if ( $scheme eq 'FILE' ) { |
36 |
|
|
$fh = &{"Contenido::File::Scheme::".uc(scheme($input))."::get_fh"}($input); |
37 |
|
|
} else { |
38 |
|
|
my $request = new HTTP::Request GET => $input; |
39 |
211 |
ahitrov |
warn "REQUEST: ".Dumper( $request ) if $DEBUG; |
40 |
99 |
ahitrov |
my $ua = new LWP::UserAgent; |
41 |
136 |
ahitrov |
$ua->timeout($timeout); |
42 |
211 |
ahitrov |
$ua->agent($user_agent || 'Mozilla/5.0 Firefox/11.0'); |
43 |
99 |
ahitrov |
my $res = $ua->request($request); |
44 |
|
|
if ($res->is_success) { |
45 |
|
|
$self->{headers} = $res->headers; |
46 |
|
|
my $content_length = $res->headers->header('content-length'); |
47 |
|
|
my $content_type = $res->headers->header('content-type'); |
48 |
142 |
ahitrov |
my $headers_string = $res->headers->as_string; |
49 |
|
|
# warn $res->content_type_charset."\n\n"; |
50 |
|
|
# warn Dumper($res->headers) if $DEBUG; |
51 |
99 |
ahitrov |
$self->{content_type} = $content_type; |
52 |
142 |
ahitrov |
if ( $res->content_type_charset ) { |
53 |
|
|
$encoding = Encode::find_encoding($res->content_type_charset)->name; |
54 |
99 |
ahitrov |
} |
55 |
|
|
my $base_url = $input =~ /^([a-z]+:\/\/[a-z\.\d]+)/ ? $1 : ''; |
56 |
|
|
$self->{base_url} = $base_url if $base_url; |
57 |
142 |
ahitrov |
$content = $res->decoded_content( charset => 'none' ); |
58 |
|
|
# warn "Charset: ".$res->content_charset."\n"; |
59 |
99 |
ahitrov |
} else { |
60 |
146 |
ahitrov |
warn $res->status_line." \n" if $DEBUG; |
61 |
99 |
ahitrov |
$self->{success} = 0; |
62 |
|
|
$self->{reason} = $res->status_line; |
63 |
|
|
return $self; |
64 |
|
|
} |
65 |
|
|
} |
66 |
113 |
ahitrov |
} elsif ( ref $input eq 'Apache::Upload' ) { |
67 |
|
|
$fh = $input->fh; |
68 |
|
|
} elsif ((ref $input eq "GLOB") or (ref $input eq 'IO::File')) { |
69 |
99 |
ahitrov |
$fh = $input; |
70 |
|
|
} elsif (ref $input eq "SCALAR") { |
71 |
|
|
$fh = IO::Scalar->new($input); |
72 |
|
|
} else { |
73 |
|
|
warn("Path, scalar ref or fh needed"); |
74 |
|
|
$self->{success} = 0; |
75 |
|
|
$self->{reason} = 'Path, scalar ref or fh needed'; |
76 |
|
|
return $self; |
77 |
|
|
} |
78 |
|
|
|
79 |
|
|
if ( ref $fh ) { |
80 |
|
|
$content = <$fh>; |
81 |
|
|
} |
82 |
|
|
if ( $content ) { |
83 |
162 |
ahitrov |
warn "starting content decoding...\n" if $DEBUG; |
84 |
142 |
ahitrov |
if ( exists $self->{headers} && ref $self->{headers} && ($self->{headers}->content_is_html || $self->{headers}->content_is_xhtml || $self->{headers}->content_is_xml) ) { |
85 |
|
|
unless ( $encoding ) { |
86 |
|
|
$encoding = $self->__try_content_encoding( substr($content, 0, 350) ); |
87 |
|
|
} |
88 |
|
|
if ( $encoding && $encoding ne 'utf-8' && $encoding ne 'utf-8-strict' ) { |
89 |
|
|
warn "Encoding from $encoding\n..." if $DEBUG; |
90 |
|
|
Encode::from_to($content, $encoding, 'utf-8'); |
91 |
|
|
if ( exists $self->{headers} ) { |
92 |
|
|
foreach my $header ( keys %{$self->{headers}} ) { |
93 |
|
|
if ( ref $self->{headers}{$header} eq 'ARRAY' ) { |
94 |
|
|
foreach my $val ( @{$self->{headers}{$header}} ) { |
95 |
|
|
Encode::from_to($val, $encoding, 'utf-8'); |
96 |
|
|
} |
97 |
|
|
} else { |
98 |
|
|
Encode::from_to($self->{headers}{$header}, $encoding, 'utf-8'); |
99 |
112 |
ahitrov |
} |
100 |
|
|
} |
101 |
|
|
} |
102 |
142 |
ahitrov |
} else { |
103 |
|
|
# Encode::_utf8_off($content); |
104 |
|
|
if ( exists $self->{headers} ) { |
105 |
|
|
foreach my $header ( keys %{$self->{headers}} ) { |
106 |
|
|
if ( ref $self->{headers}{$header} eq 'ARRAY' ) { |
107 |
|
|
foreach my $val ( @{$self->{headers}{$header}} ) { |
108 |
|
|
Encode::_utf8_off($val); |
109 |
|
|
} |
110 |
|
|
} else { |
111 |
162 |
ahitrov |
warn "Test: ".$self->{headers}{$header}.": check flag: ".Encode::is_utf8($self->{headers}{$header}).". check: ".Encode::is_utf8($self->{headers}{$header},1)."\n" if $DEBUG; |
112 |
142 |
ahitrov |
if ( Encode::is_utf8($self->{headers}{$header}) && Encode::is_utf8($self->{headers}{$header},1) ) { |
113 |
|
|
Encode::_utf8_off($self->{headers}{$header}); |
114 |
|
|
# Encode::_utf8_on($self->{headers}{$header}); |
115 |
|
|
# $self->{headers}{$header} = Encode::encode('utf8', $self->{headers}{$header}, Encode::FB_QUIET); |
116 |
|
|
# Encode::from_to($self->{headers}{$header}, $encoding, 'utf8'); |
117 |
|
|
} |
118 |
|
|
} |
119 |
|
|
} |
120 |
|
|
} |
121 |
99 |
ahitrov |
} |
122 |
142 |
ahitrov |
$self->{encoding} = $encoding; |
123 |
|
|
warn Dumper($self) if $DEBUG; |
124 |
|
|
if ( $self->{headers}->content_is_html ) { |
125 |
|
|
my $headers; |
126 |
|
|
if ( $content =~ /<head.*?>(.*?)<\/head>/si ) { |
127 |
|
|
$headers = $self->__parse_html_header( $1 ); |
128 |
|
|
} |
129 |
|
|
if ( ref $headers eq 'ARRAY' && @$headers ) { |
130 |
|
|
foreach my $header ( @$headers ) { |
131 |
|
|
if ( $header->{tag} eq 'title' ) { |
132 |
|
|
$self->{headers}{title} = $header->{content}; |
133 |
|
|
} elsif ( $header->{tag} eq 'meta' && (($header->{rel} && $header->{rel} =~ /icon/i) || ($header->{href} && $header->{href} =~ /\.ico$/)) ) { |
134 |
|
|
$self->{favicon} = $header->{href}; |
135 |
|
|
} |
136 |
|
|
} |
137 |
|
|
$self->{html_headers} = $headers; |
138 |
|
|
} |
139 |
|
|
} |
140 |
99 |
ahitrov |
} |
141 |
|
|
$self->{content} = $content; |
142 |
|
|
$self->{success} = 1; |
143 |
|
|
} else { |
144 |
|
|
$self->{success} = 0; |
145 |
|
|
$self->{reason} = 'Content is empty'; |
146 |
|
|
} |
147 |
|
|
return $self; |
148 |
|
|
} |
149 |
|
|
|
150 |
|
|
sub is_success { |
151 |
|
|
my ($self, $val) = @_; |
152 |
|
|
|
153 |
|
|
if ( defined $val ) { |
154 |
|
|
$self->{success} = $val; |
155 |
|
|
return $self; |
156 |
|
|
} else { |
157 |
|
|
return $self->{success}; |
158 |
|
|
} |
159 |
|
|
} |
160 |
|
|
|
161 |
|
|
sub __try_content_encoding { |
162 |
|
|
my ($self, $input)= @_; |
163 |
|
|
if ( $input =~ /encoding[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) { |
164 |
|
|
return lc($1); |
165 |
112 |
ahitrov |
} elsif ( $input =~ /charset[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) { |
166 |
|
|
return lc($1); |
167 |
99 |
ahitrov |
} elsif ( $input =~ /(utf-8|windows-1251|koi8-r)/i ) { |
168 |
|
|
return lc($1); |
169 |
|
|
} else { |
170 |
|
|
return undef; |
171 |
|
|
} |
172 |
|
|
} |
173 |
|
|
|
174 |
142 |
ahitrov |
sub __parse_html_header { |
175 |
|
|
my ($self, $input)= @_; |
176 |
|
|
my @tags; |
177 |
|
|
$input =~ s/[\r\n\t]+/\ /sgi; |
178 |
804 |
ahitrov |
if ( $input =~ /<title.*?>(.*?)<\/title.*?>/si ) { |
179 |
142 |
ahitrov |
my $title = $1; |
180 |
|
|
for ( $title ) { |
181 |
|
|
s/^\s+//; |
182 |
|
|
s/\s+$//; |
183 |
|
|
} |
184 |
|
|
push @tags, { tag => 'title', content => $title }; |
185 |
|
|
} |
186 |
|
|
while ( $input =~ /<(.*?)\/?>/sgi ) { |
187 |
|
|
my $tag = $1; |
188 |
|
|
my $struct = {}; |
189 |
|
|
for ( $tag ) { |
190 |
|
|
s/\ *=\ */=/g; |
191 |
|
|
$_ = __encode_quotes($_); |
192 |
|
|
} |
193 |
|
|
my @tag = split /\ +/, $tag; |
194 |
|
|
$struct->{tag} = lc(shift @tag); |
195 |
|
|
next unless ($struct->{tag} eq 'link' || $struct->{tag} eq 'meta'); |
196 |
|
|
foreach my $str ( @tag ) { |
197 |
|
|
if ( $str =~ /^(.*?)=(.*)$/ ) { |
198 |
|
|
my $attr = $1; |
199 |
|
|
my $val = $2; |
200 |
|
|
for ( $val ) { |
201 |
|
|
s/^"//; |
202 |
|
|
s/"$//; |
203 |
|
|
s/ /\ /sg; |
204 |
|
|
} |
205 |
|
|
$struct->{$attr} = $val; |
206 |
|
|
} |
207 |
|
|
} |
208 |
|
|
push @tags, $struct; |
209 |
|
|
} |
210 |
|
|
return \@tags; |
211 |
|
|
} |
212 |
|
|
|
213 |
146 |
ahitrov |
### Имеет дело с "ободранным" тегом, |
214 |
|
|
# в котором отстутсвуют < и > |
215 |
|
|
######################################## |
216 |
|
|
sub parse_html_tag { |
217 |
|
|
my $self = shift; |
218 |
|
|
my $tagstr = shift; |
219 |
|
|
|
220 |
|
|
my %struct; |
221 |
|
|
for ( $tagstr ) { |
222 |
|
|
s/\ *=\ */=/g; |
223 |
|
|
$_ = __encode_quotes($_); |
224 |
|
|
} |
225 |
|
|
my @tag = split /\ +/, $tagstr; |
226 |
|
|
$struct{tag} = lc(shift @tag); |
227 |
|
|
|
228 |
|
|
foreach my $str ( @tag ) { |
229 |
|
|
if ( $str =~ /^(.*?)=(.*)$/ ) { |
230 |
|
|
my $attr = lc($1); |
231 |
|
|
my $val = $2; |
232 |
|
|
for ( $val ) { |
233 |
|
|
s/^"//; |
234 |
|
|
s/"$//; |
235 |
|
|
s/ /\ /sg; |
236 |
|
|
} |
237 |
|
|
$struct{$attr} = $val; |
238 |
|
|
} |
239 |
|
|
} |
240 |
|
|
return \%struct; |
241 |
|
|
} |
242 |
|
|
|
243 |
|
|
|
244 |
142 |
ahitrov |
sub __encode_quotes { |
245 |
|
|
my $str = shift; |
246 |
|
|
my @in = split //, $str; |
247 |
|
|
my $out = ''; |
248 |
|
|
my $quot = ''; |
249 |
|
|
foreach my $ch ( @in ) { |
250 |
|
|
if ( ($ch eq '"' && $quot eq '"') || ($ch eq "'" && $quot eq "'") ) { |
251 |
|
|
$quot = ''; |
252 |
|
|
} elsif ( ($ch eq "'" || $ch eq '"' ) && !$quot ) { |
253 |
|
|
$quot = $ch; |
254 |
|
|
} elsif ( ($ch eq '"' && $quot eq "'") ) { |
255 |
|
|
$ch = '"'; |
256 |
|
|
} elsif ( ($ch eq "'" && $quot eq '"') ) { |
257 |
|
|
$ch = '&'; |
258 |
|
|
} elsif ( ($ch eq ' ' && $quot) ) { |
259 |
|
|
$ch = ' '; |
260 |
|
|
} |
261 |
|
|
$out .= $ch; |
262 |
|
|
} |
263 |
|
|
$out =~ s/'/"/sgi; |
264 |
|
|
return $out; |
265 |
|
|
} |
266 |
|
|
|
267 |
146 |
ahitrov |
|
268 |
|
|
sub image_replace { |
269 |
|
|
my ($self, $img_params, $replace_struct) = @_; |
270 |
|
|
|
271 |
|
|
my $img = $self->parse_html_tag('img '.$img_params); |
272 |
|
|
if ( exists $replace_struct->{$img->{src}} ) { |
273 |
|
|
my $new_image = $replace_struct->{$img->{src}}; |
274 |
|
|
if ( ref $new_image && exists $new_image->{filename} ) { |
275 |
|
|
$img->{src} = $new_image->{filename}; |
276 |
|
|
} else { |
277 |
|
|
$img->{src} = $new_image; |
278 |
|
|
} |
279 |
|
|
return '<img '.join(' ', map { $_.'="'.$img->{$_}.'"' } grep { $_ ne 'tag' } keys %$img).'>'; |
280 |
|
|
} else { |
281 |
|
|
return ''; |
282 |
|
|
} |
283 |
|
|
} |
284 |
|
|
|
285 |
99 |
ahitrov |
sub scheme { |
286 |
|
|
my $uri = shift; |
287 |
|
|
my $scheme; |
288 |
|
|
|
289 |
|
|
$scheme = URI->new($uri)->scheme() || "file"; |
290 |
|
|
|
291 |
|
|
return $scheme; |
292 |
|
|
} |
293 |
|
|
|
294 |
|
|
|
295 |
|
|
1; |