Line # Revision Author
1 99 ahitrov package Contenido::Parser;
2
3 use strict;
4 use warnings;
5 use locale;
6
7 use Encode;
8 use URI;
9 use Data::Dumper;
10 use Contenido::Globals;
11 use LWP::UserAgent;
12 use Contenido::File::Scheme::FILE;
13 use Contenido::Parser::Util;
14
15 112 ahitrov sub new {
16 my ($proto) = @_;
17 my $class = ref($proto) || $proto;
18 my $self = {};
19 bless $self, $class;
20
21 return $self;
22 }
23
24
25 99 ahitrov sub fetch {
26 my ($self, $input, %opts) = @_;
27
28 my ($fh, $content);
29 136 ahitrov my $timeout = delete $opts{timeout} || 10;
30 99 ahitrov my $encoding = delete $opts{encoding};
31 211 ahitrov my $user_agent = delete $opts{user_agent};
32 99 ahitrov if (not ref $input) {
33 no strict "refs";
34 my $scheme = uc(scheme($input));
35 if ( $scheme eq 'FILE' ) {
36 $fh = &{"Contenido::File::Scheme::".uc(scheme($input))."::get_fh"}($input);
37 } else {
38 my $request = new HTTP::Request GET => $input;
39 211 ahitrov warn "REQUEST: ".Dumper( $request ) if $DEBUG;
40 99 ahitrov my $ua = new LWP::UserAgent;
41 136 ahitrov $ua->timeout($timeout);
42 211 ahitrov $ua->agent($user_agent || 'Mozilla/5.0 Firefox/11.0');
43 99 ahitrov my $res = $ua->request($request);
44 if ($res->is_success) {
45 $self->{headers} = $res->headers;
46 my $content_length = $res->headers->header('content-length');
47 my $content_type = $res->headers->header('content-type');
48 142 ahitrov my $headers_string = $res->headers->as_string;
49 # warn $res->content_type_charset."\n\n";
50 # warn Dumper($res->headers) if $DEBUG;
51 99 ahitrov $self->{content_type} = $content_type;
52 142 ahitrov if ( $res->content_type_charset ) {
53 $encoding = Encode::find_encoding($res->content_type_charset)->name;
54 99 ahitrov }
55 my $base_url = $input =~ /^([a-z]+:\/\/[a-z\.\d]+)/ ? $1 : '';
56 $self->{base_url} = $base_url if $base_url;
57 142 ahitrov $content = $res->decoded_content( charset => 'none' );
58 # warn "Charset: ".$res->content_charset."\n";
59 99 ahitrov } else {
60 146 ahitrov warn $res->status_line." \n" if $DEBUG;
61 99 ahitrov $self->{success} = 0;
62 $self->{reason} = $res->status_line;
63 return $self;
64 }
65 }
66 113 ahitrov } elsif ( ref $input eq 'Apache::Upload' ) {
67 $fh = $input->fh;
68 } elsif ((ref $input eq "GLOB") or (ref $input eq 'IO::File')) {
69 99 ahitrov $fh = $input;
70 } elsif (ref $input eq "SCALAR") {
71 $fh = IO::Scalar->new($input);
72 } else {
73 warn("Path, scalar ref or fh needed");
74 $self->{success} = 0;
75 $self->{reason} = 'Path, scalar ref or fh needed';
76 return $self;
77 }
78
79 if ( ref $fh ) {
80 $content = <$fh>;
81 }
82 if ( $content ) {
83 162 ahitrov warn "starting content decoding...\n" if $DEBUG;
84 142 ahitrov if ( exists $self->{headers} && ref $self->{headers} && ($self->{headers}->content_is_html || $self->{headers}->content_is_xhtml || $self->{headers}->content_is_xml) ) {
85 unless ( $encoding ) {
86 $encoding = $self->__try_content_encoding( substr($content, 0, 350) );
87 }
88 if ( $encoding && $encoding ne 'utf-8' && $encoding ne 'utf-8-strict' ) {
89 warn "Encoding from $encoding\n..." if $DEBUG;
90 Encode::from_to($content, $encoding, 'utf-8');
91 if ( exists $self->{headers} ) {
92 foreach my $header ( keys %{$self->{headers}} ) {
93 if ( ref $self->{headers}{$header} eq 'ARRAY' ) {
94 foreach my $val ( @{$self->{headers}{$header}} ) {
95 Encode::from_to($val, $encoding, 'utf-8');
96 }
97 } else {
98 Encode::from_to($self->{headers}{$header}, $encoding, 'utf-8');
99 112 ahitrov }
100 }
101 }
102 142 ahitrov } else {
103 # Encode::_utf8_off($content);
104 if ( exists $self->{headers} ) {
105 foreach my $header ( keys %{$self->{headers}} ) {
106 if ( ref $self->{headers}{$header} eq 'ARRAY' ) {
107 foreach my $val ( @{$self->{headers}{$header}} ) {
108 Encode::_utf8_off($val);
109 }
110 } else {
111 162 ahitrov warn "Test: ".$self->{headers}{$header}.": check flag: ".Encode::is_utf8($self->{headers}{$header}).". check: ".Encode::is_utf8($self->{headers}{$header},1)."\n" if $DEBUG;
112 142 ahitrov if ( Encode::is_utf8($self->{headers}{$header}) && Encode::is_utf8($self->{headers}{$header},1) ) {
113 Encode::_utf8_off($self->{headers}{$header});
114 # Encode::_utf8_on($self->{headers}{$header});
115 # $self->{headers}{$header} = Encode::encode('utf8', $self->{headers}{$header}, Encode::FB_QUIET);
116 # Encode::from_to($self->{headers}{$header}, $encoding, 'utf8');
117 }
118 }
119 }
120 }
121 99 ahitrov }
122 142 ahitrov $self->{encoding} = $encoding;
123 warn Dumper($self) if $DEBUG;
124 if ( $self->{headers}->content_is_html ) {
125 my $headers;
126 if ( $content =~ /<head.*?>(.*?)<\/head>/si ) {
127 $headers = $self->__parse_html_header( $1 );
128 }
129 if ( ref $headers eq 'ARRAY' && @$headers ) {
130 foreach my $header ( @$headers ) {
131 if ( $header->{tag} eq 'title' ) {
132 $self->{headers}{title} = $header->{content};
133 } elsif ( $header->{tag} eq 'meta' && (($header->{rel} && $header->{rel} =~ /icon/i) || ($header->{href} && $header->{href} =~ /\.ico$/)) ) {
134 $self->{favicon} = $header->{href};
135 }
136 }
137 $self->{html_headers} = $headers;
138 }
139 }
140 99 ahitrov }
141 $self->{content} = $content;
142 $self->{success} = 1;
143 } else {
144 $self->{success} = 0;
145 $self->{reason} = 'Content is empty';
146 }
147 return $self;
148 }
149
150 sub is_success {
151 my ($self, $val) = @_;
152
153 if ( defined $val ) {
154 $self->{success} = $val;
155 return $self;
156 } else {
157 return $self->{success};
158 }
159 }
160
161 sub __try_content_encoding {
162 my ($self, $input)= @_;
163 if ( $input =~ /encoding[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) {
164 return lc($1);
165 112 ahitrov } elsif ( $input =~ /charset[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) {
166 return lc($1);
167 99 ahitrov } elsif ( $input =~ /(utf-8|windows-1251|koi8-r)/i ) {
168 return lc($1);
169 } else {
170 return undef;
171 }
172 }
173
174 142 ahitrov sub __parse_html_header {
175 my ($self, $input)= @_;
176 my @tags;
177 $input =~ s/[\r\n\t]+/\ /sgi;
178 804 ahitrov if ( $input =~ /<title.*?>(.*?)<\/title.*?>/si ) {
179 142 ahitrov my $title = $1;
180 for ( $title ) {
181 s/^\s+//;
182 s/\s+$//;
183 }
184 push @tags, { tag => 'title', content => $title };
185 }
186 while ( $input =~ /<(.*?)\/?>/sgi ) {
187 my $tag = $1;
188 my $struct = {};
189 for ( $tag ) {
190 s/\ *=\ */=/g;
191 $_ = __encode_quotes($_);
192 }
193 my @tag = split /\ +/, $tag;
194 $struct->{tag} = lc(shift @tag);
195 next unless ($struct->{tag} eq 'link' || $struct->{tag} eq 'meta');
196 foreach my $str ( @tag ) {
197 if ( $str =~ /^(.*?)=(.*)$/ ) {
198 my $attr = $1;
199 my $val = $2;
200 for ( $val ) {
201 s/^"//;
202 s/"$//;
203 s/&nbsp;/\ /sg;
204 }
205 $struct->{$attr} = $val;
206 }
207 }
208 push @tags, $struct;
209 }
210 return \@tags;
211 }
212
213 146 ahitrov ### Имеет дело с "ободранным" тегом,
214 # в котором отстутсвуют < и >
215 ########################################
216 sub parse_html_tag {
217 my $self = shift;
218 my $tagstr = shift;
219
220 my %struct;
221 for ( $tagstr ) {
222 s/\ *=\ */=/g;
223 $_ = __encode_quotes($_);
224 }
225 my @tag = split /\ +/, $tagstr;
226 $struct{tag} = lc(shift @tag);
227
228 foreach my $str ( @tag ) {
229 if ( $str =~ /^(.*?)=(.*)$/ ) {
230 my $attr = lc($1);
231 my $val = $2;
232 for ( $val ) {
233 s/^"//;
234 s/"$//;
235 s/&nbsp;/\ /sg;
236 }
237 $struct{$attr} = $val;
238 }
239 }
240 return \%struct;
241 }
242
243
244 142 ahitrov sub __encode_quotes {
245 my $str = shift;
246 my @in = split //, $str;
247 my $out = '';
248 my $quot = '';
249 foreach my $ch ( @in ) {
250 if ( ($ch eq '"' && $quot eq '"') || ($ch eq "'" && $quot eq "'") ) {
251 $quot = '';
252 } elsif ( ($ch eq "'" || $ch eq '"' ) && !$quot ) {
253 $quot = $ch;
254 } elsif ( ($ch eq '"' && $quot eq "'") ) {
255 $ch = '&quot;';
256 } elsif ( ($ch eq "'" && $quot eq '"') ) {
257 $ch = '&amp;';
258 } elsif ( ($ch eq ' ' && $quot) ) {
259 $ch = '&nbsp;';
260 }
261 $out .= $ch;
262 }
263 $out =~ s/'/"/sgi;
264 return $out;
265 }
266
267 146 ahitrov
268 sub image_replace {
269 my ($self, $img_params, $replace_struct) = @_;
270
271 my $img = $self->parse_html_tag('img '.$img_params);
272 if ( exists $replace_struct->{$img->{src}} ) {
273 my $new_image = $replace_struct->{$img->{src}};
274 if ( ref $new_image && exists $new_image->{filename} ) {
275 $img->{src} = $new_image->{filename};
276 } else {
277 $img->{src} = $new_image;
278 }
279 return '<img '.join(' ', map { $_.'="'.$img->{$_}.'"' } grep { $_ ne 'tag' } keys %$img).'>';
280 } else {
281 return '';
282 }
283 }
284
285 99 ahitrov sub scheme {
286 my $uri = shift;
287 my $scheme;
288
289 $scheme = URI->new($uri)->scheme() || "file";
290
291 return $scheme;
292 }
293
294
295 1;