Line # Revision Author
1 99 ahitrov package Contenido::Parser;
2
3 use strict;
4 use warnings;
5 use locale;
6
7 use Encode;
8 use URI;
9 use Data::Dumper;
10 use Contenido::Globals;
11 use LWP::UserAgent;
12 use Contenido::File::Scheme::FILE;
13 use Contenido::Parser::Util;
14
15 112 ahitrov sub new {
16 my ($proto) = @_;
17 my $class = ref($proto) || $proto;
18 my $self = {};
19 bless $self, $class;
20
21 return $self;
22 }
23
24
25 99 ahitrov sub fetch {
26 my ($self, $input, %opts) = @_;
27
28 my ($fh, $content);
29 136 ahitrov my $timeout = delete $opts{timeout} || 10;
30 99 ahitrov my $encoding = delete $opts{encoding};
31 if (not ref $input) {
32 no strict "refs";
33 my $scheme = uc(scheme($input));
34 if ( $scheme eq 'FILE' ) {
35 $fh = &{"Contenido::File::Scheme::".uc(scheme($input))."::get_fh"}($input);
36 } else {
37 my $request = new HTTP::Request GET => $input;
38 my $ua = new LWP::UserAgent;
39 136 ahitrov $ua->timeout($timeout);
40 99 ahitrov my $res = $ua->request($request);
41 if ($res->is_success) {
42 $self->{headers} = $res->headers;
43 my $content_length = $res->headers->header('content-length');
44 my $content_type = $res->headers->header('content-type');
45 142 ahitrov my $headers_string = $res->headers->as_string;
46 # warn $res->content_type_charset."\n\n";
47 # warn Dumper($res->headers) if $DEBUG;
48 99 ahitrov $self->{content_type} = $content_type;
49 142 ahitrov if ( $res->content_type_charset ) {
50 $encoding = Encode::find_encoding($res->content_type_charset)->name;
51 99 ahitrov }
52 my $base_url = $input =~ /^([a-z]+:\/\/[a-z\.\d]+)/ ? $1 : '';
53 $self->{base_url} = $base_url if $base_url;
54 142 ahitrov $content = $res->decoded_content( charset => 'none' );
55 # warn "Charset: ".$res->content_charset."\n";
56 99 ahitrov } else {
57 146 ahitrov warn $res->status_line." \n" if $DEBUG;
58 99 ahitrov $self->{success} = 0;
59 $self->{reason} = $res->status_line;
60 return $self;
61 }
62 }
63 113 ahitrov } elsif ( ref $input eq 'Apache::Upload' ) {
64 $fh = $input->fh;
65 } elsif ((ref $input eq "GLOB") or (ref $input eq 'IO::File')) {
66 99 ahitrov $fh = $input;
67 } elsif (ref $input eq "SCALAR") {
68 $fh = IO::Scalar->new($input);
69 } else {
70 warn("Path, scalar ref or fh needed");
71 $self->{success} = 0;
72 $self->{reason} = 'Path, scalar ref or fh needed';
73 return $self;
74 }
75
76 if ( ref $fh ) {
77 $content = <$fh>;
78 }
79 if ( $content ) {
80 142 ahitrov warn "starting content decoding...\n";
81 if ( exists $self->{headers} && ref $self->{headers} && ($self->{headers}->content_is_html || $self->{headers}->content_is_xhtml || $self->{headers}->content_is_xml) ) {
82 unless ( $encoding ) {
83 $encoding = $self->__try_content_encoding( substr($content, 0, 350) );
84 }
85 if ( $encoding && $encoding ne 'utf-8' && $encoding ne 'utf-8-strict' ) {
86 warn "Encoding from $encoding\n..." if $DEBUG;
87 Encode::from_to($content, $encoding, 'utf-8');
88 if ( exists $self->{headers} ) {
89 foreach my $header ( keys %{$self->{headers}} ) {
90 if ( ref $self->{headers}{$header} eq 'ARRAY' ) {
91 foreach my $val ( @{$self->{headers}{$header}} ) {
92 Encode::from_to($val, $encoding, 'utf-8');
93 }
94 } else {
95 Encode::from_to($self->{headers}{$header}, $encoding, 'utf-8');
96 112 ahitrov }
97 }
98 }
99 142 ahitrov } else {
100 # Encode::_utf8_off($content);
101 if ( exists $self->{headers} ) {
102 foreach my $header ( keys %{$self->{headers}} ) {
103 if ( ref $self->{headers}{$header} eq 'ARRAY' ) {
104 foreach my $val ( @{$self->{headers}{$header}} ) {
105 Encode::_utf8_off($val);
106 }
107 } else {
108 warn "Test: ".$self->{headers}{$header}.": check flag: ".Encode::is_utf8($self->{headers}{$header}).". check: ".Encode::is_utf8($self->{headers}{$header},1)."\n";
109 if ( Encode::is_utf8($self->{headers}{$header}) && Encode::is_utf8($self->{headers}{$header},1) ) {
110 Encode::_utf8_off($self->{headers}{$header});
111 # Encode::_utf8_on($self->{headers}{$header});
112 # $self->{headers}{$header} = Encode::encode('utf8', $self->{headers}{$header}, Encode::FB_QUIET);
113 # Encode::from_to($self->{headers}{$header}, $encoding, 'utf8');
114 }
115 }
116 }
117 }
118 99 ahitrov }
119 142 ahitrov $self->{encoding} = $encoding;
120 warn Dumper($self) if $DEBUG;
121 if ( $self->{headers}->content_is_html ) {
122 my $headers;
123 if ( $content =~ /<head.*?>(.*?)<\/head>/si ) {
124 $headers = $self->__parse_html_header( $1 );
125 }
126 if ( ref $headers eq 'ARRAY' && @$headers ) {
127 foreach my $header ( @$headers ) {
128 if ( $header->{tag} eq 'title' ) {
129 $self->{headers}{title} = $header->{content};
130 } elsif ( $header->{tag} eq 'meta' && (($header->{rel} && $header->{rel} =~ /icon/i) || ($header->{href} && $header->{href} =~ /\.ico$/)) ) {
131 $self->{favicon} = $header->{href};
132 }
133 }
134 $self->{html_headers} = $headers;
135 }
136 }
137 99 ahitrov }
138 $self->{content} = $content;
139 $self->{success} = 1;
140 } else {
141 $self->{success} = 0;
142 $self->{reason} = 'Content is empty';
143 }
144 return $self;
145 }
146
147 sub is_success {
148 my ($self, $val) = @_;
149
150 if ( defined $val ) {
151 $self->{success} = $val;
152 return $self;
153 } else {
154 return $self->{success};
155 }
156 }
157
158 sub __try_content_encoding {
159 my ($self, $input)= @_;
160 if ( $input =~ /encoding[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) {
161 return lc($1);
162 112 ahitrov } elsif ( $input =~ /charset[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) {
163 return lc($1);
164 99 ahitrov } elsif ( $input =~ /(utf-8|windows-1251|koi8-r)/i ) {
165 return lc($1);
166 } else {
167 return undef;
168 }
169 }
170
171 142 ahitrov sub __parse_html_header {
172 my ($self, $input)= @_;
173 my @tags;
174 $input =~ s/[\r\n\t]+/\ /sgi;
175 if ( $input =~ /<title.*?>(.*?)<\/title.*?>/sgi ) {
176 my $title = $1;
177 for ( $title ) {
178 s/^\s+//;
179 s/\s+$//;
180 }
181 push @tags, { tag => 'title', content => $title };
182 }
183 while ( $input =~ /<(.*?)\/?>/sgi ) {
184 my $tag = $1;
185 my $struct = {};
186 for ( $tag ) {
187 s/\ *=\ */=/g;
188 $_ = __encode_quotes($_);
189 }
190 my @tag = split /\ +/, $tag;
191 $struct->{tag} = lc(shift @tag);
192 next unless ($struct->{tag} eq 'link' || $struct->{tag} eq 'meta');
193 foreach my $str ( @tag ) {
194 if ( $str =~ /^(.*?)=(.*)$/ ) {
195 my $attr = $1;
196 my $val = $2;
197 for ( $val ) {
198 s/^"//;
199 s/"$//;
200 s/&nbsp;/\ /sg;
201 }
202 $struct->{$attr} = $val;
203 }
204 }
205 push @tags, $struct;
206 }
207 return \@tags;
208 }
209
210 146 ahitrov ### Имеет дело с "ободранным" тегом,
211 # в котором отстутсвуют < и >
212 ########################################
213 sub parse_html_tag {
214 my $self = shift;
215 my $tagstr = shift;
216
217 my %struct;
218 for ( $tagstr ) {
219 s/\ *=\ */=/g;
220 $_ = __encode_quotes($_);
221 }
222 my @tag = split /\ +/, $tagstr;
223 $struct{tag} = lc(shift @tag);
224
225 foreach my $str ( @tag ) {
226 if ( $str =~ /^(.*?)=(.*)$/ ) {
227 my $attr = lc($1);
228 my $val = $2;
229 for ( $val ) {
230 s/^"//;
231 s/"$//;
232 s/&nbsp;/\ /sg;
233 }
234 $struct{$attr} = $val;
235 }
236 }
237 return \%struct;
238 }
239
240
241 142 ahitrov sub __encode_quotes {
242 my $str = shift;
243 my @in = split //, $str;
244 my $out = '';
245 my $quot = '';
246 foreach my $ch ( @in ) {
247 if ( ($ch eq '"' && $quot eq '"') || ($ch eq "'" && $quot eq "'") ) {
248 $quot = '';
249 } elsif ( ($ch eq "'" || $ch eq '"' ) && !$quot ) {
250 $quot = $ch;
251 } elsif ( ($ch eq '"' && $quot eq "'") ) {
252 $ch = '&quot;';
253 } elsif ( ($ch eq "'" && $quot eq '"') ) {
254 $ch = '&amp;';
255 } elsif ( ($ch eq ' ' && $quot) ) {
256 $ch = '&nbsp;';
257 }
258 $out .= $ch;
259 }
260 $out =~ s/'/"/sgi;
261 return $out;
262 }
263
264 146 ahitrov
265 sub image_replace {
266 my ($self, $img_params, $replace_struct) = @_;
267
268 my $img = $self->parse_html_tag('img '.$img_params);
269 if ( exists $replace_struct->{$img->{src}} ) {
270 my $new_image = $replace_struct->{$img->{src}};
271 if ( ref $new_image && exists $new_image->{filename} ) {
272 $img->{src} = $new_image->{filename};
273 } else {
274 $img->{src} = $new_image;
275 }
276 return '<img '.join(' ', map { $_.'="'.$img->{$_}.'"' } grep { $_ ne 'tag' } keys %$img).'>';
277 } else {
278 return '';
279 }
280 }
281
282 99 ahitrov sub scheme {
283 my $uri = shift;
284 my $scheme;
285
286 $scheme = URI->new($uri)->scheme() || "file";
287
288 return $scheme;
289 }
290
291
292 1;