Line #	Revision	Author
1	99	ahitrov	package Contenido::Parser;
2
3			use strict;
4			use warnings;
5			use locale;
6
7			use Encode;
8			use URI;
9			use Data::Dumper;
10			use Contenido::Globals;
11			use LWP::UserAgent;
12			use Contenido::File::Scheme::FILE;
13			use Contenido::Parser::Util;
14
15	112	ahitrov	sub new {
16			my ($proto) = @_;
17			my $class = ref($proto) \|\| $proto;
18			my $self = {};
19			bless $self, $class;
20
21			return $self;
22			}
23
24
25	99	ahitrov	sub fetch {
26			my ($self, $input, %opts) = @_;
27
28			my ($fh, $content);
29	136	ahitrov	my $timeout = delete $opts{timeout} \|\| 10;
30	99	ahitrov	my $encoding = delete $opts{encoding};
31	211	ahitrov	my $user_agent = delete $opts{user_agent};
32	99	ahitrov	if (not ref $input) {
33			no strict "refs";
34			my $scheme = uc(scheme($input));
35			if ( $scheme eq 'FILE' ) {
36			$fh = &{"Contenido::File::Scheme::".uc(scheme($input))."::get_fh"}($input);
37			} else {
38			my $request = new HTTP::Request GET => $input;
39	211	ahitrov	warn "REQUEST: ".Dumper( $request ) if $DEBUG;
40	99	ahitrov	my $ua = new LWP::UserAgent;
41	136	ahitrov	$ua->timeout($timeout);
42	211	ahitrov	$ua->agent($user_agent \|\| 'Mozilla/5.0 Firefox/11.0');
43	99	ahitrov	my $res = $ua->request($request);
44			if ($res->is_success) {
45			$self->{headers} = $res->headers;
46			my $content_length = $res->headers->header('content-length');
47			my $content_type = $res->headers->header('content-type');
48	142	ahitrov	my $headers_string = $res->headers->as_string;
49			# warn $res->content_type_charset."\n\n";
50			# warn Dumper($res->headers) if $DEBUG;
51	99	ahitrov	$self->{content_type} = $content_type;
52	142	ahitrov	if ( $res->content_type_charset ) {
53			$encoding = Encode::find_encoding($res->content_type_charset)->name;
54	99	ahitrov	}
55			my $base_url = $input =~ /^([a-z]+:\/\/[a-z\.\d]+)/ ? $1 : '';
56			$self->{base_url} = $base_url if $base_url;
57	142	ahitrov	$content = $res->decoded_content( charset => 'none' );
58			# warn "Charset: ".$res->content_charset."\n";
59	99	ahitrov	} else {
60	146	ahitrov	warn $res->status_line." \n" if $DEBUG;
61	99	ahitrov	$self->{success} = 0;
62			$self->{reason} = $res->status_line;
63			return $self;
64			}
65			}
66	113	ahitrov	} elsif ( ref $input eq 'Apache::Upload' ) {
67			$fh = $input->fh;
68			} elsif ((ref $input eq "GLOB") or (ref $input eq 'IO::File')) {
69	99	ahitrov	$fh = $input;
70			} elsif (ref $input eq "SCALAR") {
71			$fh = IO::Scalar->new($input);
72			} else {
73			warn("Path, scalar ref or fh needed");
74			$self->{success} = 0;
75			$self->{reason} = 'Path, scalar ref or fh needed';
76			return $self;
77			}
78
79			if ( ref $fh ) {
80			$content = <$fh>;
81			}
82			if ( $content ) {
83	162	ahitrov	warn "starting content decoding...\n" if $DEBUG;
84	142	ahitrov	if ( exists $self->{headers} && ref $self->{headers} && ($self->{headers}->content_is_html \|\| $self->{headers}->content_is_xhtml \|\| $self->{headers}->content_is_xml) ) {
85			unless ( $encoding ) {
86			$encoding = $self->__try_content_encoding( substr($content, 0, 350) );
87			}
88			if ( $encoding && $encoding ne 'utf-8' && $encoding ne 'utf-8-strict' ) {
89			warn "Encoding from $encoding\n..." if $DEBUG;
90			Encode::from_to($content, $encoding, 'utf-8');
91			if ( exists $self->{headers} ) {
92			foreach my $header ( keys %{$self->{headers}} ) {
93			if ( ref $self->{headers}{$header} eq 'ARRAY' ) {
94			foreach my $val ( @{$self->{headers}{$header}} ) {
95			Encode::from_to($val, $encoding, 'utf-8');
96			}
97			} else {
98			Encode::from_to($self->{headers}{$header}, $encoding, 'utf-8');
99	112	ahitrov	}
100			}
101			}
102	142	ahitrov	} else {
103			# Encode::_utf8_off($content);
104			if ( exists $self->{headers} ) {
105			foreach my $header ( keys %{$self->{headers}} ) {
106			if ( ref $self->{headers}{$header} eq 'ARRAY' ) {
107			foreach my $val ( @{$self->{headers}{$header}} ) {
108			Encode::_utf8_off($val);
109			}
110			} else {
111	162	ahitrov	warn "Test: ".$self->{headers}{$header}.": check flag: ".Encode::is_utf8($self->{headers}{$header}).". check: ".Encode::is_utf8($self->{headers}{$header},1)."\n" if $DEBUG;
112	142	ahitrov	if ( Encode::is_utf8($self->{headers}{$header}) && Encode::is_utf8($self->{headers}{$header},1) ) {
113			Encode::_utf8_off($self->{headers}{$header});
114			# Encode::_utf8_on($self->{headers}{$header});
115			# $self->{headers}{$header} = Encode::encode('utf8', $self->{headers}{$header}, Encode::FB_QUIET);
116			# Encode::from_to($self->{headers}{$header}, $encoding, 'utf8');
117			}
118			}
119			}
120			}
121	99	ahitrov	}
122	142	ahitrov	$self->{encoding} = $encoding;
123			warn Dumper($self) if $DEBUG;
124			if ( $self->{headers}->content_is_html ) {
125			my $headers;
126			if ( $content =~ /<head.?>(.?)<\/head>/si ) {
127			$headers = $self->__parse_html_header( $1 );
128			}
129			if ( ref $headers eq 'ARRAY' && @$headers ) {
130			foreach my $header ( @$headers ) {
131			if ( $header->{tag} eq 'title' ) {
132			$self->{headers}{title} = $header->{content};
133			} elsif ( $header->{tag} eq 'meta' && (($header->{rel} && $header->{rel} =~ /icon/i) \|\| ($header->{href} && $header->{href} =~ /\.ico$/)) ) {
134			$self->{favicon} = $header->{href};
135			}
136			}
137			$self->{html_headers} = $headers;
138			}
139			}
140	99	ahitrov	}
141			$self->{content} = $content;
142			$self->{success} = 1;
143			} else {
144			$self->{success} = 0;
145			$self->{reason} = 'Content is empty';
146			}
147			return $self;
148			}
149
150			sub is_success {
151			my ($self, $val) = @_;
152
153			if ( defined $val ) {
154			$self->{success} = $val;
155			return $self;
156			} else {
157			return $self->{success};
158			}
159			}
160
161			sub __try_content_encoding {
162			my ($self, $input)= @_;
163			if ( $input =~ /encoding[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) {
164			return lc($1);
165	112	ahitrov	} elsif ( $input =~ /charset[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) {
166			return lc($1);
167	99	ahitrov	} elsif ( $input =~ /(utf-8\|windows-1251\|koi8-r)/i ) {
168			return lc($1);
169			} else {
170			return undef;
171			}
172			}
173
174	142	ahitrov	sub __parse_html_header {
175			my ($self, $input)= @_;
176			my @tags;
177			$input =~ s/[\r\n\t]+/\ /sgi;
178	804	ahitrov	if ( $input =~ /<title.?>(.?)<\/title.*?>/si ) {
179	142	ahitrov	my $title = $1;
180			for ( $title ) {
181			s/^\s+//;
182			s/\s+$//;
183			}
184			push @tags, { tag => 'title', content => $title };
185			}
186			while ( $input =~ /<(.*?)\/?>/sgi ) {
187			my $tag = $1;
188			my $struct = {};
189			for ( $tag ) {
190			s/\ =\ /=/g;
191			$_ = __encode_quotes($_);
192			}
193			my @tag = split /\ +/, $tag;
194			$struct->{tag} = lc(shift @tag);
195			next unless ($struct->{tag} eq 'link' \|\| $struct->{tag} eq 'meta');
196			foreach my $str ( @tag ) {
197			if ( $str =~ /^(.?)=(.)$/ ) {
198			my $attr = $1;
199			my $val = $2;
200			for ( $val ) {
201			s/^"//;
202			s/"$//;
203			s/ /\ /sg;
204			}
205			$struct->{$attr} = $val;
206			}
207			}
208			push @tags, $struct;
209			}
210			return \@tags;
211			}
212
213	146	ahitrov	### Имеет дело с "ободранным" тегом,
214			# в котором отстутсвуют < и >
215			########################################
216			sub parse_html_tag {
217			my $self = shift;
218			my $tagstr = shift;
219
220			my %struct;
221			for ( $tagstr ) {
222			s/\ =\ /=/g;
223			$_ = __encode_quotes($_);
224			}
225			my @tag = split /\ +/, $tagstr;
226			$struct{tag} = lc(shift @tag);
227
228			foreach my $str ( @tag ) {
229			if ( $str =~ /^(.?)=(.)$/ ) {
230			my $attr = lc($1);
231			my $val = $2;
232			for ( $val ) {
233			s/^"//;
234			s/"$//;
235			s/ /\ /sg;
236			}
237			$struct{$attr} = $val;
238			}
239			}
240			return \%struct;
241			}
242
243
244	142	ahitrov	sub __encode_quotes {
245			my $str = shift;
246			my @in = split //, $str;
247			my $out = '';
248			my $quot = '';
249			foreach my $ch ( @in ) {
250			if ( ($ch eq '"' && $quot eq '"') \|\| ($ch eq "'" && $quot eq "'") ) {
251			$quot = '';
252			} elsif ( ($ch eq "'" \|\| $ch eq '"' ) && !$quot ) {
253			$quot = $ch;
254			} elsif ( ($ch eq '"' && $quot eq "'") ) {
255			$ch = '"';
256			} elsif ( ($ch eq "'" && $quot eq '"') ) {
257			$ch = '&';
258			} elsif ( ($ch eq ' ' && $quot) ) {
259			$ch = ' ';
260			}
261			$out .= $ch;
262			}
263			$out =~ s/'/"/sgi;
264			return $out;
265			}
266
267	146	ahitrov
268			sub image_replace {
269			my ($self, $img_params, $replace_struct) = @_;
270
271			my $img = $self->parse_html_tag('img '.$img_params);
272			if ( exists $replace_struct->{$img->{src}} ) {
273			my $new_image = $replace_struct->{$img->{src}};
274			if ( ref $new_image && exists $new_image->{filename} ) {
275			$img->{src} = $new_image->{filename};
276			} else {
277			$img->{src} = $new_image;
278			}
279			return '<img '.join(' ', map { $_.'="'.$img->{$_}.'"' } grep { $_ ne 'tag' } keys %$img).'>';
280			} else {
281			return '';
282			}
283			}
284
285	99	ahitrov	sub scheme {
286			my $uri = shift;
287			my $scheme;
288
289			$scheme = URI->new($uri)->scheme() \|\| "file";
290
291			return $scheme;
292			}
293
294
295			1;

Repository List / Contenido / utf8 / core / lib / Contenido / Parser.pm @ r804