package Contenido::Parser; use strict; use warnings; use locale; use Encode; use URI; use Data::Dumper; use Contenido::Globals; use LWP::UserAgent; use Contenido::File::Scheme::FILE; use Contenido::Parser::Util; sub new { my ($proto) = @_; my $class = ref($proto) || $proto; my $self = {}; bless $self, $class; return $self; } sub fetch { my ($self, $input, %opts) = @_; my ($fh, $content); my $timeout = delete $opts{timeout} || 10; my $encoding = delete $opts{encoding}; my $user_agent = delete $opts{user_agent}; if (not ref $input) { no strict "refs"; my $scheme = uc(scheme($input)); if ( $scheme eq 'FILE' ) { $fh = &{"Contenido::File::Scheme::".uc(scheme($input))."::get_fh"}($input); } else { my $request = new HTTP::Request GET => $input; warn "REQUEST: ".Dumper( $request ) if $DEBUG; my $ua = new LWP::UserAgent; $ua->timeout($timeout); $ua->agent($user_agent || 'Mozilla/5.0 Firefox/11.0'); my $res = $ua->request($request); if ($res->is_success) { $self->{headers} = $res->headers; my $content_length = $res->headers->header('content-length'); my $content_type = $res->headers->header('content-type'); my $headers_string = $res->headers->as_string; # warn $res->content_type_charset."\n\n"; # warn Dumper($res->headers) if $DEBUG; $self->{content_type} = $content_type; if ( $res->content_type_charset ) { $encoding = Encode::find_encoding($res->content_type_charset)->name; } my $base_url = $input =~ /^([a-z]+:\/\/[a-z\.\d]+)/ ? $1 : ''; $self->{base_url} = $base_url if $base_url; $content = $res->decoded_content( charset => 'none' ); # warn "Charset: ".$res->content_charset."\n"; } else { warn $res->status_line." \n" if $DEBUG; $self->{success} = 0; $self->{reason} = $res->status_line; return $self; } } } elsif ( ref $input eq 'Apache::Upload' ) { $fh = $input->fh; } elsif ((ref $input eq "GLOB") or (ref $input eq 'IO::File')) { $fh = $input; } elsif (ref $input eq "SCALAR") { $fh = IO::Scalar->new($input); } else { warn("Path, scalar ref or fh needed"); $self->{success} = 0; $self->{reason} = 'Path, scalar ref or fh needed'; return $self; } if ( ref $fh ) { $content = <$fh>; } if ( $content ) { warn "starting content decoding...\n" if $DEBUG; if ( exists $self->{headers} && ref $self->{headers} && ($self->{headers}->content_is_html || $self->{headers}->content_is_xhtml || $self->{headers}->content_is_xml) ) { unless ( $encoding ) { $encoding = $self->__try_content_encoding( substr($content, 0, 350) ); } if ( $encoding && $encoding ne 'utf-8' && $encoding ne 'utf-8-strict' ) { warn "Encoding from $encoding\n..." if $DEBUG; Encode::from_to($content, $encoding, 'utf-8'); if ( exists $self->{headers} ) { foreach my $header ( keys %{$self->{headers}} ) { if ( ref $self->{headers}{$header} eq 'ARRAY' ) { foreach my $val ( @{$self->{headers}{$header}} ) { Encode::from_to($val, $encoding, 'utf-8'); } } else { Encode::from_to($self->{headers}{$header}, $encoding, 'utf-8'); } } } } else { # Encode::_utf8_off($content); if ( exists $self->{headers} ) { foreach my $header ( keys %{$self->{headers}} ) { if ( ref $self->{headers}{$header} eq 'ARRAY' ) { foreach my $val ( @{$self->{headers}{$header}} ) { Encode::_utf8_off($val); } } else { warn "Test: ".$self->{headers}{$header}.": check flag: ".Encode::is_utf8($self->{headers}{$header}).". check: ".Encode::is_utf8($self->{headers}{$header},1)."\n" if $DEBUG; if ( Encode::is_utf8($self->{headers}{$header}) && Encode::is_utf8($self->{headers}{$header},1) ) { Encode::_utf8_off($self->{headers}{$header}); # Encode::_utf8_on($self->{headers}{$header}); # $self->{headers}{$header} = Encode::encode('utf8', $self->{headers}{$header}, Encode::FB_QUIET); # Encode::from_to($self->{headers}{$header}, $encoding, 'utf8'); } } } } } $self->{encoding} = $encoding; warn Dumper($self) if $DEBUG; if ( $self->{headers}->content_is_html ) { my $headers; if ( $content =~ /(.*?)<\/head>/si ) { $headers = $self->__parse_html_header( $1 ); } if ( ref $headers eq 'ARRAY' && @$headers ) { foreach my $header ( @$headers ) { if ( $header->{tag} eq 'title' ) { $self->{headers}{title} = $header->{content}; } elsif ( $header->{tag} eq 'meta' && (($header->{rel} && $header->{rel} =~ /icon/i) || ($header->{href} && $header->{href} =~ /\.ico$/)) ) { $self->{favicon} = $header->{href}; } } $self->{html_headers} = $headers; } } } $self->{content} = $content; $self->{success} = 1; } else { $self->{success} = 0; $self->{reason} = 'Content is empty'; } return $self; } sub is_success { my ($self, $val) = @_; if ( defined $val ) { $self->{success} = $val; return $self; } else { return $self->{success}; } } sub __try_content_encoding { my ($self, $input)= @_; if ( $input =~ /encoding[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) { return lc($1); } elsif ( $input =~ /charset[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) { return lc($1); } elsif ( $input =~ /(utf-8|windows-1251|koi8-r)/i ) { return lc($1); } else { return undef; } } sub __parse_html_header { my ($self, $input)= @_; my @tags; $input =~ s/[\r\n\t]+/\ /sgi; if ( $input =~ /(.*?)<\/title.*?>/si ) { my $title = $1; for ( $title ) { s/^\s+//; s/\s+$//; } push @tags, { tag => 'title', content => $title }; } while ( $input =~ /<(.*?)\/?>/sgi ) { my $tag = $1; my $struct = {}; for ( $tag ) { s/\ *=\ */=/g; $_ = __encode_quotes($_); } my @tag = split /\ +/, $tag; $struct->{tag} = lc(shift @tag); next unless ($struct->{tag} eq 'link' || $struct->{tag} eq 'meta'); foreach my $str ( @tag ) { if ( $str =~ /^(.*?)=(.*)$/ ) { my $attr = $1; my $val = $2; for ( $val ) { s/^"//; s/"$//; s/ /\ /sg; } $struct->{$attr} = $val; } } push @tags, $struct; } return \@tags; } ### Имеет дело с "ободранным" тегом, # в котором отстутсвуют < и > ######################################## sub parse_html_tag { my $self = shift; my $tagstr = shift; my %struct; for ( $tagstr ) { s/\ *=\ */=/g; $_ = __encode_quotes($_); } my @tag = split /\ +/, $tagstr; $struct{tag} = lc(shift @tag); foreach my $str ( @tag ) { if ( $str =~ /^(.*?)=(.*)$/ ) { my $attr = lc($1); my $val = $2; for ( $val ) { s/^"//; s/"$//; s/ /\ /sg; } $struct{$attr} = $val; } } return \%struct; } sub __encode_quotes { my $str = shift; my @in = split //, $str; my $out = ''; my $quot = ''; foreach my $ch ( @in ) { if ( ($ch eq '"' && $quot eq '"') || ($ch eq "'" && $quot eq "'") ) { $quot = ''; } elsif ( ($ch eq "'" || $ch eq '"' ) && !$quot ) { $quot = $ch; } elsif ( ($ch eq '"' && $quot eq "'") ) { $ch = '"'; } elsif ( ($ch eq "'" && $quot eq '"') ) { $ch = '&'; } elsif ( ($ch eq ' ' && $quot) ) { $ch = ' '; } $out .= $ch; } $out =~ s/'/"/sgi; return $out; } sub image_replace { my ($self, $img_params, $replace_struct) = @_; my $img = $self->parse_html_tag('img '.$img_params); if ( exists $replace_struct->{$img->{src}} ) { my $new_image = $replace_struct->{$img->{src}}; if ( ref $new_image && exists $new_image->{filename} ) { $img->{src} = $new_image->{filename}; } else { $img->{src} = $new_image; } return ''; } else { return ''; } } sub scheme { my $uri = shift; my $scheme; $scheme = URI->new($uri)->scheme() || "file"; return $scheme; } 1;