Revision 804 (by ahitrov, 2020/09/25 10:48:18) regex iterator bugfix

package Contenido::Parser;

use strict;
use warnings;
use locale;

use Encode;
use URI;
use Data::Dumper;
use Contenido::Globals;
use LWP::UserAgent;
use Contenido::File::Scheme::FILE;
use Contenido::Parser::Util;

sub new {
    my ($proto) = @_;
    my $class = ref($proto) || $proto;
    my $self = {};
    bless $self, $class;

    return $self;
}


sub fetch {
    my ($self, $input, %opts) = @_;

    my ($fh, $content);
    my $timeout = delete $opts{timeout} || 10;
    my $encoding = delete $opts{encoding};
    my $user_agent = delete $opts{user_agent};
    if (not ref $input) {
	no strict "refs";
	my $scheme = uc(scheme($input));
	if ( $scheme eq 'FILE' ) {
		$fh = &{"Contenido::File::Scheme::".uc(scheme($input))."::get_fh"}($input);
	} else {
		my $request = new HTTP::Request GET => $input;
		warn "REQUEST: ".Dumper( $request )		if $DEBUG;
		my $ua = new LWP::UserAgent;
		$ua->timeout($timeout);
		$ua->agent($user_agent || 'Mozilla/5.0 Firefox/11.0');
		my $res = $ua->request($request);
		if ($res->is_success) {
			$self->{headers} = $res->headers;
			my $content_length = $res->headers->header('content-length');
			my $content_type = $res->headers->header('content-type');
			my $headers_string = $res->headers->as_string;
#			warn $res->content_type_charset."\n\n";
#			warn Dumper($res->headers)		if $DEBUG;
			$self->{content_type} = $content_type;
			if ( $res->content_type_charset ) {
				$encoding = Encode::find_encoding($res->content_type_charset)->name;
			}
			my $base_url = $input =~ /^([a-z]+:\/\/[a-z\.\d]+)/ ? $1 : '';
			$self->{base_url} = $base_url		if $base_url;
			$content = $res->decoded_content( charset => 'none' );
#			warn "Charset: ".$res->content_charset."\n";
		} else {
			warn $res->status_line." \n"		if $DEBUG;
			$self->{success} = 0;
			$self->{reason} = $res->status_line;
			return $self;
		}
	}
    } elsif ( ref $input eq 'Apache::Upload' ) {
	$fh = $input->fh;
    } elsif ((ref $input eq "GLOB") or (ref $input eq 'IO::File')) {
	$fh = $input;
    } elsif (ref $input eq "SCALAR") {
	$fh = IO::Scalar->new($input);
    } else {
	warn("Path, scalar ref or fh needed");
	$self->{success} = 0;
	$self->{reason} = 'Path, scalar ref or fh needed';
	return $self;
    }

    if ( ref $fh ) {
	$content = <$fh>;
    }
    if ( $content ) {
	warn "starting content decoding...\n"		if $DEBUG;
	if ( exists $self->{headers} && ref $self->{headers} && ($self->{headers}->content_is_html || $self->{headers}->content_is_xhtml || $self->{headers}->content_is_xml) ) {
		unless ( $encoding ) {
			$encoding = $self->__try_content_encoding( substr($content, 0, 350) );
		}
		if ( $encoding && $encoding ne 'utf-8' && $encoding ne 'utf-8-strict' ) {
			warn "Encoding from $encoding\n..."		if $DEBUG;
			Encode::from_to($content, $encoding, 'utf-8');
			if ( exists $self->{headers} ) {
				foreach my $header ( keys %{$self->{headers}} ) {
					if ( ref $self->{headers}{$header} eq 'ARRAY' ) {
						foreach my $val ( @{$self->{headers}{$header}} ) {
							Encode::from_to($val, $encoding, 'utf-8');
						}
					} else {
						Encode::from_to($self->{headers}{$header}, $encoding, 'utf-8');
					}
				}
			}
		} else {
#			Encode::_utf8_off($content);
			if ( exists $self->{headers} ) {
				foreach my $header ( keys %{$self->{headers}} ) {
					if ( ref $self->{headers}{$header} eq 'ARRAY' ) {
						foreach my $val ( @{$self->{headers}{$header}} ) {
							Encode::_utf8_off($val);
						}
					} else {
						warn "Test: ".$self->{headers}{$header}.": check flag: ".Encode::is_utf8($self->{headers}{$header}).". check: ".Encode::is_utf8($self->{headers}{$header},1)."\n"	if $DEBUG;
						if ( Encode::is_utf8($self->{headers}{$header}) && Encode::is_utf8($self->{headers}{$header},1) ) {
							Encode::_utf8_off($self->{headers}{$header});
#							Encode::_utf8_on($self->{headers}{$header});
#							$self->{headers}{$header} = Encode::encode('utf8', $self->{headers}{$header}, Encode::FB_QUIET);
#							Encode::from_to($self->{headers}{$header}, $encoding, 'utf8');
						}
					}
				}
			}
		}
		$self->{encoding} = $encoding;
		warn Dumper($self)		if $DEBUG;
		if ( $self->{headers}->content_is_html ) {
			my $headers;
			if ( $content =~ /<head.*?>(.*?)<\/head>/si ) {
				$headers = $self->__parse_html_header( $1 );
			}
			if ( ref $headers eq 'ARRAY' && @$headers ) {
				foreach my $header ( @$headers ) {
					if ( $header->{tag} eq 'title' ) {
						$self->{headers}{title} = $header->{content};
					} elsif ( $header->{tag} eq 'meta' && (($header->{rel} && $header->{rel} =~ /icon/i) || ($header->{href} && $header->{href} =~ /\.ico$/)) ) {
						$self->{favicon} = $header->{href};
					}
				}
				$self->{html_headers} = $headers;
			}
		}
	}
	$self->{content} = $content;
	$self->{success} = 1;
    } else {
	$self->{success} = 0;
	$self->{reason} = 'Content is empty';
    }
    return $self;
}

sub is_success {
    my ($self, $val) = @_;

    if ( defined $val ) {
	$self->{success} = $val;
	return $self;
    } else {
	return $self->{success};
    }
}

sub __try_content_encoding {
    my ($self, $input)= @_;
    if ( $input =~ /encoding[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) {
	return lc($1);
    } elsif ( $input =~ /charset[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) {
	return lc($1);
    } elsif ( $input =~ /(utf-8|windows-1251|koi8-r)/i ) {
	return lc($1);
    } else {
	return undef;
    }
}

sub __parse_html_header {
    my ($self, $input)= @_;
    my @tags;
    $input =~ s/[\r\n\t]+/\ /sgi;
    if ( $input =~ /<title.*?>(.*?)<\/title.*?>/si ) {
	my $title = $1;
	for ( $title ) {
		s/^\s+//;
		s/\s+$//;
	}
	push @tags, { tag => 'title', content => $title };
    }
    while ( $input =~ /<(.*?)\/?>/sgi ) {
	my $tag = $1;
	my $struct = {};
	for ( $tag ) {
		s/\ *=\ */=/g;
		$_ = __encode_quotes($_);
	}
	my @tag = split /\ +/, $tag;
	$struct->{tag} = lc(shift @tag);
	next	unless ($struct->{tag} eq 'link' || $struct->{tag} eq 'meta');
	foreach my $str ( @tag ) {
		if ( $str =~ /^(.*?)=(.*)$/ ) {
			my $attr = $1;
			my $val = $2;
			for ( $val ) {
				s/^"//;
				s/"$//;
				s/&nbsp;/\ /sg;
			}
			$struct->{$attr} = $val;
		}
	}
	push @tags, $struct;
    }
    return \@tags;
}

### Имеет дело с "ободранным" тегом, 
#   в котором отстутсвуют < и >
########################################
sub parse_html_tag {
    my $self = shift;
    my $tagstr = shift;

    my %struct;
    for ( $tagstr ) {
	s/\ *=\ */=/g;
	$_ = __encode_quotes($_);
    }
    my @tag = split /\ +/, $tagstr;
    $struct{tag} = lc(shift @tag);

    foreach my $str ( @tag ) {
	if ( $str =~ /^(.*?)=(.*)$/ ) {
		my $attr = lc($1);
		my $val = $2;
		for ( $val ) {
			s/^"//;
			s/"$//;
			s/&nbsp;/\ /sg;
		}
		$struct{$attr} = $val;
	}
    }
    return \%struct;
}


sub __encode_quotes {
    my $str = shift;
    my @in = split //, $str;
    my $out = '';
    my $quot = '';
    foreach my $ch ( @in ) {
	if ( ($ch eq '"' && $quot eq '"') || ($ch eq "'" && $quot eq "'") ) {
		$quot = '';
	} elsif ( ($ch eq "'" || $ch eq '"' ) && !$quot ) {
		$quot = $ch;
	} elsif ( ($ch eq '"' && $quot eq "'") ) {
		$ch = '&quot;';
	} elsif ( ($ch eq "'" && $quot eq '"') ) {
		$ch = '&amp;';
	} elsif ( ($ch eq ' ' && $quot) ) {
		$ch = '&nbsp;';
	}
	$out .= $ch;
    }
    $out =~ s/'/"/sgi;
    return $out;
}


sub image_replace {
    my ($self, $img_params, $replace_struct) = @_;

    my $img = $self->parse_html_tag('img '.$img_params);
    if ( exists $replace_struct->{$img->{src}} ) {
	my $new_image = $replace_struct->{$img->{src}};
	if ( ref $new_image && exists $new_image->{filename} ) {
		$img->{src} = $new_image->{filename};
	} else {
		$img->{src} = $new_image;
	}
	return '<img '.join(' ', map { $_.'="'.$img->{$_}.'"' } grep { $_ ne 'tag' } keys %$img).'>';
    } else {
	return '';
    }
}

sub scheme {
    my $uri = shift;
    my $scheme;

    $scheme = URI->new($uri)->scheme() || "file";

    return $scheme;
}


1;