Revision 101

Date:
2011/05/21 11:01:04
Author:
ahitrov
Revision Log:
Модификация HTML и RSS парсера
Files:

Legend:

 
Added
 
Removed
 
Modified
  • utf8/core/lib/Contenido/Parser/HTML.pm

     
    10 10 use Utils::HTML;
    11 11 use Data::Dumper;
    12 12 use utf8;
    13 use Encode;
    13 14
    14
    15 15 my @PICNAME = qw ( top menu topmenu home line dot mail razdel button find search srch delivery
    16 16 head bar label phone bottom bottommenu ico icon post left right service caption arr arrow cart
    17 17 basket main reply title corner address page buy pix pixel spacer fon welcome razd about back
     
    53 53
    54 54 my $content;
    55 55 if ( $opts{content} ) {
    56 $content = delete $opts{content};
    56 $content = decode('utf-8', delete $opts{content});
    57 57 delete $self->{content};
    58 58 } elsif ( $self->{success} || $self->{content} ) {
    59 $content = delete $self->{content};
    59 $content = decode('utf-8', delete $self->{content});
    60 60 } else {
    61 61 $self->{success} = 0;
    62 62 return $self;
     
    66 66 my $strip_html = delete $opts{strip_html};
    67 67 my $debug = $DEBUG;
    68 68 my $gui = delete $opts{gui};
    69 my $header = delete $opts{header};
    70 my $description = delete $opts{description};
    69 my $header = decode('utf-8', delete $opts{header});
    70 warn "Header length: ".length($header)."\n";
    71 my $description = decode('utf-8', delete $opts{description});
    72 warn "Description length: ".length($description)."\n";
    71 73 my $minimum = delete $opts{min} || length $description;
    72 74
    73 75 my $pre_rools = $self->__parse_rools (delete $opts{parser_pre});
     
    120 122 $self->{error_message} = 'Nothing was found at all!!! Check your MINIMUM value';
    121 123 return $self->is_success(0) unless $gui;
    122 124 }
    123 @$chosen = sort { $a->{id} <=> $b->{id} } @$chosen;
    124 125 if ( $description ) {
    125 126 my @use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$parse_rools if ref $parse_rools eq 'ARRAY';
    126 127 $chosen = $self->__check_description ($chosen, $description, $debug) unless @use_rools;
     
    165 166 }
    166 167 }
    167 168 $self->{text} = ref $chosen eq 'ARRAY' ? $chosen->[0] : $chosen;
    168 $self->{html} = $content;
    169 $self->{tree} = $shortcuts;
    169 # $self->{html} = $content;
    170 # $self->{tree} = $shortcuts;
    171 $self->{tree} = $tree;
    170 172 $self->{chosen} = $chosen;
    171 173 } else {
    172 174 $self->__post_rool ($chosen->[0], $post_rools, $description);
    173 175 $self->{text} = Contenido::Parser::Util::text_cleanup($chosen->[0]->{text});
    176 $self->{chosen} = $chosen;
    177 map { $_->{parent} = undef } @$chosen if ref $chosen eq 'ARRAY';
    174 178 $tree = undef;
    175 179 foreach my $key ( keys %$shortcuts ) {
    176 180 delete $shortcuts->{$key};
     
    360 364 $last_text_tag->{type} = 'text';
    361 365 $last_text_tag->{parent} = $current;
    362 366 $last_text_tag->{level} = $level+1;
    367 $last_text_tag->{text} = $tag->{content};
    363 368 $elem_hash{$last_text_tag->{id}} = $last_text_tag;
    364 369 push @{$current->{children}}, $last_text_tag;
    365 370 $current->{text_count}++;
    366 371 }
    367 372 $current->{text_value} += $tag->{count};
    368 373 splice @elems, 0, $tag->{count};
    374 # warn "Tag opened. Next text: [".join('',$elems[0..10])."]\n";
    369 375 } elsif ( ref $tag ) {
    370 376 if ( ($current->{type} eq 'td' || $current->{type} eq 'tr' ) && $tag->{type} eq 'tr' ) {
    371 377 # warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback till table begin... !!!!\n" if $debug;
     
    391 397 $tag->{parent} = $current;
    392 398 $tag->{level} = ++$level;
    393 399 $elem_hash{$tag->{id}} = $tag;
    400 push @{$current->{children}}, $tag;
    394 401 push @stack, $current;
    395 warn "Open type: $tag->{type}. ID=[$tag->{id}]. Name: ".($tag->{params}{name}||'').". Class: ".($tag->{params}{class}||'').". Level: $tag->{level}. Stack depth: ".scalar(@stack)."\n";
    402 # warn "Open type: $tag->{type}. ID=[$tag->{id}]. Name: ".($tag->{params}{name}||'').". Class: ".($tag->{params}{class}||'').". Level: $tag->{level}. Stack depth: ".scalar(@stack)."\n";
    396 403 $current = $tag;
    397 404 splice @elems, 0, $tag->{count};
    398 405 } else {
     
    411 418 $last_text_tag->{id} = $id++;
    412 419 $last_text_tag->{type} = 'text';
    413 420 $last_text_tag->{parent} = $current;
    421 $last_text_tag->{text} = $tag->{content};
    414 422 $last_text_tag->{level} = $level+1;
    415 423 $elem_hash{$last_text_tag->{id}} = $last_text_tag;
    416 424 push @{$current->{children}}, $last_text_tag;
     
    466 474 $last_text_tag->{text} .= shift @elems;
    467 475 $last_text_tag->{count}++;
    468 476 } else {
    477 $last_text_tag = {};
    469 478 $last_text_tag->{text} = shift @elems;
    470 479 $last_text_tag->{count} = 1;
    471 480 $last_text_tag->{id} = $id++;
     
    490 499 my $i = 1;
    491 500 my %tag;
    492 501 my $tag = $content->[0];
    493 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) {
    502 while ( $i < (scalar @$content - 1) && $content->[$i] ne '<' && $content->[$i] ne '>' ) {
    494 503 $tag .= $content->[$i];
    495 504 $i++;
    496 505 }
     
    500 509 content => $tag,
    501 510 count => $i,
    502 511 };
    503 } else {
    504 if ( $tag =~ /^<(div|table|tr|td|body|html)\s*(.*)/i ) {
    505 my $val = $1;
    506 if ( $tag =~ /^<($val)\s*(.*)/i ) {
    507 $tag{type} = lc($1);
    508 my $args = $2;
    509 $tag{count} = $i+1;
    510 my %args;
    511 while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?"([^"]+)"/g ) {
    512 $args{lc($1)} = $2;
    513 }
    514 while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?'([^']+)'/g ) {
    515 $args{lc($1)} = $2;
    516 }
    517 while ( $tag =~ /([a-zA-z]+)=(\w+)/g ) {
    518 $args{lc($1)} = $2;
    519 }
    520 foreach my $arg ( qw( name id class width align ) ) {
    521 $tag{params}{$arg} = $args{$arg} if exists $args{$arg};
    522 }
    523 return \%tag;
    524 } else {
    525 return {
    526 type => 'text',
    527 content => $tag,
    528 count => $i,
    529 };
    512 }
    513 $tag .= $content->[$i++];
    514 # warn "TAG: [$tag]\n";
    515
    516 if ( $tag =~ /^<(div|table|tr|td|body|html)\s*(.*)/i ) {
    517 my $val = $1;
    518 if ( $tag =~ /^<($val)\s*(.*)/i ) {
    519 $tag{type} = lc($1);
    520 my $args = $2;
    521 $tag{count} = $i;
    522 my %args;
    523 while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?"([^"]+)"/g ) {
    524 $args{lc($1)} = $2;
    530 525 }
    531 } else {
    526 while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?'([^']+)'/g ) {
    527 $args{lc($1)} = $2;
    528 }
    529 while ( $tag =~ /([a-zA-z]+)=(\w+)/g ) {
    530 $args{lc($1)} = $2;
    531 }
    532 foreach my $arg ( qw( name id class width align ) ) {
    533 $tag{params}{$arg} = $args{$arg} if exists $args{$arg};
    534 }
    535 return \%tag;
    536 } else {
    532 537 return {
    533 538 type => 'text',
    534 539 content => $tag,
    535 540 count => $i,
    536 541 };
    537 542 }
    543 } else {
    544 return {
    545 type => 'text',
    546 content => $tag,
    547 count => $i,
    548 };
    538 549 }
    539 550 }
    540 551
     
    544 555 my $i = 2;
    545 556 my %tag;
    546 557 my $tag = $content->[0].$content->[1];
    547 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) {
    558 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < (scalar @$content-1) ) {
    548 559 $tag .= $content->[$i];
    549 560 $i++;
    550 561 }
     
    554 565 content => $tag,
    555 566 count => $i,
    556 567 };
    557 } else {
    558 if ( $tag =~ /^<\/(div|table|tr|td|body|html)/i ) {
    568 }
    569 $tag .= $content->[$i++];
    570 # warn "TAG END: [$tag]\n";
    571 if ( $tag =~ /^<\/(div|table|tr|td|body|html)/i ) {
    559 572 my $val = $1;
    560 if ( $tag =~ /^<\/($val)\s*?$/i ) {
    573 if ( $tag =~ /^<\/($val)[\s>]/i ) {
    561 574 $tag{type} = lc($1);
    562 $tag{count} = $i+1;
    575 $tag{count} = $i;
    563 576 return \%tag;
    564 577 } else {
    565 578 return {
     
    568 581 count => $i,
    569 582 };
    570 583 }
    571 } else {
    584 } else {
    572 585 return {
    573 586 type => 'text',
    574 587 content => $tag,
    575 588 count => $i,
    576 589 };
    577 }
    578 590 }
    579 591 }
    580 592
     
    583 595 my ($self, $structure, $base_url, $debug) = @_;
    584 596 return unless ref $structure eq 'HASH';
    585 597
    586 foreach my $tag ( values %$structure ) {
    587 next unless ref $tag && exists $tag->{text} && $tag->{text};
    598 foreach my $tag ( grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
    588 599 my $text = $tag->{text};
    589 600 while ( $text =~ /<img (.*?)>/sgi ) {
    590 601 # warn "Image for extract_img found [$1]. Tag ID: $tag->{id}\n";
     
    592 603 my $img = {};
    593 604 if ( $params =~ /src\x20*?=\x20*?["'](.*?)["']/ || $params =~ /src=([^\x20]+)/ ) {
    594 605 $img->{url} = $1;
    595 $img->{url} =~ s/[\r\t\n\x20]+$//;
    596 $img->{url} =~ s/^[\r\t\n\x20]+//;
    606 $img->{url} =~ s/[\r\t\n\ ]+$//;
    607 $img->{url} =~ s/^[\r\t\n\ ]+//;
    597 608 $img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/;
    598 609 $img->{url} =~ s/\/+/\//sgi;
    599 610 $img->{url} =~ s/http:\//http:\/\//sgi;
     
    616 627 my ($self, $structure, $debug) = @_;
    617 628 return unless ref $structure eq 'HASH';
    618 629
    619 foreach my $tag ( values %$structure ) {
    620 next unless ref $tag && exists $tag->{text} && $tag->{text};
    630 foreach my $tag ( grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
    621 631 my $text = $tag->{text};
    622 632 while ( $text =~ /<h([\d])[^>]*?>([^<]+)<\/h[\d]>/sgi ) {
    623 633 my $header_level = $1;
     
    648 658 }
    649 659
    650 660 my @ret;
    651 foreach my $tid ( sort { $a <=> $b } keys %$structure ) {
    652 my $tag = $structure->{$tid};
    653 next unless ref $tag && exists $tag->{text} && $tag->{text};
    654 next if $self->__exclude_rools($tag, \@exclude_rools);
    661 foreach my $tag ( sort { $a->{id} <=> $b->{id} } grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
    662 next if $self->__exclude_rools($tag->{parent}, \@exclude_rools);
    655 663
    656 664 if ( @rools ) {
    657 665 my $choose = 0;
    658 666 foreach my $rool ( @rools ) {
    659 667 my $matched = 1;
    660 668 foreach my $cond ( @{$rool->{condition}} ) {
    661 unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) {
    669 unless ( exists $tag->{parent}{params}{$cond->{param}} && $tag->{parent}{params}{$cond->{param}} eq $cond->{value} ) {
    662 670 $matched = 0;
    663 671 }
    664 672 }
    665 673 $choose ||= $matched;
    666 674 }
    667 675 if ( $choose ) {
    668 $tag->{text} =~ s/^[\t\ \n\r]+//s;
    669 $tag->{text} =~ s/[\t\ \n\r]+$//s;
    670 $tag->{text} =~ s/[\t\ ]+/\ /sg;
    671 $tag->{text} =~ s/\r//sg;
    672 $tag->{text} =~ s/\n{2,}/\n\n/sg;
    673 $tag->{text} =~ s/<a.*?>//sgi;
    674 $tag->{text} =~ s/<\/a.*?>//sgi;
    675 $tag->{text} =~ s/\&\\x(\d+)//sgi;
    676 for ( $tag->{text} ) {
    677 s/^[\t\ \n\r]+//s;
    678 s/[\t\ \n\r]+$//s;
    679 s/[\t\ ]+/\ /sg;
    680 s/\r//sg;
    681 s/\n{2,}/\n\n/sg;
    682 s/\&\\x(\d+)//sgi;
    683 }
    676 684
    677 685 my $text = $tag->{text};
    678 686 $text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi;
    679 # $text = Contenido::Parser::Util::strip_html($text);
    687 $text = Contenido::Parser::Util::strip_html($text);
    688 $tag->{text_weight} = length($text);
    680 689 if ( length($text) >= $minimum ) {
    681 push @ret, $tag;
    690 for ( $tag->{text} ) {
    691 s/<a.*?>//sgi;
    692 s/<\/a.*?>//sgi;
    693 }
    694 push @ret, $tag;
    682 695 }
    683 696 }
    684 697 } else {
    698 for ( $tag->{text} ) {
    699 s/^[\t\ \n\r]+//s;
    700 s/[\t\ \n\r]+$//s;
    701 s/[\t\ ]+/\ /sg;
    702 s/\r//sg;
    703 s/\n{2,}/\n\n/sg;
    704 }
    685 705 my $text = $tag->{text};
    686 $text =~ s/^[\t\ \n\r]+//s;
    687 $text =~ s/[\t\ \n\r]+$//s;
    688 $text =~ s/[\t\ ]+/\ /sg;
    689 $text =~ s/\r//sg;
    690 $text =~ s/\n{2,}/\n\n/sg;
    691 # It does wrong job:
    692 # $text =~ s/([,!\?])(\S)/$1\ $2/sg;
    693 $tag->{text} = $text;
    694 706 $text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi;
    695 # $text = Contenido::Parser::Util::strip_html($text);
    707 $text = Contenido::Parser::Util::strip_html($text);
    708 $tag->{text_weight} = length($text);
    696 709 if ( length($text) >= $minimum ) {
    697 $tag->{text} =~ s/<a.*?>//sgi;
    698 $tag->{text} =~ s/<\/a.*?>//sgi;
    699 $tag->{text} =~ s/\&\\x(\d+)//sgi;
    710 for ( $tag->{text} ) {
    711 s/<a.*?>//sgi;
    712 s/<\/a.*?>//sgi;
    713 s/\&\\x(\d+)//sgi;
    714 }
    700 715 push @ret, $tag;
    701 716 }
    702 717 }
     
    836 851 # $unit->{text} = HTML::Entities::decode_entities($unit->{text});
    837 852 # $unit->{text} = Contenido::Parser::Util::strip_html($unit->{text});
    838 853 for ( $unit->{text} ) {
    839 s/^[\x20\t\r\n]+//si;
    854 s/^[\ \t\r\n]+//si;
    840 855 s/^(\d+)\.(\d+)\.(\d+)//si;
    841 s/^[\x20\t\r\n]+//si;
    856 s/^[\ \t\r\n]+//si;
    842 857 s/^(\d+):(\d+)//si;
    843 s/^[\x20\t\r\n]+//si;
    858 s/^[\ \t\r\n]+//si;
    844 859 }
    845 860 if ( lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) {
    846 861 substr $unit->{text}, 0, length($header), '';
    847 $unit->{text} =~ s/^[\.\x20\t\r\n]+//sgi;
    862 $unit->{text} =~ s/^[\.\ \t\r\n]+//sgi;
    848 863 }
    849 $unit->{text} =~ s/[\x20\t\r\n]+$//sgi;
    864 $unit->{text} =~ s/[\ \t\r\n]+$//sgi;
    850 865 }
    851 866 }
    852 867
     
    928 943 push @images, @img;
    929 944 }
    930 945 } else {
    931 next if ($tag->{level}+1) < $chosen->{level};
    932 next if $image_depth && ( $tag->{level} > ($chosen->{level} + $image_depth) );
    946 next if ($tag->{level}+1) < $chosen->{parent}{level};
    947 next if $image_depth && ( $tag->{level} > ($chosen->{parent}{level} + $image_depth) );
    933 948
    934 949 my $ok = 0;
    935 my $uphops = $tag->{level} > $chosen->{level} ? 1 : 2;
    936 my $hops = $image_depth ? $image_depth : $tag->{level} - $chosen->{level} + $uphops;
    950 my $uphops = $tag->{level} > $chosen->{parent}{level} ? 1 : 2;
    951 my $hops = $image_depth ? $image_depth : $tag->{level} - $chosen->{parent}{level} + $uphops;
    937 952 next if ($hops - $uphops) > 4;
    938 953 my @img_parents = ($tag->{id});
    939 954 my $parent = $tag;
     
    941 956 $parent = $parent->{parent};
    942 957 push @img_parents, $parent->{id};
    943 958 }
    944 $parent = $chosen;;
    959 $parent = $chosen->{parent}{parent};
    945 960 for ( 0..$uphops ) {
    946 961 if ( grep { $parent->{id} == $_ } @img_parents ) {
    947 962 $ok = 1;
     
    951 966 }
    952 967 if ( $ok ) {
    953 968 my @img = grep { $self->__img_is_valid ($_) } map {
    954 my $img = rchannel::Image->new($_);
    955 $img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/;
    969 my $img = $_;
    970 $img->{src} = $base_url.($img->{src} =~ m|^/| ? '' : '/').$img->{src} unless $img->{src} =~ /^http:/;
    956 971 $img;
    957 972 } map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} };
    958 973
     
    972 987 sub __img_is_valid {
    973 988 my ($self, $img) = @_;
    974 989
    975 # return 1;
    990 return 1;
    976 991 if ( $img->check_online ) {
    977 992 my $delim = 0;
    978 993 my $w = $img->width;
  • utf8/core/lib/Contenido/Parser/RSS.pm

     
    68 68 $self->__check_rewrite ( item => $item, rools => $rss_rools );
    69 69 my $date = $self->__parse_date($item->{pubdate});
    70 70 my $pubdate = Contenido::DateTime->new( epoch => $date );
    71 $pubdate = $pubdate->ymd('-').' '.$pubdate->hms;
    71 # $pubdate = $pubdate->ymd('-').' '.$pubdate->hms;
    72 72 next if ref $item->{title};
    73 73 next if ref $item->{description};
    74 74 $self->__check_ignore ( item => $item, rools => $rss_rools );
     
    99 99 $link = $base_url.($link =~ m|^/| ? '' : '/' ).$link if $base_url && ($link !~ /^http:/);
    100 100 $item->{description} = $self->__field_prepare ($item->{description});
    101 101 $self->__check_filter ( gui => $gui, field => 'description', item => $item, rools => $rss_rools );
    102 my %image_href;
    102 103 my $description = $item->{description};
    103 104 if ( exists $item->{'rambler:fulltext'} && $item->{'rambler:fulltext'} ) {
    104 105 $allow_fulltext = 1;
     
    214 215 @videos = grep { exists $_->{type} && lc($_->{type}) eq 'video/x-flv' && $_->{src} =~ /\.flv$/i } @videos;
    215 216 my @inlined_images;
    216 217 for ( $description, $fulltext ) {
    217 my $field = $_;
    218 while ( $field =~ /<img ([^>]+)>/sgi ) {
    219 my $image = $self->__parse_params( $1 );
    220 push @inlined_images, $image if ref $image && exists $image->{src} && $image->{src};
    221 }
    218 my $field = $_;
    219 while ( $field =~ /<img ([^>]+)>/sgi ) {
    220 my $image = $self->__parse_params( $1 );
    221 push @inlined_images, $image if ref $image && exists $image->{src} && $image->{src};
    222 }
    223 while ( $field =~ /<a ([^>]+)>/sgi ) {
    224 my $anchor = $self->__parse_params( $1 );
    225 if ( $anchor->{href} && $anchor->{href} =~ /\.(jpe?g|gif|png)$/ ) {
    226 push @inlined_images, { src => $anchor->{href} };
    227 }
    228 }
    222 229 }
    223 230 if ( @inlined_images ) {
    224 231 my %images = map { $_->{src} => $_ } @images, @inlined_images;
     
    1044 1051
    1045 1052 sub __field_prepare {
    1046 1053 my ($self, $text) = @_;
    1054 return unless $text;
    1047 1055
    1048 1056 $text =~ s/^[\n\r\ \t]+//;
    1049 1057 $text =~ s/[\n\r\ \t]+$//;
  • utf8/core/lib/Contenido/Parser/Util.pm

     
    26 26 return join "\n\n", grep length $_, @paragfs;
    27 27 }
    28 28
    29 sub strip_html {
    30 my $text = shift;
    31
    32 if ( ref $text ) {
    33 for ( $$text ) {
    34 s/<\/?[^>]+>//sgi;
    35 }
    36 } else {
    37 for ( $text ) {
    38 s/<\/?[^>]+>//sgi;
    39 }
    40 return $text;
    41 }
    42 }
    43
    29 44 1;