Revision 146

Date:
2011/10/05 08:24:53
Author:
ahitrov
Revision Log:
Parser functions and HTML Parser image search and replace
Files:

Legend:

 
Added
 
Removed
 
Modified
  • utf8/core/lib/Contenido/Parser.pm

     
    54 54 $content = $res->decoded_content( charset => 'none' );
    55 55 # warn "Charset: ".$res->content_charset."\n";
    56 56 } else {
    57 warn $res->status_line." \n";
    57 warn $res->status_line." \n" if $DEBUG;
    58 58 $self->{success} = 0;
    59 59 $self->{reason} = $res->status_line;
    60 60 return $self;
     
    207 207 return \@tags;
    208 208 }
    209 209
    210 ### Имеет дело с "ободранным" тегом,
    211 # в котором отстутсвуют < и >
    212 ########################################
    213 sub parse_html_tag {
    214 my $self = shift;
    215 my $tagstr = shift;
    216
    217 my %struct;
    218 for ( $tagstr ) {
    219 s/\ *=\ */=/g;
    220 $_ = __encode_quotes($_);
    221 }
    222 my @tag = split /\ +/, $tagstr;
    223 $struct{tag} = lc(shift @tag);
    224
    225 foreach my $str ( @tag ) {
    226 if ( $str =~ /^(.*?)=(.*)$/ ) {
    227 my $attr = lc($1);
    228 my $val = $2;
    229 for ( $val ) {
    230 s/^"//;
    231 s/"$//;
    232 s/&nbsp;/\ /sg;
    233 }
    234 $struct{$attr} = $val;
    235 }
    236 }
    237 return \%struct;
    238 }
    239
    240
    210 241 sub __encode_quotes {
    211 242 my $str = shift;
    212 243 my @in = split //, $str;
     
    230 261 return $out;
    231 262 }
    232 263
    264
    265 sub image_replace {
    266 my ($self, $img_params, $replace_struct) = @_;
    267
    268 my $img = $self->parse_html_tag('img '.$img_params);
    269 if ( exists $replace_struct->{$img->{src}} ) {
    270 my $new_image = $replace_struct->{$img->{src}};
    271 if ( ref $new_image && exists $new_image->{filename} ) {
    272 $img->{src} = $new_image->{filename};
    273 } else {
    274 $img->{src} = $new_image;
    275 }
    276 return '<img '.join(' ', map { $_.'="'.$img->{$_}.'"' } grep { $_ ne 'tag' } keys %$img).'>';
    277 } else {
    278 return '';
    279 }
    280 }
    281
    233 282 sub scheme {
    234 283 my $uri = shift;
    235 284 my $scheme;
  • utf8/core/lib/Contenido/Parser/HTML.pm

     
    110 110 warn "Make tree...\n" if $debug;
    111 111 my ($tree, $shortcuts) = $self->__make_tree (\$content, $parse_rools, $debug);
    112 112
    113 $self->__extract_img ($shortcuts, $base_url, $debug);
    113 $self->__extract_img ($shortcuts, $base_url, $strip_html, $debug);
    114 114 $self->__extract_headers ($shortcuts, $header, $debug);
    115 115 warn "Getting big texts (min=$minimum)...\n" if $debug;
    116 116 my $chosen = $self->__dig_big_texts (
     
    141 141 chosen => $chosen,
    142 142 header => $header,
    143 143 ref $post_rools eq 'ARRAY' && @$post_rools ? (rools => $post_rools) : (),
    144 debug => $debug
    144 debug => $debug,
    145 strip_html => $strip_html,
    145 146 );
    146 147 if ( ref $parse_rools eq 'ARRAY' ) {
    147 148 my ($glue) = grep { $_->{command} eq 'glue' } @$parse_rools;
     
    595 596
    596 597
    597 598 sub __extract_img {
    598 my ($self, $structure, $base_url, $debug) = @_;
    599 my ($self, $structure, $base_url, $strip_html, $debug) = @_;
    599 600 return unless ref $structure eq 'HASH';
    600 601
    601 602 foreach my $tag ( grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
    602 my $text = $tag->{text};
    603 while ( $text =~ /<img (.*?)>/sgi ) {
    603 while ( $tag->{text} =~ /<img (.*?)\/?>/sgi ) {
    604 604 # warn "Image for extract_img found [$1]. Tag ID: $tag->{id}\n";
    605 605 my $params = $1;
    606 my $img = {};
    607 if ( $params =~ /src\x20*?=\x20*?["'](.*?)["']/ || $params =~ /src=([^\x20]+)/ ) {
    608 $img->{url} = $1;
    609 $img->{url} =~ s/[\r\t\n\ ]+$//;
    610 $img->{url} =~ s/^[\r\t\n\ ]+//;
    611 $img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/;
    612 $img->{url} =~ s/\/+/\//sgi;
    613 $img->{url} =~ s/http:\//http:\/\//sgi;
    614 $img->{w} = $1 if $params =~ /width[\D]+(\d+)/;
    615 $img->{h} = $1 if $params =~ /height[\D]+(\d+)/;
    616 $img->{alt} = $1 if $params =~ /alt\x20*?=\x20*?["'](.*?)["']/;
    606 my $img = $self->parse_html_tag('img '.$params);
    607 if ( exists $img->{src} && $img->{src} ) {
    608 my %img = ( src => $img->{src} );
    609 $img{url} = $img{src} =~ /^http[s]?:/ ? $img{src} : $base_url.($img{src} =~ m|^/| ? '' : '/').$img{src};
    610 $img{w} = $img->{width} if $img->{width};
    611 $img{h} = $img->{height} if $img->{height};
    612 $img{alt} = $img->{alt} if $img->{alt};
    613 $img{title} = $img->{title} if $img->{title};
    617 614 $tag->{images} = [] unless ref $tag->{images} eq 'ARRAY';
    618 push @{ $tag->{images} }, $img;
    615 push @{ $tag->{images} }, \%img;
    616 }
    617 # if ( $params =~ /src\x20*?=\x20*?["'](.*?)["']/ || $params =~ /src=([^\x20]+)/ ) {
    618 # $img->{url} = $1;
    619 # $img->{url} =~ s/[\r\t\n\ ]+$//;
    620 # $img->{url} =~ s/^[\r\t\n\ ]+//;
    621 # $img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/;
    622 # $img->{url} =~ s/\/+/\//sgi;
    623 # $img->{url} =~ s/http:\//http:\/\//sgi;
    624 # $img->{w} = $1 if $params =~ /width[\D]+(\d+)/;
    625 # $img->{h} = $1 if $params =~ /height[\D]+(\d+)/;
    626 # $img->{alt} = $1 if $params =~ /alt\x20*?=\x20*?["'](.*?)["']/;
    627 # $tag->{images} = [] unless ref $tag->{images} eq 'ARRAY';
    628 # push @{ $tag->{images} }, $img;
    619 629 # warn "Image for extract_img stored [$img->{url}]. Tag ID: $tag->{id}\n";
    620 }
    630 # }
    621 631 }
    622 $text =~ s/<img (.*?)>//sgi;
    623 $tag->{text} = $text;
    624 $tag->{count} = length ($text);
    632 $tag->{text} =~ s/<img (.*?)>//sgi if $strip_html;
    633 $tag->{count} = length ($tag->{text});
    625 634 }
    626 635 }
    627 636
     
    716 725 s/\&\\x(\d+)//sgi;
    717 726 }
    718 727 push @ret, $tag;
    728 # $self->log_elem($tag);
    719 729 }
    720 730 }
    721 731 }
     
    815 825
    816 826 my $chosen = $opts{chosen};
    817 827 my $rooles = $opts{rools};
    818 my $header = $opts{header};
    828 my $header = $opts{header} || '';
    829 my $strip_html = $opts{strip_html};
    819 830
    820 831 foreach my $unit ( @$chosen ) {
    821 832 my %tags;
     
    861 872 s/^(\d+):(\d+)//si;
    862 873 s/^[\ \t\r\n]+//si;
    863 874 }
    864 if ( lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) {
    875 if ( $header && lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) {
    865 876 substr $unit->{text}, 0, length($header), '';
    866 877 $unit->{text} =~ s/^[\.\ \t\r\n]+//sgi;
    867 878 }
     
    1195 1206 }
    1196 1207
    1197 1208
    1209 sub log_elem {
    1210 my $self = shift;
    1211 my $elem = shift;
    1212 return unless ref $elem eq 'HASH';
    1213
    1214 my %elem;
    1215 map { $elem{$_} = $elem->{$_} } grep { $_ ne 'parent' } keys %$elem;
    1216 warn Dumper \%elem;
    1217 }
    1218
    1219
    1198 1220 1;
  • utf8/core/ports/all/libwww/GNUmakefile

     
    5 5
    6 6 #include ../../etc/perl.mk
    7 7
    8 PORTVERSION = 5.805
    8 PORTVERSION = 5.836
    9 9 DISTFILE = ${PORTNAME}-perl-${PORTVERSION}.tar.gz
    10 10 PERL_MAKEMAKER = yes
    11 11 MASTER_CPAN_SUBDIR = LWP