Revision 146
- Date:
- 2011/10/05 08:24:53
- Files:
Legend:
- Added
- Removed
- Modified
-
utf8/core/lib/Contenido/Parser.pm
54 54 $content = $res->decoded_content( charset => 'none' ); 55 55 # warn "Charset: ".$res->content_charset."\n"; 56 56 } else { 57 warn $res->status_line." \n"; 57 warn $res->status_line." \n" if $DEBUG; 58 58 $self->{success} = 0; 59 59 $self->{reason} = $res->status_line; 60 60 return $self; … … 207 207 return \@tags; 208 208 } 209 209 210 ### Имеет дело с "ободранным" тегом, 211 # в котором отстутсвуют < и > 212 ######################################## 213 sub parse_html_tag { 214 my $self = shift; 215 my $tagstr = shift; 216 217 my %struct; 218 for ( $tagstr ) { 219 s/\ *=\ */=/g; 220 $_ = __encode_quotes($_); 221 } 222 my @tag = split /\ +/, $tagstr; 223 $struct{tag} = lc(shift @tag); 224 225 foreach my $str ( @tag ) { 226 if ( $str =~ /^(.*?)=(.*)$/ ) { 227 my $attr = lc($1); 228 my $val = $2; 229 for ( $val ) { 230 s/^"//; 231 s/"$//; 232 s/ /\ /sg; 233 } 234 $struct{$attr} = $val; 235 } 236 } 237 return \%struct; 238 } 239 240 210 241 sub __encode_quotes { 211 242 my $str = shift; 212 243 my @in = split //, $str; … … 230 261 return $out; 231 262 } 232 263 264 265 sub image_replace { 266 my ($self, $img_params, $replace_struct) = @_; 267 268 my $img = $self->parse_html_tag('img '.$img_params); 269 if ( exists $replace_struct->{$img->{src}} ) { 270 my $new_image = $replace_struct->{$img->{src}}; 271 if ( ref $new_image && exists $new_image->{filename} ) { 272 $img->{src} = $new_image->{filename}; 273 } else { 274 $img->{src} = $new_image; 275 } 276 return '<img '.join(' ', map { $_.'="'.$img->{$_}.'"' } grep { $_ ne 'tag' } keys %$img).'>'; 277 } else { 278 return ''; 279 } 280 } 281 233 282 sub scheme { 234 283 my $uri = shift; 235 284 my $scheme; -
utf8/core/lib/Contenido/Parser/HTML.pm
110 110 warn "Make tree...\n" if $debug; 111 111 my ($tree, $shortcuts) = $self->__make_tree (\$content, $parse_rools, $debug); 112 112 113 $self->__extract_img ($shortcuts, $base_url, $debug); 113 $self->__extract_img ($shortcuts, $base_url, $strip_html, $debug); 114 114 $self->__extract_headers ($shortcuts, $header, $debug); 115 115 warn "Getting big texts (min=$minimum)...\n" if $debug; 116 116 my $chosen = $self->__dig_big_texts ( … … 141 141 chosen => $chosen, 142 142 header => $header, 143 143 ref $post_rools eq 'ARRAY' && @$post_rools ? (rools => $post_rools) : (), 144 debug => $debug 144 debug => $debug, 145 strip_html => $strip_html, 145 146 ); 146 147 if ( ref $parse_rools eq 'ARRAY' ) { 147 148 my ($glue) = grep { $_->{command} eq 'glue' } @$parse_rools; … … 595 596 596 597 597 598 sub __extract_img { 598 my ($self, $structure, $base_url, $debug) = @_; 599 my ($self, $structure, $base_url, $strip_html, $debug) = @_; 599 600 return unless ref $structure eq 'HASH'; 600 601 601 602 foreach my $tag ( grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) { 602 my $text = $tag->{text}; 603 while ( $text =~ /<img (.*?)>/sgi ) { 603 while ( $tag->{text} =~ /<img (.*?)\/?>/sgi ) { 604 604 # warn "Image for extract_img found [$1]. Tag ID: $tag->{id}\n"; 605 605 my $params = $1; 606 my $img = {}; 607 if ( $params =~ /src\x20*?=\x20*?["'](.*?)["']/ || $params =~ /src=([^\x20]+)/ ) { 608 $img->{url} = $1; 609 $img->{url} =~ s/[\r\t\n\ ]+$//; 610 $img->{url} =~ s/^[\r\t\n\ ]+//; 611 $img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/; 612 $img->{url} =~ s/\/+/\//sgi; 613 $img->{url} =~ s/http:\//http:\/\//sgi; 614 $img->{w} = $1 if $params =~ /width[\D]+(\d+)/; 615 $img->{h} = $1 if $params =~ /height[\D]+(\d+)/; 616 $img->{alt} = $1 if $params =~ /alt\x20*?=\x20*?["'](.*?)["']/; 606 my $img = $self->parse_html_tag('img '.$params); 607 if ( exists $img->{src} && $img->{src} ) { 608 my %img = ( src => $img->{src} ); 609 $img{url} = $img{src} =~ /^http[s]?:/ ? $img{src} : $base_url.($img{src} =~ m|^/| ? '' : '/').$img{src}; 610 $img{w} = $img->{width} if $img->{width}; 611 $img{h} = $img->{height} if $img->{height}; 612 $img{alt} = $img->{alt} if $img->{alt}; 613 $img{title} = $img->{title} if $img->{title}; 617 614 $tag->{images} = [] unless ref $tag->{images} eq 'ARRAY'; 618 push @{ $tag->{images} }, $img; 615 push @{ $tag->{images} }, \%img; 616 } 617 # if ( $params =~ /src\x20*?=\x20*?["'](.*?)["']/ || $params =~ /src=([^\x20]+)/ ) { 618 # $img->{url} = $1; 619 # $img->{url} =~ s/[\r\t\n\ ]+$//; 620 # $img->{url} =~ s/^[\r\t\n\ ]+//; 621 # $img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/; 622 # $img->{url} =~ s/\/+/\//sgi; 623 # $img->{url} =~ s/http:\//http:\/\//sgi; 624 # $img->{w} = $1 if $params =~ /width[\D]+(\d+)/; 625 # $img->{h} = $1 if $params =~ /height[\D]+(\d+)/; 626 # $img->{alt} = $1 if $params =~ /alt\x20*?=\x20*?["'](.*?)["']/; 627 # $tag->{images} = [] unless ref $tag->{images} eq 'ARRAY'; 628 # push @{ $tag->{images} }, $img; 619 629 # warn "Image for extract_img stored [$img->{url}]. Tag ID: $tag->{id}\n"; 620 } 630 # } 621 631 } 622 $text =~ s/<img (.*?)>//sgi; 623 $tag->{text} = $text; 624 $tag->{count} = length ($text); 632 $tag->{text} =~ s/<img (.*?)>//sgi if $strip_html; 633 $tag->{count} = length ($tag->{text}); 625 634 } 626 635 } 627 636 … … 716 725 s/\&\\x(\d+)//sgi; 717 726 } 718 727 push @ret, $tag; 728 # $self->log_elem($tag); 719 729 } 720 730 } 721 731 } … … 815 825 816 826 my $chosen = $opts{chosen}; 817 827 my $rooles = $opts{rools}; 818 my $header = $opts{header}; 828 my $header = $opts{header} || ''; 829 my $strip_html = $opts{strip_html}; 819 830 820 831 foreach my $unit ( @$chosen ) { 821 832 my %tags; … … 861 872 s/^(\d+):(\d+)//si; 862 873 s/^[\ \t\r\n]+//si; 863 874 } 864 if ( lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) { 875 if ( $header && lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) { 865 876 substr $unit->{text}, 0, length($header), ''; 866 877 $unit->{text} =~ s/^[\.\ \t\r\n]+//sgi; 867 878 } … … 1195 1206 } 1196 1207 1197 1208 1209 sub log_elem { 1210 my $self = shift; 1211 my $elem = shift; 1212 return unless ref $elem eq 'HASH'; 1213 1214 my %elem; 1215 map { $elem{$_} = $elem->{$_} } grep { $_ ne 'parent' } keys %$elem; 1216 warn Dumper \%elem; 1217 } 1218 1219 1198 1220 1; -
utf8/core/ports/all/libwww/GNUmakefile
5 5 6 6 #include ../../etc/perl.mk 7 7 8 PORTVERSION = 5.805 8 PORTVERSION = 5.836 9 9 DISTFILE = ${PORTNAME}-perl-${PORTVERSION}.tar.gz 10 10 PERL_MAKEMAKER = yes 11 11 MASTER_CPAN_SUBDIR = LWP