Revision 147

Date:
2011/10/06 11:02:33
Author:
ahitrov
Revision Log:
New param transform => ['crop','WxH'] in image field description
HTML parser can now understand <a href="some.image.jpg"><img src="small.image.jpg"></a> structures
HTML parser doesn't cut <a>-tags without the 'strip_html' parameter
Files:

Legend:

 
Added
 
Removed
 
Modified
  • utf8/core/lib/Contenido/File.pm

     
    214 214 if ( ref $image_info && $image_info->{file_ext} ne $ext ) {
    215 215 rename $filename_tmp.'.'.$ext, $filename_tmp.'.'.$image_info->{file_ext};
    216 216 $ext = $image_info->{file_ext};
    217 } elsif ( !ref $image_info ) {
    218 unlink $filename_tmp.'.'.$ext;
    219 return undef;
    217 220 }
    221 my $transformed;
    222 if ( exists $prop->{transform} && ref $prop->{transform} eq 'ARRAY' && scalar @{$prop->{transform}} == 2 && $prop->{transform}[0] =~ /(crop|resize|shrink)/ ) {
    223 my $c_line;
    224 if ( $prop->{transform}[0] eq 'resize' ) {
    225 $c_line = $state->{'convert_binary'}.' -resize \''.$prop->{transform}[1].'\' -quality 80 '.$filename_tmp.'.'.$ext.' '.$filename_tmp.'.transformed.'.$ext;
    226 } elsif ( $prop->{transform}[0] eq 'crop' ) {
    227 my $shave_string;
    228 my ($nwidth, $nheight) = $prop->{transform}[1] =~ /(\d+)x(\d+)/i ? ($1, $2) : (0, 0);
    229 if ( ($image_info->{width} / $image_info->{height}) > ($nwidth / $nheight) ) {
    230 my $shave_pixels = (($image_info->{width} / $image_info->{height}) - ($nwidth / $nheight)) * $image_info->{height};
    231 $shave_string = ' -shave '.int($shave_pixels / 2).'x0';
    232 } elsif ( ($image_info->{height} / $image_info->{width}) > ($nheight / $nwidth) ) {
    233 my $shave_pixels = (($image_info->{height} / $image_info->{width}) - ($nheight / $nwidth)) * $image_info->{width};
    234 $shave_string = ' -shave 0x'.int($shave_pixels / 2);
    235 }
    236 if ( $shave_string ) {
    237 my $c_line = $state->{"convert_binary"}." $shave_string $filename_tmp.$ext $filename_tmp.shaved.$ext";
    238 my $result = `$c_line`;
    239 if (length $result > 0) {
    240 print "Contenido Error: При вызове '$c_line' произошла ошибка '$result' ($@)\n";
    241 return undef;
    242 }
    243 } else {
    244 my $c_line = "cp $filename_tmp.$ext $filename_tmp.shaved.$ext";
    245 my $result = `$c_line`;
    246 if (length $result > 0) {
    247 print "Contenido Error: При вызове '$c_line' произошла ошибка '$result' ($@)\n";
    248 return undef;
    249 }
    250 }
    251 $c_line = $state->{'convert_binary'}.' -geometry \''.$prop->{transform}[1].'!\' -quality 80 '.$filename_tmp.'.shaved.'.$ext.' '.$filename_tmp.'.transformed.'.$ext;
    252 } elsif ( $prop->{transform}[0] eq 'shrink' ) {
    253 $c_line = $state->{'convert_binary'}.' -geometry \''.$prop->{transform}[1].'!\' -quality 80 '.$filename_tmp.'.'.$ext.' '.$filename_tmp.'.transformed.'.$ext;
    254 }
    255 my $result = `$c_line`;
    256 $transformed = 1;
    257 unlink $filename_tmp.'.shaved.'.$ext if -e $filename_tmp.'.shaved.'.$ext;
    258 }
    218 259
    219 260 my $IMAGE;
    220 if ( store($filename.'.'.$ext, $filename_tmp.'.'.$ext) ) {
    261 my $stored = $transformed ? store($filename.'.'.$ext, $filename_tmp.'.transformed.'.$ext) : store($filename.'.'.$ext, $filename_tmp.'.'.$ext);
    262 if ( $stored ) {
    221 263 $IMAGE = {};
    222 # hashref slice assigning - жжесть
    223 @{$IMAGE}{'filename', 'width', 'height'} = (
    224 $filename.'.'.$ext,
    225 Image::Size::imgsize($filename_tmp.'.'.$ext),
    226 );
    264 if ( $transformed && -e $filename_tmp.'.transformed.'.$ext ) {
    265 # hashref slice assigning - жжесть
    266 @{$IMAGE}{'filename', 'width', 'height'} = (
    267 $filename.'.transformed.'.$ext,
    268 Image::Size::imgsize($filename_tmp.'.'.$ext),
    269 );
    270 unlink $filename_tmp.'.transformed.'.$ext;
    271 } else {
    272 # hashref slice assigning - жжесть
    273 @{$IMAGE}{'filename', 'width', 'height'} = (
    274 $filename.'.'.$ext,
    275 Image::Size::imgsize($filename_tmp.'.'.$ext),
    276 );
    277 }
    227 278
    228 279 foreach my $suffix (@preview) {
    229 280 my $c_line = $state->{'convert_binary'}.' -geometry \''.$suffix.'\' -quality 80 '.$filename_tmp.'.'.$ext.' '.$filename_tmp.'.'.$suffix.'.'.$ext;
  • utf8/core/lib/Contenido/Parser/HTML.pm

     
    79 79 my $post_rools = $self->__parse_rools (delete $opts{parser_end});
    80 80 warn Dumper ($post_rools) if $debug;
    81 81
    82 # warn "Experimental. Debug!!!\n" if $debug;
    82 ##### Experimental things sometimes transform to things for everyday use
    83 #########################################################################
    83 84 if ( ref $pre_rools eq 'ARRAY' ) {
    84 85 my @sets = grep { $_->{command} eq 'set' } @$pre_rools;
    85 86 foreach my $set ( @sets ) {
     
    114 115 $self->__extract_headers ($shortcuts, $header, $debug);
    115 116 warn "Getting big texts (min=$minimum)...\n" if $debug;
    116 117 my $chosen = $self->__dig_big_texts (
    117 structure => $shortcuts,
    118 min => $minimum,
    118 structure => $shortcuts,
    119 min => $minimum,
    119 120 ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (),
    120 debug => $debug );
    121 strip_html => $strip_html,
    122 debug => $debug );
    121 123 unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
    122 124 $self->{error_message} = 'Nothing was found at all!!! Check your MINIMUM value';
    123 125 return $self->is_success(0) unless $gui;
     
    600 602 return unless ref $structure eq 'HASH';
    601 603
    602 604 foreach my $tag ( grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
    603 while ( $tag->{text} =~ /<img (.*?)\/?>/sgi ) {
    605 while ( $tag->{text} =~ /<img[\ \t](.*?)\/?>/sgi ) {
    604 606 # warn "Image for extract_img found [$1]. Tag ID: $tag->{id}\n";
    605 607 my $params = $1;
    606 608 my $img = $self->parse_html_tag('img '.$params);
    607 609 if ( exists $img->{src} && $img->{src} ) {
    608 610 my %img = ( src => $img->{src} );
    609 611 $img{url} = $img{src} =~ /^http[s]?:/ ? $img{src} : $base_url.($img{src} =~ m|^/| ? '' : '/').$img{src};
    612 $img{type} = 'inner';
    610 613 $img{w} = $img->{width} if $img->{width};
    611 614 $img{h} = $img->{height} if $img->{height};
    612 615 $img{alt} = $img->{alt} if $img->{alt};
     
    614 617 $tag->{images} = [] unless ref $tag->{images} eq 'ARRAY';
    615 618 push @{ $tag->{images} }, \%img;
    616 619 }
    617 # if ( $params =~ /src\x20*?=\x20*?["'](.*?)["']/ || $params =~ /src=([^\x20]+)/ ) {
    618 # $img->{url} = $1;
    619 # $img->{url} =~ s/[\r\t\n\ ]+$//;
    620 # $img->{url} =~ s/^[\r\t\n\ ]+//;
    621 # $img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/;
    622 # $img->{url} =~ s/\/+/\//sgi;
    623 # $img->{url} =~ s/http:\//http:\/\//sgi;
    624 # $img->{w} = $1 if $params =~ /width[\D]+(\d+)/;
    625 # $img->{h} = $1 if $params =~ /height[\D]+(\d+)/;
    626 # $img->{alt} = $1 if $params =~ /alt\x20*?=\x20*?["'](.*?)["']/;
    627 # $tag->{images} = [] unless ref $tag->{images} eq 'ARRAY';
    628 # push @{ $tag->{images} }, $img;
    629 # warn "Image for extract_img stored [$img->{url}]. Tag ID: $tag->{id}\n";
    630 # }
    631 620 }
    621 while ( $tag->{text} =~ /<a[\ \t](.*?)\/?>/sgi ) {
    622 my $params = $1;
    623 my $anc = $self->parse_html_tag('a '.$params);
    624 if ( exists $anc->{href} && $anc->{href} && $anc->{href} =~ /\.(jpe?g|gif|png|bmp|tiff?)$/ ) {
    625 my %img = ( src => $anc->{href} );
    626 $img{url} = $img{src} =~ /^http[s]?:/ ? $img{src} : $base_url.($img{src} =~ m|^/| ? '' : '/').$img{src};
    627 $img{type} = 'external';
    628 $img{title} = $anc->{title} if $anc->{title};
    629 $tag->{images} = [] unless ref $tag->{images} eq 'ARRAY';
    630 push @{ $tag->{images} }, \%img;
    631 }
    632 }
    633 while ( $tag->{text} =~ /<a[\ \t](.*?)\/?><img[\ \t](.*?)\/?>/sgi ) {
    634 my $aparams = $1;
    635 my $iparams = $1;
    636 my $anc = $self->parse_html_tag('a '.$aparams);
    637 my $img = $self->parse_html_tag('img '.$iparams);
    638 if ( exists $anc->{href} && $anc->{href} && $anc->{href} =~ /\.(jpe?g|gif|png|bmp|tiff?)$/ ) {
    639 my @images = grep { $_->{src} eq $img->{src} } @{ $tag->{images} };
    640 map {
    641 $_->{ext_src} = $anc->{href};
    642 $_->{ext_url} = $anc->{href} =~ /^http[s]?:/ ? $anc->{href} : $base_url.($anc->{href} =~ m|^/| ? '' : '/').$anc->{href};
    643 } @images;
    644 }
    645 }
    632 646 $tag->{text} =~ s/<img (.*?)>//sgi if $strip_html;
    633 647 $tag->{count} = length ($tag->{text});
    634 648 }
     
    658 672 my $minimum = exists $opts{min} ? $opts{min} : undef;
    659 673 my $debug = exists $opts{debug} ? $opts{debug} : undef;
    660 674 my $rools = exists $opts{rools} ? $opts{rools} : undef;
    675 my $strip_html = exists $opts{strip_html} ? $opts{strip_html} : undef;
    661 676 return unless ref $structure eq 'HASH';
    662 677
    663 678 my @rools;
     
    699 714 $text = Contenido::Parser::Util::strip_html($text);
    700 715 $tag->{text_weight} = length($text);
    701 716 if ( length($text) >= $minimum ) {
    702 for ( $tag->{text} ) {
    717 for ( $tag->{text} && $strip_html ) {
    703 718 s/<a.*?>//sgi;
    704 719 s/<\/a.*?>//sgi;
    705 720 }
     
    720 735 $tag->{text_weight} = length($text);
    721 736 if ( length($text) >= $minimum ) {
    722 737 for ( $tag->{text} ) {
    723 s/<a.*?>//sgi;
    724 s/<\/a.*?>//sgi;
    738 if ( $strip_html ) {
    739 s/<a.*?>//sgi;
    740 s/<\/a.*?>//sgi;
    741 }
    725 742 s/\&\\x(\d+)//sgi;
    726 743 }
    727 744 push @ret, $tag;