Revision 99

Date:
2011/05/11 18:09:30
Author:
ahitrov
Revision Log:
Парсер
Files:

Legend:

 
Added
 
Removed
 
Modified
  • utf8/core/lib/Contenido/Parser.pm

     
    1 package Contenido::Parser;
    2
    3 use strict;
    4 use warnings;
    5 use locale;
    6
    7 use Encode;
    8 use URI;
    9 use Data::Dumper;
    10 use Contenido::Globals;
    11 use LWP::UserAgent;
    12 use Contenido::File::Scheme::FILE;
    13 use Contenido::Parser::Util;
    14
    15 sub fetch {
    16 my ($self, $input, %opts) = @_;
    17
    18 my ($fh, $content);
    19 my $encoding = delete $opts{encoding};
    20 if (not ref $input) {
    21 no strict "refs";
    22 my $scheme = uc(scheme($input));
    23 if ( $scheme eq 'FILE' ) {
    24 $fh = &{"Contenido::File::Scheme::".uc(scheme($input))."::get_fh"}($input);
    25 } else {
    26 my $request = new HTTP::Request GET => $input;
    27 my $ua = new LWP::UserAgent;
    28 $ua->timeout(10);
    29 my $res = $ua->request($request);
    30 if ($res->is_success) {
    31 $self->{headers} = $res->headers;
    32 my $content_length = $res->headers->header('content-length');
    33 my $content_type = $res->headers->header('content-type');
    34 $self->{content_type} = $content_type;
    35 if ( $content_type =~ /charset\s*=\s*([a-z\d\-]+)/i ) {
    36 $encoding = $1;
    37 }
    38 my $base_url = $input =~ /^([a-z]+:\/\/[a-z\.\d]+)/ ? $1 : '';
    39 $self->{base_url} = $base_url if $base_url;
    40 $content = $res->content;
    41 } else {
    42 warn $res->status_line." \n";
    43 $self->{success} = 0;
    44 $self->{reason} = $res->status_line;
    45 return $self;
    46 }
    47 }
    48 } elsif ((ref $input eq "GLOB") or (ref $input eq 'Apache::Upload') or (ref $input eq 'IO::File')) {
    49 $fh = $input;
    50 } elsif (ref $input eq "SCALAR") {
    51 $fh = IO::Scalar->new($input);
    52 } else {
    53 warn("Path, scalar ref or fh needed");
    54 $self->{success} = 0;
    55 $self->{reason} = 'Path, scalar ref or fh needed';
    56 return $self;
    57 }
    58
    59 if ( ref $fh ) {
    60 $content = <$fh>;
    61 }
    62 if ( $content ) {
    63 warn Dumper($self);
    64 unless ( $encoding ) {
    65 $encoding = $self->__try_content_encoding( substr($content, 0, 350) );
    66 $self->{encoding} = $encoding;
    67 if ( $encoding && $encoding ne 'utf-8' ) {
    68 Encode::from_to($content, $encoding, 'utf-8');
    69 }
    70 }
    71 $self->{content} = $content;
    72 $self->{success} = 1;
    73 } else {
    74 $self->{success} = 0;
    75 $self->{reason} = 'Content is empty';
    76 }
    77 return $self;
    78 }
    79
    80 sub is_success {
    81 my ($self, $val) = @_;
    82
    83 if ( defined $val ) {
    84 $self->{success} = $val;
    85 return $self;
    86 } else {
    87 return $self->{success};
    88 }
    89 }
    90
    91 sub __try_content_encoding {
    92 my ($self, $input)= @_;
    93 if ( $input =~ /encoding[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) {
    94 return lc($1);
    95 } elsif ( $input =~ /(utf-8|windows-1251|koi8-r)/i ) {
    96 return lc($1);
    97 } else {
    98 return undef;
    99 }
    100 }
    101
    102 sub scheme {
    103 my $uri = shift;
    104 my $scheme;
    105
    106 $scheme = URI->new($uri)->scheme() || "file";
    107
    108 return $scheme;
    109 }
    110
    111
    112 1;
  • utf8/core/lib/Contenido/Parser/HTML.pm

     
    1 package Contenido::Parser::HTML;
    2
    3 use strict;
    4 use warnings;
    5 use locale;
    6
    7 use base 'Contenido::Parser';
    8
    9 use Contenido::Globals;
    10 use Utils::HTML;
    11 use Data::Dumper;
    12 use utf8;
    13
    14
    15 my @PICNAME = qw ( top menu topmenu home line dot mail razdel button find search srch delivery
    16 head bar label phone bottom bottommenu ico icon post left right service caption arr arrow cart
    17 basket main reply title corner address page buy pix pixel spacer fon welcome razd about back
    18 shapka phones print tel phpBB uho korz korzina raspisanie shop login blank telephone telephones
    19 dealer diler background bg news rss index none btn cards up footer noimage but link excel price
    20 mid graphic busket map girl space catalog bann headline hosting contact schedule redir email
    21 );
    22
    23 my @PICHOST = qw ( top.list.ru addweb.ru adland.ru extreme-dm.com top100.rambler.ru
    24 mypagerank.ru informer.gismeteo.ru lux-bn.com.ua link-txt.com myrating.ljseek.com c.bigmir.net
    25 );
    26
    27 my @PICURL = qw ( rorer counter count ljplus yadro spylog hotlog banner baner ban banners ban
    28 icq mirabilis adriver advertising ad adv ads adview advert weather imho awaps reklama stat cnt
    29 ipz design icons promo cycounter captcha foto_hit header random adcycle rssfeed bansrc
    30 );
    31
    32
    33 my @bad_dimensions = (
    34 { w => 120, h => 60 },
    35 { w => 468, h => 60 },
    36 { w => 120, h => 600 },
    37 { w => 88, h => 31 },
    38 );
    39
    40
    41 sub new {
    42 my ($proto) = @_;
    43 my $class = ref($proto) || $proto;
    44 my $self = {};
    45 bless $self, $class;
    46
    47 return $self;
    48 }
    49
    50
    51 sub parse {
    52 my ($self, %opts) = @_;
    53
    54 my $content;
    55 if ( $opts{content} ) {
    56 $content = delete $opts{content};
    57 delete $self->{content};
    58 } elsif ( $self->{success} || $self->{content} ) {
    59 $content = delete $self->{content};
    60 } else {
    61 $self->{success} = 0;
    62 return $self;
    63 }
    64
    65 my $base_url = delete $self->{base_url} || delete $opts{base_url};
    66 my $strip_html = delete $opts{strip_html};
    67 my $debug = $DEBUG;
    68 my $gui = delete $opts{gui};
    69 my $header = delete $opts{header};
    70 my $description = delete $opts{description};
    71 my $minimum = delete $opts{min} || length $description;
    72
    73 my $pre_rools = $self->__parse_rools (delete $opts{parser_pre});
    74 warn Dumper ($pre_rools) if $debug;
    75 my $parse_rools = $self->__parse_rools (delete $opts{parser_run});
    76 warn Dumper ($parse_rools) if $debug;
    77 my $post_rools = $self->__parse_rools (delete $opts{parser_end});
    78 warn Dumper ($post_rools) if $debug;
    79
    80 # warn "Experimental. Debug!!!\n" if $debug;
    81 if ( ref $pre_rools eq 'ARRAY' ) {
    82 my @sets = grep { $_->{command} eq 'set' } @$pre_rools;
    83 foreach my $set ( @sets ) {
    84 if ( $set->{condition}{param} eq 'min' || $set->{condition}{param} eq 'minimum' ) {
    85 my $value = $set->{condition}{value};
    86 unless ( $value =~ /\D/ ) {
    87 if ( $set->{subcommand} eq 'limit' ) {
    88 $minimum = $minimum && $minimum > int($value) ? int($value) : $minimum ? $minimum : int($value);
    89 } else {
    90 $minimum = int($value);
    91 }
    92 }
    93 }
    94 if ( $set->{condition}{param} eq 'description' && $set->{condition}{value} eq 'header' ) {
    95 $description = $header;
    96 }
    97 }
    98 }
    99 $minimum ||= 300;
    100
    101 warn "Tag cleaning...\n" if $debug;
    102 $self->__clean_tags (\$content, $pre_rools);
    103 $content =~ s/>\s+</></sgi;
    104 warn "Image cleaning...\n" if $debug;
    105 $self->__clean_img (\$content);
    106 warn "Empty div cleaning...\n" if $debug;
    107 while ( $self->__clean_empty_div (\$content) ) {}
    108 warn "Make tree...\n" if $debug;
    109 my ($tree, $shortcuts) = $self->__make_tree (\$content, $parse_rools, $debug);
    110
    111 $self->__extract_img ($shortcuts, $base_url, $debug);
    112 $self->__extract_headers ($shortcuts, $header, $debug);
    113 warn "Getting big texts (min=$minimum)...\n" if $debug;
    114 my $chosen = $self->__dig_big_texts (
    115 structure => $shortcuts,
    116 min => $minimum,
    117 ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (),
    118 debug => $debug );
    119 unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
    120 $self->{error_message} = 'Nothing was found at all!!! Check your MINIMUM value';
    121 return $self->is_success(0) unless $gui;
    122 }
    123 @$chosen = sort { $a->{id} <=> $b->{id} } @$chosen;
    124 if ( $description ) {
    125 my @use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$parse_rools if ref $parse_rools eq 'ARRAY';
    126 $chosen = $self->__check_description ($chosen, $description, $debug) unless @use_rools;
    127 }
    128 unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
    129 $self->{error_message} = 'I didn\'t find any valuable text';
    130 return $self->is_success(0) unless $gui;
    131 }
    132 if ( scalar @$chosen > 1 ) {
    133 $chosen = $self->__check_headers ($chosen, $header, $debug);
    134 }
    135 unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
    136 $self->{error_message} = 'I didn\'t find any valuable text';
    137 return $self->is_success(0) unless $gui;
    138 }
    139 $self->__strip_html (
    140 chosen => $chosen,
    141 header => $header,
    142 ref $post_rools eq 'ARRAY' && @$post_rools ? (rools => $post_rools) : (),
    143 debug => $debug
    144 );
    145 if ( ref $parse_rools eq 'ARRAY' ) {
    146 my ($glue) = grep { $_->{command} eq 'glue' } @$parse_rools;
    147 $self->__glue ( $chosen, $glue, $debug ) if ref $glue;
    148 }
    149 my $images = $self->__get_images (
    150 structure => $shortcuts,
    151 chosen => $chosen->[0],
    152 base_url => $base_url,
    153 ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (),
    154 debug => $debug,
    155 );
    156 if ( ref $images eq 'ARRAY' && @$images ) {
    157 $self->images($images);
    158 $self->image($images->[0]);
    159 }
    160
    161 if ( $gui ) {
    162 if ( ref $chosen eq 'ARRAY' ) {
    163 foreach my $elem ( @$chosen ) {
    164 $self->__post_rool ($elem, $post_rools, $description);
    165 }
    166 }
    167 $self->{text} = ref $chosen eq 'ARRAY' ? $chosen->[0] : $chosen;
    168 $self->{html} = $content;
    169 $self->{tree} = $shortcuts;
    170 $self->{chosen} = $chosen;
    171 } else {
    172 $self->__post_rool ($chosen->[0], $post_rools, $description);
    173 $self->{text} = Contenido::Parser::Util::text_cleanup($chosen->[0]->{text});
    174 $tree = undef;
    175 foreach my $key ( keys %$shortcuts ) {
    176 delete $shortcuts->{$key};
    177 }
    178 $shortcuts = undef;
    179 $content = undef;
    180 }
    181 return $self->is_success(1);
    182 }
    183
    184 sub __clean_tags {
    185 my ($self, $content, $rools) = @_;
    186
    187 my @cut_rools;
    188 if ( ref $rools eq 'ARRAY' && @$rools) {
    189 @cut_rools = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'cut' } @$rools;
    190 }
    191 my @clean_off_rools;
    192 if ( ref $rools eq 'ARRAY' && @$rools) {
    193 @clean_off_rools = grep { $_->{command} eq 'clean' && $_->{subcommand} eq 'off' } @$rools;
    194 }
    195 $$content =~ s/<!DOCTYPE(.*?)>//sgi;
    196 $$content =~ s/<!--(.*?)-->//sgi;
    197 $$content =~ s/<script(.*?)<\/script>//sgi;
    198 $$content =~ s/<hr(.*?)>//sgi;
    199 $$content =~ s/<noscript(.*?)<\/noscript>//sgi;
    200 $$content =~ s/<iframe(.*?)<\/iframe>//sgi;
    201 unless ( grep { $_->{condition}{param} eq 'tag' && $_->{condition}{value} eq 'noindex' } @cut_rools ) {
    202 $$content =~ s/<noindex(.*?)<\/noindex>//sgi;
    203 } else {
    204 $$content =~ s/<\/?noindex(.*?)>//sgi;
    205 }
    206 $$content =~ s/<object(.*?)<\/object>//sgi;
    207 $$content =~ s/<embed(.*?)<\/embed>//sgi;
    208 $$content =~ s/<style(.*?)<\/style>//sgi;
    209 if ( grep { $_->{condition}{param} eq 'tag' && $_->{condition}{value} eq 'form' } @cut_rools ) {
    210 $$content =~ s/<select(.*?)<\/select([^>]*?)>//sgi;
    211 $$content =~ s/<textarea(.*?)<\/textarea([^>]*?)>//sgi;
    212 $$content =~ s/<input([^>]*?)>//sgi;
    213 } else {
    214 $$content =~ s/<form(.*?)<\/form>//sgi;
    215 }
    216 foreach my $rool ( @clean_off_rools ) {
    217 next unless $rool->{condition}{param} eq 'tag';
    218 my $tag = $rool->{condition}{value};
    219 $$content =~ s/<$tag(.*?)<\/$tag>//sgi;
    220 }
    221 $$content =~ s/<head(.*?)<\/head>//sgi;
    222 $$content =~ s/\ style="(.*?)"//sgi;
    223
    224 $$content =~ s/<\/?span(.*?)>//sgi;
    225 $$content =~ s/<\/?font(.*?)>//sgi;
    226 $$content =~ s/<br(.*?)>/\n/sgi;
    227 $$content =~ s/<link(.*?)>//sgi;
    228 $$content =~ s/<spacer(.*?)>//sgi;
    229 $$content =~ s/<\!\?(.*?)\?>//sgi;
    230 # $$content =~ s/<a\s*?(.*?)>/\n/sgi;
    231 $$content =~ s/<\/p\s*>//sgi;
    232 # $$content =~ s/<\/a\s*>//sgi;
    233 $$content =~ s/<p\s*(.*?)>/\n\n/sgi;
    234 $$content =~ s/onclick="(.*?)"//sgi;
    235 $$content =~ s/onload="(.*?)"//sgi;
    236
    237 }
    238
    239 sub __clean_img {
    240 my ($self, $content) = @_;
    241
    242 my @garbage;
    243 my $i = 1;
    244
    245 while ( $$content =~ /(<img.*?>)/sgi ) {
    246 my $img = $1;
    247 my $src;
    248 if ( $img =~ /src=([^\x20|^>]+)/i ) {
    249 $src = $1;
    250 }
    251 my ($w, $h);
    252 if ( $img =~ /width\s*=\s*["'](\d+)/i || $img =~ /width\s*=\s*(\d+)/i ) {
    253 $w = $1;
    254 }
    255 if ( $img =~ /height\s*=\s*["'](\d+)/i || $img =~ /height\s*=\s*(\d+)/i ) {
    256 $h = $1;
    257 }
    258 my $delim = 0;
    259 if ( $w && $h ) {
    260 foreach my $pair ( @bad_dimensions ) {
    261 if ($w == $pair->{w} && $h == $pair->{h}) {
    262 $delim = 10;
    263 last;
    264 }
    265 }
    266 $delim = ( $w >= $h ? $w : $h ) / ( $w >= $h ? $h : $w ) unless $delim;
    267 }
    268 my $bad_name = __check_img_name ( $src );
    269 if ( $bad_name || ($w && $w < 80) || ($h && $h < 80) || ( $w && $h && ($delim > 2.5) ) ) {
    270 # warn "Bad name: [$src]\n";
    271 push @garbage, $src;
    272 }
    273 }
    274
    275 foreach my $src (@garbage) {
    276 $src =~ s|([*+?()/\\\$\[\]])|\\$1|sg;
    277 $$content =~ s/<img([^>]*?)src=$src([^>]+)>//si;
    278 }
    279 }
    280
    281
    282 sub __check_img_name {
    283 my $name = shift;
    284 my $test = $1 if $name =~ /\/([^\/]+)$/;
    285 if ( $test =~ /\d+[x-]\d+/ || $test =~ /\.gif$/i ) {
    286 return 1;
    287 }
    288 foreach my $word ( @PICNAME ) {
    289 if ( $test =~ /^$word/si || $test =~ /[^a-z]$word[^a-z]/si ) {
    290 return 1;
    291 }
    292 }
    293 foreach my $word ( @PICURL ) {
    294 if ( $name =~ /^$word/si || $name =~ /[^a-z]$word[^a-z]/si ) {
    295 return 1;
    296 }
    297 }
    298 foreach my $word ( @PICHOST ) {
    299 if ( index (lc($name), $word) >= 0 ) {
    300 return 1;
    301 }
    302 }
    303 return 0;
    304 }
    305
    306
    307 sub __clean_empty_div {
    308 my ($self, $content) = @_;
    309
    310 my $i = 0;
    311 while ( $$content =~ s/(<div[^>]*?><\/div\s*>)//sgi ) {
    312 $i++;
    313 }
    314
    315 return $i;
    316 }
    317
    318
    319 sub __make_tree {
    320 my ($self, $content, $rools, $debug) = @_;
    321
    322 my @elems = split (//,$$content);
    323 my @collaborate;
    324 if ( ref $rools eq 'ARRAY' && @$rools) {
    325 @collaborate = grep { $_->{command} eq 'collaborate' } @$rools;
    326 }
    327 my %hierarchy = ( div => 0, td => 1, tr => 2, table => 3, body => 4, html => 5 );
    328 my $id = 1;
    329 my $level = 0;
    330 my %tree = (
    331 root => {
    332 id => $id++,
    333 text => '',
    334 type => 'root',
    335 children=> [],
    336 parent => undef,
    337 level => $level,
    338 },
    339 );
    340 my @stack;
    341 my %elem_hash = ( 1 => $tree{root} );
    342 my $current = $tree{root};
    343 my $previous;
    344
    345 while ( @elems ) {
    346 if ($elems[0] =~ /[\ \t]/ && !$current){
    347 shift @elems;
    348 next;
    349 }
    350 if ( $elems[0] eq '<' && $elems[1] =~ /[a-zA-Z]/ ) {
    351 my $tag = $self->__try_tag (\@elems);
    352 if ( ref $tag && $tag->{type} eq 'text' ) {
    353 my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef;
    354 if ( ref $last_text_tag ) {
    355 $last_text_tag->{text} .= $tag->{content};
    356 $last_text_tag->{count} += $tag->{count};
    357 } else {
    358 $last_text_tag = $tag;
    359 $last_text_tag->{id} = $id++;
    360 $last_text_tag->{type} = 'text';
    361 $last_text_tag->{parent} = $current;
    362 $last_text_tag->{level} = $level+1;
    363 $elem_hash{$last_text_tag->{id}} = $last_text_tag;
    364 push @{$current->{children}}, $last_text_tag;
    365 $current->{text_count}++;
    366 }
    367 $current->{text_value} += $tag->{count};
    368 splice @elems, 0, $tag->{count};
    369 } elsif ( ref $tag ) {
    370 if ( ($current->{type} eq 'td' || $current->{type} eq 'tr' ) && $tag->{type} eq 'tr' ) {
    371 # warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback till table begin... !!!!\n" if $debug;
    372 do {
    373 $current = pop @stack;
    374 $level = $current->{level};
    375 # warn "New current type: /$current->{type}. ID = $current->{id}. Level: $level. Stack depth: ".scalar(@stack)."\n";
    376 } while ( ($current->{type} !~ /table|body|html/) && scalar @stack );
    377
    378 }
    379 if ( $current->{type} eq 'table' && $tag->{type} eq 'table' ) {
    380 # warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback, previous table(s) will forced to be closed... !!!!\n" if $debug;
    381 do {
    382 $current = pop @stack;
    383 $level = $current->{level};
    384 # warn "New current type: /$current->{type}. ID = $current->{id}. Level: $level. Stack depth: ".scalar(@stack)."\n";
    385 } while ( ($current->{type} eq "table") && scalar @stack );
    386
    387 }
    388 $tag->{id} = $id++;
    389 $tag->{children} = [];
    390 $tag->{text_count} = 0;
    391 $tag->{parent} = $current;
    392 $tag->{level} = ++$level;
    393 $elem_hash{$tag->{id}} = $tag;
    394 push @stack, $current;
    395 warn "Open type: $tag->{type}. ID=[$tag->{id}]. Name: ".($tag->{params}{name}||'').". Class: ".($tag->{params}{class}||'').". Level: $tag->{level}. Stack depth: ".scalar(@stack)."\n";
    396 $current = $tag;
    397 splice @elems, 0, $tag->{count};
    398 } else {
    399 # warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug;
    400 last;
    401 }
    402 } elsif ( $elems[0] eq '<' && $elems[1] =~ /\// ) {
    403 my $tag = $self->__try_end (\@elems);
    404 if ( ref $tag && $tag->{type} eq 'text' ) {
    405 my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef;
    406 if ( ref $last_text_tag ) {
    407 $last_text_tag->{text} .= $tag->{content};
    408 $last_text_tag->{count} += $tag->{count};
    409 } else {
    410 $last_text_tag = $tag;
    411 $last_text_tag->{id} = $id++;
    412 $last_text_tag->{type} = 'text';
    413 $last_text_tag->{parent} = $current;
    414 $last_text_tag->{level} = $level+1;
    415 $elem_hash{$last_text_tag->{id}} = $last_text_tag;
    416 push @{$current->{children}}, $last_text_tag;
    417 $current->{text_count}++;
    418 }
    419 $current->{text_value} += $tag->{count};
    420 splice @elems, 0, $tag->{count};
    421 } elsif ( ref $tag ) {
    422 if ( $current->{type} ne $tag->{type} ) {
    423 # warn "!!!!Wrong tag type for closing. It's [$tag->{type}]. It must be [$current->{type}]!!!!\n" if $debug;
    424 # warn "Current ID: [$current->{id}]. Text place: [".substr($current->{text}, 0, 20)."]\n";
    425 if ( $hierarchy{$tag->{type}} > $hierarchy{$current->{type}} ) {
    426 do {
    427 $current = pop @stack;
    428 $level = $current->{level};
    429 # warn "New current type: /$current->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n";
    430 } while ( ($current->{type} ne $tag->{type}) && scalar @stack );
    431 $current = pop @stack;
    432 $level = $current->{level};
    433 # warn "Close !the right! type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n" if $debug;
    434 }else{
    435 # warn "Passing by: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n" if $debug;
    436 }
    437 } else {
    438 if ( @collaborate ) {
    439 if ( defined $previous && (grep { $current->{type} eq $_->{condition} } @collaborate)
    440 && $previous->{type} eq $current->{type} && $previous->{level} == $current->{level}
    441 && $previous->{text} && $current->{text} ) {
    442 $previous->{text} .= ' '.$current->{text};
    443 my $parent = $current->{parent};
    444 splice @{$parent->{children}}, -1, 1;
    445 delete $elem_hash{$current->{id}};
    446 $current = undef;
    447 } elsif ( !defined $previous && $current->{text} ) {
    448 $previous = $current;
    449 } else {
    450 $previous = undef;
    451 }
    452 }
    453 $current = pop @stack;
    454 $level = $current->{level};
    455 # warn "Text place: [".substr($current->{text}, 0, 20)."]\n" if exists $current->{text};
    456 # warn "Close type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n";
    457 }
    458 splice @elems, 0, $tag->{count};
    459 } else {
    460 # warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug;
    461 last;
    462 }
    463 } else {
    464 my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef;
    465 if ( ref $last_text_tag ) {
    466 $last_text_tag->{text} .= shift @elems;
    467 $last_text_tag->{count}++;
    468 } else {
    469 $last_text_tag->{text} = shift @elems;
    470 $last_text_tag->{count} = 1;
    471 $last_text_tag->{id} = $id++;
    472 $last_text_tag->{type} = 'text';
    473 $last_text_tag->{parent} = $current;
    474 $last_text_tag->{level} = $level+1;
    475 $elem_hash{$last_text_tag->{id}} = $last_text_tag;
    476 push @{$current->{children}}, $last_text_tag;
    477 $current->{text_count}++;
    478 $current->{text_value} = 0;
    479 }
    480 $current->{text_value}++;
    481 }
    482 }
    483 return (\%tree, \%elem_hash);
    484 }
    485
    486
    487 sub __try_tag {
    488 my ($self, $content) = @_;
    489
    490 my $i = 1;
    491 my %tag;
    492 my $tag = $content->[0];
    493 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) {
    494 $tag .= $content->[$i];
    495 $i++;
    496 }
    497 if ( $content->[$i] eq '<' || $i >= scalar @$content ) {
    498 return {
    499 type => 'text',
    500 content => $tag,
    501 count => $i,
    502 };
    503 } else {
    504 if ( $tag =~ /^<(div|table|tr|td|body|html)\s*(.*)/i ) {
    505 my $val = $1;
    506 if ( $tag =~ /^<($val)\s*(.*)/i ) {
    507 $tag{type} = lc($1);
    508 my $args = $2;
    509 $tag{count} = $i+1;
    510 my %args;
    511 while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?"([^"]+)"/g ) {
    512 $args{lc($1)} = $2;
    513 }
    514 while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?'([^']+)'/g ) {
    515 $args{lc($1)} = $2;
    516 }
    517 while ( $tag =~ /([a-zA-z]+)=(\w+)/g ) {
    518 $args{lc($1)} = $2;
    519 }
    520 foreach my $arg ( qw( name id class width align ) ) {
    521 $tag{params}{$arg} = $args{$arg} if exists $args{$arg};
    522 }
    523 return \%tag;
    524 } else {
    525 return {
    526 type => 'text',
    527 content => $tag,
    528 count => $i,
    529 };
    530 }
    531 } else {
    532 return {
    533 type => 'text',
    534 content => $tag,
    535 count => $i,
    536 };
    537 }
    538 }
    539 }
    540
    541 sub __try_end {
    542 my ($self, $content) = @_;
    543
    544 my $i = 2;
    545 my %tag;
    546 my $tag = $content->[0].$content->[1];
    547 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) {
    548 $tag .= $content->[$i];
    549 $i++;
    550 }
    551 if ( $content->[$i] eq '<' || $i >= scalar @$content ) {
    552 return {
    553 type => 'text',
    554 content => $tag,
    555 count => $i,
    556 };
    557 } else {
    558 if ( $tag =~ /^<\/(div|table|tr|td|body|html)/i ) {
    559 my $val = $1;
    560 if ( $tag =~ /^<\/($val)\s*?$/i ) {
    561 $tag{type} = lc($1);
    562 $tag{count} = $i+1;
    563 return \%tag;
    564 } else {
    565 return {
    566 type => 'text',
    567 content => $tag,
    568 count => $i,
    569 };
    570 }
    571 } else {
    572 return {
    573 type => 'text',
    574 content => $tag,
    575 count => $i,
    576 };
    577 }
    578 }
    579 }
    580
    581
    582 sub __extract_img {
    583 my ($self, $structure, $base_url, $debug) = @_;
    584 return unless ref $structure eq 'HASH';
    585
    586 foreach my $tag ( values %$structure ) {
    587 next unless ref $tag && exists $tag->{text} && $tag->{text};
    588 my $text = $tag->{text};
    589 while ( $text =~ /<img (.*?)>/sgi ) {
    590 # warn "Image for extract_img found [$1]. Tag ID: $tag->{id}\n";
    591 my $params = $1;
    592 my $img = {};
    593 if ( $params =~ /src\x20*?=\x20*?["'](.*?)["']/ || $params =~ /src=([^\x20]+)/ ) {
    594 $img->{url} = $1;
    595 $img->{url} =~ s/[\r\t\n\x20]+$//;
    596 $img->{url} =~ s/^[\r\t\n\x20]+//;
    597 $img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/;
    598 $img->{url} =~ s/\/+/\//sgi;
    599 $img->{url} =~ s/http:\//http:\/\//sgi;
    600 $img->{w} = $1 if $params =~ /width[\D]+(\d+)/;
    601 $img->{h} = $1 if $params =~ /height[\D]+(\d+)/;
    602 $img->{alt} = $1 if $params =~ /alt\x20*?=\x20*?["'](.*?)["']/;
    603 $tag->{images} = [] unless ref $tag->{images} eq 'ARRAY';
    604 push @{ $tag->{images} }, $img;
    605 # warn "Image for extract_img stored [$img->{url}]. Tag ID: $tag->{id}\n";
    606 }
    607 }
    608 $text =~ s/<img (.*?)>//sgi;
    609 $tag->{text} = $text;
    610 $tag->{count} = length ($text);
    611 }
    612 }
    613
    614
    615 sub __extract_headers {
    616 my ($self, $structure, $debug) = @_;
    617 return unless ref $structure eq 'HASH';
    618
    619 foreach my $tag ( values %$structure ) {
    620 next unless ref $tag && exists $tag->{text} && $tag->{text};
    621 my $text = $tag->{text};
    622 while ( $text =~ /<h([\d])[^>]*?>([^<]+)<\/h[\d]>/sgi ) {
    623 my $header_level = $1;
    624 my $header_text = $2;
    625 $tag->{headers} = [] unless ref $tag->{headers} eq 'ARRAY';
    626 push @{ $tag->{headers} }, { level => $header_level, text => $header_text };
    627 }
    628 }
    629
    630 }
    631
    632
    633 sub __dig_big_texts {
    634 my ($self, %opts) = @_;
    635 my $structure = exists $opts{structure} ? $opts{structure} : undef;
    636 my $minimum = exists $opts{min} ? $opts{min} : undef;
    637 my $debug = exists $opts{debug} ? $opts{debug} : undef;
    638 my $rools = exists $opts{rools} ? $opts{rools} : undef;
    639 return unless ref $structure eq 'HASH';
    640
    641 my @rools;
    642 if ( ref $rools eq 'ARRAY' && @$rools) {
    643 @rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$rools;
    644 }
    645 my @exclude_rools;
    646 if ( ref $rools eq 'ARRAY' && @$rools) {
    647 @exclude_rools = grep { $_->{command} eq 'exclude' && $_->{subcommand} eq 'element' } @$rools;
    648 }
    649
    650 my @ret;
    651 foreach my $tid ( sort { $a <=> $b } keys %$structure ) {
    652 my $tag = $structure->{$tid};
    653 next unless ref $tag && exists $tag->{text} && $tag->{text};
    654 next if $self->__exclude_rools($tag, \@exclude_rools);
    655
    656 if ( @rools ) {
    657 my $choose = 0;
    658 foreach my $rool ( @rools ) {
    659 my $matched = 1;
    660 foreach my $cond ( @{$rool->{condition}} ) {
    661 unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) {
    662 $matched = 0;
    663 }
    664 }
    665 $choose ||= $matched;
    666 }
    667 if ( $choose ) {
    668 $tag->{text} =~ s/^[\t\ \n\r]+//s;
    669 $tag->{text} =~ s/[\t\ \n\r]+$//s;
    670 $tag->{text} =~ s/[\t\ ]+/\ /sg;
    671 $tag->{text} =~ s/\r//sg;
    672 $tag->{text} =~ s/\n{2,}/\n\n/sg;
    673 $tag->{text} =~ s/<a.*?>//sgi;
    674 $tag->{text} =~ s/<\/a.*?>//sgi;
    675 $tag->{text} =~ s/\&\\x(\d+)//sgi;
    676
    677 my $text = $tag->{text};
    678 $text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi;
    679 # $text = Contenido::Parser::Util::strip_html($text);
    680 if ( length($text) >= $minimum ) {
    681 push @ret, $tag;
    682 }
    683 }
    684 } else {
    685 my $text = $tag->{text};
    686 $text =~ s/^[\t\ \n\r]+//s;
    687 $text =~ s/[\t\ \n\r]+$//s;
    688 $text =~ s/[\t\ ]+/\ /sg;
    689 $text =~ s/\r//sg;
    690 $text =~ s/\n{2,}/\n\n/sg;
    691 # It does wrong job:
    692 # $text =~ s/([,!\?])(\S)/$1\ $2/sg;
    693 $tag->{text} = $text;
    694 $text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi;
    695 # $text = Contenido::Parser::Util::strip_html($text);
    696 if ( length($text) >= $minimum ) {
    697 $tag->{text} =~ s/<a.*?>//sgi;
    698 $tag->{text} =~ s/<\/a.*?>//sgi;
    699 $tag->{text} =~ s/\&\\x(\d+)//sgi;
    700 push @ret, $tag;
    701 }
    702 }
    703 }
    704 unless ( @ret ) {
    705 warn "Nothing was found at all!!! Check your ROOLS or MINIMUM value" if $debug;
    706 }
    707 return \@ret;
    708 }
    709
    710
    711
    712 sub __check_headers {
    713 my ($self, $chosen, $header, $debug) = @_;
    714 return unless ref $chosen eq 'ARRAY';
    715
    716 unless ( grep { exists $_->{headers} } @$chosen ) {
    717 warn "No headers found\n" if $debug;
    718 return $chosen;
    719 } else {
    720 # @$chosen = grep { exists $_->{headers} } @$chosen;
    721 }
    722 my @ret;
    723 foreach my $unit ( @$chosen ) {
    724 unless ( exists $unit->{headers} && ref $unit->{headers} eq 'ARRAY' ) {
    725 $unit->{header_identity} = 0;
    726 $unit->{header_min_level} = 32768;
    727 next;
    728 }
    729 my @headers = sort { $a->{level} <=> $b->{level} } @{$unit->{headers}};
    730 my $min_level = $headers[0]->{level};
    731 $unit->{header_min_level} = $min_level;
    732 if ( $header ) {
    733 my $coeff = $self->__str_compare( $header, $headers[0]->{text} );
    734 $unit->{header_identity} = $coeff;
    735 }
    736 }
    737 # @ret = sort { $a->{header_min_level} <=> $b->{header_min_level} } @$chosen;
    738 # return \@ret;
    739 return $chosen;
    740 }
    741
    742
    743
    744 sub __check_description {
    745 my ($self, $chosen, $desc, $debug) = @_;
    746 return unless ref $chosen eq 'ARRAY' && $desc;
    747
    748 my @ret;
    749 foreach my $unit ( @$chosen ) {
    750 if ( $desc ) {
    751 my $coeff = $self->__str_compare( $unit->{text}, $desc );
    752 warn "Coeff: [$coeff] to: [$unit->{text}]\n" if $debug;
    753 $unit->{description_identity} = $coeff;
    754 }
    755 }
    756 @ret = sort { $b->{description_identity} <=> $a->{description_identity} } grep { $_->{description_identity} > -0.9 } @$chosen;
    757 return \@ret;
    758 }
    759
    760
    761 # wtf, bastards! how come my code's used here? --ra
    762 # damn, it's not 100% your code already
    763 sub __str_compare {
    764 my ($self, $original, $applicant) = @_;
    765
    766 my $Al = __freq_list($original);
    767 return -1 unless defined $Al;
    768 my $Bl = __freq_list($applicant);
    769 return -1 unless defined $Bl;
    770 my $df = 0;
    771
    772 foreach my $word ( %$Bl ) {
    773 if ( exists $Al->{$word} ) {
    774 $df += $Al->{$word}
    775 } else {
    776 $df -= $Bl->{$word}
    777 }
    778 }
    779
    780 return $df;
    781 }
    782
    783 sub __freq_list {
    784
    785 my @d = grep { length($_) > 3 } split /\W/, $_[0];
    786 return undef unless @d;
    787 my $z = 1/scalar(@d); my %l = ();
    788 $l{$_} += $z for @d; \%l;
    789 }
    790
    791
    792
    793 sub __strip_html {
    794 my ($self, %opts) = @_;
    795 return unless ref $opts{chosen} eq 'ARRAY';
    796
    797 my $chosen = $opts{chosen};
    798 my $rooles = $opts{rools};
    799 my $header = $opts{header};
    800
    801 foreach my $unit ( @$chosen ) {
    802 my %tags;
    803 my $headers = $unit->{headers} if exists $unit->{headers};
    804 if ( ref $headers && ref $rooles eq 'ARRAY' && grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' } @$rooles ) {
    805 if ( grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' && $_->{condition}{value} eq 'all' } @$rooles ) {
    806 $unit->{text} =~ s/<h(\d)[^>]*>(.*?)<\/h(\d)[^>]*>/\n/sgi;
    807 $unit->{text} =~ s/^[\x20\t\r\n]+//si;
    808 } elsif ( grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' && $_->{condition}{value} eq 'leading' } @$rooles ) {
    809 while ( $unit->{text} =~ /^<h(\d)[^>]*>(.*?)<\/h(\d)[^>]*>/si ) {
    810 my $hdr = 'h'.$1;
    811 $unit->{text} =~ s/^<$hdr[^>]*>(.*?)<\/$hdr[^>]*>//si;
    812 }
    813 }
    814 }
    815 for ( $unit->{text} ) {
    816 s/></> </sg;
    817 s/([\!\?:.])\s*?<\/h(\d+)(.*?)>/$1 /sgi;
    818 s/<\/h(\d+)(.*?)>/\. /sgi;
    819 s/<h(\d+)(.*?)>/\n/sgi;
    820 s/&#38;/\&/sg;
    821 s/&amp;/\&/sgi;
    822 s/&#171;/«/sg;
    823 s/&#187;/»/sg;
    824 s/&#163;/£/sg;
    825 s/&#150;/&ndash;/sg;
    826 s/&#151;/&mdash;/sg;
    827 s/&#133;/\.\.\./sg;
    828 s/&#132;/"/sg;
    829 s/&#147;/"/sg;
    830 s/&#148;/"/sg;
    831 s/&#180;/'/sg;
    832 s/&#13;/\n/sg;
    833 s/&#34;/"/sg;
    834 s/&nbsp;/\ /sgi;
    835 }
    836 # $unit->{text} = HTML::Entities::decode_entities($unit->{text});
    837 # $unit->{text} = Contenido::Parser::Util::strip_html($unit->{text});
    838 for ( $unit->{text} ) {
    839 s/^[\x20\t\r\n]+//si;
    840 s/^(\d+)\.(\d+)\.(\d+)//si;
    841 s/^[\x20\t\r\n]+//si;
    842 s/^(\d+):(\d+)//si;
    843 s/^[\x20\t\r\n]+//si;
    844 }
    845 if ( lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) {
    846 substr $unit->{text}, 0, length($header), '';
    847 $unit->{text} =~ s/^[\.\x20\t\r\n]+//sgi;
    848 }
    849 $unit->{text} =~ s/[\x20\t\r\n]+$//sgi;
    850 }
    851 }
    852
    853
    854
    855 sub __glue {
    856 my ($self, $chosen, $glue, $debug) = @_;
    857 return unless ref $chosen eq 'ARRAY';
    858
    859 my $i = 0;
    860
    861 if ( $glue->{subcommand} eq 'first' || $glue->{subcommand} eq 'all' ) {
    862 my $count = exists $glue->{subcommand} && $glue->{subcommand} eq 'first' ? $glue->{condition} : 32768;
    863 foreach my $unit ( @$chosen ) {
    864 next unless $i++;
    865 if ( $i <= $count ) {
    866 $chosen->[0]->{text} .= "\n\n".$chosen->[$i-1]->{text};
    867 }
    868 }
    869 } elsif ( $glue->{subcommand} eq 'order' && ref $glue->{condition} eq 'ARRAY' ) {
    870 my $text = '';
    871 my $i = 0;
    872 foreach my $pos ( @{ $glue->{condition} } ) {
    873 $text .= ($i++ ? "\n\n" : '').$chosen->[$pos-1]->{text};
    874 }
    875 $chosen->[0]->{text} = $text;
    876 }
    877 }
    878
    879
    880 sub __get_images {
    881 my ($self, %opts) = @_;
    882 my $structure = exists $opts{structure} ? $opts{structure} : undef;
    883 my $chosen = exists $opts{chosen} ? $opts{chosen} : undef;
    884 my $debug = exists $opts{debug} ? $opts{debug} : undef;
    885 my $rools = exists $opts{rools} ? $opts{rools} : undef;
    886 my $base_url = delete $opts{base_url};
    887 return unless ref $chosen && ref $structure;
    888
    889 return if ref $rools eq 'ARRAY' && grep { $_->{command} eq 'image_off' } @$rools;
    890 my @use_rools;
    891 my @exclude_rools;
    892 my $no_validation = 0;
    893 if ( ref $rools eq 'ARRAY' && @$rools) {
    894 @use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'image' } @$rools;
    895 @exclude_rools = grep { $_->{command} eq 'exclude' && $_->{subcommand} eq 'image' } @$rools;
    896 $no_validation = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'validate' && $_->{condition}{param} eq 'image' } @$rools;
    897 }
    898 my $image_depth;
    899 if ( ref $rools eq 'ARRAY' && @$rools) {
    900 my @rools = grep { $_->{command} eq 'image' && $_->{subcommand} eq 'depth' } @$rools;
    901 $image_depth = $rools[-1]->{condition} if @rools;
    902 }
    903
    904 my @images;
    905 foreach my $tag ( values %$structure ) {
    906 next unless exists $tag->{images} && ref $tag->{images} eq 'ARRAY';
    907 next if $self->__exclude_rools($tag, \@exclude_rools);
    908
    909 if ( @use_rools ) {
    910 my $choose = 0;
    911 foreach my $rool ( @use_rools ) {
    912 my $matched = 1;
    913 foreach my $cond ( @{$rool->{condition}} ) {
    914 unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) {
    915 $matched = 0;
    916 }
    917 }
    918 $matched = 0 if $self->__exclude_rools($tag, \@exclude_rools);
    919 $choose ||= $matched;
    920 }
    921 if ( $choose ) {
    922 my @img = grep { $no_validation || $self->__img_is_valid ($_) } map {
    923 my $img = rchannel::Image->new($_);
    924 $img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/;
    925 $img;
    926 } map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} };
    927
    928 push @images, @img;
    929 }
    930 } else {
    931 next if ($tag->{level}+1) < $chosen->{level};
    932 next if $image_depth && ( $tag->{level} > ($chosen->{level} + $image_depth) );
    933
    934 my $ok = 0;
    935 my $uphops = $tag->{level} > $chosen->{level} ? 1 : 2;
    936 my $hops = $image_depth ? $image_depth : $tag->{level} - $chosen->{level} + $uphops;
    937 next if ($hops - $uphops) > 4;
    938 my @img_parents = ($tag->{id});
    939 my $parent = $tag;
    940 for ( 1..$hops ) {
    941 $parent = $parent->{parent};
    942 push @img_parents, $parent->{id};
    943 }
    944 $parent = $chosen;;
    945 for ( 0..$uphops ) {
    946 if ( grep { $parent->{id} == $_ } @img_parents ) {
    947 $ok = 1;
    948 last;
    949 }
    950 $parent = $parent->{parent};
    951 }
    952 if ( $ok ) {
    953 my @img = grep { $self->__img_is_valid ($_) } map {
    954 my $img = rchannel::Image->new($_);
    955 $img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/;
    956 $img;
    957 } map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} };
    958
    959 push @images, @img;
    960 }
    961 }
    962 }
    963 # warn Dumper (\@images);
    964 if ( @images ) {
    965 return \@images;
    966 } else {
    967 return undef;
    968 }
    969 }
    970
    971
    972 sub __img_is_valid {
    973 my ($self, $img) = @_;
    974
    975 # return 1;
    976 if ( $img->check_online ) {
    977 my $delim = 0;
    978 my $w = $img->width;
    979 my $h = $img->height;
    980 if ( $w && $h ) {
    981 foreach my $pair ( @bad_dimensions ) {
    982 if ($w == $pair->{w} && $h == $pair->{h}) {
    983 return undef;
    984 }
    985 }
    986 $delim = ( $w >= $h ? $w : $h ) / ( $w >= $h ? $h : $w ) unless $delim;
    987 if ( $w < 80 || $h < 80 || $delim > 2.5 ) {
    988 return undef;
    989 }
    990 }
    991 } else {
    992 # warn "Image ".$img->src." not found on server";
    993 return undef;
    994 }
    995 return 1;
    996 }
    997
    998
    999 sub __exclude_rools {
    1000 my ($self, $tag, $rools) = @_;
    1001 return undef unless ref $rools eq 'ARRAY' && @$rools;
    1002
    1003 my $choose = 0;
    1004 foreach my $rool ( @$rools ) {
    1005 my $matched = 1;
    1006 foreach my $cond ( @{$rool->{condition}} ) {
    1007 unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) {
    1008 $matched = 0;
    1009 }
    1010 }
    1011 $choose ||= $matched;
    1012 }
    1013 return $choose;
    1014 }
    1015
    1016
    1017 sub __parse_rools {
    1018 my ($self, $rools) = @_;
    1019 return unless $rools;
    1020 $rools =~ s/\r//sgi;
    1021 my @rools = split /\n/, $rools;
    1022 return unless @rools;
    1023
    1024 my @parsed;
    1025 foreach my $rool ( @rools ) {
    1026 my %pr;
    1027 next if $rool =~ /^#/;
    1028 $rool =~ s/[\x20\t]+$//;
    1029 $rool =~ s/^[\x20\t]+//;
    1030 if ( $rool =~ /^([\w']+)\s+(.*)$/ || $rool =~ /^(\w+)(.*)$/ ) {
    1031 $pr{command} = lc($1);
    1032 my $params = $2;
    1033
    1034 if ( $pr{command} eq 'cut' && $params =~ /^(\w+)\s+(.*)$/ ) {
    1035 $pr{subcommand} = lc($1); $params = $2;
    1036 next unless $pr{subcommand} =~ /^(untill|till|from|off|regex|to)$/;
    1037 $params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex';
    1038 $pr{condition} = $params;
    1039 } elsif ( $pr{command} eq 'glue' ) {
    1040 if ( $params =~ /^(\w+)\s+(.*)$/ ) {
    1041 $pr{subcommand} = $1; $params = $2;
    1042 next unless $pr{subcommand} =~ /^(first|all|order)$/;
    1043 if ( $pr{subcommand} eq 'order' ) {
    1044 my @pars = grep { $_ } map { int($_) } split (/\s*,\s*/,$params);
    1045 $pr{condition} = \@pars;
    1046 } else {
    1047 $pr{condition} = int($1);
    1048 }
    1049 } elsif ( $params =~ /(\d+)/i ) {
    1050 $pr{subcommand} = 'first';
    1051 $pr{condition} = int($1);
    1052 } else {
    1053 $pr{subcommand} = 'all';
    1054 }
    1055 } elsif ( $pr{command} eq 'trim' ) {
    1056 if ( $params =~ /(left|right)/i ) {
    1057 $pr{subcommand} = lc($1);
    1058 } else {
    1059 $pr{subcommand} = 'all';
    1060 }
    1061 } elsif ( $pr{command} eq 'collaborate' && $params =~ /^(div|td)/i ) {
    1062 $pr{condition} = $1;
    1063 } elsif ( $pr{command} eq 'image' && $params =~ /^off$/i ) {
    1064 $pr{command} = 'image_off';
    1065 } elsif ( $pr{command} eq 'image' && $params =~ /^(\w+)\s+(.*)$/si ) {
    1066 $pr{subcommand} = lc($1); $params = $2;
    1067 next unless $pr{subcommand} =~ /^(depth)$/;
    1068 $pr{condition} = $params;
    1069 } elsif ( $pr{command} eq 'set' ) {
    1070 if ( $params =~ /^(limit)\s+(.*)$/si ) {
    1071 $pr{subcommand} = lc($1);
    1072 $params = $2;
    1073 }
    1074 if ( $params =~ /^(\w+)\s+(.*)$/ ) {
    1075 $pr{condition} = { param => $1, value => $2 };
    1076 } else {
    1077 next;
    1078 }
    1079 } elsif ( $pr{command} eq 'kill' && $params =~ /^(leading|all)\s+(headers)$/ ) {
    1080 $pr{command} = 'kill';
    1081 $pr{condition} = { param => $2, value => $1 };
    1082 } elsif ( $pr{command} eq 'use' && $params =~ /^(title)\s+(as)\s+(description)$/ ) {
    1083 $pr{command} = 'set';
    1084 $pr{condition} = { param => 'description', value => 'header' };
    1085 } elsif ( $pr{command} eq 'use' && $params =~ /^(\w+)\s+(.*)$/ ) {
    1086 $pr{subcommand} = $1; $params = $2;
    1087 next unless $pr{subcommand} =~ /^(element|elem|image)$/;
    1088 $pr{subcommand} = 'element' if $pr{subcommand} =~ /^(element|elem)$/;
    1089 my @conditions;
    1090 while ( $params =~ /(class|id|name|align)\x20*=\x20*"([^"]+)"/sgi ) {
    1091 push @conditions, { param => lc($1), value => $2 }
    1092 }
    1093 $pr{condition} = \@conditions;
    1094 } elsif ( $pr{command} eq 'exclude' && $params =~ /^(\w+)\s+(.*)$/ ) {
    1095 $pr{subcommand} = lc($1); $params = $2;
    1096 next unless $pr{subcommand} =~ /^(image|elem|element)$/;
    1097 $pr{subcommand} = 'element' if $pr{subcommand} =~ /^(element|elem)$/;
    1098 my @conditions;
    1099 while ( $params =~ /(class|id|name|align)\x20*=\x20*"([^"]+)"/sgi ) {
    1100 push @conditions, { param => lc($1), value => $2 }
    1101 }
    1102 $pr{condition} = \@conditions;
    1103 } elsif ( ($pr{command} eq 'dont' || $pr{command} eq "don't") && $params =~ /^(\w+)\s+(.*)$/ ) {
    1104 $pr{command} = 'dont';
    1105 $pr{subcommand} = lc($1); $params = $2;
    1106 next unless $pr{subcommand} =~ /^(cut|validate)$/;
    1107 my @conditions;
    1108 if ( $params =~ /(tag)\x20*=\x20*"([^"]+)"/sgi ) {
    1109 $pr{condition} = { param => lc($1), value => $2 };
    1110 } elsif ( $params =~ /(image)/i ) {
    1111 $pr{condition} = { param => lc($1) };
    1112 } else {
    1113 next;
    1114 }
    1115 } elsif ( $pr{command} eq 'clean' ) {
    1116 if ( $params =~ /^(off)\s+(.*)$/ ) {
    1117 $pr{subcommand} = lc($1); $params = $2;
    1118 }
    1119 if ( $params =~ /(tag)\x20*=\x20*"([^"]+)"/sgi ) {
    1120 $pr{condition} = { param => lc($1), value => $2 };
    1121 } elsif ( $params =~ /(span)/i ) {
    1122 $pr{condition} = { param => 'tag', value => lc($1) };
    1123 } else {
    1124 next;
    1125 }
    1126 } else {
    1127 next;
    1128 }
    1129 push @parsed, \%pr;
    1130 }
    1131 }
    1132 return ( scalar @parsed ? \@parsed : undef );
    1133 }
    1134
    1135
    1136 sub __post_rool {
    1137 my ($self, $element, $rools, $description) = @_;
    1138
    1139 if ( ref $rools eq 'ARRAY' && @$rools ) {
    1140 foreach my $rool ( @$rools ) {
    1141 if ( $rool->{command} eq 'cut' ) {
    1142 my $condition = $rool->{condition};
    1143 if ( $rool->{subcommand} eq 'off' ) {
    1144 $element->{text} =~ s/$condition//sgi;
    1145 } elsif ( $rool->{subcommand} eq 'from' ) {
    1146 my $pos = index $element->{text}, $condition;
    1147 if ( $pos >= 0 ) {
    1148 $element->{text} = substr $element->{text}, 0, $pos;
    1149 }
    1150 # $element->{text} =~ s/$condition(.*)$//si;
    1151 } elsif ( $rool->{subcommand} eq 'to' && $condition eq 'description' && $description ) {
    1152 my $str = substr $description, 0, 12;
    1153 my $pos = index $element->{text}, $str;
    1154 if ( $pos >= 0 ) {
    1155 $element->{text} = substr $element->{text}, $pos, -1;
    1156 }
    1157 } elsif ( $rool->{subcommand} eq 'till' ) {
    1158 $element->{text} =~ s/^(.*?)($condition)//si;
    1159 } elsif ( $rool->{subcommand} eq 'untill' ) {
    1160 $element->{text} =~ s/^(.*?)($condition)/$2/si;
    1161 } elsif ( $rool->{subcommand} eq 'regex' ) {
    1162 $element->{text} =~ s/$condition//sgi;
    1163 }
    1164 } elsif ( $rool->{command} eq 'trim' ) {
    1165 if ( $rool->{subcommand} eq 'left' ) {
    1166 $element->{text} =~ s/^[\x20\xA0\t\n\r]+//sg;
    1167 } elsif ( $rool->{subcommand} eq 'right' ) {
    1168 $element->{text} =~ s/[\x20\xA0\t\n\r]+$//sg;
    1169 } else {
    1170 $element->{text} =~ s/^[\x20\xA0\t\n\r]+//sg;
    1171 $element->{text} =~ s/[\x20\xA0\t\n\r]+$//sg;
    1172 }
    1173 }
    1174 }
    1175 }
    1176 }
    1177
    1178
    1179 1;
  • utf8/core/lib/Contenido/Parser/RSS.pm

     
    4 4 use warnings;
    5 5 use locale;
    6 6
    7 use Contenido::Parser::Util;
    7 use base 'Contenido::Parser';
    8
    9 use Contenido::Globals;
    8 10 use Utils::HTML;
    9 #use Time::ParseDate;
    11 use Time::ParseDate;
    10 12 #use Date::Parse;
    11 13 use Data::Dumper;
    12 14 use Digest::MD5 qw(md5_hex);
     
    33 35 sub parse {
    34 36 my ($self, %opts) = @_;
    35 37
    36 my $content = delete $opts{content};
    37 my $base_url = delete $opts{base_url};
    38 my $strip_html = delete $opts{strip_html};
    38 my $content;
    39 if ( $opts{content} ) {
    40 $content = delete $opts{content};
    41 delete $self->{content};
    42 } elsif ( $self->{success} || $self->{content} ) {
    43 $content = delete $self->{content};
    44 } else {
    45 $self->{success} = 0;
    46 return $self;
    47 }
    48 my $base_url = delete $self->{base_url} || delete $opts{base_url};
    39 49 my $allow_global_fulltext = delete $opts{allow_fulltext} || 0;
    40 50 my $content_global_type = delete $opts{content_type} || 1;
    41 my $debug = delete $opts{debug};
    51 my $debug = $DEBUG;
    42 52 my $gui = delete $opts{gui};
    43 53 my $description_as_fulltext = delete $opts{description_as_fulltext};
    44 54 warn "Parser Rools: [".$opts{parser_rss}."]\n" if $debug;
     
    56 66 my $content_type = $content_global_type;
    57 67 my $allow_fulltext = $allow_global_fulltext;
    58 68 $self->__check_rewrite ( item => $item, rools => $rss_rools );
    59 my $date = Time::ParseDate::parsedate($item->{pubdate});
    60 my $pubdate = Class::Date::localdate(Date::Parse::str2time($item->{pubdate}));
    69 my $date = $self->__parse_date($item->{pubdate});
    70 my $pubdate = Contenido::DateTime->new( epoch => $date );
    71 $pubdate = $pubdate->ymd('-').' '.$pubdate->hms;
    61 72 next if ref $item->{title};
    62 73 next if ref $item->{description};
    63 74 $self->__check_ignore ( item => $item, rools => $rss_rools );
     
    154 165 my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure};
    155 166 @att = grep { ref $_ eq 'HASH' && $_->{type} =~ /image/ } @att;
    156 167 @images = map {
    157 my $img = rchannel::Image->new($_);
    158 $img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/; $img;
    168 my $img = $_;
    169 $img->{src} = $base_url.($img->{src} =~ m|^/| ? '' : '/').$img->{src} unless $img->{src} =~ /^http:/; $img;
    159 170 } map { {src => $_->{url}, $_->{width} ? (width => $_->{width}) : (), $_->{height} ? (height => $_->{height}) : (), $_->{title} ? (title => $_->{title}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att;
    160 171 }
    161 172 my @videos;
     
    201 212 }
    202 213 }
    203 214 @videos = grep { exists $_->{type} && lc($_->{type}) eq 'video/x-flv' && $_->{src} =~ /\.flv$/i } @videos;
    215 my @inlined_images;
    216 for ( $description, $fulltext ) {
    217 my $field = $_;
    218 while ( $field =~ /<img ([^>]+)>/sgi ) {
    219 my $image = $self->__parse_params( $1 );
    220 push @inlined_images, $image if ref $image && exists $image->{src} && $image->{src};
    221 }
    222 }
    223 if ( @inlined_images ) {
    224 my %images = map { $_->{src} => $_ } @images, @inlined_images;
    225 @images = values %images;
    226 }
    204 227 push @items, {
    205 228 'checksum' => md5_hex(encode_utf8($title.$description)),
    206 229 'ignore' => $item->{ignore} || 0,
     
    228 251 };
    229 252 }
    230 253 } else {
    231 $self->error_message($@ || 'Something wrong while parsing content');
    254 warn ($@ || 'Something wrong while parsing content');
    232 255 return $self->is_success(0);
    233 256 }
    234 257
    235 $self->items(\@items);
    258 $self->{items} = \@items;
    236 259 return $self->is_success(1);
    237 260 }
    238 261
     
    1023 1046 my ($self, $text) = @_;
    1024 1047
    1025 1048 # $text =~ s/^[\n\r\x20\t]+//;
    1049 $text =~ s/[\n\r\x20\t]+$//;
    1026 1050 $self->__cdata (\$text);
    1027 1051 $self->__extchar (\$text);
    1028 $text = HTML::Entities::decode_entities($text);
    1052 # $text = HTML::Entities::decode_entities($text);
    1029 1053
    1030 1054 # Remove linebreaks inside incorrectly breaked paragraphs
    1031 1055 if (length($text) > 100) {
     
    1048 1072 $text =~ s/<br[^>]*>/\n/sgi;
    1049 1073 $text =~ s/<p\s*>/\n\n/sgi;
    1050 1074 $text =~ s/<\/p\s*>//sgi;
    1051 $text = rchannel::Parser::Util::strip_html($text);
    1052 $text = rchannel::Parser::Util::text_cleanup($text);
    1075 # $text = Contenido::Parser::Util::strip_html($text);
    1076 # $text = Contenido::Parser::Util::text_cleanup($text);
    1053 1077 return $text;
    1054 1078 }
    1055 1079
     
    1060 1084 for ( $$textref ) {
    1061 1085 s/&#38;/\&/sg;
    1062 1086 s/\&amp;/\&/sgi;
    1063 s/&#171;/«/sg;
    1064 s/&#187;/»/sg;
    1065 s/&#163;/£/sg;
    1066 s/&#150;/&ndash;/sg;
    1067 s/&#151;/&mdash;/sg;
    1068 s/&#132;/"/sg;
    1069 s/&#147;/"/sg;
    1070 s/&#148;/"/sg;
    1071 s/&#180;/'/sg;
    1072 s/&#133;/\.\.\./sg;
    1073 s/&#13;/\n/sg;
    1074 s/&#34;/"/sg;
    1075 s/\xA0/\x20/sg;
    1087 s/\&amp;/\&/sgi;
    1088 s/\&quot;/"/sgi;
    1089 s/\&#171;/«/sg;
    1090 s/\&#187;/»/sg;
    1091 s/\&#163;/£/sg;
    1092 s/\&#150;/&ndash;/sg;
    1093 s/\&#151;/&mdash;/sg;
    1094 s/\&#132;/"/sg;
    1095 s/\&#147;/"/sg;
    1096 s/\&#148;/"/sg;
    1097 s/\&#180;/'/sg;
    1098 s/\&#133;/\.\.\./sg;
    1099 s/\&#13;/\n/sg;
    1100 s/\&#34;/"/sg;
    1076 1101 }
    1077 1102 # $$textref =~ s/&#(\d+);/{'&#'.__normalise($1).';'}/eg;
    1078 1103 # $$textref =~ s/&gt;/>/sgi;
     
    1202 1227 }
    1203 1228
    1204 1229
    1230 sub __parse_date {
    1231 my $self = shift;
    1232 my $str = shift;
    1233
    1234 if ($str=~/(\d{2})(\d{2})(\d{4})T(\d{2})(\d{2})(\d{2})/){
    1235 return parsedate ("$3-$2-$1 $4:$5:$6");
    1236 } elsif ($str=~/(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}:\d{2})/){
    1237 return parsedate ("$1 $2");
    1238 } else {
    1239 return parsedate($str);
    1240 }
    1241 }
    1242
    1243
    1244
    1205 1245 # TODO IMAGES:
    1206 1246 # enclosure
    1207 1247 # media:content