Revision 99
- Date:
- 2011/05/11 18:09:30
- Files:
Legend:
- Added
- Removed
- Modified
-
utf8/core/lib/Contenido/Parser.pm
1 package Contenido::Parser; 2 3 use strict; 4 use warnings; 5 use locale; 6 7 use Encode; 8 use URI; 9 use Data::Dumper; 10 use Contenido::Globals; 11 use LWP::UserAgent; 12 use Contenido::File::Scheme::FILE; 13 use Contenido::Parser::Util; 14 15 sub fetch { 16 my ($self, $input, %opts) = @_; 17 18 my ($fh, $content); 19 my $encoding = delete $opts{encoding}; 20 if (not ref $input) { 21 no strict "refs"; 22 my $scheme = uc(scheme($input)); 23 if ( $scheme eq 'FILE' ) { 24 $fh = &{"Contenido::File::Scheme::".uc(scheme($input))."::get_fh"}($input); 25 } else { 26 my $request = new HTTP::Request GET => $input; 27 my $ua = new LWP::UserAgent; 28 $ua->timeout(10); 29 my $res = $ua->request($request); 30 if ($res->is_success) { 31 $self->{headers} = $res->headers; 32 my $content_length = $res->headers->header('content-length'); 33 my $content_type = $res->headers->header('content-type'); 34 $self->{content_type} = $content_type; 35 if ( $content_type =~ /charset\s*=\s*([a-z\d\-]+)/i ) { 36 $encoding = $1; 37 } 38 my $base_url = $input =~ /^([a-z]+:\/\/[a-z\.\d]+)/ ? $1 : ''; 39 $self->{base_url} = $base_url if $base_url; 40 $content = $res->content; 41 } else { 42 warn $res->status_line." \n"; 43 $self->{success} = 0; 44 $self->{reason} = $res->status_line; 45 return $self; 46 } 47 } 48 } elsif ((ref $input eq "GLOB") or (ref $input eq 'Apache::Upload') or (ref $input eq 'IO::File')) { 49 $fh = $input; 50 } elsif (ref $input eq "SCALAR") { 51 $fh = IO::Scalar->new($input); 52 } else { 53 warn("Path, scalar ref or fh needed"); 54 $self->{success} = 0; 55 $self->{reason} = 'Path, scalar ref or fh needed'; 56 return $self; 57 } 58 59 if ( ref $fh ) { 60 $content = <$fh>; 61 } 62 if ( $content ) { 63 warn Dumper($self); 64 unless ( $encoding ) { 65 $encoding = $self->__try_content_encoding( substr($content, 0, 350) ); 66 $self->{encoding} = $encoding; 67 if ( $encoding && $encoding ne 'utf-8' ) { 68 Encode::from_to($content, $encoding, 'utf-8'); 69 } 70 } 71 $self->{content} = $content; 72 $self->{success} = 1; 73 } else { 74 $self->{success} = 0; 75 $self->{reason} = 'Content is empty'; 76 } 77 return $self; 78 } 79 80 sub is_success { 81 my ($self, $val) = @_; 82 83 if ( defined $val ) { 84 $self->{success} = $val; 85 return $self; 86 } else { 87 return $self->{success}; 88 } 89 } 90 91 sub __try_content_encoding { 92 my ($self, $input)= @_; 93 if ( $input =~ /encoding[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) { 94 return lc($1); 95 } elsif ( $input =~ /(utf-8|windows-1251|koi8-r)/i ) { 96 return lc($1); 97 } else { 98 return undef; 99 } 100 } 101 102 sub scheme { 103 my $uri = shift; 104 my $scheme; 105 106 $scheme = URI->new($uri)->scheme() || "file"; 107 108 return $scheme; 109 } 110 111 112 1; -
utf8/core/lib/Contenido/Parser/HTML.pm
1 package Contenido::Parser::HTML; 2 3 use strict; 4 use warnings; 5 use locale; 6 7 use base 'Contenido::Parser'; 8 9 use Contenido::Globals; 10 use Utils::HTML; 11 use Data::Dumper; 12 use utf8; 13 14 15 my @PICNAME = qw ( top menu topmenu home line dot mail razdel button find search srch delivery 16 head bar label phone bottom bottommenu ico icon post left right service caption arr arrow cart 17 basket main reply title corner address page buy pix pixel spacer fon welcome razd about back 18 shapka phones print tel phpBB uho korz korzina raspisanie shop login blank telephone telephones 19 dealer diler background bg news rss index none btn cards up footer noimage but link excel price 20 mid graphic busket map girl space catalog bann headline hosting contact schedule redir email 21 ); 22 23 my @PICHOST = qw ( top.list.ru addweb.ru adland.ru extreme-dm.com top100.rambler.ru 24 mypagerank.ru informer.gismeteo.ru lux-bn.com.ua link-txt.com myrating.ljseek.com c.bigmir.net 25 ); 26 27 my @PICURL = qw ( rorer counter count ljplus yadro spylog hotlog banner baner ban banners ban 28 icq mirabilis adriver advertising ad adv ads adview advert weather imho awaps reklama stat cnt 29 ipz design icons promo cycounter captcha foto_hit header random adcycle rssfeed bansrc 30 ); 31 32 33 my @bad_dimensions = ( 34 { w => 120, h => 60 }, 35 { w => 468, h => 60 }, 36 { w => 120, h => 600 }, 37 { w => 88, h => 31 }, 38 ); 39 40 41 sub new { 42 my ($proto) = @_; 43 my $class = ref($proto) || $proto; 44 my $self = {}; 45 bless $self, $class; 46 47 return $self; 48 } 49 50 51 sub parse { 52 my ($self, %opts) = @_; 53 54 my $content; 55 if ( $opts{content} ) { 56 $content = delete $opts{content}; 57 delete $self->{content}; 58 } elsif ( $self->{success} || $self->{content} ) { 59 $content = delete $self->{content}; 60 } else { 61 $self->{success} = 0; 62 return $self; 63 } 64 65 my $base_url = delete $self->{base_url} || delete $opts{base_url}; 66 my $strip_html = delete $opts{strip_html}; 67 my $debug = $DEBUG; 68 my $gui = delete $opts{gui}; 69 my $header = delete $opts{header}; 70 my $description = delete $opts{description}; 71 my $minimum = delete $opts{min} || length $description; 72 73 my $pre_rools = $self->__parse_rools (delete $opts{parser_pre}); 74 warn Dumper ($pre_rools) if $debug; 75 my $parse_rools = $self->__parse_rools (delete $opts{parser_run}); 76 warn Dumper ($parse_rools) if $debug; 77 my $post_rools = $self->__parse_rools (delete $opts{parser_end}); 78 warn Dumper ($post_rools) if $debug; 79 80 # warn "Experimental. Debug!!!\n" if $debug; 81 if ( ref $pre_rools eq 'ARRAY' ) { 82 my @sets = grep { $_->{command} eq 'set' } @$pre_rools; 83 foreach my $set ( @sets ) { 84 if ( $set->{condition}{param} eq 'min' || $set->{condition}{param} eq 'minimum' ) { 85 my $value = $set->{condition}{value}; 86 unless ( $value =~ /\D/ ) { 87 if ( $set->{subcommand} eq 'limit' ) { 88 $minimum = $minimum && $minimum > int($value) ? int($value) : $minimum ? $minimum : int($value); 89 } else { 90 $minimum = int($value); 91 } 92 } 93 } 94 if ( $set->{condition}{param} eq 'description' && $set->{condition}{value} eq 'header' ) { 95 $description = $header; 96 } 97 } 98 } 99 $minimum ||= 300; 100 101 warn "Tag cleaning...\n" if $debug; 102 $self->__clean_tags (\$content, $pre_rools); 103 $content =~ s/>\s+</></sgi; 104 warn "Image cleaning...\n" if $debug; 105 $self->__clean_img (\$content); 106 warn "Empty div cleaning...\n" if $debug; 107 while ( $self->__clean_empty_div (\$content) ) {} 108 warn "Make tree...\n" if $debug; 109 my ($tree, $shortcuts) = $self->__make_tree (\$content, $parse_rools, $debug); 110 111 $self->__extract_img ($shortcuts, $base_url, $debug); 112 $self->__extract_headers ($shortcuts, $header, $debug); 113 warn "Getting big texts (min=$minimum)...\n" if $debug; 114 my $chosen = $self->__dig_big_texts ( 115 structure => $shortcuts, 116 min => $minimum, 117 ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (), 118 debug => $debug ); 119 unless ( ref $chosen eq 'ARRAY' && @$chosen ) { 120 $self->{error_message} = 'Nothing was found at all!!! Check your MINIMUM value'; 121 return $self->is_success(0) unless $gui; 122 } 123 @$chosen = sort { $a->{id} <=> $b->{id} } @$chosen; 124 if ( $description ) { 125 my @use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$parse_rools if ref $parse_rools eq 'ARRAY'; 126 $chosen = $self->__check_description ($chosen, $description, $debug) unless @use_rools; 127 } 128 unless ( ref $chosen eq 'ARRAY' && @$chosen ) { 129 $self->{error_message} = 'I didn\'t find any valuable text'; 130 return $self->is_success(0) unless $gui; 131 } 132 if ( scalar @$chosen > 1 ) { 133 $chosen = $self->__check_headers ($chosen, $header, $debug); 134 } 135 unless ( ref $chosen eq 'ARRAY' && @$chosen ) { 136 $self->{error_message} = 'I didn\'t find any valuable text'; 137 return $self->is_success(0) unless $gui; 138 } 139 $self->__strip_html ( 140 chosen => $chosen, 141 header => $header, 142 ref $post_rools eq 'ARRAY' && @$post_rools ? (rools => $post_rools) : (), 143 debug => $debug 144 ); 145 if ( ref $parse_rools eq 'ARRAY' ) { 146 my ($glue) = grep { $_->{command} eq 'glue' } @$parse_rools; 147 $self->__glue ( $chosen, $glue, $debug ) if ref $glue; 148 } 149 my $images = $self->__get_images ( 150 structure => $shortcuts, 151 chosen => $chosen->[0], 152 base_url => $base_url, 153 ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (), 154 debug => $debug, 155 ); 156 if ( ref $images eq 'ARRAY' && @$images ) { 157 $self->images($images); 158 $self->image($images->[0]); 159 } 160 161 if ( $gui ) { 162 if ( ref $chosen eq 'ARRAY' ) { 163 foreach my $elem ( @$chosen ) { 164 $self->__post_rool ($elem, $post_rools, $description); 165 } 166 } 167 $self->{text} = ref $chosen eq 'ARRAY' ? $chosen->[0] : $chosen; 168 $self->{html} = $content; 169 $self->{tree} = $shortcuts; 170 $self->{chosen} = $chosen; 171 } else { 172 $self->__post_rool ($chosen->[0], $post_rools, $description); 173 $self->{text} = Contenido::Parser::Util::text_cleanup($chosen->[0]->{text}); 174 $tree = undef; 175 foreach my $key ( keys %$shortcuts ) { 176 delete $shortcuts->{$key}; 177 } 178 $shortcuts = undef; 179 $content = undef; 180 } 181 return $self->is_success(1); 182 } 183 184 sub __clean_tags { 185 my ($self, $content, $rools) = @_; 186 187 my @cut_rools; 188 if ( ref $rools eq 'ARRAY' && @$rools) { 189 @cut_rools = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'cut' } @$rools; 190 } 191 my @clean_off_rools; 192 if ( ref $rools eq 'ARRAY' && @$rools) { 193 @clean_off_rools = grep { $_->{command} eq 'clean' && $_->{subcommand} eq 'off' } @$rools; 194 } 195 $$content =~ s/<!DOCTYPE(.*?)>//sgi; 196 $$content =~ s/<!--(.*?)-->//sgi; 197 $$content =~ s/<script(.*?)<\/script>//sgi; 198 $$content =~ s/<hr(.*?)>//sgi; 199 $$content =~ s/<noscript(.*?)<\/noscript>//sgi; 200 $$content =~ s/<iframe(.*?)<\/iframe>//sgi; 201 unless ( grep { $_->{condition}{param} eq 'tag' && $_->{condition}{value} eq 'noindex' } @cut_rools ) { 202 $$content =~ s/<noindex(.*?)<\/noindex>//sgi; 203 } else { 204 $$content =~ s/<\/?noindex(.*?)>//sgi; 205 } 206 $$content =~ s/<object(.*?)<\/object>//sgi; 207 $$content =~ s/<embed(.*?)<\/embed>//sgi; 208 $$content =~ s/<style(.*?)<\/style>//sgi; 209 if ( grep { $_->{condition}{param} eq 'tag' && $_->{condition}{value} eq 'form' } @cut_rools ) { 210 $$content =~ s/<select(.*?)<\/select([^>]*?)>//sgi; 211 $$content =~ s/<textarea(.*?)<\/textarea([^>]*?)>//sgi; 212 $$content =~ s/<input([^>]*?)>//sgi; 213 } else { 214 $$content =~ s/<form(.*?)<\/form>//sgi; 215 } 216 foreach my $rool ( @clean_off_rools ) { 217 next unless $rool->{condition}{param} eq 'tag'; 218 my $tag = $rool->{condition}{value}; 219 $$content =~ s/<$tag(.*?)<\/$tag>//sgi; 220 } 221 $$content =~ s/<head(.*?)<\/head>//sgi; 222 $$content =~ s/\ style="(.*?)"//sgi; 223 224 $$content =~ s/<\/?span(.*?)>//sgi; 225 $$content =~ s/<\/?font(.*?)>//sgi; 226 $$content =~ s/<br(.*?)>/\n/sgi; 227 $$content =~ s/<link(.*?)>//sgi; 228 $$content =~ s/<spacer(.*?)>//sgi; 229 $$content =~ s/<\!\?(.*?)\?>//sgi; 230 # $$content =~ s/<a\s*?(.*?)>/\n/sgi; 231 $$content =~ s/<\/p\s*>//sgi; 232 # $$content =~ s/<\/a\s*>//sgi; 233 $$content =~ s/<p\s*(.*?)>/\n\n/sgi; 234 $$content =~ s/onclick="(.*?)"//sgi; 235 $$content =~ s/onload="(.*?)"//sgi; 236 237 } 238 239 sub __clean_img { 240 my ($self, $content) = @_; 241 242 my @garbage; 243 my $i = 1; 244 245 while ( $$content =~ /(<img.*?>)/sgi ) { 246 my $img = $1; 247 my $src; 248 if ( $img =~ /src=([^\x20|^>]+)/i ) { 249 $src = $1; 250 } 251 my ($w, $h); 252 if ( $img =~ /width\s*=\s*["'](\d+)/i || $img =~ /width\s*=\s*(\d+)/i ) { 253 $w = $1; 254 } 255 if ( $img =~ /height\s*=\s*["'](\d+)/i || $img =~ /height\s*=\s*(\d+)/i ) { 256 $h = $1; 257 } 258 my $delim = 0; 259 if ( $w && $h ) { 260 foreach my $pair ( @bad_dimensions ) { 261 if ($w == $pair->{w} && $h == $pair->{h}) { 262 $delim = 10; 263 last; 264 } 265 } 266 $delim = ( $w >= $h ? $w : $h ) / ( $w >= $h ? $h : $w ) unless $delim; 267 } 268 my $bad_name = __check_img_name ( $src ); 269 if ( $bad_name || ($w && $w < 80) || ($h && $h < 80) || ( $w && $h && ($delim > 2.5) ) ) { 270 # warn "Bad name: [$src]\n"; 271 push @garbage, $src; 272 } 273 } 274 275 foreach my $src (@garbage) { 276 $src =~ s|([*+?()/\\\$\[\]])|\\$1|sg; 277 $$content =~ s/<img([^>]*?)src=$src([^>]+)>//si; 278 } 279 } 280 281 282 sub __check_img_name { 283 my $name = shift; 284 my $test = $1 if $name =~ /\/([^\/]+)$/; 285 if ( $test =~ /\d+[x-]\d+/ || $test =~ /\.gif$/i ) { 286 return 1; 287 } 288 foreach my $word ( @PICNAME ) { 289 if ( $test =~ /^$word/si || $test =~ /[^a-z]$word[^a-z]/si ) { 290 return 1; 291 } 292 } 293 foreach my $word ( @PICURL ) { 294 if ( $name =~ /^$word/si || $name =~ /[^a-z]$word[^a-z]/si ) { 295 return 1; 296 } 297 } 298 foreach my $word ( @PICHOST ) { 299 if ( index (lc($name), $word) >= 0 ) { 300 return 1; 301 } 302 } 303 return 0; 304 } 305 306 307 sub __clean_empty_div { 308 my ($self, $content) = @_; 309 310 my $i = 0; 311 while ( $$content =~ s/(<div[^>]*?><\/div\s*>)//sgi ) { 312 $i++; 313 } 314 315 return $i; 316 } 317 318 319 sub __make_tree { 320 my ($self, $content, $rools, $debug) = @_; 321 322 my @elems = split (//,$$content); 323 my @collaborate; 324 if ( ref $rools eq 'ARRAY' && @$rools) { 325 @collaborate = grep { $_->{command} eq 'collaborate' } @$rools; 326 } 327 my %hierarchy = ( div => 0, td => 1, tr => 2, table => 3, body => 4, html => 5 ); 328 my $id = 1; 329 my $level = 0; 330 my %tree = ( 331 root => { 332 id => $id++, 333 text => '', 334 type => 'root', 335 children=> [], 336 parent => undef, 337 level => $level, 338 }, 339 ); 340 my @stack; 341 my %elem_hash = ( 1 => $tree{root} ); 342 my $current = $tree{root}; 343 my $previous; 344 345 while ( @elems ) { 346 if ($elems[0] =~ /[\ \t]/ && !$current){ 347 shift @elems; 348 next; 349 } 350 if ( $elems[0] eq '<' && $elems[1] =~ /[a-zA-Z]/ ) { 351 my $tag = $self->__try_tag (\@elems); 352 if ( ref $tag && $tag->{type} eq 'text' ) { 353 my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef; 354 if ( ref $last_text_tag ) { 355 $last_text_tag->{text} .= $tag->{content}; 356 $last_text_tag->{count} += $tag->{count}; 357 } else { 358 $last_text_tag = $tag; 359 $last_text_tag->{id} = $id++; 360 $last_text_tag->{type} = 'text'; 361 $last_text_tag->{parent} = $current; 362 $last_text_tag->{level} = $level+1; 363 $elem_hash{$last_text_tag->{id}} = $last_text_tag; 364 push @{$current->{children}}, $last_text_tag; 365 $current->{text_count}++; 366 } 367 $current->{text_value} += $tag->{count}; 368 splice @elems, 0, $tag->{count}; 369 } elsif ( ref $tag ) { 370 if ( ($current->{type} eq 'td' || $current->{type} eq 'tr' ) && $tag->{type} eq 'tr' ) { 371 # warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback till table begin... !!!!\n" if $debug; 372 do { 373 $current = pop @stack; 374 $level = $current->{level}; 375 # warn "New current type: /$current->{type}. ID = $current->{id}. Level: $level. Stack depth: ".scalar(@stack)."\n"; 376 } while ( ($current->{type} !~ /table|body|html/) && scalar @stack ); 377 378 } 379 if ( $current->{type} eq 'table' && $tag->{type} eq 'table' ) { 380 # warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback, previous table(s) will forced to be closed... !!!!\n" if $debug; 381 do { 382 $current = pop @stack; 383 $level = $current->{level}; 384 # warn "New current type: /$current->{type}. ID = $current->{id}. Level: $level. Stack depth: ".scalar(@stack)."\n"; 385 } while ( ($current->{type} eq "table") && scalar @stack ); 386 387 } 388 $tag->{id} = $id++; 389 $tag->{children} = []; 390 $tag->{text_count} = 0; 391 $tag->{parent} = $current; 392 $tag->{level} = ++$level; 393 $elem_hash{$tag->{id}} = $tag; 394 push @stack, $current; 395 warn "Open type: $tag->{type}. ID=[$tag->{id}]. Name: ".($tag->{params}{name}||'').". Class: ".($tag->{params}{class}||'').". Level: $tag->{level}. Stack depth: ".scalar(@stack)."\n"; 396 $current = $tag; 397 splice @elems, 0, $tag->{count}; 398 } else { 399 # warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug; 400 last; 401 } 402 } elsif ( $elems[0] eq '<' && $elems[1] =~ /\// ) { 403 my $tag = $self->__try_end (\@elems); 404 if ( ref $tag && $tag->{type} eq 'text' ) { 405 my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef; 406 if ( ref $last_text_tag ) { 407 $last_text_tag->{text} .= $tag->{content}; 408 $last_text_tag->{count} += $tag->{count}; 409 } else { 410 $last_text_tag = $tag; 411 $last_text_tag->{id} = $id++; 412 $last_text_tag->{type} = 'text'; 413 $last_text_tag->{parent} = $current; 414 $last_text_tag->{level} = $level+1; 415 $elem_hash{$last_text_tag->{id}} = $last_text_tag; 416 push @{$current->{children}}, $last_text_tag; 417 $current->{text_count}++; 418 } 419 $current->{text_value} += $tag->{count}; 420 splice @elems, 0, $tag->{count}; 421 } elsif ( ref $tag ) { 422 if ( $current->{type} ne $tag->{type} ) { 423 # warn "!!!!Wrong tag type for closing. It's [$tag->{type}]. It must be [$current->{type}]!!!!\n" if $debug; 424 # warn "Current ID: [$current->{id}]. Text place: [".substr($current->{text}, 0, 20)."]\n"; 425 if ( $hierarchy{$tag->{type}} > $hierarchy{$current->{type}} ) { 426 do { 427 $current = pop @stack; 428 $level = $current->{level}; 429 # warn "New current type: /$current->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n"; 430 } while ( ($current->{type} ne $tag->{type}) && scalar @stack ); 431 $current = pop @stack; 432 $level = $current->{level}; 433 # warn "Close !the right! type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n" if $debug; 434 }else{ 435 # warn "Passing by: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n" if $debug; 436 } 437 } else { 438 if ( @collaborate ) { 439 if ( defined $previous && (grep { $current->{type} eq $_->{condition} } @collaborate) 440 && $previous->{type} eq $current->{type} && $previous->{level} == $current->{level} 441 && $previous->{text} && $current->{text} ) { 442 $previous->{text} .= ' '.$current->{text}; 443 my $parent = $current->{parent}; 444 splice @{$parent->{children}}, -1, 1; 445 delete $elem_hash{$current->{id}}; 446 $current = undef; 447 } elsif ( !defined $previous && $current->{text} ) { 448 $previous = $current; 449 } else { 450 $previous = undef; 451 } 452 } 453 $current = pop @stack; 454 $level = $current->{level}; 455 # warn "Text place: [".substr($current->{text}, 0, 20)."]\n" if exists $current->{text}; 456 # warn "Close type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n"; 457 } 458 splice @elems, 0, $tag->{count}; 459 } else { 460 # warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug; 461 last; 462 } 463 } else { 464 my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef; 465 if ( ref $last_text_tag ) { 466 $last_text_tag->{text} .= shift @elems; 467 $last_text_tag->{count}++; 468 } else { 469 $last_text_tag->{text} = shift @elems; 470 $last_text_tag->{count} = 1; 471 $last_text_tag->{id} = $id++; 472 $last_text_tag->{type} = 'text'; 473 $last_text_tag->{parent} = $current; 474 $last_text_tag->{level} = $level+1; 475 $elem_hash{$last_text_tag->{id}} = $last_text_tag; 476 push @{$current->{children}}, $last_text_tag; 477 $current->{text_count}++; 478 $current->{text_value} = 0; 479 } 480 $current->{text_value}++; 481 } 482 } 483 return (\%tree, \%elem_hash); 484 } 485 486 487 sub __try_tag { 488 my ($self, $content) = @_; 489 490 my $i = 1; 491 my %tag; 492 my $tag = $content->[0]; 493 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) { 494 $tag .= $content->[$i]; 495 $i++; 496 } 497 if ( $content->[$i] eq '<' || $i >= scalar @$content ) { 498 return { 499 type => 'text', 500 content => $tag, 501 count => $i, 502 }; 503 } else { 504 if ( $tag =~ /^<(div|table|tr|td|body|html)\s*(.*)/i ) { 505 my $val = $1; 506 if ( $tag =~ /^<($val)\s*(.*)/i ) { 507 $tag{type} = lc($1); 508 my $args = $2; 509 $tag{count} = $i+1; 510 my %args; 511 while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?"([^"]+)"/g ) { 512 $args{lc($1)} = $2; 513 } 514 while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?'([^']+)'/g ) { 515 $args{lc($1)} = $2; 516 } 517 while ( $tag =~ /([a-zA-z]+)=(\w+)/g ) { 518 $args{lc($1)} = $2; 519 } 520 foreach my $arg ( qw( name id class width align ) ) { 521 $tag{params}{$arg} = $args{$arg} if exists $args{$arg}; 522 } 523 return \%tag; 524 } else { 525 return { 526 type => 'text', 527 content => $tag, 528 count => $i, 529 }; 530 } 531 } else { 532 return { 533 type => 'text', 534 content => $tag, 535 count => $i, 536 }; 537 } 538 } 539 } 540 541 sub __try_end { 542 my ($self, $content) = @_; 543 544 my $i = 2; 545 my %tag; 546 my $tag = $content->[0].$content->[1]; 547 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) { 548 $tag .= $content->[$i]; 549 $i++; 550 } 551 if ( $content->[$i] eq '<' || $i >= scalar @$content ) { 552 return { 553 type => 'text', 554 content => $tag, 555 count => $i, 556 }; 557 } else { 558 if ( $tag =~ /^<\/(div|table|tr|td|body|html)/i ) { 559 my $val = $1; 560 if ( $tag =~ /^<\/($val)\s*?$/i ) { 561 $tag{type} = lc($1); 562 $tag{count} = $i+1; 563 return \%tag; 564 } else { 565 return { 566 type => 'text', 567 content => $tag, 568 count => $i, 569 }; 570 } 571 } else { 572 return { 573 type => 'text', 574 content => $tag, 575 count => $i, 576 }; 577 } 578 } 579 } 580 581 582 sub __extract_img { 583 my ($self, $structure, $base_url, $debug) = @_; 584 return unless ref $structure eq 'HASH'; 585 586 foreach my $tag ( values %$structure ) { 587 next unless ref $tag && exists $tag->{text} && $tag->{text}; 588 my $text = $tag->{text}; 589 while ( $text =~ /<img (.*?)>/sgi ) { 590 # warn "Image for extract_img found [$1]. Tag ID: $tag->{id}\n"; 591 my $params = $1; 592 my $img = {}; 593 if ( $params =~ /src\x20*?=\x20*?["'](.*?)["']/ || $params =~ /src=([^\x20]+)/ ) { 594 $img->{url} = $1; 595 $img->{url} =~ s/[\r\t\n\x20]+$//; 596 $img->{url} =~ s/^[\r\t\n\x20]+//; 597 $img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/; 598 $img->{url} =~ s/\/+/\//sgi; 599 $img->{url} =~ s/http:\//http:\/\//sgi; 600 $img->{w} = $1 if $params =~ /width[\D]+(\d+)/; 601 $img->{h} = $1 if $params =~ /height[\D]+(\d+)/; 602 $img->{alt} = $1 if $params =~ /alt\x20*?=\x20*?["'](.*?)["']/; 603 $tag->{images} = [] unless ref $tag->{images} eq 'ARRAY'; 604 push @{ $tag->{images} }, $img; 605 # warn "Image for extract_img stored [$img->{url}]. Tag ID: $tag->{id}\n"; 606 } 607 } 608 $text =~ s/<img (.*?)>//sgi; 609 $tag->{text} = $text; 610 $tag->{count} = length ($text); 611 } 612 } 613 614 615 sub __extract_headers { 616 my ($self, $structure, $debug) = @_; 617 return unless ref $structure eq 'HASH'; 618 619 foreach my $tag ( values %$structure ) { 620 next unless ref $tag && exists $tag->{text} && $tag->{text}; 621 my $text = $tag->{text}; 622 while ( $text =~ /<h([\d])[^>]*?>([^<]+)<\/h[\d]>/sgi ) { 623 my $header_level = $1; 624 my $header_text = $2; 625 $tag->{headers} = [] unless ref $tag->{headers} eq 'ARRAY'; 626 push @{ $tag->{headers} }, { level => $header_level, text => $header_text }; 627 } 628 } 629 630 } 631 632 633 sub __dig_big_texts { 634 my ($self, %opts) = @_; 635 my $structure = exists $opts{structure} ? $opts{structure} : undef; 636 my $minimum = exists $opts{min} ? $opts{min} : undef; 637 my $debug = exists $opts{debug} ? $opts{debug} : undef; 638 my $rools = exists $opts{rools} ? $opts{rools} : undef; 639 return unless ref $structure eq 'HASH'; 640 641 my @rools; 642 if ( ref $rools eq 'ARRAY' && @$rools) { 643 @rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$rools; 644 } 645 my @exclude_rools; 646 if ( ref $rools eq 'ARRAY' && @$rools) { 647 @exclude_rools = grep { $_->{command} eq 'exclude' && $_->{subcommand} eq 'element' } @$rools; 648 } 649 650 my @ret; 651 foreach my $tid ( sort { $a <=> $b } keys %$structure ) { 652 my $tag = $structure->{$tid}; 653 next unless ref $tag && exists $tag->{text} && $tag->{text}; 654 next if $self->__exclude_rools($tag, \@exclude_rools); 655 656 if ( @rools ) { 657 my $choose = 0; 658 foreach my $rool ( @rools ) { 659 my $matched = 1; 660 foreach my $cond ( @{$rool->{condition}} ) { 661 unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) { 662 $matched = 0; 663 } 664 } 665 $choose ||= $matched; 666 } 667 if ( $choose ) { 668 $tag->{text} =~ s/^[\t\ \n\r]+//s; 669 $tag->{text} =~ s/[\t\ \n\r]+$//s; 670 $tag->{text} =~ s/[\t\ ]+/\ /sg; 671 $tag->{text} =~ s/\r//sg; 672 $tag->{text} =~ s/\n{2,}/\n\n/sg; 673 $tag->{text} =~ s/<a.*?>//sgi; 674 $tag->{text} =~ s/<\/a.*?>//sgi; 675 $tag->{text} =~ s/\&\\x(\d+)//sgi; 676 677 my $text = $tag->{text}; 678 $text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi; 679 # $text = Contenido::Parser::Util::strip_html($text); 680 if ( length($text) >= $minimum ) { 681 push @ret, $tag; 682 } 683 } 684 } else { 685 my $text = $tag->{text}; 686 $text =~ s/^[\t\ \n\r]+//s; 687 $text =~ s/[\t\ \n\r]+$//s; 688 $text =~ s/[\t\ ]+/\ /sg; 689 $text =~ s/\r//sg; 690 $text =~ s/\n{2,}/\n\n/sg; 691 # It does wrong job: 692 # $text =~ s/([,!\?])(\S)/$1\ $2/sg; 693 $tag->{text} = $text; 694 $text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi; 695 # $text = Contenido::Parser::Util::strip_html($text); 696 if ( length($text) >= $minimum ) { 697 $tag->{text} =~ s/<a.*?>//sgi; 698 $tag->{text} =~ s/<\/a.*?>//sgi; 699 $tag->{text} =~ s/\&\\x(\d+)//sgi; 700 push @ret, $tag; 701 } 702 } 703 } 704 unless ( @ret ) { 705 warn "Nothing was found at all!!! Check your ROOLS or MINIMUM value" if $debug; 706 } 707 return \@ret; 708 } 709 710 711 712 sub __check_headers { 713 my ($self, $chosen, $header, $debug) = @_; 714 return unless ref $chosen eq 'ARRAY'; 715 716 unless ( grep { exists $_->{headers} } @$chosen ) { 717 warn "No headers found\n" if $debug; 718 return $chosen; 719 } else { 720 # @$chosen = grep { exists $_->{headers} } @$chosen; 721 } 722 my @ret; 723 foreach my $unit ( @$chosen ) { 724 unless ( exists $unit->{headers} && ref $unit->{headers} eq 'ARRAY' ) { 725 $unit->{header_identity} = 0; 726 $unit->{header_min_level} = 32768; 727 next; 728 } 729 my @headers = sort { $a->{level} <=> $b->{level} } @{$unit->{headers}}; 730 my $min_level = $headers[0]->{level}; 731 $unit->{header_min_level} = $min_level; 732 if ( $header ) { 733 my $coeff = $self->__str_compare( $header, $headers[0]->{text} ); 734 $unit->{header_identity} = $coeff; 735 } 736 } 737 # @ret = sort { $a->{header_min_level} <=> $b->{header_min_level} } @$chosen; 738 # return \@ret; 739 return $chosen; 740 } 741 742 743 744 sub __check_description { 745 my ($self, $chosen, $desc, $debug) = @_; 746 return unless ref $chosen eq 'ARRAY' && $desc; 747 748 my @ret; 749 foreach my $unit ( @$chosen ) { 750 if ( $desc ) { 751 my $coeff = $self->__str_compare( $unit->{text}, $desc ); 752 warn "Coeff: [$coeff] to: [$unit->{text}]\n" if $debug; 753 $unit->{description_identity} = $coeff; 754 } 755 } 756 @ret = sort { $b->{description_identity} <=> $a->{description_identity} } grep { $_->{description_identity} > -0.9 } @$chosen; 757 return \@ret; 758 } 759 760 761 # wtf, bastards! how come my code's used here? --ra 762 # damn, it's not 100% your code already 763 sub __str_compare { 764 my ($self, $original, $applicant) = @_; 765 766 my $Al = __freq_list($original); 767 return -1 unless defined $Al; 768 my $Bl = __freq_list($applicant); 769 return -1 unless defined $Bl; 770 my $df = 0; 771 772 foreach my $word ( %$Bl ) { 773 if ( exists $Al->{$word} ) { 774 $df += $Al->{$word} 775 } else { 776 $df -= $Bl->{$word} 777 } 778 } 779 780 return $df; 781 } 782 783 sub __freq_list { 784 785 my @d = grep { length($_) > 3 } split /\W/, $_[0]; 786 return undef unless @d; 787 my $z = 1/scalar(@d); my %l = (); 788 $l{$_} += $z for @d; \%l; 789 } 790 791 792 793 sub __strip_html { 794 my ($self, %opts) = @_; 795 return unless ref $opts{chosen} eq 'ARRAY'; 796 797 my $chosen = $opts{chosen}; 798 my $rooles = $opts{rools}; 799 my $header = $opts{header}; 800 801 foreach my $unit ( @$chosen ) { 802 my %tags; 803 my $headers = $unit->{headers} if exists $unit->{headers}; 804 if ( ref $headers && ref $rooles eq 'ARRAY' && grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' } @$rooles ) { 805 if ( grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' && $_->{condition}{value} eq 'all' } @$rooles ) { 806 $unit->{text} =~ s/<h(\d)[^>]*>(.*?)<\/h(\d)[^>]*>/\n/sgi; 807 $unit->{text} =~ s/^[\x20\t\r\n]+//si; 808 } elsif ( grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' && $_->{condition}{value} eq 'leading' } @$rooles ) { 809 while ( $unit->{text} =~ /^<h(\d)[^>]*>(.*?)<\/h(\d)[^>]*>/si ) { 810 my $hdr = 'h'.$1; 811 $unit->{text} =~ s/^<$hdr[^>]*>(.*?)<\/$hdr[^>]*>//si; 812 } 813 } 814 } 815 for ( $unit->{text} ) { 816 s/></> </sg; 817 s/([\!\?:.])\s*?<\/h(\d+)(.*?)>/$1 /sgi; 818 s/<\/h(\d+)(.*?)>/\. /sgi; 819 s/<h(\d+)(.*?)>/\n/sgi; 820 s/&/\&/sg; 821 s/&/\&/sgi; 822 s/«/«/sg; 823 s/»/»/sg; 824 s/£/£/sg; 825 s/–/–/sg; 826 s/—/—/sg; 827 s/…/\.\.\./sg; 828 s/„/"/sg; 829 s/“/"/sg; 830 s/”/"/sg; 831 s/´/'/sg; 832 s/ /\n/sg; 833 s/"/"/sg; 834 s/ /\ /sgi; 835 } 836 # $unit->{text} = HTML::Entities::decode_entities($unit->{text}); 837 # $unit->{text} = Contenido::Parser::Util::strip_html($unit->{text}); 838 for ( $unit->{text} ) { 839 s/^[\x20\t\r\n]+//si; 840 s/^(\d+)\.(\d+)\.(\d+)//si; 841 s/^[\x20\t\r\n]+//si; 842 s/^(\d+):(\d+)//si; 843 s/^[\x20\t\r\n]+//si; 844 } 845 if ( lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) { 846 substr $unit->{text}, 0, length($header), ''; 847 $unit->{text} =~ s/^[\.\x20\t\r\n]+//sgi; 848 } 849 $unit->{text} =~ s/[\x20\t\r\n]+$//sgi; 850 } 851 } 852 853 854 855 sub __glue { 856 my ($self, $chosen, $glue, $debug) = @_; 857 return unless ref $chosen eq 'ARRAY'; 858 859 my $i = 0; 860 861 if ( $glue->{subcommand} eq 'first' || $glue->{subcommand} eq 'all' ) { 862 my $count = exists $glue->{subcommand} && $glue->{subcommand} eq 'first' ? $glue->{condition} : 32768; 863 foreach my $unit ( @$chosen ) { 864 next unless $i++; 865 if ( $i <= $count ) { 866 $chosen->[0]->{text} .= "\n\n".$chosen->[$i-1]->{text}; 867 } 868 } 869 } elsif ( $glue->{subcommand} eq 'order' && ref $glue->{condition} eq 'ARRAY' ) { 870 my $text = ''; 871 my $i = 0; 872 foreach my $pos ( @{ $glue->{condition} } ) { 873 $text .= ($i++ ? "\n\n" : '').$chosen->[$pos-1]->{text}; 874 } 875 $chosen->[0]->{text} = $text; 876 } 877 } 878 879 880 sub __get_images { 881 my ($self, %opts) = @_; 882 my $structure = exists $opts{structure} ? $opts{structure} : undef; 883 my $chosen = exists $opts{chosen} ? $opts{chosen} : undef; 884 my $debug = exists $opts{debug} ? $opts{debug} : undef; 885 my $rools = exists $opts{rools} ? $opts{rools} : undef; 886 my $base_url = delete $opts{base_url}; 887 return unless ref $chosen && ref $structure; 888 889 return if ref $rools eq 'ARRAY' && grep { $_->{command} eq 'image_off' } @$rools; 890 my @use_rools; 891 my @exclude_rools; 892 my $no_validation = 0; 893 if ( ref $rools eq 'ARRAY' && @$rools) { 894 @use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'image' } @$rools; 895 @exclude_rools = grep { $_->{command} eq 'exclude' && $_->{subcommand} eq 'image' } @$rools; 896 $no_validation = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'validate' && $_->{condition}{param} eq 'image' } @$rools; 897 } 898 my $image_depth; 899 if ( ref $rools eq 'ARRAY' && @$rools) { 900 my @rools = grep { $_->{command} eq 'image' && $_->{subcommand} eq 'depth' } @$rools; 901 $image_depth = $rools[-1]->{condition} if @rools; 902 } 903 904 my @images; 905 foreach my $tag ( values %$structure ) { 906 next unless exists $tag->{images} && ref $tag->{images} eq 'ARRAY'; 907 next if $self->__exclude_rools($tag, \@exclude_rools); 908 909 if ( @use_rools ) { 910 my $choose = 0; 911 foreach my $rool ( @use_rools ) { 912 my $matched = 1; 913 foreach my $cond ( @{$rool->{condition}} ) { 914 unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) { 915 $matched = 0; 916 } 917 } 918 $matched = 0 if $self->__exclude_rools($tag, \@exclude_rools); 919 $choose ||= $matched; 920 } 921 if ( $choose ) { 922 my @img = grep { $no_validation || $self->__img_is_valid ($_) } map { 923 my $img = rchannel::Image->new($_); 924 $img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/; 925 $img; 926 } map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} }; 927 928 push @images, @img; 929 } 930 } else { 931 next if ($tag->{level}+1) < $chosen->{level}; 932 next if $image_depth && ( $tag->{level} > ($chosen->{level} + $image_depth) ); 933 934 my $ok = 0; 935 my $uphops = $tag->{level} > $chosen->{level} ? 1 : 2; 936 my $hops = $image_depth ? $image_depth : $tag->{level} - $chosen->{level} + $uphops; 937 next if ($hops - $uphops) > 4; 938 my @img_parents = ($tag->{id}); 939 my $parent = $tag; 940 for ( 1..$hops ) { 941 $parent = $parent->{parent}; 942 push @img_parents, $parent->{id}; 943 } 944 $parent = $chosen;; 945 for ( 0..$uphops ) { 946 if ( grep { $parent->{id} == $_ } @img_parents ) { 947 $ok = 1; 948 last; 949 } 950 $parent = $parent->{parent}; 951 } 952 if ( $ok ) { 953 my @img = grep { $self->__img_is_valid ($_) } map { 954 my $img = rchannel::Image->new($_); 955 $img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/; 956 $img; 957 } map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} }; 958 959 push @images, @img; 960 } 961 } 962 } 963 # warn Dumper (\@images); 964 if ( @images ) { 965 return \@images; 966 } else { 967 return undef; 968 } 969 } 970 971 972 sub __img_is_valid { 973 my ($self, $img) = @_; 974 975 # return 1; 976 if ( $img->check_online ) { 977 my $delim = 0; 978 my $w = $img->width; 979 my $h = $img->height; 980 if ( $w && $h ) { 981 foreach my $pair ( @bad_dimensions ) { 982 if ($w == $pair->{w} && $h == $pair->{h}) { 983 return undef; 984 } 985 } 986 $delim = ( $w >= $h ? $w : $h ) / ( $w >= $h ? $h : $w ) unless $delim; 987 if ( $w < 80 || $h < 80 || $delim > 2.5 ) { 988 return undef; 989 } 990 } 991 } else { 992 # warn "Image ".$img->src." not found on server"; 993 return undef; 994 } 995 return 1; 996 } 997 998 999 sub __exclude_rools { 1000 my ($self, $tag, $rools) = @_; 1001 return undef unless ref $rools eq 'ARRAY' && @$rools; 1002 1003 my $choose = 0; 1004 foreach my $rool ( @$rools ) { 1005 my $matched = 1; 1006 foreach my $cond ( @{$rool->{condition}} ) { 1007 unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) { 1008 $matched = 0; 1009 } 1010 } 1011 $choose ||= $matched; 1012 } 1013 return $choose; 1014 } 1015 1016 1017 sub __parse_rools { 1018 my ($self, $rools) = @_; 1019 return unless $rools; 1020 $rools =~ s/\r//sgi; 1021 my @rools = split /\n/, $rools; 1022 return unless @rools; 1023 1024 my @parsed; 1025 foreach my $rool ( @rools ) { 1026 my %pr; 1027 next if $rool =~ /^#/; 1028 $rool =~ s/[\x20\t]+$//; 1029 $rool =~ s/^[\x20\t]+//; 1030 if ( $rool =~ /^([\w']+)\s+(.*)$/ || $rool =~ /^(\w+)(.*)$/ ) { 1031 $pr{command} = lc($1); 1032 my $params = $2; 1033 1034 if ( $pr{command} eq 'cut' && $params =~ /^(\w+)\s+(.*)$/ ) { 1035 $pr{subcommand} = lc($1); $params = $2; 1036 next unless $pr{subcommand} =~ /^(untill|till|from|off|regex|to)$/; 1037 $params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex'; 1038 $pr{condition} = $params; 1039 } elsif ( $pr{command} eq 'glue' ) { 1040 if ( $params =~ /^(\w+)\s+(.*)$/ ) { 1041 $pr{subcommand} = $1; $params = $2; 1042 next unless $pr{subcommand} =~ /^(first|all|order)$/; 1043 if ( $pr{subcommand} eq 'order' ) { 1044 my @pars = grep { $_ } map { int($_) } split (/\s*,\s*/,$params); 1045 $pr{condition} = \@pars; 1046 } else { 1047 $pr{condition} = int($1); 1048 } 1049 } elsif ( $params =~ /(\d+)/i ) { 1050 $pr{subcommand} = 'first'; 1051 $pr{condition} = int($1); 1052 } else { 1053 $pr{subcommand} = 'all'; 1054 } 1055 } elsif ( $pr{command} eq 'trim' ) { 1056 if ( $params =~ /(left|right)/i ) { 1057 $pr{subcommand} = lc($1); 1058 } else { 1059 $pr{subcommand} = 'all'; 1060 } 1061 } elsif ( $pr{command} eq 'collaborate' && $params =~ /^(div|td)/i ) { 1062 $pr{condition} = $1; 1063 } elsif ( $pr{command} eq 'image' && $params =~ /^off$/i ) { 1064 $pr{command} = 'image_off'; 1065 } elsif ( $pr{command} eq 'image' && $params =~ /^(\w+)\s+(.*)$/si ) { 1066 $pr{subcommand} = lc($1); $params = $2; 1067 next unless $pr{subcommand} =~ /^(depth)$/; 1068 $pr{condition} = $params; 1069 } elsif ( $pr{command} eq 'set' ) { 1070 if ( $params =~ /^(limit)\s+(.*)$/si ) { 1071 $pr{subcommand} = lc($1); 1072 $params = $2; 1073 } 1074 if ( $params =~ /^(\w+)\s+(.*)$/ ) { 1075 $pr{condition} = { param => $1, value => $2 }; 1076 } else { 1077 next; 1078 } 1079 } elsif ( $pr{command} eq 'kill' && $params =~ /^(leading|all)\s+(headers)$/ ) { 1080 $pr{command} = 'kill'; 1081 $pr{condition} = { param => $2, value => $1 }; 1082 } elsif ( $pr{command} eq 'use' && $params =~ /^(title)\s+(as)\s+(description)$/ ) { 1083 $pr{command} = 'set'; 1084 $pr{condition} = { param => 'description', value => 'header' }; 1085 } elsif ( $pr{command} eq 'use' && $params =~ /^(\w+)\s+(.*)$/ ) { 1086 $pr{subcommand} = $1; $params = $2; 1087 next unless $pr{subcommand} =~ /^(element|elem|image)$/; 1088 $pr{subcommand} = 'element' if $pr{subcommand} =~ /^(element|elem)$/; 1089 my @conditions; 1090 while ( $params =~ /(class|id|name|align)\x20*=\x20*"([^"]+)"/sgi ) { 1091 push @conditions, { param => lc($1), value => $2 } 1092 } 1093 $pr{condition} = \@conditions; 1094 } elsif ( $pr{command} eq 'exclude' && $params =~ /^(\w+)\s+(.*)$/ ) { 1095 $pr{subcommand} = lc($1); $params = $2; 1096 next unless $pr{subcommand} =~ /^(image|elem|element)$/; 1097 $pr{subcommand} = 'element' if $pr{subcommand} =~ /^(element|elem)$/; 1098 my @conditions; 1099 while ( $params =~ /(class|id|name|align)\x20*=\x20*"([^"]+)"/sgi ) { 1100 push @conditions, { param => lc($1), value => $2 } 1101 } 1102 $pr{condition} = \@conditions; 1103 } elsif ( ($pr{command} eq 'dont' || $pr{command} eq "don't") && $params =~ /^(\w+)\s+(.*)$/ ) { 1104 $pr{command} = 'dont'; 1105 $pr{subcommand} = lc($1); $params = $2; 1106 next unless $pr{subcommand} =~ /^(cut|validate)$/; 1107 my @conditions; 1108 if ( $params =~ /(tag)\x20*=\x20*"([^"]+)"/sgi ) { 1109 $pr{condition} = { param => lc($1), value => $2 }; 1110 } elsif ( $params =~ /(image)/i ) { 1111 $pr{condition} = { param => lc($1) }; 1112 } else { 1113 next; 1114 } 1115 } elsif ( $pr{command} eq 'clean' ) { 1116 if ( $params =~ /^(off)\s+(.*)$/ ) { 1117 $pr{subcommand} = lc($1); $params = $2; 1118 } 1119 if ( $params =~ /(tag)\x20*=\x20*"([^"]+)"/sgi ) { 1120 $pr{condition} = { param => lc($1), value => $2 }; 1121 } elsif ( $params =~ /(span)/i ) { 1122 $pr{condition} = { param => 'tag', value => lc($1) }; 1123 } else { 1124 next; 1125 } 1126 } else { 1127 next; 1128 } 1129 push @parsed, \%pr; 1130 } 1131 } 1132 return ( scalar @parsed ? \@parsed : undef ); 1133 } 1134 1135 1136 sub __post_rool { 1137 my ($self, $element, $rools, $description) = @_; 1138 1139 if ( ref $rools eq 'ARRAY' && @$rools ) { 1140 foreach my $rool ( @$rools ) { 1141 if ( $rool->{command} eq 'cut' ) { 1142 my $condition = $rool->{condition}; 1143 if ( $rool->{subcommand} eq 'off' ) { 1144 $element->{text} =~ s/$condition//sgi; 1145 } elsif ( $rool->{subcommand} eq 'from' ) { 1146 my $pos = index $element->{text}, $condition; 1147 if ( $pos >= 0 ) { 1148 $element->{text} = substr $element->{text}, 0, $pos; 1149 } 1150 # $element->{text} =~ s/$condition(.*)$//si; 1151 } elsif ( $rool->{subcommand} eq 'to' && $condition eq 'description' && $description ) { 1152 my $str = substr $description, 0, 12; 1153 my $pos = index $element->{text}, $str; 1154 if ( $pos >= 0 ) { 1155 $element->{text} = substr $element->{text}, $pos, -1; 1156 } 1157 } elsif ( $rool->{subcommand} eq 'till' ) { 1158 $element->{text} =~ s/^(.*?)($condition)//si; 1159 } elsif ( $rool->{subcommand} eq 'untill' ) { 1160 $element->{text} =~ s/^(.*?)($condition)/$2/si; 1161 } elsif ( $rool->{subcommand} eq 'regex' ) { 1162 $element->{text} =~ s/$condition//sgi; 1163 } 1164 } elsif ( $rool->{command} eq 'trim' ) { 1165 if ( $rool->{subcommand} eq 'left' ) { 1166 $element->{text} =~ s/^[\x20\xA0\t\n\r]+//sg; 1167 } elsif ( $rool->{subcommand} eq 'right' ) { 1168 $element->{text} =~ s/[\x20\xA0\t\n\r]+$//sg; 1169 } else { 1170 $element->{text} =~ s/^[\x20\xA0\t\n\r]+//sg; 1171 $element->{text} =~ s/[\x20\xA0\t\n\r]+$//sg; 1172 } 1173 } 1174 } 1175 } 1176 } 1177 1178 1179 1; -
utf8/core/lib/Contenido/Parser/RSS.pm
4 4 use warnings; 5 5 use locale; 6 6 7 use Contenido::Parser::Util; 7 use base 'Contenido::Parser'; 8 9 use Contenido::Globals; 8 10 use Utils::HTML; 9 #use Time::ParseDate; 11 use Time::ParseDate; 10 12 #use Date::Parse; 11 13 use Data::Dumper; 12 14 use Digest::MD5 qw(md5_hex); … … 33 35 sub parse { 34 36 my ($self, %opts) = @_; 35 37 36 my $content = delete $opts{content}; 37 my $base_url = delete $opts{base_url}; 38 my $strip_html = delete $opts{strip_html}; 38 my $content; 39 if ( $opts{content} ) { 40 $content = delete $opts{content}; 41 delete $self->{content}; 42 } elsif ( $self->{success} || $self->{content} ) { 43 $content = delete $self->{content}; 44 } else { 45 $self->{success} = 0; 46 return $self; 47 } 48 my $base_url = delete $self->{base_url} || delete $opts{base_url}; 39 49 my $allow_global_fulltext = delete $opts{allow_fulltext} || 0; 40 50 my $content_global_type = delete $opts{content_type} || 1; 41 my $debug = delete $opts{debug}; 51 my $debug = $DEBUG; 42 52 my $gui = delete $opts{gui}; 43 53 my $description_as_fulltext = delete $opts{description_as_fulltext}; 44 54 warn "Parser Rools: [".$opts{parser_rss}."]\n" if $debug; … … 56 66 my $content_type = $content_global_type; 57 67 my $allow_fulltext = $allow_global_fulltext; 58 68 $self->__check_rewrite ( item => $item, rools => $rss_rools ); 59 my $date = Time::ParseDate::parsedate($item->{pubdate}); 60 my $pubdate = Class::Date::localdate(Date::Parse::str2time($item->{pubdate})); 69 my $date = $self->__parse_date($item->{pubdate}); 70 my $pubdate = Contenido::DateTime->new( epoch => $date ); 71 $pubdate = $pubdate->ymd('-').' '.$pubdate->hms; 61 72 next if ref $item->{title}; 62 73 next if ref $item->{description}; 63 74 $self->__check_ignore ( item => $item, rools => $rss_rools ); … … 154 165 my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure}; 155 166 @att = grep { ref $_ eq 'HASH' && $_->{type} =~ /image/ } @att; 156 167 @images = map { 157 my $img = rchannel::Image->new($_); 158 $img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/; $img; 168 my $img = $_; 169 $img->{src} = $base_url.($img->{src} =~ m|^/| ? '' : '/').$img->{src} unless $img->{src} =~ /^http:/; $img; 159 170 } map { {src => $_->{url}, $_->{width} ? (width => $_->{width}) : (), $_->{height} ? (height => $_->{height}) : (), $_->{title} ? (title => $_->{title}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att; 160 171 } 161 172 my @videos; … … 201 212 } 202 213 } 203 214 @videos = grep { exists $_->{type} && lc($_->{type}) eq 'video/x-flv' && $_->{src} =~ /\.flv$/i } @videos; 215 my @inlined_images; 216 for ( $description, $fulltext ) { 217 my $field = $_; 218 while ( $field =~ /<img ([^>]+)>/sgi ) { 219 my $image = $self->__parse_params( $1 ); 220 push @inlined_images, $image if ref $image && exists $image->{src} && $image->{src}; 221 } 222 } 223 if ( @inlined_images ) { 224 my %images = map { $_->{src} => $_ } @images, @inlined_images; 225 @images = values %images; 226 } 204 227 push @items, { 205 228 'checksum' => md5_hex(encode_utf8($title.$description)), 206 229 'ignore' => $item->{ignore} || 0, … … 228 251 }; 229 252 } 230 253 } else { 231 $self->error_message($@ || 'Something wrong while parsing content'); 254 warn ($@ || 'Something wrong while parsing content'); 232 255 return $self->is_success(0); 233 256 } 234 257 235 $self->items(\@items); 258 $self->{items} = \@items; 236 259 return $self->is_success(1); 237 260 } 238 261 … … 1023 1046 my ($self, $text) = @_; 1024 1047 1025 1048 # $text =~ s/^[\n\r\x20\t]+//; 1049 $text =~ s/[\n\r\x20\t]+$//; 1026 1050 $self->__cdata (\$text); 1027 1051 $self->__extchar (\$text); 1028 $text = HTML::Entities::decode_entities($text); 1052 # $text = HTML::Entities::decode_entities($text); 1029 1053 1030 1054 # Remove linebreaks inside incorrectly breaked paragraphs 1031 1055 if (length($text) > 100) { … … 1048 1072 $text =~ s/<br[^>]*>/\n/sgi; 1049 1073 $text =~ s/<p\s*>/\n\n/sgi; 1050 1074 $text =~ s/<\/p\s*>//sgi; 1051 $text = rchannel::Parser::Util::strip_html($text); 1052 $text = rchannel::Parser::Util::text_cleanup($text); 1075 # $text = Contenido::Parser::Util::strip_html($text); 1076 # $text = Contenido::Parser::Util::text_cleanup($text); 1053 1077 return $text; 1054 1078 } 1055 1079 … … 1060 1084 for ( $$textref ) { 1061 1085 s/&/\&/sg; 1062 1086 s/\&/\&/sgi; 1063 s/«/«/sg; 1064 s/»/»/sg; 1065 s/£/£/sg; 1066 s/–/–/sg; 1067 s/—/—/sg; 1068 s/„/"/sg; 1069 s/“/"/sg; 1070 s/”/"/sg; 1071 s/´/'/sg; 1072 s/…/\.\.\./sg; 1073 s/ /\n/sg; 1074 s/"/"/sg; 1075 s/\xA0/\x20/sg; 1087 s/\&/\&/sgi; 1088 s/\"/"/sgi; 1089 s/\«/«/sg; 1090 s/\»/»/sg; 1091 s/\£/£/sg; 1092 s/\–/–/sg; 1093 s/\—/—/sg; 1094 s/\„/"/sg; 1095 s/\“/"/sg; 1096 s/\”/"/sg; 1097 s/\´/'/sg; 1098 s/\…/\.\.\./sg; 1099 s/\ /\n/sg; 1100 s/\"/"/sg; 1076 1101 } 1077 1102 # $$textref =~ s/&#(\d+);/{'&#'.__normalise($1).';'}/eg; 1078 1103 # $$textref =~ s/>/>/sgi; … … 1202 1227 } 1203 1228 1204 1229 1230 sub __parse_date { 1231 my $self = shift; 1232 my $str = shift; 1233 1234 if ($str=~/(\d{2})(\d{2})(\d{4})T(\d{2})(\d{2})(\d{2})/){ 1235 return parsedate ("$3-$2-$1 $4:$5:$6"); 1236 } elsif ($str=~/(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}:\d{2})/){ 1237 return parsedate ("$1 $2"); 1238 } else { 1239 return parsedate($str); 1240 } 1241 } 1242 1243 1244 1205 1245 # TODO IMAGES: 1206 1246 # enclosure 1207 1247 # media:content