Line # Revision Author
1 99 ahitrov package Contenido::Parser::HTML;
2
3 use strict;
4 use warnings;
5 use locale;
6
7 use base 'Contenido::Parser';
8
9 use Contenido::Globals;
10 use Utils::HTML;
11 use Data::Dumper;
12 use utf8;
13 101 ahitrov use Encode;
14 99 ahitrov
15 my @PICNAME = qw ( top menu topmenu home line dot mail razdel button find search srch delivery
16 head bar label phone bottom bottommenu ico icon post left right service caption arr arrow cart
17 basket main reply title corner address page buy pix pixel spacer fon welcome razd about back
18 shapka phones print tel phpBB uho korz korzina raspisanie shop login blank telephone telephones
19 dealer diler background bg news rss index none btn cards up footer noimage but link excel price
20 mid graphic busket map girl space catalog bann headline hosting contact schedule redir email
21 );
22
23 my @PICHOST = qw ( top.list.ru addweb.ru adland.ru extreme-dm.com top100.rambler.ru
24 mypagerank.ru informer.gismeteo.ru lux-bn.com.ua link-txt.com myrating.ljseek.com c.bigmir.net
25 );
26
27 my @PICURL = qw ( rorer counter count ljplus yadro spylog hotlog banner baner ban banners ban
28 icq mirabilis adriver advertising ad adv ads adview advert weather imho awaps reklama stat cnt
29 ipz design icons promo cycounter captcha foto_hit header random adcycle rssfeed bansrc
30 );
31
32
33 my @bad_dimensions = (
34 { w => 120, h => 60 },
35 { w => 468, h => 60 },
36 { w => 120, h => 600 },
37 { w => 88, h => 31 },
38 );
39
40
41 sub new {
42 my ($proto) = @_;
43 my $class = ref($proto) || $proto;
44 my $self = {};
45 bless $self, $class;
46
47 return $self;
48 }
49
50
51 sub parse {
52 my ($self, %opts) = @_;
53
54 my $content;
55 if ( $opts{content} ) {
56 101 ahitrov $content = decode('utf-8', delete $opts{content});
57 99 ahitrov delete $self->{content};
58 } elsif ( $self->{success} || $self->{content} ) {
59 101 ahitrov $content = decode('utf-8', delete $self->{content});
60 99 ahitrov } else {
61 $self->{success} = 0;
62 return $self;
63 }
64
65 my $base_url = delete $self->{base_url} || delete $opts{base_url};
66 my $strip_html = delete $opts{strip_html};
67 my $debug = $DEBUG;
68 my $gui = delete $opts{gui};
69 101 ahitrov my $header = decode('utf-8', delete $opts{header});
70 109 ahitrov warn "Header length: ".length($header || '')."\n" if $debug;
71 110 ahitrov my $description = decode('utf-8', delete $opts{description}) || '';
72 warn "Description length: ".length($description)."\n" if $debug;
73 99 ahitrov my $minimum = delete $opts{min} || length $description;
74
75 my $pre_rools = $self->__parse_rools (delete $opts{parser_pre});
76 warn Dumper ($pre_rools) if $debug;
77 my $parse_rools = $self->__parse_rools (delete $opts{parser_run});
78 warn Dumper ($parse_rools) if $debug;
79 my $post_rools = $self->__parse_rools (delete $opts{parser_end});
80 warn Dumper ($post_rools) if $debug;
81
82 # warn "Experimental. Debug!!!\n" if $debug;
83 if ( ref $pre_rools eq 'ARRAY' ) {
84 my @sets = grep { $_->{command} eq 'set' } @$pre_rools;
85 foreach my $set ( @sets ) {
86 if ( $set->{condition}{param} eq 'min' || $set->{condition}{param} eq 'minimum' ) {
87 my $value = $set->{condition}{value};
88 unless ( $value =~ /\D/ ) {
89 if ( $set->{subcommand} eq 'limit' ) {
90 $minimum = $minimum && $minimum > int($value) ? int($value) : $minimum ? $minimum : int($value);
91 } else {
92 $minimum = int($value);
93 }
94 }
95 }
96 if ( $set->{condition}{param} eq 'description' && $set->{condition}{value} eq 'header' ) {
97 $description = $header;
98 }
99 }
100 }
101 $minimum ||= 300;
102
103 warn "Tag cleaning...\n" if $debug;
104 $self->__clean_tags (\$content, $pre_rools);
105 $content =~ s/>\s+</></sgi;
106 warn "Image cleaning...\n" if $debug;
107 $self->__clean_img (\$content);
108 warn "Empty div cleaning...\n" if $debug;
109 while ( $self->__clean_empty_div (\$content) ) {}
110 warn "Make tree...\n" if $debug;
111 my ($tree, $shortcuts) = $self->__make_tree (\$content, $parse_rools, $debug);
112
113 146 ahitrov $self->__extract_img ($shortcuts, $base_url, $strip_html, $debug);
114 99 ahitrov $self->__extract_headers ($shortcuts, $header, $debug);
115 warn "Getting big texts (min=$minimum)...\n" if $debug;
116 my $chosen = $self->__dig_big_texts (
117 structure => $shortcuts,
118 min => $minimum,
119 ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (),
120 debug => $debug );
121 unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
122 $self->{error_message} = 'Nothing was found at all!!! Check your MINIMUM value';
123 return $self->is_success(0) unless $gui;
124 }
125 if ( $description ) {
126 my @use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$parse_rools if ref $parse_rools eq 'ARRAY';
127 $chosen = $self->__check_description ($chosen, $description, $debug) unless @use_rools;
128 }
129 unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
130 $self->{error_message} = 'I didn\'t find any valuable text';
131 return $self->is_success(0) unless $gui;
132 }
133 if ( scalar @$chosen > 1 ) {
134 $chosen = $self->__check_headers ($chosen, $header, $debug);
135 }
136 unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
137 $self->{error_message} = 'I didn\'t find any valuable text';
138 return $self->is_success(0) unless $gui;
139 }
140 $self->__strip_html (
141 chosen => $chosen,
142 header => $header,
143 ref $post_rools eq 'ARRAY' && @$post_rools ? (rools => $post_rools) : (),
144 146 ahitrov debug => $debug,
145 strip_html => $strip_html,
146 99 ahitrov );
147 if ( ref $parse_rools eq 'ARRAY' ) {
148 my ($glue) = grep { $_->{command} eq 'glue' } @$parse_rools;
149 $self->__glue ( $chosen, $glue, $debug ) if ref $glue;
150 }
151 107 ahitrov warn "Getting images...\n" if $debug;
152 99 ahitrov my $images = $self->__get_images (
153 structure => $shortcuts,
154 chosen => $chosen->[0],
155 base_url => $base_url,
156 ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (),
157 debug => $debug,
158 );
159 if ( ref $images eq 'ARRAY' && @$images ) {
160 104 ahitrov $self->{images} = $images;
161 $self->{image} = $images->[0];
162 99 ahitrov }
163
164 if ( $gui ) {
165 if ( ref $chosen eq 'ARRAY' ) {
166 foreach my $elem ( @$chosen ) {
167 $self->__post_rool ($elem, $post_rools, $description);
168 }
169 }
170 $self->{text} = ref $chosen eq 'ARRAY' ? $chosen->[0] : $chosen;
171 101 ahitrov # $self->{html} = $content;
172 131 ahitrov $self->{tree} = $shortcuts;
173 # $self->{tree} = $tree;
174 99 ahitrov $self->{chosen} = $chosen;
175 } else {
176 $self->__post_rool ($chosen->[0], $post_rools, $description);
177 $self->{text} = Contenido::Parser::Util::text_cleanup($chosen->[0]->{text});
178 101 ahitrov $self->{chosen} = $chosen;
179 map { $_->{parent} = undef } @$chosen if ref $chosen eq 'ARRAY';
180 99 ahitrov $tree = undef;
181 foreach my $key ( keys %$shortcuts ) {
182 delete $shortcuts->{$key};
183 }
184 $shortcuts = undef;
185 $content = undef;
186 }
187 return $self->is_success(1);
188 }
189
190 sub __clean_tags {
191 my ($self, $content, $rools) = @_;
192
193 my @cut_rools;
194 if ( ref $rools eq 'ARRAY' && @$rools) {
195 @cut_rools = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'cut' } @$rools;
196 }
197 my @clean_off_rools;
198 if ( ref $rools eq 'ARRAY' && @$rools) {
199 @clean_off_rools = grep { $_->{command} eq 'clean' && $_->{subcommand} eq 'off' } @$rools;
200 }
201 $$content =~ s/<!DOCTYPE(.*?)>//sgi;
202 $$content =~ s/<!--(.*?)-->//sgi;
203 $$content =~ s/<script(.*?)<\/script>//sgi;
204 $$content =~ s/<hr(.*?)>//sgi;
205 $$content =~ s/<noscript(.*?)<\/noscript>//sgi;
206 $$content =~ s/<iframe(.*?)<\/iframe>//sgi;
207 unless ( grep { $_->{condition}{param} eq 'tag' && $_->{condition}{value} eq 'noindex' } @cut_rools ) {
208 $$content =~ s/<noindex(.*?)<\/noindex>//sgi;
209 } else {
210 $$content =~ s/<\/?noindex(.*?)>//sgi;
211 }
212 $$content =~ s/<object(.*?)<\/object>//sgi;
213 $$content =~ s/<embed(.*?)<\/embed>//sgi;
214 $$content =~ s/<style(.*?)<\/style>//sgi;
215 if ( grep { $_->{condition}{param} eq 'tag' && $_->{condition}{value} eq 'form' } @cut_rools ) {
216 $$content =~ s/<select(.*?)<\/select([^>]*?)>//sgi;
217 $$content =~ s/<textarea(.*?)<\/textarea([^>]*?)>//sgi;
218 $$content =~ s/<input([^>]*?)>//sgi;
219 } else {
220 $$content =~ s/<form(.*?)<\/form>//sgi;
221 104 ahitrov $$content =~ s/<textarea(.*?)<\/textarea([^>]*?)>//sgi;
222 $$content =~ s/<select(.*?)<\/select([^>]*?)>//sgi;
223 99 ahitrov }
224 foreach my $rool ( @clean_off_rools ) {
225 next unless $rool->{condition}{param} eq 'tag';
226 my $tag = $rool->{condition}{value};
227 $$content =~ s/<$tag(.*?)<\/$tag>//sgi;
228 }
229 $$content =~ s/<head(.*?)<\/head>//sgi;
230 $$content =~ s/\ style="(.*?)"//sgi;
231
232 $$content =~ s/<\/?span(.*?)>//sgi;
233 $$content =~ s/<\/?font(.*?)>//sgi;
234 $$content =~ s/<br(.*?)>/\n/sgi;
235 $$content =~ s/<link(.*?)>//sgi;
236 $$content =~ s/<spacer(.*?)>//sgi;
237 $$content =~ s/<\!\?(.*?)\?>//sgi;
238 # $$content =~ s/<a\s*?(.*?)>/\n/sgi;
239 $$content =~ s/<\/p\s*>//sgi;
240 # $$content =~ s/<\/a\s*>//sgi;
241 $$content =~ s/<p\s*(.*?)>/\n\n/sgi;
242 $$content =~ s/onclick="(.*?)"//sgi;
243 $$content =~ s/onload="(.*?)"//sgi;
244
245 }
246
247 sub __clean_img {
248 my ($self, $content) = @_;
249
250 my @garbage;
251 my $i = 1;
252
253 while ( $$content =~ /(<img.*?>)/sgi ) {
254 my $img = $1;
255 my $src;
256 if ( $img =~ /src=([^\x20|^>]+)/i ) {
257 $src = $1;
258 }
259 my ($w, $h);
260 if ( $img =~ /width\s*=\s*["'](\d+)/i || $img =~ /width\s*=\s*(\d+)/i ) {
261 $w = $1;
262 }
263 if ( $img =~ /height\s*=\s*["'](\d+)/i || $img =~ /height\s*=\s*(\d+)/i ) {
264 $h = $1;
265 }
266 my $delim = 0;
267 if ( $w && $h ) {
268 foreach my $pair ( @bad_dimensions ) {
269 if ($w == $pair->{w} && $h == $pair->{h}) {
270 $delim = 10;
271 last;
272 }
273 }
274 $delim = ( $w >= $h ? $w : $h ) / ( $w >= $h ? $h : $w ) unless $delim;
275 }
276 my $bad_name = __check_img_name ( $src );
277 if ( $bad_name || ($w && $w < 80) || ($h && $h < 80) || ( $w && $h && ($delim > 2.5) ) ) {
278 # warn "Bad name: [$src]\n";
279 push @garbage, $src;
280 }
281 }
282
283 foreach my $src (@garbage) {
284 $src =~ s|([*+?()/\\\$\[\]])|\\$1|sg;
285 $$content =~ s/<img([^>]*?)src=$src([^>]+)>//si;
286 }
287 }
288
289
290 sub __check_img_name {
291 my $name = shift;
292 my $test = $1 if $name =~ /\/([^\/]+)$/;
293 107 ahitrov if ( $test && ($test =~ /\d+[x-]\d+/ || $test =~ /\.gif$/i) ) {
294 99 ahitrov return 1;
295 }
296 foreach my $word ( @PICNAME ) {
297 107 ahitrov if ( $test && ($test =~ /^$word/si || $test =~ /[^a-z]$word[^a-z]/si) ) {
298 99 ahitrov return 1;
299 }
300 }
301 foreach my $word ( @PICURL ) {
302 if ( $name =~ /^$word/si || $name =~ /[^a-z]$word[^a-z]/si ) {
303 return 1;
304 }
305 }
306 foreach my $word ( @PICHOST ) {
307 if ( index (lc($name), $word) >= 0 ) {
308 return 1;
309 }
310 }
311 return 0;
312 }
313
314
315 sub __clean_empty_div {
316 my ($self, $content) = @_;
317
318 my $i = 0;
319 while ( $$content =~ s/(<div[^>]*?><\/div\s*>)//sgi ) {
320 $i++;
321 }
322
323 return $i;
324 }
325
326
327 sub __make_tree {
328 my ($self, $content, $rools, $debug) = @_;
329
330 my @elems = split (//,$$content);
331 my @collaborate;
332 if ( ref $rools eq 'ARRAY' && @$rools) {
333 @collaborate = grep { $_->{command} eq 'collaborate' } @$rools;
334 }
335 my %hierarchy = ( div => 0, td => 1, tr => 2, table => 3, body => 4, html => 5 );
336 my $id = 1;
337 my $level = 0;
338 my %tree = (
339 root => {
340 id => $id++,
341 text => '',
342 type => 'root',
343 children=> [],
344 parent => undef,
345 level => $level,
346 },
347 );
348 my @stack;
349 my %elem_hash = ( 1 => $tree{root} );
350 my $current = $tree{root};
351 my $previous;
352
353 while ( @elems ) {
354 if ($elems[0] =~ /[\ \t]/ && !$current){
355 shift @elems;
356 next;
357 }
358 if ( $elems[0] eq '<' && $elems[1] =~ /[a-zA-Z]/ ) {
359 my $tag = $self->__try_tag (\@elems);
360 if ( ref $tag && $tag->{type} eq 'text' ) {
361 my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef;
362 if ( ref $last_text_tag ) {
363 $last_text_tag->{text} .= $tag->{content};
364 $last_text_tag->{count} += $tag->{count};
365 } else {
366 $last_text_tag = $tag;
367 $last_text_tag->{id} = $id++;
368 $last_text_tag->{type} = 'text';
369 $last_text_tag->{parent} = $current;
370 $last_text_tag->{level} = $level+1;
371 101 ahitrov $last_text_tag->{text} = $tag->{content};
372 99 ahitrov $elem_hash{$last_text_tag->{id}} = $last_text_tag;
373 push @{$current->{children}}, $last_text_tag;
374 $current->{text_count}++;
375 }
376 $current->{text_value} += $tag->{count};
377 splice @elems, 0, $tag->{count};
378 101 ahitrov # warn "Tag opened. Next text: [".join('',$elems[0..10])."]\n";
379 99 ahitrov } elsif ( ref $tag ) {
380 if ( ($current->{type} eq 'td' || $current->{type} eq 'tr' ) && $tag->{type} eq 'tr' ) {
381 # warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback till table begin... !!!!\n" if $debug;
382 do {
383 $current = pop @stack;
384 $level = $current->{level};
385 # warn "New current type: /$current->{type}. ID = $current->{id}. Level: $level. Stack depth: ".scalar(@stack)."\n";
386 } while ( ($current->{type} !~ /table|body|html/) && scalar @stack );
387
388 }
389 if ( $current->{type} eq 'table' && $tag->{type} eq 'table' ) {
390 # warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback, previous table(s) will forced to be closed... !!!!\n" if $debug;
391 do {
392 $current = pop @stack;
393 $level = $current->{level};
394 # warn "New current type: /$current->{type}. ID = $current->{id}. Level: $level. Stack depth: ".scalar(@stack)."\n";
395 } while ( ($current->{type} eq "table") && scalar @stack );
396
397 }
398 $tag->{id} = $id++;
399 $tag->{children} = [];
400 $tag->{text_count} = 0;
401 $tag->{parent} = $current;
402 $tag->{level} = ++$level;
403 $elem_hash{$tag->{id}} = $tag;
404 101 ahitrov push @{$current->{children}}, $tag;
405 99 ahitrov push @stack, $current;
406 101 ahitrov # warn "Open type: $tag->{type}. ID=[$tag->{id}]. Name: ".($tag->{params}{name}||'').". Class: ".($tag->{params}{class}||'').". Level: $tag->{level}. Stack depth: ".scalar(@stack)."\n";
407 99 ahitrov $current = $tag;
408 splice @elems, 0, $tag->{count};
409 } else {
410 # warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug;
411 last;
412 }
413 } elsif ( $elems[0] eq '<' && $elems[1] =~ /\// ) {
414 my $tag = $self->__try_end (\@elems);
415 if ( ref $tag && $tag->{type} eq 'text' ) {
416 my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef;
417 if ( ref $last_text_tag ) {
418 $last_text_tag->{text} .= $tag->{content};
419 $last_text_tag->{count} += $tag->{count};
420 } else {
421 $last_text_tag = $tag;
422 $last_text_tag->{id} = $id++;
423 $last_text_tag->{type} = 'text';
424 $last_text_tag->{parent} = $current;
425 101 ahitrov $last_text_tag->{text} = $tag->{content};
426 99 ahitrov $last_text_tag->{level} = $level+1;
427 $elem_hash{$last_text_tag->{id}} = $last_text_tag;
428 push @{$current->{children}}, $last_text_tag;
429 $current->{text_count}++;
430 }
431 $current->{text_value} += $tag->{count};
432 splice @elems, 0, $tag->{count};
433 } elsif ( ref $tag ) {
434 if ( $current->{type} ne $tag->{type} ) {
435 # warn "!!!!Wrong tag type for closing. It's [$tag->{type}]. It must be [$current->{type}]!!!!\n" if $debug;
436 # warn "Current ID: [$current->{id}]. Text place: [".substr($current->{text}, 0, 20)."]\n";
437 if ( $hierarchy{$tag->{type}} > $hierarchy{$current->{type}} ) {
438 do {
439 $current = pop @stack;
440 $level = $current->{level};
441 # warn "New current type: /$current->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n";
442 } while ( ($current->{type} ne $tag->{type}) && scalar @stack );
443 $current = pop @stack;
444 $level = $current->{level};
445 # warn "Close !the right! type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n" if $debug;
446 }else{
447 # warn "Passing by: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n" if $debug;
448 }
449 } else {
450 if ( @collaborate ) {
451 if ( defined $previous && (grep { $current->{type} eq $_->{condition} } @collaborate)
452 && $previous->{type} eq $current->{type} && $previous->{level} == $current->{level}
453 && $previous->{text} && $current->{text} ) {
454 $previous->{text} .= ' '.$current->{text};
455 my $parent = $current->{parent};
456 splice @{$parent->{children}}, -1, 1;
457 delete $elem_hash{$current->{id}};
458 $current = undef;
459 } elsif ( !defined $previous && $current->{text} ) {
460 $previous = $current;
461 } else {
462 $previous = undef;
463 }
464 }
465 $current = pop @stack;
466 $level = $current->{level};
467 # warn "Text place: [".substr($current->{text}, 0, 20)."]\n" if exists $current->{text};
468 # warn "Close type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n";
469 }
470 splice @elems, 0, $tag->{count};
471 } else {
472 # warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug;
473 last;
474 }
475 } else {
476 my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef;
477 if ( ref $last_text_tag ) {
478 $last_text_tag->{text} .= shift @elems;
479 $last_text_tag->{count}++;
480 } else {
481 101 ahitrov $last_text_tag = {};
482 99 ahitrov $last_text_tag->{text} = shift @elems;
483 $last_text_tag->{count} = 1;
484 $last_text_tag->{id} = $id++;
485 $last_text_tag->{type} = 'text';
486 $last_text_tag->{parent} = $current;
487 $last_text_tag->{level} = $level+1;
488 $elem_hash{$last_text_tag->{id}} = $last_text_tag;
489 push @{$current->{children}}, $last_text_tag;
490 $current->{text_count}++;
491 $current->{text_value} = 0;
492 }
493 $current->{text_value}++;
494 }
495 }
496 return (\%tree, \%elem_hash);
497 }
498
499
500 sub __try_tag {
501 my ($self, $content) = @_;
502
503 my $i = 1;
504 my %tag;
505 my $tag = $content->[0];
506 101 ahitrov while ( $i < (scalar @$content - 1) && $content->[$i] ne '<' && $content->[$i] ne '>' ) {
507 99 ahitrov $tag .= $content->[$i];
508 $i++;
509 }
510 if ( $content->[$i] eq '<' || $i >= scalar @$content ) {
511 return {
512 type => 'text',
513 content => $tag,
514 count => $i,
515 };
516 101 ahitrov }
517 $tag .= $content->[$i++];
518 # warn "TAG: [$tag]\n";
519
520 if ( $tag =~ /^<(div|table|tr|td|body|html)\s*(.*)/i ) {
521 my $val = $1;
522 if ( $tag =~ /^<($val)\s*(.*)/i ) {
523 $tag{type} = lc($1);
524 my $args = $2;
525 $tag{count} = $i;
526 my %args;
527 while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?"([^"]+)"/g ) {
528 $args{lc($1)} = $2;
529 99 ahitrov }
530 101 ahitrov while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?'([^']+)'/g ) {
531 $args{lc($1)} = $2;
532 }
533 while ( $tag =~ /([a-zA-z]+)=(\w+)/g ) {
534 $args{lc($1)} = $2;
535 }
536 foreach my $arg ( qw( name id class width align ) ) {
537 $tag{params}{$arg} = $args{$arg} if exists $args{$arg};
538 }
539 return \%tag;
540 } else {
541 99 ahitrov return {
542 type => 'text',
543 content => $tag,
544 count => $i,
545 };
546 }
547 101 ahitrov } else {
548 return {
549 type => 'text',
550 content => $tag,
551 count => $i,
552 };
553 99 ahitrov }
554 }
555
556 sub __try_end {
557 my ($self, $content) = @_;
558
559 my $i = 2;
560 my %tag;
561 my $tag = $content->[0].$content->[1];
562 101 ahitrov while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < (scalar @$content-1) ) {
563 99 ahitrov $tag .= $content->[$i];
564 $i++;
565 }
566 if ( $content->[$i] eq '<' || $i >= scalar @$content ) {
567 return {
568 type => 'text',
569 content => $tag,
570 count => $i,
571 };
572 101 ahitrov }
573 $tag .= $content->[$i++];
574 # warn "TAG END: [$tag]\n";
575 if ( $tag =~ /^<\/(div|table|tr|td|body|html)/i ) {
576 99 ahitrov my $val = $1;
577 101 ahitrov if ( $tag =~ /^<\/($val)[\s>]/i ) {
578 99 ahitrov $tag{type} = lc($1);
579 101 ahitrov $tag{count} = $i;
580 99 ahitrov return \%tag;
581 } else {
582 return {
583 type => 'text',
584 content => $tag,
585 count => $i,
586 };
587 }
588 101 ahitrov } else {
589 99 ahitrov return {
590 type => 'text',
591 content => $tag,
592 count => $i,
593 };
594 }
595 }
596
597
598 sub __extract_img {
599 146 ahitrov my ($self, $structure, $base_url, $strip_html, $debug) = @_;
600 99 ahitrov return unless ref $structure eq 'HASH';
601
602 101 ahitrov foreach my $tag ( grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
603 146 ahitrov while ( $tag->{text} =~ /<img (.*?)\/?>/sgi ) {
604 99 ahitrov # warn "Image for extract_img found [$1]. Tag ID: $tag->{id}\n";
605 my $params = $1;
606 146 ahitrov my $img = $self->parse_html_tag('img '.$params);
607 if ( exists $img->{src} && $img->{src} ) {
608 my %img = ( src => $img->{src} );
609 $img{url} = $img{src} =~ /^http[s]?:/ ? $img{src} : $base_url.($img{src} =~ m|^/| ? '' : '/').$img{src};
610 $img{w} = $img->{width} if $img->{width};
611 $img{h} = $img->{height} if $img->{height};
612 $img{alt} = $img->{alt} if $img->{alt};
613 $img{title} = $img->{title} if $img->{title};
614 99 ahitrov $tag->{images} = [] unless ref $tag->{images} eq 'ARRAY';
615 146 ahitrov push @{ $tag->{images} }, \%img;
616 }
617 # if ( $params =~ /src\x20*?=\x20*?["'](.*?)["']/ || $params =~ /src=([^\x20]+)/ ) {
618 # $img->{url} = $1;
619 # $img->{url} =~ s/[\r\t\n\ ]+$//;
620 # $img->{url} =~ s/^[\r\t\n\ ]+//;
621 # $img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/;
622 # $img->{url} =~ s/\/+/\//sgi;
623 # $img->{url} =~ s/http:\//http:\/\//sgi;
624 # $img->{w} = $1 if $params =~ /width[\D]+(\d+)/;
625 # $img->{h} = $1 if $params =~ /height[\D]+(\d+)/;
626 # $img->{alt} = $1 if $params =~ /alt\x20*?=\x20*?["'](.*?)["']/;
627 # $tag->{images} = [] unless ref $tag->{images} eq 'ARRAY';
628 # push @{ $tag->{images} }, $img;
629 99 ahitrov # warn "Image for extract_img stored [$img->{url}]. Tag ID: $tag->{id}\n";
630 146 ahitrov # }
631 99 ahitrov }
632 146 ahitrov $tag->{text} =~ s/<img (.*?)>//sgi if $strip_html;
633 $tag->{count} = length ($tag->{text});
634 99 ahitrov }
635 }
636
637
638 sub __extract_headers {
639 my ($self, $structure, $debug) = @_;
640 return unless ref $structure eq 'HASH';
641
642 101 ahitrov foreach my $tag ( grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
643 99 ahitrov my $text = $tag->{text};
644 while ( $text =~ /<h([\d])[^>]*?>([^<]+)<\/h[\d]>/sgi ) {
645 my $header_level = $1;
646 my $header_text = $2;
647 $tag->{headers} = [] unless ref $tag->{headers} eq 'ARRAY';
648 push @{ $tag->{headers} }, { level => $header_level, text => $header_text };
649 }
650 }
651
652 }
653
654
655 sub __dig_big_texts {
656 my ($self, %opts) = @_;
657 my $structure = exists $opts{structure} ? $opts{structure} : undef;
658 my $minimum = exists $opts{min} ? $opts{min} : undef;
659 my $debug = exists $opts{debug} ? $opts{debug} : undef;
660 my $rools = exists $opts{rools} ? $opts{rools} : undef;
661 return unless ref $structure eq 'HASH';
662
663 my @rools;
664 if ( ref $rools eq 'ARRAY' && @$rools) {
665 @rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$rools;
666 }
667 my @exclude_rools;
668 if ( ref $rools eq 'ARRAY' && @$rools) {
669 @exclude_rools = grep { $_->{command} eq 'exclude' && $_->{subcommand} eq 'element' } @$rools;
670 }
671
672 my @ret;
673 101 ahitrov foreach my $tag ( sort { $a->{id} <=> $b->{id} } grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
674 next if $self->__exclude_rools($tag->{parent}, \@exclude_rools);
675 99 ahitrov
676 if ( @rools ) {
677 my $choose = 0;
678 foreach my $rool ( @rools ) {
679 my $matched = 1;
680 foreach my $cond ( @{$rool->{condition}} ) {
681 101 ahitrov unless ( exists $tag->{parent}{params}{$cond->{param}} && $tag->{parent}{params}{$cond->{param}} eq $cond->{value} ) {
682 99 ahitrov $matched = 0;
683 }
684 }
685 $choose ||= $matched;
686 }
687 if ( $choose ) {
688 101 ahitrov for ( $tag->{text} ) {
689 s/^[\t\ \n\r]+//s;
690 s/[\t\ \n\r]+$//s;
691 s/[\t\ ]+/\ /sg;
692 s/\r//sg;
693 s/\n{2,}/\n\n/sg;
694 s/\&\\x(\d+)//sgi;
695 }
696 99 ahitrov
697 my $text = $tag->{text};
698 $text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi;
699 101 ahitrov $text = Contenido::Parser::Util::strip_html($text);
700 $tag->{text_weight} = length($text);
701 99 ahitrov if ( length($text) >= $minimum ) {
702 101 ahitrov for ( $tag->{text} ) {
703 s/<a.*?>//sgi;
704 s/<\/a.*?>//sgi;
705 }
706 push @ret, $tag;
707 99 ahitrov }
708 }
709 } else {
710 101 ahitrov for ( $tag->{text} ) {
711 s/^[\t\ \n\r]+//s;
712 s/[\t\ \n\r]+$//s;
713 s/[\t\ ]+/\ /sg;
714 s/\r//sg;
715 s/\n{2,}/\n\n/sg;
716 }
717 99 ahitrov my $text = $tag->{text};
718 $text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi;
719 101 ahitrov $text = Contenido::Parser::Util::strip_html($text);
720 $tag->{text_weight} = length($text);
721 99 ahitrov if ( length($text) >= $minimum ) {
722 101 ahitrov for ( $tag->{text} ) {
723 s/<a.*?>//sgi;
724 s/<\/a.*?>//sgi;
725 s/\&\\x(\d+)//sgi;
726 }
727 99 ahitrov push @ret, $tag;
728 146 ahitrov # $self->log_elem($tag);
729 99 ahitrov }
730 }
731 }
732 unless ( @ret ) {
733 warn "Nothing was found at all!!! Check your ROOLS or MINIMUM value" if $debug;
734 }
735 107 ahitrov warn "Digging done!\n" if $debug;
736 99 ahitrov return \@ret;
737 }
738
739
740
741 sub __check_headers {
742 my ($self, $chosen, $header, $debug) = @_;
743 return unless ref $chosen eq 'ARRAY';
744
745 unless ( grep { exists $_->{headers} } @$chosen ) {
746 warn "No headers found\n" if $debug;
747 return $chosen;
748 } else {
749 # @$chosen = grep { exists $_->{headers} } @$chosen;
750 }
751 my @ret;
752 foreach my $unit ( @$chosen ) {
753 unless ( exists $unit->{headers} && ref $unit->{headers} eq 'ARRAY' ) {
754 $unit->{header_identity} = 0;
755 $unit->{header_min_level} = 32768;
756 next;
757 }
758 my @headers = sort { $a->{level} <=> $b->{level} } @{$unit->{headers}};
759 my $min_level = $headers[0]->{level};
760 $unit->{header_min_level} = $min_level;
761 if ( $header ) {
762 my $coeff = $self->__str_compare( $header, $headers[0]->{text} );
763 $unit->{header_identity} = $coeff;
764 }
765 }
766 # @ret = sort { $a->{header_min_level} <=> $b->{header_min_level} } @$chosen;
767 # return \@ret;
768 return $chosen;
769 }
770
771
772
773 sub __check_description {
774 my ($self, $chosen, $desc, $debug) = @_;
775 return unless ref $chosen eq 'ARRAY' && $desc;
776
777 my @ret;
778 foreach my $unit ( @$chosen ) {
779 if ( $desc ) {
780 my $coeff = $self->__str_compare( $unit->{text}, $desc );
781 warn "Coeff: [$coeff] to: [$unit->{text}]\n" if $debug;
782 $unit->{description_identity} = $coeff;
783 }
784 }
785 @ret = sort { $b->{description_identity} <=> $a->{description_identity} } grep { $_->{description_identity} > -0.9 } @$chosen;
786 return \@ret;
787 }
788
789
790 # wtf, bastards! how come my code's used here? --ra
791 # damn, it's not 100% your code already
792 sub __str_compare {
793 my ($self, $original, $applicant) = @_;
794
795 my $Al = __freq_list($original);
796 return -1 unless defined $Al;
797 my $Bl = __freq_list($applicant);
798 return -1 unless defined $Bl;
799 my $df = 0;
800
801 foreach my $word ( %$Bl ) {
802 if ( exists $Al->{$word} ) {
803 104 ahitrov $df += $Al->{$word} || 0;
804 99 ahitrov } else {
805 104 ahitrov $df -= $Bl->{$word} || 0;
806 99 ahitrov }
807 }
808
809 return $df;
810 }
811
812 sub __freq_list {
813
814 my @d = grep { length($_) > 3 } split /\W/, $_[0];
815 return undef unless @d;
816 my $z = 1/scalar(@d); my %l = ();
817 $l{$_} += $z for @d; \%l;
818 }
819
820
821
822 sub __strip_html {
823 my ($self, %opts) = @_;
824 return unless ref $opts{chosen} eq 'ARRAY';
825
826 my $chosen = $opts{chosen};
827 my $rooles = $opts{rools};
828 146 ahitrov my $header = $opts{header} || '';
829 my $strip_html = $opts{strip_html};
830 99 ahitrov
831 foreach my $unit ( @$chosen ) {
832 my %tags;
833 my $headers = $unit->{headers} if exists $unit->{headers};
834 if ( ref $headers && ref $rooles eq 'ARRAY' && grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' } @$rooles ) {
835 if ( grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' && $_->{condition}{value} eq 'all' } @$rooles ) {
836 $unit->{text} =~ s/<h(\d)[^>]*>(.*?)<\/h(\d)[^>]*>/\n/sgi;
837 $unit->{text} =~ s/^[\x20\t\r\n]+//si;
838 } elsif ( grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' && $_->{condition}{value} eq 'leading' } @$rooles ) {
839 while ( $unit->{text} =~ /^<h(\d)[^>]*>(.*?)<\/h(\d)[^>]*>/si ) {
840 my $hdr = 'h'.$1;
841 $unit->{text} =~ s/^<$hdr[^>]*>(.*?)<\/$hdr[^>]*>//si;
842 }
843 }
844 }
845 for ( $unit->{text} ) {
846 s/></> </sg;
847 s/([\!\?:.])\s*?<\/h(\d+)(.*?)>/$1 /sgi;
848 s/<\/h(\d+)(.*?)>/\. /sgi;
849 s/<h(\d+)(.*?)>/\n/sgi;
850 s/&#38;/\&/sg;
851 s/&amp;/\&/sgi;
852 s/&#171;/«/sg;
853 s/&#187;/»/sg;
854 s/&#163;/£/sg;
855 s/&#150;/&ndash;/sg;
856 s/&#151;/&mdash;/sg;
857 s/&#133;/\.\.\./sg;
858 s/&#132;/"/sg;
859 s/&#147;/"/sg;
860 s/&#148;/"/sg;
861 s/&#180;/'/sg;
862 s/&#13;/\n/sg;
863 s/&#34;/"/sg;
864 s/&nbsp;/\ /sgi;
865 }
866 # $unit->{text} = HTML::Entities::decode_entities($unit->{text});
867 # $unit->{text} = Contenido::Parser::Util::strip_html($unit->{text});
868 for ( $unit->{text} ) {
869 101 ahitrov s/^[\ \t\r\n]+//si;
870 99 ahitrov s/^(\d+)\.(\d+)\.(\d+)//si;
871 101 ahitrov s/^[\ \t\r\n]+//si;
872 99 ahitrov s/^(\d+):(\d+)//si;
873 101 ahitrov s/^[\ \t\r\n]+//si;
874 99 ahitrov }
875 146 ahitrov if ( $header && lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) {
876 99 ahitrov substr $unit->{text}, 0, length($header), '';
877 101 ahitrov $unit->{text} =~ s/^[\.\ \t\r\n]+//sgi;
878 99 ahitrov }
879 101 ahitrov $unit->{text} =~ s/[\ \t\r\n]+$//sgi;
880 99 ahitrov }
881 }
882
883
884
885 sub __glue {
886 my ($self, $chosen, $glue, $debug) = @_;
887 return unless ref $chosen eq 'ARRAY';
888
889 my $i = 0;
890
891 if ( $glue->{subcommand} eq 'first' || $glue->{subcommand} eq 'all' ) {
892 my $count = exists $glue->{subcommand} && $glue->{subcommand} eq 'first' ? $glue->{condition} : 32768;
893 foreach my $unit ( @$chosen ) {
894 next unless $i++;
895 if ( $i <= $count ) {
896 $chosen->[0]->{text} .= "\n\n".$chosen->[$i-1]->{text};
897 }
898 }
899 } elsif ( $glue->{subcommand} eq 'order' && ref $glue->{condition} eq 'ARRAY' ) {
900 my $text = '';
901 my $i = 0;
902 foreach my $pos ( @{ $glue->{condition} } ) {
903 $text .= ($i++ ? "\n\n" : '').$chosen->[$pos-1]->{text};
904 }
905 $chosen->[0]->{text} = $text;
906 }
907 }
908
909
910 sub __get_images {
911 my ($self, %opts) = @_;
912 my $structure = exists $opts{structure} ? $opts{structure} : undef;
913 my $chosen = exists $opts{chosen} ? $opts{chosen} : undef;
914 my $debug = exists $opts{debug} ? $opts{debug} : undef;
915 my $rools = exists $opts{rools} ? $opts{rools} : undef;
916 my $base_url = delete $opts{base_url};
917 return unless ref $chosen && ref $structure;
918
919 return if ref $rools eq 'ARRAY' && grep { $_->{command} eq 'image_off' } @$rools;
920 my @use_rools;
921 my @exclude_rools;
922 my $no_validation = 0;
923 if ( ref $rools eq 'ARRAY' && @$rools) {
924 @use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'image' } @$rools;
925 @exclude_rools = grep { $_->{command} eq 'exclude' && $_->{subcommand} eq 'image' } @$rools;
926 $no_validation = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'validate' && $_->{condition}{param} eq 'image' } @$rools;
927 }
928 my $image_depth;
929 if ( ref $rools eq 'ARRAY' && @$rools) {
930 my @rools = grep { $_->{command} eq 'image' && $_->{subcommand} eq 'depth' } @$rools;
931 $image_depth = $rools[-1]->{condition} if @rools;
932 }
933
934 my @images;
935 foreach my $tag ( values %$structure ) {
936 next unless exists $tag->{images} && ref $tag->{images} eq 'ARRAY';
937 next if $self->__exclude_rools($tag, \@exclude_rools);
938
939 if ( @use_rools ) {
940 my $choose = 0;
941 foreach my $rool ( @use_rools ) {
942 my $matched = 1;
943 foreach my $cond ( @{$rool->{condition}} ) {
944 unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) {
945 $matched = 0;
946 }
947 }
948 $matched = 0 if $self->__exclude_rools($tag, \@exclude_rools);
949 $choose ||= $matched;
950 }
951 if ( $choose ) {
952 my @img = grep { $no_validation || $self->__img_is_valid ($_) } map {
953 my $img = rchannel::Image->new($_);
954 $img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/;
955 $img;
956 } map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} };
957
958 push @images, @img;
959 }
960 } else {
961 101 ahitrov next if ($tag->{level}+1) < $chosen->{parent}{level};
962 next if $image_depth && ( $tag->{level} > ($chosen->{parent}{level} + $image_depth) );
963 99 ahitrov
964 my $ok = 0;
965 101 ahitrov my $uphops = $tag->{level} > $chosen->{parent}{level} ? 1 : 2;
966 my $hops = $image_depth ? $image_depth : $tag->{level} - $chosen->{parent}{level} + $uphops;
967 99 ahitrov next if ($hops - $uphops) > 4;
968 my @img_parents = ($tag->{id});
969 my $parent = $tag;
970 for ( 1..$hops ) {
971 $parent = $parent->{parent};
972 push @img_parents, $parent->{id};
973 }
974 101 ahitrov $parent = $chosen->{parent}{parent};
975 99 ahitrov for ( 0..$uphops ) {
976 if ( grep { $parent->{id} == $_ } @img_parents ) {
977 $ok = 1;
978 last;
979 }
980 $parent = $parent->{parent};
981 }
982 if ( $ok ) {
983 my @img = grep { $self->__img_is_valid ($_) } map {
984 101 ahitrov my $img = $_;
985 $img->{src} = $base_url.($img->{src} =~ m|^/| ? '' : '/').$img->{src} unless $img->{src} =~ /^http:/;
986 99 ahitrov $img;
987 } map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} };
988
989 push @images, @img;
990 }
991 }
992 }
993 # warn Dumper (\@images);
994 if ( @images ) {
995 return \@images;
996 } else {
997 return undef;
998 }
999 }
1000
1001
1002 sub __img_is_valid {
1003 my ($self, $img) = @_;
1004
1005 101 ahitrov return 1;
1006 99 ahitrov if ( $img->check_online ) {
1007 my $delim = 0;
1008 my $w = $img->width;
1009 my $h = $img->height;
1010 if ( $w && $h ) {
1011 foreach my $pair ( @bad_dimensions ) {
1012 if ($w == $pair->{w} && $h == $pair->{h}) {
1013 return undef;
1014 }
1015 }
1016 $delim = ( $w >= $h ? $w : $h ) / ( $w >= $h ? $h : $w ) unless $delim;
1017 if ( $w < 80 || $h < 80 || $delim > 2.5 ) {
1018 return undef;
1019 }
1020 }
1021 } else {
1022 # warn "Image ".$img->src." not found on server";
1023 return undef;
1024 }
1025 return 1;
1026 }
1027
1028
1029 sub __exclude_rools {
1030 my ($self, $tag, $rools) = @_;
1031 return undef unless ref $rools eq 'ARRAY' && @$rools;
1032
1033 my $choose = 0;
1034 foreach my $rool ( @$rools ) {
1035 my $matched = 1;
1036 foreach my $cond ( @{$rool->{condition}} ) {
1037 unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) {
1038 $matched = 0;
1039 }
1040 }
1041 $choose ||= $matched;
1042 }
1043 return $choose;
1044 }
1045
1046
1047 sub __parse_rools {
1048 my ($self, $rools) = @_;
1049 return unless $rools;
1050 $rools =~ s/\r//sgi;
1051 my @rools = split /\n/, $rools;
1052 return unless @rools;
1053
1054 my @parsed;
1055 foreach my $rool ( @rools ) {
1056 my %pr;
1057 next if $rool =~ /^#/;
1058 $rool =~ s/[\x20\t]+$//;
1059 $rool =~ s/^[\x20\t]+//;
1060 if ( $rool =~ /^([\w']+)\s+(.*)$/ || $rool =~ /^(\w+)(.*)$/ ) {
1061 $pr{command} = lc($1);
1062 my $params = $2;
1063
1064 if ( $pr{command} eq 'cut' && $params =~ /^(\w+)\s+(.*)$/ ) {
1065 $pr{subcommand} = lc($1); $params = $2;
1066 next unless $pr{subcommand} =~ /^(untill|till|from|off|regex|to)$/;
1067 $params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex';
1068 $pr{condition} = $params;
1069 } elsif ( $pr{command} eq 'glue' ) {
1070 if ( $params =~ /^(\w+)\s+(.*)$/ ) {
1071 $pr{subcommand} = $1; $params = $2;
1072 next unless $pr{subcommand} =~ /^(first|all|order)$/;
1073 if ( $pr{subcommand} eq 'order' ) {
1074 my @pars = grep { $_ } map { int($_) } split (/\s*,\s*/,$params);
1075 $pr{condition} = \@pars;
1076 } else {
1077 $pr{condition} = int($1);
1078 }
1079 } elsif ( $params =~ /(\d+)/i ) {
1080 $pr{subcommand} = 'first';
1081 $pr{condition} = int($1);
1082 } else {
1083 $pr{subcommand} = 'all';
1084 }
1085 } elsif ( $pr{command} eq 'trim' ) {
1086 if ( $params =~ /(left|right)/i ) {
1087 $pr{subcommand} = lc($1);
1088 } else {
1089 $pr{subcommand} = 'all';
1090 }
1091 } elsif ( $pr{command} eq 'collaborate' && $params =~ /^(div|td)/i ) {
1092 $pr{condition} = $1;
1093 } elsif ( $pr{command} eq 'image' && $params =~ /^off$/i ) {
1094 $pr{command} = 'image_off';
1095 } elsif ( $pr{command} eq 'image' && $params =~ /^(\w+)\s+(.*)$/si ) {
1096 $pr{subcommand} = lc($1); $params = $2;
1097 next unless $pr{subcommand} =~ /^(depth)$/;
1098 $pr{condition} = $params;
1099 } elsif ( $pr{command} eq 'set' ) {
1100 if ( $params =~ /^(limit)\s+(.*)$/si ) {
1101 $pr{subcommand} = lc($1);
1102 $params = $2;
1103 }
1104 if ( $params =~ /^(\w+)\s+(.*)$/ ) {
1105 $pr{condition} = { param => $1, value => $2 };
1106 } else {
1107 next;
1108 }
1109 } elsif ( $pr{command} eq 'kill' && $params =~ /^(leading|all)\s+(headers)$/ ) {
1110 $pr{command} = 'kill';
1111 $pr{condition} = { param => $2, value => $1 };
1112 } elsif ( $pr{command} eq 'use' && $params =~ /^(title)\s+(as)\s+(description)$/ ) {
1113 $pr{command} = 'set';
1114 $pr{condition} = { param => 'description', value => 'header' };
1115 } elsif ( $pr{command} eq 'use' && $params =~ /^(\w+)\s+(.*)$/ ) {
1116 $pr{subcommand} = $1; $params = $2;
1117 next unless $pr{subcommand} =~ /^(element|elem|image)$/;
1118 $pr{subcommand} = 'element' if $pr{subcommand} =~ /^(element|elem)$/;
1119 my @conditions;
1120 while ( $params =~ /(class|id|name|align)\x20*=\x20*"([^"]+)"/sgi ) {
1121 push @conditions, { param => lc($1), value => $2 }
1122 }
1123 $pr{condition} = \@conditions;
1124 } elsif ( $pr{command} eq 'exclude' && $params =~ /^(\w+)\s+(.*)$/ ) {
1125 $pr{subcommand} = lc($1); $params = $2;
1126 next unless $pr{subcommand} =~ /^(image|elem|element)$/;
1127 $pr{subcommand} = 'element' if $pr{subcommand} =~ /^(element|elem)$/;
1128 my @conditions;
1129 while ( $params =~ /(class|id|name|align)\x20*=\x20*"([^"]+)"/sgi ) {
1130 push @conditions, { param => lc($1), value => $2 }
1131 }
1132 $pr{condition} = \@conditions;
1133 } elsif ( ($pr{command} eq 'dont' || $pr{command} eq "don't") && $params =~ /^(\w+)\s+(.*)$/ ) {
1134 $pr{command} = 'dont';
1135 $pr{subcommand} = lc($1); $params = $2;
1136 next unless $pr{subcommand} =~ /^(cut|validate)$/;
1137 my @conditions;
1138 if ( $params =~ /(tag)\x20*=\x20*"([^"]+)"/sgi ) {
1139 $pr{condition} = { param => lc($1), value => $2 };
1140 } elsif ( $params =~ /(image)/i ) {
1141 $pr{condition} = { param => lc($1) };
1142 } else {
1143 next;
1144 }
1145 } elsif ( $pr{command} eq 'clean' ) {
1146 if ( $params =~ /^(off)\s+(.*)$/ ) {
1147 $pr{subcommand} = lc($1); $params = $2;
1148 }
1149 if ( $params =~ /(tag)\x20*=\x20*"([^"]+)"/sgi ) {
1150 $pr{condition} = { param => lc($1), value => $2 };
1151 } elsif ( $params =~ /(span)/i ) {
1152 $pr{condition} = { param => 'tag', value => lc($1) };
1153 } else {
1154 next;
1155 }
1156 } else {
1157 next;
1158 }
1159 push @parsed, \%pr;
1160 }
1161 }
1162 return ( scalar @parsed ? \@parsed : undef );
1163 }
1164
1165
1166 sub __post_rool {
1167 my ($self, $element, $rools, $description) = @_;
1168
1169 if ( ref $rools eq 'ARRAY' && @$rools ) {
1170 foreach my $rool ( @$rools ) {
1171 if ( $rool->{command} eq 'cut' ) {
1172 my $condition = $rool->{condition};
1173 if ( $rool->{subcommand} eq 'off' ) {
1174 $element->{text} =~ s/$condition//sgi;
1175 } elsif ( $rool->{subcommand} eq 'from' ) {
1176 my $pos = index $element->{text}, $condition;
1177 if ( $pos >= 0 ) {
1178 $element->{text} = substr $element->{text}, 0, $pos;
1179 }
1180 # $element->{text} =~ s/$condition(.*)$//si;
1181 } elsif ( $rool->{subcommand} eq 'to' && $condition eq 'description' && $description ) {
1182 my $str = substr $description, 0, 12;
1183 my $pos = index $element->{text}, $str;
1184 if ( $pos >= 0 ) {
1185 $element->{text} = substr $element->{text}, $pos, -1;
1186 }
1187 } elsif ( $rool->{subcommand} eq 'till' ) {
1188 $element->{text} =~ s/^(.*?)($condition)//si;
1189 } elsif ( $rool->{subcommand} eq 'untill' ) {
1190 $element->{text} =~ s/^(.*?)($condition)/$2/si;
1191 } elsif ( $rool->{subcommand} eq 'regex' ) {
1192 $element->{text} =~ s/$condition//sgi;
1193 }
1194 } elsif ( $rool->{command} eq 'trim' ) {
1195 if ( $rool->{subcommand} eq 'left' ) {
1196 $element->{text} =~ s/^[\x20\xA0\t\n\r]+//sg;
1197 } elsif ( $rool->{subcommand} eq 'right' ) {
1198 $element->{text} =~ s/[\x20\xA0\t\n\r]+$//sg;
1199 } else {
1200 $element->{text} =~ s/^[\x20\xA0\t\n\r]+//sg;
1201 $element->{text} =~ s/[\x20\xA0\t\n\r]+$//sg;
1202 }
1203 }
1204 }
1205 }
1206 }
1207
1208
1209 146 ahitrov sub log_elem {
1210 my $self = shift;
1211 my $elem = shift;
1212 return unless ref $elem eq 'HASH';
1213
1214 my %elem;
1215 map { $elem{$_} = $elem->{$_} } grep { $_ ne 'parent' } keys %$elem;
1216 warn Dumper \%elem;
1217 }
1218
1219
1220 99 ahitrov 1;