Line # Revision Author
1 99 ahitrov package Contenido::Parser::HTML;
2
3 use strict;
4 use warnings;
5 use locale;
6
7 use base 'Contenido::Parser';
8
9 use Contenido::Globals;
10 use Utils::HTML;
11 use Data::Dumper;
12 use utf8;
13 101 ahitrov use Encode;
14 99 ahitrov
15 my @PICNAME = qw ( top menu topmenu home line dot mail razdel button find search srch delivery
16 head bar label phone bottom bottommenu ico icon post left right service caption arr arrow cart
17 basket main reply title corner address page buy pix pixel spacer fon welcome razd about back
18 shapka phones print tel phpBB uho korz korzina raspisanie shop login blank telephone telephones
19 dealer diler background bg news rss index none btn cards up footer noimage but link excel price
20 mid graphic busket map girl space catalog bann headline hosting contact schedule redir email
21 );
22
23 my @PICHOST = qw ( top.list.ru addweb.ru adland.ru extreme-dm.com top100.rambler.ru
24 mypagerank.ru informer.gismeteo.ru lux-bn.com.ua link-txt.com myrating.ljseek.com c.bigmir.net
25 );
26
27 my @PICURL = qw ( rorer counter count ljplus yadro spylog hotlog banner baner ban banners ban
28 icq mirabilis adriver advertising ad adv ads adview advert weather imho awaps reklama stat cnt
29 ipz design icons promo cycounter captcha foto_hit header random adcycle rssfeed bansrc
30 );
31
32
33 my @bad_dimensions = (
34 { w => 120, h => 60 },
35 { w => 468, h => 60 },
36 { w => 120, h => 600 },
37 { w => 88, h => 31 },
38 );
39
40
41 sub new {
42 my ($proto) = @_;
43 my $class = ref($proto) || $proto;
44 my $self = {};
45 bless $self, $class;
46
47 return $self;
48 }
49
50
51 sub parse {
52 my ($self, %opts) = @_;
53
54 my $content;
55 if ( $opts{content} ) {
56 101 ahitrov $content = decode('utf-8', delete $opts{content});
57 99 ahitrov delete $self->{content};
58 } elsif ( $self->{success} || $self->{content} ) {
59 101 ahitrov $content = decode('utf-8', delete $self->{content});
60 99 ahitrov } else {
61 $self->{success} = 0;
62 return $self;
63 }
64
65 my $base_url = delete $self->{base_url} || delete $opts{base_url};
66 my $strip_html = delete $opts{strip_html};
67 my $debug = $DEBUG;
68 my $gui = delete $opts{gui};
69 101 ahitrov my $header = decode('utf-8', delete $opts{header});
70 109 ahitrov warn "Header length: ".length($header || '')."\n" if $debug;
71 101 ahitrov my $description = decode('utf-8', delete $opts{description});
72 109 ahitrov warn "Description length: ".length($description || '')."\n" if $debug;
73 99 ahitrov my $minimum = delete $opts{min} || length $description;
74
75 my $pre_rools = $self->__parse_rools (delete $opts{parser_pre});
76 warn Dumper ($pre_rools) if $debug;
77 my $parse_rools = $self->__parse_rools (delete $opts{parser_run});
78 warn Dumper ($parse_rools) if $debug;
79 my $post_rools = $self->__parse_rools (delete $opts{parser_end});
80 warn Dumper ($post_rools) if $debug;
81
82 # warn "Experimental. Debug!!!\n" if $debug;
83 if ( ref $pre_rools eq 'ARRAY' ) {
84 my @sets = grep { $_->{command} eq 'set' } @$pre_rools;
85 foreach my $set ( @sets ) {
86 if ( $set->{condition}{param} eq 'min' || $set->{condition}{param} eq 'minimum' ) {
87 my $value = $set->{condition}{value};
88 unless ( $value =~ /\D/ ) {
89 if ( $set->{subcommand} eq 'limit' ) {
90 $minimum = $minimum && $minimum > int($value) ? int($value) : $minimum ? $minimum : int($value);
91 } else {
92 $minimum = int($value);
93 }
94 }
95 }
96 if ( $set->{condition}{param} eq 'description' && $set->{condition}{value} eq 'header' ) {
97 $description = $header;
98 }
99 }
100 }
101 $minimum ||= 300;
102
103 warn "Tag cleaning...\n" if $debug;
104 $self->__clean_tags (\$content, $pre_rools);
105 $content =~ s/>\s+</></sgi;
106 warn "Image cleaning...\n" if $debug;
107 $self->__clean_img (\$content);
108 warn "Empty div cleaning...\n" if $debug;
109 while ( $self->__clean_empty_div (\$content) ) {}
110 warn "Make tree...\n" if $debug;
111 my ($tree, $shortcuts) = $self->__make_tree (\$content, $parse_rools, $debug);
112
113 $self->__extract_img ($shortcuts, $base_url, $debug);
114 $self->__extract_headers ($shortcuts, $header, $debug);
115 warn "Getting big texts (min=$minimum)...\n" if $debug;
116 my $chosen = $self->__dig_big_texts (
117 structure => $shortcuts,
118 min => $minimum,
119 ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (),
120 debug => $debug );
121 unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
122 $self->{error_message} = 'Nothing was found at all!!! Check your MINIMUM value';
123 return $self->is_success(0) unless $gui;
124 }
125 if ( $description ) {
126 my @use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$parse_rools if ref $parse_rools eq 'ARRAY';
127 $chosen = $self->__check_description ($chosen, $description, $debug) unless @use_rools;
128 }
129 unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
130 $self->{error_message} = 'I didn\'t find any valuable text';
131 return $self->is_success(0) unless $gui;
132 }
133 if ( scalar @$chosen > 1 ) {
134 $chosen = $self->__check_headers ($chosen, $header, $debug);
135 }
136 unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
137 $self->{error_message} = 'I didn\'t find any valuable text';
138 return $self->is_success(0) unless $gui;
139 }
140 $self->__strip_html (
141 chosen => $chosen,
142 header => $header,
143 ref $post_rools eq 'ARRAY' && @$post_rools ? (rools => $post_rools) : (),
144 debug => $debug
145 );
146 if ( ref $parse_rools eq 'ARRAY' ) {
147 my ($glue) = grep { $_->{command} eq 'glue' } @$parse_rools;
148 $self->__glue ( $chosen, $glue, $debug ) if ref $glue;
149 }
150 107 ahitrov warn "Getting images...\n" if $debug;
151 99 ahitrov my $images = $self->__get_images (
152 structure => $shortcuts,
153 chosen => $chosen->[0],
154 base_url => $base_url,
155 ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (),
156 debug => $debug,
157 );
158 if ( ref $images eq 'ARRAY' && @$images ) {
159 104 ahitrov $self->{images} = $images;
160 $self->{image} = $images->[0];
161 99 ahitrov }
162
163 if ( $gui ) {
164 if ( ref $chosen eq 'ARRAY' ) {
165 foreach my $elem ( @$chosen ) {
166 $self->__post_rool ($elem, $post_rools, $description);
167 }
168 }
169 $self->{text} = ref $chosen eq 'ARRAY' ? $chosen->[0] : $chosen;
170 101 ahitrov # $self->{html} = $content;
171 # $self->{tree} = $shortcuts;
172 $self->{tree} = $tree;
173 99 ahitrov $self->{chosen} = $chosen;
174 } else {
175 $self->__post_rool ($chosen->[0], $post_rools, $description);
176 $self->{text} = Contenido::Parser::Util::text_cleanup($chosen->[0]->{text});
177 101 ahitrov $self->{chosen} = $chosen;
178 map { $_->{parent} = undef } @$chosen if ref $chosen eq 'ARRAY';
179 99 ahitrov $tree = undef;
180 foreach my $key ( keys %$shortcuts ) {
181 delete $shortcuts->{$key};
182 }
183 $shortcuts = undef;
184 $content = undef;
185 }
186 return $self->is_success(1);
187 }
188
189 sub __clean_tags {
190 my ($self, $content, $rools) = @_;
191
192 my @cut_rools;
193 if ( ref $rools eq 'ARRAY' && @$rools) {
194 @cut_rools = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'cut' } @$rools;
195 }
196 my @clean_off_rools;
197 if ( ref $rools eq 'ARRAY' && @$rools) {
198 @clean_off_rools = grep { $_->{command} eq 'clean' && $_->{subcommand} eq 'off' } @$rools;
199 }
200 $$content =~ s/<!DOCTYPE(.*?)>//sgi;
201 $$content =~ s/<!--(.*?)-->//sgi;
202 $$content =~ s/<script(.*?)<\/script>//sgi;
203 $$content =~ s/<hr(.*?)>//sgi;
204 $$content =~ s/<noscript(.*?)<\/noscript>//sgi;
205 $$content =~ s/<iframe(.*?)<\/iframe>//sgi;
206 unless ( grep { $_->{condition}{param} eq 'tag' && $_->{condition}{value} eq 'noindex' } @cut_rools ) {
207 $$content =~ s/<noindex(.*?)<\/noindex>//sgi;
208 } else {
209 $$content =~ s/<\/?noindex(.*?)>//sgi;
210 }
211 $$content =~ s/<object(.*?)<\/object>//sgi;
212 $$content =~ s/<embed(.*?)<\/embed>//sgi;
213 $$content =~ s/<style(.*?)<\/style>//sgi;
214 if ( grep { $_->{condition}{param} eq 'tag' && $_->{condition}{value} eq 'form' } @cut_rools ) {
215 $$content =~ s/<select(.*?)<\/select([^>]*?)>//sgi;
216 $$content =~ s/<textarea(.*?)<\/textarea([^>]*?)>//sgi;
217 $$content =~ s/<input([^>]*?)>//sgi;
218 } else {
219 $$content =~ s/<form(.*?)<\/form>//sgi;
220 104 ahitrov $$content =~ s/<textarea(.*?)<\/textarea([^>]*?)>//sgi;
221 $$content =~ s/<select(.*?)<\/select([^>]*?)>//sgi;
222 99 ahitrov }
223 foreach my $rool ( @clean_off_rools ) {
224 next unless $rool->{condition}{param} eq 'tag';
225 my $tag = $rool->{condition}{value};
226 $$content =~ s/<$tag(.*?)<\/$tag>//sgi;
227 }
228 $$content =~ s/<head(.*?)<\/head>//sgi;
229 $$content =~ s/\ style="(.*?)"//sgi;
230
231 $$content =~ s/<\/?span(.*?)>//sgi;
232 $$content =~ s/<\/?font(.*?)>//sgi;
233 $$content =~ s/<br(.*?)>/\n/sgi;
234 $$content =~ s/<link(.*?)>//sgi;
235 $$content =~ s/<spacer(.*?)>//sgi;
236 $$content =~ s/<\!\?(.*?)\?>//sgi;
237 # $$content =~ s/<a\s*?(.*?)>/\n/sgi;
238 $$content =~ s/<\/p\s*>//sgi;
239 # $$content =~ s/<\/a\s*>//sgi;
240 $$content =~ s/<p\s*(.*?)>/\n\n/sgi;
241 $$content =~ s/onclick="(.*?)"//sgi;
242 $$content =~ s/onload="(.*?)"//sgi;
243
244 }
245
246 sub __clean_img {
247 my ($self, $content) = @_;
248
249 my @garbage;
250 my $i = 1;
251
252 while ( $$content =~ /(<img.*?>)/sgi ) {
253 my $img = $1;
254 my $src;
255 if ( $img =~ /src=([^\x20|^>]+)/i ) {
256 $src = $1;
257 }
258 my ($w, $h);
259 if ( $img =~ /width\s*=\s*["'](\d+)/i || $img =~ /width\s*=\s*(\d+)/i ) {
260 $w = $1;
261 }
262 if ( $img =~ /height\s*=\s*["'](\d+)/i || $img =~ /height\s*=\s*(\d+)/i ) {
263 $h = $1;
264 }
265 my $delim = 0;
266 if ( $w && $h ) {
267 foreach my $pair ( @bad_dimensions ) {
268 if ($w == $pair->{w} && $h == $pair->{h}) {
269 $delim = 10;
270 last;
271 }
272 }
273 $delim = ( $w >= $h ? $w : $h ) / ( $w >= $h ? $h : $w ) unless $delim;
274 }
275 my $bad_name = __check_img_name ( $src );
276 if ( $bad_name || ($w && $w < 80) || ($h && $h < 80) || ( $w && $h && ($delim > 2.5) ) ) {
277 # warn "Bad name: [$src]\n";
278 push @garbage, $src;
279 }
280 }
281
282 foreach my $src (@garbage) {
283 $src =~ s|([*+?()/\\\$\[\]])|\\$1|sg;
284 $$content =~ s/<img([^>]*?)src=$src([^>]+)>//si;
285 }
286 }
287
288
289 sub __check_img_name {
290 my $name = shift;
291 my $test = $1 if $name =~ /\/([^\/]+)$/;
292 107 ahitrov if ( $test && ($test =~ /\d+[x-]\d+/ || $test =~ /\.gif$/i) ) {
293 99 ahitrov return 1;
294 }
295 foreach my $word ( @PICNAME ) {
296 107 ahitrov if ( $test && ($test =~ /^$word/si || $test =~ /[^a-z]$word[^a-z]/si) ) {
297 99 ahitrov return 1;
298 }
299 }
300 foreach my $word ( @PICURL ) {
301 if ( $name =~ /^$word/si || $name =~ /[^a-z]$word[^a-z]/si ) {
302 return 1;
303 }
304 }
305 foreach my $word ( @PICHOST ) {
306 if ( index (lc($name), $word) >= 0 ) {
307 return 1;
308 }
309 }
310 return 0;
311 }
312
313
314 sub __clean_empty_div {
315 my ($self, $content) = @_;
316
317 my $i = 0;
318 while ( $$content =~ s/(<div[^>]*?><\/div\s*>)//sgi ) {
319 $i++;
320 }
321
322 return $i;
323 }
324
325
326 sub __make_tree {
327 my ($self, $content, $rools, $debug) = @_;
328
329 my @elems = split (//,$$content);
330 my @collaborate;
331 if ( ref $rools eq 'ARRAY' && @$rools) {
332 @collaborate = grep { $_->{command} eq 'collaborate' } @$rools;
333 }
334 my %hierarchy = ( div => 0, td => 1, tr => 2, table => 3, body => 4, html => 5 );
335 my $id = 1;
336 my $level = 0;
337 my %tree = (
338 root => {
339 id => $id++,
340 text => '',
341 type => 'root',
342 children=> [],
343 parent => undef,
344 level => $level,
345 },
346 );
347 my @stack;
348 my %elem_hash = ( 1 => $tree{root} );
349 my $current = $tree{root};
350 my $previous;
351
352 while ( @elems ) {
353 if ($elems[0] =~ /[\ \t]/ && !$current){
354 shift @elems;
355 next;
356 }
357 if ( $elems[0] eq '<' && $elems[1] =~ /[a-zA-Z]/ ) {
358 my $tag = $self->__try_tag (\@elems);
359 if ( ref $tag && $tag->{type} eq 'text' ) {
360 my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef;
361 if ( ref $last_text_tag ) {
362 $last_text_tag->{text} .= $tag->{content};
363 $last_text_tag->{count} += $tag->{count};
364 } else {
365 $last_text_tag = $tag;
366 $last_text_tag->{id} = $id++;
367 $last_text_tag->{type} = 'text';
368 $last_text_tag->{parent} = $current;
369 $last_text_tag->{level} = $level+1;
370 101 ahitrov $last_text_tag->{text} = $tag->{content};
371 99 ahitrov $elem_hash{$last_text_tag->{id}} = $last_text_tag;
372 push @{$current->{children}}, $last_text_tag;
373 $current->{text_count}++;
374 }
375 $current->{text_value} += $tag->{count};
376 splice @elems, 0, $tag->{count};
377 101 ahitrov # warn "Tag opened. Next text: [".join('',$elems[0..10])."]\n";
378 99 ahitrov } elsif ( ref $tag ) {
379 if ( ($current->{type} eq 'td' || $current->{type} eq 'tr' ) && $tag->{type} eq 'tr' ) {
380 # warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback till table begin... !!!!\n" if $debug;
381 do {
382 $current = pop @stack;
383 $level = $current->{level};
384 # warn "New current type: /$current->{type}. ID = $current->{id}. Level: $level. Stack depth: ".scalar(@stack)."\n";
385 } while ( ($current->{type} !~ /table|body|html/) && scalar @stack );
386
387 }
388 if ( $current->{type} eq 'table' && $tag->{type} eq 'table' ) {
389 # warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback, previous table(s) will forced to be closed... !!!!\n" if $debug;
390 do {
391 $current = pop @stack;
392 $level = $current->{level};
393 # warn "New current type: /$current->{type}. ID = $current->{id}. Level: $level. Stack depth: ".scalar(@stack)."\n";
394 } while ( ($current->{type} eq "table") && scalar @stack );
395
396 }
397 $tag->{id} = $id++;
398 $tag->{children} = [];
399 $tag->{text_count} = 0;
400 $tag->{parent} = $current;
401 $tag->{level} = ++$level;
402 $elem_hash{$tag->{id}} = $tag;
403 101 ahitrov push @{$current->{children}}, $tag;
404 99 ahitrov push @stack, $current;
405 101 ahitrov # warn "Open type: $tag->{type}. ID=[$tag->{id}]. Name: ".($tag->{params}{name}||'').". Class: ".($tag->{params}{class}||'').". Level: $tag->{level}. Stack depth: ".scalar(@stack)."\n";
406 99 ahitrov $current = $tag;
407 splice @elems, 0, $tag->{count};
408 } else {
409 # warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug;
410 last;
411 }
412 } elsif ( $elems[0] eq '<' && $elems[1] =~ /\// ) {
413 my $tag = $self->__try_end (\@elems);
414 if ( ref $tag && $tag->{type} eq 'text' ) {
415 my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef;
416 if ( ref $last_text_tag ) {
417 $last_text_tag->{text} .= $tag->{content};
418 $last_text_tag->{count} += $tag->{count};
419 } else {
420 $last_text_tag = $tag;
421 $last_text_tag->{id} = $id++;
422 $last_text_tag->{type} = 'text';
423 $last_text_tag->{parent} = $current;
424 101 ahitrov $last_text_tag->{text} = $tag->{content};
425 99 ahitrov $last_text_tag->{level} = $level+1;
426 $elem_hash{$last_text_tag->{id}} = $last_text_tag;
427 push @{$current->{children}}, $last_text_tag;
428 $current->{text_count}++;
429 }
430 $current->{text_value} += $tag->{count};
431 splice @elems, 0, $tag->{count};
432 } elsif ( ref $tag ) {
433 if ( $current->{type} ne $tag->{type} ) {
434 # warn "!!!!Wrong tag type for closing. It's [$tag->{type}]. It must be [$current->{type}]!!!!\n" if $debug;
435 # warn "Current ID: [$current->{id}]. Text place: [".substr($current->{text}, 0, 20)."]\n";
436 if ( $hierarchy{$tag->{type}} > $hierarchy{$current->{type}} ) {
437 do {
438 $current = pop @stack;
439 $level = $current->{level};
440 # warn "New current type: /$current->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n";
441 } while ( ($current->{type} ne $tag->{type}) && scalar @stack );
442 $current = pop @stack;
443 $level = $current->{level};
444 # warn "Close !the right! type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n" if $debug;
445 }else{
446 # warn "Passing by: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n" if $debug;
447 }
448 } else {
449 if ( @collaborate ) {
450 if ( defined $previous && (grep { $current->{type} eq $_->{condition} } @collaborate)
451 && $previous->{type} eq $current->{type} && $previous->{level} == $current->{level}
452 && $previous->{text} && $current->{text} ) {
453 $previous->{text} .= ' '.$current->{text};
454 my $parent = $current->{parent};
455 splice @{$parent->{children}}, -1, 1;
456 delete $elem_hash{$current->{id}};
457 $current = undef;
458 } elsif ( !defined $previous && $current->{text} ) {
459 $previous = $current;
460 } else {
461 $previous = undef;
462 }
463 }
464 $current = pop @stack;
465 $level = $current->{level};
466 # warn "Text place: [".substr($current->{text}, 0, 20)."]\n" if exists $current->{text};
467 # warn "Close type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n";
468 }
469 splice @elems, 0, $tag->{count};
470 } else {
471 # warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug;
472 last;
473 }
474 } else {
475 my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef;
476 if ( ref $last_text_tag ) {
477 $last_text_tag->{text} .= shift @elems;
478 $last_text_tag->{count}++;
479 } else {
480 101 ahitrov $last_text_tag = {};
481 99 ahitrov $last_text_tag->{text} = shift @elems;
482 $last_text_tag->{count} = 1;
483 $last_text_tag->{id} = $id++;
484 $last_text_tag->{type} = 'text';
485 $last_text_tag->{parent} = $current;
486 $last_text_tag->{level} = $level+1;
487 $elem_hash{$last_text_tag->{id}} = $last_text_tag;
488 push @{$current->{children}}, $last_text_tag;
489 $current->{text_count}++;
490 $current->{text_value} = 0;
491 }
492 $current->{text_value}++;
493 }
494 }
495 return (\%tree, \%elem_hash);
496 }
497
498
499 sub __try_tag {
500 my ($self, $content) = @_;
501
502 my $i = 1;
503 my %tag;
504 my $tag = $content->[0];
505 101 ahitrov while ( $i < (scalar @$content - 1) && $content->[$i] ne '<' && $content->[$i] ne '>' ) {
506 99 ahitrov $tag .= $content->[$i];
507 $i++;
508 }
509 if ( $content->[$i] eq '<' || $i >= scalar @$content ) {
510 return {
511 type => 'text',
512 content => $tag,
513 count => $i,
514 };
515 101 ahitrov }
516 $tag .= $content->[$i++];
517 # warn "TAG: [$tag]\n";
518
519 if ( $tag =~ /^<(div|table|tr|td|body|html)\s*(.*)/i ) {
520 my $val = $1;
521 if ( $tag =~ /^<($val)\s*(.*)/i ) {
522 $tag{type} = lc($1);
523 my $args = $2;
524 $tag{count} = $i;
525 my %args;
526 while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?"([^"]+)"/g ) {
527 $args{lc($1)} = $2;
528 99 ahitrov }
529 101 ahitrov while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?'([^']+)'/g ) {
530 $args{lc($1)} = $2;
531 }
532 while ( $tag =~ /([a-zA-z]+)=(\w+)/g ) {
533 $args{lc($1)} = $2;
534 }
535 foreach my $arg ( qw( name id class width align ) ) {
536 $tag{params}{$arg} = $args{$arg} if exists $args{$arg};
537 }
538 return \%tag;
539 } else {
540 99 ahitrov return {
541 type => 'text',
542 content => $tag,
543 count => $i,
544 };
545 }
546 101 ahitrov } else {
547 return {
548 type => 'text',
549 content => $tag,
550 count => $i,
551 };
552 99 ahitrov }
553 }
554
555 sub __try_end {
556 my ($self, $content) = @_;
557
558 my $i = 2;
559 my %tag;
560 my $tag = $content->[0].$content->[1];
561 101 ahitrov while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < (scalar @$content-1) ) {
562 99 ahitrov $tag .= $content->[$i];
563 $i++;
564 }
565 if ( $content->[$i] eq '<' || $i >= scalar @$content ) {
566 return {
567 type => 'text',
568 content => $tag,
569 count => $i,
570 };
571 101 ahitrov }
572 $tag .= $content->[$i++];
573 # warn "TAG END: [$tag]\n";
574 if ( $tag =~ /^<\/(div|table|tr|td|body|html)/i ) {
575 99 ahitrov my $val = $1;
576 101 ahitrov if ( $tag =~ /^<\/($val)[\s>]/i ) {
577 99 ahitrov $tag{type} = lc($1);
578 101 ahitrov $tag{count} = $i;
579 99 ahitrov return \%tag;
580 } else {
581 return {
582 type => 'text',
583 content => $tag,
584 count => $i,
585 };
586 }
587 101 ahitrov } else {
588 99 ahitrov return {
589 type => 'text',
590 content => $tag,
591 count => $i,
592 };
593 }
594 }
595
596
597 sub __extract_img {
598 my ($self, $structure, $base_url, $debug) = @_;
599 return unless ref $structure eq 'HASH';
600
601 101 ahitrov foreach my $tag ( grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
602 99 ahitrov my $text = $tag->{text};
603 while ( $text =~ /<img (.*?)>/sgi ) {
604 # warn "Image for extract_img found [$1]. Tag ID: $tag->{id}\n";
605 my $params = $1;
606 my $img = {};
607 if ( $params =~ /src\x20*?=\x20*?["'](.*?)["']/ || $params =~ /src=([^\x20]+)/ ) {
608 $img->{url} = $1;
609 101 ahitrov $img->{url} =~ s/[\r\t\n\ ]+$//;
610 $img->{url} =~ s/^[\r\t\n\ ]+//;
611 99 ahitrov $img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/;
612 $img->{url} =~ s/\/+/\//sgi;
613 $img->{url} =~ s/http:\//http:\/\//sgi;
614 $img->{w} = $1 if $params =~ /width[\D]+(\d+)/;
615 $img->{h} = $1 if $params =~ /height[\D]+(\d+)/;
616 $img->{alt} = $1 if $params =~ /alt\x20*?=\x20*?["'](.*?)["']/;
617 $tag->{images} = [] unless ref $tag->{images} eq 'ARRAY';
618 push @{ $tag->{images} }, $img;
619 # warn "Image for extract_img stored [$img->{url}]. Tag ID: $tag->{id}\n";
620 }
621 }
622 $text =~ s/<img (.*?)>//sgi;
623 $tag->{text} = $text;
624 $tag->{count} = length ($text);
625 }
626 }
627
628
629 sub __extract_headers {
630 my ($self, $structure, $debug) = @_;
631 return unless ref $structure eq 'HASH';
632
633 101 ahitrov foreach my $tag ( grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
634 99 ahitrov my $text = $tag->{text};
635 while ( $text =~ /<h([\d])[^>]*?>([^<]+)<\/h[\d]>/sgi ) {
636 my $header_level = $1;
637 my $header_text = $2;
638 $tag->{headers} = [] unless ref $tag->{headers} eq 'ARRAY';
639 push @{ $tag->{headers} }, { level => $header_level, text => $header_text };
640 }
641 }
642
643 }
644
645
646 sub __dig_big_texts {
647 my ($self, %opts) = @_;
648 my $structure = exists $opts{structure} ? $opts{structure} : undef;
649 my $minimum = exists $opts{min} ? $opts{min} : undef;
650 my $debug = exists $opts{debug} ? $opts{debug} : undef;
651 my $rools = exists $opts{rools} ? $opts{rools} : undef;
652 return unless ref $structure eq 'HASH';
653
654 my @rools;
655 if ( ref $rools eq 'ARRAY' && @$rools) {
656 @rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$rools;
657 }
658 my @exclude_rools;
659 if ( ref $rools eq 'ARRAY' && @$rools) {
660 @exclude_rools = grep { $_->{command} eq 'exclude' && $_->{subcommand} eq 'element' } @$rools;
661 }
662
663 my @ret;
664 101 ahitrov foreach my $tag ( sort { $a->{id} <=> $b->{id} } grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
665 next if $self->__exclude_rools($tag->{parent}, \@exclude_rools);
666 99 ahitrov
667 if ( @rools ) {
668 my $choose = 0;
669 foreach my $rool ( @rools ) {
670 my $matched = 1;
671 foreach my $cond ( @{$rool->{condition}} ) {
672 101 ahitrov unless ( exists $tag->{parent}{params}{$cond->{param}} && $tag->{parent}{params}{$cond->{param}} eq $cond->{value} ) {
673 99 ahitrov $matched = 0;
674 }
675 }
676 $choose ||= $matched;
677 }
678 if ( $choose ) {
679 101 ahitrov for ( $tag->{text} ) {
680 s/^[\t\ \n\r]+//s;
681 s/[\t\ \n\r]+$//s;
682 s/[\t\ ]+/\ /sg;
683 s/\r//sg;
684 s/\n{2,}/\n\n/sg;
685 s/\&\\x(\d+)//sgi;
686 }
687 99 ahitrov
688 my $text = $tag->{text};
689 $text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi;
690 101 ahitrov $text = Contenido::Parser::Util::strip_html($text);
691 $tag->{text_weight} = length($text);
692 99 ahitrov if ( length($text) >= $minimum ) {
693 101 ahitrov for ( $tag->{text} ) {
694 s/<a.*?>//sgi;
695 s/<\/a.*?>//sgi;
696 }
697 push @ret, $tag;
698 99 ahitrov }
699 }
700 } else {
701 101 ahitrov for ( $tag->{text} ) {
702 s/^[\t\ \n\r]+//s;
703 s/[\t\ \n\r]+$//s;
704 s/[\t\ ]+/\ /sg;
705 s/\r//sg;
706 s/\n{2,}/\n\n/sg;
707 }
708 99 ahitrov my $text = $tag->{text};
709 $text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi;
710 101 ahitrov $text = Contenido::Parser::Util::strip_html($text);
711 $tag->{text_weight} = length($text);
712 99 ahitrov if ( length($text) >= $minimum ) {
713 101 ahitrov for ( $tag->{text} ) {
714 s/<a.*?>//sgi;
715 s/<\/a.*?>//sgi;
716 s/\&\\x(\d+)//sgi;
717 }
718 99 ahitrov push @ret, $tag;
719 }
720 }
721 }
722 unless ( @ret ) {
723 warn "Nothing was found at all!!! Check your ROOLS or MINIMUM value" if $debug;
724 }
725 107 ahitrov warn "Digging done!\n" if $debug;
726 99 ahitrov return \@ret;
727 }
728
729
730
731 sub __check_headers {
732 my ($self, $chosen, $header, $debug) = @_;
733 return unless ref $chosen eq 'ARRAY';
734
735 unless ( grep { exists $_->{headers} } @$chosen ) {
736 warn "No headers found\n" if $debug;
737 return $chosen;
738 } else {
739 # @$chosen = grep { exists $_->{headers} } @$chosen;
740 }
741 my @ret;
742 foreach my $unit ( @$chosen ) {
743 unless ( exists $unit->{headers} && ref $unit->{headers} eq 'ARRAY' ) {
744 $unit->{header_identity} = 0;
745 $unit->{header_min_level} = 32768;
746 next;
747 }
748 my @headers = sort { $a->{level} <=> $b->{level} } @{$unit->{headers}};
749 my $min_level = $headers[0]->{level};
750 $unit->{header_min_level} = $min_level;
751 if ( $header ) {
752 my $coeff = $self->__str_compare( $header, $headers[0]->{text} );
753 $unit->{header_identity} = $coeff;
754 }
755 }
756 # @ret = sort { $a->{header_min_level} <=> $b->{header_min_level} } @$chosen;
757 # return \@ret;
758 return $chosen;
759 }
760
761
762
763 sub __check_description {
764 my ($self, $chosen, $desc, $debug) = @_;
765 return unless ref $chosen eq 'ARRAY' && $desc;
766
767 my @ret;
768 foreach my $unit ( @$chosen ) {
769 if ( $desc ) {
770 my $coeff = $self->__str_compare( $unit->{text}, $desc );
771 warn "Coeff: [$coeff] to: [$unit->{text}]\n" if $debug;
772 $unit->{description_identity} = $coeff;
773 }
774 }
775 @ret = sort { $b->{description_identity} <=> $a->{description_identity} } grep { $_->{description_identity} > -0.9 } @$chosen;
776 return \@ret;
777 }
778
779
780 # wtf, bastards! how come my code's used here? --ra
781 # damn, it's not 100% your code already
782 sub __str_compare {
783 my ($self, $original, $applicant) = @_;
784
785 my $Al = __freq_list($original);
786 return -1 unless defined $Al;
787 my $Bl = __freq_list($applicant);
788 return -1 unless defined $Bl;
789 my $df = 0;
790
791 foreach my $word ( %$Bl ) {
792 if ( exists $Al->{$word} ) {
793 104 ahitrov $df += $Al->{$word} || 0;
794 99 ahitrov } else {
795 104 ahitrov $df -= $Bl->{$word} || 0;
796 99 ahitrov }
797 }
798
799 return $df;
800 }
801
802 sub __freq_list {
803
804 my @d = grep { length($_) > 3 } split /\W/, $_[0];
805 return undef unless @d;
806 my $z = 1/scalar(@d); my %l = ();
807 $l{$_} += $z for @d; \%l;
808 }
809
810
811
812 sub __strip_html {
813 my ($self, %opts) = @_;
814 return unless ref $opts{chosen} eq 'ARRAY';
815
816 my $chosen = $opts{chosen};
817 my $rooles = $opts{rools};
818 my $header = $opts{header};
819
820 foreach my $unit ( @$chosen ) {
821 my %tags;
822 my $headers = $unit->{headers} if exists $unit->{headers};
823 if ( ref $headers && ref $rooles eq 'ARRAY' && grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' } @$rooles ) {
824 if ( grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' && $_->{condition}{value} eq 'all' } @$rooles ) {
825 $unit->{text} =~ s/<h(\d)[^>]*>(.*?)<\/h(\d)[^>]*>/\n/sgi;
826 $unit->{text} =~ s/^[\x20\t\r\n]+//si;
827 } elsif ( grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' && $_->{condition}{value} eq 'leading' } @$rooles ) {
828 while ( $unit->{text} =~ /^<h(\d)[^>]*>(.*?)<\/h(\d)[^>]*>/si ) {
829 my $hdr = 'h'.$1;
830 $unit->{text} =~ s/^<$hdr[^>]*>(.*?)<\/$hdr[^>]*>//si;
831 }
832 }
833 }
834 for ( $unit->{text} ) {
835 s/></> </sg;
836 s/([\!\?:.])\s*?<\/h(\d+)(.*?)>/$1 /sgi;
837 s/<\/h(\d+)(.*?)>/\. /sgi;
838 s/<h(\d+)(.*?)>/\n/sgi;
839 s/&#38;/\&/sg;
840 s/&amp;/\&/sgi;
841 s/&#171;/«/sg;
842 s/&#187;/»/sg;
843 s/&#163;/£/sg;
844 s/&#150;/&ndash;/sg;
845 s/&#151;/&mdash;/sg;
846 s/&#133;/\.\.\./sg;
847 s/&#132;/"/sg;
848 s/&#147;/"/sg;
849 s/&#148;/"/sg;
850 s/&#180;/'/sg;
851 s/&#13;/\n/sg;
852 s/&#34;/"/sg;
853 s/&nbsp;/\ /sgi;
854 }
855 # $unit->{text} = HTML::Entities::decode_entities($unit->{text});
856 # $unit->{text} = Contenido::Parser::Util::strip_html($unit->{text});
857 for ( $unit->{text} ) {
858 101 ahitrov s/^[\ \t\r\n]+//si;
859 99 ahitrov s/^(\d+)\.(\d+)\.(\d+)//si;
860 101 ahitrov s/^[\ \t\r\n]+//si;
861 99 ahitrov s/^(\d+):(\d+)//si;
862 101 ahitrov s/^[\ \t\r\n]+//si;
863 99 ahitrov }
864 if ( lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) {
865 substr $unit->{text}, 0, length($header), '';
866 101 ahitrov $unit->{text} =~ s/^[\.\ \t\r\n]+//sgi;
867 99 ahitrov }
868 101 ahitrov $unit->{text} =~ s/[\ \t\r\n]+$//sgi;
869 99 ahitrov }
870 }
871
872
873
874 sub __glue {
875 my ($self, $chosen, $glue, $debug) = @_;
876 return unless ref $chosen eq 'ARRAY';
877
878 my $i = 0;
879
880 if ( $glue->{subcommand} eq 'first' || $glue->{subcommand} eq 'all' ) {
881 my $count = exists $glue->{subcommand} && $glue->{subcommand} eq 'first' ? $glue->{condition} : 32768;
882 foreach my $unit ( @$chosen ) {
883 next unless $i++;
884 if ( $i <= $count ) {
885 $chosen->[0]->{text} .= "\n\n".$chosen->[$i-1]->{text};
886 }
887 }
888 } elsif ( $glue->{subcommand} eq 'order' && ref $glue->{condition} eq 'ARRAY' ) {
889 my $text = '';
890 my $i = 0;
891 foreach my $pos ( @{ $glue->{condition} } ) {
892 $text .= ($i++ ? "\n\n" : '').$chosen->[$pos-1]->{text};
893 }
894 $chosen->[0]->{text} = $text;
895 }
896 }
897
898
899 sub __get_images {
900 my ($self, %opts) = @_;
901 my $structure = exists $opts{structure} ? $opts{structure} : undef;
902 my $chosen = exists $opts{chosen} ? $opts{chosen} : undef;
903 my $debug = exists $opts{debug} ? $opts{debug} : undef;
904 my $rools = exists $opts{rools} ? $opts{rools} : undef;
905 my $base_url = delete $opts{base_url};
906 return unless ref $chosen && ref $structure;
907
908 return if ref $rools eq 'ARRAY' && grep { $_->{command} eq 'image_off' } @$rools;
909 my @use_rools;
910 my @exclude_rools;
911 my $no_validation = 0;
912 if ( ref $rools eq 'ARRAY' && @$rools) {
913 @use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'image' } @$rools;
914 @exclude_rools = grep { $_->{command} eq 'exclude' && $_->{subcommand} eq 'image' } @$rools;
915 $no_validation = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'validate' && $_->{condition}{param} eq 'image' } @$rools;
916 }
917 my $image_depth;
918 if ( ref $rools eq 'ARRAY' && @$rools) {
919 my @rools = grep { $_->{command} eq 'image' && $_->{subcommand} eq 'depth' } @$rools;
920 $image_depth = $rools[-1]->{condition} if @rools;
921 }
922
923 my @images;
924 foreach my $tag ( values %$structure ) {
925 next unless exists $tag->{images} && ref $tag->{images} eq 'ARRAY';
926 next if $self->__exclude_rools($tag, \@exclude_rools);
927
928 if ( @use_rools ) {
929 my $choose = 0;
930 foreach my $rool ( @use_rools ) {
931 my $matched = 1;
932 foreach my $cond ( @{$rool->{condition}} ) {
933 unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) {
934 $matched = 0;
935 }
936 }
937 $matched = 0 if $self->__exclude_rools($tag, \@exclude_rools);
938 $choose ||= $matched;
939 }
940 if ( $choose ) {
941 my @img = grep { $no_validation || $self->__img_is_valid ($_) } map {
942 my $img = rchannel::Image->new($_);
943 $img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/;
944 $img;
945 } map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} };
946
947 push @images, @img;
948 }
949 } else {
950 101 ahitrov next if ($tag->{level}+1) < $chosen->{parent}{level};
951 next if $image_depth && ( $tag->{level} > ($chosen->{parent}{level} + $image_depth) );
952 99 ahitrov
953 my $ok = 0;
954 101 ahitrov my $uphops = $tag->{level} > $chosen->{parent}{level} ? 1 : 2;
955 my $hops = $image_depth ? $image_depth : $tag->{level} - $chosen->{parent}{level} + $uphops;
956 99 ahitrov next if ($hops - $uphops) > 4;
957 my @img_parents = ($tag->{id});
958 my $parent = $tag;
959 for ( 1..$hops ) {
960 $parent = $parent->{parent};
961 push @img_parents, $parent->{id};
962 }
963 101 ahitrov $parent = $chosen->{parent}{parent};
964 99 ahitrov for ( 0..$uphops ) {
965 if ( grep { $parent->{id} == $_ } @img_parents ) {
966 $ok = 1;
967 last;
968 }
969 $parent = $parent->{parent};
970 }
971 if ( $ok ) {
972 my @img = grep { $self->__img_is_valid ($_) } map {
973 101 ahitrov my $img = $_;
974 $img->{src} = $base_url.($img->{src} =~ m|^/| ? '' : '/').$img->{src} unless $img->{src} =~ /^http:/;
975 99 ahitrov $img;
976 } map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} };
977
978 push @images, @img;
979 }
980 }
981 }
982 # warn Dumper (\@images);
983 if ( @images ) {
984 return \@images;
985 } else {
986 return undef;
987 }
988 }
989
990
991 sub __img_is_valid {
992 my ($self, $img) = @_;
993
994 101 ahitrov return 1;
995 99 ahitrov if ( $img->check_online ) {
996 my $delim = 0;
997 my $w = $img->width;
998 my $h = $img->height;
999 if ( $w && $h ) {
1000 foreach my $pair ( @bad_dimensions ) {
1001 if ($w == $pair->{w} && $h == $pair->{h}) {
1002 return undef;
1003 }
1004 }
1005 $delim = ( $w >= $h ? $w : $h ) / ( $w >= $h ? $h : $w ) unless $delim;
1006 if ( $w < 80 || $h < 80 || $delim > 2.5 ) {
1007 return undef;
1008 }
1009 }
1010 } else {
1011 # warn "Image ".$img->src." not found on server";
1012 return undef;
1013 }
1014 return 1;
1015 }
1016
1017
1018 sub __exclude_rools {
1019 my ($self, $tag, $rools) = @_;
1020 return undef unless ref $rools eq 'ARRAY' && @$rools;
1021
1022 my $choose = 0;
1023 foreach my $rool ( @$rools ) {
1024 my $matched = 1;
1025 foreach my $cond ( @{$rool->{condition}} ) {
1026 unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) {
1027 $matched = 0;
1028 }
1029 }
1030 $choose ||= $matched;
1031 }
1032 return $choose;
1033 }
1034
1035
1036 sub __parse_rools {
1037 my ($self, $rools) = @_;
1038 return unless $rools;
1039 $rools =~ s/\r//sgi;
1040 my @rools = split /\n/, $rools;
1041 return unless @rools;
1042
1043 my @parsed;
1044 foreach my $rool ( @rools ) {
1045 my %pr;
1046 next if $rool =~ /^#/;
1047 $rool =~ s/[\x20\t]+$//;
1048 $rool =~ s/^[\x20\t]+//;
1049 if ( $rool =~ /^([\w']+)\s+(.*)$/ || $rool =~ /^(\w+)(.*)$/ ) {
1050 $pr{command} = lc($1);
1051 my $params = $2;
1052
1053 if ( $pr{command} eq 'cut' && $params =~ /^(\w+)\s+(.*)$/ ) {
1054 $pr{subcommand} = lc($1); $params = $2;
1055 next unless $pr{subcommand} =~ /^(untill|till|from|off|regex|to)$/;
1056 $params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex';
1057 $pr{condition} = $params;
1058 } elsif ( $pr{command} eq 'glue' ) {
1059 if ( $params =~ /^(\w+)\s+(.*)$/ ) {
1060 $pr{subcommand} = $1; $params = $2;
1061 next unless $pr{subcommand} =~ /^(first|all|order)$/;
1062 if ( $pr{subcommand} eq 'order' ) {
1063 my @pars = grep { $_ } map { int($_) } split (/\s*,\s*/,$params);
1064 $pr{condition} = \@pars;
1065 } else {
1066 $pr{condition} = int($1);
1067 }
1068 } elsif ( $params =~ /(\d+)/i ) {
1069 $pr{subcommand} = 'first';
1070 $pr{condition} = int($1);
1071 } else {
1072 $pr{subcommand} = 'all';
1073 }
1074 } elsif ( $pr{command} eq 'trim' ) {
1075 if ( $params =~ /(left|right)/i ) {
1076 $pr{subcommand} = lc($1);
1077 } else {
1078 $pr{subcommand} = 'all';
1079 }
1080 } elsif ( $pr{command} eq 'collaborate' && $params =~ /^(div|td)/i ) {
1081 $pr{condition} = $1;
1082 } elsif ( $pr{command} eq 'image' && $params =~ /^off$/i ) {
1083 $pr{command} = 'image_off';
1084 } elsif ( $pr{command} eq 'image' && $params =~ /^(\w+)\s+(.*)$/si ) {
1085 $pr{subcommand} = lc($1); $params = $2;
1086 next unless $pr{subcommand} =~ /^(depth)$/;
1087 $pr{condition} = $params;
1088 } elsif ( $pr{command} eq 'set' ) {
1089 if ( $params =~ /^(limit)\s+(.*)$/si ) {
1090 $pr{subcommand} = lc($1);
1091 $params = $2;
1092 }
1093 if ( $params =~ /^(\w+)\s+(.*)$/ ) {
1094 $pr{condition} = { param => $1, value => $2 };
1095 } else {
1096 next;
1097 }
1098 } elsif ( $pr{command} eq 'kill' && $params =~ /^(leading|all)\s+(headers)$/ ) {
1099 $pr{command} = 'kill';
1100 $pr{condition} = { param => $2, value => $1 };
1101 } elsif ( $pr{command} eq 'use' && $params =~ /^(title)\s+(as)\s+(description)$/ ) {
1102 $pr{command} = 'set';
1103 $pr{condition} = { param => 'description', value => 'header' };
1104 } elsif ( $pr{command} eq 'use' && $params =~ /^(\w+)\s+(.*)$/ ) {
1105 $pr{subcommand} = $1; $params = $2;
1106 next unless $pr{subcommand} =~ /^(element|elem|image)$/;
1107 $pr{subcommand} = 'element' if $pr{subcommand} =~ /^(element|elem)$/;
1108 my @conditions;
1109 while ( $params =~ /(class|id|name|align)\x20*=\x20*"([^"]+)"/sgi ) {
1110 push @conditions, { param => lc($1), value => $2 }
1111 }
1112 $pr{condition} = \@conditions;
1113 } elsif ( $pr{command} eq 'exclude' && $params =~ /^(\w+)\s+(.*)$/ ) {
1114 $pr{subcommand} = lc($1); $params = $2;
1115 next unless $pr{subcommand} =~ /^(image|elem|element)$/;
1116 $pr{subcommand} = 'element' if $pr{subcommand} =~ /^(element|elem)$/;
1117 my @conditions;
1118 while ( $params =~ /(class|id|name|align)\x20*=\x20*"([^"]+)"/sgi ) {
1119 push @conditions, { param => lc($1), value => $2 }
1120 }
1121 $pr{condition} = \@conditions;
1122 } elsif ( ($pr{command} eq 'dont' || $pr{command} eq "don't") && $params =~ /^(\w+)\s+(.*)$/ ) {
1123 $pr{command} = 'dont';
1124 $pr{subcommand} = lc($1); $params = $2;
1125 next unless $pr{subcommand} =~ /^(cut|validate)$/;
1126 my @conditions;
1127 if ( $params =~ /(tag)\x20*=\x20*"([^"]+)"/sgi ) {
1128 $pr{condition} = { param => lc($1), value => $2 };
1129 } elsif ( $params =~ /(image)/i ) {
1130 $pr{condition} = { param => lc($1) };
1131 } else {
1132 next;
1133 }
1134 } elsif ( $pr{command} eq 'clean' ) {
1135 if ( $params =~ /^(off)\s+(.*)$/ ) {
1136 $pr{subcommand} = lc($1); $params = $2;
1137 }
1138 if ( $params =~ /(tag)\x20*=\x20*"([^"]+)"/sgi ) {
1139 $pr{condition} = { param => lc($1), value => $2 };
1140 } elsif ( $params =~ /(span)/i ) {
1141 $pr{condition} = { param => 'tag', value => lc($1) };
1142 } else {
1143 next;
1144 }
1145 } else {
1146 next;
1147 }
1148 push @parsed, \%pr;
1149 }
1150 }
1151 return ( scalar @parsed ? \@parsed : undef );
1152 }
1153
1154
1155 sub __post_rool {
1156 my ($self, $element, $rools, $description) = @_;
1157
1158 if ( ref $rools eq 'ARRAY' && @$rools ) {
1159 foreach my $rool ( @$rools ) {
1160 if ( $rool->{command} eq 'cut' ) {
1161 my $condition = $rool->{condition};
1162 if ( $rool->{subcommand} eq 'off' ) {
1163 $element->{text} =~ s/$condition//sgi;
1164 } elsif ( $rool->{subcommand} eq 'from' ) {
1165 my $pos = index $element->{text}, $condition;
1166 if ( $pos >= 0 ) {
1167 $element->{text} = substr $element->{text}, 0, $pos;
1168 }
1169 # $element->{text} =~ s/$condition(.*)$//si;
1170 } elsif ( $rool->{subcommand} eq 'to' && $condition eq 'description' && $description ) {
1171 my $str = substr $description, 0, 12;
1172 my $pos = index $element->{text}, $str;
1173 if ( $pos >= 0 ) {
1174 $element->{text} = substr $element->{text}, $pos, -1;
1175 }
1176 } elsif ( $rool->{subcommand} eq 'till' ) {
1177 $element->{text} =~ s/^(.*?)($condition)//si;
1178 } elsif ( $rool->{subcommand} eq 'untill' ) {
1179 $element->{text} =~ s/^(.*?)($condition)/$2/si;
1180 } elsif ( $rool->{subcommand} eq 'regex' ) {
1181 $element->{text} =~ s/$condition//sgi;
1182 }
1183 } elsif ( $rool->{command} eq 'trim' ) {
1184 if ( $rool->{subcommand} eq 'left' ) {
1185 $element->{text} =~ s/^[\x20\xA0\t\n\r]+//sg;
1186 } elsif ( $rool->{subcommand} eq 'right' ) {
1187 $element->{text} =~ s/[\x20\xA0\t\n\r]+$//sg;
1188 } else {
1189 $element->{text} =~ s/^[\x20\xA0\t\n\r]+//sg;
1190 $element->{text} =~ s/[\x20\xA0\t\n\r]+$//sg;
1191 }
1192 }
1193 }
1194 }
1195 }
1196
1197
1198 1;