1 |
99 |
ahitrov |
package Contenido::Parser::HTML; |
2 |
|
|
|
3 |
|
|
use strict; |
4 |
|
|
use warnings; |
5 |
|
|
use locale; |
6 |
|
|
|
7 |
|
|
use base 'Contenido::Parser'; |
8 |
|
|
|
9 |
|
|
use Contenido::Globals; |
10 |
|
|
use Utils::HTML; |
11 |
|
|
use Data::Dumper; |
12 |
|
|
use utf8; |
13 |
|
|
|
14 |
|
|
|
15 |
|
|
my @PICNAME = qw ( top menu topmenu home line dot mail razdel button find search srch delivery |
16 |
|
|
head bar label phone bottom bottommenu ico icon post left right service caption arr arrow cart |
17 |
|
|
basket main reply title corner address page buy pix pixel spacer fon welcome razd about back |
18 |
|
|
shapka phones print tel phpBB uho korz korzina raspisanie shop login blank telephone telephones |
19 |
|
|
dealer diler background bg news rss index none btn cards up footer noimage but link excel price |
20 |
|
|
mid graphic busket map girl space catalog bann headline hosting contact schedule redir email |
21 |
|
|
); |
22 |
|
|
|
23 |
|
|
my @PICHOST = qw ( top.list.ru addweb.ru adland.ru extreme-dm.com top100.rambler.ru |
24 |
|
|
mypagerank.ru informer.gismeteo.ru lux-bn.com.ua link-txt.com myrating.ljseek.com c.bigmir.net |
25 |
|
|
); |
26 |
|
|
|
27 |
|
|
my @PICURL = qw ( rorer counter count ljplus yadro spylog hotlog banner baner ban banners ban |
28 |
|
|
icq mirabilis adriver advertising ad adv ads adview advert weather imho awaps reklama stat cnt |
29 |
|
|
ipz design icons promo cycounter captcha foto_hit header random adcycle rssfeed bansrc |
30 |
|
|
); |
31 |
|
|
|
32 |
|
|
|
33 |
|
|
my @bad_dimensions = ( |
34 |
|
|
{ w => 120, h => 60 }, |
35 |
|
|
{ w => 468, h => 60 }, |
36 |
|
|
{ w => 120, h => 600 }, |
37 |
|
|
{ w => 88, h => 31 }, |
38 |
|
|
); |
39 |
|
|
|
40 |
|
|
|
41 |
|
|
sub new { |
42 |
|
|
my ($proto) = @_; |
43 |
|
|
my $class = ref($proto) || $proto; |
44 |
|
|
my $self = {}; |
45 |
|
|
bless $self, $class; |
46 |
|
|
|
47 |
|
|
return $self; |
48 |
|
|
} |
49 |
|
|
|
50 |
|
|
|
51 |
|
|
sub parse { |
52 |
|
|
my ($self, %opts) = @_; |
53 |
|
|
|
54 |
|
|
my $content; |
55 |
|
|
if ( $opts{content} ) { |
56 |
|
|
$content = delete $opts{content}; |
57 |
|
|
delete $self->{content}; |
58 |
|
|
} elsif ( $self->{success} || $self->{content} ) { |
59 |
|
|
$content = delete $self->{content}; |
60 |
|
|
} else { |
61 |
|
|
$self->{success} = 0; |
62 |
|
|
return $self; |
63 |
|
|
} |
64 |
|
|
|
65 |
|
|
my $base_url = delete $self->{base_url} || delete $opts{base_url}; |
66 |
|
|
my $strip_html = delete $opts{strip_html}; |
67 |
|
|
my $debug = $DEBUG; |
68 |
|
|
my $gui = delete $opts{gui}; |
69 |
|
|
my $header = delete $opts{header}; |
70 |
|
|
my $description = delete $opts{description}; |
71 |
|
|
my $minimum = delete $opts{min} || length $description; |
72 |
|
|
|
73 |
|
|
my $pre_rools = $self->__parse_rools (delete $opts{parser_pre}); |
74 |
|
|
warn Dumper ($pre_rools) if $debug; |
75 |
|
|
my $parse_rools = $self->__parse_rools (delete $opts{parser_run}); |
76 |
|
|
warn Dumper ($parse_rools) if $debug; |
77 |
|
|
my $post_rools = $self->__parse_rools (delete $opts{parser_end}); |
78 |
|
|
warn Dumper ($post_rools) if $debug; |
79 |
|
|
|
80 |
|
|
# warn "Experimental. Debug!!!\n" if $debug; |
81 |
|
|
if ( ref $pre_rools eq 'ARRAY' ) { |
82 |
|
|
my @sets = grep { $_->{command} eq 'set' } @$pre_rools; |
83 |
|
|
foreach my $set ( @sets ) { |
84 |
|
|
if ( $set->{condition}{param} eq 'min' || $set->{condition}{param} eq 'minimum' ) { |
85 |
|
|
my $value = $set->{condition}{value}; |
86 |
|
|
unless ( $value =~ /\D/ ) { |
87 |
|
|
if ( $set->{subcommand} eq 'limit' ) { |
88 |
|
|
$minimum = $minimum && $minimum > int($value) ? int($value) : $minimum ? $minimum : int($value); |
89 |
|
|
} else { |
90 |
|
|
$minimum = int($value); |
91 |
|
|
} |
92 |
|
|
} |
93 |
|
|
} |
94 |
|
|
if ( $set->{condition}{param} eq 'description' && $set->{condition}{value} eq 'header' ) { |
95 |
|
|
$description = $header; |
96 |
|
|
} |
97 |
|
|
} |
98 |
|
|
} |
99 |
|
|
$minimum ||= 300; |
100 |
|
|
|
101 |
|
|
warn "Tag cleaning...\n" if $debug; |
102 |
|
|
$self->__clean_tags (\$content, $pre_rools); |
103 |
|
|
$content =~ s/>\s+</></sgi; |
104 |
|
|
warn "Image cleaning...\n" if $debug; |
105 |
|
|
$self->__clean_img (\$content); |
106 |
|
|
warn "Empty div cleaning...\n" if $debug; |
107 |
|
|
while ( $self->__clean_empty_div (\$content) ) {} |
108 |
|
|
warn "Make tree...\n" if $debug; |
109 |
|
|
my ($tree, $shortcuts) = $self->__make_tree (\$content, $parse_rools, $debug); |
110 |
|
|
|
111 |
|
|
$self->__extract_img ($shortcuts, $base_url, $debug); |
112 |
|
|
$self->__extract_headers ($shortcuts, $header, $debug); |
113 |
|
|
warn "Getting big texts (min=$minimum)...\n" if $debug; |
114 |
|
|
my $chosen = $self->__dig_big_texts ( |
115 |
|
|
structure => $shortcuts, |
116 |
|
|
min => $minimum, |
117 |
|
|
ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (), |
118 |
|
|
debug => $debug ); |
119 |
|
|
unless ( ref $chosen eq 'ARRAY' && @$chosen ) { |
120 |
|
|
$self->{error_message} = 'Nothing was found at all!!! Check your MINIMUM value'; |
121 |
|
|
return $self->is_success(0) unless $gui; |
122 |
|
|
} |
123 |
|
|
@$chosen = sort { $a->{id} <=> $b->{id} } @$chosen; |
124 |
|
|
if ( $description ) { |
125 |
|
|
my @use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$parse_rools if ref $parse_rools eq 'ARRAY'; |
126 |
|
|
$chosen = $self->__check_description ($chosen, $description, $debug) unless @use_rools; |
127 |
|
|
} |
128 |
|
|
unless ( ref $chosen eq 'ARRAY' && @$chosen ) { |
129 |
|
|
$self->{error_message} = 'I didn\'t find any valuable text'; |
130 |
|
|
return $self->is_success(0) unless $gui; |
131 |
|
|
} |
132 |
|
|
if ( scalar @$chosen > 1 ) { |
133 |
|
|
$chosen = $self->__check_headers ($chosen, $header, $debug); |
134 |
|
|
} |
135 |
|
|
unless ( ref $chosen eq 'ARRAY' && @$chosen ) { |
136 |
|
|
$self->{error_message} = 'I didn\'t find any valuable text'; |
137 |
|
|
return $self->is_success(0) unless $gui; |
138 |
|
|
} |
139 |
|
|
$self->__strip_html ( |
140 |
|
|
chosen => $chosen, |
141 |
|
|
header => $header, |
142 |
|
|
ref $post_rools eq 'ARRAY' && @$post_rools ? (rools => $post_rools) : (), |
143 |
|
|
debug => $debug |
144 |
|
|
); |
145 |
|
|
if ( ref $parse_rools eq 'ARRAY' ) { |
146 |
|
|
my ($glue) = grep { $_->{command} eq 'glue' } @$parse_rools; |
147 |
|
|
$self->__glue ( $chosen, $glue, $debug ) if ref $glue; |
148 |
|
|
} |
149 |
|
|
my $images = $self->__get_images ( |
150 |
|
|
structure => $shortcuts, |
151 |
|
|
chosen => $chosen->[0], |
152 |
|
|
base_url => $base_url, |
153 |
|
|
ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (), |
154 |
|
|
debug => $debug, |
155 |
|
|
); |
156 |
|
|
if ( ref $images eq 'ARRAY' && @$images ) { |
157 |
|
|
$self->images($images); |
158 |
|
|
$self->image($images->[0]); |
159 |
|
|
} |
160 |
|
|
|
161 |
|
|
if ( $gui ) { |
162 |
|
|
if ( ref $chosen eq 'ARRAY' ) { |
163 |
|
|
foreach my $elem ( @$chosen ) { |
164 |
|
|
$self->__post_rool ($elem, $post_rools, $description); |
165 |
|
|
} |
166 |
|
|
} |
167 |
|
|
$self->{text} = ref $chosen eq 'ARRAY' ? $chosen->[0] : $chosen; |
168 |
|
|
$self->{html} = $content; |
169 |
|
|
$self->{tree} = $shortcuts; |
170 |
|
|
$self->{chosen} = $chosen; |
171 |
|
|
} else { |
172 |
|
|
$self->__post_rool ($chosen->[0], $post_rools, $description); |
173 |
|
|
$self->{text} = Contenido::Parser::Util::text_cleanup($chosen->[0]->{text}); |
174 |
|
|
$tree = undef; |
175 |
|
|
foreach my $key ( keys %$shortcuts ) { |
176 |
|
|
delete $shortcuts->{$key}; |
177 |
|
|
} |
178 |
|
|
$shortcuts = undef; |
179 |
|
|
$content = undef; |
180 |
|
|
} |
181 |
|
|
return $self->is_success(1); |
182 |
|
|
} |
183 |
|
|
|
184 |
|
|
sub __clean_tags { |
185 |
|
|
my ($self, $content, $rools) = @_; |
186 |
|
|
|
187 |
|
|
my @cut_rools; |
188 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools) { |
189 |
|
|
@cut_rools = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'cut' } @$rools; |
190 |
|
|
} |
191 |
|
|
my @clean_off_rools; |
192 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools) { |
193 |
|
|
@clean_off_rools = grep { $_->{command} eq 'clean' && $_->{subcommand} eq 'off' } @$rools; |
194 |
|
|
} |
195 |
|
|
$$content =~ s/<!DOCTYPE(.*?)>//sgi; |
196 |
|
|
$$content =~ s/<!--(.*?)-->//sgi; |
197 |
|
|
$$content =~ s/<script(.*?)<\/script>//sgi; |
198 |
|
|
$$content =~ s/<hr(.*?)>//sgi; |
199 |
|
|
$$content =~ s/<noscript(.*?)<\/noscript>//sgi; |
200 |
|
|
$$content =~ s/<iframe(.*?)<\/iframe>//sgi; |
201 |
|
|
unless ( grep { $_->{condition}{param} eq 'tag' && $_->{condition}{value} eq 'noindex' } @cut_rools ) { |
202 |
|
|
$$content =~ s/<noindex(.*?)<\/noindex>//sgi; |
203 |
|
|
} else { |
204 |
|
|
$$content =~ s/<\/?noindex(.*?)>//sgi; |
205 |
|
|
} |
206 |
|
|
$$content =~ s/<object(.*?)<\/object>//sgi; |
207 |
|
|
$$content =~ s/<embed(.*?)<\/embed>//sgi; |
208 |
|
|
$$content =~ s/<style(.*?)<\/style>//sgi; |
209 |
|
|
if ( grep { $_->{condition}{param} eq 'tag' && $_->{condition}{value} eq 'form' } @cut_rools ) { |
210 |
|
|
$$content =~ s/<select(.*?)<\/select([^>]*?)>//sgi; |
211 |
|
|
$$content =~ s/<textarea(.*?)<\/textarea([^>]*?)>//sgi; |
212 |
|
|
$$content =~ s/<input([^>]*?)>//sgi; |
213 |
|
|
} else { |
214 |
|
|
$$content =~ s/<form(.*?)<\/form>//sgi; |
215 |
|
|
} |
216 |
|
|
foreach my $rool ( @clean_off_rools ) { |
217 |
|
|
next unless $rool->{condition}{param} eq 'tag'; |
218 |
|
|
my $tag = $rool->{condition}{value}; |
219 |
|
|
$$content =~ s/<$tag(.*?)<\/$tag>//sgi; |
220 |
|
|
} |
221 |
|
|
$$content =~ s/<head(.*?)<\/head>//sgi; |
222 |
|
|
$$content =~ s/\ style="(.*?)"//sgi; |
223 |
|
|
|
224 |
|
|
$$content =~ s/<\/?span(.*?)>//sgi; |
225 |
|
|
$$content =~ s/<\/?font(.*?)>//sgi; |
226 |
|
|
$$content =~ s/<br(.*?)>/\n/sgi; |
227 |
|
|
$$content =~ s/<link(.*?)>//sgi; |
228 |
|
|
$$content =~ s/<spacer(.*?)>//sgi; |
229 |
|
|
$$content =~ s/<\!\?(.*?)\?>//sgi; |
230 |
|
|
# $$content =~ s/<a\s*?(.*?)>/\n/sgi; |
231 |
|
|
$$content =~ s/<\/p\s*>//sgi; |
232 |
|
|
# $$content =~ s/<\/a\s*>//sgi; |
233 |
|
|
$$content =~ s/<p\s*(.*?)>/\n\n/sgi; |
234 |
|
|
$$content =~ s/onclick="(.*?)"//sgi; |
235 |
|
|
$$content =~ s/onload="(.*?)"//sgi; |
236 |
|
|
|
237 |
|
|
} |
238 |
|
|
|
239 |
|
|
sub __clean_img { |
240 |
|
|
my ($self, $content) = @_; |
241 |
|
|
|
242 |
|
|
my @garbage; |
243 |
|
|
my $i = 1; |
244 |
|
|
|
245 |
|
|
while ( $$content =~ /(<img.*?>)/sgi ) { |
246 |
|
|
my $img = $1; |
247 |
|
|
my $src; |
248 |
|
|
if ( $img =~ /src=([^\x20|^>]+)/i ) { |
249 |
|
|
$src = $1; |
250 |
|
|
} |
251 |
|
|
my ($w, $h); |
252 |
|
|
if ( $img =~ /width\s*=\s*["'](\d+)/i || $img =~ /width\s*=\s*(\d+)/i ) { |
253 |
|
|
$w = $1; |
254 |
|
|
} |
255 |
|
|
if ( $img =~ /height\s*=\s*["'](\d+)/i || $img =~ /height\s*=\s*(\d+)/i ) { |
256 |
|
|
$h = $1; |
257 |
|
|
} |
258 |
|
|
my $delim = 0; |
259 |
|
|
if ( $w && $h ) { |
260 |
|
|
foreach my $pair ( @bad_dimensions ) { |
261 |
|
|
if ($w == $pair->{w} && $h == $pair->{h}) { |
262 |
|
|
$delim = 10; |
263 |
|
|
last; |
264 |
|
|
} |
265 |
|
|
} |
266 |
|
|
$delim = ( $w >= $h ? $w : $h ) / ( $w >= $h ? $h : $w ) unless $delim; |
267 |
|
|
} |
268 |
|
|
my $bad_name = __check_img_name ( $src ); |
269 |
|
|
if ( $bad_name || ($w && $w < 80) || ($h && $h < 80) || ( $w && $h && ($delim > 2.5) ) ) { |
270 |
|
|
# warn "Bad name: [$src]\n"; |
271 |
|
|
push @garbage, $src; |
272 |
|
|
} |
273 |
|
|
} |
274 |
|
|
|
275 |
|
|
foreach my $src (@garbage) { |
276 |
|
|
$src =~ s|([*+?()/\\\$\[\]])|\\$1|sg; |
277 |
|
|
$$content =~ s/<img([^>]*?)src=$src([^>]+)>//si; |
278 |
|
|
} |
279 |
|
|
} |
280 |
|
|
|
281 |
|
|
|
282 |
|
|
sub __check_img_name { |
283 |
|
|
my $name = shift; |
284 |
|
|
my $test = $1 if $name =~ /\/([^\/]+)$/; |
285 |
|
|
if ( $test =~ /\d+[x-]\d+/ || $test =~ /\.gif$/i ) { |
286 |
|
|
return 1; |
287 |
|
|
} |
288 |
|
|
foreach my $word ( @PICNAME ) { |
289 |
|
|
if ( $test =~ /^$word/si || $test =~ /[^a-z]$word[^a-z]/si ) { |
290 |
|
|
return 1; |
291 |
|
|
} |
292 |
|
|
} |
293 |
|
|
foreach my $word ( @PICURL ) { |
294 |
|
|
if ( $name =~ /^$word/si || $name =~ /[^a-z]$word[^a-z]/si ) { |
295 |
|
|
return 1; |
296 |
|
|
} |
297 |
|
|
} |
298 |
|
|
foreach my $word ( @PICHOST ) { |
299 |
|
|
if ( index (lc($name), $word) >= 0 ) { |
300 |
|
|
return 1; |
301 |
|
|
} |
302 |
|
|
} |
303 |
|
|
return 0; |
304 |
|
|
} |
305 |
|
|
|
306 |
|
|
|
307 |
|
|
sub __clean_empty_div { |
308 |
|
|
my ($self, $content) = @_; |
309 |
|
|
|
310 |
|
|
my $i = 0; |
311 |
|
|
while ( $$content =~ s/(<div[^>]*?><\/div\s*>)//sgi ) { |
312 |
|
|
$i++; |
313 |
|
|
} |
314 |
|
|
|
315 |
|
|
return $i; |
316 |
|
|
} |
317 |
|
|
|
318 |
|
|
|
319 |
|
|
sub __make_tree { |
320 |
|
|
my ($self, $content, $rools, $debug) = @_; |
321 |
|
|
|
322 |
|
|
my @elems = split (//,$$content); |
323 |
|
|
my @collaborate; |
324 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools) { |
325 |
|
|
@collaborate = grep { $_->{command} eq 'collaborate' } @$rools; |
326 |
|
|
} |
327 |
|
|
my %hierarchy = ( div => 0, td => 1, tr => 2, table => 3, body => 4, html => 5 ); |
328 |
|
|
my $id = 1; |
329 |
|
|
my $level = 0; |
330 |
|
|
my %tree = ( |
331 |
|
|
root => { |
332 |
|
|
id => $id++, |
333 |
|
|
text => '', |
334 |
|
|
type => 'root', |
335 |
|
|
children=> [], |
336 |
|
|
parent => undef, |
337 |
|
|
level => $level, |
338 |
|
|
}, |
339 |
|
|
); |
340 |
|
|
my @stack; |
341 |
|
|
my %elem_hash = ( 1 => $tree{root} ); |
342 |
|
|
my $current = $tree{root}; |
343 |
|
|
my $previous; |
344 |
|
|
|
345 |
|
|
while ( @elems ) { |
346 |
|
|
if ($elems[0] =~ /[\ \t]/ && !$current){ |
347 |
|
|
shift @elems; |
348 |
|
|
next; |
349 |
|
|
} |
350 |
|
|
if ( $elems[0] eq '<' && $elems[1] =~ /[a-zA-Z]/ ) { |
351 |
|
|
my $tag = $self->__try_tag (\@elems); |
352 |
|
|
if ( ref $tag && $tag->{type} eq 'text' ) { |
353 |
|
|
my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef; |
354 |
|
|
if ( ref $last_text_tag ) { |
355 |
|
|
$last_text_tag->{text} .= $tag->{content}; |
356 |
|
|
$last_text_tag->{count} += $tag->{count}; |
357 |
|
|
} else { |
358 |
|
|
$last_text_tag = $tag; |
359 |
|
|
$last_text_tag->{id} = $id++; |
360 |
|
|
$last_text_tag->{type} = 'text'; |
361 |
|
|
$last_text_tag->{parent} = $current; |
362 |
|
|
$last_text_tag->{level} = $level+1; |
363 |
|
|
$elem_hash{$last_text_tag->{id}} = $last_text_tag; |
364 |
|
|
push @{$current->{children}}, $last_text_tag; |
365 |
|
|
$current->{text_count}++; |
366 |
|
|
} |
367 |
|
|
$current->{text_value} += $tag->{count}; |
368 |
|
|
splice @elems, 0, $tag->{count}; |
369 |
|
|
} elsif ( ref $tag ) { |
370 |
|
|
if ( ($current->{type} eq 'td' || $current->{type} eq 'tr' ) && $tag->{type} eq 'tr' ) { |
371 |
|
|
# warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback till table begin... !!!!\n" if $debug; |
372 |
|
|
do { |
373 |
|
|
$current = pop @stack; |
374 |
|
|
$level = $current->{level}; |
375 |
|
|
# warn "New current type: /$current->{type}. ID = $current->{id}. Level: $level. Stack depth: ".scalar(@stack)."\n"; |
376 |
|
|
} while ( ($current->{type} !~ /table|body|html/) && scalar @stack ); |
377 |
|
|
|
378 |
|
|
} |
379 |
|
|
if ( $current->{type} eq 'table' && $tag->{type} eq 'table' ) { |
380 |
|
|
# warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback, previous table(s) will forced to be closed... !!!!\n" if $debug; |
381 |
|
|
do { |
382 |
|
|
$current = pop @stack; |
383 |
|
|
$level = $current->{level}; |
384 |
|
|
# warn "New current type: /$current->{type}. ID = $current->{id}. Level: $level. Stack depth: ".scalar(@stack)."\n"; |
385 |
|
|
} while ( ($current->{type} eq "table") && scalar @stack ); |
386 |
|
|
|
387 |
|
|
} |
388 |
|
|
$tag->{id} = $id++; |
389 |
|
|
$tag->{children} = []; |
390 |
|
|
$tag->{text_count} = 0; |
391 |
|
|
$tag->{parent} = $current; |
392 |
|
|
$tag->{level} = ++$level; |
393 |
|
|
$elem_hash{$tag->{id}} = $tag; |
394 |
|
|
push @stack, $current; |
395 |
|
|
warn "Open type: $tag->{type}. ID=[$tag->{id}]. Name: ".($tag->{params}{name}||'').". Class: ".($tag->{params}{class}||'').". Level: $tag->{level}. Stack depth: ".scalar(@stack)."\n"; |
396 |
|
|
$current = $tag; |
397 |
|
|
splice @elems, 0, $tag->{count}; |
398 |
|
|
} else { |
399 |
|
|
# warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug; |
400 |
|
|
last; |
401 |
|
|
} |
402 |
|
|
} elsif ( $elems[0] eq '<' && $elems[1] =~ /\// ) { |
403 |
|
|
my $tag = $self->__try_end (\@elems); |
404 |
|
|
if ( ref $tag && $tag->{type} eq 'text' ) { |
405 |
|
|
my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef; |
406 |
|
|
if ( ref $last_text_tag ) { |
407 |
|
|
$last_text_tag->{text} .= $tag->{content}; |
408 |
|
|
$last_text_tag->{count} += $tag->{count}; |
409 |
|
|
} else { |
410 |
|
|
$last_text_tag = $tag; |
411 |
|
|
$last_text_tag->{id} = $id++; |
412 |
|
|
$last_text_tag->{type} = 'text'; |
413 |
|
|
$last_text_tag->{parent} = $current; |
414 |
|
|
$last_text_tag->{level} = $level+1; |
415 |
|
|
$elem_hash{$last_text_tag->{id}} = $last_text_tag; |
416 |
|
|
push @{$current->{children}}, $last_text_tag; |
417 |
|
|
$current->{text_count}++; |
418 |
|
|
} |
419 |
|
|
$current->{text_value} += $tag->{count}; |
420 |
|
|
splice @elems, 0, $tag->{count}; |
421 |
|
|
} elsif ( ref $tag ) { |
422 |
|
|
if ( $current->{type} ne $tag->{type} ) { |
423 |
|
|
# warn "!!!!Wrong tag type for closing. It's [$tag->{type}]. It must be [$current->{type}]!!!!\n" if $debug; |
424 |
|
|
# warn "Current ID: [$current->{id}]. Text place: [".substr($current->{text}, 0, 20)."]\n"; |
425 |
|
|
if ( $hierarchy{$tag->{type}} > $hierarchy{$current->{type}} ) { |
426 |
|
|
do { |
427 |
|
|
$current = pop @stack; |
428 |
|
|
$level = $current->{level}; |
429 |
|
|
# warn "New current type: /$current->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n"; |
430 |
|
|
} while ( ($current->{type} ne $tag->{type}) && scalar @stack ); |
431 |
|
|
$current = pop @stack; |
432 |
|
|
$level = $current->{level}; |
433 |
|
|
# warn "Close !the right! type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n" if $debug; |
434 |
|
|
}else{ |
435 |
|
|
# warn "Passing by: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n" if $debug; |
436 |
|
|
} |
437 |
|
|
} else { |
438 |
|
|
if ( @collaborate ) { |
439 |
|
|
if ( defined $previous && (grep { $current->{type} eq $_->{condition} } @collaborate) |
440 |
|
|
&& $previous->{type} eq $current->{type} && $previous->{level} == $current->{level} |
441 |
|
|
&& $previous->{text} && $current->{text} ) { |
442 |
|
|
$previous->{text} .= ' '.$current->{text}; |
443 |
|
|
my $parent = $current->{parent}; |
444 |
|
|
splice @{$parent->{children}}, -1, 1; |
445 |
|
|
delete $elem_hash{$current->{id}}; |
446 |
|
|
$current = undef; |
447 |
|
|
} elsif ( !defined $previous && $current->{text} ) { |
448 |
|
|
$previous = $current; |
449 |
|
|
} else { |
450 |
|
|
$previous = undef; |
451 |
|
|
} |
452 |
|
|
} |
453 |
|
|
$current = pop @stack; |
454 |
|
|
$level = $current->{level}; |
455 |
|
|
# warn "Text place: [".substr($current->{text}, 0, 20)."]\n" if exists $current->{text}; |
456 |
|
|
# warn "Close type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n"; |
457 |
|
|
} |
458 |
|
|
splice @elems, 0, $tag->{count}; |
459 |
|
|
} else { |
460 |
|
|
# warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug; |
461 |
|
|
last; |
462 |
|
|
} |
463 |
|
|
} else { |
464 |
|
|
my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef; |
465 |
|
|
if ( ref $last_text_tag ) { |
466 |
|
|
$last_text_tag->{text} .= shift @elems; |
467 |
|
|
$last_text_tag->{count}++; |
468 |
|
|
} else { |
469 |
|
|
$last_text_tag->{text} = shift @elems; |
470 |
|
|
$last_text_tag->{count} = 1; |
471 |
|
|
$last_text_tag->{id} = $id++; |
472 |
|
|
$last_text_tag->{type} = 'text'; |
473 |
|
|
$last_text_tag->{parent} = $current; |
474 |
|
|
$last_text_tag->{level} = $level+1; |
475 |
|
|
$elem_hash{$last_text_tag->{id}} = $last_text_tag; |
476 |
|
|
push @{$current->{children}}, $last_text_tag; |
477 |
|
|
$current->{text_count}++; |
478 |
|
|
$current->{text_value} = 0; |
479 |
|
|
} |
480 |
|
|
$current->{text_value}++; |
481 |
|
|
} |
482 |
|
|
} |
483 |
|
|
return (\%tree, \%elem_hash); |
484 |
|
|
} |
485 |
|
|
|
486 |
|
|
|
487 |
|
|
sub __try_tag { |
488 |
|
|
my ($self, $content) = @_; |
489 |
|
|
|
490 |
|
|
my $i = 1; |
491 |
|
|
my %tag; |
492 |
|
|
my $tag = $content->[0]; |
493 |
|
|
while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) { |
494 |
|
|
$tag .= $content->[$i]; |
495 |
|
|
$i++; |
496 |
|
|
} |
497 |
|
|
if ( $content->[$i] eq '<' || $i >= scalar @$content ) { |
498 |
|
|
return { |
499 |
|
|
type => 'text', |
500 |
|
|
content => $tag, |
501 |
|
|
count => $i, |
502 |
|
|
}; |
503 |
|
|
} else { |
504 |
|
|
if ( $tag =~ /^<(div|table|tr|td|body|html)\s*(.*)/i ) { |
505 |
|
|
my $val = $1; |
506 |
|
|
if ( $tag =~ /^<($val)\s*(.*)/i ) { |
507 |
|
|
$tag{type} = lc($1); |
508 |
|
|
my $args = $2; |
509 |
|
|
$tag{count} = $i+1; |
510 |
|
|
my %args; |
511 |
|
|
while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?"([^"]+)"/g ) { |
512 |
|
|
$args{lc($1)} = $2; |
513 |
|
|
} |
514 |
|
|
while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?'([^']+)'/g ) { |
515 |
|
|
$args{lc($1)} = $2; |
516 |
|
|
} |
517 |
|
|
while ( $tag =~ /([a-zA-z]+)=(\w+)/g ) { |
518 |
|
|
$args{lc($1)} = $2; |
519 |
|
|
} |
520 |
|
|
foreach my $arg ( qw( name id class width align ) ) { |
521 |
|
|
$tag{params}{$arg} = $args{$arg} if exists $args{$arg}; |
522 |
|
|
} |
523 |
|
|
return \%tag; |
524 |
|
|
} else { |
525 |
|
|
return { |
526 |
|
|
type => 'text', |
527 |
|
|
content => $tag, |
528 |
|
|
count => $i, |
529 |
|
|
}; |
530 |
|
|
} |
531 |
|
|
} else { |
532 |
|
|
return { |
533 |
|
|
type => 'text', |
534 |
|
|
content => $tag, |
535 |
|
|
count => $i, |
536 |
|
|
}; |
537 |
|
|
} |
538 |
|
|
} |
539 |
|
|
} |
540 |
|
|
|
541 |
|
|
sub __try_end { |
542 |
|
|
my ($self, $content) = @_; |
543 |
|
|
|
544 |
|
|
my $i = 2; |
545 |
|
|
my %tag; |
546 |
|
|
my $tag = $content->[0].$content->[1]; |
547 |
|
|
while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) { |
548 |
|
|
$tag .= $content->[$i]; |
549 |
|
|
$i++; |
550 |
|
|
} |
551 |
|
|
if ( $content->[$i] eq '<' || $i >= scalar @$content ) { |
552 |
|
|
return { |
553 |
|
|
type => 'text', |
554 |
|
|
content => $tag, |
555 |
|
|
count => $i, |
556 |
|
|
}; |
557 |
|
|
} else { |
558 |
|
|
if ( $tag =~ /^<\/(div|table|tr|td|body|html)/i ) { |
559 |
|
|
my $val = $1; |
560 |
|
|
if ( $tag =~ /^<\/($val)\s*?$/i ) { |
561 |
|
|
$tag{type} = lc($1); |
562 |
|
|
$tag{count} = $i+1; |
563 |
|
|
return \%tag; |
564 |
|
|
} else { |
565 |
|
|
return { |
566 |
|
|
type => 'text', |
567 |
|
|
content => $tag, |
568 |
|
|
count => $i, |
569 |
|
|
}; |
570 |
|
|
} |
571 |
|
|
} else { |
572 |
|
|
return { |
573 |
|
|
type => 'text', |
574 |
|
|
content => $tag, |
575 |
|
|
count => $i, |
576 |
|
|
}; |
577 |
|
|
} |
578 |
|
|
} |
579 |
|
|
} |
580 |
|
|
|
581 |
|
|
|
582 |
|
|
sub __extract_img { |
583 |
|
|
my ($self, $structure, $base_url, $debug) = @_; |
584 |
|
|
return unless ref $structure eq 'HASH'; |
585 |
|
|
|
586 |
|
|
foreach my $tag ( values %$structure ) { |
587 |
|
|
next unless ref $tag && exists $tag->{text} && $tag->{text}; |
588 |
|
|
my $text = $tag->{text}; |
589 |
|
|
while ( $text =~ /<img (.*?)>/sgi ) { |
590 |
|
|
# warn "Image for extract_img found [$1]. Tag ID: $tag->{id}\n"; |
591 |
|
|
my $params = $1; |
592 |
|
|
my $img = {}; |
593 |
|
|
if ( $params =~ /src\x20*?=\x20*?["'](.*?)["']/ || $params =~ /src=([^\x20]+)/ ) { |
594 |
|
|
$img->{url} = $1; |
595 |
|
|
$img->{url} =~ s/[\r\t\n\x20]+$//; |
596 |
|
|
$img->{url} =~ s/^[\r\t\n\x20]+//; |
597 |
|
|
$img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/; |
598 |
|
|
$img->{url} =~ s/\/+/\//sgi; |
599 |
|
|
$img->{url} =~ s/http:\//http:\/\//sgi; |
600 |
|
|
$img->{w} = $1 if $params =~ /width[\D]+(\d+)/; |
601 |
|
|
$img->{h} = $1 if $params =~ /height[\D]+(\d+)/; |
602 |
|
|
$img->{alt} = $1 if $params =~ /alt\x20*?=\x20*?["'](.*?)["']/; |
603 |
|
|
$tag->{images} = [] unless ref $tag->{images} eq 'ARRAY'; |
604 |
|
|
push @{ $tag->{images} }, $img; |
605 |
|
|
# warn "Image for extract_img stored [$img->{url}]. Tag ID: $tag->{id}\n"; |
606 |
|
|
} |
607 |
|
|
} |
608 |
|
|
$text =~ s/<img (.*?)>//sgi; |
609 |
|
|
$tag->{text} = $text; |
610 |
|
|
$tag->{count} = length ($text); |
611 |
|
|
} |
612 |
|
|
} |
613 |
|
|
|
614 |
|
|
|
615 |
|
|
sub __extract_headers { |
616 |
|
|
my ($self, $structure, $debug) = @_; |
617 |
|
|
return unless ref $structure eq 'HASH'; |
618 |
|
|
|
619 |
|
|
foreach my $tag ( values %$structure ) { |
620 |
|
|
next unless ref $tag && exists $tag->{text} && $tag->{text}; |
621 |
|
|
my $text = $tag->{text}; |
622 |
|
|
while ( $text =~ /<h([\d])[^>]*?>([^<]+)<\/h[\d]>/sgi ) { |
623 |
|
|
my $header_level = $1; |
624 |
|
|
my $header_text = $2; |
625 |
|
|
$tag->{headers} = [] unless ref $tag->{headers} eq 'ARRAY'; |
626 |
|
|
push @{ $tag->{headers} }, { level => $header_level, text => $header_text }; |
627 |
|
|
} |
628 |
|
|
} |
629 |
|
|
|
630 |
|
|
} |
631 |
|
|
|
632 |
|
|
|
633 |
|
|
sub __dig_big_texts { |
634 |
|
|
my ($self, %opts) = @_; |
635 |
|
|
my $structure = exists $opts{structure} ? $opts{structure} : undef; |
636 |
|
|
my $minimum = exists $opts{min} ? $opts{min} : undef; |
637 |
|
|
my $debug = exists $opts{debug} ? $opts{debug} : undef; |
638 |
|
|
my $rools = exists $opts{rools} ? $opts{rools} : undef; |
639 |
|
|
return unless ref $structure eq 'HASH'; |
640 |
|
|
|
641 |
|
|
my @rools; |
642 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools) { |
643 |
|
|
@rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$rools; |
644 |
|
|
} |
645 |
|
|
my @exclude_rools; |
646 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools) { |
647 |
|
|
@exclude_rools = grep { $_->{command} eq 'exclude' && $_->{subcommand} eq 'element' } @$rools; |
648 |
|
|
} |
649 |
|
|
|
650 |
|
|
my @ret; |
651 |
|
|
foreach my $tid ( sort { $a <=> $b } keys %$structure ) { |
652 |
|
|
my $tag = $structure->{$tid}; |
653 |
|
|
next unless ref $tag && exists $tag->{text} && $tag->{text}; |
654 |
|
|
next if $self->__exclude_rools($tag, \@exclude_rools); |
655 |
|
|
|
656 |
|
|
if ( @rools ) { |
657 |
|
|
my $choose = 0; |
658 |
|
|
foreach my $rool ( @rools ) { |
659 |
|
|
my $matched = 1; |
660 |
|
|
foreach my $cond ( @{$rool->{condition}} ) { |
661 |
|
|
unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) { |
662 |
|
|
$matched = 0; |
663 |
|
|
} |
664 |
|
|
} |
665 |
|
|
$choose ||= $matched; |
666 |
|
|
} |
667 |
|
|
if ( $choose ) { |
668 |
|
|
$tag->{text} =~ s/^[\t\ \n\r]+//s; |
669 |
|
|
$tag->{text} =~ s/[\t\ \n\r]+$//s; |
670 |
|
|
$tag->{text} =~ s/[\t\ ]+/\ /sg; |
671 |
|
|
$tag->{text} =~ s/\r//sg; |
672 |
|
|
$tag->{text} =~ s/\n{2,}/\n\n/sg; |
673 |
|
|
$tag->{text} =~ s/<a.*?>//sgi; |
674 |
|
|
$tag->{text} =~ s/<\/a.*?>//sgi; |
675 |
|
|
$tag->{text} =~ s/\&\\x(\d+)//sgi; |
676 |
|
|
|
677 |
|
|
my $text = $tag->{text}; |
678 |
|
|
$text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi; |
679 |
|
|
# $text = Contenido::Parser::Util::strip_html($text); |
680 |
|
|
if ( length($text) >= $minimum ) { |
681 |
|
|
push @ret, $tag; |
682 |
|
|
} |
683 |
|
|
} |
684 |
|
|
} else { |
685 |
|
|
my $text = $tag->{text}; |
686 |
|
|
$text =~ s/^[\t\ \n\r]+//s; |
687 |
|
|
$text =~ s/[\t\ \n\r]+$//s; |
688 |
|
|
$text =~ s/[\t\ ]+/\ /sg; |
689 |
|
|
$text =~ s/\r//sg; |
690 |
|
|
$text =~ s/\n{2,}/\n\n/sg; |
691 |
|
|
# It does wrong job: |
692 |
|
|
# $text =~ s/([,!\?])(\S)/$1\ $2/sg; |
693 |
|
|
$tag->{text} = $text; |
694 |
|
|
$text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi; |
695 |
|
|
# $text = Contenido::Parser::Util::strip_html($text); |
696 |
|
|
if ( length($text) >= $minimum ) { |
697 |
|
|
$tag->{text} =~ s/<a.*?>//sgi; |
698 |
|
|
$tag->{text} =~ s/<\/a.*?>//sgi; |
699 |
|
|
$tag->{text} =~ s/\&\\x(\d+)//sgi; |
700 |
|
|
push @ret, $tag; |
701 |
|
|
} |
702 |
|
|
} |
703 |
|
|
} |
704 |
|
|
unless ( @ret ) { |
705 |
|
|
warn "Nothing was found at all!!! Check your ROOLS or MINIMUM value" if $debug; |
706 |
|
|
} |
707 |
|
|
return \@ret; |
708 |
|
|
} |
709 |
|
|
|
710 |
|
|
|
711 |
|
|
|
712 |
|
|
sub __check_headers { |
713 |
|
|
my ($self, $chosen, $header, $debug) = @_; |
714 |
|
|
return unless ref $chosen eq 'ARRAY'; |
715 |
|
|
|
716 |
|
|
unless ( grep { exists $_->{headers} } @$chosen ) { |
717 |
|
|
warn "No headers found\n" if $debug; |
718 |
|
|
return $chosen; |
719 |
|
|
} else { |
720 |
|
|
# @$chosen = grep { exists $_->{headers} } @$chosen; |
721 |
|
|
} |
722 |
|
|
my @ret; |
723 |
|
|
foreach my $unit ( @$chosen ) { |
724 |
|
|
unless ( exists $unit->{headers} && ref $unit->{headers} eq 'ARRAY' ) { |
725 |
|
|
$unit->{header_identity} = 0; |
726 |
|
|
$unit->{header_min_level} = 32768; |
727 |
|
|
next; |
728 |
|
|
} |
729 |
|
|
my @headers = sort { $a->{level} <=> $b->{level} } @{$unit->{headers}}; |
730 |
|
|
my $min_level = $headers[0]->{level}; |
731 |
|
|
$unit->{header_min_level} = $min_level; |
732 |
|
|
if ( $header ) { |
733 |
|
|
my $coeff = $self->__str_compare( $header, $headers[0]->{text} ); |
734 |
|
|
$unit->{header_identity} = $coeff; |
735 |
|
|
} |
736 |
|
|
} |
737 |
|
|
# @ret = sort { $a->{header_min_level} <=> $b->{header_min_level} } @$chosen; |
738 |
|
|
# return \@ret; |
739 |
|
|
return $chosen; |
740 |
|
|
} |
741 |
|
|
|
742 |
|
|
|
743 |
|
|
|
744 |
|
|
sub __check_description { |
745 |
|
|
my ($self, $chosen, $desc, $debug) = @_; |
746 |
|
|
return unless ref $chosen eq 'ARRAY' && $desc; |
747 |
|
|
|
748 |
|
|
my @ret; |
749 |
|
|
foreach my $unit ( @$chosen ) { |
750 |
|
|
if ( $desc ) { |
751 |
|
|
my $coeff = $self->__str_compare( $unit->{text}, $desc ); |
752 |
|
|
warn "Coeff: [$coeff] to: [$unit->{text}]\n" if $debug; |
753 |
|
|
$unit->{description_identity} = $coeff; |
754 |
|
|
} |
755 |
|
|
} |
756 |
|
|
@ret = sort { $b->{description_identity} <=> $a->{description_identity} } grep { $_->{description_identity} > -0.9 } @$chosen; |
757 |
|
|
return \@ret; |
758 |
|
|
} |
759 |
|
|
|
760 |
|
|
|
761 |
|
|
# wtf, bastards! how come my code's used here? --ra |
762 |
|
|
# damn, it's not 100% your code already |
763 |
|
|
sub __str_compare { |
764 |
|
|
my ($self, $original, $applicant) = @_; |
765 |
|
|
|
766 |
|
|
my $Al = __freq_list($original); |
767 |
|
|
return -1 unless defined $Al; |
768 |
|
|
my $Bl = __freq_list($applicant); |
769 |
|
|
return -1 unless defined $Bl; |
770 |
|
|
my $df = 0; |
771 |
|
|
|
772 |
|
|
foreach my $word ( %$Bl ) { |
773 |
|
|
if ( exists $Al->{$word} ) { |
774 |
|
|
$df += $Al->{$word} |
775 |
|
|
} else { |
776 |
|
|
$df -= $Bl->{$word} |
777 |
|
|
} |
778 |
|
|
} |
779 |
|
|
|
780 |
|
|
return $df; |
781 |
|
|
} |
782 |
|
|
|
783 |
|
|
sub __freq_list { |
784 |
|
|
|
785 |
|
|
my @d = grep { length($_) > 3 } split /\W/, $_[0]; |
786 |
|
|
return undef unless @d; |
787 |
|
|
my $z = 1/scalar(@d); my %l = (); |
788 |
|
|
$l{$_} += $z for @d; \%l; |
789 |
|
|
} |
790 |
|
|
|
791 |
|
|
|
792 |
|
|
|
793 |
|
|
sub __strip_html { |
794 |
|
|
my ($self, %opts) = @_; |
795 |
|
|
return unless ref $opts{chosen} eq 'ARRAY'; |
796 |
|
|
|
797 |
|
|
my $chosen = $opts{chosen}; |
798 |
|
|
my $rooles = $opts{rools}; |
799 |
|
|
my $header = $opts{header}; |
800 |
|
|
|
801 |
|
|
foreach my $unit ( @$chosen ) { |
802 |
|
|
my %tags; |
803 |
|
|
my $headers = $unit->{headers} if exists $unit->{headers}; |
804 |
|
|
if ( ref $headers && ref $rooles eq 'ARRAY' && grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' } @$rooles ) { |
805 |
|
|
if ( grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' && $_->{condition}{value} eq 'all' } @$rooles ) { |
806 |
|
|
$unit->{text} =~ s/<h(\d)[^>]*>(.*?)<\/h(\d)[^>]*>/\n/sgi; |
807 |
|
|
$unit->{text} =~ s/^[\x20\t\r\n]+//si; |
808 |
|
|
} elsif ( grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' && $_->{condition}{value} eq 'leading' } @$rooles ) { |
809 |
|
|
while ( $unit->{text} =~ /^<h(\d)[^>]*>(.*?)<\/h(\d)[^>]*>/si ) { |
810 |
|
|
my $hdr = 'h'.$1; |
811 |
|
|
$unit->{text} =~ s/^<$hdr[^>]*>(.*?)<\/$hdr[^>]*>//si; |
812 |
|
|
} |
813 |
|
|
} |
814 |
|
|
} |
815 |
|
|
for ( $unit->{text} ) { |
816 |
|
|
s/></> </sg; |
817 |
|
|
s/([\!\?:.])\s*?<\/h(\d+)(.*?)>/$1 /sgi; |
818 |
|
|
s/<\/h(\d+)(.*?)>/\. /sgi; |
819 |
|
|
s/<h(\d+)(.*?)>/\n/sgi; |
820 |
|
|
s/&/\&/sg; |
821 |
|
|
s/&/\&/sgi; |
822 |
|
|
s/«/«/sg; |
823 |
|
|
s/»/»/sg; |
824 |
|
|
s/£/£/sg; |
825 |
|
|
s/–/–/sg; |
826 |
|
|
s/—/—/sg; |
827 |
|
|
s/…/\.\.\./sg; |
828 |
|
|
s/„/"/sg; |
829 |
|
|
s/“/"/sg; |
830 |
|
|
s/”/"/sg; |
831 |
|
|
s/´/'/sg; |
832 |
|
|
s/ /\n/sg; |
833 |
|
|
s/"/"/sg; |
834 |
|
|
s/ /\ /sgi; |
835 |
|
|
} |
836 |
|
|
# $unit->{text} = HTML::Entities::decode_entities($unit->{text}); |
837 |
|
|
# $unit->{text} = Contenido::Parser::Util::strip_html($unit->{text}); |
838 |
|
|
for ( $unit->{text} ) { |
839 |
|
|
s/^[\x20\t\r\n]+//si; |
840 |
|
|
s/^(\d+)\.(\d+)\.(\d+)//si; |
841 |
|
|
s/^[\x20\t\r\n]+//si; |
842 |
|
|
s/^(\d+):(\d+)//si; |
843 |
|
|
s/^[\x20\t\r\n]+//si; |
844 |
|
|
} |
845 |
|
|
if ( lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) { |
846 |
|
|
substr $unit->{text}, 0, length($header), ''; |
847 |
|
|
$unit->{text} =~ s/^[\.\x20\t\r\n]+//sgi; |
848 |
|
|
} |
849 |
|
|
$unit->{text} =~ s/[\x20\t\r\n]+$//sgi; |
850 |
|
|
} |
851 |
|
|
} |
852 |
|
|
|
853 |
|
|
|
854 |
|
|
|
855 |
|
|
sub __glue { |
856 |
|
|
my ($self, $chosen, $glue, $debug) = @_; |
857 |
|
|
return unless ref $chosen eq 'ARRAY'; |
858 |
|
|
|
859 |
|
|
my $i = 0; |
860 |
|
|
|
861 |
|
|
if ( $glue->{subcommand} eq 'first' || $glue->{subcommand} eq 'all' ) { |
862 |
|
|
my $count = exists $glue->{subcommand} && $glue->{subcommand} eq 'first' ? $glue->{condition} : 32768; |
863 |
|
|
foreach my $unit ( @$chosen ) { |
864 |
|
|
next unless $i++; |
865 |
|
|
if ( $i <= $count ) { |
866 |
|
|
$chosen->[0]->{text} .= "\n\n".$chosen->[$i-1]->{text}; |
867 |
|
|
} |
868 |
|
|
} |
869 |
|
|
} elsif ( $glue->{subcommand} eq 'order' && ref $glue->{condition} eq 'ARRAY' ) { |
870 |
|
|
my $text = ''; |
871 |
|
|
my $i = 0; |
872 |
|
|
foreach my $pos ( @{ $glue->{condition} } ) { |
873 |
|
|
$text .= ($i++ ? "\n\n" : '').$chosen->[$pos-1]->{text}; |
874 |
|
|
} |
875 |
|
|
$chosen->[0]->{text} = $text; |
876 |
|
|
} |
877 |
|
|
} |
878 |
|
|
|
879 |
|
|
|
880 |
|
|
sub __get_images { |
881 |
|
|
my ($self, %opts) = @_; |
882 |
|
|
my $structure = exists $opts{structure} ? $opts{structure} : undef; |
883 |
|
|
my $chosen = exists $opts{chosen} ? $opts{chosen} : undef; |
884 |
|
|
my $debug = exists $opts{debug} ? $opts{debug} : undef; |
885 |
|
|
my $rools = exists $opts{rools} ? $opts{rools} : undef; |
886 |
|
|
my $base_url = delete $opts{base_url}; |
887 |
|
|
return unless ref $chosen && ref $structure; |
888 |
|
|
|
889 |
|
|
return if ref $rools eq 'ARRAY' && grep { $_->{command} eq 'image_off' } @$rools; |
890 |
|
|
my @use_rools; |
891 |
|
|
my @exclude_rools; |
892 |
|
|
my $no_validation = 0; |
893 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools) { |
894 |
|
|
@use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'image' } @$rools; |
895 |
|
|
@exclude_rools = grep { $_->{command} eq 'exclude' && $_->{subcommand} eq 'image' } @$rools; |
896 |
|
|
$no_validation = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'validate' && $_->{condition}{param} eq 'image' } @$rools; |
897 |
|
|
} |
898 |
|
|
my $image_depth; |
899 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools) { |
900 |
|
|
my @rools = grep { $_->{command} eq 'image' && $_->{subcommand} eq 'depth' } @$rools; |
901 |
|
|
$image_depth = $rools[-1]->{condition} if @rools; |
902 |
|
|
} |
903 |
|
|
|
904 |
|
|
my @images; |
905 |
|
|
foreach my $tag ( values %$structure ) { |
906 |
|
|
next unless exists $tag->{images} && ref $tag->{images} eq 'ARRAY'; |
907 |
|
|
next if $self->__exclude_rools($tag, \@exclude_rools); |
908 |
|
|
|
909 |
|
|
if ( @use_rools ) { |
910 |
|
|
my $choose = 0; |
911 |
|
|
foreach my $rool ( @use_rools ) { |
912 |
|
|
my $matched = 1; |
913 |
|
|
foreach my $cond ( @{$rool->{condition}} ) { |
914 |
|
|
unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) { |
915 |
|
|
$matched = 0; |
916 |
|
|
} |
917 |
|
|
} |
918 |
|
|
$matched = 0 if $self->__exclude_rools($tag, \@exclude_rools); |
919 |
|
|
$choose ||= $matched; |
920 |
|
|
} |
921 |
|
|
if ( $choose ) { |
922 |
|
|
my @img = grep { $no_validation || $self->__img_is_valid ($_) } map { |
923 |
|
|
my $img = rchannel::Image->new($_); |
924 |
|
|
$img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/; |
925 |
|
|
$img; |
926 |
|
|
} map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} }; |
927 |
|
|
|
928 |
|
|
push @images, @img; |
929 |
|
|
} |
930 |
|
|
} else { |
931 |
|
|
next if ($tag->{level}+1) < $chosen->{level}; |
932 |
|
|
next if $image_depth && ( $tag->{level} > ($chosen->{level} + $image_depth) ); |
933 |
|
|
|
934 |
|
|
my $ok = 0; |
935 |
|
|
my $uphops = $tag->{level} > $chosen->{level} ? 1 : 2; |
936 |
|
|
my $hops = $image_depth ? $image_depth : $tag->{level} - $chosen->{level} + $uphops; |
937 |
|
|
next if ($hops - $uphops) > 4; |
938 |
|
|
my @img_parents = ($tag->{id}); |
939 |
|
|
my $parent = $tag; |
940 |
|
|
for ( 1..$hops ) { |
941 |
|
|
$parent = $parent->{parent}; |
942 |
|
|
push @img_parents, $parent->{id}; |
943 |
|
|
} |
944 |
|
|
$parent = $chosen;; |
945 |
|
|
for ( 0..$uphops ) { |
946 |
|
|
if ( grep { $parent->{id} == $_ } @img_parents ) { |
947 |
|
|
$ok = 1; |
948 |
|
|
last; |
949 |
|
|
} |
950 |
|
|
$parent = $parent->{parent}; |
951 |
|
|
} |
952 |
|
|
if ( $ok ) { |
953 |
|
|
my @img = grep { $self->__img_is_valid ($_) } map { |
954 |
|
|
my $img = rchannel::Image->new($_); |
955 |
|
|
$img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/; |
956 |
|
|
$img; |
957 |
|
|
} map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} }; |
958 |
|
|
|
959 |
|
|
push @images, @img; |
960 |
|
|
} |
961 |
|
|
} |
962 |
|
|
} |
963 |
|
|
# warn Dumper (\@images); |
964 |
|
|
if ( @images ) { |
965 |
|
|
return \@images; |
966 |
|
|
} else { |
967 |
|
|
return undef; |
968 |
|
|
} |
969 |
|
|
} |
970 |
|
|
|
971 |
|
|
|
972 |
|
|
sub __img_is_valid { |
973 |
|
|
my ($self, $img) = @_; |
974 |
|
|
|
975 |
|
|
# return 1; |
976 |
|
|
if ( $img->check_online ) { |
977 |
|
|
my $delim = 0; |
978 |
|
|
my $w = $img->width; |
979 |
|
|
my $h = $img->height; |
980 |
|
|
if ( $w && $h ) { |
981 |
|
|
foreach my $pair ( @bad_dimensions ) { |
982 |
|
|
if ($w == $pair->{w} && $h == $pair->{h}) { |
983 |
|
|
return undef; |
984 |
|
|
} |
985 |
|
|
} |
986 |
|
|
$delim = ( $w >= $h ? $w : $h ) / ( $w >= $h ? $h : $w ) unless $delim; |
987 |
|
|
if ( $w < 80 || $h < 80 || $delim > 2.5 ) { |
988 |
|
|
return undef; |
989 |
|
|
} |
990 |
|
|
} |
991 |
|
|
} else { |
992 |
|
|
# warn "Image ".$img->src." not found on server"; |
993 |
|
|
return undef; |
994 |
|
|
} |
995 |
|
|
return 1; |
996 |
|
|
} |
997 |
|
|
|
998 |
|
|
|
999 |
|
|
sub __exclude_rools { |
1000 |
|
|
my ($self, $tag, $rools) = @_; |
1001 |
|
|
return undef unless ref $rools eq 'ARRAY' && @$rools; |
1002 |
|
|
|
1003 |
|
|
my $choose = 0; |
1004 |
|
|
foreach my $rool ( @$rools ) { |
1005 |
|
|
my $matched = 1; |
1006 |
|
|
foreach my $cond ( @{$rool->{condition}} ) { |
1007 |
|
|
unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) { |
1008 |
|
|
$matched = 0; |
1009 |
|
|
} |
1010 |
|
|
} |
1011 |
|
|
$choose ||= $matched; |
1012 |
|
|
} |
1013 |
|
|
return $choose; |
1014 |
|
|
} |
1015 |
|
|
|
1016 |
|
|
|
1017 |
|
|
sub __parse_rools { |
1018 |
|
|
my ($self, $rools) = @_; |
1019 |
|
|
return unless $rools; |
1020 |
|
|
$rools =~ s/\r//sgi; |
1021 |
|
|
my @rools = split /\n/, $rools; |
1022 |
|
|
return unless @rools; |
1023 |
|
|
|
1024 |
|
|
my @parsed; |
1025 |
|
|
foreach my $rool ( @rools ) { |
1026 |
|
|
my %pr; |
1027 |
|
|
next if $rool =~ /^#/; |
1028 |
|
|
$rool =~ s/[\x20\t]+$//; |
1029 |
|
|
$rool =~ s/^[\x20\t]+//; |
1030 |
|
|
if ( $rool =~ /^([\w']+)\s+(.*)$/ || $rool =~ /^(\w+)(.*)$/ ) { |
1031 |
|
|
$pr{command} = lc($1); |
1032 |
|
|
my $params = $2; |
1033 |
|
|
|
1034 |
|
|
if ( $pr{command} eq 'cut' && $params =~ /^(\w+)\s+(.*)$/ ) { |
1035 |
|
|
$pr{subcommand} = lc($1); $params = $2; |
1036 |
|
|
next unless $pr{subcommand} =~ /^(untill|till|from|off|regex|to)$/; |
1037 |
|
|
$params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex'; |
1038 |
|
|
$pr{condition} = $params; |
1039 |
|
|
} elsif ( $pr{command} eq 'glue' ) { |
1040 |
|
|
if ( $params =~ /^(\w+)\s+(.*)$/ ) { |
1041 |
|
|
$pr{subcommand} = $1; $params = $2; |
1042 |
|
|
next unless $pr{subcommand} =~ /^(first|all|order)$/; |
1043 |
|
|
if ( $pr{subcommand} eq 'order' ) { |
1044 |
|
|
my @pars = grep { $_ } map { int($_) } split (/\s*,\s*/,$params); |
1045 |
|
|
$pr{condition} = \@pars; |
1046 |
|
|
} else { |
1047 |
|
|
$pr{condition} = int($1); |
1048 |
|
|
} |
1049 |
|
|
} elsif ( $params =~ /(\d+)/i ) { |
1050 |
|
|
$pr{subcommand} = 'first'; |
1051 |
|
|
$pr{condition} = int($1); |
1052 |
|
|
} else { |
1053 |
|
|
$pr{subcommand} = 'all'; |
1054 |
|
|
} |
1055 |
|
|
} elsif ( $pr{command} eq 'trim' ) { |
1056 |
|
|
if ( $params =~ /(left|right)/i ) { |
1057 |
|
|
$pr{subcommand} = lc($1); |
1058 |
|
|
} else { |
1059 |
|
|
$pr{subcommand} = 'all'; |
1060 |
|
|
} |
1061 |
|
|
} elsif ( $pr{command} eq 'collaborate' && $params =~ /^(div|td)/i ) { |
1062 |
|
|
$pr{condition} = $1; |
1063 |
|
|
} elsif ( $pr{command} eq 'image' && $params =~ /^off$/i ) { |
1064 |
|
|
$pr{command} = 'image_off'; |
1065 |
|
|
} elsif ( $pr{command} eq 'image' && $params =~ /^(\w+)\s+(.*)$/si ) { |
1066 |
|
|
$pr{subcommand} = lc($1); $params = $2; |
1067 |
|
|
next unless $pr{subcommand} =~ /^(depth)$/; |
1068 |
|
|
$pr{condition} = $params; |
1069 |
|
|
} elsif ( $pr{command} eq 'set' ) { |
1070 |
|
|
if ( $params =~ /^(limit)\s+(.*)$/si ) { |
1071 |
|
|
$pr{subcommand} = lc($1); |
1072 |
|
|
$params = $2; |
1073 |
|
|
} |
1074 |
|
|
if ( $params =~ /^(\w+)\s+(.*)$/ ) { |
1075 |
|
|
$pr{condition} = { param => $1, value => $2 }; |
1076 |
|
|
} else { |
1077 |
|
|
next; |
1078 |
|
|
} |
1079 |
|
|
} elsif ( $pr{command} eq 'kill' && $params =~ /^(leading|all)\s+(headers)$/ ) { |
1080 |
|
|
$pr{command} = 'kill'; |
1081 |
|
|
$pr{condition} = { param => $2, value => $1 }; |
1082 |
|
|
} elsif ( $pr{command} eq 'use' && $params =~ /^(title)\s+(as)\s+(description)$/ ) { |
1083 |
|
|
$pr{command} = 'set'; |
1084 |
|
|
$pr{condition} = { param => 'description', value => 'header' }; |
1085 |
|
|
} elsif ( $pr{command} eq 'use' && $params =~ /^(\w+)\s+(.*)$/ ) { |
1086 |
|
|
$pr{subcommand} = $1; $params = $2; |
1087 |
|
|
next unless $pr{subcommand} =~ /^(element|elem|image)$/; |
1088 |
|
|
$pr{subcommand} = 'element' if $pr{subcommand} =~ /^(element|elem)$/; |
1089 |
|
|
my @conditions; |
1090 |
|
|
while ( $params =~ /(class|id|name|align)\x20*=\x20*"([^"]+)"/sgi ) { |
1091 |
|
|
push @conditions, { param => lc($1), value => $2 } |
1092 |
|
|
} |
1093 |
|
|
$pr{condition} = \@conditions; |
1094 |
|
|
} elsif ( $pr{command} eq 'exclude' && $params =~ /^(\w+)\s+(.*)$/ ) { |
1095 |
|
|
$pr{subcommand} = lc($1); $params = $2; |
1096 |
|
|
next unless $pr{subcommand} =~ /^(image|elem|element)$/; |
1097 |
|
|
$pr{subcommand} = 'element' if $pr{subcommand} =~ /^(element|elem)$/; |
1098 |
|
|
my @conditions; |
1099 |
|
|
while ( $params =~ /(class|id|name|align)\x20*=\x20*"([^"]+)"/sgi ) { |
1100 |
|
|
push @conditions, { param => lc($1), value => $2 } |
1101 |
|
|
} |
1102 |
|
|
$pr{condition} = \@conditions; |
1103 |
|
|
} elsif ( ($pr{command} eq 'dont' || $pr{command} eq "don't") && $params =~ /^(\w+)\s+(.*)$/ ) { |
1104 |
|
|
$pr{command} = 'dont'; |
1105 |
|
|
$pr{subcommand} = lc($1); $params = $2; |
1106 |
|
|
next unless $pr{subcommand} =~ /^(cut|validate)$/; |
1107 |
|
|
my @conditions; |
1108 |
|
|
if ( $params =~ /(tag)\x20*=\x20*"([^"]+)"/sgi ) { |
1109 |
|
|
$pr{condition} = { param => lc($1), value => $2 }; |
1110 |
|
|
} elsif ( $params =~ /(image)/i ) { |
1111 |
|
|
$pr{condition} = { param => lc($1) }; |
1112 |
|
|
} else { |
1113 |
|
|
next; |
1114 |
|
|
} |
1115 |
|
|
} elsif ( $pr{command} eq 'clean' ) { |
1116 |
|
|
if ( $params =~ /^(off)\s+(.*)$/ ) { |
1117 |
|
|
$pr{subcommand} = lc($1); $params = $2; |
1118 |
|
|
} |
1119 |
|
|
if ( $params =~ /(tag)\x20*=\x20*"([^"]+)"/sgi ) { |
1120 |
|
|
$pr{condition} = { param => lc($1), value => $2 }; |
1121 |
|
|
} elsif ( $params =~ /(span)/i ) { |
1122 |
|
|
$pr{condition} = { param => 'tag', value => lc($1) }; |
1123 |
|
|
} else { |
1124 |
|
|
next; |
1125 |
|
|
} |
1126 |
|
|
} else { |
1127 |
|
|
next; |
1128 |
|
|
} |
1129 |
|
|
push @parsed, \%pr; |
1130 |
|
|
} |
1131 |
|
|
} |
1132 |
|
|
return ( scalar @parsed ? \@parsed : undef ); |
1133 |
|
|
} |
1134 |
|
|
|
1135 |
|
|
|
1136 |
|
|
sub __post_rool { |
1137 |
|
|
my ($self, $element, $rools, $description) = @_; |
1138 |
|
|
|
1139 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools ) { |
1140 |
|
|
foreach my $rool ( @$rools ) { |
1141 |
|
|
if ( $rool->{command} eq 'cut' ) { |
1142 |
|
|
my $condition = $rool->{condition}; |
1143 |
|
|
if ( $rool->{subcommand} eq 'off' ) { |
1144 |
|
|
$element->{text} =~ s/$condition//sgi; |
1145 |
|
|
} elsif ( $rool->{subcommand} eq 'from' ) { |
1146 |
|
|
my $pos = index $element->{text}, $condition; |
1147 |
|
|
if ( $pos >= 0 ) { |
1148 |
|
|
$element->{text} = substr $element->{text}, 0, $pos; |
1149 |
|
|
} |
1150 |
|
|
# $element->{text} =~ s/$condition(.*)$//si; |
1151 |
|
|
} elsif ( $rool->{subcommand} eq 'to' && $condition eq 'description' && $description ) { |
1152 |
|
|
my $str = substr $description, 0, 12; |
1153 |
|
|
my $pos = index $element->{text}, $str; |
1154 |
|
|
if ( $pos >= 0 ) { |
1155 |
|
|
$element->{text} = substr $element->{text}, $pos, -1; |
1156 |
|
|
} |
1157 |
|
|
} elsif ( $rool->{subcommand} eq 'till' ) { |
1158 |
|
|
$element->{text} =~ s/^(.*?)($condition)//si; |
1159 |
|
|
} elsif ( $rool->{subcommand} eq 'untill' ) { |
1160 |
|
|
$element->{text} =~ s/^(.*?)($condition)/$2/si; |
1161 |
|
|
} elsif ( $rool->{subcommand} eq 'regex' ) { |
1162 |
|
|
$element->{text} =~ s/$condition//sgi; |
1163 |
|
|
} |
1164 |
|
|
} elsif ( $rool->{command} eq 'trim' ) { |
1165 |
|
|
if ( $rool->{subcommand} eq 'left' ) { |
1166 |
|
|
$element->{text} =~ s/^[\x20\xA0\t\n\r]+//sg; |
1167 |
|
|
} elsif ( $rool->{subcommand} eq 'right' ) { |
1168 |
|
|
$element->{text} =~ s/[\x20\xA0\t\n\r]+$//sg; |
1169 |
|
|
} else { |
1170 |
|
|
$element->{text} =~ s/^[\x20\xA0\t\n\r]+//sg; |
1171 |
|
|
$element->{text} =~ s/[\x20\xA0\t\n\r]+$//sg; |
1172 |
|
|
} |
1173 |
|
|
} |
1174 |
|
|
} |
1175 |
|
|
} |
1176 |
|
|
} |
1177 |
|
|
|
1178 |
|
|
|
1179 |
|
|
1; |