1 |
99 |
ahitrov |
package Contenido::Parser::HTML; |
2 |
|
|
|
3 |
|
|
use strict; |
4 |
|
|
use warnings; |
5 |
|
|
use locale; |
6 |
|
|
|
7 |
|
|
use base 'Contenido::Parser'; |
8 |
|
|
|
9 |
|
|
use Contenido::Globals; |
10 |
|
|
use Utils::HTML; |
11 |
|
|
use Data::Dumper; |
12 |
|
|
use utf8; |
13 |
101 |
ahitrov |
use Encode; |
14 |
99 |
ahitrov |
|
15 |
|
|
my @PICNAME = qw ( top menu topmenu home line dot mail razdel button find search srch delivery |
16 |
|
|
head bar label phone bottom bottommenu ico icon post left right service caption arr arrow cart |
17 |
|
|
basket main reply title corner address page buy pix pixel spacer fon welcome razd about back |
18 |
|
|
shapka phones print tel phpBB uho korz korzina raspisanie shop login blank telephone telephones |
19 |
|
|
dealer diler background bg news rss index none btn cards up footer noimage but link excel price |
20 |
|
|
mid graphic busket map girl space catalog bann headline hosting contact schedule redir email |
21 |
|
|
); |
22 |
|
|
|
23 |
|
|
my @PICHOST = qw ( top.list.ru addweb.ru adland.ru extreme-dm.com top100.rambler.ru |
24 |
|
|
mypagerank.ru informer.gismeteo.ru lux-bn.com.ua link-txt.com myrating.ljseek.com c.bigmir.net |
25 |
|
|
); |
26 |
|
|
|
27 |
|
|
my @PICURL = qw ( rorer counter count ljplus yadro spylog hotlog banner baner ban banners ban |
28 |
|
|
icq mirabilis adriver advertising ad adv ads adview advert weather imho awaps reklama stat cnt |
29 |
|
|
ipz design icons promo cycounter captcha foto_hit header random adcycle rssfeed bansrc |
30 |
|
|
); |
31 |
|
|
|
32 |
|
|
|
33 |
|
|
my @bad_dimensions = ( |
34 |
|
|
{ w => 120, h => 60 }, |
35 |
|
|
{ w => 468, h => 60 }, |
36 |
|
|
{ w => 120, h => 600 }, |
37 |
|
|
{ w => 88, h => 31 }, |
38 |
|
|
); |
39 |
|
|
|
40 |
|
|
|
41 |
|
|
sub new { |
42 |
|
|
my ($proto) = @_; |
43 |
|
|
my $class = ref($proto) || $proto; |
44 |
|
|
my $self = {}; |
45 |
|
|
bless $self, $class; |
46 |
|
|
|
47 |
|
|
return $self; |
48 |
|
|
} |
49 |
|
|
|
50 |
|
|
|
51 |
|
|
sub parse { |
52 |
|
|
my ($self, %opts) = @_; |
53 |
|
|
|
54 |
|
|
my $content; |
55 |
|
|
if ( $opts{content} ) { |
56 |
101 |
ahitrov |
$content = decode('utf-8', delete $opts{content}); |
57 |
99 |
ahitrov |
delete $self->{content}; |
58 |
|
|
} elsif ( $self->{success} || $self->{content} ) { |
59 |
101 |
ahitrov |
$content = decode('utf-8', delete $self->{content}); |
60 |
99 |
ahitrov |
} else { |
61 |
|
|
$self->{success} = 0; |
62 |
|
|
return $self; |
63 |
|
|
} |
64 |
|
|
|
65 |
|
|
my $base_url = delete $self->{base_url} || delete $opts{base_url}; |
66 |
|
|
my $strip_html = delete $opts{strip_html}; |
67 |
|
|
my $debug = $DEBUG; |
68 |
|
|
my $gui = delete $opts{gui}; |
69 |
101 |
ahitrov |
my $header = decode('utf-8', delete $opts{header}); |
70 |
109 |
ahitrov |
warn "Header length: ".length($header || '')."\n" if $debug; |
71 |
101 |
ahitrov |
my $description = decode('utf-8', delete $opts{description}); |
72 |
109 |
ahitrov |
warn "Description length: ".length($description || '')."\n" if $debug; |
73 |
99 |
ahitrov |
my $minimum = delete $opts{min} || length $description; |
74 |
|
|
|
75 |
|
|
my $pre_rools = $self->__parse_rools (delete $opts{parser_pre}); |
76 |
|
|
warn Dumper ($pre_rools) if $debug; |
77 |
|
|
my $parse_rools = $self->__parse_rools (delete $opts{parser_run}); |
78 |
|
|
warn Dumper ($parse_rools) if $debug; |
79 |
|
|
my $post_rools = $self->__parse_rools (delete $opts{parser_end}); |
80 |
|
|
warn Dumper ($post_rools) if $debug; |
81 |
|
|
|
82 |
|
|
# warn "Experimental. Debug!!!\n" if $debug; |
83 |
|
|
if ( ref $pre_rools eq 'ARRAY' ) { |
84 |
|
|
my @sets = grep { $_->{command} eq 'set' } @$pre_rools; |
85 |
|
|
foreach my $set ( @sets ) { |
86 |
|
|
if ( $set->{condition}{param} eq 'min' || $set->{condition}{param} eq 'minimum' ) { |
87 |
|
|
my $value = $set->{condition}{value}; |
88 |
|
|
unless ( $value =~ /\D/ ) { |
89 |
|
|
if ( $set->{subcommand} eq 'limit' ) { |
90 |
|
|
$minimum = $minimum && $minimum > int($value) ? int($value) : $minimum ? $minimum : int($value); |
91 |
|
|
} else { |
92 |
|
|
$minimum = int($value); |
93 |
|
|
} |
94 |
|
|
} |
95 |
|
|
} |
96 |
|
|
if ( $set->{condition}{param} eq 'description' && $set->{condition}{value} eq 'header' ) { |
97 |
|
|
$description = $header; |
98 |
|
|
} |
99 |
|
|
} |
100 |
|
|
} |
101 |
|
|
$minimum ||= 300; |
102 |
|
|
|
103 |
|
|
warn "Tag cleaning...\n" if $debug; |
104 |
|
|
$self->__clean_tags (\$content, $pre_rools); |
105 |
|
|
$content =~ s/>\s+</></sgi; |
106 |
|
|
warn "Image cleaning...\n" if $debug; |
107 |
|
|
$self->__clean_img (\$content); |
108 |
|
|
warn "Empty div cleaning...\n" if $debug; |
109 |
|
|
while ( $self->__clean_empty_div (\$content) ) {} |
110 |
|
|
warn "Make tree...\n" if $debug; |
111 |
|
|
my ($tree, $shortcuts) = $self->__make_tree (\$content, $parse_rools, $debug); |
112 |
|
|
|
113 |
|
|
$self->__extract_img ($shortcuts, $base_url, $debug); |
114 |
|
|
$self->__extract_headers ($shortcuts, $header, $debug); |
115 |
|
|
warn "Getting big texts (min=$minimum)...\n" if $debug; |
116 |
|
|
my $chosen = $self->__dig_big_texts ( |
117 |
|
|
structure => $shortcuts, |
118 |
|
|
min => $minimum, |
119 |
|
|
ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (), |
120 |
|
|
debug => $debug ); |
121 |
|
|
unless ( ref $chosen eq 'ARRAY' && @$chosen ) { |
122 |
|
|
$self->{error_message} = 'Nothing was found at all!!! Check your MINIMUM value'; |
123 |
|
|
return $self->is_success(0) unless $gui; |
124 |
|
|
} |
125 |
|
|
if ( $description ) { |
126 |
|
|
my @use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$parse_rools if ref $parse_rools eq 'ARRAY'; |
127 |
|
|
$chosen = $self->__check_description ($chosen, $description, $debug) unless @use_rools; |
128 |
|
|
} |
129 |
|
|
unless ( ref $chosen eq 'ARRAY' && @$chosen ) { |
130 |
|
|
$self->{error_message} = 'I didn\'t find any valuable text'; |
131 |
|
|
return $self->is_success(0) unless $gui; |
132 |
|
|
} |
133 |
|
|
if ( scalar @$chosen > 1 ) { |
134 |
|
|
$chosen = $self->__check_headers ($chosen, $header, $debug); |
135 |
|
|
} |
136 |
|
|
unless ( ref $chosen eq 'ARRAY' && @$chosen ) { |
137 |
|
|
$self->{error_message} = 'I didn\'t find any valuable text'; |
138 |
|
|
return $self->is_success(0) unless $gui; |
139 |
|
|
} |
140 |
|
|
$self->__strip_html ( |
141 |
|
|
chosen => $chosen, |
142 |
|
|
header => $header, |
143 |
|
|
ref $post_rools eq 'ARRAY' && @$post_rools ? (rools => $post_rools) : (), |
144 |
|
|
debug => $debug |
145 |
|
|
); |
146 |
|
|
if ( ref $parse_rools eq 'ARRAY' ) { |
147 |
|
|
my ($glue) = grep { $_->{command} eq 'glue' } @$parse_rools; |
148 |
|
|
$self->__glue ( $chosen, $glue, $debug ) if ref $glue; |
149 |
|
|
} |
150 |
107 |
ahitrov |
warn "Getting images...\n" if $debug; |
151 |
99 |
ahitrov |
my $images = $self->__get_images ( |
152 |
|
|
structure => $shortcuts, |
153 |
|
|
chosen => $chosen->[0], |
154 |
|
|
base_url => $base_url, |
155 |
|
|
ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (), |
156 |
|
|
debug => $debug, |
157 |
|
|
); |
158 |
|
|
if ( ref $images eq 'ARRAY' && @$images ) { |
159 |
104 |
ahitrov |
$self->{images} = $images; |
160 |
|
|
$self->{image} = $images->[0]; |
161 |
99 |
ahitrov |
} |
162 |
|
|
|
163 |
|
|
if ( $gui ) { |
164 |
|
|
if ( ref $chosen eq 'ARRAY' ) { |
165 |
|
|
foreach my $elem ( @$chosen ) { |
166 |
|
|
$self->__post_rool ($elem, $post_rools, $description); |
167 |
|
|
} |
168 |
|
|
} |
169 |
|
|
$self->{text} = ref $chosen eq 'ARRAY' ? $chosen->[0] : $chosen; |
170 |
101 |
ahitrov |
# $self->{html} = $content; |
171 |
|
|
# $self->{tree} = $shortcuts; |
172 |
|
|
$self->{tree} = $tree; |
173 |
99 |
ahitrov |
$self->{chosen} = $chosen; |
174 |
|
|
} else { |
175 |
|
|
$self->__post_rool ($chosen->[0], $post_rools, $description); |
176 |
|
|
$self->{text} = Contenido::Parser::Util::text_cleanup($chosen->[0]->{text}); |
177 |
101 |
ahitrov |
$self->{chosen} = $chosen; |
178 |
|
|
map { $_->{parent} = undef } @$chosen if ref $chosen eq 'ARRAY'; |
179 |
99 |
ahitrov |
$tree = undef; |
180 |
|
|
foreach my $key ( keys %$shortcuts ) { |
181 |
|
|
delete $shortcuts->{$key}; |
182 |
|
|
} |
183 |
|
|
$shortcuts = undef; |
184 |
|
|
$content = undef; |
185 |
|
|
} |
186 |
|
|
return $self->is_success(1); |
187 |
|
|
} |
188 |
|
|
|
189 |
|
|
sub __clean_tags { |
190 |
|
|
my ($self, $content, $rools) = @_; |
191 |
|
|
|
192 |
|
|
my @cut_rools; |
193 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools) { |
194 |
|
|
@cut_rools = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'cut' } @$rools; |
195 |
|
|
} |
196 |
|
|
my @clean_off_rools; |
197 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools) { |
198 |
|
|
@clean_off_rools = grep { $_->{command} eq 'clean' && $_->{subcommand} eq 'off' } @$rools; |
199 |
|
|
} |
200 |
|
|
$$content =~ s/<!DOCTYPE(.*?)>//sgi; |
201 |
|
|
$$content =~ s/<!--(.*?)-->//sgi; |
202 |
|
|
$$content =~ s/<script(.*?)<\/script>//sgi; |
203 |
|
|
$$content =~ s/<hr(.*?)>//sgi; |
204 |
|
|
$$content =~ s/<noscript(.*?)<\/noscript>//sgi; |
205 |
|
|
$$content =~ s/<iframe(.*?)<\/iframe>//sgi; |
206 |
|
|
unless ( grep { $_->{condition}{param} eq 'tag' && $_->{condition}{value} eq 'noindex' } @cut_rools ) { |
207 |
|
|
$$content =~ s/<noindex(.*?)<\/noindex>//sgi; |
208 |
|
|
} else { |
209 |
|
|
$$content =~ s/<\/?noindex(.*?)>//sgi; |
210 |
|
|
} |
211 |
|
|
$$content =~ s/<object(.*?)<\/object>//sgi; |
212 |
|
|
$$content =~ s/<embed(.*?)<\/embed>//sgi; |
213 |
|
|
$$content =~ s/<style(.*?)<\/style>//sgi; |
214 |
|
|
if ( grep { $_->{condition}{param} eq 'tag' && $_->{condition}{value} eq 'form' } @cut_rools ) { |
215 |
|
|
$$content =~ s/<select(.*?)<\/select([^>]*?)>//sgi; |
216 |
|
|
$$content =~ s/<textarea(.*?)<\/textarea([^>]*?)>//sgi; |
217 |
|
|
$$content =~ s/<input([^>]*?)>//sgi; |
218 |
|
|
} else { |
219 |
|
|
$$content =~ s/<form(.*?)<\/form>//sgi; |
220 |
104 |
ahitrov |
$$content =~ s/<textarea(.*?)<\/textarea([^>]*?)>//sgi; |
221 |
|
|
$$content =~ s/<select(.*?)<\/select([^>]*?)>//sgi; |
222 |
99 |
ahitrov |
} |
223 |
|
|
foreach my $rool ( @clean_off_rools ) { |
224 |
|
|
next unless $rool->{condition}{param} eq 'tag'; |
225 |
|
|
my $tag = $rool->{condition}{value}; |
226 |
|
|
$$content =~ s/<$tag(.*?)<\/$tag>//sgi; |
227 |
|
|
} |
228 |
|
|
$$content =~ s/<head(.*?)<\/head>//sgi; |
229 |
|
|
$$content =~ s/\ style="(.*?)"//sgi; |
230 |
|
|
|
231 |
|
|
$$content =~ s/<\/?span(.*?)>//sgi; |
232 |
|
|
$$content =~ s/<\/?font(.*?)>//sgi; |
233 |
|
|
$$content =~ s/<br(.*?)>/\n/sgi; |
234 |
|
|
$$content =~ s/<link(.*?)>//sgi; |
235 |
|
|
$$content =~ s/<spacer(.*?)>//sgi; |
236 |
|
|
$$content =~ s/<\!\?(.*?)\?>//sgi; |
237 |
|
|
# $$content =~ s/<a\s*?(.*?)>/\n/sgi; |
238 |
|
|
$$content =~ s/<\/p\s*>//sgi; |
239 |
|
|
# $$content =~ s/<\/a\s*>//sgi; |
240 |
|
|
$$content =~ s/<p\s*(.*?)>/\n\n/sgi; |
241 |
|
|
$$content =~ s/onclick="(.*?)"//sgi; |
242 |
|
|
$$content =~ s/onload="(.*?)"//sgi; |
243 |
|
|
|
244 |
|
|
} |
245 |
|
|
|
246 |
|
|
sub __clean_img { |
247 |
|
|
my ($self, $content) = @_; |
248 |
|
|
|
249 |
|
|
my @garbage; |
250 |
|
|
my $i = 1; |
251 |
|
|
|
252 |
|
|
while ( $$content =~ /(<img.*?>)/sgi ) { |
253 |
|
|
my $img = $1; |
254 |
|
|
my $src; |
255 |
|
|
if ( $img =~ /src=([^\x20|^>]+)/i ) { |
256 |
|
|
$src = $1; |
257 |
|
|
} |
258 |
|
|
my ($w, $h); |
259 |
|
|
if ( $img =~ /width\s*=\s*["'](\d+)/i || $img =~ /width\s*=\s*(\d+)/i ) { |
260 |
|
|
$w = $1; |
261 |
|
|
} |
262 |
|
|
if ( $img =~ /height\s*=\s*["'](\d+)/i || $img =~ /height\s*=\s*(\d+)/i ) { |
263 |
|
|
$h = $1; |
264 |
|
|
} |
265 |
|
|
my $delim = 0; |
266 |
|
|
if ( $w && $h ) { |
267 |
|
|
foreach my $pair ( @bad_dimensions ) { |
268 |
|
|
if ($w == $pair->{w} && $h == $pair->{h}) { |
269 |
|
|
$delim = 10; |
270 |
|
|
last; |
271 |
|
|
} |
272 |
|
|
} |
273 |
|
|
$delim = ( $w >= $h ? $w : $h ) / ( $w >= $h ? $h : $w ) unless $delim; |
274 |
|
|
} |
275 |
|
|
my $bad_name = __check_img_name ( $src ); |
276 |
|
|
if ( $bad_name || ($w && $w < 80) || ($h && $h < 80) || ( $w && $h && ($delim > 2.5) ) ) { |
277 |
|
|
# warn "Bad name: [$src]\n"; |
278 |
|
|
push @garbage, $src; |
279 |
|
|
} |
280 |
|
|
} |
281 |
|
|
|
282 |
|
|
foreach my $src (@garbage) { |
283 |
|
|
$src =~ s|([*+?()/\\\$\[\]])|\\$1|sg; |
284 |
|
|
$$content =~ s/<img([^>]*?)src=$src([^>]+)>//si; |
285 |
|
|
} |
286 |
|
|
} |
287 |
|
|
|
288 |
|
|
|
289 |
|
|
sub __check_img_name { |
290 |
|
|
my $name = shift; |
291 |
|
|
my $test = $1 if $name =~ /\/([^\/]+)$/; |
292 |
107 |
ahitrov |
if ( $test && ($test =~ /\d+[x-]\d+/ || $test =~ /\.gif$/i) ) { |
293 |
99 |
ahitrov |
return 1; |
294 |
|
|
} |
295 |
|
|
foreach my $word ( @PICNAME ) { |
296 |
107 |
ahitrov |
if ( $test && ($test =~ /^$word/si || $test =~ /[^a-z]$word[^a-z]/si) ) { |
297 |
99 |
ahitrov |
return 1; |
298 |
|
|
} |
299 |
|
|
} |
300 |
|
|
foreach my $word ( @PICURL ) { |
301 |
|
|
if ( $name =~ /^$word/si || $name =~ /[^a-z]$word[^a-z]/si ) { |
302 |
|
|
return 1; |
303 |
|
|
} |
304 |
|
|
} |
305 |
|
|
foreach my $word ( @PICHOST ) { |
306 |
|
|
if ( index (lc($name), $word) >= 0 ) { |
307 |
|
|
return 1; |
308 |
|
|
} |
309 |
|
|
} |
310 |
|
|
return 0; |
311 |
|
|
} |
312 |
|
|
|
313 |
|
|
|
314 |
|
|
sub __clean_empty_div { |
315 |
|
|
my ($self, $content) = @_; |
316 |
|
|
|
317 |
|
|
my $i = 0; |
318 |
|
|
while ( $$content =~ s/(<div[^>]*?><\/div\s*>)//sgi ) { |
319 |
|
|
$i++; |
320 |
|
|
} |
321 |
|
|
|
322 |
|
|
return $i; |
323 |
|
|
} |
324 |
|
|
|
325 |
|
|
|
326 |
|
|
sub __make_tree { |
327 |
|
|
my ($self, $content, $rools, $debug) = @_; |
328 |
|
|
|
329 |
|
|
my @elems = split (//,$$content); |
330 |
|
|
my @collaborate; |
331 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools) { |
332 |
|
|
@collaborate = grep { $_->{command} eq 'collaborate' } @$rools; |
333 |
|
|
} |
334 |
|
|
my %hierarchy = ( div => 0, td => 1, tr => 2, table => 3, body => 4, html => 5 ); |
335 |
|
|
my $id = 1; |
336 |
|
|
my $level = 0; |
337 |
|
|
my %tree = ( |
338 |
|
|
root => { |
339 |
|
|
id => $id++, |
340 |
|
|
text => '', |
341 |
|
|
type => 'root', |
342 |
|
|
children=> [], |
343 |
|
|
parent => undef, |
344 |
|
|
level => $level, |
345 |
|
|
}, |
346 |
|
|
); |
347 |
|
|
my @stack; |
348 |
|
|
my %elem_hash = ( 1 => $tree{root} ); |
349 |
|
|
my $current = $tree{root}; |
350 |
|
|
my $previous; |
351 |
|
|
|
352 |
|
|
while ( @elems ) { |
353 |
|
|
if ($elems[0] =~ /[\ \t]/ && !$current){ |
354 |
|
|
shift @elems; |
355 |
|
|
next; |
356 |
|
|
} |
357 |
|
|
if ( $elems[0] eq '<' && $elems[1] =~ /[a-zA-Z]/ ) { |
358 |
|
|
my $tag = $self->__try_tag (\@elems); |
359 |
|
|
if ( ref $tag && $tag->{type} eq 'text' ) { |
360 |
|
|
my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef; |
361 |
|
|
if ( ref $last_text_tag ) { |
362 |
|
|
$last_text_tag->{text} .= $tag->{content}; |
363 |
|
|
$last_text_tag->{count} += $tag->{count}; |
364 |
|
|
} else { |
365 |
|
|
$last_text_tag = $tag; |
366 |
|
|
$last_text_tag->{id} = $id++; |
367 |
|
|
$last_text_tag->{type} = 'text'; |
368 |
|
|
$last_text_tag->{parent} = $current; |
369 |
|
|
$last_text_tag->{level} = $level+1; |
370 |
101 |
ahitrov |
$last_text_tag->{text} = $tag->{content}; |
371 |
99 |
ahitrov |
$elem_hash{$last_text_tag->{id}} = $last_text_tag; |
372 |
|
|
push @{$current->{children}}, $last_text_tag; |
373 |
|
|
$current->{text_count}++; |
374 |
|
|
} |
375 |
|
|
$current->{text_value} += $tag->{count}; |
376 |
|
|
splice @elems, 0, $tag->{count}; |
377 |
101 |
ahitrov |
# warn "Tag opened. Next text: [".join('',$elems[0..10])."]\n"; |
378 |
99 |
ahitrov |
} elsif ( ref $tag ) { |
379 |
|
|
if ( ($current->{type} eq 'td' || $current->{type} eq 'tr' ) && $tag->{type} eq 'tr' ) { |
380 |
|
|
# warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback till table begin... !!!!\n" if $debug; |
381 |
|
|
do { |
382 |
|
|
$current = pop @stack; |
383 |
|
|
$level = $current->{level}; |
384 |
|
|
# warn "New current type: /$current->{type}. ID = $current->{id}. Level: $level. Stack depth: ".scalar(@stack)."\n"; |
385 |
|
|
} while ( ($current->{type} !~ /table|body|html/) && scalar @stack ); |
386 |
|
|
|
387 |
|
|
} |
388 |
|
|
if ( $current->{type} eq 'table' && $tag->{type} eq 'table' ) { |
389 |
|
|
# warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback, previous table(s) will forced to be closed... !!!!\n" if $debug; |
390 |
|
|
do { |
391 |
|
|
$current = pop @stack; |
392 |
|
|
$level = $current->{level}; |
393 |
|
|
# warn "New current type: /$current->{type}. ID = $current->{id}. Level: $level. Stack depth: ".scalar(@stack)."\n"; |
394 |
|
|
} while ( ($current->{type} eq "table") && scalar @stack ); |
395 |
|
|
|
396 |
|
|
} |
397 |
|
|
$tag->{id} = $id++; |
398 |
|
|
$tag->{children} = []; |
399 |
|
|
$tag->{text_count} = 0; |
400 |
|
|
$tag->{parent} = $current; |
401 |
|
|
$tag->{level} = ++$level; |
402 |
|
|
$elem_hash{$tag->{id}} = $tag; |
403 |
101 |
ahitrov |
push @{$current->{children}}, $tag; |
404 |
99 |
ahitrov |
push @stack, $current; |
405 |
101 |
ahitrov |
# warn "Open type: $tag->{type}. ID=[$tag->{id}]. Name: ".($tag->{params}{name}||'').". Class: ".($tag->{params}{class}||'').". Level: $tag->{level}. Stack depth: ".scalar(@stack)."\n"; |
406 |
99 |
ahitrov |
$current = $tag; |
407 |
|
|
splice @elems, 0, $tag->{count}; |
408 |
|
|
} else { |
409 |
|
|
# warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug; |
410 |
|
|
last; |
411 |
|
|
} |
412 |
|
|
} elsif ( $elems[0] eq '<' && $elems[1] =~ /\// ) { |
413 |
|
|
my $tag = $self->__try_end (\@elems); |
414 |
|
|
if ( ref $tag && $tag->{type} eq 'text' ) { |
415 |
|
|
my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef; |
416 |
|
|
if ( ref $last_text_tag ) { |
417 |
|
|
$last_text_tag->{text} .= $tag->{content}; |
418 |
|
|
$last_text_tag->{count} += $tag->{count}; |
419 |
|
|
} else { |
420 |
|
|
$last_text_tag = $tag; |
421 |
|
|
$last_text_tag->{id} = $id++; |
422 |
|
|
$last_text_tag->{type} = 'text'; |
423 |
|
|
$last_text_tag->{parent} = $current; |
424 |
101 |
ahitrov |
$last_text_tag->{text} = $tag->{content}; |
425 |
99 |
ahitrov |
$last_text_tag->{level} = $level+1; |
426 |
|
|
$elem_hash{$last_text_tag->{id}} = $last_text_tag; |
427 |
|
|
push @{$current->{children}}, $last_text_tag; |
428 |
|
|
$current->{text_count}++; |
429 |
|
|
} |
430 |
|
|
$current->{text_value} += $tag->{count}; |
431 |
|
|
splice @elems, 0, $tag->{count}; |
432 |
|
|
} elsif ( ref $tag ) { |
433 |
|
|
if ( $current->{type} ne $tag->{type} ) { |
434 |
|
|
# warn "!!!!Wrong tag type for closing. It's [$tag->{type}]. It must be [$current->{type}]!!!!\n" if $debug; |
435 |
|
|
# warn "Current ID: [$current->{id}]. Text place: [".substr($current->{text}, 0, 20)."]\n"; |
436 |
|
|
if ( $hierarchy{$tag->{type}} > $hierarchy{$current->{type}} ) { |
437 |
|
|
do { |
438 |
|
|
$current = pop @stack; |
439 |
|
|
$level = $current->{level}; |
440 |
|
|
# warn "New current type: /$current->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n"; |
441 |
|
|
} while ( ($current->{type} ne $tag->{type}) && scalar @stack ); |
442 |
|
|
$current = pop @stack; |
443 |
|
|
$level = $current->{level}; |
444 |
|
|
# warn "Close !the right! type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n" if $debug; |
445 |
|
|
}else{ |
446 |
|
|
# warn "Passing by: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n" if $debug; |
447 |
|
|
} |
448 |
|
|
} else { |
449 |
|
|
if ( @collaborate ) { |
450 |
|
|
if ( defined $previous && (grep { $current->{type} eq $_->{condition} } @collaborate) |
451 |
|
|
&& $previous->{type} eq $current->{type} && $previous->{level} == $current->{level} |
452 |
|
|
&& $previous->{text} && $current->{text} ) { |
453 |
|
|
$previous->{text} .= ' '.$current->{text}; |
454 |
|
|
my $parent = $current->{parent}; |
455 |
|
|
splice @{$parent->{children}}, -1, 1; |
456 |
|
|
delete $elem_hash{$current->{id}}; |
457 |
|
|
$current = undef; |
458 |
|
|
} elsif ( !defined $previous && $current->{text} ) { |
459 |
|
|
$previous = $current; |
460 |
|
|
} else { |
461 |
|
|
$previous = undef; |
462 |
|
|
} |
463 |
|
|
} |
464 |
|
|
$current = pop @stack; |
465 |
|
|
$level = $current->{level}; |
466 |
|
|
# warn "Text place: [".substr($current->{text}, 0, 20)."]\n" if exists $current->{text}; |
467 |
|
|
# warn "Close type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n"; |
468 |
|
|
} |
469 |
|
|
splice @elems, 0, $tag->{count}; |
470 |
|
|
} else { |
471 |
|
|
# warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug; |
472 |
|
|
last; |
473 |
|
|
} |
474 |
|
|
} else { |
475 |
|
|
my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef; |
476 |
|
|
if ( ref $last_text_tag ) { |
477 |
|
|
$last_text_tag->{text} .= shift @elems; |
478 |
|
|
$last_text_tag->{count}++; |
479 |
|
|
} else { |
480 |
101 |
ahitrov |
$last_text_tag = {}; |
481 |
99 |
ahitrov |
$last_text_tag->{text} = shift @elems; |
482 |
|
|
$last_text_tag->{count} = 1; |
483 |
|
|
$last_text_tag->{id} = $id++; |
484 |
|
|
$last_text_tag->{type} = 'text'; |
485 |
|
|
$last_text_tag->{parent} = $current; |
486 |
|
|
$last_text_tag->{level} = $level+1; |
487 |
|
|
$elem_hash{$last_text_tag->{id}} = $last_text_tag; |
488 |
|
|
push @{$current->{children}}, $last_text_tag; |
489 |
|
|
$current->{text_count}++; |
490 |
|
|
$current->{text_value} = 0; |
491 |
|
|
} |
492 |
|
|
$current->{text_value}++; |
493 |
|
|
} |
494 |
|
|
} |
495 |
|
|
return (\%tree, \%elem_hash); |
496 |
|
|
} |
497 |
|
|
|
498 |
|
|
|
499 |
|
|
sub __try_tag { |
500 |
|
|
my ($self, $content) = @_; |
501 |
|
|
|
502 |
|
|
my $i = 1; |
503 |
|
|
my %tag; |
504 |
|
|
my $tag = $content->[0]; |
505 |
101 |
ahitrov |
while ( $i < (scalar @$content - 1) && $content->[$i] ne '<' && $content->[$i] ne '>' ) { |
506 |
99 |
ahitrov |
$tag .= $content->[$i]; |
507 |
|
|
$i++; |
508 |
|
|
} |
509 |
|
|
if ( $content->[$i] eq '<' || $i >= scalar @$content ) { |
510 |
|
|
return { |
511 |
|
|
type => 'text', |
512 |
|
|
content => $tag, |
513 |
|
|
count => $i, |
514 |
|
|
}; |
515 |
101 |
ahitrov |
} |
516 |
|
|
$tag .= $content->[$i++]; |
517 |
|
|
# warn "TAG: [$tag]\n"; |
518 |
|
|
|
519 |
|
|
if ( $tag =~ /^<(div|table|tr|td|body|html)\s*(.*)/i ) { |
520 |
|
|
my $val = $1; |
521 |
|
|
if ( $tag =~ /^<($val)\s*(.*)/i ) { |
522 |
|
|
$tag{type} = lc($1); |
523 |
|
|
my $args = $2; |
524 |
|
|
$tag{count} = $i; |
525 |
|
|
my %args; |
526 |
|
|
while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?"([^"]+)"/g ) { |
527 |
|
|
$args{lc($1)} = $2; |
528 |
99 |
ahitrov |
} |
529 |
101 |
ahitrov |
while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?'([^']+)'/g ) { |
530 |
|
|
$args{lc($1)} = $2; |
531 |
|
|
} |
532 |
|
|
while ( $tag =~ /([a-zA-z]+)=(\w+)/g ) { |
533 |
|
|
$args{lc($1)} = $2; |
534 |
|
|
} |
535 |
|
|
foreach my $arg ( qw( name id class width align ) ) { |
536 |
|
|
$tag{params}{$arg} = $args{$arg} if exists $args{$arg}; |
537 |
|
|
} |
538 |
|
|
return \%tag; |
539 |
|
|
} else { |
540 |
99 |
ahitrov |
return { |
541 |
|
|
type => 'text', |
542 |
|
|
content => $tag, |
543 |
|
|
count => $i, |
544 |
|
|
}; |
545 |
|
|
} |
546 |
101 |
ahitrov |
} else { |
547 |
|
|
return { |
548 |
|
|
type => 'text', |
549 |
|
|
content => $tag, |
550 |
|
|
count => $i, |
551 |
|
|
}; |
552 |
99 |
ahitrov |
} |
553 |
|
|
} |
554 |
|
|
|
555 |
|
|
sub __try_end { |
556 |
|
|
my ($self, $content) = @_; |
557 |
|
|
|
558 |
|
|
my $i = 2; |
559 |
|
|
my %tag; |
560 |
|
|
my $tag = $content->[0].$content->[1]; |
561 |
101 |
ahitrov |
while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < (scalar @$content-1) ) { |
562 |
99 |
ahitrov |
$tag .= $content->[$i]; |
563 |
|
|
$i++; |
564 |
|
|
} |
565 |
|
|
if ( $content->[$i] eq '<' || $i >= scalar @$content ) { |
566 |
|
|
return { |
567 |
|
|
type => 'text', |
568 |
|
|
content => $tag, |
569 |
|
|
count => $i, |
570 |
|
|
}; |
571 |
101 |
ahitrov |
} |
572 |
|
|
$tag .= $content->[$i++]; |
573 |
|
|
# warn "TAG END: [$tag]\n"; |
574 |
|
|
if ( $tag =~ /^<\/(div|table|tr|td|body|html)/i ) { |
575 |
99 |
ahitrov |
my $val = $1; |
576 |
101 |
ahitrov |
if ( $tag =~ /^<\/($val)[\s>]/i ) { |
577 |
99 |
ahitrov |
$tag{type} = lc($1); |
578 |
101 |
ahitrov |
$tag{count} = $i; |
579 |
99 |
ahitrov |
return \%tag; |
580 |
|
|
} else { |
581 |
|
|
return { |
582 |
|
|
type => 'text', |
583 |
|
|
content => $tag, |
584 |
|
|
count => $i, |
585 |
|
|
}; |
586 |
|
|
} |
587 |
101 |
ahitrov |
} else { |
588 |
99 |
ahitrov |
return { |
589 |
|
|
type => 'text', |
590 |
|
|
content => $tag, |
591 |
|
|
count => $i, |
592 |
|
|
}; |
593 |
|
|
} |
594 |
|
|
} |
595 |
|
|
|
596 |
|
|
|
597 |
|
|
sub __extract_img { |
598 |
|
|
my ($self, $structure, $base_url, $debug) = @_; |
599 |
|
|
return unless ref $structure eq 'HASH'; |
600 |
|
|
|
601 |
101 |
ahitrov |
foreach my $tag ( grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) { |
602 |
99 |
ahitrov |
my $text = $tag->{text}; |
603 |
|
|
while ( $text =~ /<img (.*?)>/sgi ) { |
604 |
|
|
# warn "Image for extract_img found [$1]. Tag ID: $tag->{id}\n"; |
605 |
|
|
my $params = $1; |
606 |
|
|
my $img = {}; |
607 |
|
|
if ( $params =~ /src\x20*?=\x20*?["'](.*?)["']/ || $params =~ /src=([^\x20]+)/ ) { |
608 |
|
|
$img->{url} = $1; |
609 |
101 |
ahitrov |
$img->{url} =~ s/[\r\t\n\ ]+$//; |
610 |
|
|
$img->{url} =~ s/^[\r\t\n\ ]+//; |
611 |
99 |
ahitrov |
$img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/; |
612 |
|
|
$img->{url} =~ s/\/+/\//sgi; |
613 |
|
|
$img->{url} =~ s/http:\//http:\/\//sgi; |
614 |
|
|
$img->{w} = $1 if $params =~ /width[\D]+(\d+)/; |
615 |
|
|
$img->{h} = $1 if $params =~ /height[\D]+(\d+)/; |
616 |
|
|
$img->{alt} = $1 if $params =~ /alt\x20*?=\x20*?["'](.*?)["']/; |
617 |
|
|
$tag->{images} = [] unless ref $tag->{images} eq 'ARRAY'; |
618 |
|
|
push @{ $tag->{images} }, $img; |
619 |
|
|
# warn "Image for extract_img stored [$img->{url}]. Tag ID: $tag->{id}\n"; |
620 |
|
|
} |
621 |
|
|
} |
622 |
|
|
$text =~ s/<img (.*?)>//sgi; |
623 |
|
|
$tag->{text} = $text; |
624 |
|
|
$tag->{count} = length ($text); |
625 |
|
|
} |
626 |
|
|
} |
627 |
|
|
|
628 |
|
|
|
629 |
|
|
sub __extract_headers { |
630 |
|
|
my ($self, $structure, $debug) = @_; |
631 |
|
|
return unless ref $structure eq 'HASH'; |
632 |
|
|
|
633 |
101 |
ahitrov |
foreach my $tag ( grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) { |
634 |
99 |
ahitrov |
my $text = $tag->{text}; |
635 |
|
|
while ( $text =~ /<h([\d])[^>]*?>([^<]+)<\/h[\d]>/sgi ) { |
636 |
|
|
my $header_level = $1; |
637 |
|
|
my $header_text = $2; |
638 |
|
|
$tag->{headers} = [] unless ref $tag->{headers} eq 'ARRAY'; |
639 |
|
|
push @{ $tag->{headers} }, { level => $header_level, text => $header_text }; |
640 |
|
|
} |
641 |
|
|
} |
642 |
|
|
|
643 |
|
|
} |
644 |
|
|
|
645 |
|
|
|
646 |
|
|
sub __dig_big_texts { |
647 |
|
|
my ($self, %opts) = @_; |
648 |
|
|
my $structure = exists $opts{structure} ? $opts{structure} : undef; |
649 |
|
|
my $minimum = exists $opts{min} ? $opts{min} : undef; |
650 |
|
|
my $debug = exists $opts{debug} ? $opts{debug} : undef; |
651 |
|
|
my $rools = exists $opts{rools} ? $opts{rools} : undef; |
652 |
|
|
return unless ref $structure eq 'HASH'; |
653 |
|
|
|
654 |
|
|
my @rools; |
655 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools) { |
656 |
|
|
@rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$rools; |
657 |
|
|
} |
658 |
|
|
my @exclude_rools; |
659 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools) { |
660 |
|
|
@exclude_rools = grep { $_->{command} eq 'exclude' && $_->{subcommand} eq 'element' } @$rools; |
661 |
|
|
} |
662 |
|
|
|
663 |
|
|
my @ret; |
664 |
101 |
ahitrov |
foreach my $tag ( sort { $a->{id} <=> $b->{id} } grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) { |
665 |
|
|
next if $self->__exclude_rools($tag->{parent}, \@exclude_rools); |
666 |
99 |
ahitrov |
|
667 |
|
|
if ( @rools ) { |
668 |
|
|
my $choose = 0; |
669 |
|
|
foreach my $rool ( @rools ) { |
670 |
|
|
my $matched = 1; |
671 |
|
|
foreach my $cond ( @{$rool->{condition}} ) { |
672 |
101 |
ahitrov |
unless ( exists $tag->{parent}{params}{$cond->{param}} && $tag->{parent}{params}{$cond->{param}} eq $cond->{value} ) { |
673 |
99 |
ahitrov |
$matched = 0; |
674 |
|
|
} |
675 |
|
|
} |
676 |
|
|
$choose ||= $matched; |
677 |
|
|
} |
678 |
|
|
if ( $choose ) { |
679 |
101 |
ahitrov |
for ( $tag->{text} ) { |
680 |
|
|
s/^[\t\ \n\r]+//s; |
681 |
|
|
s/[\t\ \n\r]+$//s; |
682 |
|
|
s/[\t\ ]+/\ /sg; |
683 |
|
|
s/\r//sg; |
684 |
|
|
s/\n{2,}/\n\n/sg; |
685 |
|
|
s/\&\\x(\d+)//sgi; |
686 |
|
|
} |
687 |
99 |
ahitrov |
|
688 |
|
|
my $text = $tag->{text}; |
689 |
|
|
$text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi; |
690 |
101 |
ahitrov |
$text = Contenido::Parser::Util::strip_html($text); |
691 |
|
|
$tag->{text_weight} = length($text); |
692 |
99 |
ahitrov |
if ( length($text) >= $minimum ) { |
693 |
101 |
ahitrov |
for ( $tag->{text} ) { |
694 |
|
|
s/<a.*?>//sgi; |
695 |
|
|
s/<\/a.*?>//sgi; |
696 |
|
|
} |
697 |
|
|
push @ret, $tag; |
698 |
99 |
ahitrov |
} |
699 |
|
|
} |
700 |
|
|
} else { |
701 |
101 |
ahitrov |
for ( $tag->{text} ) { |
702 |
|
|
s/^[\t\ \n\r]+//s; |
703 |
|
|
s/[\t\ \n\r]+$//s; |
704 |
|
|
s/[\t\ ]+/\ /sg; |
705 |
|
|
s/\r//sg; |
706 |
|
|
s/\n{2,}/\n\n/sg; |
707 |
|
|
} |
708 |
99 |
ahitrov |
my $text = $tag->{text}; |
709 |
|
|
$text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi; |
710 |
101 |
ahitrov |
$text = Contenido::Parser::Util::strip_html($text); |
711 |
|
|
$tag->{text_weight} = length($text); |
712 |
99 |
ahitrov |
if ( length($text) >= $minimum ) { |
713 |
101 |
ahitrov |
for ( $tag->{text} ) { |
714 |
|
|
s/<a.*?>//sgi; |
715 |
|
|
s/<\/a.*?>//sgi; |
716 |
|
|
s/\&\\x(\d+)//sgi; |
717 |
|
|
} |
718 |
99 |
ahitrov |
push @ret, $tag; |
719 |
|
|
} |
720 |
|
|
} |
721 |
|
|
} |
722 |
|
|
unless ( @ret ) { |
723 |
|
|
warn "Nothing was found at all!!! Check your ROOLS or MINIMUM value" if $debug; |
724 |
|
|
} |
725 |
107 |
ahitrov |
warn "Digging done!\n" if $debug; |
726 |
99 |
ahitrov |
return \@ret; |
727 |
|
|
} |
728 |
|
|
|
729 |
|
|
|
730 |
|
|
|
731 |
|
|
sub __check_headers { |
732 |
|
|
my ($self, $chosen, $header, $debug) = @_; |
733 |
|
|
return unless ref $chosen eq 'ARRAY'; |
734 |
|
|
|
735 |
|
|
unless ( grep { exists $_->{headers} } @$chosen ) { |
736 |
|
|
warn "No headers found\n" if $debug; |
737 |
|
|
return $chosen; |
738 |
|
|
} else { |
739 |
|
|
# @$chosen = grep { exists $_->{headers} } @$chosen; |
740 |
|
|
} |
741 |
|
|
my @ret; |
742 |
|
|
foreach my $unit ( @$chosen ) { |
743 |
|
|
unless ( exists $unit->{headers} && ref $unit->{headers} eq 'ARRAY' ) { |
744 |
|
|
$unit->{header_identity} = 0; |
745 |
|
|
$unit->{header_min_level} = 32768; |
746 |
|
|
next; |
747 |
|
|
} |
748 |
|
|
my @headers = sort { $a->{level} <=> $b->{level} } @{$unit->{headers}}; |
749 |
|
|
my $min_level = $headers[0]->{level}; |
750 |
|
|
$unit->{header_min_level} = $min_level; |
751 |
|
|
if ( $header ) { |
752 |
|
|
my $coeff = $self->__str_compare( $header, $headers[0]->{text} ); |
753 |
|
|
$unit->{header_identity} = $coeff; |
754 |
|
|
} |
755 |
|
|
} |
756 |
|
|
# @ret = sort { $a->{header_min_level} <=> $b->{header_min_level} } @$chosen; |
757 |
|
|
# return \@ret; |
758 |
|
|
return $chosen; |
759 |
|
|
} |
760 |
|
|
|
761 |
|
|
|
762 |
|
|
|
763 |
|
|
sub __check_description { |
764 |
|
|
my ($self, $chosen, $desc, $debug) = @_; |
765 |
|
|
return unless ref $chosen eq 'ARRAY' && $desc; |
766 |
|
|
|
767 |
|
|
my @ret; |
768 |
|
|
foreach my $unit ( @$chosen ) { |
769 |
|
|
if ( $desc ) { |
770 |
|
|
my $coeff = $self->__str_compare( $unit->{text}, $desc ); |
771 |
|
|
warn "Coeff: [$coeff] to: [$unit->{text}]\n" if $debug; |
772 |
|
|
$unit->{description_identity} = $coeff; |
773 |
|
|
} |
774 |
|
|
} |
775 |
|
|
@ret = sort { $b->{description_identity} <=> $a->{description_identity} } grep { $_->{description_identity} > -0.9 } @$chosen; |
776 |
|
|
return \@ret; |
777 |
|
|
} |
778 |
|
|
|
779 |
|
|
|
780 |
|
|
# wtf, bastards! how come my code's used here? --ra |
781 |
|
|
# damn, it's not 100% your code already |
782 |
|
|
sub __str_compare { |
783 |
|
|
my ($self, $original, $applicant) = @_; |
784 |
|
|
|
785 |
|
|
my $Al = __freq_list($original); |
786 |
|
|
return -1 unless defined $Al; |
787 |
|
|
my $Bl = __freq_list($applicant); |
788 |
|
|
return -1 unless defined $Bl; |
789 |
|
|
my $df = 0; |
790 |
|
|
|
791 |
|
|
foreach my $word ( %$Bl ) { |
792 |
|
|
if ( exists $Al->{$word} ) { |
793 |
104 |
ahitrov |
$df += $Al->{$word} || 0; |
794 |
99 |
ahitrov |
} else { |
795 |
104 |
ahitrov |
$df -= $Bl->{$word} || 0; |
796 |
99 |
ahitrov |
} |
797 |
|
|
} |
798 |
|
|
|
799 |
|
|
return $df; |
800 |
|
|
} |
801 |
|
|
|
802 |
|
|
sub __freq_list { |
803 |
|
|
|
804 |
|
|
my @d = grep { length($_) > 3 } split /\W/, $_[0]; |
805 |
|
|
return undef unless @d; |
806 |
|
|
my $z = 1/scalar(@d); my %l = (); |
807 |
|
|
$l{$_} += $z for @d; \%l; |
808 |
|
|
} |
809 |
|
|
|
810 |
|
|
|
811 |
|
|
|
812 |
|
|
sub __strip_html { |
813 |
|
|
my ($self, %opts) = @_; |
814 |
|
|
return unless ref $opts{chosen} eq 'ARRAY'; |
815 |
|
|
|
816 |
|
|
my $chosen = $opts{chosen}; |
817 |
|
|
my $rooles = $opts{rools}; |
818 |
|
|
my $header = $opts{header}; |
819 |
|
|
|
820 |
|
|
foreach my $unit ( @$chosen ) { |
821 |
|
|
my %tags; |
822 |
|
|
my $headers = $unit->{headers} if exists $unit->{headers}; |
823 |
|
|
if ( ref $headers && ref $rooles eq 'ARRAY' && grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' } @$rooles ) { |
824 |
|
|
if ( grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' && $_->{condition}{value} eq 'all' } @$rooles ) { |
825 |
|
|
$unit->{text} =~ s/<h(\d)[^>]*>(.*?)<\/h(\d)[^>]*>/\n/sgi; |
826 |
|
|
$unit->{text} =~ s/^[\x20\t\r\n]+//si; |
827 |
|
|
} elsif ( grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' && $_->{condition}{value} eq 'leading' } @$rooles ) { |
828 |
|
|
while ( $unit->{text} =~ /^<h(\d)[^>]*>(.*?)<\/h(\d)[^>]*>/si ) { |
829 |
|
|
my $hdr = 'h'.$1; |
830 |
|
|
$unit->{text} =~ s/^<$hdr[^>]*>(.*?)<\/$hdr[^>]*>//si; |
831 |
|
|
} |
832 |
|
|
} |
833 |
|
|
} |
834 |
|
|
for ( $unit->{text} ) { |
835 |
|
|
s/></> </sg; |
836 |
|
|
s/([\!\?:.])\s*?<\/h(\d+)(.*?)>/$1 /sgi; |
837 |
|
|
s/<\/h(\d+)(.*?)>/\. /sgi; |
838 |
|
|
s/<h(\d+)(.*?)>/\n/sgi; |
839 |
|
|
s/&/\&/sg; |
840 |
|
|
s/&/\&/sgi; |
841 |
|
|
s/«/«/sg; |
842 |
|
|
s/»/»/sg; |
843 |
|
|
s/£/£/sg; |
844 |
|
|
s/–/–/sg; |
845 |
|
|
s/—/—/sg; |
846 |
|
|
s/…/\.\.\./sg; |
847 |
|
|
s/„/"/sg; |
848 |
|
|
s/“/"/sg; |
849 |
|
|
s/”/"/sg; |
850 |
|
|
s/´/'/sg; |
851 |
|
|
s/ /\n/sg; |
852 |
|
|
s/"/"/sg; |
853 |
|
|
s/ /\ /sgi; |
854 |
|
|
} |
855 |
|
|
# $unit->{text} = HTML::Entities::decode_entities($unit->{text}); |
856 |
|
|
# $unit->{text} = Contenido::Parser::Util::strip_html($unit->{text}); |
857 |
|
|
for ( $unit->{text} ) { |
858 |
101 |
ahitrov |
s/^[\ \t\r\n]+//si; |
859 |
99 |
ahitrov |
s/^(\d+)\.(\d+)\.(\d+)//si; |
860 |
101 |
ahitrov |
s/^[\ \t\r\n]+//si; |
861 |
99 |
ahitrov |
s/^(\d+):(\d+)//si; |
862 |
101 |
ahitrov |
s/^[\ \t\r\n]+//si; |
863 |
99 |
ahitrov |
} |
864 |
|
|
if ( lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) { |
865 |
|
|
substr $unit->{text}, 0, length($header), ''; |
866 |
101 |
ahitrov |
$unit->{text} =~ s/^[\.\ \t\r\n]+//sgi; |
867 |
99 |
ahitrov |
} |
868 |
101 |
ahitrov |
$unit->{text} =~ s/[\ \t\r\n]+$//sgi; |
869 |
99 |
ahitrov |
} |
870 |
|
|
} |
871 |
|
|
|
872 |
|
|
|
873 |
|
|
|
874 |
|
|
sub __glue { |
875 |
|
|
my ($self, $chosen, $glue, $debug) = @_; |
876 |
|
|
return unless ref $chosen eq 'ARRAY'; |
877 |
|
|
|
878 |
|
|
my $i = 0; |
879 |
|
|
|
880 |
|
|
if ( $glue->{subcommand} eq 'first' || $glue->{subcommand} eq 'all' ) { |
881 |
|
|
my $count = exists $glue->{subcommand} && $glue->{subcommand} eq 'first' ? $glue->{condition} : 32768; |
882 |
|
|
foreach my $unit ( @$chosen ) { |
883 |
|
|
next unless $i++; |
884 |
|
|
if ( $i <= $count ) { |
885 |
|
|
$chosen->[0]->{text} .= "\n\n".$chosen->[$i-1]->{text}; |
886 |
|
|
} |
887 |
|
|
} |
888 |
|
|
} elsif ( $glue->{subcommand} eq 'order' && ref $glue->{condition} eq 'ARRAY' ) { |
889 |
|
|
my $text = ''; |
890 |
|
|
my $i = 0; |
891 |
|
|
foreach my $pos ( @{ $glue->{condition} } ) { |
892 |
|
|
$text .= ($i++ ? "\n\n" : '').$chosen->[$pos-1]->{text}; |
893 |
|
|
} |
894 |
|
|
$chosen->[0]->{text} = $text; |
895 |
|
|
} |
896 |
|
|
} |
897 |
|
|
|
898 |
|
|
|
899 |
|
|
sub __get_images { |
900 |
|
|
my ($self, %opts) = @_; |
901 |
|
|
my $structure = exists $opts{structure} ? $opts{structure} : undef; |
902 |
|
|
my $chosen = exists $opts{chosen} ? $opts{chosen} : undef; |
903 |
|
|
my $debug = exists $opts{debug} ? $opts{debug} : undef; |
904 |
|
|
my $rools = exists $opts{rools} ? $opts{rools} : undef; |
905 |
|
|
my $base_url = delete $opts{base_url}; |
906 |
|
|
return unless ref $chosen && ref $structure; |
907 |
|
|
|
908 |
|
|
return if ref $rools eq 'ARRAY' && grep { $_->{command} eq 'image_off' } @$rools; |
909 |
|
|
my @use_rools; |
910 |
|
|
my @exclude_rools; |
911 |
|
|
my $no_validation = 0; |
912 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools) { |
913 |
|
|
@use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'image' } @$rools; |
914 |
|
|
@exclude_rools = grep { $_->{command} eq 'exclude' && $_->{subcommand} eq 'image' } @$rools; |
915 |
|
|
$no_validation = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'validate' && $_->{condition}{param} eq 'image' } @$rools; |
916 |
|
|
} |
917 |
|
|
my $image_depth; |
918 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools) { |
919 |
|
|
my @rools = grep { $_->{command} eq 'image' && $_->{subcommand} eq 'depth' } @$rools; |
920 |
|
|
$image_depth = $rools[-1]->{condition} if @rools; |
921 |
|
|
} |
922 |
|
|
|
923 |
|
|
my @images; |
924 |
|
|
foreach my $tag ( values %$structure ) { |
925 |
|
|
next unless exists $tag->{images} && ref $tag->{images} eq 'ARRAY'; |
926 |
|
|
next if $self->__exclude_rools($tag, \@exclude_rools); |
927 |
|
|
|
928 |
|
|
if ( @use_rools ) { |
929 |
|
|
my $choose = 0; |
930 |
|
|
foreach my $rool ( @use_rools ) { |
931 |
|
|
my $matched = 1; |
932 |
|
|
foreach my $cond ( @{$rool->{condition}} ) { |
933 |
|
|
unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) { |
934 |
|
|
$matched = 0; |
935 |
|
|
} |
936 |
|
|
} |
937 |
|
|
$matched = 0 if $self->__exclude_rools($tag, \@exclude_rools); |
938 |
|
|
$choose ||= $matched; |
939 |
|
|
} |
940 |
|
|
if ( $choose ) { |
941 |
|
|
my @img = grep { $no_validation || $self->__img_is_valid ($_) } map { |
942 |
|
|
my $img = rchannel::Image->new($_); |
943 |
|
|
$img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/; |
944 |
|
|
$img; |
945 |
|
|
} map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} }; |
946 |
|
|
|
947 |
|
|
push @images, @img; |
948 |
|
|
} |
949 |
|
|
} else { |
950 |
101 |
ahitrov |
next if ($tag->{level}+1) < $chosen->{parent}{level}; |
951 |
|
|
next if $image_depth && ( $tag->{level} > ($chosen->{parent}{level} + $image_depth) ); |
952 |
99 |
ahitrov |
|
953 |
|
|
my $ok = 0; |
954 |
101 |
ahitrov |
my $uphops = $tag->{level} > $chosen->{parent}{level} ? 1 : 2; |
955 |
|
|
my $hops = $image_depth ? $image_depth : $tag->{level} - $chosen->{parent}{level} + $uphops; |
956 |
99 |
ahitrov |
next if ($hops - $uphops) > 4; |
957 |
|
|
my @img_parents = ($tag->{id}); |
958 |
|
|
my $parent = $tag; |
959 |
|
|
for ( 1..$hops ) { |
960 |
|
|
$parent = $parent->{parent}; |
961 |
|
|
push @img_parents, $parent->{id}; |
962 |
|
|
} |
963 |
101 |
ahitrov |
$parent = $chosen->{parent}{parent}; |
964 |
99 |
ahitrov |
for ( 0..$uphops ) { |
965 |
|
|
if ( grep { $parent->{id} == $_ } @img_parents ) { |
966 |
|
|
$ok = 1; |
967 |
|
|
last; |
968 |
|
|
} |
969 |
|
|
$parent = $parent->{parent}; |
970 |
|
|
} |
971 |
|
|
if ( $ok ) { |
972 |
|
|
my @img = grep { $self->__img_is_valid ($_) } map { |
973 |
101 |
ahitrov |
my $img = $_; |
974 |
|
|
$img->{src} = $base_url.($img->{src} =~ m|^/| ? '' : '/').$img->{src} unless $img->{src} =~ /^http:/; |
975 |
99 |
ahitrov |
$img; |
976 |
|
|
} map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} }; |
977 |
|
|
|
978 |
|
|
push @images, @img; |
979 |
|
|
} |
980 |
|
|
} |
981 |
|
|
} |
982 |
|
|
# warn Dumper (\@images); |
983 |
|
|
if ( @images ) { |
984 |
|
|
return \@images; |
985 |
|
|
} else { |
986 |
|
|
return undef; |
987 |
|
|
} |
988 |
|
|
} |
989 |
|
|
|
990 |
|
|
|
991 |
|
|
sub __img_is_valid { |
992 |
|
|
my ($self, $img) = @_; |
993 |
|
|
|
994 |
101 |
ahitrov |
return 1; |
995 |
99 |
ahitrov |
if ( $img->check_online ) { |
996 |
|
|
my $delim = 0; |
997 |
|
|
my $w = $img->width; |
998 |
|
|
my $h = $img->height; |
999 |
|
|
if ( $w && $h ) { |
1000 |
|
|
foreach my $pair ( @bad_dimensions ) { |
1001 |
|
|
if ($w == $pair->{w} && $h == $pair->{h}) { |
1002 |
|
|
return undef; |
1003 |
|
|
} |
1004 |
|
|
} |
1005 |
|
|
$delim = ( $w >= $h ? $w : $h ) / ( $w >= $h ? $h : $w ) unless $delim; |
1006 |
|
|
if ( $w < 80 || $h < 80 || $delim > 2.5 ) { |
1007 |
|
|
return undef; |
1008 |
|
|
} |
1009 |
|
|
} |
1010 |
|
|
} else { |
1011 |
|
|
# warn "Image ".$img->src." not found on server"; |
1012 |
|
|
return undef; |
1013 |
|
|
} |
1014 |
|
|
return 1; |
1015 |
|
|
} |
1016 |
|
|
|
1017 |
|
|
|
1018 |
|
|
sub __exclude_rools { |
1019 |
|
|
my ($self, $tag, $rools) = @_; |
1020 |
|
|
return undef unless ref $rools eq 'ARRAY' && @$rools; |
1021 |
|
|
|
1022 |
|
|
my $choose = 0; |
1023 |
|
|
foreach my $rool ( @$rools ) { |
1024 |
|
|
my $matched = 1; |
1025 |
|
|
foreach my $cond ( @{$rool->{condition}} ) { |
1026 |
|
|
unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) { |
1027 |
|
|
$matched = 0; |
1028 |
|
|
} |
1029 |
|
|
} |
1030 |
|
|
$choose ||= $matched; |
1031 |
|
|
} |
1032 |
|
|
return $choose; |
1033 |
|
|
} |
1034 |
|
|
|
1035 |
|
|
|
1036 |
|
|
sub __parse_rools { |
1037 |
|
|
my ($self, $rools) = @_; |
1038 |
|
|
return unless $rools; |
1039 |
|
|
$rools =~ s/\r//sgi; |
1040 |
|
|
my @rools = split /\n/, $rools; |
1041 |
|
|
return unless @rools; |
1042 |
|
|
|
1043 |
|
|
my @parsed; |
1044 |
|
|
foreach my $rool ( @rools ) { |
1045 |
|
|
my %pr; |
1046 |
|
|
next if $rool =~ /^#/; |
1047 |
|
|
$rool =~ s/[\x20\t]+$//; |
1048 |
|
|
$rool =~ s/^[\x20\t]+//; |
1049 |
|
|
if ( $rool =~ /^([\w']+)\s+(.*)$/ || $rool =~ /^(\w+)(.*)$/ ) { |
1050 |
|
|
$pr{command} = lc($1); |
1051 |
|
|
my $params = $2; |
1052 |
|
|
|
1053 |
|
|
if ( $pr{command} eq 'cut' && $params =~ /^(\w+)\s+(.*)$/ ) { |
1054 |
|
|
$pr{subcommand} = lc($1); $params = $2; |
1055 |
|
|
next unless $pr{subcommand} =~ /^(untill|till|from|off|regex|to)$/; |
1056 |
|
|
$params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex'; |
1057 |
|
|
$pr{condition} = $params; |
1058 |
|
|
} elsif ( $pr{command} eq 'glue' ) { |
1059 |
|
|
if ( $params =~ /^(\w+)\s+(.*)$/ ) { |
1060 |
|
|
$pr{subcommand} = $1; $params = $2; |
1061 |
|
|
next unless $pr{subcommand} =~ /^(first|all|order)$/; |
1062 |
|
|
if ( $pr{subcommand} eq 'order' ) { |
1063 |
|
|
my @pars = grep { $_ } map { int($_) } split (/\s*,\s*/,$params); |
1064 |
|
|
$pr{condition} = \@pars; |
1065 |
|
|
} else { |
1066 |
|
|
$pr{condition} = int($1); |
1067 |
|
|
} |
1068 |
|
|
} elsif ( $params =~ /(\d+)/i ) { |
1069 |
|
|
$pr{subcommand} = 'first'; |
1070 |
|
|
$pr{condition} = int($1); |
1071 |
|
|
} else { |
1072 |
|
|
$pr{subcommand} = 'all'; |
1073 |
|
|
} |
1074 |
|
|
} elsif ( $pr{command} eq 'trim' ) { |
1075 |
|
|
if ( $params =~ /(left|right)/i ) { |
1076 |
|
|
$pr{subcommand} = lc($1); |
1077 |
|
|
} else { |
1078 |
|
|
$pr{subcommand} = 'all'; |
1079 |
|
|
} |
1080 |
|
|
} elsif ( $pr{command} eq 'collaborate' && $params =~ /^(div|td)/i ) { |
1081 |
|
|
$pr{condition} = $1; |
1082 |
|
|
} elsif ( $pr{command} eq 'image' && $params =~ /^off$/i ) { |
1083 |
|
|
$pr{command} = 'image_off'; |
1084 |
|
|
} elsif ( $pr{command} eq 'image' && $params =~ /^(\w+)\s+(.*)$/si ) { |
1085 |
|
|
$pr{subcommand} = lc($1); $params = $2; |
1086 |
|
|
next unless $pr{subcommand} =~ /^(depth)$/; |
1087 |
|
|
$pr{condition} = $params; |
1088 |
|
|
} elsif ( $pr{command} eq 'set' ) { |
1089 |
|
|
if ( $params =~ /^(limit)\s+(.*)$/si ) { |
1090 |
|
|
$pr{subcommand} = lc($1); |
1091 |
|
|
$params = $2; |
1092 |
|
|
} |
1093 |
|
|
if ( $params =~ /^(\w+)\s+(.*)$/ ) { |
1094 |
|
|
$pr{condition} = { param => $1, value => $2 }; |
1095 |
|
|
} else { |
1096 |
|
|
next; |
1097 |
|
|
} |
1098 |
|
|
} elsif ( $pr{command} eq 'kill' && $params =~ /^(leading|all)\s+(headers)$/ ) { |
1099 |
|
|
$pr{command} = 'kill'; |
1100 |
|
|
$pr{condition} = { param => $2, value => $1 }; |
1101 |
|
|
} elsif ( $pr{command} eq 'use' && $params =~ /^(title)\s+(as)\s+(description)$/ ) { |
1102 |
|
|
$pr{command} = 'set'; |
1103 |
|
|
$pr{condition} = { param => 'description', value => 'header' }; |
1104 |
|
|
} elsif ( $pr{command} eq 'use' && $params =~ /^(\w+)\s+(.*)$/ ) { |
1105 |
|
|
$pr{subcommand} = $1; $params = $2; |
1106 |
|
|
next unless $pr{subcommand} =~ /^(element|elem|image)$/; |
1107 |
|
|
$pr{subcommand} = 'element' if $pr{subcommand} =~ /^(element|elem)$/; |
1108 |
|
|
my @conditions; |
1109 |
|
|
while ( $params =~ /(class|id|name|align)\x20*=\x20*"([^"]+)"/sgi ) { |
1110 |
|
|
push @conditions, { param => lc($1), value => $2 } |
1111 |
|
|
} |
1112 |
|
|
$pr{condition} = \@conditions; |
1113 |
|
|
} elsif ( $pr{command} eq 'exclude' && $params =~ /^(\w+)\s+(.*)$/ ) { |
1114 |
|
|
$pr{subcommand} = lc($1); $params = $2; |
1115 |
|
|
next unless $pr{subcommand} =~ /^(image|elem|element)$/; |
1116 |
|
|
$pr{subcommand} = 'element' if $pr{subcommand} =~ /^(element|elem)$/; |
1117 |
|
|
my @conditions; |
1118 |
|
|
while ( $params =~ /(class|id|name|align)\x20*=\x20*"([^"]+)"/sgi ) { |
1119 |
|
|
push @conditions, { param => lc($1), value => $2 } |
1120 |
|
|
} |
1121 |
|
|
$pr{condition} = \@conditions; |
1122 |
|
|
} elsif ( ($pr{command} eq 'dont' || $pr{command} eq "don't") && $params =~ /^(\w+)\s+(.*)$/ ) { |
1123 |
|
|
$pr{command} = 'dont'; |
1124 |
|
|
$pr{subcommand} = lc($1); $params = $2; |
1125 |
|
|
next unless $pr{subcommand} =~ /^(cut|validate)$/; |
1126 |
|
|
my @conditions; |
1127 |
|
|
if ( $params =~ /(tag)\x20*=\x20*"([^"]+)"/sgi ) { |
1128 |
|
|
$pr{condition} = { param => lc($1), value => $2 }; |
1129 |
|
|
} elsif ( $params =~ /(image)/i ) { |
1130 |
|
|
$pr{condition} = { param => lc($1) }; |
1131 |
|
|
} else { |
1132 |
|
|
next; |
1133 |
|
|
} |
1134 |
|
|
} elsif ( $pr{command} eq 'clean' ) { |
1135 |
|
|
if ( $params =~ /^(off)\s+(.*)$/ ) { |
1136 |
|
|
$pr{subcommand} = lc($1); $params = $2; |
1137 |
|
|
} |
1138 |
|
|
if ( $params =~ /(tag)\x20*=\x20*"([^"]+)"/sgi ) { |
1139 |
|
|
$pr{condition} = { param => lc($1), value => $2 }; |
1140 |
|
|
} elsif ( $params =~ /(span)/i ) { |
1141 |
|
|
$pr{condition} = { param => 'tag', value => lc($1) }; |
1142 |
|
|
} else { |
1143 |
|
|
next; |
1144 |
|
|
} |
1145 |
|
|
} else { |
1146 |
|
|
next; |
1147 |
|
|
} |
1148 |
|
|
push @parsed, \%pr; |
1149 |
|
|
} |
1150 |
|
|
} |
1151 |
|
|
return ( scalar @parsed ? \@parsed : undef ); |
1152 |
|
|
} |
1153 |
|
|
|
1154 |
|
|
|
1155 |
|
|
sub __post_rool { |
1156 |
|
|
my ($self, $element, $rools, $description) = @_; |
1157 |
|
|
|
1158 |
|
|
if ( ref $rools eq 'ARRAY' && @$rools ) { |
1159 |
|
|
foreach my $rool ( @$rools ) { |
1160 |
|
|
if ( $rool->{command} eq 'cut' ) { |
1161 |
|
|
my $condition = $rool->{condition}; |
1162 |
|
|
if ( $rool->{subcommand} eq 'off' ) { |
1163 |
|
|
$element->{text} =~ s/$condition//sgi; |
1164 |
|
|
} elsif ( $rool->{subcommand} eq 'from' ) { |
1165 |
|
|
my $pos = index $element->{text}, $condition; |
1166 |
|
|
if ( $pos >= 0 ) { |
1167 |
|
|
$element->{text} = substr $element->{text}, 0, $pos; |
1168 |
|
|
} |
1169 |
|
|
# $element->{text} =~ s/$condition(.*)$//si; |
1170 |
|
|
} elsif ( $rool->{subcommand} eq 'to' && $condition eq 'description' && $description ) { |
1171 |
|
|
my $str = substr $description, 0, 12; |
1172 |
|
|
my $pos = index $element->{text}, $str; |
1173 |
|
|
if ( $pos >= 0 ) { |
1174 |
|
|
$element->{text} = substr $element->{text}, $pos, -1; |
1175 |
|
|
} |
1176 |
|
|
} elsif ( $rool->{subcommand} eq 'till' ) { |
1177 |
|
|
$element->{text} =~ s/^(.*?)($condition)//si; |
1178 |
|
|
} elsif ( $rool->{subcommand} eq 'untill' ) { |
1179 |
|
|
$element->{text} =~ s/^(.*?)($condition)/$2/si; |
1180 |
|
|
} elsif ( $rool->{subcommand} eq 'regex' ) { |
1181 |
|
|
$element->{text} =~ s/$condition//sgi; |
1182 |
|
|
} |
1183 |
|
|
} elsif ( $rool->{command} eq 'trim' ) { |
1184 |
|
|
if ( $rool->{subcommand} eq 'left' ) { |
1185 |
|
|
$element->{text} =~ s/^[\x20\xA0\t\n\r]+//sg; |
1186 |
|
|
} elsif ( $rool->{subcommand} eq 'right' ) { |
1187 |
|
|
$element->{text} =~ s/[\x20\xA0\t\n\r]+$//sg; |
1188 |
|
|
} else { |
1189 |
|
|
$element->{text} =~ s/^[\x20\xA0\t\n\r]+//sg; |
1190 |
|
|
$element->{text} =~ s/[\x20\xA0\t\n\r]+$//sg; |
1191 |
|
|
} |
1192 |
|
|
} |
1193 |
|
|
} |
1194 |
|
|
} |
1195 |
|
|
} |
1196 |
|
|
|
1197 |
|
|
|
1198 |
|
|
1; |