Line # Revision Author
1 98 ahitrov package Contenido::Parser::RSS;
2
3 use strict;
4 use warnings;
5 use locale;
6
7 99 ahitrov use base 'Contenido::Parser';
8
9 use Contenido::Globals;
10 98 ahitrov use Utils::HTML;
11 99 ahitrov use Time::ParseDate;
12 98 ahitrov #use Date::Parse;
13 use Data::Dumper;
14 use Digest::MD5 qw(md5_hex);
15 #use Class::Date;
16 use Encode;
17 use utf8;
18
19 my @INVALID_TAGS = qw ( A ABBREV ACRONYM ADDRESS APP APPLET AREA AU B BANNER BASE BASEFONT BDO BGSOUND BIG BLINK BLOCKQUOTE
20 BODY BQ BR CAPTION CENTER CITE CODE COL COLGROUP CREDIT DD DEL DFN DIR DIV DL DT EM FN FIG FONT FORM FRAME FRAMESET
21 H1 H2 H3 H4 H5 H6 HP HR I IMG INPUT INS ISINDEX KBD LANG LH LI LISTING MAP MARQUEE MENU META NEXTID NOBR NOEMBED
22 NOFRAMES NOTE OL OPTION OVERLAY P PARAM PERSON PLAINTEXT PRE Q S SAMP SELECT SMALL SPAN STRIKE STRONG SUB SUP TAB
23 TABLE TBODY TD TEXTAREA TFOOT TH THEAD TR TT U UL VAR WBR XMP EMBED
24 );
25
26 sub new {
27 my ($proto) = @_;
28 my $class = ref($proto) || $proto;
29 my $self = {};
30 bless $self, $class;
31
32 return $self;
33 }
34
35 sub parse {
36 my ($self, %opts) = @_;
37
38 99 ahitrov my $content;
39 if ( $opts{content} ) {
40 $content = delete $opts{content};
41 delete $self->{content};
42 } elsif ( $self->{success} || $self->{content} ) {
43 $content = delete $self->{content};
44 } else {
45 $self->{success} = 0;
46 return $self;
47 }
48 my $base_url = delete $self->{base_url} || delete $opts{base_url};
49 98 ahitrov my $allow_global_fulltext = delete $opts{allow_fulltext} || 0;
50 my $content_global_type = delete $opts{content_type} || 1;
51 99 ahitrov my $debug = $DEBUG;
52 98 ahitrov my $gui = delete $opts{gui};
53 my $description_as_fulltext = delete $opts{description_as_fulltext};
54 107 ahitrov warn "Parser Rools: [".$opts{parser_rss}."]\n" if $debug && $opts{parser_rss};
55 98 ahitrov
56 my $rss_rools = $self->__parse_rools (delete $opts{parser_rss});
57
58 107 ahitrov warn "RSS Rools: ".Dumper ($rss_rools) if $debug && $rss_rools;
59 98 ahitrov
60 my @items;
61 my $feed = $self->__parse_content(\$content);
62
63 if ( ref $feed eq 'ARRAY' ) {
64 foreach my $item ( @$feed ) {
65 my $fulltext_field;
66 my $content_type = $content_global_type;
67 my $allow_fulltext = $allow_global_fulltext;
68 $self->__check_rewrite ( item => $item, rools => $rss_rools );
69 99 ahitrov my $date = $self->__parse_date($item->{pubdate});
70 my $pubdate = Contenido::DateTime->new( epoch => $date );
71 101 ahitrov # $pubdate = $pubdate->ymd('-').' '.$pubdate->hms;
72 98 ahitrov next if ref $item->{title};
73 next if ref $item->{description};
74 $self->__check_ignore ( item => $item, rools => $rss_rools );
75 $self->__check_only ( item => $item, rools => $rss_rools );
76 $item->{title} = $self->__field_prepare ($item->{title});
77 $self->__check_filter ( gui => $gui, field => 'title', item => $item, rools => $rss_rools );
78 my $title = $item->{title};
79 my $link;
80 if ( ref $item->{link} eq 'HASH' ) {
81 if ( ( (exists $item->{link}{type} && $item->{link}{type} eq 'text/html') || !exists $item->{link}{type} ) && exists $item->{link}{href} ) {
82 $link = $item->{link}{href};
83 }
84 } elsif ( ref $item->{link} eq 'ARRAY' ) {
85 foreach my $lnk ( @{ $item->{link} } ) {
86 if ( ref $lnk ) {
87 if ( ( (exists $lnk->{type} && $lnk->{type} eq 'text/html') || !exists $lnk->{type} ) && exists $lnk->{href} ) {
88 $link = $lnk->{href};
89 }
90 } else {
91 $link = $lnk;
92 last;
93 }
94 }
95 } else {
96 $link = $item->{'link'} || (ref $item->{'url'} eq 'ARRAY' ? $item->{'url'}->[0] : $item->{'url'});
97 }
98 $link = $self->__field_prepare ($link);
99 $link = $base_url.($link =~ m|^/| ? '' : '/' ).$link if $base_url && ($link !~ /^http:/);
100 104 ahitrov $item->{description} = encode('utf-8', $self->__field_prepare (decode('utf-8', $item->{description})));
101 98 ahitrov $self->__check_filter ( gui => $gui, field => 'description', item => $item, rools => $rss_rools );
102 101 ahitrov my %image_href;
103 98 ahitrov my $description = $item->{description};
104 if ( exists $item->{'rambler:fulltext'} && $item->{'rambler:fulltext'} ) {
105 $allow_fulltext = 1;
106 }
107 my $fulltext;
108 if ( $description_as_fulltext ) {
109 $fulltext = $description;
110 $fulltext_field = 'description'
111 } else {
112 if ( $gui ) {
113 foreach my $field ( qw( rambler:fulltext rambler:full-text yandex:full-text mailru:full-text content:encoded full-text fulltext text ) ) {
114 if ( exists $item->{$field} && $item->{$field} ) {
115 $fulltext_field = $field;
116 $fulltext = $item->{$field};
117 last;
118 }
119 }
120 } else {
121 $fulltext =
122 $item->{'rambler:fulltext'} ||
123 $item->{'rambler:full-text'} ||
124 $item->{'yandex:full-text'} ||
125 $item->{'mailru:full-text'} ||
126 $item->{'content:encoded'} ||
127 $item->{'full-text'} ||
128 $item->{'fulltext'} ||
129 $item->{'text'};
130 }
131 if ( ref $fulltext eq 'HASH') {
132 my @values = values %$fulltext;
133 if ( scalar @values == 1 ) {
134 $fulltext = $values[0];
135 }
136 }
137 if ( ref $fulltext eq 'ARRAY' ) {
138 $fulltext = join "\n", @$fulltext;
139 }
140 $self->__check_filter ( gui => $gui, field => 'fulltext', item => $item, text => \$fulltext, rools => $rss_rools );
141 $fulltext = $self->__field_prepare ($fulltext);
142 }
143 if ( $fulltext && !$description ) {
144 $item->{description} = Utils::HTML::limit_words ( $fulltext, 150, 300 );
145 $self->__check_filter ( gui => $gui, field => 'description', item => $item, rools => $rss_rools );
146 $description = $item->{description};
147 }
148 $allow_fulltext = 0 unless $fulltext;
149 my $author;
150 if ( exists $item->{author} && $item->{author} ) {
151 if ( ref $item->{author} eq 'HASH' && exists $item->{author}{name} ) {
152 $author = $item->{author}{name};
153 } elsif ( !ref $item->{author} ) {
154 $author = $item->{author};
155 }
156 }
157 my $category = [];
158 if ( exists $item->{category} && ref $item->{category} eq 'ARRAY' ) {
159 $category = $item->{category};
160 } elsif ( exists $item->{category} ) {
161 $category = [$item->{category}];
162 }
163 my @images;
164 if ( exists $item->{image} || exists $item->{enclosure} ) {
165 my @src = ref $item->{image} eq 'ARRAY' ? @{ $item->{image} } : ( $item->{image} ) if exists $item->{image};
166 my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure};
167 @att = grep { ref $_ eq 'HASH' && $_->{type} =~ /image/ } @att;
168 @images = map {
169 99 ahitrov my $img = $_;
170 $img->{src} = $base_url.($img->{src} =~ m|^/| ? '' : '/').$img->{src} unless $img->{src} =~ /^http:/; $img;
171 98 ahitrov } map { {src => $_->{url}, $_->{width} ? (width => $_->{width}) : (), $_->{height} ? (height => $_->{height}) : (), $_->{title} ? (title => $_->{title}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att;
172 }
173 my @videos;
174 if ( exists $item->{video} || exists $item->{enclosure} ) {
175 my @src = ref $item->{video} eq 'ARRAY' ? @{ $item->{video} } : ( $item->{video} ) if exists $item->{video};
176 my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure};
177 @att = grep { ref $_ eq 'HASH' && $_->{type} =~ /video/ } @att;
178 @videos = map { {src => $_->{url}, $_->{type} ? (type => $_->{type}) : (), $_->{title} ? (title => $_->{title}) : (), $_->{width} ? (width => $_->{width}) : (), $_->{height} ? (height => $_->{height}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att;
179 }
180 my @audios;
181 if ( exists $item->{audio} || exists $item->{enclosure} ) {
182 my @src = ref $item->{audio} eq 'ARRAY' ? @{ $item->{audio} } : ( $item->{audio} ) if exists $item->{audio};
183 my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure};
184 @att = grep { ref $_ eq 'HASH' && $_->{type} =~ /audio/ } @att;
185 @audios = map { {src => $_->{url}, $_->{type} ? (type => $_->{type}) : (), $_->{title} ? (title => $_->{title}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att;
186 }
187 my ($video_url, $audio_url);
188 if ( $content_type == 2 || @videos || exists $item->{'videourl'} || exists $item->{'video_url'} ) {
189 $video_url = exists $item->{video} && ref $item->{video} eq 'HASH' && exists $item->{video}{url} ?
190 $item->{video}{url} :
191 $item->{'videourl'} ||
192 $item->{'video_url'} ||
193 ($item->{'guid'} =~ /^http:/ ? $item->{'guid'} : undef) ||
194 (exists $item->{'link'} && ref $item->{'link'} eq 'HASH' ? $item->{'link'}{'href'} || $item->{'link'}{'url'} : $item->{'link'} );
195 $content_type = 2;
196 }
197 if ( @audios || exists $item->{'audiourl'} || exists $item->{'audio_url'} ) {
198 $audio_url = exists $item->{audio} && ref $item->{audio} eq 'HASH' && exists $item->{audio}{url} ?
199 $item->{audio}{url} :
200 $item->{'audiourl'} || $item->{'audio_url'};
201 $content_type = 2;
202 }
203 my $related = [];
204 if ( exists $item->{'rambler:related'} && $item->{'rambler:related'} ) {
205 if ( ref $item->{'rambler:related'} eq 'ARRAY' ) {
206 foreach my $relitem ( @{ $item->{'rambler:related'} } ) {
207 my $rel = $self->__parse_related ( $relitem );
208 push @$related, $rel if ref $rel;
209 }
210 } elsif ( ref $item->{'rambler:related'} eq 'HASH' ) {
211 my $rel = $self->__parse_related ( $item->{'rambler:related'} );
212 push @$related, $rel if ref $rel;
213 }
214 }
215 @videos = grep { exists $_->{type} && lc($_->{type}) eq 'video/x-flv' && $_->{src} =~ /\.flv$/i } @videos;
216 99 ahitrov my @inlined_images;
217 for ( $description, $fulltext ) {
218 107 ahitrov next unless $_;
219 101 ahitrov my $field = $_;
220 while ( $field =~ /<img ([^>]+)>/sgi ) {
221 my $image = $self->__parse_params( $1 );
222 push @inlined_images, $image if ref $image && exists $image->{src} && $image->{src};
223 }
224 while ( $field =~ /<a ([^>]+)>/sgi ) {
225 my $anchor = $self->__parse_params( $1 );
226 if ( $anchor->{href} && $anchor->{href} =~ /\.(jpe?g|gif|png)$/ ) {
227 push @inlined_images, { src => $anchor->{href} };
228 }
229 }
230 99 ahitrov }
231 if ( @inlined_images ) {
232 my %images = map { $_->{src} => $_ } @images, @inlined_images;
233 @images = values %images;
234 }
235 98 ahitrov push @items, {
236 108 ahitrov 'checksum' => md5_hex(encode_utf8(($title || '').($description || ''))),
237 98 ahitrov 'ignore' => $item->{ignore} || 0,
238 'title' => $title || '',
239 'title_gui' => $item->{title_gui} || $title || '',
240 'description' => $description || '',
241 'description_gui' => $item->{description_gui} || $description || '',
242 'desc_length' => length( $description || '' ),
243 'link' => $link || '',
244 'pubdate' => $pubdate || '',
245 'fulltext' => $fulltext || '',
246 'fulltext_gui' => $item->{fulltext_gui} || '',
247 'fulltext_field' => $fulltext_field || '',
248 'image' => @images ? $images[0] : undef,
249 'images' => @images ? \@images : undef,
250 'video' => @videos ? $videos[0] : undef,
251 'videos' => @videos ? \@videos : undef,
252 'categories' => $category,
253 'video_url' => $video_url || '',
254 'audio_url' => $audio_url || '',
255 'author' => $author || '',
256 'related' => $related,
257 'content_type' => $content_type,
258 'allow_fulltext' => $allow_fulltext,
259 };
260 }
261 } else {
262 99 ahitrov warn ($@ || 'Something wrong while parsing content');
263 98 ahitrov return $self->is_success(0);
264 }
265
266 99 ahitrov $self->{items} = \@items;
267 98 ahitrov return $self->is_success(1);
268 }
269
270
271 sub __check_rewrite {
272 my ($self, %opts) = @_;
273 my $item = $opts{item};
274 return unless ref $item;
275 return unless ref $opts{rools} eq 'ARRAY';
276 my @rools = grep { $_->{action} eq 'rewrite' } @{ $opts{rools} };
277 return unless @rools;
278 foreach my $rool ( @rools ) {
279 my $field = $rool->{target};
280 my $value = $rool->{condition};
281 if ( $value eq 'CURRENT_DATETIME' ) {
282 my $dt = DateTime->now( time_zone => "Europe/Moscow" );
283 $value = $dt->ymd('-').'T'.$dt->hms.' MSK';
284 }
285 $item->{$field} = $value if exists $item->{$field};
286 }
287 }
288
289
290 sub __check_filter {
291 my ($self, %opts) = @_;
292 my $field = $opts{field};
293 my $gui = $opts{gui};
294 my $item = $opts{item};
295 my $text = exists $opts{text} ? $opts{text} : undef;
296 return unless ref $item;
297 return unless exists $opts{text} || exists $item->{$field};
298 return unless ref $opts{rools} eq 'ARRAY';
299 my @rools = grep { $_->{action} eq 'filter' && $_->{target} eq $field } @{ $opts{rools} };
300 return unless @rools;
301 foreach my $rool ( @rools ) {
302 if ( $rool->{command} eq 'cut' ) {
303 my $condition = $rool->{condition};
304 if ( $rool->{subcommand} eq 'off' ) {
305 if ( $opts{gui} ) {
306 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
307 $field_gui =~ s/($condition)/<b style="color:red">$1<\/b>/sgi;
308 $item->{$field."_gui"} = $field_gui;
309 }
310 if ( exists $opts{text} ) {
311 $$text =~ s/$condition//sgi;
312 } else {
313 $item->{$field} =~ s/$condition//sgi;
314 }
315 } elsif ( $rool->{subcommand} eq 'from' ) {
316 if ( $gui ) {
317 my $cut_text = '';
318 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
319 my $pos = index $field_gui, $condition;
320 if ( $pos >= 0 ) {
321 $cut_text = substr $field_gui, $pos, -1;
322 $field_gui = substr $field_gui, 0, $pos;
323 }
324 # $field_gui =~ s/($condition)(.*)$/<b style="color:red">$1$2<\/b>/si;
325 $item->{$field."_gui"} = $field_gui.'<b style="color:red">'.$cut_text.'</b>';
326 }
327 if ( exists $opts{text} ) {
328 my $pos = index $$text, $condition;
329 if ( $pos >= 0 ) {
330 $$text = substr $$text, 0, $pos;
331 }
332 } else {
333 my $pos = index $item->{$field}, $condition;
334 if ( $pos >= 0 ) {
335 $item->{$field} = substr $item->{$field}, 0, $pos;
336 }
337 }
338 } elsif ( $rool->{subcommand} eq 'till' ) {
339 if ( $opts{gui} ) {
340 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
341 $field_gui =~ s/^(.*?)($condition)/<b style="color:red">$1$2<\/b>/si;
342 $item->{$field."_gui"} = $field_gui;
343 }
344 if ( exists $opts{text} ) {
345 $$text =~ s/^(.*?)($condition)//si;
346 } else {
347 $item->{$field} =~ s/^(.*?)($condition)//si;
348 }
349 } elsif ( $rool->{subcommand} eq 'untill' ) {
350 if ( $opts{gui} ) {
351 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
352 $field_gui =~ s/^(.*?)($condition)/<b style="color:red">$1<\/b>$2/si;
353 $item->{$field."_gui"} = $field_gui;
354 }
355 if ( exists $opts{text} ) {
356 $$text =~ s/^(.*?)($condition)/$2/si;
357 } else {
358 $item->{$field} =~ s/^(.*?)($condition)/$2/si;
359 }
360 } elsif ( $rool->{subcommand} eq 'regex' ) {
361 if ( $opts{gui} ) {
362 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
363 if ( substr($condition,0,1) eq '^' ) {
364 my $cond = reverse($condition);
365 chop($cond);
366 $cond = reverse($cond);
367 $field_gui =~ s/^($cond)/<b style="color:red">$1<\/b>/si;
368 } elsif ( substr($condition,-1,1) eq '$' ) {
369 my $cond = $condition;
370 chop($cond);
371 $field_gui =~ s/($cond)$/<b style="color:red">$1<\/b>/si;
372 } else {
373 $field_gui =~ s/($condition)/<b style="color:red">$1<\/b>/sgi;
374 }
375 $item->{$field."_gui"} = $field_gui;
376 }
377 if ( exists $opts{text} ) {
378 $$text =~ s/$condition//sgi;
379 } else {
380 $item->{$field} =~ s/$condition//sgi;
381 }
382 }
383 } elsif ( $rool->{command} eq 'regex' ) {
384 my $from = $rool->{condition}{from};
385 my $to = $rool->{condition}{to};
386 if ( exists $opts{text} ) {
387 eval ("\$\$text =~ s/$from/$to/sgi");
388 } else {
389 eval ("\$item->{\$field} =~ s/$from/$to/sgi");
390 }
391 }
392 }
393 }
394
395
396 sub __check_ignore {
397 my ($self, %opts) = @_;
398 my $item = $opts{item};
399 return unless ref $item;
400 return unless ref $opts{rools} eq 'ARRAY';
401 my @rools = grep { $_->{action} eq 'ignore' } @{ $opts{rools} };
402 return unless @rools;
403 foreach my $rool ( @rools ) {
404 my $target = $rool->{target};
405 if ( $rool->{command} =~ /^contain/ ) {
406 $item->{ignore} = 1 if index (lc($item->{$target}), lc($rool->{condition})) >= 0;
407 }
408 if ( $rool->{command} eq '=' ) {
409 $item->{ignore} = 1 if lc($item->{$target}) eq lc($rool->{condition});
410 }
411 if ( $rool->{command} eq 'regex' ) {
412 my $regex = $rool->{condition};
413 $item->{ignore} = 1 if $item->{$target} =~ /$regex/sgi;
414 }
415 }
416 }
417
418
419 sub __check_only {
420 my ($self, %opts) = @_;
421 my $item = $opts{item};
422 return unless ref $item;
423 return unless ref $opts{rools} eq 'ARRAY';
424 my @rools = grep { $_->{action} eq 'only' } @{ $opts{rools} };
425 return unless @rools;
426 foreach my $rool ( @rools ) {
427 my $target = $rool->{target};
428 if ( $rool->{command} =~ /^contain/ ) {
429 $item->{ignore} = 1 unless index (lc($item->{$target}), lc($rool->{condition})) >= 0;
430 }
431 if ( $rool->{command} eq '=' ) {
432 $item->{ignore} = 1 unless lc($item->{$target}) eq lc($rool->{condition});
433 }
434 if ( $rool->{command} eq 'regex' ) {
435 my $regex = $rool->{condition};
436 $item->{ignore} = 1 unless $item->{$target} =~ /$regex/sgi;
437 }
438 }
439 }
440
441
442 sub __feed_type {
443 my ($self, $contref) = @_;
444
445 my $type;
446 if ( $$contref =~ /<rss([^>]*)>/ ) {
447 $type = 'RSS';
448 } elsif ( $$contref =~ /<feed\s+([^>]*)>/ ) {
449 my $feed_params = $1;
450 my $params = $self->__parse_params ($feed_params);
451 if ( exists $params->{xmlns} && $params->{xmlns} =~ /purl.org\/atom/ ) {
452 $type = 'ATOM';
453 } elsif ( exists $params->{xmlns} && $params->{xmlns} =~ /www.w3.org\/2005\/Atom/ ) {
454 $type = 'ATOM';
455 }
456 } elsif ( $$contref =~ /<rdf([^>]*)>/ ) {
457 $type = 'RDF';
458 }
459 return $type;
460 }
461
462
463 sub __parse_content {
464 my ($self, $contref) = @_;
465
466 my $feed_type = $self->__feed_type($contref);
467 # warn "FEED Type = [$feed_type]\n";
468 return undef unless $feed_type;
469
470 $$contref =~ s/>\s+</></sgi;
471 $$contref =~ s/<items(.*?)>(.*?)<\/items([^>]*?)>//sgi;
472 $$contref =~ s/<\/?br(.*?)>/\n/sgi;
473 $$contref =~ s/<\/?nobr(.*?)>//sgi;
474 #$$contref =~ s/<p>/\n\n/sgi;
475 #$$contref =~ s/<p\s(.*?)>/\n\n/sgi;
476 $$contref =~ s/<\/?strong\s(.*?)>//sgi;
477 $$contref =~ s/<\/?s>//sgi;
478 $$contref =~ s/<\/?i>//sgi;
479 $$contref =~ s/<\/?b>//sgi;
480 $$contref =~ s/<\/?strong>//sgi;
481 #$$contref =~ s/<\/p>//sgi;
482 #$$contref =~ s/<\/p\s(.*?)>//sgi;
483 my @items;
484
485 if ( $feed_type eq 'RSS' ) {
486 while ( $$contref =~ /<item(.*?)>(.*?)<\/item([^>]*?)>/sgi ) {
487 my $item_params = $1;
488 my $item_body = $2;
489 # warn "BODY: [$item_body]\n\n";
490 my $params = $self->__parse_params ($item_params);
491 my $item = $self->__parse_item_RSS ($item_body) || {};
492 if ( ref $params eq 'HASH' ) {
493 foreach my $key ( %$params ) {
494 if ( exists $item->{$key} && ref $item->{$key} eq 'ARRAY' ) {
495 push @{ $item->{$key} }, $params->{$key};
496 } elsif ( exists $item->{$key} ) {
497 my @arr = ( $item->{$key}, $params->{$key} );
498 $item->{$key} = \@arr;
499 } else {
500 $item->{$key} = $params->{$key};
501 }
502 }
503 }
504 if ( ref $item eq 'HASH' && scalar keys %$item ) {
505 if ( exists $item->{'feedburner:origlink'} ) {
506 $item->{link} = ref $item->{'feedburner:origlink'} eq 'ARRAY' ? $item->{'feedburner:origlink'}->[0] : $item->{'feedburner:origlink'};
507 } elsif ( !exists $item->{link} ) {
508 foreach my $key ( qw( guid ) ) {
509 if ( exists $item->{$key} ) {
510 $item->{link} = $item->{$key};
511 last;
512 }
513 }
514 }
515 push @items, $item;
516 }
517 # warn Dumper($item);
518 }
519 }
520 if ( $feed_type eq 'RDF' ) {
521 while ( $$contref =~ /<item(.*?)>(.*?)<\/item([^>]*?)>/sgi ) {
522 my $item_params = $1;
523 my $item_body = $2;
524 # warn "BODY: [$item_body]\n\n";
525 my $params = $self->__parse_params ($item_params);
526 my $item = $self->__parse_item_RSS ($item_body) || {};
527 if ( ref $params eq 'HASH' ) {
528 foreach my $key ( %$params ) {
529 if ( exists $item->{$key} && ref $item->{$key} eq 'ARRAY' ) {
530 push @{ $item->{$key} }, $params->{$key};
531 } elsif ( exists $item->{$key} ) {
532 my @arr = ( $item->{$key}, $params->{$key} );
533 $item->{$key} = \@arr;
534 } else {
535 $item->{$key} = $params->{$key};
536 }
537 }
538 }
539 # warn Dumper($item);
540 if ( ref $item eq 'HASH' && scalar keys %$item ) {
541 if ( !exists $item->{pubdate} ) {
542 foreach my $key ( 'prism:publicationdate', 'dc:date' ) {
543 if ( exists $item->{$key} ) {
544 $item->{pubdate} = $item->{$key};
545 last;
546 }
547 }
548 }
549 push @items, $item;
550 }
551 }
552 }
553 if ( $feed_type eq 'ATOM' ) {
554 while ( $$contref =~ /<entry(.*?)>(.*?)<\/entry([^>]*?)>/sgi ) {
555 my $item_params = $1;
556 my $item_body = $2;
557 my $item = $self->__parse_item_ATOM ($item_body) || {};
558 # warn Dumper($item);
559 if ( ref $item eq 'HASH' && scalar keys %$item ) {
560 if ( !exists $item->{pubdate} ) {
561 foreach my $key ( 'published', 'updated' ) {
562 if ( exists $item->{$key} ) {
563 $item->{pubdate} = $item->{$key};
564 last;
565 }
566 }
567 }
568 push @items, $item;
569 }
570 }
571 }
572 return ( scalar @items ? \@items : undef );
573 }
574
575
576 sub __parse_params {
577 my ($self, $params) = @_;
578 return undef unless $params;
579
580 my %params;
581 while ( $params =~ /([\w\:]+)(\s*?)=(\s*?)["'](.*?)["']/sgi ) {
582 my $name = $1;
583 my $value = $4;
584 if ( $name && $value ) {
585 $params{$name} = $value;
586 }
587 }
588 return ( scalar(keys %params) ? \%params : undef );
589 }
590
591
592 sub __parse_item_RSS {
593 my ($self, $item_body, $debug) = @_;
594 return undef unless $item_body;
595
596 my %item;
597 # my $embedded = $self->__item_cut_rss_embedded(\$item_body);
598 # if ( ref $embedded ) {
599 # %item = %$embedded;
600 # }
601 # my $content = $self->__item_cut_rss_description(\$item_body);
602 # $item{description} = $content if $content;
603 # my $one_string_elements = $self->__item_cut_single_elements (\$item_body);
604 # if ( ref $one_string_elements eq 'ARRAY' && @$one_string_elements ) {
605 # foreach my $elem ( @$one_string_elements ) {
606 # my ($elem_name) = keys %$elem if ref $elem eq 'HASH';
607 # if ( exists $item{$elem_name} && ref $item{$elem_name} eq 'ARRAY' ) {
608 # push @{ $item{$elem_name} }, $elem->{$elem_name};
609 # } elsif ( exists $item{$elem_name} ) {
610 # $item{$elem_name} = [$item{$elem_name}, $elem->{$elem_name}];
611 # } else {
612 # $item{$elem_name} = $elem->{$elem_name};
613 # }
614 # }
615 # }
616 my $parsed = $self->__make_tree (\$item_body, $debug);
617 # warn Dumper($parsed);
618 if ( ref $parsed && exists $parsed->{1} && exists $parsed->{1}{children} && ref $parsed->{1}{children} eq 'ARRAY' ) {
619 foreach my $tag ( @{ $parsed->{1}{children} } ) {
620 if ( ref $tag->{children} eq 'ARRAY' && scalar @{ $tag->{children} } ) {
621 my %params;
622 foreach my $it ( @{ $tag->{children} } ) {
623 next unless $it->{text};
624 if ( exists $params{$it->{type}} && ref $params{$it->{type}} eq 'ARRAY' ) {
625 push @{ $params{$it->{type}} }, $it->{text};
626 } elsif ( exists $params{$it->{type}} ) {
627 my @arr = ( $params{$it->{type}}, $it->{text} );
628 $params{$it->{type}} = \@arr;
629 } else {
630 $params{$it->{type}} = $it->{text};
631 }
632 }
633 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) {
634 push @{ $item{$tag->{type}} }, \%params;
635 } elsif ( exists $item{$tag->{type}} ) {
636 my @arr = ( $item{$tag->{type}}, \%params );
637 $item{$tag->{type}} = \@arr;
638 } else {
639 $item{$tag->{type}} = \%params;
640 }
641 } else {
642 my $body = $tag->{text} || $tag->{params};
643 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) {
644 push @{ $item{$tag->{type}} }, $body;
645 } elsif ( exists $item{$tag->{type}} ) {
646 my @arr = ( $item{$tag->{type}}, $body );
647 $item{$tag->{type}} = \@arr;
648 } else {
649 $item{$tag->{type}} = $body;
650 }
651 }
652 }
653 }
654 # warn Dumper(\%item);
655 return \%item;
656 }
657
658
659 sub __parse_item_ATOM {
660 my ($self, $item_body, $debug) = @_;
661 return undef unless $item_body;
662
663 my %item;
664 my $embedded = $self->__item_cut_rss_embedded(\$item_body);
665 if ( ref $embedded ) {
666 %item = %$embedded;
667 }
668 if ( exists $item{summary} ) {
669 $item{description} = delete $item{summary};
670 } else {
671 my $summary = $self->__item_cut_atom_summary(\$item_body);
672 $item{description} = $summary if $summary;
673 }
674 my $content = $self->__item_cut_atom_content(\$item_body);
675 if ( $content && $item{description} ) {
676 $item{fulltext} = $content;
677 } elsif ( $content ) {
678 $item{description} = $content;
679 }
680 my $one_string_elements = $self->__item_cut_single_elements (\$item_body);
681 # warn Dumper ($one_string_elements);
682 if ( ref $one_string_elements eq 'ARRAY' && @$one_string_elements ) {
683 foreach my $elem ( @$one_string_elements ) {
684 my ($elem_name) = keys %$elem if ref $elem eq 'HASH';
685 if ( exists $item{$elem_name} && ref $item{$elem_name} eq 'ARRAY' ) {
686 push @{$item{$elem_name}}, $elem->{$elem_name};
687 } elsif ( exists $item{$elem_name} ) {
688 my @arr = ($item{$elem_name}, $elem->{$elem_name});
689 $item{$elem_name} = \@arr;
690 } else {
691 $item{$elem_name} = $elem->{$elem_name};
692 }
693 if ( $elem->{$elem_name}{type} =~ /^image/ ) {
694 my $enclosure = { url => $elem->{$elem_name}{href} || $elem->{$elem_name}{url}, type => $elem->{$elem_name}{type} };
695 if ( exists $item{enclosure} && ref $item{enclosure} eq 'ARRAY' ) {
696 push @{ $item{enclosure} }, $enclosure;
697 } elsif ( exists $item{enclosure} ) {
698 my @arr = ($item{enclosure}, $enclosure);
699 $item{enclosure} = \@arr;
700 } else {
701 $item{enclosure} = $enclosure;
702 }
703 }
704 if ( $elem->{$elem_name}{type} =~ /^video/ ) {
705 my $enclosure = { url => $elem->{$elem_name}{href} || $elem->{$elem_name}{url}, type => $elem->{$elem_name}{type} };
706 if ( exists $item{enclosure} && ref $item{enclosure} eq 'ARRAY' ) {
707 push @{ $item{enclosure} }, $enclosure;
708 } elsif ( exists $item{enclosure} ) {
709 my @arr = ($item{enclosure}, $enclosure);
710 $item{enclosure} = \@arr;
711 } else {
712 $item{enclosure} = $enclosure;
713 }
714 }
715 }
716 }
717 my $parsed = $self->__make_tree (\$item_body, $debug);
718 # warn Dumper($parsed);
719 if ( ref $parsed && exists $parsed->{1} && exists $parsed->{1}{children} && ref $parsed->{1}{children} eq 'ARRAY' ) {
720 foreach my $tag ( @{ $parsed->{1}{children} } ) {
721 if ( ref $tag->{children} eq 'ARRAY' && scalar @{ $tag->{children} } ) {
722 my %params;
723 foreach my $it ( @{ $tag->{children} } ) {
724 next unless $it->{text};
725 if ( exists $params{$it->{type}} && ref $params{$it->{type}} eq 'ARRAY' ) {
726 push @{ $params{$it->{type}} }, $it->{text};
727 } elsif ( exists $params{$it->{type}} ) {
728 my @arr = ( $params{$it->{type}}, $it->{text} );
729 $params{$it->{type}} = \@arr;
730 } else {
731 $params{$it->{type}} = $it->{text};
732 }
733 }
734 if ( exists $tag->{params} && ref $tag->{params} eq 'HASH' ) {
735 while ( my ($param, $value) = each %{ $tag->{params} } ) {
736 if ( exists $params{$param} && ref $params{$param} eq 'ARRAY' ) {
737 push @{ $params{$param} }, $value;
738 } elsif ( exists $params{$param} ) {
739 my @arr = ( $params{$param}, $value );
740 $params{$param} = \@arr;
741 } else {
742 $params{$param} = $value;
743 }
744 }
745 }
746 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) {
747 push @{ $item{$tag->{type}} }, \%params;
748 } elsif ( exists $item{$tag->{type}} ) {
749 my @arr = ( $item{$tag->{type}}, \%params );
750 $item{$tag->{type}} = \@arr;
751 } else {
752 $item{$tag->{type}} = \%params;
753 }
754 } else {
755 my $body = $tag->{text} || $tag->{params};
756 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) {
757 push @{ $item{$tag->{type}} }, $body;
758 } elsif ( exists $item{$tag->{type}} ) {
759 my @arr = ( $item{$tag->{type}}, $body );
760 $item{$tag->{type}} = \@arr;
761 } else {
762 $item{$tag->{type}} = $body;
763 }
764 }
765 }
766 my $pubDate = exists $item{issued} ? $item{issued} : exists $item{modified} ? $item{modified} : undef;
767 $item{pubdate} = $pubDate if $pubDate;
768 }
769
770 # warn Dumper(\%item);
771 return \%item;
772 }
773
774
775 sub __make_tree {
776 my ($self, $content, $debug) = @_;
777
778 my @elems = split (//,$$content);
779 # warn "CONTENT: [$$content]\n\n";
780 my $id = 1;
781 my $level = 0;
782 my @stack;
783 my %tree = (
784 root => {
785 id => $id++,
786 text => '',
787 type => 'root',
788 children=> [],
789 parent => undef,
790 level => $level,
791 },
792 );
793 my %elem_hash = ( 1 => $tree{root} );
794 my $current = $tree{root};
795
796 while ( @elems ) {
797 if ( $elems[0] eq '<' && $elems[1] =~ /[\!a-zA-Z]/ ) {
798 my $tag = $self->__try_tag (\@elems);
799 if ( ref $tag && $tag->{type} eq 'text' ) {
800 $current->{text} .= $tag->{content};
801 splice @elems, 0, $tag->{count};
802 # warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n";
803 } elsif ( ref $tag && exists $tag->{closed} && $tag->{closed} ) {
804 $tag->{id} = $id++;
805 $tag->{parent} = $current;
806 $tag->{level} = $level+1;
807 $elem_hash{$tag->{id}} = $tag;
808 push @{$current->{children}}, $tag;
809 splice @elems, 0, $tag->{count};
810 # warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n";
811 } elsif ( ref $tag ) {
812 $tag->{id} = $id++;
813 $tag->{children} = [];
814 $tag->{parent} = $current;
815 $tag->{level} = ++$level;
816 $elem_hash{$tag->{id}} = $tag;
817 push @{$current->{children}}, $tag;
818 push @stack, $current;
819 $current = $tag;
820 splice @elems, 0, $tag->{count};
821 # warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n";
822 } else {
823 # warn "!!!! Error: RSS analyse. Job on item broken... !!!!\n" if $debug;
824 return undef;
825 }
826 } elsif ( $elems[0] eq '<' && $elems[1] =~ /\// ) {
827 my $tag = $self->__try_end (\@elems);
828 if ( ref $tag && $tag->{type} eq 'text' ) {
829 $current->{text} .= $tag->{content};
830 $current->{count} += $tag->{count};
831 splice @elems, 0, $tag->{count};
832 } elsif ( ref $tag ) {
833 if ( $current->{type} ne $tag->{type} ) {
834 # warn "!!!!Wrong tag type for closing. It's [$tag->{type}]. It must be [$current->{type}]!!!!\n" if $debug;
835 return undef;
836 } else {
837 $current = pop @stack;
838 $level = $current->{level};
839 # warn "Text place: [".substr($current->{text}, 0, 20)."]\n" if exists $current->{text};
840 # warn "Close type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n";
841 }
842 splice @elems, 0, $tag->{count};
843 } else {
844 # warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug;
845 return undef;
846 }
847 } else {
848 $current->{text} .= shift @elems;
849 $current->{count}++;
850 }
851 }
852 return (\%elem_hash);
853 }
854
855
856
857 sub __try_tag {
858 my ($self, $content) = @_;
859
860 my $i = 1;
861 my %tag;
862 my $tag = $content->[0];
863 if ( $content->[$i] eq '!' ) {
864 # warn "What? Think it's CDATA\n";
865 my $try_cdata = join '', @$content[1..8];
866 if ( $try_cdata eq '![CDATA[' ) {
867 $tag = '';
868 $i = 9;
869 while ( !($content->[$i-1] eq '>' && $content->[$i-2] eq ']' && $content->[$i-3] eq ']') && $i < scalar @$content ) {
870 $tag .= $content->[$i];
871 $i++;
872 }
873 chop $tag; chop $tag; chop $tag;
874 }
875 # warn "CDATA Found: [$tag]";
876 return {
877 type => 'text',
878 content => $tag,
879 count => $i,
880 };
881 }
882 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) {
883 $tag .= $content->[$i];
884 $i++;
885 }
886 if ( $content->[$i] eq '<' || $i >= scalar @$content ) {
887 return {
888 type => 'text',
889 content => $tag,
890 count => $i,
891 };
892 } else {
893 if ( $tag =~ /^<([\w:-]+)\s*(.*)/si ) {
894 my $elem_name = $1;
895 my $elem_body = $2;
896 unless ( $self->__is_valid_tag ($elem_name) ) {
897 return {
898 type => 'text',
899 content => $tag,
900 count => $i,
901 };
902 } else {
903 my $params = $self->__parse_params ($elem_body) if $elem_body;
904 if ( $content->[$i] eq '>' && $content->[$i-1] eq '/' ) {
905 $tag{closed} = 1;
906 }
907 $tag{type} = lc($elem_name);
908 $tag{count} = $i+1;
909 $tag{params} = $params if ref $params;
910 return \%tag;
911 }
912 } else {
913 return {
914 type => 'text',
915 content => $tag,
916 count => $i,
917 };
918 }
919 }
920 }
921
922
923 sub __try_end {
924 my ($self, $content) = @_;
925
926 my $i = 2;
927 my %tag;
928 my $tag = $content->[0].$content->[1];
929 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) {
930 $tag .= $content->[$i];
931 $i++;
932 }
933 if ( $content->[$i] eq '<' || $i >= scalar @$content ) {
934 return {
935 type => 'text',
936 content => $tag,
937 count => $i,
938 };
939 } else {
940 if ( $tag =~ /^<\/([\w:-]+)/i ) {
941 my $elem_name = $1;
942 unless ( $self->__is_valid_tag ($elem_name) ) {
943 return {
944 type => 'text',
945 content => $tag,
946 count => $i,
947 };
948 } else {
949 $tag{type} = lc($elem_name);
950 $tag{count} = $i+1;
951 return \%tag;
952 }
953 } else {
954 return {
955 type => 'text',
956 content => $tag,
957 count => $i,
958 };
959 }
960 }
961 }
962
963
964 sub __is_valid_tag {
965 my ($self, $tag) = @_;
966 foreach my $invtag ( @INVALID_TAGS ) {
967 return 0 if lc($invtag) eq lc($tag);
968 }
969 return 1;
970 }
971
972
973 sub __item_cut_atom_content {
974 my ($self, $item_body) = @_;
975
976 my %elem;
977 if ( $$item_body =~ /<content([^>]*?)>(.*?)<\/content([^>]*)>/si ) {
978 my $content_params = $1;
979 my $content_body = $2;
980 my $params = $self->__parse_params ($content_params) if $content_params;
981 $$item_body =~ s/<content([^>]*?)>(.*?)<\/content([^>]*)>//si;
982 return $content_body;
983 }
984 }
985
986
987 sub __item_cut_atom_summary {
988 my ($self, $item_body) = @_;
989
990 my %elem;
991 if ( $$item_body =~ /<summary([^>]*)>(.*?)<\/summary([^>]*)>/si ) {
992 my $content_params = $1;
993 my $content_body = $2;
994 my $params = $self->__parse_params ($content_params) if $content_params;
995 $$item_body =~ s/<summary([^>]*)>(.*?)<\/summary([^>]*)>//si;
996 return $content_body;
997 }
998 }
999
1000
1001 sub __item_cut_rss_description {
1002 my ($self, $item_body) = @_;
1003
1004 my %elem;
1005 if ( $$item_body =~ /<description([^>]*?)>(.*?)<\/description([^>]*)>/si ) {
1006 my $content_params = $1;
1007 my $content_body = $2;
1008 my $params = $self->__parse_params ($content_params) if $content_params;
1009 $$item_body =~ s/<description([^>]*?)>(.*?)<\/description([^>]*)>//si;
1010 return $content_body;
1011 }
1012 }
1013
1014
1015 sub __item_cut_rss_embedded {
1016 my ($self, $item_body) = @_;
1017
1018 my %elem;
1019 while ( $$item_body =~ /<([^>]*?)>\s*<!\[CDATA\[(.*?)\]\]>\s*<\/([^>]*)>/sgi ) {
1020 my $tag = $3;
1021 my $content_body = $2;
1022 my $content_params = $1;
1023 if ( $content_params =~ /([\w:-]+)\s+(.*)/ ) {
1024 $tag = 1;
1025 $content_params = $2;
1026 }
1027 my $params = $self->__parse_params ($content_params) if $content_params;
1028 $elem{$tag} = $content_body;
1029 $$item_body =~ s/<$tag([^>]*?)>(.*?)<\/$tag([^>]*)>//si;
1030 }
1031 return scalar keys %elem ? \%elem : undef;
1032 }
1033
1034
1035
1036 sub __item_cut_single_elements {
1037 my ($self, $item_body) = @_;
1038
1039 my @elems;
1040 while ( $$item_body =~ /<([\w\:\-]+)\s*([^>]*?)\/>/sgi ) {
1041 my $elem_name = $1;
1042 my $elem_body = $2;
1043 my $params = $self->__parse_params ($elem_body) if $elem_body;
1044 if ( $elem_name && ref $params ) {
1045 push @elems, { $elem_name => $params }
1046 }
1047 }
1048 $$item_body =~ s/<(\w+)\s*([^>]*?)\/>//sgi;
1049 return ( @elems ? \@elems : undef );
1050 }
1051
1052
1053 sub __field_prepare {
1054 my ($self, $text) = @_;
1055 101 ahitrov return unless $text;
1056 98 ahitrov
1057 100 ahitrov $text =~ s/^[\n\r\ \t]+//;
1058 $text =~ s/[\n\r\ \t]+$//;
1059 98 ahitrov $self->__cdata (\$text);
1060 $self->__extchar (\$text);
1061 99 ahitrov # $text = HTML::Entities::decode_entities($text);
1062 98 ahitrov
1063 # Remove linebreaks inside incorrectly breaked paragraphs
1064 if (length($text) > 100) {
1065 my $pcount = 0;
1066 while ($text =~ /<p>(.+?)(?=<\/?p>|$)/sgi) {
1067 my $p = $1;
1068 if (length $p > 50) {
1069 107 ahitrov my ($dcount, $ndcount) = (0,0);
1070 98 ahitrov # Count sentences normally ended vs breaked
1071 $dcount++ while $p =~ /(\.|\?|\!)['"]?\s*[\r\n]+/g;
1072 $ndcount++ while $p =~ /([^\.\?\!\s])\s*[\r\n]+/g;
1073 # Found broken paragraph
1074 last if $ndcount > $dcount and ++$pcount > 1;
1075 }
1076 }
1077 if ($pcount > 0) {
1078 $text =~ s/[\n\r]+/ /sg;
1079 }
1080 }
1081 $text =~ s/<br[^>]*>/\n/sgi;
1082 $text =~ s/<p\s*>/\n\n/sgi;
1083 $text =~ s/<\/p\s*>//sgi;
1084 99 ahitrov # $text = Contenido::Parser::Util::strip_html($text);
1085 # $text = Contenido::Parser::Util::text_cleanup($text);
1086 98 ahitrov return $text;
1087 }
1088
1089
1090 sub __extchar {
1091 my ($self, $textref) = @_;
1092
1093 for ( $$textref ) {
1094 s/&#38;/\&/sg;
1095 s/\&amp;/\&/sgi;
1096 99 ahitrov s/\&amp;/\&/sgi;
1097 100 ahitrov s/\&lt;/</sgi;
1098 s/\&gt;/>/sgi;
1099 99 ahitrov s/\&quot;/"/sgi;
1100 s/\&#171;/«/sg;
1101 s/\&#187;/»/sg;
1102 s/\&#163;/£/sg;
1103 s/\&#150;/&ndash;/sg;
1104 s/\&#151;/&mdash;/sg;
1105 s/\&#132;/"/sg;
1106 s/\&#147;/"/sg;
1107 s/\&#148;/"/sg;
1108 s/\&#180;/'/sg;
1109 s/\&#133;/\.\.\./sg;
1110 s/\&#13;/\n/sg;
1111 s/\&#34;/"/sg;
1112 98 ahitrov }
1113 # $$textref =~ s/&#(\d+);/{'&#'.__normalise($1).';'}/eg;
1114 # $$textref =~ s/&laquo;/«/sgi;
1115 # $$textref =~ s/&raquo;/»/sgi;
1116 # $$textref =~ s/&copy;/©/sgi;
1117 # $$textref =~ s/&ndash;/–/sgi;
1118 # $$textref =~ s/&mdash;/—/sgi;
1119 # $$textref =~ s/&deg;/º/sgi;
1120 # $$textref =~ s/&nbsp;/\x20/sgi;
1121 }
1122
1123 sub __normalise {
1124 my $chr = shift;
1125 return sprintf("%04d",$chr);
1126 }
1127
1128 sub __cdata {
1129 my ($self, $textref) = @_;
1130 if ( $$textref =~ /^<\!\[CDATA\[/ ) {
1131 $$textref =~ s/<\!\[CDATA\[//sgi;
1132 $$textref =~ s/\]\]>//sgi;
1133 }
1134 }
1135
1136
1137 sub __parse_rools {
1138 my ($self, $rools) = @_;
1139 return unless $rools;
1140 $rools =~ s/\r//sgi;
1141 my @rools = split /\n/, $rools;
1142 return unless @rools;
1143
1144 my @parsed;
1145 foreach my $rool ( @rools ) {
1146 my %pr;
1147 next if $rool =~ /^#/;
1148 $rool =~ s/[\x20\t]+$//;
1149 $rool =~ s/^[\x20\t]+//;
1150 if ( $rool =~ /^([\w']+)\s+(.*)$/ || $rool =~ /^(\w+)(.*)$/ ) {
1151 $pr{action} = lc($1);
1152 my $params = $2;
1153 if ( $pr{action} eq 'use' && $params =~ /^(current)\s+(date)$/ ) {
1154 $pr{action} = 'rewrite';
1155 $pr{target} = 'pubdate';
1156 $pr{command} = 'set';
1157 $pr{condition} = 'CURRENT_DATETIME';
1158 push @parsed, \%pr;
1159 } elsif ( $params =~ /^(\w+)\s+(.*)$/ ) {
1160 $pr{target} = lc($1);
1161 $params = $2;
1162 if ( $params =~ /^([\w=]+)\s+(.*)$/ ) {
1163 $pr{command} = lc($1);
1164 $params = $2;
1165 if ( $pr{action} eq 'filter' && $pr{command} eq 'cut' && $params =~ /^(\w+)\s+(.*)$/ ) {
1166 $pr{subcommand} = lc($1); $params = $2;
1167 next unless $pr{subcommand} =~ /^(untill|till|from|off|regex)$/;
1168 $params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex';
1169 $pr{condition} = $params;
1170 } elsif ( $pr{action} eq 'filter' && $pr{command} eq 'regex' && substr($params,0,1) eq substr($params,-1,1) && substr($params,0,1) =~ /([\/\#\|])/ ) {
1171 my $delim = $1;
1172 $params = substr($params,1,length($params)-2);
1173 my @params = split(//,$params);
1174 my ($from, $to) = ('','');
1175 my $prev = '';
1176 while ( @params ) {
1177 my $ch = shift @params;
1178 if ( $ch eq $delim && $prev ne '\\' ) {
1179 last;
1180 } else {
1181 $prev = $ch;
1182 $from .= $ch;
1183 }
1184 }
1185 $to = join ('', @params);
1186 $pr{condition} = { from => $from, to => $to };
1187 } elsif ( ($pr{action} eq 'ignore' || $pr{action} eq 'only') && $pr{command} =~ /^(regex|=|contain|contains)$/ ) {
1188 $params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex';
1189 $pr{condition} = $params;
1190 } else {
1191 next;
1192 }
1193 push @parsed, \%pr;
1194 }
1195 }
1196 }
1197 }
1198 return ( scalar @parsed ? \@parsed : undef );
1199 }
1200
1201
1202 sub __parse_related {
1203 my ($self, $related) = @_;
1204 return unless ref $related eq 'HASH';
1205 return unless exists $related->{url} && $related->{url} =~ /^http:\/\//i;
1206 return unless exists $related->{rel} && $related->{rel} =~ /(news|discussion|teaser)/;
1207 my $result = { url => $related->{url}, rel => $related->{rel} };
1208 $result->{type} = $related->{type} if exists $related->{type};
1209 $result->{title} = $self->__field_prepare($related->{title}) if exists $related->{title} && $related->{title};
1210
1211 $result->{author} = $self->__field_prepare($related->{author}) if exists $related->{author} && $related->{author};
1212 $result->{description} = $self->__field_prepare($related->{description}) if exists $related->{description} && $related->{description};
1213
1214 if ( exists $related->{pubdate} && $related->{pubdate} ) {
1215 my $pubdate = Class::Date::localdate(Date::Parse::str2time($related->{pubdate}));
1216 $result->{pubdate} = $pubdate if $pubdate;
1217 }
1218 if ( $related->{rel} =~ /(news|teaser)/ ) {
1219 return undef unless $result->{title} && $result->{pubdate};
1220 } else {
1221 $result->{title} ||= 'Обсудить';
1222 }
1223
1224 if ( exists $related->{image} && $related->{image} ) {
1225 if ( ref $related->{image} eq 'HASH' && (exists $related->{image}{url} || exists $related->{image}{href}) ) {
1226 my $img = rchannel::Image->new( { src => ($related->{image}{url} || $related->{image}{href}) } );
1227 $result->{image} = $img if ref $img;
1228 } elsif ( !ref $related->{image} ) {
1229 my $img = rchannel::Image->new( { src => $related->{image} } );
1230 $result->{image} = $img if ref $img;
1231 }
1232 }
1233
1234 return $result;
1235 }
1236
1237
1238 99 ahitrov sub __parse_date {
1239 my $self = shift;
1240 my $str = shift;
1241
1242 if ($str=~/(\d{2})(\d{2})(\d{4})T(\d{2})(\d{2})(\d{2})/){
1243 return parsedate ("$3-$2-$1 $4:$5:$6");
1244 } elsif ($str=~/(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}:\d{2})/){
1245 return parsedate ("$1 $2");
1246 } else {
1247 return parsedate($str);
1248 }
1249 }
1250
1251
1252
1253 98 ahitrov # TODO IMAGES:
1254 # enclosure
1255 # media:content
1256 # media:thumbnail
1257 # image
1258 # img
1259
1260 # FOUDNED:
1261 # author
1262 # category
1263 # comments
1264 # content
1265 # content:encoded
1266 # content:format
1267 # dc:creator
1268 # dc:date
1269 # dc:rights
1270 # dc:subject
1271 # description
1272 # enclosure
1273 # feedburner:awareness
1274 # feedburner:origLink
1275 # full-text
1276 # fulltext
1277 # guid
1278 # guide
1279 # habrahabr:ballsCount
1280 # habrahabr:commentsCount
1281 # id
1282 # image
1283 # img
1284 # link
1285 # media:content
1286 # media:thumbnail
1287 # pdalink
1288 # pubDate
1289 # pubdate
1290 # pubid
1291 # published
1292 # rambler:full-text
1293 # rambler:fulltext
1294 # region
1295 # section
1296 # sections
1297 # source
1298 # sport
1299 # summary
1300 # text
1301 # title
1302 # updated
1303 # wfw:commentRSS
1304 # wfw:commentRss
1305 # wmj:fulltext
1306 # yandex:full-text
1307
1308 1;