Line # Revision Author
1 98 ahitrov package Contenido::Parser::RSS;
2
3 use strict;
4 use warnings;
5 use locale;
6
7 99 ahitrov use base 'Contenido::Parser';
8
9 use Contenido::Globals;
10 98 ahitrov use Utils::HTML;
11 99 ahitrov use Time::ParseDate;
12 98 ahitrov #use Date::Parse;
13 use Data::Dumper;
14 use Digest::MD5 qw(md5_hex);
15 #use Class::Date;
16 use Encode;
17 use utf8;
18
19 my @INVALID_TAGS = qw ( A ABBREV ACRONYM ADDRESS APP APPLET AREA AU B BANNER BASE BASEFONT BDO BGSOUND BIG BLINK BLOCKQUOTE
20 BODY BQ BR CAPTION CENTER CITE CODE COL COLGROUP CREDIT DD DEL DFN DIR DIV DL DT EM FN FIG FONT FORM FRAME FRAMESET
21 H1 H2 H3 H4 H5 H6 HP HR I IMG INPUT INS ISINDEX KBD LANG LH LI LISTING MAP MARQUEE MENU META NEXTID NOBR NOEMBED
22 NOFRAMES NOTE OL OPTION OVERLAY P PARAM PERSON PLAINTEXT PRE Q S SAMP SELECT SMALL SPAN STRIKE STRONG SUB SUP TAB
23 TABLE TBODY TD TEXTAREA TFOOT TH THEAD TR TT U UL VAR WBR XMP EMBED
24 );
25
26 sub new {
27 my ($proto) = @_;
28 my $class = ref($proto) || $proto;
29 my $self = {};
30 bless $self, $class;
31
32 return $self;
33 }
34
35 sub parse {
36 my ($self, %opts) = @_;
37
38 99 ahitrov my $content;
39 if ( $opts{content} ) {
40 $content = delete $opts{content};
41 delete $self->{content};
42 } elsif ( $self->{success} || $self->{content} ) {
43 $content = delete $self->{content};
44 } else {
45 $self->{success} = 0;
46 return $self;
47 }
48 my $base_url = delete $self->{base_url} || delete $opts{base_url};
49 98 ahitrov my $allow_global_fulltext = delete $opts{allow_fulltext} || 0;
50 my $content_global_type = delete $opts{content_type} || 1;
51 99 ahitrov my $debug = $DEBUG;
52 98 ahitrov my $gui = delete $opts{gui};
53 my $description_as_fulltext = delete $opts{description_as_fulltext};
54 warn "Parser Rools: [".$opts{parser_rss}."]\n" if $debug;
55
56 my $rss_rools = $self->__parse_rools (delete $opts{parser_rss});
57
58 warn "RSS Rools: ".Dumper ($rss_rools) if $debug;
59
60 my @items;
61 my $feed = $self->__parse_content(\$content);
62
63 if ( ref $feed eq 'ARRAY' ) {
64 foreach my $item ( @$feed ) {
65 my $fulltext_field;
66 my $content_type = $content_global_type;
67 my $allow_fulltext = $allow_global_fulltext;
68 $self->__check_rewrite ( item => $item, rools => $rss_rools );
69 99 ahitrov my $date = $self->__parse_date($item->{pubdate});
70 my $pubdate = Contenido::DateTime->new( epoch => $date );
71 $pubdate = $pubdate->ymd('-').' '.$pubdate->hms;
72 98 ahitrov next if ref $item->{title};
73 next if ref $item->{description};
74 $self->__check_ignore ( item => $item, rools => $rss_rools );
75 $self->__check_only ( item => $item, rools => $rss_rools );
76 $item->{title} = $self->__field_prepare ($item->{title});
77 $self->__check_filter ( gui => $gui, field => 'title', item => $item, rools => $rss_rools );
78 my $title = $item->{title};
79 my $link;
80 if ( ref $item->{link} eq 'HASH' ) {
81 if ( ( (exists $item->{link}{type} && $item->{link}{type} eq 'text/html') || !exists $item->{link}{type} ) && exists $item->{link}{href} ) {
82 $link = $item->{link}{href};
83 }
84 } elsif ( ref $item->{link} eq 'ARRAY' ) {
85 foreach my $lnk ( @{ $item->{link} } ) {
86 if ( ref $lnk ) {
87 if ( ( (exists $lnk->{type} && $lnk->{type} eq 'text/html') || !exists $lnk->{type} ) && exists $lnk->{href} ) {
88 $link = $lnk->{href};
89 }
90 } else {
91 $link = $lnk;
92 last;
93 }
94 }
95 } else {
96 $link = $item->{'link'} || (ref $item->{'url'} eq 'ARRAY' ? $item->{'url'}->[0] : $item->{'url'});
97 }
98 $link = $self->__field_prepare ($link);
99 $link = $base_url.($link =~ m|^/| ? '' : '/' ).$link if $base_url && ($link !~ /^http:/);
100 $item->{description} = $self->__field_prepare ($item->{description});
101 $self->__check_filter ( gui => $gui, field => 'description', item => $item, rools => $rss_rools );
102 my $description = $item->{description};
103 if ( exists $item->{'rambler:fulltext'} && $item->{'rambler:fulltext'} ) {
104 $allow_fulltext = 1;
105 }
106 my $fulltext;
107 if ( $description_as_fulltext ) {
108 $fulltext = $description;
109 $fulltext_field = 'description'
110 } else {
111 if ( $gui ) {
112 foreach my $field ( qw( rambler:fulltext rambler:full-text yandex:full-text mailru:full-text content:encoded full-text fulltext text ) ) {
113 if ( exists $item->{$field} && $item->{$field} ) {
114 $fulltext_field = $field;
115 $fulltext = $item->{$field};
116 last;
117 }
118 }
119 } else {
120 $fulltext =
121 $item->{'rambler:fulltext'} ||
122 $item->{'rambler:full-text'} ||
123 $item->{'yandex:full-text'} ||
124 $item->{'mailru:full-text'} ||
125 $item->{'content:encoded'} ||
126 $item->{'full-text'} ||
127 $item->{'fulltext'} ||
128 $item->{'text'};
129 }
130 if ( ref $fulltext eq 'HASH') {
131 my @values = values %$fulltext;
132 if ( scalar @values == 1 ) {
133 $fulltext = $values[0];
134 }
135 }
136 if ( ref $fulltext eq 'ARRAY' ) {
137 $fulltext = join "\n", @$fulltext;
138 }
139 $self->__check_filter ( gui => $gui, field => 'fulltext', item => $item, text => \$fulltext, rools => $rss_rools );
140 $fulltext = $self->__field_prepare ($fulltext);
141 }
142 if ( $fulltext && !$description ) {
143 $item->{description} = Utils::HTML::limit_words ( $fulltext, 150, 300 );
144 $self->__check_filter ( gui => $gui, field => 'description', item => $item, rools => $rss_rools );
145 $description = $item->{description};
146 }
147 $allow_fulltext = 0 unless $fulltext;
148 my $author;
149 if ( exists $item->{author} && $item->{author} ) {
150 if ( ref $item->{author} eq 'HASH' && exists $item->{author}{name} ) {
151 $author = $item->{author}{name};
152 } elsif ( !ref $item->{author} ) {
153 $author = $item->{author};
154 }
155 }
156 my $category = [];
157 if ( exists $item->{category} && ref $item->{category} eq 'ARRAY' ) {
158 $category = $item->{category};
159 } elsif ( exists $item->{category} ) {
160 $category = [$item->{category}];
161 }
162 my @images;
163 if ( exists $item->{image} || exists $item->{enclosure} ) {
164 my @src = ref $item->{image} eq 'ARRAY' ? @{ $item->{image} } : ( $item->{image} ) if exists $item->{image};
165 my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure};
166 @att = grep { ref $_ eq 'HASH' && $_->{type} =~ /image/ } @att;
167 @images = map {
168 99 ahitrov my $img = $_;
169 $img->{src} = $base_url.($img->{src} =~ m|^/| ? '' : '/').$img->{src} unless $img->{src} =~ /^http:/; $img;
170 98 ahitrov } map { {src => $_->{url}, $_->{width} ? (width => $_->{width}) : (), $_->{height} ? (height => $_->{height}) : (), $_->{title} ? (title => $_->{title}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att;
171 }
172 my @videos;
173 if ( exists $item->{video} || exists $item->{enclosure} ) {
174 my @src = ref $item->{video} eq 'ARRAY' ? @{ $item->{video} } : ( $item->{video} ) if exists $item->{video};
175 my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure};
176 @att = grep { ref $_ eq 'HASH' && $_->{type} =~ /video/ } @att;
177 @videos = map { {src => $_->{url}, $_->{type} ? (type => $_->{type}) : (), $_->{title} ? (title => $_->{title}) : (), $_->{width} ? (width => $_->{width}) : (), $_->{height} ? (height => $_->{height}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att;
178 }
179 my @audios;
180 if ( exists $item->{audio} || exists $item->{enclosure} ) {
181 my @src = ref $item->{audio} eq 'ARRAY' ? @{ $item->{audio} } : ( $item->{audio} ) if exists $item->{audio};
182 my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure};
183 @att = grep { ref $_ eq 'HASH' && $_->{type} =~ /audio/ } @att;
184 @audios = map { {src => $_->{url}, $_->{type} ? (type => $_->{type}) : (), $_->{title} ? (title => $_->{title}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att;
185 }
186 my ($video_url, $audio_url);
187 if ( $content_type == 2 || @videos || exists $item->{'videourl'} || exists $item->{'video_url'} ) {
188 $video_url = exists $item->{video} && ref $item->{video} eq 'HASH' && exists $item->{video}{url} ?
189 $item->{video}{url} :
190 $item->{'videourl'} ||
191 $item->{'video_url'} ||
192 ($item->{'guid'} =~ /^http:/ ? $item->{'guid'} : undef) ||
193 (exists $item->{'link'} && ref $item->{'link'} eq 'HASH' ? $item->{'link'}{'href'} || $item->{'link'}{'url'} : $item->{'link'} );
194 $content_type = 2;
195 }
196 if ( @audios || exists $item->{'audiourl'} || exists $item->{'audio_url'} ) {
197 $audio_url = exists $item->{audio} && ref $item->{audio} eq 'HASH' && exists $item->{audio}{url} ?
198 $item->{audio}{url} :
199 $item->{'audiourl'} || $item->{'audio_url'};
200 $content_type = 2;
201 }
202 my $related = [];
203 if ( exists $item->{'rambler:related'} && $item->{'rambler:related'} ) {
204 if ( ref $item->{'rambler:related'} eq 'ARRAY' ) {
205 foreach my $relitem ( @{ $item->{'rambler:related'} } ) {
206 my $rel = $self->__parse_related ( $relitem );
207 push @$related, $rel if ref $rel;
208 }
209 } elsif ( ref $item->{'rambler:related'} eq 'HASH' ) {
210 my $rel = $self->__parse_related ( $item->{'rambler:related'} );
211 push @$related, $rel if ref $rel;
212 }
213 }
214 @videos = grep { exists $_->{type} && lc($_->{type}) eq 'video/x-flv' && $_->{src} =~ /\.flv$/i } @videos;
215 99 ahitrov my @inlined_images;
216 for ( $description, $fulltext ) {
217 my $field = $_;
218 while ( $field =~ /<img ([^>]+)>/sgi ) {
219 my $image = $self->__parse_params( $1 );
220 push @inlined_images, $image if ref $image && exists $image->{src} && $image->{src};
221 }
222 }
223 if ( @inlined_images ) {
224 my %images = map { $_->{src} => $_ } @images, @inlined_images;
225 @images = values %images;
226 }
227 98 ahitrov push @items, {
228 'checksum' => md5_hex(encode_utf8($title.$description)),
229 'ignore' => $item->{ignore} || 0,
230 'title' => $title || '',
231 'title_gui' => $item->{title_gui} || $title || '',
232 'description' => $description || '',
233 'description_gui' => $item->{description_gui} || $description || '',
234 'desc_length' => length( $description || '' ),
235 'link' => $link || '',
236 'pubdate' => $pubdate || '',
237 'fulltext' => $fulltext || '',
238 'fulltext_gui' => $item->{fulltext_gui} || '',
239 'fulltext_field' => $fulltext_field || '',
240 'image' => @images ? $images[0] : undef,
241 'images' => @images ? \@images : undef,
242 'video' => @videos ? $videos[0] : undef,
243 'videos' => @videos ? \@videos : undef,
244 'categories' => $category,
245 'video_url' => $video_url || '',
246 'audio_url' => $audio_url || '',
247 'author' => $author || '',
248 'related' => $related,
249 'content_type' => $content_type,
250 'allow_fulltext' => $allow_fulltext,
251 };
252 }
253 } else {
254 99 ahitrov warn ($@ || 'Something wrong while parsing content');
255 98 ahitrov return $self->is_success(0);
256 }
257
258 99 ahitrov $self->{items} = \@items;
259 98 ahitrov return $self->is_success(1);
260 }
261
262
263 sub __check_rewrite {
264 my ($self, %opts) = @_;
265 my $item = $opts{item};
266 return unless ref $item;
267 return unless ref $opts{rools} eq 'ARRAY';
268 my @rools = grep { $_->{action} eq 'rewrite' } @{ $opts{rools} };
269 return unless @rools;
270 foreach my $rool ( @rools ) {
271 my $field = $rool->{target};
272 my $value = $rool->{condition};
273 if ( $value eq 'CURRENT_DATETIME' ) {
274 my $dt = DateTime->now( time_zone => "Europe/Moscow" );
275 $value = $dt->ymd('-').'T'.$dt->hms.' MSK';
276 }
277 $item->{$field} = $value if exists $item->{$field};
278 }
279 }
280
281
282 sub __check_filter {
283 my ($self, %opts) = @_;
284 my $field = $opts{field};
285 my $gui = $opts{gui};
286 my $item = $opts{item};
287 my $text = exists $opts{text} ? $opts{text} : undef;
288 return unless ref $item;
289 return unless exists $opts{text} || exists $item->{$field};
290 return unless ref $opts{rools} eq 'ARRAY';
291 my @rools = grep { $_->{action} eq 'filter' && $_->{target} eq $field } @{ $opts{rools} };
292 return unless @rools;
293 foreach my $rool ( @rools ) {
294 if ( $rool->{command} eq 'cut' ) {
295 my $condition = $rool->{condition};
296 if ( $rool->{subcommand} eq 'off' ) {
297 if ( $opts{gui} ) {
298 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
299 $field_gui =~ s/($condition)/<b style="color:red">$1<\/b>/sgi;
300 $item->{$field."_gui"} = $field_gui;
301 }
302 if ( exists $opts{text} ) {
303 $$text =~ s/$condition//sgi;
304 } else {
305 $item->{$field} =~ s/$condition//sgi;
306 }
307 } elsif ( $rool->{subcommand} eq 'from' ) {
308 if ( $gui ) {
309 my $cut_text = '';
310 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
311 my $pos = index $field_gui, $condition;
312 if ( $pos >= 0 ) {
313 $cut_text = substr $field_gui, $pos, -1;
314 $field_gui = substr $field_gui, 0, $pos;
315 }
316 # $field_gui =~ s/($condition)(.*)$/<b style="color:red">$1$2<\/b>/si;
317 $item->{$field."_gui"} = $field_gui.'<b style="color:red">'.$cut_text.'</b>';
318 }
319 if ( exists $opts{text} ) {
320 my $pos = index $$text, $condition;
321 if ( $pos >= 0 ) {
322 $$text = substr $$text, 0, $pos;
323 }
324 } else {
325 my $pos = index $item->{$field}, $condition;
326 if ( $pos >= 0 ) {
327 $item->{$field} = substr $item->{$field}, 0, $pos;
328 }
329 }
330 } elsif ( $rool->{subcommand} eq 'till' ) {
331 if ( $opts{gui} ) {
332 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
333 $field_gui =~ s/^(.*?)($condition)/<b style="color:red">$1$2<\/b>/si;
334 $item->{$field."_gui"} = $field_gui;
335 }
336 if ( exists $opts{text} ) {
337 $$text =~ s/^(.*?)($condition)//si;
338 } else {
339 $item->{$field} =~ s/^(.*?)($condition)//si;
340 }
341 } elsif ( $rool->{subcommand} eq 'untill' ) {
342 if ( $opts{gui} ) {
343 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
344 $field_gui =~ s/^(.*?)($condition)/<b style="color:red">$1<\/b>$2/si;
345 $item->{$field."_gui"} = $field_gui;
346 }
347 if ( exists $opts{text} ) {
348 $$text =~ s/^(.*?)($condition)/$2/si;
349 } else {
350 $item->{$field} =~ s/^(.*?)($condition)/$2/si;
351 }
352 } elsif ( $rool->{subcommand} eq 'regex' ) {
353 if ( $opts{gui} ) {
354 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
355 if ( substr($condition,0,1) eq '^' ) {
356 my $cond = reverse($condition);
357 chop($cond);
358 $cond = reverse($cond);
359 $field_gui =~ s/^($cond)/<b style="color:red">$1<\/b>/si;
360 } elsif ( substr($condition,-1,1) eq '$' ) {
361 my $cond = $condition;
362 chop($cond);
363 $field_gui =~ s/($cond)$/<b style="color:red">$1<\/b>/si;
364 } else {
365 $field_gui =~ s/($condition)/<b style="color:red">$1<\/b>/sgi;
366 }
367 $item->{$field."_gui"} = $field_gui;
368 }
369 if ( exists $opts{text} ) {
370 $$text =~ s/$condition//sgi;
371 } else {
372 $item->{$field} =~ s/$condition//sgi;
373 }
374 }
375 } elsif ( $rool->{command} eq 'regex' ) {
376 my $from = $rool->{condition}{from};
377 my $to = $rool->{condition}{to};
378 if ( exists $opts{text} ) {
379 eval ("\$\$text =~ s/$from/$to/sgi");
380 } else {
381 eval ("\$item->{\$field} =~ s/$from/$to/sgi");
382 }
383 }
384 }
385 }
386
387
388 sub __check_ignore {
389 my ($self, %opts) = @_;
390 my $item = $opts{item};
391 return unless ref $item;
392 return unless ref $opts{rools} eq 'ARRAY';
393 my @rools = grep { $_->{action} eq 'ignore' } @{ $opts{rools} };
394 return unless @rools;
395 foreach my $rool ( @rools ) {
396 my $target = $rool->{target};
397 if ( $rool->{command} =~ /^contain/ ) {
398 $item->{ignore} = 1 if index (lc($item->{$target}), lc($rool->{condition})) >= 0;
399 }
400 if ( $rool->{command} eq '=' ) {
401 $item->{ignore} = 1 if lc($item->{$target}) eq lc($rool->{condition});
402 }
403 if ( $rool->{command} eq 'regex' ) {
404 my $regex = $rool->{condition};
405 $item->{ignore} = 1 if $item->{$target} =~ /$regex/sgi;
406 }
407 }
408 }
409
410
411 sub __check_only {
412 my ($self, %opts) = @_;
413 my $item = $opts{item};
414 return unless ref $item;
415 return unless ref $opts{rools} eq 'ARRAY';
416 my @rools = grep { $_->{action} eq 'only' } @{ $opts{rools} };
417 return unless @rools;
418 foreach my $rool ( @rools ) {
419 my $target = $rool->{target};
420 if ( $rool->{command} =~ /^contain/ ) {
421 $item->{ignore} = 1 unless index (lc($item->{$target}), lc($rool->{condition})) >= 0;
422 }
423 if ( $rool->{command} eq '=' ) {
424 $item->{ignore} = 1 unless lc($item->{$target}) eq lc($rool->{condition});
425 }
426 if ( $rool->{command} eq 'regex' ) {
427 my $regex = $rool->{condition};
428 $item->{ignore} = 1 unless $item->{$target} =~ /$regex/sgi;
429 }
430 }
431 }
432
433
434 sub __feed_type {
435 my ($self, $contref) = @_;
436
437 my $type;
438 if ( $$contref =~ /<rss([^>]*)>/ ) {
439 $type = 'RSS';
440 } elsif ( $$contref =~ /<feed\s+([^>]*)>/ ) {
441 my $feed_params = $1;
442 my $params = $self->__parse_params ($feed_params);
443 if ( exists $params->{xmlns} && $params->{xmlns} =~ /purl.org\/atom/ ) {
444 $type = 'ATOM';
445 } elsif ( exists $params->{xmlns} && $params->{xmlns} =~ /www.w3.org\/2005\/Atom/ ) {
446 $type = 'ATOM';
447 }
448 } elsif ( $$contref =~ /<rdf([^>]*)>/ ) {
449 $type = 'RDF';
450 }
451 return $type;
452 }
453
454
455 sub __parse_content {
456 my ($self, $contref) = @_;
457
458 my $feed_type = $self->__feed_type($contref);
459 # warn "FEED Type = [$feed_type]\n";
460 return undef unless $feed_type;
461
462 $$contref =~ s/>\s+</></sgi;
463 $$contref =~ s/<items(.*?)>(.*?)<\/items([^>]*?)>//sgi;
464 $$contref =~ s/<\/?br(.*?)>/\n/sgi;
465 $$contref =~ s/<\/?nobr(.*?)>//sgi;
466 #$$contref =~ s/<p>/\n\n/sgi;
467 #$$contref =~ s/<p\s(.*?)>/\n\n/sgi;
468 $$contref =~ s/<\/?strong\s(.*?)>//sgi;
469 $$contref =~ s/<\/?s>//sgi;
470 $$contref =~ s/<\/?i>//sgi;
471 $$contref =~ s/<\/?b>//sgi;
472 $$contref =~ s/<\/?strong>//sgi;
473 #$$contref =~ s/<\/p>//sgi;
474 #$$contref =~ s/<\/p\s(.*?)>//sgi;
475 my @items;
476
477 if ( $feed_type eq 'RSS' ) {
478 while ( $$contref =~ /<item(.*?)>(.*?)<\/item([^>]*?)>/sgi ) {
479 my $item_params = $1;
480 my $item_body = $2;
481 # warn "BODY: [$item_body]\n\n";
482 my $params = $self->__parse_params ($item_params);
483 my $item = $self->__parse_item_RSS ($item_body) || {};
484 if ( ref $params eq 'HASH' ) {
485 foreach my $key ( %$params ) {
486 if ( exists $item->{$key} && ref $item->{$key} eq 'ARRAY' ) {
487 push @{ $item->{$key} }, $params->{$key};
488 } elsif ( exists $item->{$key} ) {
489 my @arr = ( $item->{$key}, $params->{$key} );
490 $item->{$key} = \@arr;
491 } else {
492 $item->{$key} = $params->{$key};
493 }
494 }
495 }
496 if ( ref $item eq 'HASH' && scalar keys %$item ) {
497 if ( exists $item->{'feedburner:origlink'} ) {
498 $item->{link} = ref $item->{'feedburner:origlink'} eq 'ARRAY' ? $item->{'feedburner:origlink'}->[0] : $item->{'feedburner:origlink'};
499 } elsif ( !exists $item->{link} ) {
500 foreach my $key ( qw( guid ) ) {
501 if ( exists $item->{$key} ) {
502 $item->{link} = $item->{$key};
503 last;
504 }
505 }
506 }
507 push @items, $item;
508 }
509 # warn Dumper($item);
510 }
511 }
512 if ( $feed_type eq 'RDF' ) {
513 while ( $$contref =~ /<item(.*?)>(.*?)<\/item([^>]*?)>/sgi ) {
514 my $item_params = $1;
515 my $item_body = $2;
516 # warn "BODY: [$item_body]\n\n";
517 my $params = $self->__parse_params ($item_params);
518 my $item = $self->__parse_item_RSS ($item_body) || {};
519 if ( ref $params eq 'HASH' ) {
520 foreach my $key ( %$params ) {
521 if ( exists $item->{$key} && ref $item->{$key} eq 'ARRAY' ) {
522 push @{ $item->{$key} }, $params->{$key};
523 } elsif ( exists $item->{$key} ) {
524 my @arr = ( $item->{$key}, $params->{$key} );
525 $item->{$key} = \@arr;
526 } else {
527 $item->{$key} = $params->{$key};
528 }
529 }
530 }
531 # warn Dumper($item);
532 if ( ref $item eq 'HASH' && scalar keys %$item ) {
533 if ( !exists $item->{pubdate} ) {
534 foreach my $key ( 'prism:publicationdate', 'dc:date' ) {
535 if ( exists $item->{$key} ) {
536 $item->{pubdate} = $item->{$key};
537 last;
538 }
539 }
540 }
541 push @items, $item;
542 }
543 }
544 }
545 if ( $feed_type eq 'ATOM' ) {
546 while ( $$contref =~ /<entry(.*?)>(.*?)<\/entry([^>]*?)>/sgi ) {
547 my $item_params = $1;
548 my $item_body = $2;
549 my $item = $self->__parse_item_ATOM ($item_body) || {};
550 # warn Dumper($item);
551 if ( ref $item eq 'HASH' && scalar keys %$item ) {
552 if ( !exists $item->{pubdate} ) {
553 foreach my $key ( 'published', 'updated' ) {
554 if ( exists $item->{$key} ) {
555 $item->{pubdate} = $item->{$key};
556 last;
557 }
558 }
559 }
560 push @items, $item;
561 }
562 }
563 }
564 return ( scalar @items ? \@items : undef );
565 }
566
567
568 sub __parse_params {
569 my ($self, $params) = @_;
570 return undef unless $params;
571
572 my %params;
573 while ( $params =~ /([\w\:]+)(\s*?)=(\s*?)["'](.*?)["']/sgi ) {
574 my $name = $1;
575 my $value = $4;
576 if ( $name && $value ) {
577 $params{$name} = $value;
578 }
579 }
580 return ( scalar(keys %params) ? \%params : undef );
581 }
582
583
584 sub __parse_item_RSS {
585 my ($self, $item_body, $debug) = @_;
586 return undef unless $item_body;
587
588 my %item;
589 # my $embedded = $self->__item_cut_rss_embedded(\$item_body);
590 # if ( ref $embedded ) {
591 # %item = %$embedded;
592 # }
593 # my $content = $self->__item_cut_rss_description(\$item_body);
594 # $item{description} = $content if $content;
595 # my $one_string_elements = $self->__item_cut_single_elements (\$item_body);
596 # if ( ref $one_string_elements eq 'ARRAY' && @$one_string_elements ) {
597 # foreach my $elem ( @$one_string_elements ) {
598 # my ($elem_name) = keys %$elem if ref $elem eq 'HASH';
599 # if ( exists $item{$elem_name} && ref $item{$elem_name} eq 'ARRAY' ) {
600 # push @{ $item{$elem_name} }, $elem->{$elem_name};
601 # } elsif ( exists $item{$elem_name} ) {
602 # $item{$elem_name} = [$item{$elem_name}, $elem->{$elem_name}];
603 # } else {
604 # $item{$elem_name} = $elem->{$elem_name};
605 # }
606 # }
607 # }
608 my $parsed = $self->__make_tree (\$item_body, $debug);
609 # warn Dumper($parsed);
610 if ( ref $parsed && exists $parsed->{1} && exists $parsed->{1}{children} && ref $parsed->{1}{children} eq 'ARRAY' ) {
611 foreach my $tag ( @{ $parsed->{1}{children} } ) {
612 if ( ref $tag->{children} eq 'ARRAY' && scalar @{ $tag->{children} } ) {
613 my %params;
614 foreach my $it ( @{ $tag->{children} } ) {
615 next unless $it->{text};
616 if ( exists $params{$it->{type}} && ref $params{$it->{type}} eq 'ARRAY' ) {
617 push @{ $params{$it->{type}} }, $it->{text};
618 } elsif ( exists $params{$it->{type}} ) {
619 my @arr = ( $params{$it->{type}}, $it->{text} );
620 $params{$it->{type}} = \@arr;
621 } else {
622 $params{$it->{type}} = $it->{text};
623 }
624 }
625 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) {
626 push @{ $item{$tag->{type}} }, \%params;
627 } elsif ( exists $item{$tag->{type}} ) {
628 my @arr = ( $item{$tag->{type}}, \%params );
629 $item{$tag->{type}} = \@arr;
630 } else {
631 $item{$tag->{type}} = \%params;
632 }
633 } else {
634 my $body = $tag->{text} || $tag->{params};
635 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) {
636 push @{ $item{$tag->{type}} }, $body;
637 } elsif ( exists $item{$tag->{type}} ) {
638 my @arr = ( $item{$tag->{type}}, $body );
639 $item{$tag->{type}} = \@arr;
640 } else {
641 $item{$tag->{type}} = $body;
642 }
643 }
644 }
645 }
646 # warn Dumper(\%item);
647 return \%item;
648 }
649
650
651 sub __parse_item_ATOM {
652 my ($self, $item_body, $debug) = @_;
653 return undef unless $item_body;
654
655 my %item;
656 my $embedded = $self->__item_cut_rss_embedded(\$item_body);
657 if ( ref $embedded ) {
658 %item = %$embedded;
659 }
660 if ( exists $item{summary} ) {
661 $item{description} = delete $item{summary};
662 } else {
663 my $summary = $self->__item_cut_atom_summary(\$item_body);
664 $item{description} = $summary if $summary;
665 }
666 my $content = $self->__item_cut_atom_content(\$item_body);
667 if ( $content && $item{description} ) {
668 $item{fulltext} = $content;
669 } elsif ( $content ) {
670 $item{description} = $content;
671 }
672 my $one_string_elements = $self->__item_cut_single_elements (\$item_body);
673 # warn Dumper ($one_string_elements);
674 if ( ref $one_string_elements eq 'ARRAY' && @$one_string_elements ) {
675 foreach my $elem ( @$one_string_elements ) {
676 my ($elem_name) = keys %$elem if ref $elem eq 'HASH';
677 if ( exists $item{$elem_name} && ref $item{$elem_name} eq 'ARRAY' ) {
678 push @{$item{$elem_name}}, $elem->{$elem_name};
679 } elsif ( exists $item{$elem_name} ) {
680 my @arr = ($item{$elem_name}, $elem->{$elem_name});
681 $item{$elem_name} = \@arr;
682 } else {
683 $item{$elem_name} = $elem->{$elem_name};
684 }
685 if ( $elem->{$elem_name}{type} =~ /^image/ ) {
686 my $enclosure = { url => $elem->{$elem_name}{href} || $elem->{$elem_name}{url}, type => $elem->{$elem_name}{type} };
687 if ( exists $item{enclosure} && ref $item{enclosure} eq 'ARRAY' ) {
688 push @{ $item{enclosure} }, $enclosure;
689 } elsif ( exists $item{enclosure} ) {
690 my @arr = ($item{enclosure}, $enclosure);
691 $item{enclosure} = \@arr;
692 } else {
693 $item{enclosure} = $enclosure;
694 }
695 }
696 if ( $elem->{$elem_name}{type} =~ /^video/ ) {
697 my $enclosure = { url => $elem->{$elem_name}{href} || $elem->{$elem_name}{url}, type => $elem->{$elem_name}{type} };
698 if ( exists $item{enclosure} && ref $item{enclosure} eq 'ARRAY' ) {
699 push @{ $item{enclosure} }, $enclosure;
700 } elsif ( exists $item{enclosure} ) {
701 my @arr = ($item{enclosure}, $enclosure);
702 $item{enclosure} = \@arr;
703 } else {
704 $item{enclosure} = $enclosure;
705 }
706 }
707 }
708 }
709 my $parsed = $self->__make_tree (\$item_body, $debug);
710 # warn Dumper($parsed);
711 if ( ref $parsed && exists $parsed->{1} && exists $parsed->{1}{children} && ref $parsed->{1}{children} eq 'ARRAY' ) {
712 foreach my $tag ( @{ $parsed->{1}{children} } ) {
713 if ( ref $tag->{children} eq 'ARRAY' && scalar @{ $tag->{children} } ) {
714 my %params;
715 foreach my $it ( @{ $tag->{children} } ) {
716 next unless $it->{text};
717 if ( exists $params{$it->{type}} && ref $params{$it->{type}} eq 'ARRAY' ) {
718 push @{ $params{$it->{type}} }, $it->{text};
719 } elsif ( exists $params{$it->{type}} ) {
720 my @arr = ( $params{$it->{type}}, $it->{text} );
721 $params{$it->{type}} = \@arr;
722 } else {
723 $params{$it->{type}} = $it->{text};
724 }
725 }
726 if ( exists $tag->{params} && ref $tag->{params} eq 'HASH' ) {
727 while ( my ($param, $value) = each %{ $tag->{params} } ) {
728 if ( exists $params{$param} && ref $params{$param} eq 'ARRAY' ) {
729 push @{ $params{$param} }, $value;
730 } elsif ( exists $params{$param} ) {
731 my @arr = ( $params{$param}, $value );
732 $params{$param} = \@arr;
733 } else {
734 $params{$param} = $value;
735 }
736 }
737 }
738 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) {
739 push @{ $item{$tag->{type}} }, \%params;
740 } elsif ( exists $item{$tag->{type}} ) {
741 my @arr = ( $item{$tag->{type}}, \%params );
742 $item{$tag->{type}} = \@arr;
743 } else {
744 $item{$tag->{type}} = \%params;
745 }
746 } else {
747 my $body = $tag->{text} || $tag->{params};
748 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) {
749 push @{ $item{$tag->{type}} }, $body;
750 } elsif ( exists $item{$tag->{type}} ) {
751 my @arr = ( $item{$tag->{type}}, $body );
752 $item{$tag->{type}} = \@arr;
753 } else {
754 $item{$tag->{type}} = $body;
755 }
756 }
757 }
758 my $pubDate = exists $item{issued} ? $item{issued} : exists $item{modified} ? $item{modified} : undef;
759 $item{pubdate} = $pubDate if $pubDate;
760 }
761
762 # warn Dumper(\%item);
763 return \%item;
764 }
765
766
767 sub __make_tree {
768 my ($self, $content, $debug) = @_;
769
770 my @elems = split (//,$$content);
771 # warn "CONTENT: [$$content]\n\n";
772 my $id = 1;
773 my $level = 0;
774 my @stack;
775 my %tree = (
776 root => {
777 id => $id++,
778 text => '',
779 type => 'root',
780 children=> [],
781 parent => undef,
782 level => $level,
783 },
784 );
785 my %elem_hash = ( 1 => $tree{root} );
786 my $current = $tree{root};
787
788 while ( @elems ) {
789 if ( $elems[0] eq '<' && $elems[1] =~ /[\!a-zA-Z]/ ) {
790 my $tag = $self->__try_tag (\@elems);
791 if ( ref $tag && $tag->{type} eq 'text' ) {
792 $current->{text} .= $tag->{content};
793 splice @elems, 0, $tag->{count};
794 # warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n";
795 } elsif ( ref $tag && exists $tag->{closed} && $tag->{closed} ) {
796 $tag->{id} = $id++;
797 $tag->{parent} = $current;
798 $tag->{level} = $level+1;
799 $elem_hash{$tag->{id}} = $tag;
800 push @{$current->{children}}, $tag;
801 splice @elems, 0, $tag->{count};
802 # warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n";
803 } elsif ( ref $tag ) {
804 $tag->{id} = $id++;
805 $tag->{children} = [];
806 $tag->{parent} = $current;
807 $tag->{level} = ++$level;
808 $elem_hash{$tag->{id}} = $tag;
809 push @{$current->{children}}, $tag;
810 push @stack, $current;
811 $current = $tag;
812 splice @elems, 0, $tag->{count};
813 # warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n";
814 } else {
815 # warn "!!!! Error: RSS analyse. Job on item broken... !!!!\n" if $debug;
816 return undef;
817 }
818 } elsif ( $elems[0] eq '<' && $elems[1] =~ /\// ) {
819 my $tag = $self->__try_end (\@elems);
820 if ( ref $tag && $tag->{type} eq 'text' ) {
821 $current->{text} .= $tag->{content};
822 $current->{count} += $tag->{count};
823 splice @elems, 0, $tag->{count};
824 } elsif ( ref $tag ) {
825 if ( $current->{type} ne $tag->{type} ) {
826 # warn "!!!!Wrong tag type for closing. It's [$tag->{type}]. It must be [$current->{type}]!!!!\n" if $debug;
827 return undef;
828 } else {
829 $current = pop @stack;
830 $level = $current->{level};
831 # warn "Text place: [".substr($current->{text}, 0, 20)."]\n" if exists $current->{text};
832 # warn "Close type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n";
833 }
834 splice @elems, 0, $tag->{count};
835 } else {
836 # warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug;
837 return undef;
838 }
839 } else {
840 $current->{text} .= shift @elems;
841 $current->{count}++;
842 }
843 }
844 return (\%elem_hash);
845 }
846
847
848
849 sub __try_tag {
850 my ($self, $content) = @_;
851
852 my $i = 1;
853 my %tag;
854 my $tag = $content->[0];
855 if ( $content->[$i] eq '!' ) {
856 # warn "What? Think it's CDATA\n";
857 my $try_cdata = join '', @$content[1..8];
858 if ( $try_cdata eq '![CDATA[' ) {
859 $tag = '';
860 $i = 9;
861 while ( !($content->[$i-1] eq '>' && $content->[$i-2] eq ']' && $content->[$i-3] eq ']') && $i < scalar @$content ) {
862 $tag .= $content->[$i];
863 $i++;
864 }
865 chop $tag; chop $tag; chop $tag;
866 }
867 # warn "CDATA Found: [$tag]";
868 return {
869 type => 'text',
870 content => $tag,
871 count => $i,
872 };
873 }
874 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) {
875 $tag .= $content->[$i];
876 $i++;
877 }
878 if ( $content->[$i] eq '<' || $i >= scalar @$content ) {
879 return {
880 type => 'text',
881 content => $tag,
882 count => $i,
883 };
884 } else {
885 if ( $tag =~ /^<([\w:-]+)\s*(.*)/si ) {
886 my $elem_name = $1;
887 my $elem_body = $2;
888 unless ( $self->__is_valid_tag ($elem_name) ) {
889 return {
890 type => 'text',
891 content => $tag,
892 count => $i,
893 };
894 } else {
895 my $params = $self->__parse_params ($elem_body) if $elem_body;
896 if ( $content->[$i] eq '>' && $content->[$i-1] eq '/' ) {
897 $tag{closed} = 1;
898 }
899 $tag{type} = lc($elem_name);
900 $tag{count} = $i+1;
901 $tag{params} = $params if ref $params;
902 return \%tag;
903 }
904 } else {
905 return {
906 type => 'text',
907 content => $tag,
908 count => $i,
909 };
910 }
911 }
912 }
913
914
915 sub __try_end {
916 my ($self, $content) = @_;
917
918 my $i = 2;
919 my %tag;
920 my $tag = $content->[0].$content->[1];
921 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) {
922 $tag .= $content->[$i];
923 $i++;
924 }
925 if ( $content->[$i] eq '<' || $i >= scalar @$content ) {
926 return {
927 type => 'text',
928 content => $tag,
929 count => $i,
930 };
931 } else {
932 if ( $tag =~ /^<\/([\w:-]+)/i ) {
933 my $elem_name = $1;
934 unless ( $self->__is_valid_tag ($elem_name) ) {
935 return {
936 type => 'text',
937 content => $tag,
938 count => $i,
939 };
940 } else {
941 $tag{type} = lc($elem_name);
942 $tag{count} = $i+1;
943 return \%tag;
944 }
945 } else {
946 return {
947 type => 'text',
948 content => $tag,
949 count => $i,
950 };
951 }
952 }
953 }
954
955
956 sub __is_valid_tag {
957 my ($self, $tag) = @_;
958 foreach my $invtag ( @INVALID_TAGS ) {
959 return 0 if lc($invtag) eq lc($tag);
960 }
961 return 1;
962 }
963
964
965 sub __item_cut_atom_content {
966 my ($self, $item_body) = @_;
967
968 my %elem;
969 if ( $$item_body =~ /<content([^>]*?)>(.*?)<\/content([^>]*)>/si ) {
970 my $content_params = $1;
971 my $content_body = $2;
972 my $params = $self->__parse_params ($content_params) if $content_params;
973 $$item_body =~ s/<content([^>]*?)>(.*?)<\/content([^>]*)>//si;
974 return $content_body;
975 }
976 }
977
978
979 sub __item_cut_atom_summary {
980 my ($self, $item_body) = @_;
981
982 my %elem;
983 if ( $$item_body =~ /<summary([^>]*)>(.*?)<\/summary([^>]*)>/si ) {
984 my $content_params = $1;
985 my $content_body = $2;
986 my $params = $self->__parse_params ($content_params) if $content_params;
987 $$item_body =~ s/<summary([^>]*)>(.*?)<\/summary([^>]*)>//si;
988 return $content_body;
989 }
990 }
991
992
993 sub __item_cut_rss_description {
994 my ($self, $item_body) = @_;
995
996 my %elem;
997 if ( $$item_body =~ /<description([^>]*?)>(.*?)<\/description([^>]*)>/si ) {
998 my $content_params = $1;
999 my $content_body = $2;
1000 my $params = $self->__parse_params ($content_params) if $content_params;
1001 $$item_body =~ s/<description([^>]*?)>(.*?)<\/description([^>]*)>//si;
1002 return $content_body;
1003 }
1004 }
1005
1006
1007 sub __item_cut_rss_embedded {
1008 my ($self, $item_body) = @_;
1009
1010 my %elem;
1011 while ( $$item_body =~ /<([^>]*?)>\s*<!\[CDATA\[(.*?)\]\]>\s*<\/([^>]*)>/sgi ) {
1012 my $tag = $3;
1013 my $content_body = $2;
1014 my $content_params = $1;
1015 if ( $content_params =~ /([\w:-]+)\s+(.*)/ ) {
1016 $tag = 1;
1017 $content_params = $2;
1018 }
1019 my $params = $self->__parse_params ($content_params) if $content_params;
1020 $elem{$tag} = $content_body;
1021 $$item_body =~ s/<$tag([^>]*?)>(.*?)<\/$tag([^>]*)>//si;
1022 }
1023 return scalar keys %elem ? \%elem : undef;
1024 }
1025
1026
1027
1028 sub __item_cut_single_elements {
1029 my ($self, $item_body) = @_;
1030
1031 my @elems;
1032 while ( $$item_body =~ /<([\w\:\-]+)\s*([^>]*?)\/>/sgi ) {
1033 my $elem_name = $1;
1034 my $elem_body = $2;
1035 my $params = $self->__parse_params ($elem_body) if $elem_body;
1036 if ( $elem_name && ref $params ) {
1037 push @elems, { $elem_name => $params }
1038 }
1039 }
1040 $$item_body =~ s/<(\w+)\s*([^>]*?)\/>//sgi;
1041 return ( @elems ? \@elems : undef );
1042 }
1043
1044
1045 sub __field_prepare {
1046 my ($self, $text) = @_;
1047
1048 # $text =~ s/^[\n\r\x20\t]+//;
1049 99 ahitrov $text =~ s/[\n\r\x20\t]+$//;
1050 98 ahitrov $self->__cdata (\$text);
1051 $self->__extchar (\$text);
1052 99 ahitrov # $text = HTML::Entities::decode_entities($text);
1053 98 ahitrov
1054 # Remove linebreaks inside incorrectly breaked paragraphs
1055 if (length($text) > 100) {
1056 my $pcount = 0;
1057 while ($text =~ /<p>(.+?)(?=<\/?p>|$)/sgi) {
1058 my $p = $1;
1059 if (length $p > 50) {
1060 my ($dcount, $ndcount) = ();
1061 # Count sentences normally ended vs breaked
1062 $dcount++ while $p =~ /(\.|\?|\!)['"]?\s*[\r\n]+/g;
1063 $ndcount++ while $p =~ /([^\.\?\!\s])\s*[\r\n]+/g;
1064 # Found broken paragraph
1065 last if $ndcount > $dcount and ++$pcount > 1;
1066 }
1067 }
1068 if ($pcount > 0) {
1069 $text =~ s/[\n\r]+/ /sg;
1070 }
1071 }
1072 $text =~ s/<br[^>]*>/\n/sgi;
1073 $text =~ s/<p\s*>/\n\n/sgi;
1074 $text =~ s/<\/p\s*>//sgi;
1075 99 ahitrov # $text = Contenido::Parser::Util::strip_html($text);
1076 # $text = Contenido::Parser::Util::text_cleanup($text);
1077 98 ahitrov return $text;
1078 }
1079
1080
1081 sub __extchar {
1082 my ($self, $textref) = @_;
1083
1084 for ( $$textref ) {
1085 s/&#38;/\&/sg;
1086 s/\&amp;/\&/sgi;
1087 99 ahitrov s/\&amp;/\&/sgi;
1088 s/\&quot;/"/sgi;
1089 s/\&#171;/«/sg;
1090 s/\&#187;/»/sg;
1091 s/\&#163;/£/sg;
1092 s/\&#150;/&ndash;/sg;
1093 s/\&#151;/&mdash;/sg;
1094 s/\&#132;/"/sg;
1095 s/\&#147;/"/sg;
1096 s/\&#148;/"/sg;
1097 s/\&#180;/'/sg;
1098 s/\&#133;/\.\.\./sg;
1099 s/\&#13;/\n/sg;
1100 s/\&#34;/"/sg;
1101 98 ahitrov }
1102 # $$textref =~ s/&#(\d+);/{'&#'.__normalise($1).';'}/eg;
1103 # $$textref =~ s/&gt;/>/sgi;
1104 # $$textref =~ s/&lt;/</sgi;
1105 # $$textref =~ s/&quot;/"/sgi;
1106 # $$textref =~ s/&laquo;/«/sgi;
1107 # $$textref =~ s/&raquo;/»/sgi;
1108 # $$textref =~ s/&copy;/©/sgi;
1109 # $$textref =~ s/&ndash;/–/sgi;
1110 # $$textref =~ s/&mdash;/—/sgi;
1111 # $$textref =~ s/&deg;/º/sgi;
1112 # $$textref =~ s/&nbsp;/\x20/sgi;
1113 }
1114
1115 sub __normalise {
1116 my $chr = shift;
1117 return sprintf("%04d",$chr);
1118 }
1119
1120 sub __cdata {
1121 my ($self, $textref) = @_;
1122 if ( $$textref =~ /^<\!\[CDATA\[/ ) {
1123 $$textref =~ s/<\!\[CDATA\[//sgi;
1124 $$textref =~ s/\]\]>//sgi;
1125 }
1126 }
1127
1128
1129 sub __parse_rools {
1130 my ($self, $rools) = @_;
1131 return unless $rools;
1132 $rools =~ s/\r//sgi;
1133 my @rools = split /\n/, $rools;
1134 return unless @rools;
1135
1136 my @parsed;
1137 foreach my $rool ( @rools ) {
1138 my %pr;
1139 next if $rool =~ /^#/;
1140 $rool =~ s/[\x20\t]+$//;
1141 $rool =~ s/^[\x20\t]+//;
1142 if ( $rool =~ /^([\w']+)\s+(.*)$/ || $rool =~ /^(\w+)(.*)$/ ) {
1143 $pr{action} = lc($1);
1144 my $params = $2;
1145 if ( $pr{action} eq 'use' && $params =~ /^(current)\s+(date)$/ ) {
1146 $pr{action} = 'rewrite';
1147 $pr{target} = 'pubdate';
1148 $pr{command} = 'set';
1149 $pr{condition} = 'CURRENT_DATETIME';
1150 push @parsed, \%pr;
1151 } elsif ( $params =~ /^(\w+)\s+(.*)$/ ) {
1152 $pr{target} = lc($1);
1153 $params = $2;
1154 if ( $params =~ /^([\w=]+)\s+(.*)$/ ) {
1155 $pr{command} = lc($1);
1156 $params = $2;
1157 if ( $pr{action} eq 'filter' && $pr{command} eq 'cut' && $params =~ /^(\w+)\s+(.*)$/ ) {
1158 $pr{subcommand} = lc($1); $params = $2;
1159 next unless $pr{subcommand} =~ /^(untill|till|from|off|regex)$/;
1160 $params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex';
1161 $pr{condition} = $params;
1162 } elsif ( $pr{action} eq 'filter' && $pr{command} eq 'regex' && substr($params,0,1) eq substr($params,-1,1) && substr($params,0,1) =~ /([\/\#\|])/ ) {
1163 my $delim = $1;
1164 $params = substr($params,1,length($params)-2);
1165 my @params = split(//,$params);
1166 my ($from, $to) = ('','');
1167 my $prev = '';
1168 while ( @params ) {
1169 my $ch = shift @params;
1170 if ( $ch eq $delim && $prev ne '\\' ) {
1171 last;
1172 } else {
1173 $prev = $ch;
1174 $from .= $ch;
1175 }
1176 }
1177 $to = join ('', @params);
1178 $pr{condition} = { from => $from, to => $to };
1179 } elsif ( ($pr{action} eq 'ignore' || $pr{action} eq 'only') && $pr{command} =~ /^(regex|=|contain|contains)$/ ) {
1180 $params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex';
1181 $pr{condition} = $params;
1182 } else {
1183 next;
1184 }
1185 push @parsed, \%pr;
1186 }
1187 }
1188 }
1189 }
1190 return ( scalar @parsed ? \@parsed : undef );
1191 }
1192
1193
1194 sub __parse_related {
1195 my ($self, $related) = @_;
1196 return unless ref $related eq 'HASH';
1197 return unless exists $related->{url} && $related->{url} =~ /^http:\/\//i;
1198 return unless exists $related->{rel} && $related->{rel} =~ /(news|discussion|teaser)/;
1199 my $result = { url => $related->{url}, rel => $related->{rel} };
1200 $result->{type} = $related->{type} if exists $related->{type};
1201 $result->{title} = $self->__field_prepare($related->{title}) if exists $related->{title} && $related->{title};
1202
1203 $result->{author} = $self->__field_prepare($related->{author}) if exists $related->{author} && $related->{author};
1204 $result->{description} = $self->__field_prepare($related->{description}) if exists $related->{description} && $related->{description};
1205
1206 if ( exists $related->{pubdate} && $related->{pubdate} ) {
1207 my $pubdate = Class::Date::localdate(Date::Parse::str2time($related->{pubdate}));
1208 $result->{pubdate} = $pubdate if $pubdate;
1209 }
1210 if ( $related->{rel} =~ /(news|teaser)/ ) {
1211 return undef unless $result->{title} && $result->{pubdate};
1212 } else {
1213 $result->{title} ||= 'Обсудить';
1214 }
1215
1216 if ( exists $related->{image} && $related->{image} ) {
1217 if ( ref $related->{image} eq 'HASH' && (exists $related->{image}{url} || exists $related->{image}{href}) ) {
1218 my $img = rchannel::Image->new( { src => ($related->{image}{url} || $related->{image}{href}) } );
1219 $result->{image} = $img if ref $img;
1220 } elsif ( !ref $related->{image} ) {
1221 my $img = rchannel::Image->new( { src => $related->{image} } );
1222 $result->{image} = $img if ref $img;
1223 }
1224 }
1225
1226 return $result;
1227 }
1228
1229
1230 99 ahitrov sub __parse_date {
1231 my $self = shift;
1232 my $str = shift;
1233
1234 if ($str=~/(\d{2})(\d{2})(\d{4})T(\d{2})(\d{2})(\d{2})/){
1235 return parsedate ("$3-$2-$1 $4:$5:$6");
1236 } elsif ($str=~/(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}:\d{2})/){
1237 return parsedate ("$1 $2");
1238 } else {
1239 return parsedate($str);
1240 }
1241 }
1242
1243
1244
1245 98 ahitrov # TODO IMAGES:
1246 # enclosure
1247 # media:content
1248 # media:thumbnail
1249 # image
1250 # img
1251
1252 # FOUDNED:
1253 # author
1254 # category
1255 # comments
1256 # content
1257 # content:encoded
1258 # content:format
1259 # dc:creator
1260 # dc:date
1261 # dc:rights
1262 # dc:subject
1263 # description
1264 # enclosure
1265 # feedburner:awareness
1266 # feedburner:origLink
1267 # full-text
1268 # fulltext
1269 # guid
1270 # guide
1271 # habrahabr:ballsCount
1272 # habrahabr:commentsCount
1273 # id
1274 # image
1275 # img
1276 # link
1277 # media:content
1278 # media:thumbnail
1279 # pdalink
1280 # pubDate
1281 # pubdate
1282 # pubid
1283 # published
1284 # rambler:full-text
1285 # rambler:fulltext
1286 # region
1287 # section
1288 # sections
1289 # source
1290 # sport
1291 # summary
1292 # text
1293 # title
1294 # updated
1295 # wfw:commentRSS
1296 # wfw:commentRss
1297 # wmj:fulltext
1298 # yandex:full-text
1299
1300 1;