Revision 98

Date:
2011/04/27 15:28:25
Author:
ahitrov
Revision Log:
Парсер
Files:

Legend:

 
Added
 
Removed
 
Modified
  • utf8/core/lib/Contenido/Parser/RSS.pm

     
    1 package Contenido::Parser::RSS;
    2
    3 use strict;
    4 use warnings;
    5 use locale;
    6
    7 use Contenido::Parser::Util;
    8 use Utils::HTML;
    9 #use Time::ParseDate;
    10 #use Date::Parse;
    11 use Data::Dumper;
    12 use Digest::MD5 qw(md5_hex);
    13 #use Class::Date;
    14 use Encode;
    15 use utf8;
    16
    17 my @INVALID_TAGS = qw ( A ABBREV ACRONYM ADDRESS APP APPLET AREA AU B BANNER BASE BASEFONT BDO BGSOUND BIG BLINK BLOCKQUOTE
    18 BODY BQ BR CAPTION CENTER CITE CODE COL COLGROUP CREDIT DD DEL DFN DIR DIV DL DT EM FN FIG FONT FORM FRAME FRAMESET
    19 H1 H2 H3 H4 H5 H6 HP HR I IMG INPUT INS ISINDEX KBD LANG LH LI LISTING MAP MARQUEE MENU META NEXTID NOBR NOEMBED
    20 NOFRAMES NOTE OL OPTION OVERLAY P PARAM PERSON PLAINTEXT PRE Q S SAMP SELECT SMALL SPAN STRIKE STRONG SUB SUP TAB
    21 TABLE TBODY TD TEXTAREA TFOOT TH THEAD TR TT U UL VAR WBR XMP EMBED
    22 );
    23
    24 sub new {
    25 my ($proto) = @_;
    26 my $class = ref($proto) || $proto;
    27 my $self = {};
    28 bless $self, $class;
    29
    30 return $self;
    31 }
    32
    33 sub parse {
    34 my ($self, %opts) = @_;
    35
    36 my $content = delete $opts{content};
    37 my $base_url = delete $opts{base_url};
    38 my $strip_html = delete $opts{strip_html};
    39 my $allow_global_fulltext = delete $opts{allow_fulltext} || 0;
    40 my $content_global_type = delete $opts{content_type} || 1;
    41 my $debug = delete $opts{debug};
    42 my $gui = delete $opts{gui};
    43 my $description_as_fulltext = delete $opts{description_as_fulltext};
    44 warn "Parser Rools: [".$opts{parser_rss}."]\n" if $debug;
    45
    46 my $rss_rools = $self->__parse_rools (delete $opts{parser_rss});
    47
    48 warn "RSS Rools: ".Dumper ($rss_rools) if $debug;
    49
    50 my @items;
    51 my $feed = $self->__parse_content(\$content);
    52
    53 if ( ref $feed eq 'ARRAY' ) {
    54 foreach my $item ( @$feed ) {
    55 my $fulltext_field;
    56 my $content_type = $content_global_type;
    57 my $allow_fulltext = $allow_global_fulltext;
    58 $self->__check_rewrite ( item => $item, rools => $rss_rools );
    59 my $date = Time::ParseDate::parsedate($item->{pubdate});
    60 my $pubdate = Class::Date::localdate(Date::Parse::str2time($item->{pubdate}));
    61 next if ref $item->{title};
    62 next if ref $item->{description};
    63 $self->__check_ignore ( item => $item, rools => $rss_rools );
    64 $self->__check_only ( item => $item, rools => $rss_rools );
    65 $item->{title} = $self->__field_prepare ($item->{title});
    66 $self->__check_filter ( gui => $gui, field => 'title', item => $item, rools => $rss_rools );
    67 my $title = $item->{title};
    68 my $link;
    69 if ( ref $item->{link} eq 'HASH' ) {
    70 if ( ( (exists $item->{link}{type} && $item->{link}{type} eq 'text/html') || !exists $item->{link}{type} ) && exists $item->{link}{href} ) {
    71 $link = $item->{link}{href};
    72 }
    73 } elsif ( ref $item->{link} eq 'ARRAY' ) {
    74 foreach my $lnk ( @{ $item->{link} } ) {
    75 if ( ref $lnk ) {
    76 if ( ( (exists $lnk->{type} && $lnk->{type} eq 'text/html') || !exists $lnk->{type} ) && exists $lnk->{href} ) {
    77 $link = $lnk->{href};
    78 }
    79 } else {
    80 $link = $lnk;
    81 last;
    82 }
    83 }
    84 } else {
    85 $link = $item->{'link'} || (ref $item->{'url'} eq 'ARRAY' ? $item->{'url'}->[0] : $item->{'url'});
    86 }
    87 $link = $self->__field_prepare ($link);
    88 $link = $base_url.($link =~ m|^/| ? '' : '/' ).$link if $base_url && ($link !~ /^http:/);
    89 $item->{description} = $self->__field_prepare ($item->{description});
    90 $self->__check_filter ( gui => $gui, field => 'description', item => $item, rools => $rss_rools );
    91 my $description = $item->{description};
    92 if ( exists $item->{'rambler:fulltext'} && $item->{'rambler:fulltext'} ) {
    93 $allow_fulltext = 1;
    94 }
    95 my $fulltext;
    96 if ( $description_as_fulltext ) {
    97 $fulltext = $description;
    98 $fulltext_field = 'description'
    99 } else {
    100 if ( $gui ) {
    101 foreach my $field ( qw( rambler:fulltext rambler:full-text yandex:full-text mailru:full-text content:encoded full-text fulltext text ) ) {
    102 if ( exists $item->{$field} && $item->{$field} ) {
    103 $fulltext_field = $field;
    104 $fulltext = $item->{$field};
    105 last;
    106 }
    107 }
    108 } else {
    109 $fulltext =
    110 $item->{'rambler:fulltext'} ||
    111 $item->{'rambler:full-text'} ||
    112 $item->{'yandex:full-text'} ||
    113 $item->{'mailru:full-text'} ||
    114 $item->{'content:encoded'} ||
    115 $item->{'full-text'} ||
    116 $item->{'fulltext'} ||
    117 $item->{'text'};
    118 }
    119 if ( ref $fulltext eq 'HASH') {
    120 my @values = values %$fulltext;
    121 if ( scalar @values == 1 ) {
    122 $fulltext = $values[0];
    123 }
    124 }
    125 if ( ref $fulltext eq 'ARRAY' ) {
    126 $fulltext = join "\n", @$fulltext;
    127 }
    128 $self->__check_filter ( gui => $gui, field => 'fulltext', item => $item, text => \$fulltext, rools => $rss_rools );
    129 $fulltext = $self->__field_prepare ($fulltext);
    130 }
    131 if ( $fulltext && !$description ) {
    132 $item->{description} = Utils::HTML::limit_words ( $fulltext, 150, 300 );
    133 $self->__check_filter ( gui => $gui, field => 'description', item => $item, rools => $rss_rools );
    134 $description = $item->{description};
    135 }
    136 $allow_fulltext = 0 unless $fulltext;
    137 my $author;
    138 if ( exists $item->{author} && $item->{author} ) {
    139 if ( ref $item->{author} eq 'HASH' && exists $item->{author}{name} ) {
    140 $author = $item->{author}{name};
    141 } elsif ( !ref $item->{author} ) {
    142 $author = $item->{author};
    143 }
    144 }
    145 my $category = [];
    146 if ( exists $item->{category} && ref $item->{category} eq 'ARRAY' ) {
    147 $category = $item->{category};
    148 } elsif ( exists $item->{category} ) {
    149 $category = [$item->{category}];
    150 }
    151 my @images;
    152 if ( exists $item->{image} || exists $item->{enclosure} ) {
    153 my @src = ref $item->{image} eq 'ARRAY' ? @{ $item->{image} } : ( $item->{image} ) if exists $item->{image};
    154 my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure};
    155 @att = grep { ref $_ eq 'HASH' && $_->{type} =~ /image/ } @att;
    156 @images = map {
    157 my $img = rchannel::Image->new($_);
    158 $img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/; $img;
    159 } map { {src => $_->{url}, $_->{width} ? (width => $_->{width}) : (), $_->{height} ? (height => $_->{height}) : (), $_->{title} ? (title => $_->{title}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att;
    160 }
    161 my @videos;
    162 if ( exists $item->{video} || exists $item->{enclosure} ) {
    163 my @src = ref $item->{video} eq 'ARRAY' ? @{ $item->{video} } : ( $item->{video} ) if exists $item->{video};
    164 my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure};
    165 @att = grep { ref $_ eq 'HASH' && $_->{type} =~ /video/ } @att;
    166 @videos = map { {src => $_->{url}, $_->{type} ? (type => $_->{type}) : (), $_->{title} ? (title => $_->{title}) : (), $_->{width} ? (width => $_->{width}) : (), $_->{height} ? (height => $_->{height}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att;
    167 }
    168 my @audios;
    169 if ( exists $item->{audio} || exists $item->{enclosure} ) {
    170 my @src = ref $item->{audio} eq 'ARRAY' ? @{ $item->{audio} } : ( $item->{audio} ) if exists $item->{audio};
    171 my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure};
    172 @att = grep { ref $_ eq 'HASH' && $_->{type} =~ /audio/ } @att;
    173 @audios = map { {src => $_->{url}, $_->{type} ? (type => $_->{type}) : (), $_->{title} ? (title => $_->{title}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att;
    174 }
    175 my ($video_url, $audio_url);
    176 if ( $content_type == 2 || @videos || exists $item->{'videourl'} || exists $item->{'video_url'} ) {
    177 $video_url = exists $item->{video} && ref $item->{video} eq 'HASH' && exists $item->{video}{url} ?
    178 $item->{video}{url} :
    179 $item->{'videourl'} ||
    180 $item->{'video_url'} ||
    181 ($item->{'guid'} =~ /^http:/ ? $item->{'guid'} : undef) ||
    182 (exists $item->{'link'} && ref $item->{'link'} eq 'HASH' ? $item->{'link'}{'href'} || $item->{'link'}{'url'} : $item->{'link'} );
    183 $content_type = 2;
    184 }
    185 if ( @audios || exists $item->{'audiourl'} || exists $item->{'audio_url'} ) {
    186 $audio_url = exists $item->{audio} && ref $item->{audio} eq 'HASH' && exists $item->{audio}{url} ?
    187 $item->{audio}{url} :
    188 $item->{'audiourl'} || $item->{'audio_url'};
    189 $content_type = 2;
    190 }
    191 my $related = [];
    192 if ( exists $item->{'rambler:related'} && $item->{'rambler:related'} ) {
    193 if ( ref $item->{'rambler:related'} eq 'ARRAY' ) {
    194 foreach my $relitem ( @{ $item->{'rambler:related'} } ) {
    195 my $rel = $self->__parse_related ( $relitem );
    196 push @$related, $rel if ref $rel;
    197 }
    198 } elsif ( ref $item->{'rambler:related'} eq 'HASH' ) {
    199 my $rel = $self->__parse_related ( $item->{'rambler:related'} );
    200 push @$related, $rel if ref $rel;
    201 }
    202 }
    203 @videos = grep { exists $_->{type} && lc($_->{type}) eq 'video/x-flv' && $_->{src} =~ /\.flv$/i } @videos;
    204 push @items, {
    205 'checksum' => md5_hex(encode_utf8($title.$description)),
    206 'ignore' => $item->{ignore} || 0,
    207 'title' => $title || '',
    208 'title_gui' => $item->{title_gui} || $title || '',
    209 'description' => $description || '',
    210 'description_gui' => $item->{description_gui} || $description || '',
    211 'desc_length' => length( $description || '' ),
    212 'link' => $link || '',
    213 'pubdate' => $pubdate || '',
    214 'fulltext' => $fulltext || '',
    215 'fulltext_gui' => $item->{fulltext_gui} || '',
    216 'fulltext_field' => $fulltext_field || '',
    217 'image' => @images ? $images[0] : undef,
    218 'images' => @images ? \@images : undef,
    219 'video' => @videos ? $videos[0] : undef,
    220 'videos' => @videos ? \@videos : undef,
    221 'categories' => $category,
    222 'video_url' => $video_url || '',
    223 'audio_url' => $audio_url || '',
    224 'author' => $author || '',
    225 'related' => $related,
    226 'content_type' => $content_type,
    227 'allow_fulltext' => $allow_fulltext,
    228 };
    229 }
    230 } else {
    231 $self->error_message($@ || 'Something wrong while parsing content');
    232 return $self->is_success(0);
    233 }
    234
    235 $self->items(\@items);
    236 return $self->is_success(1);
    237 }
    238
    239
    240 sub __check_rewrite {
    241 my ($self, %opts) = @_;
    242 my $item = $opts{item};
    243 return unless ref $item;
    244 return unless ref $opts{rools} eq 'ARRAY';
    245 my @rools = grep { $_->{action} eq 'rewrite' } @{ $opts{rools} };
    246 return unless @rools;
    247 foreach my $rool ( @rools ) {
    248 my $field = $rool->{target};
    249 my $value = $rool->{condition};
    250 if ( $value eq 'CURRENT_DATETIME' ) {
    251 my $dt = DateTime->now( time_zone => "Europe/Moscow" );
    252 $value = $dt->ymd('-').'T'.$dt->hms.' MSK';
    253 }
    254 $item->{$field} = $value if exists $item->{$field};
    255 }
    256 }
    257
    258
    259 sub __check_filter {
    260 my ($self, %opts) = @_;
    261 my $field = $opts{field};
    262 my $gui = $opts{gui};
    263 my $item = $opts{item};
    264 my $text = exists $opts{text} ? $opts{text} : undef;
    265 return unless ref $item;
    266 return unless exists $opts{text} || exists $item->{$field};
    267 return unless ref $opts{rools} eq 'ARRAY';
    268 my @rools = grep { $_->{action} eq 'filter' && $_->{target} eq $field } @{ $opts{rools} };
    269 return unless @rools;
    270 foreach my $rool ( @rools ) {
    271 if ( $rool->{command} eq 'cut' ) {
    272 my $condition = $rool->{condition};
    273 if ( $rool->{subcommand} eq 'off' ) {
    274 if ( $opts{gui} ) {
    275 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
    276 $field_gui =~ s/($condition)/<b style="color:red">$1<\/b>/sgi;
    277 $item->{$field."_gui"} = $field_gui;
    278 }
    279 if ( exists $opts{text} ) {
    280 $$text =~ s/$condition//sgi;
    281 } else {
    282 $item->{$field} =~ s/$condition//sgi;
    283 }
    284 } elsif ( $rool->{subcommand} eq 'from' ) {
    285 if ( $gui ) {
    286 my $cut_text = '';
    287 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
    288 my $pos = index $field_gui, $condition;
    289 if ( $pos >= 0 ) {
    290 $cut_text = substr $field_gui, $pos, -1;
    291 $field_gui = substr $field_gui, 0, $pos;
    292 }
    293 # $field_gui =~ s/($condition)(.*)$/<b style="color:red">$1$2<\/b>/si;
    294 $item->{$field."_gui"} = $field_gui.'<b style="color:red">'.$cut_text.'</b>';
    295 }
    296 if ( exists $opts{text} ) {
    297 my $pos = index $$text, $condition;
    298 if ( $pos >= 0 ) {
    299 $$text = substr $$text, 0, $pos;
    300 }
    301 } else {
    302 my $pos = index $item->{$field}, $condition;
    303 if ( $pos >= 0 ) {
    304 $item->{$field} = substr $item->{$field}, 0, $pos;
    305 }
    306 }
    307 } elsif ( $rool->{subcommand} eq 'till' ) {
    308 if ( $opts{gui} ) {
    309 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
    310 $field_gui =~ s/^(.*?)($condition)/<b style="color:red">$1$2<\/b>/si;
    311 $item->{$field."_gui"} = $field_gui;
    312 }
    313 if ( exists $opts{text} ) {
    314 $$text =~ s/^(.*?)($condition)//si;
    315 } else {
    316 $item->{$field} =~ s/^(.*?)($condition)//si;
    317 }
    318 } elsif ( $rool->{subcommand} eq 'untill' ) {
    319 if ( $opts{gui} ) {
    320 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
    321 $field_gui =~ s/^(.*?)($condition)/<b style="color:red">$1<\/b>$2/si;
    322 $item->{$field."_gui"} = $field_gui;
    323 }
    324 if ( exists $opts{text} ) {
    325 $$text =~ s/^(.*?)($condition)/$2/si;
    326 } else {
    327 $item->{$field} =~ s/^(.*?)($condition)/$2/si;
    328 }
    329 } elsif ( $rool->{subcommand} eq 'regex' ) {
    330 if ( $opts{gui} ) {
    331 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field});
    332 if ( substr($condition,0,1) eq '^' ) {
    333 my $cond = reverse($condition);
    334 chop($cond);
    335 $cond = reverse($cond);
    336 $field_gui =~ s/^($cond)/<b style="color:red">$1<\/b>/si;
    337 } elsif ( substr($condition,-1,1) eq '$' ) {
    338 my $cond = $condition;
    339 chop($cond);
    340 $field_gui =~ s/($cond)$/<b style="color:red">$1<\/b>/si;
    341 } else {
    342 $field_gui =~ s/($condition)/<b style="color:red">$1<\/b>/sgi;
    343 }
    344 $item->{$field."_gui"} = $field_gui;
    345 }
    346 if ( exists $opts{text} ) {
    347 $$text =~ s/$condition//sgi;
    348 } else {
    349 $item->{$field} =~ s/$condition//sgi;
    350 }
    351 }
    352 } elsif ( $rool->{command} eq 'regex' ) {
    353 my $from = $rool->{condition}{from};
    354 my $to = $rool->{condition}{to};
    355 if ( exists $opts{text} ) {
    356 eval ("\$\$text =~ s/$from/$to/sgi");
    357 } else {
    358 eval ("\$item->{\$field} =~ s/$from/$to/sgi");
    359 }
    360 }
    361 }
    362 }
    363
    364
    365 sub __check_ignore {
    366 my ($self, %opts) = @_;
    367 my $item = $opts{item};
    368 return unless ref $item;
    369 return unless ref $opts{rools} eq 'ARRAY';
    370 my @rools = grep { $_->{action} eq 'ignore' } @{ $opts{rools} };
    371 return unless @rools;
    372 foreach my $rool ( @rools ) {
    373 my $target = $rool->{target};
    374 if ( $rool->{command} =~ /^contain/ ) {
    375 $item->{ignore} = 1 if index (lc($item->{$target}), lc($rool->{condition})) >= 0;
    376 }
    377 if ( $rool->{command} eq '=' ) {
    378 $item->{ignore} = 1 if lc($item->{$target}) eq lc($rool->{condition});
    379 }
    380 if ( $rool->{command} eq 'regex' ) {
    381 my $regex = $rool->{condition};
    382 $item->{ignore} = 1 if $item->{$target} =~ /$regex/sgi;
    383 }
    384 }
    385 }
    386
    387
    388 sub __check_only {
    389 my ($self, %opts) = @_;
    390 my $item = $opts{item};
    391 return unless ref $item;
    392 return unless ref $opts{rools} eq 'ARRAY';
    393 my @rools = grep { $_->{action} eq 'only' } @{ $opts{rools} };
    394 return unless @rools;
    395 foreach my $rool ( @rools ) {
    396 my $target = $rool->{target};
    397 if ( $rool->{command} =~ /^contain/ ) {
    398 $item->{ignore} = 1 unless index (lc($item->{$target}), lc($rool->{condition})) >= 0;
    399 }
    400 if ( $rool->{command} eq '=' ) {
    401 $item->{ignore} = 1 unless lc($item->{$target}) eq lc($rool->{condition});
    402 }
    403 if ( $rool->{command} eq 'regex' ) {
    404 my $regex = $rool->{condition};
    405 $item->{ignore} = 1 unless $item->{$target} =~ /$regex/sgi;
    406 }
    407 }
    408 }
    409
    410
    411 sub __feed_type {
    412 my ($self, $contref) = @_;
    413
    414 my $type;
    415 if ( $$contref =~ /<rss([^>]*)>/ ) {
    416 $type = 'RSS';
    417 } elsif ( $$contref =~ /<feed\s+([^>]*)>/ ) {
    418 my $feed_params = $1;
    419 my $params = $self->__parse_params ($feed_params);
    420 if ( exists $params->{xmlns} && $params->{xmlns} =~ /purl.org\/atom/ ) {
    421 $type = 'ATOM';
    422 } elsif ( exists $params->{xmlns} && $params->{xmlns} =~ /www.w3.org\/2005\/Atom/ ) {
    423 $type = 'ATOM';
    424 }
    425 } elsif ( $$contref =~ /<rdf([^>]*)>/ ) {
    426 $type = 'RDF';
    427 }
    428 return $type;
    429 }
    430
    431
    432 sub __parse_content {
    433 my ($self, $contref) = @_;
    434
    435 my $feed_type = $self->__feed_type($contref);
    436 # warn "FEED Type = [$feed_type]\n";
    437 return undef unless $feed_type;
    438
    439 $$contref =~ s/>\s+</></sgi;
    440 $$contref =~ s/<items(.*?)>(.*?)<\/items([^>]*?)>//sgi;
    441 $$contref =~ s/<\/?br(.*?)>/\n/sgi;
    442 $$contref =~ s/<\/?nobr(.*?)>//sgi;
    443 #$$contref =~ s/<p>/\n\n/sgi;
    444 #$$contref =~ s/<p\s(.*?)>/\n\n/sgi;
    445 $$contref =~ s/<\/?strong\s(.*?)>//sgi;
    446 $$contref =~ s/<\/?s>//sgi;
    447 $$contref =~ s/<\/?i>//sgi;
    448 $$contref =~ s/<\/?b>//sgi;
    449 $$contref =~ s/<\/?strong>//sgi;
    450 #$$contref =~ s/<\/p>//sgi;
    451 #$$contref =~ s/<\/p\s(.*?)>//sgi;
    452 my @items;
    453
    454 if ( $feed_type eq 'RSS' ) {
    455 while ( $$contref =~ /<item(.*?)>(.*?)<\/item([^>]*?)>/sgi ) {
    456 my $item_params = $1;
    457 my $item_body = $2;
    458 # warn "BODY: [$item_body]\n\n";
    459 my $params = $self->__parse_params ($item_params);
    460 my $item = $self->__parse_item_RSS ($item_body) || {};
    461 if ( ref $params eq 'HASH' ) {
    462 foreach my $key ( %$params ) {
    463 if ( exists $item->{$key} && ref $item->{$key} eq 'ARRAY' ) {
    464 push @{ $item->{$key} }, $params->{$key};
    465 } elsif ( exists $item->{$key} ) {
    466 my @arr = ( $item->{$key}, $params->{$key} );
    467 $item->{$key} = \@arr;
    468 } else {
    469 $item->{$key} = $params->{$key};
    470 }
    471 }
    472 }
    473 if ( ref $item eq 'HASH' && scalar keys %$item ) {
    474 if ( exists $item->{'feedburner:origlink'} ) {
    475 $item->{link} = ref $item->{'feedburner:origlink'} eq 'ARRAY' ? $item->{'feedburner:origlink'}->[0] : $item->{'feedburner:origlink'};
    476 } elsif ( !exists $item->{link} ) {
    477 foreach my $key ( qw( guid ) ) {
    478 if ( exists $item->{$key} ) {
    479 $item->{link} = $item->{$key};
    480 last;
    481 }
    482 }
    483 }
    484 push @items, $item;
    485 }
    486 # warn Dumper($item);
    487 }
    488 }
    489 if ( $feed_type eq 'RDF' ) {
    490 while ( $$contref =~ /<item(.*?)>(.*?)<\/item([^>]*?)>/sgi ) {
    491 my $item_params = $1;
    492 my $item_body = $2;
    493 # warn "BODY: [$item_body]\n\n";
    494 my $params = $self->__parse_params ($item_params);
    495 my $item = $self->__parse_item_RSS ($item_body) || {};
    496 if ( ref $params eq 'HASH' ) {
    497 foreach my $key ( %$params ) {
    498 if ( exists $item->{$key} && ref $item->{$key} eq 'ARRAY' ) {
    499 push @{ $item->{$key} }, $params->{$key};
    500 } elsif ( exists $item->{$key} ) {
    501 my @arr = ( $item->{$key}, $params->{$key} );
    502 $item->{$key} = \@arr;
    503 } else {
    504 $item->{$key} = $params->{$key};
    505 }
    506 }
    507 }
    508 # warn Dumper($item);
    509 if ( ref $item eq 'HASH' && scalar keys %$item ) {
    510 if ( !exists $item->{pubdate} ) {
    511 foreach my $key ( 'prism:publicationdate', 'dc:date' ) {
    512 if ( exists $item->{$key} ) {
    513 $item->{pubdate} = $item->{$key};
    514 last;
    515 }
    516 }
    517 }
    518 push @items, $item;
    519 }
    520 }
    521 }
    522 if ( $feed_type eq 'ATOM' ) {
    523 while ( $$contref =~ /<entry(.*?)>(.*?)<\/entry([^>]*?)>/sgi ) {
    524 my $item_params = $1;
    525 my $item_body = $2;
    526 my $item = $self->__parse_item_ATOM ($item_body) || {};
    527 # warn Dumper($item);
    528 if ( ref $item eq 'HASH' && scalar keys %$item ) {
    529 if ( !exists $item->{pubdate} ) {
    530 foreach my $key ( 'published', 'updated' ) {
    531 if ( exists $item->{$key} ) {
    532 $item->{pubdate} = $item->{$key};
    533 last;
    534 }
    535 }
    536 }
    537 push @items, $item;
    538 }
    539 }
    540 }
    541 return ( scalar @items ? \@items : undef );
    542 }
    543
    544
    545 sub __parse_params {
    546 my ($self, $params) = @_;
    547 return undef unless $params;
    548
    549 my %params;
    550 while ( $params =~ /([\w\:]+)(\s*?)=(\s*?)["'](.*?)["']/sgi ) {
    551 my $name = $1;
    552 my $value = $4;
    553 if ( $name && $value ) {
    554 $params{$name} = $value;
    555 }
    556 }
    557 return ( scalar(keys %params) ? \%params : undef );
    558 }
    559
    560
    561 sub __parse_item_RSS {
    562 my ($self, $item_body, $debug) = @_;
    563 return undef unless $item_body;
    564
    565 my %item;
    566 # my $embedded = $self->__item_cut_rss_embedded(\$item_body);
    567 # if ( ref $embedded ) {
    568 # %item = %$embedded;
    569 # }
    570 # my $content = $self->__item_cut_rss_description(\$item_body);
    571 # $item{description} = $content if $content;
    572 # my $one_string_elements = $self->__item_cut_single_elements (\$item_body);
    573 # if ( ref $one_string_elements eq 'ARRAY' && @$one_string_elements ) {
    574 # foreach my $elem ( @$one_string_elements ) {
    575 # my ($elem_name) = keys %$elem if ref $elem eq 'HASH';
    576 # if ( exists $item{$elem_name} && ref $item{$elem_name} eq 'ARRAY' ) {
    577 # push @{ $item{$elem_name} }, $elem->{$elem_name};
    578 # } elsif ( exists $item{$elem_name} ) {
    579 # $item{$elem_name} = [$item{$elem_name}, $elem->{$elem_name}];
    580 # } else {
    581 # $item{$elem_name} = $elem->{$elem_name};
    582 # }
    583 # }
    584 # }
    585 my $parsed = $self->__make_tree (\$item_body, $debug);
    586 # warn Dumper($parsed);
    587 if ( ref $parsed && exists $parsed->{1} && exists $parsed->{1}{children} && ref $parsed->{1}{children} eq 'ARRAY' ) {
    588 foreach my $tag ( @{ $parsed->{1}{children} } ) {
    589 if ( ref $tag->{children} eq 'ARRAY' && scalar @{ $tag->{children} } ) {
    590 my %params;
    591 foreach my $it ( @{ $tag->{children} } ) {
    592 next unless $it->{text};
    593 if ( exists $params{$it->{type}} && ref $params{$it->{type}} eq 'ARRAY' ) {
    594 push @{ $params{$it->{type}} }, $it->{text};
    595 } elsif ( exists $params{$it->{type}} ) {
    596 my @arr = ( $params{$it->{type}}, $it->{text} );
    597 $params{$it->{type}} = \@arr;
    598 } else {
    599 $params{$it->{type}} = $it->{text};
    600 }
    601 }
    602 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) {
    603 push @{ $item{$tag->{type}} }, \%params;
    604 } elsif ( exists $item{$tag->{type}} ) {
    605 my @arr = ( $item{$tag->{type}}, \%params );
    606 $item{$tag->{type}} = \@arr;
    607 } else {
    608 $item{$tag->{type}} = \%params;
    609 }
    610 } else {
    611 my $body = $tag->{text} || $tag->{params};
    612 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) {
    613 push @{ $item{$tag->{type}} }, $body;
    614 } elsif ( exists $item{$tag->{type}} ) {
    615 my @arr = ( $item{$tag->{type}}, $body );
    616 $item{$tag->{type}} = \@arr;
    617 } else {
    618 $item{$tag->{type}} = $body;
    619 }
    620 }
    621 }
    622 }
    623 # warn Dumper(\%item);
    624 return \%item;
    625 }
    626
    627
    628 sub __parse_item_ATOM {
    629 my ($self, $item_body, $debug) = @_;
    630 return undef unless $item_body;
    631
    632 my %item;
    633 my $embedded = $self->__item_cut_rss_embedded(\$item_body);
    634 if ( ref $embedded ) {
    635 %item = %$embedded;
    636 }
    637 if ( exists $item{summary} ) {
    638 $item{description} = delete $item{summary};
    639 } else {
    640 my $summary = $self->__item_cut_atom_summary(\$item_body);
    641 $item{description} = $summary if $summary;
    642 }
    643 my $content = $self->__item_cut_atom_content(\$item_body);
    644 if ( $content && $item{description} ) {
    645 $item{fulltext} = $content;
    646 } elsif ( $content ) {
    647 $item{description} = $content;
    648 }
    649 my $one_string_elements = $self->__item_cut_single_elements (\$item_body);
    650 # warn Dumper ($one_string_elements);
    651 if ( ref $one_string_elements eq 'ARRAY' && @$one_string_elements ) {
    652 foreach my $elem ( @$one_string_elements ) {
    653 my ($elem_name) = keys %$elem if ref $elem eq 'HASH';
    654 if ( exists $item{$elem_name} && ref $item{$elem_name} eq 'ARRAY' ) {
    655 push @{$item{$elem_name}}, $elem->{$elem_name};
    656 } elsif ( exists $item{$elem_name} ) {
    657 my @arr = ($item{$elem_name}, $elem->{$elem_name});
    658 $item{$elem_name} = \@arr;
    659 } else {
    660 $item{$elem_name} = $elem->{$elem_name};
    661 }
    662 if ( $elem->{$elem_name}{type} =~ /^image/ ) {
    663 my $enclosure = { url => $elem->{$elem_name}{href} || $elem->{$elem_name}{url}, type => $elem->{$elem_name}{type} };
    664 if ( exists $item{enclosure} && ref $item{enclosure} eq 'ARRAY' ) {
    665 push @{ $item{enclosure} }, $enclosure;
    666 } elsif ( exists $item{enclosure} ) {
    667 my @arr = ($item{enclosure}, $enclosure);
    668 $item{enclosure} = \@arr;
    669 } else {
    670 $item{enclosure} = $enclosure;
    671 }
    672 }
    673 if ( $elem->{$elem_name}{type} =~ /^video/ ) {
    674 my $enclosure = { url => $elem->{$elem_name}{href} || $elem->{$elem_name}{url}, type => $elem->{$elem_name}{type} };
    675 if ( exists $item{enclosure} && ref $item{enclosure} eq 'ARRAY' ) {
    676 push @{ $item{enclosure} }, $enclosure;
    677 } elsif ( exists $item{enclosure} ) {
    678 my @arr = ($item{enclosure}, $enclosure);
    679 $item{enclosure} = \@arr;
    680 } else {
    681 $item{enclosure} = $enclosure;
    682 }
    683 }
    684 }
    685 }
    686 my $parsed = $self->__make_tree (\$item_body, $debug);
    687 # warn Dumper($parsed);
    688 if ( ref $parsed && exists $parsed->{1} && exists $parsed->{1}{children} && ref $parsed->{1}{children} eq 'ARRAY' ) {
    689 foreach my $tag ( @{ $parsed->{1}{children} } ) {
    690 if ( ref $tag->{children} eq 'ARRAY' && scalar @{ $tag->{children} } ) {
    691 my %params;
    692 foreach my $it ( @{ $tag->{children} } ) {
    693 next unless $it->{text};
    694 if ( exists $params{$it->{type}} && ref $params{$it->{type}} eq 'ARRAY' ) {
    695 push @{ $params{$it->{type}} }, $it->{text};
    696 } elsif ( exists $params{$it->{type}} ) {
    697 my @arr = ( $params{$it->{type}}, $it->{text} );
    698 $params{$it->{type}} = \@arr;
    699 } else {
    700 $params{$it->{type}} = $it->{text};
    701 }
    702 }
    703 if ( exists $tag->{params} && ref $tag->{params} eq 'HASH' ) {
    704 while ( my ($param, $value) = each %{ $tag->{params} } ) {
    705 if ( exists $params{$param} && ref $params{$param} eq 'ARRAY' ) {
    706 push @{ $params{$param} }, $value;
    707 } elsif ( exists $params{$param} ) {
    708 my @arr = ( $params{$param}, $value );
    709 $params{$param} = \@arr;
    710 } else {
    711 $params{$param} = $value;
    712 }
    713 }
    714 }
    715 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) {
    716 push @{ $item{$tag->{type}} }, \%params;
    717 } elsif ( exists $item{$tag->{type}} ) {
    718 my @arr = ( $item{$tag->{type}}, \%params );
    719 $item{$tag->{type}} = \@arr;
    720 } else {
    721 $item{$tag->{type}} = \%params;
    722 }
    723 } else {
    724 my $body = $tag->{text} || $tag->{params};
    725 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) {
    726 push @{ $item{$tag->{type}} }, $body;
    727 } elsif ( exists $item{$tag->{type}} ) {
    728 my @arr = ( $item{$tag->{type}}, $body );
    729 $item{$tag->{type}} = \@arr;
    730 } else {
    731 $item{$tag->{type}} = $body;
    732 }
    733 }
    734 }
    735 my $pubDate = exists $item{issued} ? $item{issued} : exists $item{modified} ? $item{modified} : undef;
    736 $item{pubdate} = $pubDate if $pubDate;
    737 }
    738
    739 # warn Dumper(\%item);
    740 return \%item;
    741 }
    742
    743
    744 sub __make_tree {
    745 my ($self, $content, $debug) = @_;
    746
    747 my @elems = split (//,$$content);
    748 # warn "CONTENT: [$$content]\n\n";
    749 my $id = 1;
    750 my $level = 0;
    751 my @stack;
    752 my %tree = (
    753 root => {
    754 id => $id++,
    755 text => '',
    756 type => 'root',
    757 children=> [],
    758 parent => undef,
    759 level => $level,
    760 },
    761 );
    762 my %elem_hash = ( 1 => $tree{root} );
    763 my $current = $tree{root};
    764
    765 while ( @elems ) {
    766 if ( $elems[0] eq '<' && $elems[1] =~ /[\!a-zA-Z]/ ) {
    767 my $tag = $self->__try_tag (\@elems);
    768 if ( ref $tag && $tag->{type} eq 'text' ) {
    769 $current->{text} .= $tag->{content};
    770 splice @elems, 0, $tag->{count};
    771 # warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n";
    772 } elsif ( ref $tag && exists $tag->{closed} && $tag->{closed} ) {
    773 $tag->{id} = $id++;
    774 $tag->{parent} = $current;
    775 $tag->{level} = $level+1;
    776 $elem_hash{$tag->{id}} = $tag;
    777 push @{$current->{children}}, $tag;
    778 splice @elems, 0, $tag->{count};
    779 # warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n";
    780 } elsif ( ref $tag ) {
    781 $tag->{id} = $id++;
    782 $tag->{children} = [];
    783 $tag->{parent} = $current;
    784 $tag->{level} = ++$level;
    785 $elem_hash{$tag->{id}} = $tag;
    786 push @{$current->{children}}, $tag;
    787 push @stack, $current;
    788 $current = $tag;
    789 splice @elems, 0, $tag->{count};
    790 # warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n";
    791 } else {
    792 # warn "!!!! Error: RSS analyse. Job on item broken... !!!!\n" if $debug;
    793 return undef;
    794 }
    795 } elsif ( $elems[0] eq '<' && $elems[1] =~ /\// ) {
    796 my $tag = $self->__try_end (\@elems);
    797 if ( ref $tag && $tag->{type} eq 'text' ) {
    798 $current->{text} .= $tag->{content};
    799 $current->{count} += $tag->{count};
    800 splice @elems, 0, $tag->{count};
    801 } elsif ( ref $tag ) {
    802 if ( $current->{type} ne $tag->{type} ) {
    803 # warn "!!!!Wrong tag type for closing. It's [$tag->{type}]. It must be [$current->{type}]!!!!\n" if $debug;
    804 return undef;
    805 } else {
    806 $current = pop @stack;
    807 $level = $current->{level};
    808 # warn "Text place: [".substr($current->{text}, 0, 20)."]\n" if exists $current->{text};
    809 # warn "Close type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n";
    810 }
    811 splice @elems, 0, $tag->{count};
    812 } else {
    813 # warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug;
    814 return undef;
    815 }
    816 } else {
    817 $current->{text} .= shift @elems;
    818 $current->{count}++;
    819 }
    820 }
    821 return (\%elem_hash);
    822 }
    823
    824
    825
    826 sub __try_tag {
    827 my ($self, $content) = @_;
    828
    829 my $i = 1;
    830 my %tag;
    831 my $tag = $content->[0];
    832 if ( $content->[$i] eq '!' ) {
    833 # warn "What? Think it's CDATA\n";
    834 my $try_cdata = join '', @$content[1..8];
    835 if ( $try_cdata eq '![CDATA[' ) {
    836 $tag = '';
    837 $i = 9;
    838 while ( !($content->[$i-1] eq '>' && $content->[$i-2] eq ']' && $content->[$i-3] eq ']') && $i < scalar @$content ) {
    839 $tag .= $content->[$i];
    840 $i++;
    841 }
    842 chop $tag; chop $tag; chop $tag;
    843 }
    844 # warn "CDATA Found: [$tag]";
    845 return {
    846 type => 'text',
    847 content => $tag,
    848 count => $i,
    849 };
    850 }
    851 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) {
    852 $tag .= $content->[$i];
    853 $i++;
    854 }
    855 if ( $content->[$i] eq '<' || $i >= scalar @$content ) {
    856 return {
    857 type => 'text',
    858 content => $tag,
    859 count => $i,
    860 };
    861 } else {
    862 if ( $tag =~ /^<([\w:-]+)\s*(.*)/si ) {
    863 my $elem_name = $1;
    864 my $elem_body = $2;
    865 unless ( $self->__is_valid_tag ($elem_name) ) {
    866 return {
    867 type => 'text',
    868 content => $tag,
    869 count => $i,
    870 };
    871 } else {
    872 my $params = $self->__parse_params ($elem_body) if $elem_body;
    873 if ( $content->[$i] eq '>' && $content->[$i-1] eq '/' ) {
    874 $tag{closed} = 1;
    875 }
    876 $tag{type} = lc($elem_name);
    877 $tag{count} = $i+1;
    878 $tag{params} = $params if ref $params;
    879 return \%tag;
    880 }
    881 } else {
    882 return {
    883 type => 'text',
    884 content => $tag,
    885 count => $i,
    886 };
    887 }
    888 }
    889 }
    890
    891
    892 sub __try_end {
    893 my ($self, $content) = @_;
    894
    895 my $i = 2;
    896 my %tag;
    897 my $tag = $content->[0].$content->[1];
    898 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) {
    899 $tag .= $content->[$i];
    900 $i++;
    901 }
    902 if ( $content->[$i] eq '<' || $i >= scalar @$content ) {
    903 return {
    904 type => 'text',
    905 content => $tag,
    906 count => $i,
    907 };
    908 } else {
    909 if ( $tag =~ /^<\/([\w:-]+)/i ) {
    910 my $elem_name = $1;
    911 unless ( $self->__is_valid_tag ($elem_name) ) {
    912 return {
    913 type => 'text',
    914 content => $tag,
    915 count => $i,
    916 };
    917 } else {
    918 $tag{type} = lc($elem_name);
    919 $tag{count} = $i+1;
    920 return \%tag;
    921 }
    922 } else {
    923 return {
    924 type => 'text',
    925 content => $tag,
    926 count => $i,
    927 };
    928 }
    929 }
    930 }
    931
    932
    933 sub __is_valid_tag {
    934 my ($self, $tag) = @_;
    935 foreach my $invtag ( @INVALID_TAGS ) {
    936 return 0 if lc($invtag) eq lc($tag);
    937 }
    938 return 1;
    939 }
    940
    941
    942 sub __item_cut_atom_content {
    943 my ($self, $item_body) = @_;
    944
    945 my %elem;
    946 if ( $$item_body =~ /<content([^>]*?)>(.*?)<\/content([^>]*)>/si ) {
    947 my $content_params = $1;
    948 my $content_body = $2;
    949 my $params = $self->__parse_params ($content_params) if $content_params;
    950 $$item_body =~ s/<content([^>]*?)>(.*?)<\/content([^>]*)>//si;
    951 return $content_body;
    952 }
    953 }
    954
    955
    956 sub __item_cut_atom_summary {
    957 my ($self, $item_body) = @_;
    958
    959 my %elem;
    960 if ( $$item_body =~ /<summary([^>]*)>(.*?)<\/summary([^>]*)>/si ) {
    961 my $content_params = $1;
    962 my $content_body = $2;
    963 my $params = $self->__parse_params ($content_params) if $content_params;
    964 $$item_body =~ s/<summary([^>]*)>(.*?)<\/summary([^>]*)>//si;
    965 return $content_body;
    966 }
    967 }
    968
    969
    970 sub __item_cut_rss_description {
    971 my ($self, $item_body) = @_;
    972
    973 my %elem;
    974 if ( $$item_body =~ /<description([^>]*?)>(.*?)<\/description([^>]*)>/si ) {
    975 my $content_params = $1;
    976 my $content_body = $2;
    977 my $params = $self->__parse_params ($content_params) if $content_params;
    978 $$item_body =~ s/<description([^>]*?)>(.*?)<\/description([^>]*)>//si;
    979 return $content_body;
    980 }
    981 }
    982
    983
    984 sub __item_cut_rss_embedded {
    985 my ($self, $item_body) = @_;
    986
    987 my %elem;
    988 while ( $$item_body =~ /<([^>]*?)>\s*<!\[CDATA\[(.*?)\]\]>\s*<\/([^>]*)>/sgi ) {
    989 my $tag = $3;
    990 my $content_body = $2;
    991 my $content_params = $1;
    992 if ( $content_params =~ /([\w:-]+)\s+(.*)/ ) {
    993 $tag = 1;
    994 $content_params = $2;
    995 }
    996 my $params = $self->__parse_params ($content_params) if $content_params;
    997 $elem{$tag} = $content_body;
    998 $$item_body =~ s/<$tag([^>]*?)>(.*?)<\/$tag([^>]*)>//si;
    999 }
    1000 return scalar keys %elem ? \%elem : undef;
    1001 }
    1002
    1003
    1004
    1005 sub __item_cut_single_elements {
    1006 my ($self, $item_body) = @_;
    1007
    1008 my @elems;
    1009 while ( $$item_body =~ /<([\w\:\-]+)\s*([^>]*?)\/>/sgi ) {
    1010 my $elem_name = $1;
    1011 my $elem_body = $2;
    1012 my $params = $self->__parse_params ($elem_body) if $elem_body;
    1013 if ( $elem_name && ref $params ) {
    1014 push @elems, { $elem_name => $params }
    1015 }
    1016 }
    1017 $$item_body =~ s/<(\w+)\s*([^>]*?)\/>//sgi;
    1018 return ( @elems ? \@elems : undef );
    1019 }
    1020
    1021
    1022 sub __field_prepare {
    1023 my ($self, $text) = @_;
    1024
    1025 # $text =~ s/^[\n\r\x20\t]+//;
    1026 $self->__cdata (\$text);
    1027 $self->__extchar (\$text);
    1028 $text = HTML::Entities::decode_entities($text);
    1029
    1030 # Remove linebreaks inside incorrectly breaked paragraphs
    1031 if (length($text) > 100) {
    1032 my $pcount = 0;
    1033 while ($text =~ /<p>(.+?)(?=<\/?p>|$)/sgi) {
    1034 my $p = $1;
    1035 if (length $p > 50) {
    1036 my ($dcount, $ndcount) = ();
    1037 # Count sentences normally ended vs breaked
    1038 $dcount++ while $p =~ /(\.|\?|\!)['"]?\s*[\r\n]+/g;
    1039 $ndcount++ while $p =~ /([^\.\?\!\s])\s*[\r\n]+/g;
    1040 # Found broken paragraph
    1041 last if $ndcount > $dcount and ++$pcount > 1;
    1042 }
    1043 }
    1044 if ($pcount > 0) {
    1045 $text =~ s/[\n\r]+/ /sg;
    1046 }
    1047 }
    1048 $text =~ s/<br[^>]*>/\n/sgi;
    1049 $text =~ s/<p\s*>/\n\n/sgi;
    1050 $text =~ s/<\/p\s*>//sgi;
    1051 $text = rchannel::Parser::Util::strip_html($text);
    1052 $text = rchannel::Parser::Util::text_cleanup($text);
    1053 return $text;
    1054 }
    1055
    1056
    1057 sub __extchar {
    1058 my ($self, $textref) = @_;
    1059
    1060 for ( $$textref ) {
    1061 s/&#38;/\&/sg;
    1062 s/\&amp;/\&/sgi;
    1063 s/&#171;/«/sg;
    1064 s/&#187;/»/sg;
    1065 s/&#163;/£/sg;
    1066 s/&#150;/&ndash;/sg;
    1067 s/&#151;/&mdash;/sg;
    1068 s/&#132;/"/sg;
    1069 s/&#147;/"/sg;
    1070 s/&#148;/"/sg;
    1071 s/&#180;/'/sg;
    1072 s/&#133;/\.\.\./sg;
    1073 s/&#13;/\n/sg;
    1074 s/&#34;/"/sg;
    1075 s/\xA0/\x20/sg;
    1076 }
    1077 # $$textref =~ s/&#(\d+);/{'&#'.__normalise($1).';'}/eg;
    1078 # $$textref =~ s/&gt;/>/sgi;
    1079 # $$textref =~ s/&lt;/</sgi;
    1080 # $$textref =~ s/&quot;/"/sgi;
    1081 # $$textref =~ s/&laquo;/«/sgi;
    1082 # $$textref =~ s/&raquo;/»/sgi;
    1083 # $$textref =~ s/&copy;/©/sgi;
    1084 # $$textref =~ s/&ndash;/–/sgi;
    1085 # $$textref =~ s/&mdash;/—/sgi;
    1086 # $$textref =~ s/&deg;/º/sgi;
    1087 # $$textref =~ s/&nbsp;/\x20/sgi;
    1088 }
    1089
    1090 sub __normalise {
    1091 my $chr = shift;
    1092 return sprintf("%04d",$chr);
    1093 }
    1094
    1095 sub __cdata {
    1096 my ($self, $textref) = @_;
    1097 if ( $$textref =~ /^<\!\[CDATA\[/ ) {
    1098 $$textref =~ s/<\!\[CDATA\[//sgi;
    1099 $$textref =~ s/\]\]>//sgi;
    1100 }
    1101 }
    1102
    1103
    1104 sub __parse_rools {
    1105 my ($self, $rools) = @_;
    1106 return unless $rools;
    1107 $rools =~ s/\r//sgi;
    1108 my @rools = split /\n/, $rools;
    1109 return unless @rools;
    1110
    1111 my @parsed;
    1112 foreach my $rool ( @rools ) {
    1113 my %pr;
    1114 next if $rool =~ /^#/;
    1115 $rool =~ s/[\x20\t]+$//;
    1116 $rool =~ s/^[\x20\t]+//;
    1117 if ( $rool =~ /^([\w']+)\s+(.*)$/ || $rool =~ /^(\w+)(.*)$/ ) {
    1118 $pr{action} = lc($1);
    1119 my $params = $2;
    1120 if ( $pr{action} eq 'use' && $params =~ /^(current)\s+(date)$/ ) {
    1121 $pr{action} = 'rewrite';
    1122 $pr{target} = 'pubdate';
    1123 $pr{command} = 'set';
    1124 $pr{condition} = 'CURRENT_DATETIME';
    1125 push @parsed, \%pr;
    1126 } elsif ( $params =~ /^(\w+)\s+(.*)$/ ) {
    1127 $pr{target} = lc($1);
    1128 $params = $2;
    1129 if ( $params =~ /^([\w=]+)\s+(.*)$/ ) {
    1130 $pr{command} = lc($1);
    1131 $params = $2;
    1132 if ( $pr{action} eq 'filter' && $pr{command} eq 'cut' && $params =~ /^(\w+)\s+(.*)$/ ) {
    1133 $pr{subcommand} = lc($1); $params = $2;
    1134 next unless $pr{subcommand} =~ /^(untill|till|from|off|regex)$/;
    1135 $params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex';
    1136 $pr{condition} = $params;
    1137 } elsif ( $pr{action} eq 'filter' && $pr{command} eq 'regex' && substr($params,0,1) eq substr($params,-1,1) && substr($params,0,1) =~ /([\/\#\|])/ ) {
    1138 my $delim = $1;
    1139 $params = substr($params,1,length($params)-2);
    1140 my @params = split(//,$params);
    1141 my ($from, $to) = ('','');
    1142 my $prev = '';
    1143 while ( @params ) {
    1144 my $ch = shift @params;
    1145 if ( $ch eq $delim && $prev ne '\\' ) {
    1146 last;
    1147 } else {
    1148 $prev = $ch;
    1149 $from .= $ch;
    1150 }
    1151 }
    1152 $to = join ('', @params);
    1153 $pr{condition} = { from => $from, to => $to };
    1154 } elsif ( ($pr{action} eq 'ignore' || $pr{action} eq 'only') && $pr{command} =~ /^(regex|=|contain|contains)$/ ) {
    1155 $params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex';
    1156 $pr{condition} = $params;
    1157 } else {
    1158 next;
    1159 }
    1160 push @parsed, \%pr;
    1161 }
    1162 }
    1163 }
    1164 }
    1165 return ( scalar @parsed ? \@parsed : undef );
    1166 }
    1167
    1168
    1169 sub __parse_related {
    1170 my ($self, $related) = @_;
    1171 return unless ref $related eq 'HASH';
    1172 return unless exists $related->{url} && $related->{url} =~ /^http:\/\//i;
    1173 return unless exists $related->{rel} && $related->{rel} =~ /(news|discussion|teaser)/;
    1174 my $result = { url => $related->{url}, rel => $related->{rel} };
    1175 $result->{type} = $related->{type} if exists $related->{type};
    1176 $result->{title} = $self->__field_prepare($related->{title}) if exists $related->{title} && $related->{title};
    1177
    1178 $result->{author} = $self->__field_prepare($related->{author}) if exists $related->{author} && $related->{author};
    1179 $result->{description} = $self->__field_prepare($related->{description}) if exists $related->{description} && $related->{description};
    1180
    1181 if ( exists $related->{pubdate} && $related->{pubdate} ) {
    1182 my $pubdate = Class::Date::localdate(Date::Parse::str2time($related->{pubdate}));
    1183 $result->{pubdate} = $pubdate if $pubdate;
    1184 }
    1185 if ( $related->{rel} =~ /(news|teaser)/ ) {
    1186 return undef unless $result->{title} && $result->{pubdate};
    1187 } else {
    1188 $result->{title} ||= 'Обсудить';
    1189 }
    1190
    1191 if ( exists $related->{image} && $related->{image} ) {
    1192 if ( ref $related->{image} eq 'HASH' && (exists $related->{image}{url} || exists $related->{image}{href}) ) {
    1193 my $img = rchannel::Image->new( { src => ($related->{image}{url} || $related->{image}{href}) } );
    1194 $result->{image} = $img if ref $img;
    1195 } elsif ( !ref $related->{image} ) {
    1196 my $img = rchannel::Image->new( { src => $related->{image} } );
    1197 $result->{image} = $img if ref $img;
    1198 }
    1199 }
    1200
    1201 return $result;
    1202 }
    1203
    1204
    1205 # TODO IMAGES:
    1206 # enclosure
    1207 # media:content
    1208 # media:thumbnail
    1209 # image
    1210 # img
    1211
    1212 # FOUDNED:
    1213 # author
    1214 # category
    1215 # comments
    1216 # content
    1217 # content:encoded
    1218 # content:format
    1219 # dc:creator
    1220 # dc:date
    1221 # dc:rights
    1222 # dc:subject
    1223 # description
    1224 # enclosure
    1225 # feedburner:awareness
    1226 # feedburner:origLink
    1227 # full-text
    1228 # fulltext
    1229 # guid
    1230 # guide
    1231 # habrahabr:ballsCount
    1232 # habrahabr:commentsCount
    1233 # id
    1234 # image
    1235 # img
    1236 # link
    1237 # media:content
    1238 # media:thumbnail
    1239 # pdalink
    1240 # pubDate
    1241 # pubdate
    1242 # pubid
    1243 # published
    1244 # rambler:full-text
    1245 # rambler:fulltext
    1246 # region
    1247 # section
    1248 # sections
    1249 # source
    1250 # sport
    1251 # summary
    1252 # text
    1253 # title
    1254 # updated
    1255 # wfw:commentRSS
    1256 # wfw:commentRss
    1257 # wmj:fulltext
    1258 # yandex:full-text
    1259
    1260 1;
  • utf8/core/lib/Contenido/Parser/Util.pm

     
    1 package Contenido::Parser::Util;
    2
    3 use strict;
    4
    5 sub clean_invalid_chars { # http://www.w3.org/TR/REC-xml/#NT-Char
    6 my ($cont_ref) = shift;
    7 $$cont_ref =~ s/[\x0-\x8|\xB\xC|\xE-\x1F|\x{d800}-\x{dfff}|\x{fffe}\x{ffff}]//sgi;
    8 }
    9
    10 sub text_cleanup {
    11 my $text = shift;
    12 my $delim = shift || "\n\n";
    13
    14 $text =~ s/^\s+//; $text =~ s/\s+$//;
    15 $text =~ s/\r\n/\n/g;
    16
    17 my @paragfs = $text =~ /\n{2,}/ ? # is paragraphs detected?
    18 split /\n{2,}/, $text : # - by paragraphs only
    19 split /\n+/, $text; # - by any newline
    20
    21 for (@paragfs) {
    22 s/^\s+//mg; s/\s+$//mg; # trim whitespace
    23 s/[[:blank:]]+/ /g; # collapse spaces
    24 }
    25
    26 return join "\n\n", grep length $_, @paragfs;
    27 }
    28
    29 1;
  • utf8/core/lib/Utils/HTML.pm

     
    269 269 $t1 = $t2 = join ' ', @words[0 .. $args{max_words}-1];
    270 270
    271 271 # magic !
    272 my @wds = split ' ', $t1;
    273 return $t1 if $t1 =~ s/^(.+[\w»")]{3,}[.!?])+\s?[А-ЯA-Z«"].+?$/$1/ and scalar(@wds) > $args{min_words};
    272 s/^(.+\w{3,}[»")]?[.!?]+)\s*[А-ЯA-Z«"].+?$/$1/s and (()=/(\s+)/g)>$args{min_words} and return$_ for $t1;
    274 273
    275 274 $t2 =~ s/[.,:;!?\s—-]+$//;
    276 275 $t2.($args{ending} || '');