1 |
98 |
ahitrov |
package Contenido::Parser::RSS; |
2 |
|
|
|
3 |
|
|
use strict; |
4 |
|
|
use warnings; |
5 |
|
|
use locale; |
6 |
|
|
|
7 |
99 |
ahitrov |
use base 'Contenido::Parser'; |
8 |
|
|
|
9 |
|
|
use Contenido::Globals; |
10 |
98 |
ahitrov |
use Utils::HTML; |
11 |
99 |
ahitrov |
use Time::ParseDate; |
12 |
98 |
ahitrov |
#use Date::Parse; |
13 |
|
|
use Data::Dumper; |
14 |
|
|
use Digest::MD5 qw(md5_hex); |
15 |
|
|
#use Class::Date; |
16 |
|
|
use Encode; |
17 |
|
|
use utf8; |
18 |
|
|
|
19 |
|
|
my @INVALID_TAGS = qw ( A ABBREV ACRONYM ADDRESS APP APPLET AREA AU B BANNER BASE BASEFONT BDO BGSOUND BIG BLINK BLOCKQUOTE |
20 |
|
|
BODY BQ BR CAPTION CENTER CITE CODE COL COLGROUP CREDIT DD DEL DFN DIR DIV DL DT EM FN FIG FONT FORM FRAME FRAMESET |
21 |
|
|
H1 H2 H3 H4 H5 H6 HP HR I IMG INPUT INS ISINDEX KBD LANG LH LI LISTING MAP MARQUEE MENU META NEXTID NOBR NOEMBED |
22 |
|
|
NOFRAMES NOTE OL OPTION OVERLAY P PARAM PERSON PLAINTEXT PRE Q S SAMP SELECT SMALL SPAN STRIKE STRONG SUB SUP TAB |
23 |
|
|
TABLE TBODY TD TEXTAREA TFOOT TH THEAD TR TT U UL VAR WBR XMP EMBED |
24 |
|
|
); |
25 |
|
|
|
26 |
|
|
sub new { |
27 |
|
|
my ($proto) = @_; |
28 |
|
|
my $class = ref($proto) || $proto; |
29 |
|
|
my $self = {}; |
30 |
|
|
bless $self, $class; |
31 |
|
|
|
32 |
|
|
return $self; |
33 |
|
|
} |
34 |
|
|
|
35 |
|
|
sub parse { |
36 |
|
|
my ($self, %opts) = @_; |
37 |
|
|
|
38 |
99 |
ahitrov |
my $content; |
39 |
|
|
if ( $opts{content} ) { |
40 |
|
|
$content = delete $opts{content}; |
41 |
|
|
delete $self->{content}; |
42 |
|
|
} elsif ( $self->{success} || $self->{content} ) { |
43 |
|
|
$content = delete $self->{content}; |
44 |
|
|
} else { |
45 |
|
|
$self->{success} = 0; |
46 |
|
|
return $self; |
47 |
|
|
} |
48 |
|
|
my $base_url = delete $self->{base_url} || delete $opts{base_url}; |
49 |
98 |
ahitrov |
my $allow_global_fulltext = delete $opts{allow_fulltext} || 0; |
50 |
|
|
my $content_global_type = delete $opts{content_type} || 1; |
51 |
99 |
ahitrov |
my $debug = $DEBUG; |
52 |
98 |
ahitrov |
my $gui = delete $opts{gui}; |
53 |
|
|
my $description_as_fulltext = delete $opts{description_as_fulltext}; |
54 |
|
|
warn "Parser Rools: [".$opts{parser_rss}."]\n" if $debug; |
55 |
|
|
|
56 |
|
|
my $rss_rools = $self->__parse_rools (delete $opts{parser_rss}); |
57 |
|
|
|
58 |
|
|
warn "RSS Rools: ".Dumper ($rss_rools) if $debug; |
59 |
|
|
|
60 |
|
|
my @items; |
61 |
|
|
my $feed = $self->__parse_content(\$content); |
62 |
|
|
|
63 |
|
|
if ( ref $feed eq 'ARRAY' ) { |
64 |
|
|
foreach my $item ( @$feed ) { |
65 |
|
|
my $fulltext_field; |
66 |
|
|
my $content_type = $content_global_type; |
67 |
|
|
my $allow_fulltext = $allow_global_fulltext; |
68 |
|
|
$self->__check_rewrite ( item => $item, rools => $rss_rools ); |
69 |
99 |
ahitrov |
my $date = $self->__parse_date($item->{pubdate}); |
70 |
|
|
my $pubdate = Contenido::DateTime->new( epoch => $date ); |
71 |
|
|
$pubdate = $pubdate->ymd('-').' '.$pubdate->hms; |
72 |
98 |
ahitrov |
next if ref $item->{title}; |
73 |
|
|
next if ref $item->{description}; |
74 |
|
|
$self->__check_ignore ( item => $item, rools => $rss_rools ); |
75 |
|
|
$self->__check_only ( item => $item, rools => $rss_rools ); |
76 |
|
|
$item->{title} = $self->__field_prepare ($item->{title}); |
77 |
|
|
$self->__check_filter ( gui => $gui, field => 'title', item => $item, rools => $rss_rools ); |
78 |
|
|
my $title = $item->{title}; |
79 |
|
|
my $link; |
80 |
|
|
if ( ref $item->{link} eq 'HASH' ) { |
81 |
|
|
if ( ( (exists $item->{link}{type} && $item->{link}{type} eq 'text/html') || !exists $item->{link}{type} ) && exists $item->{link}{href} ) { |
82 |
|
|
$link = $item->{link}{href}; |
83 |
|
|
} |
84 |
|
|
} elsif ( ref $item->{link} eq 'ARRAY' ) { |
85 |
|
|
foreach my $lnk ( @{ $item->{link} } ) { |
86 |
|
|
if ( ref $lnk ) { |
87 |
|
|
if ( ( (exists $lnk->{type} && $lnk->{type} eq 'text/html') || !exists $lnk->{type} ) && exists $lnk->{href} ) { |
88 |
|
|
$link = $lnk->{href}; |
89 |
|
|
} |
90 |
|
|
} else { |
91 |
|
|
$link = $lnk; |
92 |
|
|
last; |
93 |
|
|
} |
94 |
|
|
} |
95 |
|
|
} else { |
96 |
|
|
$link = $item->{'link'} || (ref $item->{'url'} eq 'ARRAY' ? $item->{'url'}->[0] : $item->{'url'}); |
97 |
|
|
} |
98 |
|
|
$link = $self->__field_prepare ($link); |
99 |
|
|
$link = $base_url.($link =~ m|^/| ? '' : '/' ).$link if $base_url && ($link !~ /^http:/); |
100 |
|
|
$item->{description} = $self->__field_prepare ($item->{description}); |
101 |
|
|
$self->__check_filter ( gui => $gui, field => 'description', item => $item, rools => $rss_rools ); |
102 |
|
|
my $description = $item->{description}; |
103 |
|
|
if ( exists $item->{'rambler:fulltext'} && $item->{'rambler:fulltext'} ) { |
104 |
|
|
$allow_fulltext = 1; |
105 |
|
|
} |
106 |
|
|
my $fulltext; |
107 |
|
|
if ( $description_as_fulltext ) { |
108 |
|
|
$fulltext = $description; |
109 |
|
|
$fulltext_field = 'description' |
110 |
|
|
} else { |
111 |
|
|
if ( $gui ) { |
112 |
|
|
foreach my $field ( qw( rambler:fulltext rambler:full-text yandex:full-text mailru:full-text content:encoded full-text fulltext text ) ) { |
113 |
|
|
if ( exists $item->{$field} && $item->{$field} ) { |
114 |
|
|
$fulltext_field = $field; |
115 |
|
|
$fulltext = $item->{$field}; |
116 |
|
|
last; |
117 |
|
|
} |
118 |
|
|
} |
119 |
|
|
} else { |
120 |
|
|
$fulltext = |
121 |
|
|
$item->{'rambler:fulltext'} || |
122 |
|
|
$item->{'rambler:full-text'} || |
123 |
|
|
$item->{'yandex:full-text'} || |
124 |
|
|
$item->{'mailru:full-text'} || |
125 |
|
|
$item->{'content:encoded'} || |
126 |
|
|
$item->{'full-text'} || |
127 |
|
|
$item->{'fulltext'} || |
128 |
|
|
$item->{'text'}; |
129 |
|
|
} |
130 |
|
|
if ( ref $fulltext eq 'HASH') { |
131 |
|
|
my @values = values %$fulltext; |
132 |
|
|
if ( scalar @values == 1 ) { |
133 |
|
|
$fulltext = $values[0]; |
134 |
|
|
} |
135 |
|
|
} |
136 |
|
|
if ( ref $fulltext eq 'ARRAY' ) { |
137 |
|
|
$fulltext = join "\n", @$fulltext; |
138 |
|
|
} |
139 |
|
|
$self->__check_filter ( gui => $gui, field => 'fulltext', item => $item, text => \$fulltext, rools => $rss_rools ); |
140 |
|
|
$fulltext = $self->__field_prepare ($fulltext); |
141 |
|
|
} |
142 |
|
|
if ( $fulltext && !$description ) { |
143 |
|
|
$item->{description} = Utils::HTML::limit_words ( $fulltext, 150, 300 ); |
144 |
|
|
$self->__check_filter ( gui => $gui, field => 'description', item => $item, rools => $rss_rools ); |
145 |
|
|
$description = $item->{description}; |
146 |
|
|
} |
147 |
|
|
$allow_fulltext = 0 unless $fulltext; |
148 |
|
|
my $author; |
149 |
|
|
if ( exists $item->{author} && $item->{author} ) { |
150 |
|
|
if ( ref $item->{author} eq 'HASH' && exists $item->{author}{name} ) { |
151 |
|
|
$author = $item->{author}{name}; |
152 |
|
|
} elsif ( !ref $item->{author} ) { |
153 |
|
|
$author = $item->{author}; |
154 |
|
|
} |
155 |
|
|
} |
156 |
|
|
my $category = []; |
157 |
|
|
if ( exists $item->{category} && ref $item->{category} eq 'ARRAY' ) { |
158 |
|
|
$category = $item->{category}; |
159 |
|
|
} elsif ( exists $item->{category} ) { |
160 |
|
|
$category = [$item->{category}]; |
161 |
|
|
} |
162 |
|
|
my @images; |
163 |
|
|
if ( exists $item->{image} || exists $item->{enclosure} ) { |
164 |
|
|
my @src = ref $item->{image} eq 'ARRAY' ? @{ $item->{image} } : ( $item->{image} ) if exists $item->{image}; |
165 |
|
|
my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure}; |
166 |
|
|
@att = grep { ref $_ eq 'HASH' && $_->{type} =~ /image/ } @att; |
167 |
|
|
@images = map { |
168 |
99 |
ahitrov |
my $img = $_; |
169 |
|
|
$img->{src} = $base_url.($img->{src} =~ m|^/| ? '' : '/').$img->{src} unless $img->{src} =~ /^http:/; $img; |
170 |
98 |
ahitrov |
} map { {src => $_->{url}, $_->{width} ? (width => $_->{width}) : (), $_->{height} ? (height => $_->{height}) : (), $_->{title} ? (title => $_->{title}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att; |
171 |
|
|
} |
172 |
|
|
my @videos; |
173 |
|
|
if ( exists $item->{video} || exists $item->{enclosure} ) { |
174 |
|
|
my @src = ref $item->{video} eq 'ARRAY' ? @{ $item->{video} } : ( $item->{video} ) if exists $item->{video}; |
175 |
|
|
my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure}; |
176 |
|
|
@att = grep { ref $_ eq 'HASH' && $_->{type} =~ /video/ } @att; |
177 |
|
|
@videos = map { {src => $_->{url}, $_->{type} ? (type => $_->{type}) : (), $_->{title} ? (title => $_->{title}) : (), $_->{width} ? (width => $_->{width}) : (), $_->{height} ? (height => $_->{height}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att; |
178 |
|
|
} |
179 |
|
|
my @audios; |
180 |
|
|
if ( exists $item->{audio} || exists $item->{enclosure} ) { |
181 |
|
|
my @src = ref $item->{audio} eq 'ARRAY' ? @{ $item->{audio} } : ( $item->{audio} ) if exists $item->{audio}; |
182 |
|
|
my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure}; |
183 |
|
|
@att = grep { ref $_ eq 'HASH' && $_->{type} =~ /audio/ } @att; |
184 |
|
|
@audios = map { {src => $_->{url}, $_->{type} ? (type => $_->{type}) : (), $_->{title} ? (title => $_->{title}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att; |
185 |
|
|
} |
186 |
|
|
my ($video_url, $audio_url); |
187 |
|
|
if ( $content_type == 2 || @videos || exists $item->{'videourl'} || exists $item->{'video_url'} ) { |
188 |
|
|
$video_url = exists $item->{video} && ref $item->{video} eq 'HASH' && exists $item->{video}{url} ? |
189 |
|
|
$item->{video}{url} : |
190 |
|
|
$item->{'videourl'} || |
191 |
|
|
$item->{'video_url'} || |
192 |
|
|
($item->{'guid'} =~ /^http:/ ? $item->{'guid'} : undef) || |
193 |
|
|
(exists $item->{'link'} && ref $item->{'link'} eq 'HASH' ? $item->{'link'}{'href'} || $item->{'link'}{'url'} : $item->{'link'} ); |
194 |
|
|
$content_type = 2; |
195 |
|
|
} |
196 |
|
|
if ( @audios || exists $item->{'audiourl'} || exists $item->{'audio_url'} ) { |
197 |
|
|
$audio_url = exists $item->{audio} && ref $item->{audio} eq 'HASH' && exists $item->{audio}{url} ? |
198 |
|
|
$item->{audio}{url} : |
199 |
|
|
$item->{'audiourl'} || $item->{'audio_url'}; |
200 |
|
|
$content_type = 2; |
201 |
|
|
} |
202 |
|
|
my $related = []; |
203 |
|
|
if ( exists $item->{'rambler:related'} && $item->{'rambler:related'} ) { |
204 |
|
|
if ( ref $item->{'rambler:related'} eq 'ARRAY' ) { |
205 |
|
|
foreach my $relitem ( @{ $item->{'rambler:related'} } ) { |
206 |
|
|
my $rel = $self->__parse_related ( $relitem ); |
207 |
|
|
push @$related, $rel if ref $rel; |
208 |
|
|
} |
209 |
|
|
} elsif ( ref $item->{'rambler:related'} eq 'HASH' ) { |
210 |
|
|
my $rel = $self->__parse_related ( $item->{'rambler:related'} ); |
211 |
|
|
push @$related, $rel if ref $rel; |
212 |
|
|
} |
213 |
|
|
} |
214 |
|
|
@videos = grep { exists $_->{type} && lc($_->{type}) eq 'video/x-flv' && $_->{src} =~ /\.flv$/i } @videos; |
215 |
99 |
ahitrov |
my @inlined_images; |
216 |
|
|
for ( $description, $fulltext ) { |
217 |
|
|
my $field = $_; |
218 |
|
|
while ( $field =~ /<img ([^>]+)>/sgi ) { |
219 |
|
|
my $image = $self->__parse_params( $1 ); |
220 |
|
|
push @inlined_images, $image if ref $image && exists $image->{src} && $image->{src}; |
221 |
|
|
} |
222 |
|
|
} |
223 |
|
|
if ( @inlined_images ) { |
224 |
|
|
my %images = map { $_->{src} => $_ } @images, @inlined_images; |
225 |
|
|
@images = values %images; |
226 |
|
|
} |
227 |
98 |
ahitrov |
push @items, { |
228 |
|
|
'checksum' => md5_hex(encode_utf8($title.$description)), |
229 |
|
|
'ignore' => $item->{ignore} || 0, |
230 |
|
|
'title' => $title || '', |
231 |
|
|
'title_gui' => $item->{title_gui} || $title || '', |
232 |
|
|
'description' => $description || '', |
233 |
|
|
'description_gui' => $item->{description_gui} || $description || '', |
234 |
|
|
'desc_length' => length( $description || '' ), |
235 |
|
|
'link' => $link || '', |
236 |
|
|
'pubdate' => $pubdate || '', |
237 |
|
|
'fulltext' => $fulltext || '', |
238 |
|
|
'fulltext_gui' => $item->{fulltext_gui} || '', |
239 |
|
|
'fulltext_field' => $fulltext_field || '', |
240 |
|
|
'image' => @images ? $images[0] : undef, |
241 |
|
|
'images' => @images ? \@images : undef, |
242 |
|
|
'video' => @videos ? $videos[0] : undef, |
243 |
|
|
'videos' => @videos ? \@videos : undef, |
244 |
|
|
'categories' => $category, |
245 |
|
|
'video_url' => $video_url || '', |
246 |
|
|
'audio_url' => $audio_url || '', |
247 |
|
|
'author' => $author || '', |
248 |
|
|
'related' => $related, |
249 |
|
|
'content_type' => $content_type, |
250 |
|
|
'allow_fulltext' => $allow_fulltext, |
251 |
|
|
}; |
252 |
|
|
} |
253 |
|
|
} else { |
254 |
99 |
ahitrov |
warn ($@ || 'Something wrong while parsing content'); |
255 |
98 |
ahitrov |
return $self->is_success(0); |
256 |
|
|
} |
257 |
|
|
|
258 |
99 |
ahitrov |
$self->{items} = \@items; |
259 |
98 |
ahitrov |
return $self->is_success(1); |
260 |
|
|
} |
261 |
|
|
|
262 |
|
|
|
263 |
|
|
sub __check_rewrite { |
264 |
|
|
my ($self, %opts) = @_; |
265 |
|
|
my $item = $opts{item}; |
266 |
|
|
return unless ref $item; |
267 |
|
|
return unless ref $opts{rools} eq 'ARRAY'; |
268 |
|
|
my @rools = grep { $_->{action} eq 'rewrite' } @{ $opts{rools} }; |
269 |
|
|
return unless @rools; |
270 |
|
|
foreach my $rool ( @rools ) { |
271 |
|
|
my $field = $rool->{target}; |
272 |
|
|
my $value = $rool->{condition}; |
273 |
|
|
if ( $value eq 'CURRENT_DATETIME' ) { |
274 |
|
|
my $dt = DateTime->now( time_zone => "Europe/Moscow" ); |
275 |
|
|
$value = $dt->ymd('-').'T'.$dt->hms.' MSK'; |
276 |
|
|
} |
277 |
|
|
$item->{$field} = $value if exists $item->{$field}; |
278 |
|
|
} |
279 |
|
|
} |
280 |
|
|
|
281 |
|
|
|
282 |
|
|
sub __check_filter { |
283 |
|
|
my ($self, %opts) = @_; |
284 |
|
|
my $field = $opts{field}; |
285 |
|
|
my $gui = $opts{gui}; |
286 |
|
|
my $item = $opts{item}; |
287 |
|
|
my $text = exists $opts{text} ? $opts{text} : undef; |
288 |
|
|
return unless ref $item; |
289 |
|
|
return unless exists $opts{text} || exists $item->{$field}; |
290 |
|
|
return unless ref $opts{rools} eq 'ARRAY'; |
291 |
|
|
my @rools = grep { $_->{action} eq 'filter' && $_->{target} eq $field } @{ $opts{rools} }; |
292 |
|
|
return unless @rools; |
293 |
|
|
foreach my $rool ( @rools ) { |
294 |
|
|
if ( $rool->{command} eq 'cut' ) { |
295 |
|
|
my $condition = $rool->{condition}; |
296 |
|
|
if ( $rool->{subcommand} eq 'off' ) { |
297 |
|
|
if ( $opts{gui} ) { |
298 |
|
|
my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field}); |
299 |
|
|
$field_gui =~ s/($condition)/<b style="color:red">$1<\/b>/sgi; |
300 |
|
|
$item->{$field."_gui"} = $field_gui; |
301 |
|
|
} |
302 |
|
|
if ( exists $opts{text} ) { |
303 |
|
|
$$text =~ s/$condition//sgi; |
304 |
|
|
} else { |
305 |
|
|
$item->{$field} =~ s/$condition//sgi; |
306 |
|
|
} |
307 |
|
|
} elsif ( $rool->{subcommand} eq 'from' ) { |
308 |
|
|
if ( $gui ) { |
309 |
|
|
my $cut_text = ''; |
310 |
|
|
my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field}); |
311 |
|
|
my $pos = index $field_gui, $condition; |
312 |
|
|
if ( $pos >= 0 ) { |
313 |
|
|
$cut_text = substr $field_gui, $pos, -1; |
314 |
|
|
$field_gui = substr $field_gui, 0, $pos; |
315 |
|
|
} |
316 |
|
|
# $field_gui =~ s/($condition)(.*)$/<b style="color:red">$1$2<\/b>/si; |
317 |
|
|
$item->{$field."_gui"} = $field_gui.'<b style="color:red">'.$cut_text.'</b>'; |
318 |
|
|
} |
319 |
|
|
if ( exists $opts{text} ) { |
320 |
|
|
my $pos = index $$text, $condition; |
321 |
|
|
if ( $pos >= 0 ) { |
322 |
|
|
$$text = substr $$text, 0, $pos; |
323 |
|
|
} |
324 |
|
|
} else { |
325 |
|
|
my $pos = index $item->{$field}, $condition; |
326 |
|
|
if ( $pos >= 0 ) { |
327 |
|
|
$item->{$field} = substr $item->{$field}, 0, $pos; |
328 |
|
|
} |
329 |
|
|
} |
330 |
|
|
} elsif ( $rool->{subcommand} eq 'till' ) { |
331 |
|
|
if ( $opts{gui} ) { |
332 |
|
|
my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field}); |
333 |
|
|
$field_gui =~ s/^(.*?)($condition)/<b style="color:red">$1$2<\/b>/si; |
334 |
|
|
$item->{$field."_gui"} = $field_gui; |
335 |
|
|
} |
336 |
|
|
if ( exists $opts{text} ) { |
337 |
|
|
$$text =~ s/^(.*?)($condition)//si; |
338 |
|
|
} else { |
339 |
|
|
$item->{$field} =~ s/^(.*?)($condition)//si; |
340 |
|
|
} |
341 |
|
|
} elsif ( $rool->{subcommand} eq 'untill' ) { |
342 |
|
|
if ( $opts{gui} ) { |
343 |
|
|
my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field}); |
344 |
|
|
$field_gui =~ s/^(.*?)($condition)/<b style="color:red">$1<\/b>$2/si; |
345 |
|
|
$item->{$field."_gui"} = $field_gui; |
346 |
|
|
} |
347 |
|
|
if ( exists $opts{text} ) { |
348 |
|
|
$$text =~ s/^(.*?)($condition)/$2/si; |
349 |
|
|
} else { |
350 |
|
|
$item->{$field} =~ s/^(.*?)($condition)/$2/si; |
351 |
|
|
} |
352 |
|
|
} elsif ( $rool->{subcommand} eq 'regex' ) { |
353 |
|
|
if ( $opts{gui} ) { |
354 |
|
|
my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field}); |
355 |
|
|
if ( substr($condition,0,1) eq '^' ) { |
356 |
|
|
my $cond = reverse($condition); |
357 |
|
|
chop($cond); |
358 |
|
|
$cond = reverse($cond); |
359 |
|
|
$field_gui =~ s/^($cond)/<b style="color:red">$1<\/b>/si; |
360 |
|
|
} elsif ( substr($condition,-1,1) eq '$' ) { |
361 |
|
|
my $cond = $condition; |
362 |
|
|
chop($cond); |
363 |
|
|
$field_gui =~ s/($cond)$/<b style="color:red">$1<\/b>/si; |
364 |
|
|
} else { |
365 |
|
|
$field_gui =~ s/($condition)/<b style="color:red">$1<\/b>/sgi; |
366 |
|
|
} |
367 |
|
|
$item->{$field."_gui"} = $field_gui; |
368 |
|
|
} |
369 |
|
|
if ( exists $opts{text} ) { |
370 |
|
|
$$text =~ s/$condition//sgi; |
371 |
|
|
} else { |
372 |
|
|
$item->{$field} =~ s/$condition//sgi; |
373 |
|
|
} |
374 |
|
|
} |
375 |
|
|
} elsif ( $rool->{command} eq 'regex' ) { |
376 |
|
|
my $from = $rool->{condition}{from}; |
377 |
|
|
my $to = $rool->{condition}{to}; |
378 |
|
|
if ( exists $opts{text} ) { |
379 |
|
|
eval ("\$\$text =~ s/$from/$to/sgi"); |
380 |
|
|
} else { |
381 |
|
|
eval ("\$item->{\$field} =~ s/$from/$to/sgi"); |
382 |
|
|
} |
383 |
|
|
} |
384 |
|
|
} |
385 |
|
|
} |
386 |
|
|
|
387 |
|
|
|
388 |
|
|
sub __check_ignore { |
389 |
|
|
my ($self, %opts) = @_; |
390 |
|
|
my $item = $opts{item}; |
391 |
|
|
return unless ref $item; |
392 |
|
|
return unless ref $opts{rools} eq 'ARRAY'; |
393 |
|
|
my @rools = grep { $_->{action} eq 'ignore' } @{ $opts{rools} }; |
394 |
|
|
return unless @rools; |
395 |
|
|
foreach my $rool ( @rools ) { |
396 |
|
|
my $target = $rool->{target}; |
397 |
|
|
if ( $rool->{command} =~ /^contain/ ) { |
398 |
|
|
$item->{ignore} = 1 if index (lc($item->{$target}), lc($rool->{condition})) >= 0; |
399 |
|
|
} |
400 |
|
|
if ( $rool->{command} eq '=' ) { |
401 |
|
|
$item->{ignore} = 1 if lc($item->{$target}) eq lc($rool->{condition}); |
402 |
|
|
} |
403 |
|
|
if ( $rool->{command} eq 'regex' ) { |
404 |
|
|
my $regex = $rool->{condition}; |
405 |
|
|
$item->{ignore} = 1 if $item->{$target} =~ /$regex/sgi; |
406 |
|
|
} |
407 |
|
|
} |
408 |
|
|
} |
409 |
|
|
|
410 |
|
|
|
411 |
|
|
sub __check_only { |
412 |
|
|
my ($self, %opts) = @_; |
413 |
|
|
my $item = $opts{item}; |
414 |
|
|
return unless ref $item; |
415 |
|
|
return unless ref $opts{rools} eq 'ARRAY'; |
416 |
|
|
my @rools = grep { $_->{action} eq 'only' } @{ $opts{rools} }; |
417 |
|
|
return unless @rools; |
418 |
|
|
foreach my $rool ( @rools ) { |
419 |
|
|
my $target = $rool->{target}; |
420 |
|
|
if ( $rool->{command} =~ /^contain/ ) { |
421 |
|
|
$item->{ignore} = 1 unless index (lc($item->{$target}), lc($rool->{condition})) >= 0; |
422 |
|
|
} |
423 |
|
|
if ( $rool->{command} eq '=' ) { |
424 |
|
|
$item->{ignore} = 1 unless lc($item->{$target}) eq lc($rool->{condition}); |
425 |
|
|
} |
426 |
|
|
if ( $rool->{command} eq 'regex' ) { |
427 |
|
|
my $regex = $rool->{condition}; |
428 |
|
|
$item->{ignore} = 1 unless $item->{$target} =~ /$regex/sgi; |
429 |
|
|
} |
430 |
|
|
} |
431 |
|
|
} |
432 |
|
|
|
433 |
|
|
|
434 |
|
|
sub __feed_type { |
435 |
|
|
my ($self, $contref) = @_; |
436 |
|
|
|
437 |
|
|
my $type; |
438 |
|
|
if ( $$contref =~ /<rss([^>]*)>/ ) { |
439 |
|
|
$type = 'RSS'; |
440 |
|
|
} elsif ( $$contref =~ /<feed\s+([^>]*)>/ ) { |
441 |
|
|
my $feed_params = $1; |
442 |
|
|
my $params = $self->__parse_params ($feed_params); |
443 |
|
|
if ( exists $params->{xmlns} && $params->{xmlns} =~ /purl.org\/atom/ ) { |
444 |
|
|
$type = 'ATOM'; |
445 |
|
|
} elsif ( exists $params->{xmlns} && $params->{xmlns} =~ /www.w3.org\/2005\/Atom/ ) { |
446 |
|
|
$type = 'ATOM'; |
447 |
|
|
} |
448 |
|
|
} elsif ( $$contref =~ /<rdf([^>]*)>/ ) { |
449 |
|
|
$type = 'RDF'; |
450 |
|
|
} |
451 |
|
|
return $type; |
452 |
|
|
} |
453 |
|
|
|
454 |
|
|
|
455 |
|
|
sub __parse_content { |
456 |
|
|
my ($self, $contref) = @_; |
457 |
|
|
|
458 |
|
|
my $feed_type = $self->__feed_type($contref); |
459 |
|
|
# warn "FEED Type = [$feed_type]\n"; |
460 |
|
|
return undef unless $feed_type; |
461 |
|
|
|
462 |
|
|
$$contref =~ s/>\s+</></sgi; |
463 |
|
|
$$contref =~ s/<items(.*?)>(.*?)<\/items([^>]*?)>//sgi; |
464 |
|
|
$$contref =~ s/<\/?br(.*?)>/\n/sgi; |
465 |
|
|
$$contref =~ s/<\/?nobr(.*?)>//sgi; |
466 |
|
|
#$$contref =~ s/<p>/\n\n/sgi; |
467 |
|
|
#$$contref =~ s/<p\s(.*?)>/\n\n/sgi; |
468 |
|
|
$$contref =~ s/<\/?strong\s(.*?)>//sgi; |
469 |
|
|
$$contref =~ s/<\/?s>//sgi; |
470 |
|
|
$$contref =~ s/<\/?i>//sgi; |
471 |
|
|
$$contref =~ s/<\/?b>//sgi; |
472 |
|
|
$$contref =~ s/<\/?strong>//sgi; |
473 |
|
|
#$$contref =~ s/<\/p>//sgi; |
474 |
|
|
#$$contref =~ s/<\/p\s(.*?)>//sgi; |
475 |
|
|
my @items; |
476 |
|
|
|
477 |
|
|
if ( $feed_type eq 'RSS' ) { |
478 |
|
|
while ( $$contref =~ /<item(.*?)>(.*?)<\/item([^>]*?)>/sgi ) { |
479 |
|
|
my $item_params = $1; |
480 |
|
|
my $item_body = $2; |
481 |
|
|
# warn "BODY: [$item_body]\n\n"; |
482 |
|
|
my $params = $self->__parse_params ($item_params); |
483 |
|
|
my $item = $self->__parse_item_RSS ($item_body) || {}; |
484 |
|
|
if ( ref $params eq 'HASH' ) { |
485 |
|
|
foreach my $key ( %$params ) { |
486 |
|
|
if ( exists $item->{$key} && ref $item->{$key} eq 'ARRAY' ) { |
487 |
|
|
push @{ $item->{$key} }, $params->{$key}; |
488 |
|
|
} elsif ( exists $item->{$key} ) { |
489 |
|
|
my @arr = ( $item->{$key}, $params->{$key} ); |
490 |
|
|
$item->{$key} = \@arr; |
491 |
|
|
} else { |
492 |
|
|
$item->{$key} = $params->{$key}; |
493 |
|
|
} |
494 |
|
|
} |
495 |
|
|
} |
496 |
|
|
if ( ref $item eq 'HASH' && scalar keys %$item ) { |
497 |
|
|
if ( exists $item->{'feedburner:origlink'} ) { |
498 |
|
|
$item->{link} = ref $item->{'feedburner:origlink'} eq 'ARRAY' ? $item->{'feedburner:origlink'}->[0] : $item->{'feedburner:origlink'}; |
499 |
|
|
} elsif ( !exists $item->{link} ) { |
500 |
|
|
foreach my $key ( qw( guid ) ) { |
501 |
|
|
if ( exists $item->{$key} ) { |
502 |
|
|
$item->{link} = $item->{$key}; |
503 |
|
|
last; |
504 |
|
|
} |
505 |
|
|
} |
506 |
|
|
} |
507 |
|
|
push @items, $item; |
508 |
|
|
} |
509 |
|
|
# warn Dumper($item); |
510 |
|
|
} |
511 |
|
|
} |
512 |
|
|
if ( $feed_type eq 'RDF' ) { |
513 |
|
|
while ( $$contref =~ /<item(.*?)>(.*?)<\/item([^>]*?)>/sgi ) { |
514 |
|
|
my $item_params = $1; |
515 |
|
|
my $item_body = $2; |
516 |
|
|
# warn "BODY: [$item_body]\n\n"; |
517 |
|
|
my $params = $self->__parse_params ($item_params); |
518 |
|
|
my $item = $self->__parse_item_RSS ($item_body) || {}; |
519 |
|
|
if ( ref $params eq 'HASH' ) { |
520 |
|
|
foreach my $key ( %$params ) { |
521 |
|
|
if ( exists $item->{$key} && ref $item->{$key} eq 'ARRAY' ) { |
522 |
|
|
push @{ $item->{$key} }, $params->{$key}; |
523 |
|
|
} elsif ( exists $item->{$key} ) { |
524 |
|
|
my @arr = ( $item->{$key}, $params->{$key} ); |
525 |
|
|
$item->{$key} = \@arr; |
526 |
|
|
} else { |
527 |
|
|
$item->{$key} = $params->{$key}; |
528 |
|
|
} |
529 |
|
|
} |
530 |
|
|
} |
531 |
|
|
# warn Dumper($item); |
532 |
|
|
if ( ref $item eq 'HASH' && scalar keys %$item ) { |
533 |
|
|
if ( !exists $item->{pubdate} ) { |
534 |
|
|
foreach my $key ( 'prism:publicationdate', 'dc:date' ) { |
535 |
|
|
if ( exists $item->{$key} ) { |
536 |
|
|
$item->{pubdate} = $item->{$key}; |
537 |
|
|
last; |
538 |
|
|
} |
539 |
|
|
} |
540 |
|
|
} |
541 |
|
|
push @items, $item; |
542 |
|
|
} |
543 |
|
|
} |
544 |
|
|
} |
545 |
|
|
if ( $feed_type eq 'ATOM' ) { |
546 |
|
|
while ( $$contref =~ /<entry(.*?)>(.*?)<\/entry([^>]*?)>/sgi ) { |
547 |
|
|
my $item_params = $1; |
548 |
|
|
my $item_body = $2; |
549 |
|
|
my $item = $self->__parse_item_ATOM ($item_body) || {}; |
550 |
|
|
# warn Dumper($item); |
551 |
|
|
if ( ref $item eq 'HASH' && scalar keys %$item ) { |
552 |
|
|
if ( !exists $item->{pubdate} ) { |
553 |
|
|
foreach my $key ( 'published', 'updated' ) { |
554 |
|
|
if ( exists $item->{$key} ) { |
555 |
|
|
$item->{pubdate} = $item->{$key}; |
556 |
|
|
last; |
557 |
|
|
} |
558 |
|
|
} |
559 |
|
|
} |
560 |
|
|
push @items, $item; |
561 |
|
|
} |
562 |
|
|
} |
563 |
|
|
} |
564 |
|
|
return ( scalar @items ? \@items : undef ); |
565 |
|
|
} |
566 |
|
|
|
567 |
|
|
|
568 |
|
|
sub __parse_params { |
569 |
|
|
my ($self, $params) = @_; |
570 |
|
|
return undef unless $params; |
571 |
|
|
|
572 |
|
|
my %params; |
573 |
|
|
while ( $params =~ /([\w\:]+)(\s*?)=(\s*?)["'](.*?)["']/sgi ) { |
574 |
|
|
my $name = $1; |
575 |
|
|
my $value = $4; |
576 |
|
|
if ( $name && $value ) { |
577 |
|
|
$params{$name} = $value; |
578 |
|
|
} |
579 |
|
|
} |
580 |
|
|
return ( scalar(keys %params) ? \%params : undef ); |
581 |
|
|
} |
582 |
|
|
|
583 |
|
|
|
584 |
|
|
sub __parse_item_RSS { |
585 |
|
|
my ($self, $item_body, $debug) = @_; |
586 |
|
|
return undef unless $item_body; |
587 |
|
|
|
588 |
|
|
my %item; |
589 |
|
|
# my $embedded = $self->__item_cut_rss_embedded(\$item_body); |
590 |
|
|
# if ( ref $embedded ) { |
591 |
|
|
# %item = %$embedded; |
592 |
|
|
# } |
593 |
|
|
# my $content = $self->__item_cut_rss_description(\$item_body); |
594 |
|
|
# $item{description} = $content if $content; |
595 |
|
|
# my $one_string_elements = $self->__item_cut_single_elements (\$item_body); |
596 |
|
|
# if ( ref $one_string_elements eq 'ARRAY' && @$one_string_elements ) { |
597 |
|
|
# foreach my $elem ( @$one_string_elements ) { |
598 |
|
|
# my ($elem_name) = keys %$elem if ref $elem eq 'HASH'; |
599 |
|
|
# if ( exists $item{$elem_name} && ref $item{$elem_name} eq 'ARRAY' ) { |
600 |
|
|
# push @{ $item{$elem_name} }, $elem->{$elem_name}; |
601 |
|
|
# } elsif ( exists $item{$elem_name} ) { |
602 |
|
|
# $item{$elem_name} = [$item{$elem_name}, $elem->{$elem_name}]; |
603 |
|
|
# } else { |
604 |
|
|
# $item{$elem_name} = $elem->{$elem_name}; |
605 |
|
|
# } |
606 |
|
|
# } |
607 |
|
|
# } |
608 |
|
|
my $parsed = $self->__make_tree (\$item_body, $debug); |
609 |
|
|
# warn Dumper($parsed); |
610 |
|
|
if ( ref $parsed && exists $parsed->{1} && exists $parsed->{1}{children} && ref $parsed->{1}{children} eq 'ARRAY' ) { |
611 |
|
|
foreach my $tag ( @{ $parsed->{1}{children} } ) { |
612 |
|
|
if ( ref $tag->{children} eq 'ARRAY' && scalar @{ $tag->{children} } ) { |
613 |
|
|
my %params; |
614 |
|
|
foreach my $it ( @{ $tag->{children} } ) { |
615 |
|
|
next unless $it->{text}; |
616 |
|
|
if ( exists $params{$it->{type}} && ref $params{$it->{type}} eq 'ARRAY' ) { |
617 |
|
|
push @{ $params{$it->{type}} }, $it->{text}; |
618 |
|
|
} elsif ( exists $params{$it->{type}} ) { |
619 |
|
|
my @arr = ( $params{$it->{type}}, $it->{text} ); |
620 |
|
|
$params{$it->{type}} = \@arr; |
621 |
|
|
} else { |
622 |
|
|
$params{$it->{type}} = $it->{text}; |
623 |
|
|
} |
624 |
|
|
} |
625 |
|
|
if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) { |
626 |
|
|
push @{ $item{$tag->{type}} }, \%params; |
627 |
|
|
} elsif ( exists $item{$tag->{type}} ) { |
628 |
|
|
my @arr = ( $item{$tag->{type}}, \%params ); |
629 |
|
|
$item{$tag->{type}} = \@arr; |
630 |
|
|
} else { |
631 |
|
|
$item{$tag->{type}} = \%params; |
632 |
|
|
} |
633 |
|
|
} else { |
634 |
|
|
my $body = $tag->{text} || $tag->{params}; |
635 |
|
|
if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) { |
636 |
|
|
push @{ $item{$tag->{type}} }, $body; |
637 |
|
|
} elsif ( exists $item{$tag->{type}} ) { |
638 |
|
|
my @arr = ( $item{$tag->{type}}, $body ); |
639 |
|
|
$item{$tag->{type}} = \@arr; |
640 |
|
|
} else { |
641 |
|
|
$item{$tag->{type}} = $body; |
642 |
|
|
} |
643 |
|
|
} |
644 |
|
|
} |
645 |
|
|
} |
646 |
|
|
# warn Dumper(\%item); |
647 |
|
|
return \%item; |
648 |
|
|
} |
649 |
|
|
|
650 |
|
|
|
651 |
|
|
sub __parse_item_ATOM { |
652 |
|
|
my ($self, $item_body, $debug) = @_; |
653 |
|
|
return undef unless $item_body; |
654 |
|
|
|
655 |
|
|
my %item; |
656 |
|
|
my $embedded = $self->__item_cut_rss_embedded(\$item_body); |
657 |
|
|
if ( ref $embedded ) { |
658 |
|
|
%item = %$embedded; |
659 |
|
|
} |
660 |
|
|
if ( exists $item{summary} ) { |
661 |
|
|
$item{description} = delete $item{summary}; |
662 |
|
|
} else { |
663 |
|
|
my $summary = $self->__item_cut_atom_summary(\$item_body); |
664 |
|
|
$item{description} = $summary if $summary; |
665 |
|
|
} |
666 |
|
|
my $content = $self->__item_cut_atom_content(\$item_body); |
667 |
|
|
if ( $content && $item{description} ) { |
668 |
|
|
$item{fulltext} = $content; |
669 |
|
|
} elsif ( $content ) { |
670 |
|
|
$item{description} = $content; |
671 |
|
|
} |
672 |
|
|
my $one_string_elements = $self->__item_cut_single_elements (\$item_body); |
673 |
|
|
# warn Dumper ($one_string_elements); |
674 |
|
|
if ( ref $one_string_elements eq 'ARRAY' && @$one_string_elements ) { |
675 |
|
|
foreach my $elem ( @$one_string_elements ) { |
676 |
|
|
my ($elem_name) = keys %$elem if ref $elem eq 'HASH'; |
677 |
|
|
if ( exists $item{$elem_name} && ref $item{$elem_name} eq 'ARRAY' ) { |
678 |
|
|
push @{$item{$elem_name}}, $elem->{$elem_name}; |
679 |
|
|
} elsif ( exists $item{$elem_name} ) { |
680 |
|
|
my @arr = ($item{$elem_name}, $elem->{$elem_name}); |
681 |
|
|
$item{$elem_name} = \@arr; |
682 |
|
|
} else { |
683 |
|
|
$item{$elem_name} = $elem->{$elem_name}; |
684 |
|
|
} |
685 |
|
|
if ( $elem->{$elem_name}{type} =~ /^image/ ) { |
686 |
|
|
my $enclosure = { url => $elem->{$elem_name}{href} || $elem->{$elem_name}{url}, type => $elem->{$elem_name}{type} }; |
687 |
|
|
if ( exists $item{enclosure} && ref $item{enclosure} eq 'ARRAY' ) { |
688 |
|
|
push @{ $item{enclosure} }, $enclosure; |
689 |
|
|
} elsif ( exists $item{enclosure} ) { |
690 |
|
|
my @arr = ($item{enclosure}, $enclosure); |
691 |
|
|
$item{enclosure} = \@arr; |
692 |
|
|
} else { |
693 |
|
|
$item{enclosure} = $enclosure; |
694 |
|
|
} |
695 |
|
|
} |
696 |
|
|
if ( $elem->{$elem_name}{type} =~ /^video/ ) { |
697 |
|
|
my $enclosure = { url => $elem->{$elem_name}{href} || $elem->{$elem_name}{url}, type => $elem->{$elem_name}{type} }; |
698 |
|
|
if ( exists $item{enclosure} && ref $item{enclosure} eq 'ARRAY' ) { |
699 |
|
|
push @{ $item{enclosure} }, $enclosure; |
700 |
|
|
} elsif ( exists $item{enclosure} ) { |
701 |
|
|
my @arr = ($item{enclosure}, $enclosure); |
702 |
|
|
$item{enclosure} = \@arr; |
703 |
|
|
} else { |
704 |
|
|
$item{enclosure} = $enclosure; |
705 |
|
|
} |
706 |
|
|
} |
707 |
|
|
} |
708 |
|
|
} |
709 |
|
|
my $parsed = $self->__make_tree (\$item_body, $debug); |
710 |
|
|
# warn Dumper($parsed); |
711 |
|
|
if ( ref $parsed && exists $parsed->{1} && exists $parsed->{1}{children} && ref $parsed->{1}{children} eq 'ARRAY' ) { |
712 |
|
|
foreach my $tag ( @{ $parsed->{1}{children} } ) { |
713 |
|
|
if ( ref $tag->{children} eq 'ARRAY' && scalar @{ $tag->{children} } ) { |
714 |
|
|
my %params; |
715 |
|
|
foreach my $it ( @{ $tag->{children} } ) { |
716 |
|
|
next unless $it->{text}; |
717 |
|
|
if ( exists $params{$it->{type}} && ref $params{$it->{type}} eq 'ARRAY' ) { |
718 |
|
|
push @{ $params{$it->{type}} }, $it->{text}; |
719 |
|
|
} elsif ( exists $params{$it->{type}} ) { |
720 |
|
|
my @arr = ( $params{$it->{type}}, $it->{text} ); |
721 |
|
|
$params{$it->{type}} = \@arr; |
722 |
|
|
} else { |
723 |
|
|
$params{$it->{type}} = $it->{text}; |
724 |
|
|
} |
725 |
|
|
} |
726 |
|
|
if ( exists $tag->{params} && ref $tag->{params} eq 'HASH' ) { |
727 |
|
|
while ( my ($param, $value) = each %{ $tag->{params} } ) { |
728 |
|
|
if ( exists $params{$param} && ref $params{$param} eq 'ARRAY' ) { |
729 |
|
|
push @{ $params{$param} }, $value; |
730 |
|
|
} elsif ( exists $params{$param} ) { |
731 |
|
|
my @arr = ( $params{$param}, $value ); |
732 |
|
|
$params{$param} = \@arr; |
733 |
|
|
} else { |
734 |
|
|
$params{$param} = $value; |
735 |
|
|
} |
736 |
|
|
} |
737 |
|
|
} |
738 |
|
|
if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) { |
739 |
|
|
push @{ $item{$tag->{type}} }, \%params; |
740 |
|
|
} elsif ( exists $item{$tag->{type}} ) { |
741 |
|
|
my @arr = ( $item{$tag->{type}}, \%params ); |
742 |
|
|
$item{$tag->{type}} = \@arr; |
743 |
|
|
} else { |
744 |
|
|
$item{$tag->{type}} = \%params; |
745 |
|
|
} |
746 |
|
|
} else { |
747 |
|
|
my $body = $tag->{text} || $tag->{params}; |
748 |
|
|
if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) { |
749 |
|
|
push @{ $item{$tag->{type}} }, $body; |
750 |
|
|
} elsif ( exists $item{$tag->{type}} ) { |
751 |
|
|
my @arr = ( $item{$tag->{type}}, $body ); |
752 |
|
|
$item{$tag->{type}} = \@arr; |
753 |
|
|
} else { |
754 |
|
|
$item{$tag->{type}} = $body; |
755 |
|
|
} |
756 |
|
|
} |
757 |
|
|
} |
758 |
|
|
my $pubDate = exists $item{issued} ? $item{issued} : exists $item{modified} ? $item{modified} : undef; |
759 |
|
|
$item{pubdate} = $pubDate if $pubDate; |
760 |
|
|
} |
761 |
|
|
|
762 |
|
|
# warn Dumper(\%item); |
763 |
|
|
return \%item; |
764 |
|
|
} |
765 |
|
|
|
766 |
|
|
|
767 |
|
|
sub __make_tree { |
768 |
|
|
my ($self, $content, $debug) = @_; |
769 |
|
|
|
770 |
|
|
my @elems = split (//,$$content); |
771 |
|
|
# warn "CONTENT: [$$content]\n\n"; |
772 |
|
|
my $id = 1; |
773 |
|
|
my $level = 0; |
774 |
|
|
my @stack; |
775 |
|
|
my %tree = ( |
776 |
|
|
root => { |
777 |
|
|
id => $id++, |
778 |
|
|
text => '', |
779 |
|
|
type => 'root', |
780 |
|
|
children=> [], |
781 |
|
|
parent => undef, |
782 |
|
|
level => $level, |
783 |
|
|
}, |
784 |
|
|
); |
785 |
|
|
my %elem_hash = ( 1 => $tree{root} ); |
786 |
|
|
my $current = $tree{root}; |
787 |
|
|
|
788 |
|
|
while ( @elems ) { |
789 |
|
|
if ( $elems[0] eq '<' && $elems[1] =~ /[\!a-zA-Z]/ ) { |
790 |
|
|
my $tag = $self->__try_tag (\@elems); |
791 |
|
|
if ( ref $tag && $tag->{type} eq 'text' ) { |
792 |
|
|
$current->{text} .= $tag->{content}; |
793 |
|
|
splice @elems, 0, $tag->{count}; |
794 |
|
|
# warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n"; |
795 |
|
|
} elsif ( ref $tag && exists $tag->{closed} && $tag->{closed} ) { |
796 |
|
|
$tag->{id} = $id++; |
797 |
|
|
$tag->{parent} = $current; |
798 |
|
|
$tag->{level} = $level+1; |
799 |
|
|
$elem_hash{$tag->{id}} = $tag; |
800 |
|
|
push @{$current->{children}}, $tag; |
801 |
|
|
splice @elems, 0, $tag->{count}; |
802 |
|
|
# warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n"; |
803 |
|
|
} elsif ( ref $tag ) { |
804 |
|
|
$tag->{id} = $id++; |
805 |
|
|
$tag->{children} = []; |
806 |
|
|
$tag->{parent} = $current; |
807 |
|
|
$tag->{level} = ++$level; |
808 |
|
|
$elem_hash{$tag->{id}} = $tag; |
809 |
|
|
push @{$current->{children}}, $tag; |
810 |
|
|
push @stack, $current; |
811 |
|
|
$current = $tag; |
812 |
|
|
splice @elems, 0, $tag->{count}; |
813 |
|
|
# warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n"; |
814 |
|
|
} else { |
815 |
|
|
# warn "!!!! Error: RSS analyse. Job on item broken... !!!!\n" if $debug; |
816 |
|
|
return undef; |
817 |
|
|
} |
818 |
|
|
} elsif ( $elems[0] eq '<' && $elems[1] =~ /\// ) { |
819 |
|
|
my $tag = $self->__try_end (\@elems); |
820 |
|
|
if ( ref $tag && $tag->{type} eq 'text' ) { |
821 |
|
|
$current->{text} .= $tag->{content}; |
822 |
|
|
$current->{count} += $tag->{count}; |
823 |
|
|
splice @elems, 0, $tag->{count}; |
824 |
|
|
} elsif ( ref $tag ) { |
825 |
|
|
if ( $current->{type} ne $tag->{type} ) { |
826 |
|
|
# warn "!!!!Wrong tag type for closing. It's [$tag->{type}]. It must be [$current->{type}]!!!!\n" if $debug; |
827 |
|
|
return undef; |
828 |
|
|
} else { |
829 |
|
|
$current = pop @stack; |
830 |
|
|
$level = $current->{level}; |
831 |
|
|
# warn "Text place: [".substr($current->{text}, 0, 20)."]\n" if exists $current->{text}; |
832 |
|
|
# warn "Close type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n"; |
833 |
|
|
} |
834 |
|
|
splice @elems, 0, $tag->{count}; |
835 |
|
|
} else { |
836 |
|
|
# warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug; |
837 |
|
|
return undef; |
838 |
|
|
} |
839 |
|
|
} else { |
840 |
|
|
$current->{text} .= shift @elems; |
841 |
|
|
$current->{count}++; |
842 |
|
|
} |
843 |
|
|
} |
844 |
|
|
return (\%elem_hash); |
845 |
|
|
} |
846 |
|
|
|
847 |
|
|
|
848 |
|
|
|
849 |
|
|
sub __try_tag { |
850 |
|
|
my ($self, $content) = @_; |
851 |
|
|
|
852 |
|
|
my $i = 1; |
853 |
|
|
my %tag; |
854 |
|
|
my $tag = $content->[0]; |
855 |
|
|
if ( $content->[$i] eq '!' ) { |
856 |
|
|
# warn "What? Think it's CDATA\n"; |
857 |
|
|
my $try_cdata = join '', @$content[1..8]; |
858 |
|
|
if ( $try_cdata eq '![CDATA[' ) { |
859 |
|
|
$tag = ''; |
860 |
|
|
$i = 9; |
861 |
|
|
while ( !($content->[$i-1] eq '>' && $content->[$i-2] eq ']' && $content->[$i-3] eq ']') && $i < scalar @$content ) { |
862 |
|
|
$tag .= $content->[$i]; |
863 |
|
|
$i++; |
864 |
|
|
} |
865 |
|
|
chop $tag; chop $tag; chop $tag; |
866 |
|
|
} |
867 |
|
|
# warn "CDATA Found: [$tag]"; |
868 |
|
|
return { |
869 |
|
|
type => 'text', |
870 |
|
|
content => $tag, |
871 |
|
|
count => $i, |
872 |
|
|
}; |
873 |
|
|
} |
874 |
|
|
while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) { |
875 |
|
|
$tag .= $content->[$i]; |
876 |
|
|
$i++; |
877 |
|
|
} |
878 |
|
|
if ( $content->[$i] eq '<' || $i >= scalar @$content ) { |
879 |
|
|
return { |
880 |
|
|
type => 'text', |
881 |
|
|
content => $tag, |
882 |
|
|
count => $i, |
883 |
|
|
}; |
884 |
|
|
} else { |
885 |
|
|
if ( $tag =~ /^<([\w:-]+)\s*(.*)/si ) { |
886 |
|
|
my $elem_name = $1; |
887 |
|
|
my $elem_body = $2; |
888 |
|
|
unless ( $self->__is_valid_tag ($elem_name) ) { |
889 |
|
|
return { |
890 |
|
|
type => 'text', |
891 |
|
|
content => $tag, |
892 |
|
|
count => $i, |
893 |
|
|
}; |
894 |
|
|
} else { |
895 |
|
|
my $params = $self->__parse_params ($elem_body) if $elem_body; |
896 |
|
|
if ( $content->[$i] eq '>' && $content->[$i-1] eq '/' ) { |
897 |
|
|
$tag{closed} = 1; |
898 |
|
|
} |
899 |
|
|
$tag{type} = lc($elem_name); |
900 |
|
|
$tag{count} = $i+1; |
901 |
|
|
$tag{params} = $params if ref $params; |
902 |
|
|
return \%tag; |
903 |
|
|
} |
904 |
|
|
} else { |
905 |
|
|
return { |
906 |
|
|
type => 'text', |
907 |
|
|
content => $tag, |
908 |
|
|
count => $i, |
909 |
|
|
}; |
910 |
|
|
} |
911 |
|
|
} |
912 |
|
|
} |
913 |
|
|
|
914 |
|
|
|
915 |
|
|
sub __try_end { |
916 |
|
|
my ($self, $content) = @_; |
917 |
|
|
|
918 |
|
|
my $i = 2; |
919 |
|
|
my %tag; |
920 |
|
|
my $tag = $content->[0].$content->[1]; |
921 |
|
|
while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) { |
922 |
|
|
$tag .= $content->[$i]; |
923 |
|
|
$i++; |
924 |
|
|
} |
925 |
|
|
if ( $content->[$i] eq '<' || $i >= scalar @$content ) { |
926 |
|
|
return { |
927 |
|
|
type => 'text', |
928 |
|
|
content => $tag, |
929 |
|
|
count => $i, |
930 |
|
|
}; |
931 |
|
|
} else { |
932 |
|
|
if ( $tag =~ /^<\/([\w:-]+)/i ) { |
933 |
|
|
my $elem_name = $1; |
934 |
|
|
unless ( $self->__is_valid_tag ($elem_name) ) { |
935 |
|
|
return { |
936 |
|
|
type => 'text', |
937 |
|
|
content => $tag, |
938 |
|
|
count => $i, |
939 |
|
|
}; |
940 |
|
|
} else { |
941 |
|
|
$tag{type} = lc($elem_name); |
942 |
|
|
$tag{count} = $i+1; |
943 |
|
|
return \%tag; |
944 |
|
|
} |
945 |
|
|
} else { |
946 |
|
|
return { |
947 |
|
|
type => 'text', |
948 |
|
|
content => $tag, |
949 |
|
|
count => $i, |
950 |
|
|
}; |
951 |
|
|
} |
952 |
|
|
} |
953 |
|
|
} |
954 |
|
|
|
955 |
|
|
|
956 |
|
|
sub __is_valid_tag { |
957 |
|
|
my ($self, $tag) = @_; |
958 |
|
|
foreach my $invtag ( @INVALID_TAGS ) { |
959 |
|
|
return 0 if lc($invtag) eq lc($tag); |
960 |
|
|
} |
961 |
|
|
return 1; |
962 |
|
|
} |
963 |
|
|
|
964 |
|
|
|
965 |
|
|
sub __item_cut_atom_content { |
966 |
|
|
my ($self, $item_body) = @_; |
967 |
|
|
|
968 |
|
|
my %elem; |
969 |
|
|
if ( $$item_body =~ /<content([^>]*?)>(.*?)<\/content([^>]*)>/si ) { |
970 |
|
|
my $content_params = $1; |
971 |
|
|
my $content_body = $2; |
972 |
|
|
my $params = $self->__parse_params ($content_params) if $content_params; |
973 |
|
|
$$item_body =~ s/<content([^>]*?)>(.*?)<\/content([^>]*)>//si; |
974 |
|
|
return $content_body; |
975 |
|
|
} |
976 |
|
|
} |
977 |
|
|
|
978 |
|
|
|
979 |
|
|
sub __item_cut_atom_summary { |
980 |
|
|
my ($self, $item_body) = @_; |
981 |
|
|
|
982 |
|
|
my %elem; |
983 |
|
|
if ( $$item_body =~ /<summary([^>]*)>(.*?)<\/summary([^>]*)>/si ) { |
984 |
|
|
my $content_params = $1; |
985 |
|
|
my $content_body = $2; |
986 |
|
|
my $params = $self->__parse_params ($content_params) if $content_params; |
987 |
|
|
$$item_body =~ s/<summary([^>]*)>(.*?)<\/summary([^>]*)>//si; |
988 |
|
|
return $content_body; |
989 |
|
|
} |
990 |
|
|
} |
991 |
|
|
|
992 |
|
|
|
993 |
|
|
sub __item_cut_rss_description { |
994 |
|
|
my ($self, $item_body) = @_; |
995 |
|
|
|
996 |
|
|
my %elem; |
997 |
|
|
if ( $$item_body =~ /<description([^>]*?)>(.*?)<\/description([^>]*)>/si ) { |
998 |
|
|
my $content_params = $1; |
999 |
|
|
my $content_body = $2; |
1000 |
|
|
my $params = $self->__parse_params ($content_params) if $content_params; |
1001 |
|
|
$$item_body =~ s/<description([^>]*?)>(.*?)<\/description([^>]*)>//si; |
1002 |
|
|
return $content_body; |
1003 |
|
|
} |
1004 |
|
|
} |
1005 |
|
|
|
1006 |
|
|
|
1007 |
|
|
sub __item_cut_rss_embedded { |
1008 |
|
|
my ($self, $item_body) = @_; |
1009 |
|
|
|
1010 |
|
|
my %elem; |
1011 |
|
|
while ( $$item_body =~ /<([^>]*?)>\s*<!\[CDATA\[(.*?)\]\]>\s*<\/([^>]*)>/sgi ) { |
1012 |
|
|
my $tag = $3; |
1013 |
|
|
my $content_body = $2; |
1014 |
|
|
my $content_params = $1; |
1015 |
|
|
if ( $content_params =~ /([\w:-]+)\s+(.*)/ ) { |
1016 |
|
|
$tag = 1; |
1017 |
|
|
$content_params = $2; |
1018 |
|
|
} |
1019 |
|
|
my $params = $self->__parse_params ($content_params) if $content_params; |
1020 |
|
|
$elem{$tag} = $content_body; |
1021 |
|
|
$$item_body =~ s/<$tag([^>]*?)>(.*?)<\/$tag([^>]*)>//si; |
1022 |
|
|
} |
1023 |
|
|
return scalar keys %elem ? \%elem : undef; |
1024 |
|
|
} |
1025 |
|
|
|
1026 |
|
|
|
1027 |
|
|
|
1028 |
|
|
sub __item_cut_single_elements { |
1029 |
|
|
my ($self, $item_body) = @_; |
1030 |
|
|
|
1031 |
|
|
my @elems; |
1032 |
|
|
while ( $$item_body =~ /<([\w\:\-]+)\s*([^>]*?)\/>/sgi ) { |
1033 |
|
|
my $elem_name = $1; |
1034 |
|
|
my $elem_body = $2; |
1035 |
|
|
my $params = $self->__parse_params ($elem_body) if $elem_body; |
1036 |
|
|
if ( $elem_name && ref $params ) { |
1037 |
|
|
push @elems, { $elem_name => $params } |
1038 |
|
|
} |
1039 |
|
|
} |
1040 |
|
|
$$item_body =~ s/<(\w+)\s*([^>]*?)\/>//sgi; |
1041 |
|
|
return ( @elems ? \@elems : undef ); |
1042 |
|
|
} |
1043 |
|
|
|
1044 |
|
|
|
1045 |
|
|
sub __field_prepare { |
1046 |
|
|
my ($self, $text) = @_; |
1047 |
|
|
|
1048 |
|
|
# $text =~ s/^[\n\r\x20\t]+//; |
1049 |
99 |
ahitrov |
$text =~ s/[\n\r\x20\t]+$//; |
1050 |
98 |
ahitrov |
$self->__cdata (\$text); |
1051 |
|
|
$self->__extchar (\$text); |
1052 |
99 |
ahitrov |
# $text = HTML::Entities::decode_entities($text); |
1053 |
98 |
ahitrov |
|
1054 |
|
|
# Remove linebreaks inside incorrectly breaked paragraphs |
1055 |
|
|
if (length($text) > 100) { |
1056 |
|
|
my $pcount = 0; |
1057 |
|
|
while ($text =~ /<p>(.+?)(?=<\/?p>|$)/sgi) { |
1058 |
|
|
my $p = $1; |
1059 |
|
|
if (length $p > 50) { |
1060 |
|
|
my ($dcount, $ndcount) = (); |
1061 |
|
|
# Count sentences normally ended vs breaked |
1062 |
|
|
$dcount++ while $p =~ /(\.|\?|\!)['"]?\s*[\r\n]+/g; |
1063 |
|
|
$ndcount++ while $p =~ /([^\.\?\!\s])\s*[\r\n]+/g; |
1064 |
|
|
# Found broken paragraph |
1065 |
|
|
last if $ndcount > $dcount and ++$pcount > 1; |
1066 |
|
|
} |
1067 |
|
|
} |
1068 |
|
|
if ($pcount > 0) { |
1069 |
|
|
$text =~ s/[\n\r]+/ /sg; |
1070 |
|
|
} |
1071 |
|
|
} |
1072 |
|
|
$text =~ s/<br[^>]*>/\n/sgi; |
1073 |
|
|
$text =~ s/<p\s*>/\n\n/sgi; |
1074 |
|
|
$text =~ s/<\/p\s*>//sgi; |
1075 |
99 |
ahitrov |
# $text = Contenido::Parser::Util::strip_html($text); |
1076 |
|
|
# $text = Contenido::Parser::Util::text_cleanup($text); |
1077 |
98 |
ahitrov |
return $text; |
1078 |
|
|
} |
1079 |
|
|
|
1080 |
|
|
|
1081 |
|
|
sub __extchar { |
1082 |
|
|
my ($self, $textref) = @_; |
1083 |
|
|
|
1084 |
|
|
for ( $$textref ) { |
1085 |
|
|
s/&/\&/sg; |
1086 |
|
|
s/\&/\&/sgi; |
1087 |
99 |
ahitrov |
s/\&/\&/sgi; |
1088 |
|
|
s/\"/"/sgi; |
1089 |
|
|
s/\«/«/sg; |
1090 |
|
|
s/\»/»/sg; |
1091 |
|
|
s/\£/£/sg; |
1092 |
|
|
s/\–/–/sg; |
1093 |
|
|
s/\—/—/sg; |
1094 |
|
|
s/\„/"/sg; |
1095 |
|
|
s/\“/"/sg; |
1096 |
|
|
s/\”/"/sg; |
1097 |
|
|
s/\´/'/sg; |
1098 |
|
|
s/\…/\.\.\./sg; |
1099 |
|
|
s/\ /\n/sg; |
1100 |
|
|
s/\"/"/sg; |
1101 |
98 |
ahitrov |
} |
1102 |
|
|
# $$textref =~ s/&#(\d+);/{'&#'.__normalise($1).';'}/eg; |
1103 |
|
|
# $$textref =~ s/>/>/sgi; |
1104 |
|
|
# $$textref =~ s/</</sgi; |
1105 |
|
|
# $$textref =~ s/"/"/sgi; |
1106 |
|
|
# $$textref =~ s/«/«/sgi; |
1107 |
|
|
# $$textref =~ s/»/»/sgi; |
1108 |
|
|
# $$textref =~ s/©/©/sgi; |
1109 |
|
|
# $$textref =~ s/–/–/sgi; |
1110 |
|
|
# $$textref =~ s/—/—/sgi; |
1111 |
|
|
# $$textref =~ s/°/º/sgi; |
1112 |
|
|
# $$textref =~ s/ /\x20/sgi; |
1113 |
|
|
} |
1114 |
|
|
|
1115 |
|
|
sub __normalise { |
1116 |
|
|
my $chr = shift; |
1117 |
|
|
return sprintf("%04d",$chr); |
1118 |
|
|
} |
1119 |
|
|
|
1120 |
|
|
sub __cdata { |
1121 |
|
|
my ($self, $textref) = @_; |
1122 |
|
|
if ( $$textref =~ /^<\!\[CDATA\[/ ) { |
1123 |
|
|
$$textref =~ s/<\!\[CDATA\[//sgi; |
1124 |
|
|
$$textref =~ s/\]\]>//sgi; |
1125 |
|
|
} |
1126 |
|
|
} |
1127 |
|
|
|
1128 |
|
|
|
1129 |
|
|
sub __parse_rools { |
1130 |
|
|
my ($self, $rools) = @_; |
1131 |
|
|
return unless $rools; |
1132 |
|
|
$rools =~ s/\r//sgi; |
1133 |
|
|
my @rools = split /\n/, $rools; |
1134 |
|
|
return unless @rools; |
1135 |
|
|
|
1136 |
|
|
my @parsed; |
1137 |
|
|
foreach my $rool ( @rools ) { |
1138 |
|
|
my %pr; |
1139 |
|
|
next if $rool =~ /^#/; |
1140 |
|
|
$rool =~ s/[\x20\t]+$//; |
1141 |
|
|
$rool =~ s/^[\x20\t]+//; |
1142 |
|
|
if ( $rool =~ /^([\w']+)\s+(.*)$/ || $rool =~ /^(\w+)(.*)$/ ) { |
1143 |
|
|
$pr{action} = lc($1); |
1144 |
|
|
my $params = $2; |
1145 |
|
|
if ( $pr{action} eq 'use' && $params =~ /^(current)\s+(date)$/ ) { |
1146 |
|
|
$pr{action} = 'rewrite'; |
1147 |
|
|
$pr{target} = 'pubdate'; |
1148 |
|
|
$pr{command} = 'set'; |
1149 |
|
|
$pr{condition} = 'CURRENT_DATETIME'; |
1150 |
|
|
push @parsed, \%pr; |
1151 |
|
|
} elsif ( $params =~ /^(\w+)\s+(.*)$/ ) { |
1152 |
|
|
$pr{target} = lc($1); |
1153 |
|
|
$params = $2; |
1154 |
|
|
if ( $params =~ /^([\w=]+)\s+(.*)$/ ) { |
1155 |
|
|
$pr{command} = lc($1); |
1156 |
|
|
$params = $2; |
1157 |
|
|
if ( $pr{action} eq 'filter' && $pr{command} eq 'cut' && $params =~ /^(\w+)\s+(.*)$/ ) { |
1158 |
|
|
$pr{subcommand} = lc($1); $params = $2; |
1159 |
|
|
next unless $pr{subcommand} =~ /^(untill|till|from|off|regex)$/; |
1160 |
|
|
$params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex'; |
1161 |
|
|
$pr{condition} = $params; |
1162 |
|
|
} elsif ( $pr{action} eq 'filter' && $pr{command} eq 'regex' && substr($params,0,1) eq substr($params,-1,1) && substr($params,0,1) =~ /([\/\#\|])/ ) { |
1163 |
|
|
my $delim = $1; |
1164 |
|
|
$params = substr($params,1,length($params)-2); |
1165 |
|
|
my @params = split(//,$params); |
1166 |
|
|
my ($from, $to) = ('',''); |
1167 |
|
|
my $prev = ''; |
1168 |
|
|
while ( @params ) { |
1169 |
|
|
my $ch = shift @params; |
1170 |
|
|
if ( $ch eq $delim && $prev ne '\\' ) { |
1171 |
|
|
last; |
1172 |
|
|
} else { |
1173 |
|
|
$prev = $ch; |
1174 |
|
|
$from .= $ch; |
1175 |
|
|
} |
1176 |
|
|
} |
1177 |
|
|
$to = join ('', @params); |
1178 |
|
|
$pr{condition} = { from => $from, to => $to }; |
1179 |
|
|
} elsif ( ($pr{action} eq 'ignore' || $pr{action} eq 'only') && $pr{command} =~ /^(regex|=|contain|contains)$/ ) { |
1180 |
|
|
$params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex'; |
1181 |
|
|
$pr{condition} = $params; |
1182 |
|
|
} else { |
1183 |
|
|
next; |
1184 |
|
|
} |
1185 |
|
|
push @parsed, \%pr; |
1186 |
|
|
} |
1187 |
|
|
} |
1188 |
|
|
} |
1189 |
|
|
} |
1190 |
|
|
return ( scalar @parsed ? \@parsed : undef ); |
1191 |
|
|
} |
1192 |
|
|
|
1193 |
|
|
|
1194 |
|
|
sub __parse_related { |
1195 |
|
|
my ($self, $related) = @_; |
1196 |
|
|
return unless ref $related eq 'HASH'; |
1197 |
|
|
return unless exists $related->{url} && $related->{url} =~ /^http:\/\//i; |
1198 |
|
|
return unless exists $related->{rel} && $related->{rel} =~ /(news|discussion|teaser)/; |
1199 |
|
|
my $result = { url => $related->{url}, rel => $related->{rel} }; |
1200 |
|
|
$result->{type} = $related->{type} if exists $related->{type}; |
1201 |
|
|
$result->{title} = $self->__field_prepare($related->{title}) if exists $related->{title} && $related->{title}; |
1202 |
|
|
|
1203 |
|
|
$result->{author} = $self->__field_prepare($related->{author}) if exists $related->{author} && $related->{author}; |
1204 |
|
|
$result->{description} = $self->__field_prepare($related->{description}) if exists $related->{description} && $related->{description}; |
1205 |
|
|
|
1206 |
|
|
if ( exists $related->{pubdate} && $related->{pubdate} ) { |
1207 |
|
|
my $pubdate = Class::Date::localdate(Date::Parse::str2time($related->{pubdate})); |
1208 |
|
|
$result->{pubdate} = $pubdate if $pubdate; |
1209 |
|
|
} |
1210 |
|
|
if ( $related->{rel} =~ /(news|teaser)/ ) { |
1211 |
|
|
return undef unless $result->{title} && $result->{pubdate}; |
1212 |
|
|
} else { |
1213 |
|
|
$result->{title} ||= 'Обсудить'; |
1214 |
|
|
} |
1215 |
|
|
|
1216 |
|
|
if ( exists $related->{image} && $related->{image} ) { |
1217 |
|
|
if ( ref $related->{image} eq 'HASH' && (exists $related->{image}{url} || exists $related->{image}{href}) ) { |
1218 |
|
|
my $img = rchannel::Image->new( { src => ($related->{image}{url} || $related->{image}{href}) } ); |
1219 |
|
|
$result->{image} = $img if ref $img; |
1220 |
|
|
} elsif ( !ref $related->{image} ) { |
1221 |
|
|
my $img = rchannel::Image->new( { src => $related->{image} } ); |
1222 |
|
|
$result->{image} = $img if ref $img; |
1223 |
|
|
} |
1224 |
|
|
} |
1225 |
|
|
|
1226 |
|
|
return $result; |
1227 |
|
|
} |
1228 |
|
|
|
1229 |
|
|
|
1230 |
99 |
ahitrov |
sub __parse_date { |
1231 |
|
|
my $self = shift; |
1232 |
|
|
my $str = shift; |
1233 |
|
|
|
1234 |
|
|
if ($str=~/(\d{2})(\d{2})(\d{4})T(\d{2})(\d{2})(\d{2})/){ |
1235 |
|
|
return parsedate ("$3-$2-$1 $4:$5:$6"); |
1236 |
|
|
} elsif ($str=~/(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}:\d{2})/){ |
1237 |
|
|
return parsedate ("$1 $2"); |
1238 |
|
|
} else { |
1239 |
|
|
return parsedate($str); |
1240 |
|
|
} |
1241 |
|
|
} |
1242 |
|
|
|
1243 |
|
|
|
1244 |
|
|
|
1245 |
98 |
ahitrov |
# TODO IMAGES: |
1246 |
|
|
# enclosure |
1247 |
|
|
# media:content |
1248 |
|
|
# media:thumbnail |
1249 |
|
|
# image |
1250 |
|
|
# img |
1251 |
|
|
|
1252 |
|
|
# FOUDNED: |
1253 |
|
|
# author |
1254 |
|
|
# category |
1255 |
|
|
# comments |
1256 |
|
|
# content |
1257 |
|
|
# content:encoded |
1258 |
|
|
# content:format |
1259 |
|
|
# dc:creator |
1260 |
|
|
# dc:date |
1261 |
|
|
# dc:rights |
1262 |
|
|
# dc:subject |
1263 |
|
|
# description |
1264 |
|
|
# enclosure |
1265 |
|
|
# feedburner:awareness |
1266 |
|
|
# feedburner:origLink |
1267 |
|
|
# full-text |
1268 |
|
|
# fulltext |
1269 |
|
|
# guid |
1270 |
|
|
# guide |
1271 |
|
|
# habrahabr:ballsCount |
1272 |
|
|
# habrahabr:commentsCount |
1273 |
|
|
# id |
1274 |
|
|
# image |
1275 |
|
|
# img |
1276 |
|
|
# link |
1277 |
|
|
# media:content |
1278 |
|
|
# media:thumbnail |
1279 |
|
|
# pdalink |
1280 |
|
|
# pubDate |
1281 |
|
|
# pubdate |
1282 |
|
|
# pubid |
1283 |
|
|
# published |
1284 |
|
|
# rambler:full-text |
1285 |
|
|
# rambler:fulltext |
1286 |
|
|
# region |
1287 |
|
|
# section |
1288 |
|
|
# sections |
1289 |
|
|
# source |
1290 |
|
|
# sport |
1291 |
|
|
# summary |
1292 |
|
|
# text |
1293 |
|
|
# title |
1294 |
|
|
# updated |
1295 |
|
|
# wfw:commentRSS |
1296 |
|
|
# wfw:commentRss |
1297 |
|
|
# wmj:fulltext |
1298 |
|
|
# yandex:full-text |
1299 |
|
|
|
1300 |
|
|
1; |