package Contenido::Parser::HTML;
use strict;
use warnings;
use locale;
use base 'Contenido::Parser';
use Contenido::Globals;
use Utils::HTML;
use Data::Dumper;
use utf8;
use Encode;
my @PICNAME = qw ( top menu topmenu home line dot mail razdel button find search srch delivery
head bar label phone bottom bottommenu ico icon post left right service caption arr arrow cart
basket main reply title corner address page buy pix pixel spacer fon welcome razd about back
shapka phones print tel phpBB uho korz korzina raspisanie shop login blank telephone telephones
dealer diler background bg news rss index none btn cards up footer noimage but link excel price
mid graphic busket map girl space catalog bann headline hosting contact schedule redir email
);
my @PICHOST = qw ( top.list.ru addweb.ru adland.ru extreme-dm.com top100.rambler.ru
mypagerank.ru informer.gismeteo.ru lux-bn.com.ua link-txt.com myrating.ljseek.com c.bigmir.net
);
my @PICURL = qw ( rorer counter count ljplus yadro spylog hotlog banner baner ban banners ban
icq mirabilis adriver advertising ad adv ads adview advert weather imho awaps reklama stat cnt
ipz design icons promo cycounter captcha foto_hit header random adcycle rssfeed bansrc
);
my @bad_dimensions = (
{ w => 120, h => 60 },
{ w => 468, h => 60 },
{ w => 120, h => 600 },
{ w => 88, h => 31 },
);
sub new {
my ($proto) = @_;
my $class = ref($proto) || $proto;
my $self = {};
bless $self, $class;
return $self;
}
sub parse {
my ($self, %opts) = @_;
my $content;
if ( $opts{content} ) {
$content = decode('utf-8', delete $opts{content});
delete $self->{content};
} elsif ( $self->{success} || $self->{content} ) {
$content = decode('utf-8', delete $self->{content});
} else {
$self->{success} = 0;
return $self;
}
my $base_url = delete $self->{base_url} || delete $opts{base_url};
my $strip_html = delete $opts{strip_html};
my $debug = $DEBUG;
my $gui = delete $opts{gui};
my $header = decode('utf-8', delete $opts{header});
warn "Header length: ".length($header || '')."\n" if $debug;
my $description = decode('utf-8', delete $opts{description});
warn "Description length: ".length($description || '')."\n" if $debug;
my $minimum = delete $opts{min} || length $description;
my $pre_rools = $self->__parse_rools (delete $opts{parser_pre});
warn Dumper ($pre_rools) if $debug;
my $parse_rools = $self->__parse_rools (delete $opts{parser_run});
warn Dumper ($parse_rools) if $debug;
my $post_rools = $self->__parse_rools (delete $opts{parser_end});
warn Dumper ($post_rools) if $debug;
# warn "Experimental. Debug!!!\n" if $debug;
if ( ref $pre_rools eq 'ARRAY' ) {
my @sets = grep { $_->{command} eq 'set' } @$pre_rools;
foreach my $set ( @sets ) {
if ( $set->{condition}{param} eq 'min' || $set->{condition}{param} eq 'minimum' ) {
my $value = $set->{condition}{value};
unless ( $value =~ /\D/ ) {
if ( $set->{subcommand} eq 'limit' ) {
$minimum = $minimum && $minimum > int($value) ? int($value) : $minimum ? $minimum : int($value);
} else {
$minimum = int($value);
}
}
}
if ( $set->{condition}{param} eq 'description' && $set->{condition}{value} eq 'header' ) {
$description = $header;
}
}
}
$minimum ||= 300;
warn "Tag cleaning...\n" if $debug;
$self->__clean_tags (\$content, $pre_rools);
$content =~ s/>\s+>__clean_img (\$content);
warn "Empty div cleaning...\n" if $debug;
while ( $self->__clean_empty_div (\$content) ) {}
warn "Make tree...\n" if $debug;
my ($tree, $shortcuts) = $self->__make_tree (\$content, $parse_rools, $debug);
$self->__extract_img ($shortcuts, $base_url, $debug);
$self->__extract_headers ($shortcuts, $header, $debug);
warn "Getting big texts (min=$minimum)...\n" if $debug;
my $chosen = $self->__dig_big_texts (
structure => $shortcuts,
min => $minimum,
ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (),
debug => $debug );
unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
$self->{error_message} = 'Nothing was found at all!!! Check your MINIMUM value';
return $self->is_success(0) unless $gui;
}
if ( $description ) {
my @use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$parse_rools if ref $parse_rools eq 'ARRAY';
$chosen = $self->__check_description ($chosen, $description, $debug) unless @use_rools;
}
unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
$self->{error_message} = 'I didn\'t find any valuable text';
return $self->is_success(0) unless $gui;
}
if ( scalar @$chosen > 1 ) {
$chosen = $self->__check_headers ($chosen, $header, $debug);
}
unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
$self->{error_message} = 'I didn\'t find any valuable text';
return $self->is_success(0) unless $gui;
}
$self->__strip_html (
chosen => $chosen,
header => $header,
ref $post_rools eq 'ARRAY' && @$post_rools ? (rools => $post_rools) : (),
debug => $debug
);
if ( ref $parse_rools eq 'ARRAY' ) {
my ($glue) = grep { $_->{command} eq 'glue' } @$parse_rools;
$self->__glue ( $chosen, $glue, $debug ) if ref $glue;
}
warn "Getting images...\n" if $debug;
my $images = $self->__get_images (
structure => $shortcuts,
chosen => $chosen->[0],
base_url => $base_url,
ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (),
debug => $debug,
);
if ( ref $images eq 'ARRAY' && @$images ) {
$self->{images} = $images;
$self->{image} = $images->[0];
}
if ( $gui ) {
if ( ref $chosen eq 'ARRAY' ) {
foreach my $elem ( @$chosen ) {
$self->__post_rool ($elem, $post_rools, $description);
}
}
$self->{text} = ref $chosen eq 'ARRAY' ? $chosen->[0] : $chosen;
# $self->{html} = $content;
# $self->{tree} = $shortcuts;
$self->{tree} = $tree;
$self->{chosen} = $chosen;
} else {
$self->__post_rool ($chosen->[0], $post_rools, $description);
$self->{text} = Contenido::Parser::Util::text_cleanup($chosen->[0]->{text});
$self->{chosen} = $chosen;
map { $_->{parent} = undef } @$chosen if ref $chosen eq 'ARRAY';
$tree = undef;
foreach my $key ( keys %$shortcuts ) {
delete $shortcuts->{$key};
}
$shortcuts = undef;
$content = undef;
}
return $self->is_success(1);
}
sub __clean_tags {
my ($self, $content, $rools) = @_;
my @cut_rools;
if ( ref $rools eq 'ARRAY' && @$rools) {
@cut_rools = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'cut' } @$rools;
}
my @clean_off_rools;
if ( ref $rools eq 'ARRAY' && @$rools) {
@clean_off_rools = grep { $_->{command} eq 'clean' && $_->{subcommand} eq 'off' } @$rools;
}
$$content =~ s///sgi;
$$content =~ s///sgi;
$$content =~ s/