Revision 99 (by ahitrov, 2011/05/11 18:09:30) Парсер
package Contenido::Parser;

use strict;
use warnings;
use locale;

use Encode;
use URI;
use Data::Dumper;
use Contenido::Globals;
use LWP::UserAgent;
use Contenido::File::Scheme::FILE;
use Contenido::Parser::Util;

sub fetch {
    my ($self, $input, %opts) = @_;

    my ($fh, $content);
    my $encoding = delete $opts{encoding};
    if (not ref $input) {
	no strict "refs";
	my $scheme = uc(scheme($input));
	if ( $scheme eq 'FILE' ) {
		$fh = &{"Contenido::File::Scheme::".uc(scheme($input))."::get_fh"}($input);
	} else {
		my $request = new HTTP::Request GET => $input;
		my $ua = new LWP::UserAgent;
		$ua->timeout(10);
		my $res = $ua->request($request);
		if ($res->is_success) {
			$self->{headers} = $res->headers;
			my $content_length = $res->headers->header('content-length');
			my $content_type = $res->headers->header('content-type');
			$self->{content_type} = $content_type;
			if ( $content_type =~ /charset\s*=\s*([a-z\d\-]+)/i ) {
				$encoding = $1;
			}
			my $base_url = $input =~ /^([a-z]+:\/\/[a-z\.\d]+)/ ? $1 : '';
			$self->{base_url} = $base_url		if $base_url;
			$content = $res->content;
		} else {
			warn $res->status_line." \n";
			$self->{success} = 0;
			$self->{reason} = $res->status_line;
			return $self;
		}
	}
    } elsif ((ref $input eq "GLOB") or (ref $input eq 'Apache::Upload') or (ref $input eq 'IO::File')) {
	$fh = $input;
    } elsif (ref $input eq "SCALAR") {
	$fh = IO::Scalar->new($input);
    } else {
	warn("Path, scalar ref or fh needed");
	$self->{success} = 0;
	$self->{reason} = 'Path, scalar ref or fh needed';
	return $self;
    }

    if ( ref $fh ) {
	$content = <$fh>;
    }
    if ( $content ) {
	warn Dumper($self);
	unless ( $encoding ) {
		$encoding = $self->__try_content_encoding( substr($content, 0, 350) );
		$self->{encoding} = $encoding;
		if ( $encoding && $encoding ne 'utf-8' ) {
			Encode::from_to($content, $encoding, 'utf-8');
		}
	}
	$self->{content} = $content;
	$self->{success} = 1;
    } else {
	$self->{success} = 0;
	$self->{reason} = 'Content is empty';
    }
    return $self;
}

sub is_success {
    my ($self, $val) = @_;

    if ( defined $val ) {
	$self->{success} = $val;
	return $self;
    } else {
	return $self->{success};
    }
}

sub __try_content_encoding {
    my ($self, $input)= @_;
    if ( $input =~ /encoding[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) {
	return lc($1);
    } elsif ( $input =~ /(utf-8|windows-1251|koi8-r)/i ) {
	return lc($1);
    } else {
	return undef;
    }
}

sub scheme {
    my $uri = shift;
    my $scheme;

    $scheme = URI->new($uri)->scheme() || "file";

    return $scheme;
}


1;