Line # Revision Author
1 99 ahitrov package Contenido::Parser;
2
3 use strict;
4 use warnings;
5 use locale;
6
7 use Encode;
8 use URI;
9 use Data::Dumper;
10 use Contenido::Globals;
11 use LWP::UserAgent;
12 use Contenido::File::Scheme::FILE;
13 use Contenido::Parser::Util;
14
15 112 ahitrov sub new {
16 my ($proto) = @_;
17 my $class = ref($proto) || $proto;
18 my $self = {};
19 bless $self, $class;
20
21 return $self;
22 }
23
24
25 99 ahitrov sub fetch {
26 my ($self, $input, %opts) = @_;
27
28 my ($fh, $content);
29 my $encoding = delete $opts{encoding};
30 if (not ref $input) {
31 no strict "refs";
32 my $scheme = uc(scheme($input));
33 if ( $scheme eq 'FILE' ) {
34 $fh = &{"Contenido::File::Scheme::".uc(scheme($input))."::get_fh"}($input);
35 } else {
36 my $request = new HTTP::Request GET => $input;
37 my $ua = new LWP::UserAgent;
38 $ua->timeout(10);
39 my $res = $ua->request($request);
40 if ($res->is_success) {
41 $self->{headers} = $res->headers;
42 my $content_length = $res->headers->header('content-length');
43 my $content_type = $res->headers->header('content-type');
44 $self->{content_type} = $content_type;
45 if ( $content_type =~ /charset\s*=\s*([a-z\d\-]+)/i ) {
46 $encoding = $1;
47 }
48 my $base_url = $input =~ /^([a-z]+:\/\/[a-z\.\d]+)/ ? $1 : '';
49 $self->{base_url} = $base_url if $base_url;
50 $content = $res->content;
51 } else {
52 warn $res->status_line." \n";
53 $self->{success} = 0;
54 $self->{reason} = $res->status_line;
55 return $self;
56 }
57 }
58 } elsif ((ref $input eq "GLOB") or (ref $input eq 'Apache::Upload') or (ref $input eq 'IO::File')) {
59 $fh = $input;
60 } elsif (ref $input eq "SCALAR") {
61 $fh = IO::Scalar->new($input);
62 } else {
63 warn("Path, scalar ref or fh needed");
64 $self->{success} = 0;
65 $self->{reason} = 'Path, scalar ref or fh needed';
66 return $self;
67 }
68
69 if ( ref $fh ) {
70 $content = <$fh>;
71 }
72 if ( $content ) {
73 unless ( $encoding ) {
74 $encoding = $self->__try_content_encoding( substr($content, 0, 350) );
75 112 ahitrov }
76 if ( $encoding && $encoding ne 'utf-8' ) {
77 warn "Encoding from $encoding\n..." if $DEBUG;
78 Encode::from_to($content, $encoding, 'utf-8');
79 if ( exists $self->{headers} ) {
80 foreach my $header ( keys %{$self->{headers}} ) {
81 if ( ref $self->{headers}{$header} eq 'ARRAY' ) {
82 foreach my $val ( @{$self->{headers}{$header}} ) {
83 Encode::from_to($val, $encoding, 'utf-8');
84 }
85 } else {
86 Encode::from_to($self->{headers}{$header}, $encoding, 'utf-8');
87 }
88 }
89 99 ahitrov }
90 }
91 107 ahitrov $self->{encoding} = $encoding;
92 warn Dumper($self) if $DEBUG;
93 99 ahitrov $self->{content} = $content;
94 $self->{success} = 1;
95 } else {
96 $self->{success} = 0;
97 $self->{reason} = 'Content is empty';
98 }
99 return $self;
100 }
101
102 sub is_success {
103 my ($self, $val) = @_;
104
105 if ( defined $val ) {
106 $self->{success} = $val;
107 return $self;
108 } else {
109 return $self->{success};
110 }
111 }
112
113 sub __try_content_encoding {
114 my ($self, $input)= @_;
115 if ( $input =~ /encoding[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) {
116 return lc($1);
117 112 ahitrov } elsif ( $input =~ /charset[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) {
118 return lc($1);
119 99 ahitrov } elsif ( $input =~ /(utf-8|windows-1251|koi8-r)/i ) {
120 return lc($1);
121 } else {
122 return undef;
123 }
124 }
125
126 sub scheme {
127 my $uri = shift;
128 my $scheme;
129
130 $scheme = URI->new($uri)->scheme() || "file";
131
132 return $scheme;
133 }
134
135
136 1;