1 |
99 |
ahitrov |
package Contenido::Parser; |
2 |
|
|
|
3 |
|
|
use strict; |
4 |
|
|
use warnings; |
5 |
|
|
use locale; |
6 |
|
|
|
7 |
|
|
use Encode; |
8 |
|
|
use URI; |
9 |
|
|
use Data::Dumper; |
10 |
|
|
use Contenido::Globals; |
11 |
|
|
use LWP::UserAgent; |
12 |
|
|
use Contenido::File::Scheme::FILE; |
13 |
|
|
use Contenido::Parser::Util; |
14 |
|
|
|
15 |
112 |
ahitrov |
sub new { |
16 |
|
|
my ($proto) = @_; |
17 |
|
|
my $class = ref($proto) || $proto; |
18 |
|
|
my $self = {}; |
19 |
|
|
bless $self, $class; |
20 |
|
|
|
21 |
|
|
return $self; |
22 |
|
|
} |
23 |
|
|
|
24 |
|
|
|
25 |
99 |
ahitrov |
sub fetch { |
26 |
|
|
my ($self, $input, %opts) = @_; |
27 |
|
|
|
28 |
|
|
my ($fh, $content); |
29 |
|
|
my $encoding = delete $opts{encoding}; |
30 |
|
|
if (not ref $input) { |
31 |
|
|
no strict "refs"; |
32 |
|
|
my $scheme = uc(scheme($input)); |
33 |
|
|
if ( $scheme eq 'FILE' ) { |
34 |
|
|
$fh = &{"Contenido::File::Scheme::".uc(scheme($input))."::get_fh"}($input); |
35 |
|
|
} else { |
36 |
|
|
my $request = new HTTP::Request GET => $input; |
37 |
|
|
my $ua = new LWP::UserAgent; |
38 |
|
|
$ua->timeout(10); |
39 |
|
|
my $res = $ua->request($request); |
40 |
|
|
if ($res->is_success) { |
41 |
|
|
$self->{headers} = $res->headers; |
42 |
|
|
my $content_length = $res->headers->header('content-length'); |
43 |
|
|
my $content_type = $res->headers->header('content-type'); |
44 |
|
|
$self->{content_type} = $content_type; |
45 |
|
|
if ( $content_type =~ /charset\s*=\s*([a-z\d\-]+)/i ) { |
46 |
|
|
$encoding = $1; |
47 |
|
|
} |
48 |
|
|
my $base_url = $input =~ /^([a-z]+:\/\/[a-z\.\d]+)/ ? $1 : ''; |
49 |
|
|
$self->{base_url} = $base_url if $base_url; |
50 |
|
|
$content = $res->content; |
51 |
|
|
} else { |
52 |
|
|
warn $res->status_line." \n"; |
53 |
|
|
$self->{success} = 0; |
54 |
|
|
$self->{reason} = $res->status_line; |
55 |
|
|
return $self; |
56 |
|
|
} |
57 |
|
|
} |
58 |
|
|
} elsif ((ref $input eq "GLOB") or (ref $input eq 'Apache::Upload') or (ref $input eq 'IO::File')) { |
59 |
|
|
$fh = $input; |
60 |
|
|
} elsif (ref $input eq "SCALAR") { |
61 |
|
|
$fh = IO::Scalar->new($input); |
62 |
|
|
} else { |
63 |
|
|
warn("Path, scalar ref or fh needed"); |
64 |
|
|
$self->{success} = 0; |
65 |
|
|
$self->{reason} = 'Path, scalar ref or fh needed'; |
66 |
|
|
return $self; |
67 |
|
|
} |
68 |
|
|
|
69 |
|
|
if ( ref $fh ) { |
70 |
|
|
$content = <$fh>; |
71 |
|
|
} |
72 |
|
|
if ( $content ) { |
73 |
|
|
unless ( $encoding ) { |
74 |
|
|
$encoding = $self->__try_content_encoding( substr($content, 0, 350) ); |
75 |
112 |
ahitrov |
} |
76 |
|
|
if ( $encoding && $encoding ne 'utf-8' ) { |
77 |
|
|
warn "Encoding from $encoding\n..." if $DEBUG; |
78 |
|
|
Encode::from_to($content, $encoding, 'utf-8'); |
79 |
|
|
if ( exists $self->{headers} ) { |
80 |
|
|
foreach my $header ( keys %{$self->{headers}} ) { |
81 |
|
|
if ( ref $self->{headers}{$header} eq 'ARRAY' ) { |
82 |
|
|
foreach my $val ( @{$self->{headers}{$header}} ) { |
83 |
|
|
Encode::from_to($val, $encoding, 'utf-8'); |
84 |
|
|
} |
85 |
|
|
} else { |
86 |
|
|
Encode::from_to($self->{headers}{$header}, $encoding, 'utf-8'); |
87 |
|
|
} |
88 |
|
|
} |
89 |
99 |
ahitrov |
} |
90 |
|
|
} |
91 |
107 |
ahitrov |
$self->{encoding} = $encoding; |
92 |
|
|
warn Dumper($self) if $DEBUG; |
93 |
99 |
ahitrov |
$self->{content} = $content; |
94 |
|
|
$self->{success} = 1; |
95 |
|
|
} else { |
96 |
|
|
$self->{success} = 0; |
97 |
|
|
$self->{reason} = 'Content is empty'; |
98 |
|
|
} |
99 |
|
|
return $self; |
100 |
|
|
} |
101 |
|
|
|
102 |
|
|
sub is_success { |
103 |
|
|
my ($self, $val) = @_; |
104 |
|
|
|
105 |
|
|
if ( defined $val ) { |
106 |
|
|
$self->{success} = $val; |
107 |
|
|
return $self; |
108 |
|
|
} else { |
109 |
|
|
return $self->{success}; |
110 |
|
|
} |
111 |
|
|
} |
112 |
|
|
|
113 |
|
|
sub __try_content_encoding { |
114 |
|
|
my ($self, $input)= @_; |
115 |
|
|
if ( $input =~ /encoding[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) { |
116 |
|
|
return lc($1); |
117 |
112 |
ahitrov |
} elsif ( $input =~ /charset[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) { |
118 |
|
|
return lc($1); |
119 |
99 |
ahitrov |
} elsif ( $input =~ /(utf-8|windows-1251|koi8-r)/i ) { |
120 |
|
|
return lc($1); |
121 |
|
|
} else { |
122 |
|
|
return undef; |
123 |
|
|
} |
124 |
|
|
} |
125 |
|
|
|
126 |
|
|
sub scheme { |
127 |
|
|
my $uri = shift; |
128 |
|
|
my $scheme; |
129 |
|
|
|
130 |
|
|
$scheme = URI->new($uri)->scheme() || "file"; |
131 |
|
|
|
132 |
|
|
return $scheme; |
133 |
|
|
} |
134 |
|
|
|
135 |
|
|
|
136 |
|
|
1; |