1 |
3 |
ahitrov@rambler.ru |
package Jevix::MakeupUtf; |
2 |
|
|
|
3 |
|
|
# ========================================== |
4 |
|
|
# |
5 |
|
|
# Jevix Version 0.9.5 (utf-8) |
6 |
|
|
# |
7 |
|
|
# Developed by Igor Askarov |
8 |
|
|
# |
9 |
|
|
# Please send all suggestions to |
10 |
|
|
# Igor Askarov <juks@juks.ru> |
11 |
|
|
# http://www.jevix.ru/ |
12 |
|
|
# |
13 |
|
|
# Release date: 21/01/2008 |
14 |
|
|
# |
15 |
|
|
# === Methods list========================== |
16 |
|
|
# |
17 |
|
|
# new the constructor |
18 |
|
|
# procces entry sub |
19 |
|
|
# preset presets selector |
20 |
|
|
# makeup makeup the text |
21 |
|
|
# quotes quotes processor |
22 |
|
|
# cuttags tags processor |
23 |
|
|
# tagend looking fo tag end |
24 |
|
|
# planttags sub to bring the tags back |
25 |
|
|
# vanish sub to remove all the stuff and bring the text to plain mode |
26 |
|
|
# parseTagsAllowString parse the tagsAllow string to hash |
27 |
|
|
# parseTagsDenyString parse the tagsDeny string to hash |
28 |
|
|
# |
29 |
|
|
# ========================================== |
30 |
|
|
|
31 |
|
|
use strict; |
32 |
|
|
use warnings; |
33 |
|
|
|
34 |
|
|
#UTFMODE# |
35 |
|
|
use Encode; |
36 |
|
|
use utf8; |
37 |
|
|
#UTFMODE# |
38 |
|
|
|
39 |
|
|
my $markLength = 7; |
40 |
|
|
my $strip; |
41 |
|
|
my $result; |
42 |
|
|
my $tags; |
43 |
|
|
my $conf; |
44 |
|
|
|
45 |
|
|
my @singleTags = qw/link input spacer img br hr/; |
46 |
|
|
my @breakingTags = qw/p td div hr/; |
47 |
|
|
my @spaceTags = qw/br/; |
48 |
|
|
my @tagsToEat = qw/script style/; |
49 |
|
|
|
50 |
|
|
# ==The constructor |
51 |
|
|
sub new { |
52 |
|
|
my Jevix::MakeupUtf $class = shift; |
53 |
|
|
|
54 |
|
|
return $class; |
55 |
|
|
} |
56 |
|
|
|
57 |
|
|
# ==Here we've got the input |
58 |
|
|
sub process($$$) { |
59 |
|
|
my($class, $text, $userConfig) = @_; |
60 |
|
|
|
61 |
|
|
#UTFMODE# |
62 |
|
|
$$text = decode("utf8", $$text); |
63 |
|
|
#UTFMODE# |
64 |
|
|
|
65 |
|
|
$conf = $userConfig ? $userConfig : {presetBasic=>1}; |
66 |
|
|
$class->preset(); |
67 |
|
|
|
68 |
|
|
$strip = ""; |
69 |
|
|
$tags = []; |
70 |
|
|
|
71 |
|
|
$result = {}; |
72 |
|
|
$result->{error} = 0; |
73 |
|
|
$result->{errorLog} = []; |
74 |
|
|
|
75 |
|
|
if(!$conf->{isHTML}) { $strip = $$text; } else { $class->cuttags($text, $conf, $result); } |
76 |
|
|
if($conf->{quotes}) { $class->quotes($conf); } |
77 |
|
|
$class->makeup($conf); |
78 |
|
|
|
79 |
|
|
$result->{text} = ""; |
80 |
|
|
if($conf->{isHTML}) { $class->planttags($result); } else { $result->{text} = $strip; } |
81 |
|
|
|
82 |
|
|
#UTFMODE# |
83 |
|
|
Encode::_utf8_off($result->{text}); |
84 |
|
|
#UTFMODE# |
85 |
|
|
|
86 |
|
|
return $result; |
87 |
|
|
} |
88 |
|
|
|
89 |
|
|
# ==Choosing default setup when necessary |
90 |
|
|
sub preset($$) { |
91 |
|
|
my ($class) = @_; |
92 |
|
|
|
93 |
|
|
if(!$conf || $conf->{presetBasic}) { |
94 |
|
|
$conf->{isHTML} = 1 if(!defined($conf->{isHTML})); # HTML mode |
95 |
|
|
$conf->{lineBreaks} = 1 if(!defined($conf->{lineBreaks})); # Linebreaks to <br/> |
96 |
|
|
$conf->{paragraphs} = 0 if(!defined($conf->{paragraphs})); # Paragraphs |
97 |
|
|
$conf->{dashes} = 1 if(!defined($conf->{dashes})); # Replace hyphens with dashes when necessary |
98 |
|
|
$conf->{dots} = 1 if(!defined($conf->{dots})); # Replace 3 dots with a symbol |
99 |
|
|
$conf->{edgeSpaces} = 1 if(!defined($conf->{edgeSpaces})); # Wipe edge space characters |
100 |
|
|
$conf->{multiSpaces} = 1 if(!defined($conf->{multiSpaces})); # Wipe multispaces |
101 |
|
|
$conf->{redundantSpaces} = 1 if(!defined($conf->{redundantSpaces})); # Wipe redundant spaces |
102 |
|
|
$conf->{compositeWordsLength} = 10 if(!defined($conf->{compositeWordsLength})); # The maximim length of composite word to be put inside <nobr> |
103 |
|
|
$conf->{tagLf} = 1 if(!defined($conf->{tagLf})); # Wipe crs and lfs after droppped tag |
104 |
|
|
$conf->{nbsp} = 1 if(!defined($conf->{nbsp})); # Insert non-breaking spaces |
105 |
|
|
$conf->{quotes} = 1 if(!defined($conf->{quotes})); # Makeup quotes |
106 |
|
|
$conf->{qaType} = 0 if(!defined($conf->{qaType})); # Main quotes type |
107 |
|
|
$conf->{qbType} = 2 if(!defined($conf->{qbType})); # Nested quotes type |
108 |
|
|
$conf->{misc} = 1 if(!defined($conf->{misc})); # Misc substitutions |
109 |
|
|
$conf->{codeMode} = 2 if(!defined($conf->{codeMode})); # The way jevix should represent html special characters |
110 |
|
|
} |
111 |
|
|
|
112 |
|
|
# If tagsAllow came as a string |
113 |
|
|
if(defined($conf->{tagsAllow}) && !ref($conf->{tagsAllow})) { |
114 |
|
|
my $tmp = $class->parseTagsAllowString($conf->{tagsAllow}); |
115 |
|
|
$conf->{tagsAllow} = $tmp->{tagsAllow}; |
116 |
|
|
$conf->{tagsDenyAllAttributes} = $tmp->{tagsDenyAllAttributes}; |
117 |
|
|
} |
118 |
|
|
|
119 |
|
|
# If tagsDeny came as a string |
120 |
|
|
if(defined($conf->{tagsDeny}) && !ref($conf->{tagsDeny})) { |
121 |
|
|
$conf->{tagsDeny} = $class->parseTagsDenyString($conf->{tagsDeny}); |
122 |
|
|
} |
123 |
|
|
} |
124 |
|
|
|
125 |
|
|
# ==Imposing clear text |
126 |
|
|
sub makeup($$) { |
127 |
|
|
my ($class, $conf) = @_; |
128 |
|
|
|
129 |
|
|
# ==Misc |
130 |
|
|
# Prepositions |
131 |
|
|
my $prp_rus = "а|без|безо|в|вне|во|да|для|до|за|и|из|изо|или|к|как|на|над|надо|не|ни|но|о|об|обо|около|от|ото|по|под|подо|при|про|с|сквозь|со|у|через"; |
132 |
|
|
my $prp_eng = "aboard|about|above|absent|across|after|against|along|alongside|amid|amidst|among|amongst|around|as|astride|at|atop|before|behind|below|beneath|beside|besides|between|beyond|but|by|despite|down|during|except|following|for|from|in|inside|into|like|mid|minus|near|nearest|notwithstanding|of|off|on|onto|opposite|out|outside|over|past|re|round|save|since|than|through|throughout|till|to|toward|towards|under|underneath|unlike|until|up|upon|via|with|within|without"; |
133 |
|
|
my $prp = "$prp_rus|$prp_eng"; |
134 |
|
|
|
135 |
|
|
my $letters = "A-Za-zА-Яа-яЁёЙй"; # Characters |
136 |
|
|
my $cap_letters = "A-ZА-ЯЁё"; # Capital characters |
137 |
|
|
|
138 |
|
|
my $sp = " \xA0\t"; # space class |
139 |
|
|
my $rt = "\r?\n"; # cr class |
140 |
|
|
|
141 |
|
|
my $br = "\x00\x0F.[\x01\x03].\x0F\x00"; # br tag |
142 |
|
|
my $pt = "\x00\x0F.[\x02].\x0F\x00"; # Paragraph tag |
143 |
|
|
my $ps = "\x00\x0F.[\x02][\x01\x03]\x0F\x00"; # Paragraph start |
144 |
|
|
my $pe = "\x00\x0F.[\x02][\x02\x00]\x0F\x00"; # Paragraph end |
145 |
|
|
my $to = "\x00\x0F..[\x03\x01]\x0F\x00"; # Opening tag |
146 |
|
|
my $tc = "\x00\x0F..[\x02\x00]\x0F\x00"; # Closing tag |
147 |
|
|
my $bb = "\x00\x0F..[\x02\x03]\x0F\x00"; # Tag where <nobr> is open |
148 |
|
|
my $nb = "\x00\x0F..[\x01\x00]\x0F\x00"; # Tag where no <nobr> open |
149 |
|
|
my $ts = "\x00\x0F"; # Tag start |
150 |
|
|
my $te = "\x0F\x00"; # Tag end |
151 |
|
|
|
152 |
|
|
my $brt = "<br *\/?>"; # br tag in text mode |
153 |
|
|
my $pst = "<p>"; |
154 |
|
|
my $pet = "</p>"; |
155 |
|
|
|
156 |
|
|
# Codes, metasymbols or what ever? |
157 |
|
|
my ($cdash, $cnbsp, $cdots, $cfracs, $ccopy, $creg); |
158 |
|
|
if(!$conf->{codeMode}) { |
159 |
|
|
($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("—", " ", "…", "©", "®"); |
160 |
|
|
$cfracs = {'1/4'=>"¼", '1/2'=>"½", '3/4'=>"¾"}; |
161 |
|
|
} elsif($conf->{codeMode} == 1) { |
162 |
|
|
($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("—", " ", "…", "©", "®"); |
163 |
|
|
$cfracs = {'1/4'=>"¼", '1/2'=>"½", '3/4'=>"¾"}; |
164 |
|
|
} else { |
165 |
|
|
($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("—", " ", "…", "©", "®"); |
166 |
|
|
$cfracs = {'1/4'=>"¼", '1/2'=>"½", '3/4'=>"¾"}; |
167 |
|
|
} |
168 |
|
|
|
169 |
|
|
# Wiping edge spaces |
170 |
|
|
if($conf->{edgeSpaces}) { $strip =~ s/^[$sp\r\n]*(.+?)[$sp\r\n]*$/$1/isg; } |
171 |
|
|
|
172 |
|
|
# Wiping spaces between tags (</td> </tr>) |
173 |
|
|
if($conf->{tagSpaces}) { $strip =~ s/($tc)[$sp]($tc)/$1$2/isg; } |
174 |
|
|
|
175 |
|
|
# Wiping multispaces |
176 |
|
|
if($conf->{multiSpaces}) { $strip =~ s/([$sp]){2,}/$1/ig; } |
177 |
|
|
|
178 |
|
|
# Wiping redundant spaces |
179 |
|
|
if($conf->{redundantSpaces}) { $strip =~ s{([$sp]+(?![:;]-[)(])([;:,.)?!]))|(\()(?<![:%;]-\()[$sp]+}{$1 ? $2 : $3}eig; } |
180 |
|
|
|
181 |
|
|
if($conf->{nbsp}) { |
182 |
|
|
# Prepositions with |
183 |
|
|
$strip =~ s/(^|\x00|[$sp])($prp)[$sp]([0-9$letters])/$1$2$cnbsp$3/gm; |
184 |
|
|
|
185 |
|
|
# with digits |
186 |
|
|
$strip =~ s{($nb|^)(.*?)($bb|$)}{ my ($a, $b, $c) = ($1, $2, $3); $b =~ s/([0-9]+)([$sp]+| | )(?:(?=[0-9]{2,})|(?=%))/$1$cnbsp/ig; "$a$b$c"; }eisg; |
187 |
|
|
} |
188 |
|
|
|
189 |
|
|
# Put composite words inside <nobr> |
190 |
|
|
if($conf->{compositeWords}) { $strip =~ s{($nb|^)(.*?)($bb|$)}{ my ($a, $b, $c) = ($1, $2, $3); |
191 |
|
|
$b =~ s{(^|[$sp\x00]| )([$letters]+(?:-[$letters]+)+)(?=$|[$sp\x00])}{ |
192 |
|
|
my $d = !defined($1) ? "" : $1; my $e = !defined($2) ? "" : $2; my $f = !defined($3) ? "" : $3; |
193 |
|
|
if(length($e) <= $conf->{compositeWordsLength}) { "$d<nobr>$e<\/nobr>" } else {"$d$e$f"} |
194 |
|
|
}eig; "$a$b$c"; |
195 |
|
|
}eisg; } |
196 |
|
|
|
197 |
|
|
# Dots |
198 |
|
|
if($conf->{dots}) { $strip =~ s/\.{3}|…|…/$cdots/ig; } |
199 |
|
|
|
200 |
|
|
# Dashes |
201 |
|
|
if($conf->{dashes}) { |
202 |
|
|
# Hyphen |
203 |
|
|
$strip =~ s/([^$sp])([$sp]| | )(-{1,2}|—|—|—)/$1$cnbsp$cdash/ig; |
204 |
|
|
# "Speech" hyphen |
205 |
|
|
$strip =~ s/((?:^|$ps|$br|$brt(?:$rt)*|[$rt]))[$sp]*(?: )*(-{1,2}|—|—|—)[$sp]*(?: )*(.)/$1$cdash$cnbsp$3/ig; |
206 |
|
|
} |
207 |
|
|
|
208 |
|
|
# Misc stuff |
209 |
|
|
if($conf->{misc}) { |
210 |
|
|
# Fracs |
211 |
|
|
$strip =~ s{(?:(?<=[$sp\x00])|(?<=^))([13])/([24])(?:(?=[$sp\x00])|(?=$))}{if(defined($cfracs->{"$1/$2"})) { $cfracs->{"$1/$2"} } else { "$1/$2" } }esg; |
212 |
|
|
# Copyright & registered |
213 |
|
|
$strip =~ s{(?:(?<=[$sp\x00])|(?<=^))(\([cr]\)|©|©)(?:(?=[$sp\x00?!;.,])|(?=$))}{ if((lc($1) eq "(c)") || (lc($1) eq "©") || ($1 eq "©")) {$ccopy} elsif((lc($1) eq "(r)") || (lc($1) eq "®") || ($1 eq "®")) {$creg} else { $2 } }eig; |
214 |
|
|
} |
215 |
|
|
|
216 |
|
|
# Paragraphs |
217 |
|
|
if($conf->{paragraphs}) { $strip =~ s{(^|$pe(?:$rt$rt)?|$rt$rt)(?!$ps)(.+?)($br)?($brt)?(?<!$pe)(?:(?=$)|(?=$rt$rt)|(?=$ps))}{ my ($a, $b, $c) = ($1,$2,$3||""); (($b =~ /^[ \r\n]+$/) || ($b =~ /^(<br *\/?>|$br)+$/)) ? "$a$b$c" : "$a<p>$b</p>";}eisg; } |
218 |
|
|
|
219 |
|
|
# Line break |
220 |
|
|
if($conf->{lineBreaks}) { $strip =~ s/(?<!$pt)(?<!$br)(?<!$br\r)(?<!$pe\r\n\r\n)(?<!$pe\n\n)(?<!$pe\r\n)(?<!$pe\n)(?<!$pe\r)(?<!$pe)(?<!$pet\r\n\r\n)(?<!$pet\r\n\r)(?<!$pet\n\n)(?<!$pet\r\n)(?<!$pet\n)(?<!$pet\r)(?<!$pet)(?<!$pst)($rt)(?!$brt)/<br \/>$1/isg; } |
221 |
|
|
} |
222 |
|
|
|
223 |
|
|
# ==impose quotes |
224 |
|
|
sub quotes($$) { |
225 |
|
|
my ($class, $conf) = @_; |
226 |
|
|
|
227 |
|
|
my $i; |
228 |
|
|
my ($a_open, $b_open) = (0,0); |
229 |
|
|
my ($cp, $c, $cn, $cn_is_sp, $cp_is_sp) = ('', '', '', 0, 0); |
230 |
|
|
my ($qaStart, $qaEnd, $qbStart, $qbEnd); |
231 |
|
|
my (@qs, @qe, @qs_ansi, @qe_ansi, @qs_html, @qe_html, @qs_ent, @qe_ent,); |
232 |
|
|
|
233 |
|
|
# space class |
234 |
|
|
my $sp =" \t\xA0"; |
235 |
|
|
# characters |
236 |
|
|
my $letters = "A-Za-zА-Яа-яЁёЙй"; |
237 |
|
|
|
238 |
|
|
@qs_ansi = ("«", "“", "„", "‘", "‚", '"'); |
239 |
|
|
@qe_ansi = ("»", "”", "“", "’", "‘", '"'); |
240 |
|
|
@qs_html = ("«", "“", "„", "‘", "‚", """); |
241 |
|
|
@qe_html = ("»", "”", "“", "’", "‘", """); |
242 |
|
|
# << `` .. ` . " |
243 |
|
|
@qs_ent = ("«", "“", "„", "‘", "‚", """); |
244 |
|
|
# >> '' '' ' ` " |
245 |
|
|
@qe_ent = ("»", "”", "“", "’", "‘", """); |
246 |
|
|
|
247 |
|
|
# Quotes collection |
248 |
|
|
if(!$conf->{codeMode}) { |
249 |
|
|
@qs = @qs_ansi; @qe = @qe_ansi; |
250 |
|
|
} elsif ($conf->{codeMode} == 1) { |
251 |
|
|
@qs = @qs_html; @qe = @qe_html; |
252 |
|
|
} else { |
253 |
|
|
@qs = @qs_ent; @qe = @qe_ent; |
254 |
|
|
} |
255 |
|
|
|
256 |
|
|
# Getting configuration setting |
257 |
|
|
$conf->{qaType} ||= 0; |
258 |
|
|
$conf->{qbType} ||= 1; |
259 |
|
|
$conf->{qaType} = ($conf->{qaType} >= 0 && $conf->{qaType} <= 5) ? $conf->{qaType} : 0; |
260 |
|
|
$conf->{qbType} = ($conf->{qbType} >= 0 && $conf->{qbType} <= 5) ? $conf->{qbType} : 1; |
261 |
|
|
|
262 |
|
|
# Selecting quotes as requested by user |
263 |
|
|
($qaStart, $qaEnd) = ($qs[$conf->{qaType}], $qe[$conf->{qaType}]); |
264 |
|
|
($qbStart, $qbEnd) = ($qs[$conf->{qbType}], $qe[$conf->{qbType}]); |
265 |
|
|
|
266 |
|
|
# Resetting all the quotes inside text to <"> |
267 |
|
|
my $qa = join('|', @qs_ansi) . '|' . join('|', @qe_ansi) . '|' . join('|', @qs_html) . '|' . join('|', @qe_html) . '|' . join('|', @qs_ent) . '|' . join('|', @qe_ent); |
268 |
|
|
$strip =~ s/(?:(?:(?<=[^$letters])|(?<=^))($qa))|(?:($qa)(?:(?=[^$letters])|(?=$)))/\"/ig; |
269 |
|
|
|
270 |
|
|
my $spread = 1; |
271 |
|
|
my $mv = 0; |
272 |
|
|
my $mvn = 0; |
273 |
|
|
my @st; |
274 |
|
|
$i = 0; |
275 |
|
|
my $skip = 0; |
276 |
|
|
my @space; # Space tags flag |
277 |
|
|
my @break; # Text break flags |
278 |
|
|
|
279 |
|
|
$st[$_] = '' foreach(0..$spread + 1); |
280 |
|
|
$space[$_] = 0 foreach(0 + 1..$spread + 1); |
281 |
|
|
$break[$_] = 0 foreach(0 + 1..$spread + 1); |
282 |
|
|
$space[0] = 1; |
283 |
|
|
$break[0] = 1; |
284 |
|
|
|
285 |
|
|
while(1) { |
286 |
|
|
# Skipping tags |
287 |
|
|
foreach(0..$spread) { |
288 |
|
|
do { |
289 |
|
|
$skip = 0; |
290 |
|
|
if($i + $_ + $mv <= length($strip)) { |
291 |
|
|
if($i + $_ + $mv + 1 < length($strip)) { |
292 |
|
|
if((substr($strip, $i + $_ + $mv, 1) eq "\x00") && (substr($strip, $i + $_ + $mv + 1, 1) eq "\x0F")) { |
293 |
|
|
$space[$_ + 1] |= (ord(substr($strip, $i + $_ + $mv + 2, 1)) & 2) >> 1; |
294 |
|
|
$break[$_ + 1] |= ord(substr($strip, $i + $_ + $mv + 2, 1)) & 1; |
295 |
|
|
$mv += $markLength; |
296 |
|
|
if(!$_) { $mvn = $mv; } |
297 |
|
|
$st[$_ + 1] = ""; |
298 |
|
|
$skip = 1; |
299 |
|
|
} |
300 |
|
|
} |
301 |
|
|
if(!$skip) { $st[$_ + 1] = substr($strip, $i + $_ + $mv, 1); } |
302 |
|
|
} |
303 |
|
|
} while($skip); |
304 |
|
|
} |
305 |
|
|
|
306 |
|
|
$i += $mvn; |
307 |
|
|
$mv = 0; |
308 |
|
|
$mvn = 0; |
309 |
|
|
|
310 |
|
|
($cp, $c, $cn) = ($st[0], $st[1], $st[2]); |
311 |
|
|
$cp_is_sp = (($cp =~ /[^0-9$letters]/) || $space[0] || $space[1] || $break[0] || !$i) ? 1 : 0; |
312 |
|
|
$cn_is_sp = (($cn =~ /[^0-9$letters]/) || $space[2] || $break[2] || $cn eq '') ? 1 : 0; |
313 |
|
|
|
314 |
|
|
# Reset state if breaking tag appears |
315 |
|
|
if($break[1] || $i == length($strip)) { |
316 |
|
|
if($a_open || $b_open) { |
317 |
|
|
# Log quote error if appears |
318 |
|
|
if($conf->{logErrors}) { |
319 |
|
|
my $quoteErrSampleLength = 100; |
320 |
|
|
my $z = $i - 1; |
321 |
|
|
my $y; |
322 |
|
|
while(1) { |
323 |
|
|
if(substr($strip, $z, 1) eq " " || substr($strip, $z, 1) eq "\xA0" || !$z) { if($i-$z <= $quoteErrSampleLength) {$y = $z}} |
324 |
|
|
last if(!$z); |
325 |
|
|
$z--; |
326 |
|
|
} |
327 |
|
|
my $sample = substr($strip, $y, ($i - $y)); |
328 |
|
|
$sample =~ s/\x00\x0F[^\x0F]+\x0F\x00//g; |
329 |
|
|
$sample =~ s/<\/?[a-z]+.*?>//g; |
330 |
|
|
push(@{$result->{errorLog}}, {type=>"Quote_error", message=>"Quote mismatch near [$sample]<--"}); |
331 |
|
|
$result->{error} = 1; |
332 |
|
|
} |
333 |
|
|
} |
334 |
|
|
$a_open = 0; |
335 |
|
|
$b_open = 0; |
336 |
|
|
} |
337 |
|
|
|
338 |
|
|
if($c eq '"') { |
339 |
|
|
if(!$a_open) { |
340 |
|
|
$a_open = 1; |
341 |
|
|
substr($strip, $i, 1) = $qaStart; |
342 |
|
|
$i += length($qaStart) - 1; |
343 |
|
|
} elsif ($a_open && (($i == length($strip) - 1) || (!$b_open && $cn_is_sp))) { |
344 |
|
|
$a_open = 0; |
345 |
|
|
substr($strip, $i, 1) = $qaEnd; |
346 |
|
|
$i += length($qaEnd) - 1; |
347 |
|
|
} elsif ($a_open && !$b_open) { |
348 |
|
|
$b_open = 1; |
349 |
|
|
substr($strip, $i, 1) = $qbStart; |
350 |
|
|
$i += length($qbStart) - 1; |
351 |
|
|
} elsif ($a_open && $b_open) { |
352 |
|
|
$b_open = 0; |
353 |
|
|
substr($strip, $i, 1) = $qbEnd; |
354 |
|
|
$i += length($qbEnd) - 1; |
355 |
|
|
} |
356 |
|
|
} |
357 |
|
|
|
358 |
|
|
last if($i == length($strip)); |
359 |
|
|
$st[0] = $st[1]; |
360 |
|
|
$space[0] = $space[1]; |
361 |
|
|
$break[0] = $break[1]; |
362 |
|
|
$space[$_] = 0 foreach(0 + 1..$spread + 1); |
363 |
|
|
$break[$_] = 0 foreach(0 + 1..$spread + 1); |
364 |
|
|
$i++; |
365 |
|
|
} |
366 |
|
|
} |
367 |
|
|
|
368 |
|
|
# ==Cutting the tags away |
369 |
|
|
sub cuttags($$$$) { |
370 |
|
|
my($class, $text, $conf, $result) = @_; |
371 |
|
|
my $i = 0; # loop counter |
372 |
|
|
my $hop; # Jump length |
373 |
|
|
my ($c, $cn); # current & next character |
374 |
|
|
my ($tl, $ts, $te, $cl, $tagName, $tagBody, $tagContent); # tag length, tag dimensions, tag name, tag body text, single tag flag, content inside the tag |
375 |
|
|
my ($isTag, $isTagStart, $isSingle, $isSingleClosed, $isSpace, $isBreaking, $nobrIsOpen, $flagSet2, $flagSet1, $flagSet0); # some useful flags |
376 |
|
|
my @tagsOpen; # an array storing the info about all the tags currently open |
377 |
|
|
|
378 |
|
|
# space class |
379 |
|
|
my $sp =" \t\xA0"; |
380 |
|
|
|
381 |
|
|
while(1) { |
382 |
|
|
$hop = index($$text, "<", $i); |
383 |
|
|
|
384 |
|
|
if($hop < 0) { |
385 |
|
|
$strip .= substr($$text, $i, length($$text) - $i); |
386 |
|
|
last; |
387 |
|
|
} elsif($hop > 0) { |
388 |
|
|
$strip .= substr($$text, $i, $hop - $i); |
389 |
|
|
$i = $hop; |
390 |
|
|
} |
391 |
|
|
|
392 |
|
|
($c, $cn) = unpack("aa", substr($$text, $i, 2)); |
393 |
|
|
|
394 |
|
|
$isTag = 0; |
395 |
|
|
|
396 |
|
|
# =If tag opens |
397 |
|
|
$isTagStart = ($cn =~ /!|[a-z]/i) ? 1 : 0; |
398 |
|
|
if($isTagStart || ($cn eq "/")) { $isTag = 1; } |
399 |
|
|
|
400 |
|
|
if($isTag) { |
401 |
|
|
$ts = $i; # Tag start position |
402 |
|
|
$te = $isTagStart ? tagend($text, $ts) : index($$text, ">", $ts); # Tag end position |
403 |
|
|
|
404 |
|
|
if($te) { |
405 |
|
|
$tagBody = substr($$text, $ts, $te - $ts + 1); |
406 |
|
|
$tagName = $isTagStart ? ($tagBody =~ m/^<([a-z]+)/i)[0] : ($tagBody =~ m/^<\/\s*([a-z]+)/i)[0]; |
407 |
|
|
$tagName =~ tr/A-Z/a-z/; |
408 |
|
|
} |
409 |
|
|
|
410 |
|
|
if($te && $tagName) { |
411 |
|
|
$tagBody = substr($$text, $ts, $te - $ts + 1); |
412 |
|
|
$tagName = $isTagStart ? ($tagBody =~ m/^<([a-z]+)/i)[0] : ($tagBody =~ m/^<\/\s*([a-z]+)/i)[0]; |
413 |
|
|
$tagName =~ tr/A-Z/a-z/; |
414 |
|
|
|
415 |
|
|
# =Flags |
416 |
|
|
# Detecting whether the tag is single (self-closing) or double |
417 |
|
|
$isSingleClosed = 0; |
418 |
|
|
$isSingle = 0; |
419 |
|
|
if($isTagStart) { |
420 |
|
|
if(grep{$tagName eq $_} @singleTags) { |
421 |
|
|
$isSingle = 1; |
422 |
|
|
} elsif (substr($tagBody, length($tagBody) - 2, 1) eq "/") { |
423 |
|
|
$isSingle = 1; |
424 |
|
|
$isSingleClosed = 1; |
425 |
|
|
} |
426 |
|
|
} |
427 |
|
|
|
428 |
|
|
# Detecting wether this is space tag or not |
429 |
|
|
$isSpace = (grep{$tagName eq $_} @spaceTags) ? 1 : 0; |
430 |
|
|
|
431 |
|
|
# Detecting wether this is breaking tag or not |
432 |
|
|
$isBreaking = (grep{$tagName eq $_} @breakingTags) ? 1 : 0; |
433 |
|
|
|
434 |
|
|
# Tag Length |
435 |
|
|
$tl = $te - $ts + 1; |
436 |
|
|
|
437 |
|
|
# Updating the status for tags open |
438 |
|
|
if($conf->{checkHTML} && !$isSingle) { |
439 |
|
|
if($isTagStart) { |
440 |
|
|
push(@tagsOpen, $tagName); |
441 |
|
|
} else { |
442 |
|
|
if($tagsOpen[$#tagsOpen] ne $tagName) { |
443 |
|
|
# HTML error |
444 |
|
|
$result->{error} = 1; |
445 |
|
|
if($conf->{logErrors}) { push(@{$result->{errorLog}}, {type=>"HTML_Parse", position=>$i, message=>"Found closing tag <$tagName> while waiting tag <" . $tagsOpen[$#tagsOpen] . "> to close!"}); } |
446 |
|
|
} else { |
447 |
|
|
pop(@tagsOpen); |
448 |
|
|
} |
449 |
|
|
} |
450 |
|
|
} |
451 |
|
|
|
452 |
|
|
# Eating tag content for some tags like <script> |
453 |
|
|
$tagContent = ""; |
454 |
|
|
$cl = 0; |
455 |
|
|
if((grep{$tagName eq $_} @tagsToEat) && $isTagStart) { |
456 |
|
|
$cl = index($$text, "</$tagName>", $ts + $tl) - $ts - $tl; |
457 |
|
|
if($cl > 0) { |
458 |
|
|
$tagContent = substr($$text, $ts + $tl, $cl); |
459 |
|
|
} else { |
460 |
|
|
$cl = 0; |
461 |
|
|
$result->{error} = 1; |
462 |
|
|
if($conf->{logErrors}) { push(@{$result->{errorLog}}, {type=>"HTML_Parse", position=>$i, message=>"Can't find <$tagName> end!"}); } |
463 |
|
|
} |
464 |
|
|
} |
465 |
|
|
|
466 |
|
|
# Should I drop all the tags by default? |
467 |
|
|
my $dropTag = 0; |
468 |
|
|
if($conf->{tagsDenyAll}) { $dropTag = 1; } |
469 |
|
|
|
470 |
|
|
# Checking deny list |
471 |
|
|
if(defined($conf->{tagsDeny}) && !$dropTag) { |
472 |
|
|
if($conf->{tagsDeny}->{$tagName}) { $dropTag = 1; } |
473 |
|
|
} |
474 |
|
|
|
475 |
|
|
# Checking allow list |
476 |
|
|
if(defined($conf->{tagsAllow}) && $dropTag) { |
477 |
|
|
if($conf->{tagsAllow}->{$tagName}) { $dropTag = 0; } |
478 |
|
|
} |
479 |
|
|
|
480 |
|
|
# Nobr tag status |
481 |
|
|
if($tagName eq "nobr" && $isTagStart) { |
482 |
|
|
$nobrIsOpen = 1; |
483 |
|
|
} elsif(($tagName eq "nobr" && !$isTagStart) || (grep{$tagName eq $_} @breakingTags)) { |
484 |
|
|
$nobrIsOpen = 0; |
485 |
|
|
} |
486 |
|
|
|
487 |
|
|
# =Final part |
488 |
|
|
if(!$dropTag) { |
489 |
|
|
# =Processing tags |
490 |
|
|
# Tag name to lower case |
491 |
|
|
if($conf->{tagNamesToLower}) { |
492 |
|
|
if($isTagStart) { $tagBody = "<" . $tagName . substr($tagBody, length($tagName) + 1, length($tagBody) - length($tagName) - 1); } |
493 |
|
|
else { $tagBody =~ tr/A-Z/a-z/; } |
494 |
|
|
} |
495 |
|
|
# Tag name to upper case |
496 |
|
|
if($conf->{tagNamesToUpper}) { |
497 |
|
|
if($isTagStart) { $tagBody = "<" . uc($tagName) . substr($tagBody, length($tagName) + 1, length($tagBody) - length($tagName) - 1); } |
498 |
|
|
else { $tagBody =~ tr/a-z/A-Z/; } |
499 |
|
|
} |
500 |
|
|
# =Tag parameters to lower or upper case |
501 |
|
|
if($isTagStart && ($conf->{tagAttributesToLower} || $conf->{tagAttributesToUpper})) { |
502 |
|
|
# Regular parameters |
503 |
|
|
my $tmp = ""; |
504 |
|
|
while ($tagBody =~ m/([^\s]*\s*)(?:([a-z\r]+)(\s*)(?==)(=\s*))?/ig ) { |
505 |
|
|
$tmp .= $1 if ($1); if($conf->{tagAttributesToLower}) { if($2) { $tmp .= lc($2); } } else { if($2) { $tmp .= uc($2); } } $tmp .= $3 if ($3); $tmp .= $4 if ($4); $tmp .= $5 if ($5); |
506 |
|
|
} |
507 |
|
|
|
508 |
|
|
# Single parameters (like <checked>) |
509 |
|
|
if($conf->{tagAttributesToLower}) { $tagBody =~ s{(?<!=)( +([a-z]+))}{lc($1)}eig; } |
510 |
|
|
elsif($conf->{tagAttributesToUpper}) { $tagBody =~ s{(?<!=)( +([a-z]+))}{uc($1)}eig; } |
511 |
|
|
} |
512 |
|
|
|
513 |
|
|
# Simple XSS & tag attributes protection |
514 |
|
|
if($isTagStart && ($conf->{simpleXSS} || $conf->{tagsAllow}->{$tagName}->{validAttributes} || $conf->{tagsAllow}->{$tagName}->{invalidAttributes} || $conf->{tagsAllow}->{$tagName}->{denyAllAttributes} || $conf->{tagsDenyAllAttributes})) { |
515 |
|
|
$tagBody =~ s{(?<!<)(\s*)([a-z]+)([$sp]*=[$sp]*)("[^"]+"|[^$sp/>]+)} { |
516 |
|
|
my ($a, $b, $c, $d) = ($1||'', $2, $3, $4); |
517 |
|
|
if($conf->{simpleXSS} && ($b =~ /^on/ig || $d =~ /javascript|expression/ig)) { |
518 |
|
|
''; |
519 |
|
|
} elsif(($conf->{tagsDenyAllAttributes} || $conf->{tagsAllow}->{$tagName}->{denyAllAttributes} || ($conf->{tagsAllow}->{$tagName}->{invalidAttributes} && $conf->{tagsAllow}->{$tagName}->{invalidAttributes}->{$b})) |
520 |
|
|
&& !(($conf->{tagsAllow}->{$tagName}->{validAttributes} && $conf->{tagsAllow}->{$tagName}->{validAttributes}->{$b}) |
521 |
|
|
|| $conf->{tagsAllow}->{$tagName}->{allowAllAttributes}) |
522 |
|
|
) { |
523 |
|
|
''; |
524 |
|
|
} elsif($conf->{tagsAllow}->{$tagName}->{validAttributes} && !$conf->{tagsAllow}->{$tagName}->{validAttributes}->{$b}) { |
525 |
|
|
''; |
526 |
|
|
} else { |
527 |
|
|
$a . $b . $c . $d; |
528 |
|
|
} |
529 |
|
|
}eig; |
530 |
|
|
} |
531 |
|
|
|
532 |
|
|
# Close single tag |
533 |
|
|
if($conf->{tagCloseSingle} && $isSingle && !$isSingleClosed) { |
534 |
|
|
if(substr($tagBody, length($tagBody) - 2, 1) ne "/") { |
535 |
|
|
if(substr($tagBody, length($tagBody) - 2, 1) ne " ") { substr($tagBody, length($tagBody) - 2, 1) .= " /"; } else { substr($tagBody, length($tagBody) - 2, 1) .= "/"; } |
536 |
|
|
} |
537 |
|
|
} |
538 |
|
|
|
539 |
|
|
# Quote attribute values |
540 |
|
|
if($conf->{tagQuoteValues} && $isTagStart) { |
541 |
|
|
my $tmp = ""; |
542 |
|
|
# 1 23 4 5 6 |
543 |
|
|
while($tagBody =~ m/([<a-z >]+)?((=)(\s*)([^ >]+)([ >]+))?/ig) { |
544 |
|
|
$tmp .= $1 if($1); |
545 |
|
|
if($2) { |
546 |
|
|
$tmp .= $3 if($3); |
547 |
|
|
$tmp .= $4 if($4); |
548 |
|
|
if($5 && substr($5, 0, 1) ne '"' && substr($5, length($5) - 1, 1) ne '"') { $tmp .= "\"$5\""; } else { $tmp .= $5; } |
549 |
|
|
$tmp .= $6 if($6); |
550 |
|
|
} |
551 |
|
|
} |
552 |
|
|
$tagBody = $tmp; |
553 |
|
|
} |
554 |
|
|
|
555 |
|
|
# Unquote attribute values |
556 |
|
|
if($conf->{tagUnQuoteValues}) { |
557 |
|
|
$tagBody =~ s/([a-z]+)(\s*)(=)(\s*)"([^\=\s">]+)"/$1$2$3$4$5/ig; #" |
558 |
|
|
} |
559 |
|
|
|
560 |
|
|
# Saving the tag |
561 |
|
|
push(@$tags, {name=>$tagName, body=>$tagBody, content=>$tagContent}); |
562 |
|
|
|
563 |
|
|
# Forming flagSet |
564 |
|
|
# |
565 |
|
|
# |byte2: _ _ _ _ _ _ isSpace isBreaking| byte1: _ _ _ _ _ p br| byte0: _ _ _ _ nobr isTagStart |
566 |
|
|
$flagSet2 = 0; |
567 |
|
|
if($isSpace) { $flagSet2 |= 2; } |
568 |
|
|
if($isBreaking) { $flagSet2 |= 1; } |
569 |
|
|
$flagSet1 = 0; |
570 |
|
|
if($tagName eq "br") { $flagSet1 |= 1; } |
571 |
|
|
if($tagName eq "p") { $flagSet1 |= 2; } |
572 |
|
|
$flagSet0 = 0; |
573 |
|
|
if($isTagStart) { $flagSet0 |= 1; } |
574 |
|
|
if($nobrIsOpen) { $flagSet0 |= 2; } |
575 |
|
|
# Planting the marker |
576 |
|
|
$strip .= "\x00\x0F" . chr($flagSet2) . chr($flagSet1) . chr($flagSet0) . "\x0F\x00"; |
577 |
|
|
} |
578 |
|
|
|
579 |
|
|
# Moving the pointer (tag end position + content length) |
580 |
|
|
$i = $te + $cl; |
581 |
|
|
|
582 |
|
|
# Eating crs & lfs after dropped tag |
583 |
|
|
if($conf->{tagLf} && $dropTag) { |
584 |
|
|
while(1) { |
585 |
|
|
if(substr($$text, $i + 1, 1) eq "\r") { $i++; } elsif(substr($$text, $i + 1, 1) eq "\n") { $i++; last; } else { last } |
586 |
|
|
} |
587 |
|
|
} |
588 |
|
|
} |
589 |
|
|
} else { |
590 |
|
|
# This is not a tag, just add the "<" to result |
591 |
|
|
$strip .= $c; |
592 |
|
|
} |
593 |
|
|
|
594 |
|
|
last if($i == length($$text)); |
595 |
|
|
$i++; |
596 |
|
|
} |
597 |
|
|
} |
598 |
|
|
|
599 |
|
|
# ==Find where tag ends |
600 |
|
|
sub tagend($$$) { |
601 |
|
|
my ($text, $i) = @_; |
602 |
|
|
|
603 |
|
|
my $gotcha = 0; |
604 |
|
|
my $quote = 0; |
605 |
|
|
|
606 |
|
|
$i |= 0; |
607 |
|
|
|
608 |
|
|
while (1) { |
609 |
|
|
if (substr($$text, $i, 1) eq '"') { $quote ^= 1; } |
610 |
|
|
if (!$quote && substr($$text, $i, 1) eq '>') { $gotcha = $i; } |
611 |
|
|
last if ($i == length($$text) || $gotcha); |
612 |
|
|
$i++; |
613 |
|
|
} |
614 |
|
|
|
615 |
|
|
return $gotcha; |
616 |
|
|
} |
617 |
|
|
|
618 |
|
|
# ==Bring everything back to HTML |
619 |
|
|
sub planttags($$) { |
620 |
|
|
my ($class, $result) = @_; |
621 |
|
|
my $i = 0; |
622 |
|
|
my $max = length($strip); |
623 |
|
|
my $ctag = 0; |
624 |
|
|
my $step; |
625 |
|
|
|
626 |
|
|
while (1) { |
627 |
|
|
if($i < $max - 2 && substr($strip, $i, 2) eq "\x00\x0F") { |
628 |
|
|
$result->{text} .= $$tags[$ctag]->{body}; |
629 |
|
|
if($$tags[$ctag]->{content}) { $result->{text} .= $$tags[$ctag]->{content}; } |
630 |
|
|
$i += $markLength; |
631 |
|
|
$ctag++; |
632 |
|
|
} else { |
633 |
|
|
if($i < $max - 2) { $step = index($strip, "\x00\x0F", $i) - $i; } else { $step = $max - $i; } |
634 |
|
|
if($step < 0) { $step = $max - $i; } |
635 |
|
|
|
636 |
|
|
if($step >= 0) { |
637 |
|
|
$result->{text} .= substr($strip, $i, $step); |
638 |
|
|
$i += $step; |
639 |
|
|
} |
640 |
|
|
} |
641 |
|
|
|
642 |
|
|
last if($i == $max); |
643 |
|
|
} |
644 |
|
|
} |
645 |
|
|
|
646 |
|
|
# ==Bring the text to plain mode== |
647 |
|
|
sub vanish($$) { |
648 |
|
|
my($class, $text) = @_; |
649 |
|
|
|
650 |
|
|
$$text =~ s/«|“|„|‘|‚|"|»|”|“|’|«|“|„|‘|‚|"|»|”|’|«|“|„|‘|‚|"|»|”|’/"/ig; |
651 |
|
|
$$text =~ s/ | | / /ig; |
652 |
|
|
$$text =~ s/—|–|—|–|—|–/-/ig; |
653 |
|
|
$$text =~ s/…|…|…/.../ig; |
654 |
|
|
$$text =~ s/©|©|©/(c)/ig; |
655 |
|
|
$$text =~ s/®|®|®/(r)/ig; |
656 |
|
|
$$text =~ s/¼|¼|¼/1\/4/ig; |
657 |
|
|
$$text =~ s/½|½|½/1\/2/ig; |
658 |
|
|
$$text =~ s/¾|¾|¾/3\/4/ig; |
659 |
|
|
} |
660 |
|
|
|
661 |
|
|
# ==Parse the tagsAllow string advanced format== |
662 |
|
|
sub parseTagsAllowString($$) { |
663 |
|
|
my($class, $string) = @_; |
664 |
|
|
|
665 |
|
|
return {tagsAllow=>{}, tagsDenyAllAttributes=>0} if(!$string); |
666 |
|
|
|
667 |
|
|
my $tagsAllow = {}; |
668 |
|
|
my $tagsDenyAllAttributes = 0; |
669 |
|
|
|
670 |
|
|
# Should I deny all tag attributes by default? |
671 |
|
|
if(substr($string,0,1) eq '|') { |
672 |
|
|
$tagsDenyAllAttributes = 1; |
673 |
|
|
substr($string,0,1) = ''; |
674 |
|
|
}; |
675 |
|
|
|
676 |
|
|
# Parsing the Configuration String |
677 |
|
|
while($string =~ /([a-z:|]+)/ig) { |
678 |
|
|
my $tBody = $1; |
679 |
|
|
my ($tagName) = ($tBody =~ /^([a-z]+)/i)[0]; |
680 |
|
|
|
681 |
|
|
last if(!$tagName); |
682 |
|
|
|
683 |
|
|
my $attrList = (); |
684 |
|
|
$tagsAllow->{$tagName}->{val}=1; |
685 |
|
|
|
686 |
|
|
if($tBody =~ /^$tagName\|$/i) { |
687 |
|
|
$tagsAllow->{$tagName}->{denyAllAttributes}=1; |
688 |
|
|
} elsif($tBody =~ /^$tagName\:$/i) { |
689 |
|
|
$tagsAllow->{$tagName}->{allowAllAttributes}=1; |
690 |
|
|
} else { |
691 |
|
|
while($tBody =~ /:([a-z]+)/ig) { |
692 |
|
|
$tagsAllow->{$tagName}->{validAttributes}->{$1}=1; |
693 |
|
|
} |
694 |
|
|
|
695 |
|
|
while($tBody =~ /\|([a-z]+)/ig) { |
696 |
|
|
if(!$tagsAllow->{$tagName}->{validAttributes}->{$1}) { |
697 |
|
|
$tagsAllow->{$tagName}->{invalidAttributes}->{$1}=1; |
698 |
|
|
} |
699 |
|
|
} |
700 |
|
|
} |
701 |
|
|
} |
702 |
|
|
|
703 |
|
|
return {tagsAllow=>$tagsAllow, tagsDenyAllAttributes=>$tagsDenyAllAttributes}; |
704 |
|
|
} |
705 |
|
|
|
706 |
|
|
# ==Parse the tagsAllow string advanced format== |
707 |
|
|
sub parseTagsDenyString($$) { |
708 |
|
|
my($class, $string) = @_; |
709 |
|
|
|
710 |
|
|
return {} if(!$string); |
711 |
|
|
|
712 |
|
|
my $tagsDeny = {}; |
713 |
|
|
while($string =~ /([a-z]+)/ig) { |
714 |
|
|
$tagsDeny->{$1}->{val}=1; |
715 |
|
|
} |
716 |
|
|
|
717 |
|
|
return $tagsDeny; |
718 |
|
|
} |
719 |
|
|
|
720 |
|
|
return 1; |