1 |
3 |
ahitrov@rambler.ru |
package Jevix::Makeup; |
2 |
|
|
|
3 |
|
|
# ========================================== |
4 |
|
|
# |
5 |
|
|
# Jevix Version 0.9.5 (windows-1251) |
6 |
|
|
# |
7 |
|
|
# Developed by Igor Askarov |
8 |
|
|
# |
9 |
|
|
# Please send all suggestions to |
10 |
|
|
# Igor Askarov <juks@juks.ru> |
11 |
|
|
# http://www.jevix.ru/ |
12 |
|
|
# |
13 |
|
|
# Release date: 21/01/2008 |
14 |
|
|
# |
15 |
|
|
# === Methods list========================== |
16 |
|
|
# |
17 |
|
|
# new the constructor |
18 |
|
|
# procces entry sub |
19 |
|
|
# preset presets selector |
20 |
|
|
# makeup makeup the text |
21 |
|
|
# quotes quotes processor |
22 |
|
|
# cuttags tags processor |
23 |
|
|
# tagend looking fo tag end |
24 |
|
|
# planttags sub to bring the tags back |
25 |
|
|
# vanish sub to remove all the stuff and bring the text to plain mode |
26 |
|
|
# parseTagsAllowString parse the tagsAllow string to hash |
27 |
|
|
# parseTagsDenyString parse the tagsDeny string to hash |
28 |
|
|
# |
29 |
|
|
# ========================================== |
30 |
|
|
|
31 |
|
|
use strict; |
32 |
|
|
use warnings; |
33 |
|
|
|
34 |
|
|
my $markLength = 7; |
35 |
|
|
my $strip; |
36 |
|
|
my $result; |
37 |
|
|
my $tags; |
38 |
|
|
my $conf; |
39 |
|
|
|
40 |
|
|
my @singleTags = qw/link input spacer img br hr/; |
41 |
|
|
my @breakingTags = qw/p td div hr/; |
42 |
|
|
my @spaceTags = qw/br/; |
43 |
|
|
my @tagsToEat = qw/script style/; |
44 |
|
|
|
45 |
|
|
# ==The constructor |
46 |
|
|
sub new { |
47 |
|
|
my Jevix::Makeup $class = shift; |
48 |
|
|
|
49 |
|
|
return $class; |
50 |
|
|
} |
51 |
|
|
|
52 |
|
|
# ==Here we've got the input |
53 |
|
|
sub process($$$) { |
54 |
|
|
my($class, $text, $userConfig) = @_; |
55 |
|
|
|
56 |
|
|
$conf = $userConfig ? $userConfig : {presetBasic=>1}; |
57 |
|
|
$class->preset(); |
58 |
|
|
|
59 |
|
|
$strip = ""; |
60 |
|
|
$tags = []; |
61 |
|
|
|
62 |
|
|
$result = {}; |
63 |
|
|
$result->{error} = 0; |
64 |
|
|
$result->{errorLog} = []; |
65 |
|
|
|
66 |
|
|
if(!$conf->{isHTML}) { $strip = $$text; } else { $class->cuttags($text, $conf, $result); } |
67 |
|
|
if($conf->{quotes}) { $class->quotes($conf); } |
68 |
|
|
$class->makeup($conf); |
69 |
|
|
|
70 |
|
|
$result->{text} = ""; |
71 |
|
|
if($conf->{isHTML}) { $class->planttags($result); } else { $result->{text} = $strip; } |
72 |
|
|
|
73 |
|
|
return $result; |
74 |
|
|
} |
75 |
|
|
|
76 |
|
|
# ==Choosing default setup when necessary |
77 |
|
|
sub preset($$) { |
78 |
|
|
my ($class) = @_; |
79 |
|
|
|
80 |
|
|
if(!$conf || $conf->{presetBasic}) { |
81 |
|
|
$conf->{isHTML} = 1 if(!defined($conf->{isHTML})); # HTML mode |
82 |
|
|
$conf->{lineBreaks} = 1 if(!defined($conf->{lineBreaks})); # Linebreaks to <br/> |
83 |
|
|
$conf->{paragraphs} = 0 if(!defined($conf->{paragraphs})); # Paragraphs |
84 |
|
|
$conf->{dashes} = 1 if(!defined($conf->{dashes})); # Replace hyphens with dashes when necessary |
85 |
|
|
$conf->{dots} = 1 if(!defined($conf->{dots})); # Replace 3 dots with a symbol |
86 |
|
|
$conf->{edgeSpaces} = 1 if(!defined($conf->{edgeSpaces})); # Wipe edge space characters |
87 |
|
|
$conf->{multiSpaces} = 1 if(!defined($conf->{multiSpaces})); # Wipe multispaces |
88 |
|
|
$conf->{redundantSpaces} = 1 if(!defined($conf->{redundantSpaces})); # Wipe redundant spaces |
89 |
|
|
$conf->{compositeWordsLength} = 10 if(!defined($conf->{compositeWordsLength})); # The maximim length of composite word to be put inside <nobr> |
90 |
|
|
$conf->{tagLf} = 1 if(!defined($conf->{tagLf})); # Wipe crs and lfs after droppped tag |
91 |
|
|
$conf->{nbsp} = 1 if(!defined($conf->{nbsp})); # Insert non-breaking spaces |
92 |
|
|
$conf->{quotes} = 1 if(!defined($conf->{quotes})); # Makeup quotes |
93 |
|
|
$conf->{qaType} = 0 if(!defined($conf->{qaType})); # Main quotes type |
94 |
|
|
$conf->{qbType} = 2 if(!defined($conf->{qbType})); # Nested quotes type |
95 |
|
|
$conf->{misc} = 1 if(!defined($conf->{misc})); # Misc substitutions |
96 |
|
|
$conf->{codeMode} = 2 if(!defined($conf->{codeMode})); # The way jevix should represent html special characters |
97 |
|
|
} |
98 |
|
|
|
99 |
|
|
# If tagsAllow came as a string |
100 |
|
|
if(defined($conf->{tagsAllow}) && !ref($conf->{tagsAllow})) { |
101 |
|
|
my $tmp = $class->parseTagsAllowString($conf->{tagsAllow}); |
102 |
|
|
$conf->{tagsAllow} = $tmp->{tagsAllow}; |
103 |
|
|
$conf->{tagsDenyAllAttributes} = $tmp->{tagsDenyAllAttributes}; |
104 |
|
|
} |
105 |
|
|
|
106 |
|
|
# If tagsDeny came as a string |
107 |
|
|
if(defined($conf->{tagsDeny}) && !ref($conf->{tagsDeny})) { |
108 |
|
|
$conf->{tagsDeny} = $class->parseTagsDenyString($conf->{tagsDeny}); |
109 |
|
|
} |
110 |
|
|
} |
111 |
|
|
|
112 |
|
|
# ==Imposing clear text |
113 |
|
|
sub makeup($$) { |
114 |
|
|
my ($class, $conf) = @_; |
115 |
|
|
|
116 |
|
|
# ==Misc |
117 |
|
|
# Prepositions |
118 |
|
|
my $prp_rus = "�|���|����|�|���|��|��|���|��|��|�|��|���|���|�|���|��|���|����|��|��|��|�|��|���|�����|��|���|��|���|����|���|���|�|������|��|�|�����"; |
119 |
|
|
my $prp_eng = "aboard|about|above|absent|across|after|against|along|alongside|amid|amidst|among|amongst|around|as|astride|at|atop|before|behind|below|beneath|beside|besides|between|beyond|but|by|despite|down|during|except|following|for|from|in|inside|into|like|mid|minus|near|nearest|notwithstanding|of|off|on|onto|opposite|out|outside|over|past|re|round|save|since|than|through|throughout|till|to|toward|towards|under|underneath|unlike|until|up|upon|via|with|within|without"; |
120 |
|
|
my $prp = "$prp_rus|$prp_eng"; |
121 |
|
|
|
122 |
|
|
my $letters = "A-Za-z�-��-ѳ���"; # Characters |
123 |
|
|
my $cap_letters = "A-Z�-�"; # Capital characters |
124 |
|
|
|
125 |
|
|
my $sp = " \xA0\t"; # space class |
126 |
|
|
my $rt = "\r?\n"; # cr class |
127 |
|
|
|
128 |
|
|
my $br = "\x00\x0F.[\x01\x03].\x0F\x00"; # br tag |
129 |
|
|
my $pt = "\x00\x0F.[\x02].\x0F\x00"; # Paragraph tag |
130 |
|
|
my $ps = "\x00\x0F.[\x02][\x01\x03]\x0F\x00"; # Paragraph start |
131 |
|
|
my $pe = "\x00\x0F.[\x02][\x02\x00]\x0F\x00"; # Paragraph end |
132 |
|
|
my $to = "\x00\x0F..[\x03\x01]\x0F\x00"; # Opening tag |
133 |
|
|
my $tc = "\x00\x0F..[\x02\x00]\x0F\x00"; # Closing tag |
134 |
|
|
my $bb = "\x00\x0F..[\x02\x03]\x0F\x00"; # Tag where <nobr> is open |
135 |
|
|
my $nb = "\x00\x0F..[\x01\x00]\x0F\x00"; # Tag where no <nobr> open |
136 |
|
|
my $ts = "\x00\x0F"; # Tag start |
137 |
|
|
my $te = "\x0F\x00"; # Tag end |
138 |
|
|
|
139 |
|
|
my $brt = "<br *\/?>"; # br tag in text mode |
140 |
|
|
my $pst = "<p>"; |
141 |
|
|
my $pet = "</p>"; |
142 |
|
|
|
143 |
|
|
# Codes, metasymbols or what ever? |
144 |
|
|
my ($cdash, $cnbsp, $cdots, $cfracs, $ccopy, $creg); |
145 |
|
|
if(!$conf->{codeMode}) { |
146 |
|
|
($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("�", "�", "�", "�", "�"); |
147 |
|
|
$cfracs = {'1/4'=>"?", '1/2'=>"?", '3/4'=>"?"}; |
148 |
|
|
} elsif($conf->{codeMode} == 1) { |
149 |
|
|
($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("—", " ", "…", "©", "®"); |
150 |
|
|
$cfracs = {'1/4'=>"¼", '1/2'=>"½", '3/4'=>"¾"}; |
151 |
|
|
} else { |
152 |
|
|
($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("—", " ", "…", "©", "®"); |
153 |
|
|
$cfracs = {'1/4'=>"¼", '1/2'=>"½", '3/4'=>"¾"}; |
154 |
|
|
} |
155 |
|
|
|
156 |
|
|
# Wiping edge spaces |
157 |
|
|
if($conf->{edgeSpaces}) { $strip =~ s/^[$sp\r\n]*(.+?)[$sp\r\n]*$/$1/isg; } |
158 |
|
|
|
159 |
|
|
# Wiping spaces between tags (</td> </tr>) |
160 |
|
|
if($conf->{tagSpaces}) { $strip =~ s/($tc)[$sp]($tc)/$1$2/isg; } |
161 |
|
|
|
162 |
|
|
# Wiping multispaces |
163 |
|
|
if($conf->{multiSpaces}) { $strip =~ s/([$sp]){2,}/$1/ig; } |
164 |
|
|
|
165 |
|
|
# Wiping redundant spaces |
166 |
|
|
if($conf->{redundantSpaces}) { $strip =~ s{([$sp]+(?![:;]-[)(])([;:,.)?!]))|(\()(?<![:%;]-\()[$sp]+}{$1 ? $2 : $3}eig; } |
167 |
|
|
|
168 |
|
|
if($conf->{nbsp}) { |
169 |
|
|
# Prepositions with |
170 |
|
|
$strip =~ s/(^|\x00|[$sp])($prp)[$sp]([0-9$letters])/$1$2$cnbsp$3/gm; |
171 |
|
|
|
172 |
|
|
# with digits |
173 |
|
|
$strip =~ s{($nb|^)(.*?)($bb|$)}{ my ($a, $b, $c) = ($1, $2, $3); $b =~ s/([0-9]+)([$sp]+| | )(?:(?=[0-9]{2,})|(?=%))/$1$cnbsp/ig; "$a$b$c"; }eisg; |
174 |
|
|
} |
175 |
|
|
|
176 |
|
|
# Put composite words inside <nobr> |
177 |
|
|
if($conf->{compositeWords}) { $strip =~ s{($nb|^)(.*?)($bb|$)}{ my ($a, $b, $c) = ($1, $2, $3); |
178 |
|
|
$b =~ s{(^|[$sp\x00]| )([$letters]+(?:-[$letters]+)+)(?=$|[$sp\x00])}{ |
179 |
|
|
my $d = !defined($1) ? "" : $1; my $e = !defined($2) ? "" : $2; my $f = !defined($3) ? "" : $3; |
180 |
|
|
if(length($e) <= $conf->{compositeWordsLength}) { "$d<nobr>$e<\/nobr>" } else {"$d$e$f"} |
181 |
|
|
}eig; "$a$b$c"; |
182 |
|
|
}eisg; } |
183 |
|
|
|
184 |
|
|
# Dots |
185 |
|
|
if($conf->{dots}) { $strip =~ s/\.{3}|�|…/$cdots/ig; } |
186 |
|
|
|
187 |
|
|
# Dashes |
188 |
|
|
if($conf->{dashes}) { |
189 |
|
|
# Hyphen |
190 |
|
|
$strip =~ s/([^$sp])([$sp]| | )(-{1,2}|�|—|—)/$1$cnbsp$cdash/ig; |
191 |
|
|
# "Speech" hyphen |
192 |
|
|
$strip =~ s/((?:^|$ps|$br|$brt(?:$rt)*|[$rt]))[$sp]*(?: )*(-{1,2}|�|—|—)[$sp]*(?: )*(.)/$1$cdash$cnbsp$3/ig; |
193 |
|
|
} |
194 |
|
|
|
195 |
|
|
# Misc stuff |
196 |
|
|
if($conf->{misc}) { |
197 |
|
|
# Fracs |
198 |
|
|
$strip =~ s{(?:(?<=[$sp\x00])|(?<=^))([13])/([24])(?:(?=[$sp\x00])|(?=$))}{if(defined($cfracs->{"$1/$2"})) { $cfracs->{"$1/$2"} } else { "$1/$2" } }esg; |
199 |
|
|
# Copyright & registered |
200 |
|
|
$strip =~ s{(?:(?<=[$sp\x00])|(?<=^))(\([cr]\)|©|�)(?:(?=[$sp\x00?!;.,])|(?=$))}{ if((lc($1) eq "(c)") || (lc($1) eq "©") || ($1 eq "�")) {$ccopy} elsif((lc($1) eq "(r)") || (lc($1) eq "®") || ($1 eq "�")) {$creg} else { $2 } }eig; |
201 |
|
|
} |
202 |
|
|
|
203 |
|
|
# Paragraphs |
204 |
|
|
if($conf->{paragraphs}) { $strip =~ s{(^|$pe(?:$rt$rt)?|$rt$rt)(?!$ps)(.+?)($br)?($brt)?(?<!$pe)(?:(?=$)|(?=$rt$rt)|(?=$ps))}{ my ($a, $b, $c) = ($1,$2,$3||""); (($b =~ /^[ \r\n]+$/) || ($b =~ /^(<br *\/?>|$br)+$/)) ? "$a$b$c" : "$a<p>$b</p>";}eisg; } |
205 |
|
|
|
206 |
|
|
# Line break |
207 |
|
|
if($conf->{lineBreaks}) { $strip =~ s/(?<!$pt)(?<!$br)(?<!$br\r)(?<!$pe\r\n\r\n)(?<!$pe\n\n)(?<!$pe\r\n)(?<!$pe\n)(?<!$pe\r)(?<!$pe)(?<!$pet\r\n\r\n)(?<!$pet\r\n\r)(?<!$pet\n\n)(?<!$pet\r\n)(?<!$pet\n)(?<!$pet\r)(?<!$pet)(?<!$pst)($rt)(?!$brt)/<br \/>$1/isg; } |
208 |
|
|
} |
209 |
|
|
|
210 |
|
|
# ==impose quotes |
211 |
|
|
sub quotes($$) { |
212 |
|
|
my ($class, $conf) = @_; |
213 |
|
|
|
214 |
|
|
my $i; |
215 |
|
|
my ($a_open, $b_open) = (0,0); |
216 |
|
|
my ($cp, $c, $cn, $cn_is_sp, $cp_is_sp) = ('', '', '', 0, 0); |
217 |
|
|
my ($qaStart, $qaEnd, $qbStart, $qbEnd); |
218 |
|
|
my (@qs, @qe, @qs_ansi, @qe_ansi, @qs_html, @qe_html, @qs_ent, @qe_ent,); |
219 |
|
|
|
220 |
|
|
# space class |
221 |
|
|
my $sp =" \t\xA0"; |
222 |
|
|
# characters |
223 |
|
|
my $letters = "A-Za-z�-��-ѳ���"; |
224 |
|
|
|
225 |
|
|
@qs_ansi = ("�", "�", "�", "�", "�", '"'); |
226 |
|
|
@qe_ansi = ("�", "�", "�", "�", "�", '"'); |
227 |
|
|
@qs_html = ("«", "“", "„", "‘", "‚", """); |
228 |
|
|
@qe_html = ("»", "”", "“", "’", "‘", """); |
229 |
|
|
# << `` .. ` . " |
230 |
|
|
@qs_ent = ("«", "“", "„", "‘", "‚", """); |
231 |
|
|
# >> '' '' ' ` " |
232 |
|
|
@qe_ent = ("»", "”", "“", "’", "‘", """); |
233 |
|
|
|
234 |
|
|
# Quotes collection |
235 |
|
|
if(!$conf->{codeMode}) { |
236 |
|
|
@qs = @qs_ansi; @qe = @qe_ansi; |
237 |
|
|
} elsif ($conf->{codeMode} == 1) { |
238 |
|
|
@qs = @qs_html; @qe = @qe_html; |
239 |
|
|
} else { |
240 |
|
|
@qs = @qs_ent; @qe = @qe_ent; |
241 |
|
|
} |
242 |
|
|
|
243 |
|
|
# Getting configuration setting |
244 |
|
|
$conf->{qaType} ||= 0; |
245 |
|
|
$conf->{qbType} ||= 1; |
246 |
|
|
$conf->{qaType} = ($conf->{qaType} >= 0 && $conf->{qaType} <= 5) ? $conf->{qaType} : 0; |
247 |
|
|
$conf->{qbType} = ($conf->{qbType} >= 0 && $conf->{qbType} <= 5) ? $conf->{qbType} : 1; |
248 |
|
|
|
249 |
|
|
# Selecting quotes as requested by user |
250 |
|
|
($qaStart, $qaEnd) = ($qs[$conf->{qaType}], $qe[$conf->{qaType}]); |
251 |
|
|
($qbStart, $qbEnd) = ($qs[$conf->{qbType}], $qe[$conf->{qbType}]); |
252 |
|
|
|
253 |
|
|
# Resetting all the quotes inside text to <"> |
254 |
|
|
my $qa = join('|', @qs_ansi) . '|' . join('|', @qe_ansi) . '|' . join('|', @qs_html) . '|' . join('|', @qe_html) . '|' . join('|', @qs_ent) . '|' . join('|', @qe_ent); |
255 |
|
|
$strip =~ s/(?:(?:(?<=[^$letters])|(?<=^))($qa))|(?:($qa)(?:(?=[^$letters])|(?=$)))/\"/ig; |
256 |
|
|
|
257 |
|
|
my $spread = 1; |
258 |
|
|
my $mv = 0; |
259 |
|
|
my $mvn = 0; |
260 |
|
|
my @st; |
261 |
|
|
$i = 0; |
262 |
|
|
my $skip = 0; |
263 |
|
|
my @space; # Space tags flag |
264 |
|
|
my @break; # Text break flags |
265 |
|
|
|
266 |
|
|
$st[$_] = '' foreach(0..$spread + 1); |
267 |
|
|
$space[$_] = 0 foreach(0 + 1..$spread + 1); |
268 |
|
|
$break[$_] = 0 foreach(0 + 1..$spread + 1); |
269 |
|
|
$space[0] = 1; |
270 |
|
|
$break[0] = 1; |
271 |
|
|
|
272 |
|
|
while(1) { |
273 |
|
|
# Skipping tags |
274 |
|
|
foreach(0..$spread) { |
275 |
|
|
do { |
276 |
|
|
$skip = 0; |
277 |
|
|
if($i + $_ + $mv <= length($strip)) { |
278 |
|
|
if($i + $_ + $mv + 1 < length($strip)) { |
279 |
|
|
if((substr($strip, $i + $_ + $mv, 1) eq "\x00") && (substr($strip, $i + $_ + $mv + 1, 1) eq "\x0F")) { |
280 |
|
|
$space[$_ + 1] |= (ord(substr($strip, $i + $_ + $mv + 2, 1)) & 2) >> 1; |
281 |
|
|
$break[$_ + 1] |= ord(substr($strip, $i + $_ + $mv + 2, 1)) & 1; |
282 |
|
|
$mv += $markLength; |
283 |
|
|
if(!$_) { $mvn = $mv; } |
284 |
|
|
$st[$_ + 1] = ""; |
285 |
|
|
$skip = 1; |
286 |
|
|
} |
287 |
|
|
} |
288 |
|
|
if(!$skip) { $st[$_ + 1] = substr($strip, $i + $_ + $mv, 1); } |
289 |
|
|
} |
290 |
|
|
} while($skip); |
291 |
|
|
} |
292 |
|
|
|
293 |
|
|
$i += $mvn; |
294 |
|
|
$mv = 0; |
295 |
|
|
$mvn = 0; |
296 |
|
|
|
297 |
|
|
($cp, $c, $cn) = ($st[0], $st[1], $st[2]); |
298 |
|
|
$cp_is_sp = (($cp =~ /[^0-9$letters]/) || $space[0] || $space[1] || $break[0] || !$i) ? 1 : 0; |
299 |
|
|
$cn_is_sp = (($cn =~ /[^0-9$letters]/) || $space[2] || $break[2] || $cn eq '') ? 1 : 0; |
300 |
|
|
|
301 |
|
|
# Reset state if breaking tag appears |
302 |
|
|
if($break[1] || $i == length($strip)) { |
303 |
|
|
if($a_open || $b_open) { |
304 |
|
|
# Log quote error if appears |
305 |
|
|
if($conf->{logErrors}) { |
306 |
|
|
my $quoteErrSampleLength = 100; |
307 |
|
|
my $z = $i - 1; |
308 |
|
|
my $y; |
309 |
|
|
while(1) { |
310 |
|
|
if(substr($strip, $z, 1) eq " " || substr($strip, $z, 1) eq "\xA0" || !$z) { if($i-$z <= $quoteErrSampleLength) {$y = $z}} |
311 |
|
|
last if(!$z); |
312 |
|
|
$z--; |
313 |
|
|
} |
314 |
|
|
my $sample = substr($strip, $y, ($i - $y)); |
315 |
|
|
$sample =~ s/\x00\x0F[^\x0F]+\x0F\x00//g; |
316 |
|
|
$sample =~ s/<\/?[a-z]+.*?>//g; |
317 |
|
|
push(@{$result->{errorLog}}, {type=>"Quote_error", message=>"Quote mismatch near [$sample]<--"}); |
318 |
|
|
$result->{error} = 1; |
319 |
|
|
} |
320 |
|
|
} |
321 |
|
|
$a_open = 0; |
322 |
|
|
$b_open = 0; |
323 |
|
|
} |
324 |
|
|
|
325 |
|
|
if($c eq '"') { |
326 |
|
|
if(!$a_open) { |
327 |
|
|
$a_open = 1; |
328 |
|
|
substr($strip, $i, 1) = $qaStart; |
329 |
|
|
$i += length($qaStart) - 1; |
330 |
|
|
} elsif ($a_open && (($i == length($strip) - 1) || (!$b_open && $cn_is_sp))) { |
331 |
|
|
$a_open = 0; |
332 |
|
|
substr($strip, $i, 1) = $qaEnd; |
333 |
|
|
$i += length($qaEnd) - 1; |
334 |
|
|
} elsif ($a_open && !$b_open) { |
335 |
|
|
$b_open = 1; |
336 |
|
|
substr($strip, $i, 1) = $qbStart; |
337 |
|
|
$i += length($qbStart) - 1; |
338 |
|
|
} elsif ($a_open && $b_open) { |
339 |
|
|
$b_open = 0; |
340 |
|
|
substr($strip, $i, 1) = $qbEnd; |
341 |
|
|
$i += length($qbEnd) - 1; |
342 |
|
|
} |
343 |
|
|
} |
344 |
|
|
|
345 |
|
|
last if($i == length($strip)); |
346 |
|
|
$st[0] = $st[1]; |
347 |
|
|
$space[0] = $space[1]; |
348 |
|
|
$break[0] = $break[1]; |
349 |
|
|
$space[$_] = 0 foreach(0 + 1..$spread + 1); |
350 |
|
|
$break[$_] = 0 foreach(0 + 1..$spread + 1); |
351 |
|
|
$i++; |
352 |
|
|
} |
353 |
|
|
} |
354 |
|
|
|
355 |
|
|
# ==Cutting the tags away |
356 |
|
|
sub cuttags($$$$) { |
357 |
|
|
my($class, $text, $conf, $result) = @_; |
358 |
|
|
my $i = 0; # loop counter |
359 |
|
|
my $hop; # Jump length |
360 |
|
|
my ($c, $cn); # current & next character |
361 |
|
|
my ($tl, $ts, $te, $cl, $tagName, $tagBody, $tagContent); # tag length, tag dimensions, tag name, tag body text, single tag flag, content inside the tag |
362 |
|
|
my ($isTag, $isTagStart, $isSingle, $isSingleClosed, $isSpace, $isBreaking, $nobrIsOpen, $flagSet2, $flagSet1, $flagSet0); # some useful flags |
363 |
|
|
my @tagsOpen; # an array storing the info about all the tags currently open |
364 |
|
|
|
365 |
|
|
# space class |
366 |
|
|
my $sp =" \t\xA0"; |
367 |
|
|
|
368 |
|
|
while(1) { |
369 |
|
|
$hop = index($$text, "<", $i); |
370 |
|
|
|
371 |
|
|
if($hop < 0) { |
372 |
|
|
$strip .= substr($$text, $i, length($$text) - $i); |
373 |
|
|
last; |
374 |
|
|
} elsif($hop > 0) { |
375 |
|
|
$strip .= substr($$text, $i, $hop - $i); |
376 |
|
|
$i = $hop; |
377 |
|
|
} |
378 |
|
|
|
379 |
|
|
($c, $cn) = unpack("aa", substr($$text, $i, 2)); |
380 |
|
|
|
381 |
|
|
$isTag = 0; |
382 |
|
|
|
383 |
|
|
# =If tag opens |
384 |
|
|
$isTagStart = ($cn =~ /!|[a-z]/i) ? 1 : 0; |
385 |
|
|
if($isTagStart || ($cn eq "/")) { $isTag = 1; } |
386 |
|
|
|
387 |
|
|
if($isTag) { |
388 |
|
|
$ts = $i; # Tag start position |
389 |
|
|
$te = $isTagStart ? tagend($text, $ts) : index($$text, ">", $ts); # Tag end position |
390 |
|
|
|
391 |
|
|
if($te) { |
392 |
|
|
$tagBody = substr($$text, $ts, $te - $ts + 1); |
393 |
|
|
$tagName = $isTagStart ? ($tagBody =~ m/^<([a-z]+)/i)[0] : ($tagBody =~ m/^<\/\s*([a-z]+)/i)[0]; |
394 |
|
|
$tagName =~ tr/A-Z/a-z/; |
395 |
|
|
} |
396 |
|
|
|
397 |
|
|
if($te && $tagName) { |
398 |
|
|
$tagBody = substr($$text, $ts, $te - $ts + 1); |
399 |
|
|
$tagName = $isTagStart ? ($tagBody =~ m/^<([a-z]+)/i)[0] : ($tagBody =~ m/^<\/\s*([a-z]+)/i)[0]; |
400 |
|
|
$tagName =~ tr/A-Z/a-z/; |
401 |
|
|
|
402 |
|
|
# =Flags |
403 |
|
|
# Detecting whether the tag is single (self-closing) or double |
404 |
|
|
$isSingleClosed = 0; |
405 |
|
|
$isSingle = 0; |
406 |
|
|
if($isTagStart) { |
407 |
|
|
if(grep{$tagName eq $_} @singleTags) { |
408 |
|
|
$isSingle = 1; |
409 |
|
|
} elsif (substr($tagBody, length($tagBody) - 2, 1) eq "/") { |
410 |
|
|
$isSingle = 1; |
411 |
|
|
$isSingleClosed = 1; |
412 |
|
|
} |
413 |
|
|
} |
414 |
|
|
|
415 |
|
|
# Detecting wether this is space tag or not |
416 |
|
|
$isSpace = (grep{$tagName eq $_} @spaceTags) ? 1 : 0; |
417 |
|
|
|
418 |
|
|
# Detecting wether this is breaking tag or not |
419 |
|
|
$isBreaking = (grep{$tagName eq $_} @breakingTags) ? 1 : 0; |
420 |
|
|
|
421 |
|
|
# Tag Length |
422 |
|
|
$tl = $te - $ts + 1; |
423 |
|
|
|
424 |
|
|
# Updating the status for tags open |
425 |
|
|
if($conf->{checkHTML} && !$isSingle) { |
426 |
|
|
if($isTagStart) { |
427 |
|
|
push(@tagsOpen, $tagName); |
428 |
|
|
} else { |
429 |
|
|
if($tagsOpen[$#tagsOpen] ne $tagName) { |
430 |
|
|
# HTML error |
431 |
|
|
$result->{error} = 1; |
432 |
|
|
if($conf->{logErrors}) { push(@{$result->{errorLog}}, {type=>"HTML_Parse", position=>$i, message=>"Found closing tag <$tagName> while waiting tag <" . $tagsOpen[$#tagsOpen] . "> to close!"}); } |
433 |
|
|
} else { |
434 |
|
|
pop(@tagsOpen); |
435 |
|
|
} |
436 |
|
|
} |
437 |
|
|
} |
438 |
|
|
|
439 |
|
|
# Eating tag content for some tags like <script> |
440 |
|
|
$tagContent = ""; |
441 |
|
|
$cl = 0; |
442 |
|
|
if((grep{$tagName eq $_} @tagsToEat) && $isTagStart) { |
443 |
|
|
$cl = index($$text, "</$tagName>", $ts + $tl) - $ts - $tl; |
444 |
|
|
if($cl > 0) { |
445 |
|
|
$tagContent = substr($$text, $ts + $tl, $cl); |
446 |
|
|
} else { |
447 |
|
|
$cl = 0; |
448 |
|
|
$result->{error} = 1; |
449 |
|
|
if($conf->{logErrors}) { push(@{$result->{errorLog}}, {type=>"HTML_Parse", position=>$i, message=>"Can't find <$tagName> end!"}); } |
450 |
|
|
} |
451 |
|
|
} |
452 |
|
|
|
453 |
|
|
# Should I drop all the tags by default? |
454 |
|
|
my $dropTag = 0; |
455 |
|
|
if($conf->{tagsDenyAll}) { $dropTag = 1; } |
456 |
|
|
|
457 |
|
|
# Checking deny list |
458 |
|
|
if(defined($conf->{tagsDeny}) && !$dropTag) { |
459 |
|
|
if($conf->{tagsDeny}->{$tagName}) { $dropTag = 1; } |
460 |
|
|
} |
461 |
|
|
|
462 |
|
|
# Checking allow list |
463 |
|
|
if(defined($conf->{tagsAllow}) && $dropTag) { |
464 |
|
|
if($conf->{tagsAllow}->{$tagName}) { $dropTag = 0; } |
465 |
|
|
} |
466 |
|
|
|
467 |
|
|
# Nobr tag status |
468 |
|
|
if($tagName eq "nobr" && $isTagStart) { |
469 |
|
|
$nobrIsOpen = 1; |
470 |
|
|
} elsif(($tagName eq "nobr" && !$isTagStart) || (grep{$tagName eq $_} @breakingTags)) { |
471 |
|
|
$nobrIsOpen = 0; |
472 |
|
|
} |
473 |
|
|
|
474 |
|
|
# =Final part |
475 |
|
|
if(!$dropTag) { |
476 |
|
|
# =Processing tags |
477 |
|
|
# Tag name to lower case |
478 |
|
|
if($conf->{tagNamesToLower}) { |
479 |
|
|
if($isTagStart) { $tagBody = "<" . $tagName . substr($tagBody, length($tagName) + 1, length($tagBody) - length($tagName) - 1); } |
480 |
|
|
else { $tagBody =~ tr/A-Z/a-z/; } |
481 |
|
|
} |
482 |
|
|
# Tag name to upper case |
483 |
|
|
if($conf->{tagNamesToUpper}) { |
484 |
|
|
if($isTagStart) { $tagBody = "<" . uc($tagName) . substr($tagBody, length($tagName) + 1, length($tagBody) - length($tagName) - 1); } |
485 |
|
|
else { $tagBody =~ tr/a-z/A-Z/; } |
486 |
|
|
} |
487 |
|
|
# =Tag parameters to lower or upper case |
488 |
|
|
if($isTagStart && ($conf->{tagAttributesToLower} || $conf->{tagAttributesToUpper})) { |
489 |
|
|
# Regular parameters |
490 |
|
|
my $tmp = ""; |
491 |
|
|
while ($tagBody =~ m/([^\s]*\s*)(?:([a-z\r]+)(\s*)(?==)(=\s*))?/ig ) { |
492 |
|
|
$tmp .= $1 if ($1); if($conf->{tagAttributesToLower}) { if($2) { $tmp .= lc($2); } } else { if($2) { $tmp .= uc($2); } } $tmp .= $3 if ($3); $tmp .= $4 if ($4); $tmp .= $5 if ($5); |
493 |
|
|
} |
494 |
|
|
|
495 |
|
|
# Single parameters (like <checked>) |
496 |
|
|
if($conf->{tagAttributesToLower}) { $tagBody =~ s{(?<!=)( +([a-z]+))}{lc($1)}eig; } |
497 |
|
|
elsif($conf->{tagAttributesToUpper}) { $tagBody =~ s{(?<!=)( +([a-z]+))}{uc($1)}eig; } |
498 |
|
|
} |
499 |
|
|
|
500 |
|
|
# Simple XSS & tag attributes protection |
501 |
|
|
if($isTagStart && ($conf->{simpleXSS} || $conf->{tagsAllow}->{$tagName}->{validAttributes} || $conf->{tagsAllow}->{$tagName}->{invalidAttributes} || $conf->{tagsAllow}->{$tagName}->{denyAllAttributes} || $conf->{tagsDenyAllAttributes})) { |
502 |
|
|
$tagBody =~ s{(?<!<)(\s*)([a-z]+)([$sp]*=[$sp]*)("[^"]+"|[^$sp/>]+)} { |
503 |
|
|
my ($a, $b, $c, $d) = ($1||'', $2, $3, $4); |
504 |
|
|
if($conf->{simpleXSS} && ($b =~ /^on/ig || $d =~ /javascript|expression/ig)) { |
505 |
|
|
''; |
506 |
|
|
} elsif(($conf->{tagsDenyAllAttributes} || $conf->{tagsAllow}->{$tagName}->{denyAllAttributes} || ($conf->{tagsAllow}->{$tagName}->{invalidAttributes} && $conf->{tagsAllow}->{$tagName}->{invalidAttributes}->{$b})) |
507 |
|
|
&& !(($conf->{tagsAllow}->{$tagName}->{validAttributes} && $conf->{tagsAllow}->{$tagName}->{validAttributes}->{$b}) |
508 |
|
|
|| $conf->{tagsAllow}->{$tagName}->{allowAllAttributes}) |
509 |
|
|
) { |
510 |
|
|
''; |
511 |
|
|
} elsif($conf->{tagsAllow}->{$tagName}->{validAttributes} && !$conf->{tagsAllow}->{$tagName}->{validAttributes}->{$b}) { |
512 |
|
|
''; |
513 |
|
|
} else { |
514 |
|
|
$a . $b . $c . $d; |
515 |
|
|
} |
516 |
|
|
}eig; |
517 |
|
|
} |
518 |
|
|
|
519 |
|
|
# Close single tag |
520 |
|
|
if($conf->{tagCloseSingle} && $isSingle && !$isSingleClosed) { |
521 |
|
|
if(substr($tagBody, length($tagBody) - 2, 1) ne "/") { |
522 |
|
|
if(substr($tagBody, length($tagBody) - 2, 1) ne " ") { substr($tagBody, length($tagBody) - 2, 1) .= " /"; } else { substr($tagBody, length($tagBody) - 2, 1) .= "/"; } |
523 |
|
|
} |
524 |
|
|
} |
525 |
|
|
|
526 |
|
|
# Quote attribute values |
527 |
|
|
if($conf->{tagQuoteValues} && $isTagStart) { |
528 |
|
|
my $tmp = ""; |
529 |
|
|
# 1 23 4 5 6 |
530 |
|
|
while($tagBody =~ m/([<a-z >]+)?((=)(\s*)([^ >]+)([ >]+))?/ig) { |
531 |
|
|
$tmp .= $1 if($1); |
532 |
|
|
if($2) { |
533 |
|
|
$tmp .= $3 if($3); |
534 |
|
|
$tmp .= $4 if($4); |
535 |
|
|
if($5 && substr($5, 0, 1) ne '"' && substr($5, length($5) - 1, 1) ne '"') { $tmp .= "\"$5\""; } else { $tmp .= $5; } |
536 |
|
|
$tmp .= $6 if($6); |
537 |
|
|
} |
538 |
|
|
} |
539 |
|
|
$tagBody = $tmp; |
540 |
|
|
} |
541 |
|
|
|
542 |
|
|
# Unquote attribute values |
543 |
|
|
if($conf->{tagUnQuoteValues}) { |
544 |
|
|
$tagBody =~ s/([a-z]+)(\s*)(=)(\s*)"([^\=\s">]+)"/$1$2$3$4$5/ig; #" |
545 |
|
|
} |
546 |
|
|
|
547 |
|
|
# Saving the tag |
548 |
|
|
push(@$tags, {name=>$tagName, body=>$tagBody, content=>$tagContent}); |
549 |
|
|
|
550 |
|
|
# Forming flagSet |
551 |
|
|
# |
552 |
|
|
# |byte2: _ _ _ _ _ _ isSpace isBreaking| byte1: _ _ _ _ _ p br| byte0: _ _ _ _ nobr isTagStart |
553 |
|
|
$flagSet2 = 0; |
554 |
|
|
if($isSpace) { $flagSet2 |= 2; } |
555 |
|
|
if($isBreaking) { $flagSet2 |= 1; } |
556 |
|
|
$flagSet1 = 0; |
557 |
|
|
if($tagName eq "br") { $flagSet1 |= 1; } |
558 |
|
|
if($tagName eq "p") { $flagSet1 |= 2; } |
559 |
|
|
$flagSet0 = 0; |
560 |
|
|
if($isTagStart) { $flagSet0 |= 1; } |
561 |
|
|
if($nobrIsOpen) { $flagSet0 |= 2; } |
562 |
|
|
# Planting the marker |
563 |
|
|
$strip .= "\x00\x0F" . chr($flagSet2) . chr($flagSet1) . chr($flagSet0) . "\x0F\x00"; |
564 |
|
|
} |
565 |
|
|
|
566 |
|
|
# Moving the pointer (tag end position + content length) |
567 |
|
|
$i = $te + $cl; |
568 |
|
|
|
569 |
|
|
# Eating crs & lfs after dropped tag |
570 |
|
|
if($conf->{tagLf} && $dropTag) { |
571 |
|
|
while(1) { |
572 |
|
|
if(substr($$text, $i + 1, 1) eq "\r") { $i++; } elsif(substr($$text, $i + 1, 1) eq "\n") { $i++; last; } else { last } |
573 |
|
|
} |
574 |
|
|
} |
575 |
|
|
} |
576 |
|
|
} else { |
577 |
|
|
# This is not a tag, just add the "<" to result |
578 |
|
|
$strip .= $c; |
579 |
|
|
} |
580 |
|
|
|
581 |
|
|
last if($i == length($$text)); |
582 |
|
|
$i++; |
583 |
|
|
} |
584 |
|
|
} |
585 |
|
|
|
586 |
|
|
# ==Find where tag ends |
587 |
|
|
sub tagend($$$) { |
588 |
|
|
my ($text, $i) = @_; |
589 |
|
|
|
590 |
|
|
my $gotcha = 0; |
591 |
|
|
my $quote = 0; |
592 |
|
|
|
593 |
|
|
$i |= 0; |
594 |
|
|
|
595 |
|
|
while (1) { |
596 |
|
|
if (substr($$text, $i, 1) eq '"') { $quote ^= 1; } |
597 |
|
|
if (!$quote && substr($$text, $i, 1) eq '>') { $gotcha = $i; } |
598 |
|
|
last if ($i == length($$text) || $gotcha); |
599 |
|
|
$i++; |
600 |
|
|
} |
601 |
|
|
|
602 |
|
|
return $gotcha; |
603 |
|
|
} |
604 |
|
|
|
605 |
|
|
# ==Bring everything back to HTML |
606 |
|
|
sub planttags($$) { |
607 |
|
|
my ($class, $result) = @_; |
608 |
|
|
my $i = 0; |
609 |
|
|
my $max = length($strip); |
610 |
|
|
my $ctag = 0; |
611 |
|
|
my $step; |
612 |
|
|
|
613 |
|
|
while (1) { |
614 |
|
|
if($i < $max - 2 && substr($strip, $i, 2) eq "\x00\x0F") { |
615 |
|
|
$result->{text} .= $$tags[$ctag]->{body}; |
616 |
|
|
if($$tags[$ctag]->{content}) { $result->{text} .= $$tags[$ctag]->{content}; } |
617 |
|
|
$i += $markLength; |
618 |
|
|
$ctag++; |
619 |
|
|
} else { |
620 |
|
|
if($i < $max - 2) { $step = index($strip, "\x00\x0F", $i) - $i; } else { $step = $max - $i; } |
621 |
|
|
if($step < 0) { $step = $max - $i; } |
622 |
|
|
|
623 |
|
|
if($step >= 0) { |
624 |
|
|
$result->{text} .= substr($strip, $i, $step); |
625 |
|
|
$i += $step; |
626 |
|
|
} |
627 |
|
|
} |
628 |
|
|
|
629 |
|
|
last if($i == $max); |
630 |
|
|
} |
631 |
|
|
} |
632 |
|
|
|
633 |
|
|
# ==Bring the text to plain mode== |
634 |
|
|
sub vanish($$) { |
635 |
|
|
my($class, $text) = @_; |
636 |
|
|
|
637 |
|
|
$$text =~ s/«|“|„|‘|‚|"|»|”|“|’|«|“|„|‘|‚|"|»|”|’|�|�|�|�|�|"|�|�|�/"/ig; |
638 |
|
|
$$text =~ s/ | |�/ /ig; |
639 |
|
|
$$text =~ s/—|–|—|–|�|�/-/ig; |
640 |
|
|
$$text =~ s/…|…|�/.../ig; |
641 |
|
|
$$text =~ s/©|©|�/(c)/ig; |
642 |
|
|
$$text =~ s/®|®|�/(r)/ig; |
643 |
|
|
$$text =~ s/¼|¼/1\/4/ig; |
644 |
|
|
$$text =~ s/½|½/1\/2/ig; |
645 |
|
|
$$text =~ s/¾|¾/3\/4/ig; |
646 |
|
|
} |
647 |
|
|
|
648 |
|
|
# ==Parse the tagsAllow string advanced format== |
649 |
|
|
sub parseTagsAllowString($$) { |
650 |
|
|
my($class, $string) = @_; |
651 |
|
|
|
652 |
|
|
return {tagsAllow=>{}, tagsDenyAllAttributes=>0} if(!$string); |
653 |
|
|
|
654 |
|
|
my $tagsAllow = {}; |
655 |
|
|
my $tagsDenyAllAttributes = 0; |
656 |
|
|
|
657 |
|
|
# Should I deny all tag attributes by default? |
658 |
|
|
if(substr($string,0,1) eq '|') { |
659 |
|
|
$tagsDenyAllAttributes = 1; |
660 |
|
|
substr($string,0,1) = ''; |
661 |
|
|
}; |
662 |
|
|
|
663 |
|
|
# Parsing the Configuration String |
664 |
|
|
while($string =~ /([a-z:|]+)/ig) { |
665 |
|
|
my $tBody = $1; |
666 |
|
|
my ($tagName) = ($tBody =~ /^([a-z]+)/i)[0]; |
667 |
|
|
|
668 |
|
|
last if(!$tagName); |
669 |
|
|
|
670 |
|
|
my $attrList = (); |
671 |
|
|
$tagsAllow->{$tagName}->{val}=1; |
672 |
|
|
|
673 |
|
|
if($tBody =~ /^$tagName\|$/i) { |
674 |
|
|
$tagsAllow->{$tagName}->{denyAllAttributes}=1; |
675 |
|
|
} elsif($tBody =~ /^$tagName\:$/i) { |
676 |
|
|
$tagsAllow->{$tagName}->{allowAllAttributes}=1; |
677 |
|
|
} else { |
678 |
|
|
while($tBody =~ /:([a-z]+)/ig) { |
679 |
|
|
$tagsAllow->{$tagName}->{validAttributes}->{$1}=1; |
680 |
|
|
} |
681 |
|
|
|
682 |
|
|
while($tBody =~ /\|([a-z]+)/ig) { |
683 |
|
|
if(!$tagsAllow->{$tagName}->{validAttributes}->{$1}) { |
684 |
|
|
$tagsAllow->{$tagName}->{invalidAttributes}->{$1}=1; |
685 |
|
|
} |
686 |
|
|
} |
687 |
|
|
} |
688 |
|
|
} |
689 |
|
|
|
690 |
|
|
return {tagsAllow=>$tagsAllow, tagsDenyAllAttributes=>$tagsDenyAllAttributes}; |
691 |
|
|
} |
692 |
|
|
|
693 |
|
|
# ==Parse the tagsAllow string advanced format== |
694 |
|
|
sub parseTagsDenyString($$) { |
695 |
|
|
my($class, $string) = @_; |
696 |
|
|
|
697 |
|
|
return {} if(!$string); |
698 |
|
|
|
699 |
|
|
my $tagsDeny = {}; |
700 |
|
|
while($string =~ /([a-z]+)/ig) { |
701 |
|
|
$tagsDeny->{$1}->{val}=1; |
702 |
|
|
} |
703 |
|
|
|
704 |
|
|
return $tagsDeny; |
705 |
|
|
} |
706 |
|
|
|
707 |
|
|
return 1; |