Line # Revision Author
1 3 ahitrov@rambler.ru package Jevix::Makeup;
2
3 # ==========================================
4 #
5 # Jevix Version 0.9.5 (windows-1251)
6 #
7 # Developed by Igor Askarov
8 #
9 # Please send all suggestions to
10 # Igor Askarov <juks@juks.ru>
11 # http://www.jevix.ru/
12 #
13 # Release date: 21/01/2008
14 #
15 # === Methods list==========================
16 #
17 # new the constructor
18 # procces entry sub
19 # preset presets selector
20 # makeup makeup the text
21 # quotes quotes processor
22 # cuttags tags processor
23 # tagend looking fo tag end
24 # planttags sub to bring the tags back
25 # vanish sub to remove all the stuff and bring the text to plain mode
26 # parseTagsAllowString parse the tagsAllow string to hash
27 # parseTagsDenyString parse the tagsDeny string to hash
28 #
29 # ==========================================
30
31 use strict;
32 use warnings;
33
34 my $markLength = 7;
35 my $strip;
36 my $result;
37 my $tags;
38 my $conf;
39
40 my @singleTags = qw/link input spacer img br hr/;
41 my @breakingTags = qw/p td div hr/;
42 my @spaceTags = qw/br/;
43 my @tagsToEat = qw/script style/;
44
45 # ==The constructor
46 sub new {
47 my Jevix::Makeup $class = shift;
48
49 return $class;
50 }
51
52 # ==Here we've got the input
53 sub process($$$) {
54 my($class, $text, $userConfig) = @_;
55
56 $conf = $userConfig ? $userConfig : {presetBasic=>1};
57 $class->preset();
58
59 $strip = "";
60 $tags = [];
61
62 $result = {};
63 $result->{error} = 0;
64 $result->{errorLog} = [];
65
66 if(!$conf->{isHTML}) { $strip = $$text; } else { $class->cuttags($text, $conf, $result); }
67 if($conf->{quotes}) { $class->quotes($conf); }
68 $class->makeup($conf);
69
70 $result->{text} = "";
71 if($conf->{isHTML}) { $class->planttags($result); } else { $result->{text} = $strip; }
72
73 return $result;
74 }
75
76 # ==Choosing default setup when necessary
77 sub preset($$) {
78 my ($class) = @_;
79
80 if(!$conf || $conf->{presetBasic}) {
81 $conf->{isHTML} = 1 if(!defined($conf->{isHTML})); # HTML mode
82 $conf->{lineBreaks} = 1 if(!defined($conf->{lineBreaks})); # Linebreaks to <br/>
83 $conf->{paragraphs} = 0 if(!defined($conf->{paragraphs})); # Paragraphs
84 $conf->{dashes} = 1 if(!defined($conf->{dashes})); # Replace hyphens with dashes when necessary
85 $conf->{dots} = 1 if(!defined($conf->{dots})); # Replace 3 dots with a symbol
86 $conf->{edgeSpaces} = 1 if(!defined($conf->{edgeSpaces})); # Wipe edge space characters
87 $conf->{multiSpaces} = 1 if(!defined($conf->{multiSpaces})); # Wipe multispaces
88 $conf->{redundantSpaces} = 1 if(!defined($conf->{redundantSpaces})); # Wipe redundant spaces
89 $conf->{compositeWordsLength} = 10 if(!defined($conf->{compositeWordsLength})); # The maximim length of composite word to be put inside <nobr>
90 $conf->{tagLf} = 1 if(!defined($conf->{tagLf})); # Wipe crs and lfs after droppped tag
91 $conf->{nbsp} = 1 if(!defined($conf->{nbsp})); # Insert non-breaking spaces
92 $conf->{quotes} = 1 if(!defined($conf->{quotes})); # Makeup quotes
93 $conf->{qaType} = 0 if(!defined($conf->{qaType})); # Main quotes type
94 $conf->{qbType} = 2 if(!defined($conf->{qbType})); # Nested quotes type
95 $conf->{misc} = 1 if(!defined($conf->{misc})); # Misc substitutions
96 $conf->{codeMode} = 2 if(!defined($conf->{codeMode})); # The way jevix should represent html special characters
97 }
98
99 # If tagsAllow came as a string
100 if(defined($conf->{tagsAllow}) && !ref($conf->{tagsAllow})) {
101 my $tmp = $class->parseTagsAllowString($conf->{tagsAllow});
102 $conf->{tagsAllow} = $tmp->{tagsAllow};
103 $conf->{tagsDenyAllAttributes} = $tmp->{tagsDenyAllAttributes};
104 }
105
106 # If tagsDeny came as a string
107 if(defined($conf->{tagsDeny}) && !ref($conf->{tagsDeny})) {
108 $conf->{tagsDeny} = $class->parseTagsDenyString($conf->{tagsDeny});
109 }
110 }
111
112 # ==Imposing clear text
113 sub makeup($$) {
114 my ($class, $conf) = @_;
115
116 # ==Misc
117 # Prepositions
118 my $prp_rus = "�|���|����|�|���|��|��|���|��|��|�|��|���|���|�|���|��|���|����|��|��|��|�|��|���|�����|��|���|��|���|����|���|���|�|������|��|�|�����";
119 my $prp_eng = "aboard|about|above|absent|across|after|against|along|alongside|amid|amidst|among|amongst|around|as|astride|at|atop|before|behind|below|beneath|beside|besides|between|beyond|but|by|despite|down|during|except|following|for|from|in|inside|into|like|mid|minus|near|nearest|notwithstanding|of|off|on|onto|opposite|out|outside|over|past|re|round|save|since|than|through|throughout|till|to|toward|towards|under|underneath|unlike|until|up|upon|via|with|within|without";
120 my $prp = "$prp_rus|$prp_eng";
121
122 my $letters = "A-Za-z�-��-ѳ���"; # Characters
123 my $cap_letters = "A-Z�-�"; # Capital characters
124
125 my $sp = " \xA0\t"; # space class
126 my $rt = "\r?\n"; # cr class
127
128 my $br = "\x00\x0F.[\x01\x03].\x0F\x00"; # br tag
129 my $pt = "\x00\x0F.[\x02].\x0F\x00"; # Paragraph tag
130 my $ps = "\x00\x0F.[\x02][\x01\x03]\x0F\x00"; # Paragraph start
131 my $pe = "\x00\x0F.[\x02][\x02\x00]\x0F\x00"; # Paragraph end
132 my $to = "\x00\x0F..[\x03\x01]\x0F\x00"; # Opening tag
133 my $tc = "\x00\x0F..[\x02\x00]\x0F\x00"; # Closing tag
134 my $bb = "\x00\x0F..[\x02\x03]\x0F\x00"; # Tag where <nobr> is open
135 my $nb = "\x00\x0F..[\x01\x00]\x0F\x00"; # Tag where no <nobr> open
136 my $ts = "\x00\x0F"; # Tag start
137 my $te = "\x0F\x00"; # Tag end
138
139 my $brt = "<br *\/?>"; # br tag in text mode
140 my $pst = "<p>";
141 my $pet = "</p>";
142
143 # Codes, metasymbols or what ever?
144 my ($cdash, $cnbsp, $cdots, $cfracs, $ccopy, $creg);
145 if(!$conf->{codeMode}) {
146 ($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("�", "�", "�", "�", "�");
147 $cfracs = {'1/4'=>"?", '1/2'=>"?", '3/4'=>"?"};
148 } elsif($conf->{codeMode} == 1) {
149 ($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("&#151;", "&#160;", "&#133;", "&#169;", "&#174;");
150 $cfracs = {'1/4'=>"&#188;", '1/2'=>"&#189;", '3/4'=>"&#190;"};
151 } else {
152 ($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("&mdash;", "&nbsp;", "&hellip;", "&copy;", "&reg;");
153 $cfracs = {'1/4'=>"&frac14;", '1/2'=>"&frac12;", '3/4'=>"&frac34;"};
154 }
155
156 # Wiping edge spaces
157 if($conf->{edgeSpaces}) { $strip =~ s/^[$sp\r\n]*(.+?)[$sp\r\n]*$/$1/isg; }
158
159 # Wiping spaces between tags (</td> </tr>)
160 if($conf->{tagSpaces}) { $strip =~ s/($tc)[$sp]($tc)/$1$2/isg; }
161
162 # Wiping multispaces
163 if($conf->{multiSpaces}) { $strip =~ s/([$sp]){2,}/$1/ig; }
164
165 # Wiping redundant spaces
166 if($conf->{redundantSpaces}) { $strip =~ s{([$sp]+(?![:;]-[)(])([;:,.)?!]))|(\()(?<![:%;]-\()[$sp]+}{$1 ? $2 : $3}eig; }
167
168 if($conf->{nbsp}) {
169 # Prepositions with &nbsp;
170 $strip =~ s/(^|\x00|[$sp])($prp)[$sp]([0-9$letters])/$1$2$cnbsp$3/gm;
171
172 # &nbsp; with digits
173 $strip =~ s{($nb|^)(.*?)($bb|$)}{ my ($a, $b, $c) = ($1, $2, $3); $b =~ s/([0-9]+)([$sp]+|&nbsp;|&#160;)(?:(?=[0-9]{2,})|(?=%))/$1$cnbsp/ig; "$a$b$c"; }eisg;
174 }
175
176 # Put composite words inside <nobr>
177 if($conf->{compositeWords}) { $strip =~ s{($nb|^)(.*?)($bb|$)}{ my ($a, $b, $c) = ($1, $2, $3);
178 $b =~ s{(^|[$sp\x00]|&nbsp;)([$letters]+(?:-[$letters]+)+)(?=$|[$sp\x00])}{
179 my $d = !defined($1) ? "" : $1; my $e = !defined($2) ? "" : $2; my $f = !defined($3) ? "" : $3;
180 if(length($e) <= $conf->{compositeWordsLength}) { "$d<nobr>$e<\/nobr>" } else {"$d$e$f"}
181 }eig; "$a$b$c";
182 }eisg; }
183
184 # Dots
185 if($conf->{dots}) { $strip =~ s/\.{3}|�|&hellip;/$cdots/ig; }
186
187 # Dashes
188 if($conf->{dashes}) {
189 # Hyphen
190 $strip =~ s/([^$sp])([$sp]|&#160;|&nbsp;)(-{1,2}|�|&mdash;|&#151;)/$1$cnbsp$cdash/ig;
191 # "Speech" hyphen
192 $strip =~ s/((?:^|$ps|$br|$brt(?:$rt)*|[$rt]))[$sp]*(?:&nbsp;)*(-{1,2}|�|&mdash;|&#151;)[$sp]*(?:&nbsp;)*(.)/$1$cdash$cnbsp$3/ig;
193 }
194
195 # Misc stuff
196 if($conf->{misc}) {
197 # Fracs
198 $strip =~ s{(?:(?<=[$sp\x00])|(?<=^))([13])/([24])(?:(?=[$sp\x00])|(?=$))}{if(defined($cfracs->{"$1/$2"})) { $cfracs->{"$1/$2"} } else { "$1/$2" } }esg;
199 # Copyright & registered
200 $strip =~ s{(?:(?<=[$sp\x00])|(?<=^))(\([cr]\)|&copy;|�)(?:(?=[$sp\x00?!;.,])|(?=$))}{ if((lc($1) eq "(c)") || (lc($1) eq "&copy;") || ($1 eq "�")) {$ccopy} elsif((lc($1) eq "(r)") || (lc($1) eq "&reg;") || ($1 eq "�")) {$creg} else { $2 } }eig;
201 }
202
203 # Paragraphs
204 if($conf->{paragraphs}) { $strip =~ s{(^|$pe(?:$rt$rt)?|$rt$rt)(?!$ps)(.+?)($br)?($brt)?(?<!$pe)(?:(?=$)|(?=$rt$rt)|(?=$ps))}{ my ($a, $b, $c) = ($1,$2,$3||""); (($b =~ /^[ \r\n]+$/) || ($b =~ /^(<br *\/?>|$br)+$/)) ? "$a$b$c" : "$a<p>$b</p>";}eisg; }
205
206 # Line break
207 if($conf->{lineBreaks}) { $strip =~ s/(?<!$pt)(?<!$br)(?<!$br\r)(?<!$pe\r\n\r\n)(?<!$pe\n\n)(?<!$pe\r\n)(?<!$pe\n)(?<!$pe\r)(?<!$pe)(?<!$pet\r\n\r\n)(?<!$pet\r\n\r)(?<!$pet\n\n)(?<!$pet\r\n)(?<!$pet\n)(?<!$pet\r)(?<!$pet)(?<!$pst)($rt)(?!$brt)/<br \/>$1/isg; }
208 }
209
210 # ==impose quotes
211 sub quotes($$) {
212 my ($class, $conf) = @_;
213
214 my $i;
215 my ($a_open, $b_open) = (0,0);
216 my ($cp, $c, $cn, $cn_is_sp, $cp_is_sp) = ('', '', '', 0, 0);
217 my ($qaStart, $qaEnd, $qbStart, $qbEnd);
218 my (@qs, @qe, @qs_ansi, @qe_ansi, @qs_html, @qe_html, @qs_ent, @qe_ent,);
219
220 # space class
221 my $sp =" \t\xA0";
222 # characters
223 my $letters = "A-Za-z�-��-ѳ���";
224
225 @qs_ansi = ("�", "�", "�", "�", "�", '"');
226 @qe_ansi = ("�", "�", "�", "�", "�", '"');
227 @qs_html = ("&#171;", "&#147;", "&#132;", "&#145;", "&#130;", "&#34;");
228 @qe_html = ("&#187;", "&#148;", "&#147;", "&#146;", "&#145;", "&#34;");
229 # << `` .. ` . "
230 @qs_ent = ("&laquo;", "&ldquo;", "&bdquo;", "&lsquo;", "&sbquo;", "&quot;");
231 # >> '' '' ' ` "
232 @qe_ent = ("&raquo;", "&rdquo;", "&ldquo;", "&rsquo;", "&lsquo;", "&quot;");
233
234 # Quotes collection
235 if(!$conf->{codeMode}) {
236 @qs = @qs_ansi; @qe = @qe_ansi;
237 } elsif ($conf->{codeMode} == 1) {
238 @qs = @qs_html; @qe = @qe_html;
239 } else {
240 @qs = @qs_ent; @qe = @qe_ent;
241 }
242
243 # Getting configuration setting
244 $conf->{qaType} ||= 0;
245 $conf->{qbType} ||= 1;
246 $conf->{qaType} = ($conf->{qaType} >= 0 && $conf->{qaType} <= 5) ? $conf->{qaType} : 0;
247 $conf->{qbType} = ($conf->{qbType} >= 0 && $conf->{qbType} <= 5) ? $conf->{qbType} : 1;
248
249 # Selecting quotes as requested by user
250 ($qaStart, $qaEnd) = ($qs[$conf->{qaType}], $qe[$conf->{qaType}]);
251 ($qbStart, $qbEnd) = ($qs[$conf->{qbType}], $qe[$conf->{qbType}]);
252
253 # Resetting all the quotes inside text to <">
254 my $qa = join('|', @qs_ansi) . '|' . join('|', @qe_ansi) . '|' . join('|', @qs_html) . '|' . join('|', @qe_html) . '|' . join('|', @qs_ent) . '|' . join('|', @qe_ent);
255 $strip =~ s/(?:(?:(?<=[^$letters])|(?<=^))($qa))|(?:($qa)(?:(?=[^$letters])|(?=$)))/\"/ig;
256
257 my $spread = 1;
258 my $mv = 0;
259 my $mvn = 0;
260 my @st;
261 $i = 0;
262 my $skip = 0;
263 my @space; # Space tags flag
264 my @break; # Text break flags
265
266 $st[$_] = '' foreach(0..$spread + 1);
267 $space[$_] = 0 foreach(0 + 1..$spread + 1);
268 $break[$_] = 0 foreach(0 + 1..$spread + 1);
269 $space[0] = 1;
270 $break[0] = 1;
271
272 while(1) {
273 # Skipping tags
274 foreach(0..$spread) {
275 do {
276 $skip = 0;
277 if($i + $_ + $mv <= length($strip)) {
278 if($i + $_ + $mv + 1 < length($strip)) {
279 if((substr($strip, $i + $_ + $mv, 1) eq "\x00") && (substr($strip, $i + $_ + $mv + 1, 1) eq "\x0F")) {
280 $space[$_ + 1] |= (ord(substr($strip, $i + $_ + $mv + 2, 1)) & 2) >> 1;
281 $break[$_ + 1] |= ord(substr($strip, $i + $_ + $mv + 2, 1)) & 1;
282 $mv += $markLength;
283 if(!$_) { $mvn = $mv; }
284 $st[$_ + 1] = "";
285 $skip = 1;
286 }
287 }
288 if(!$skip) { $st[$_ + 1] = substr($strip, $i + $_ + $mv, 1); }
289 }
290 } while($skip);
291 }
292
293 $i += $mvn;
294 $mv = 0;
295 $mvn = 0;
296
297 ($cp, $c, $cn) = ($st[0], $st[1], $st[2]);
298 $cp_is_sp = (($cp =~ /[^0-9$letters]/) || $space[0] || $space[1] || $break[0] || !$i) ? 1 : 0;
299 $cn_is_sp = (($cn =~ /[^0-9$letters]/) || $space[2] || $break[2] || $cn eq '') ? 1 : 0;
300
301 # Reset state if breaking tag appears
302 if($break[1] || $i == length($strip)) {
303 if($a_open || $b_open) {
304 # Log quote error if appears
305 if($conf->{logErrors}) {
306 my $quoteErrSampleLength = 100;
307 my $z = $i - 1;
308 my $y;
309 while(1) {
310 if(substr($strip, $z, 1) eq " " || substr($strip, $z, 1) eq "\xA0" || !$z) { if($i-$z <= $quoteErrSampleLength) {$y = $z}}
311 last if(!$z);
312 $z--;
313 }
314 my $sample = substr($strip, $y, ($i - $y));
315 $sample =~ s/\x00\x0F[^\x0F]+\x0F\x00//g;
316 $sample =~ s/<\/?[a-z]+.*?>//g;
317 push(@{$result->{errorLog}}, {type=>"Quote_error", message=>"Quote mismatch near [$sample]<--"});
318 $result->{error} = 1;
319 }
320 }
321 $a_open = 0;
322 $b_open = 0;
323 }
324
325 if($c eq '"') {
326 if(!$a_open) {
327 $a_open = 1;
328 substr($strip, $i, 1) = $qaStart;
329 $i += length($qaStart) - 1;
330 } elsif ($a_open && (($i == length($strip) - 1) || (!$b_open && $cn_is_sp))) {
331 $a_open = 0;
332 substr($strip, $i, 1) = $qaEnd;
333 $i += length($qaEnd) - 1;
334 } elsif ($a_open && !$b_open) {
335 $b_open = 1;
336 substr($strip, $i, 1) = $qbStart;
337 $i += length($qbStart) - 1;
338 } elsif ($a_open && $b_open) {
339 $b_open = 0;
340 substr($strip, $i, 1) = $qbEnd;
341 $i += length($qbEnd) - 1;
342 }
343 }
344
345 last if($i == length($strip));
346 $st[0] = $st[1];
347 $space[0] = $space[1];
348 $break[0] = $break[1];
349 $space[$_] = 0 foreach(0 + 1..$spread + 1);
350 $break[$_] = 0 foreach(0 + 1..$spread + 1);
351 $i++;
352 }
353 }
354
355 # ==Cutting the tags away
356 sub cuttags($$$$) {
357 my($class, $text, $conf, $result) = @_;
358 my $i = 0; # loop counter
359 my $hop; # Jump length
360 my ($c, $cn); # current & next character
361 my ($tl, $ts, $te, $cl, $tagName, $tagBody, $tagContent); # tag length, tag dimensions, tag name, tag body text, single tag flag, content inside the tag
362 my ($isTag, $isTagStart, $isSingle, $isSingleClosed, $isSpace, $isBreaking, $nobrIsOpen, $flagSet2, $flagSet1, $flagSet0); # some useful flags
363 my @tagsOpen; # an array storing the info about all the tags currently open
364
365 # space class
366 my $sp =" \t\xA0";
367
368 while(1) {
369 $hop = index($$text, "<", $i);
370
371 if($hop < 0) {
372 $strip .= substr($$text, $i, length($$text) - $i);
373 last;
374 } elsif($hop > 0) {
375 $strip .= substr($$text, $i, $hop - $i);
376 $i = $hop;
377 }
378
379 ($c, $cn) = unpack("aa", substr($$text, $i, 2));
380
381 $isTag = 0;
382
383 # =If tag opens
384 $isTagStart = ($cn =~ /!|[a-z]/i) ? 1 : 0;
385 if($isTagStart || ($cn eq "/")) { $isTag = 1; }
386
387 if($isTag) {
388 $ts = $i; # Tag start position
389 $te = $isTagStart ? tagend($text, $ts) : index($$text, ">", $ts); # Tag end position
390
391 if($te) {
392 $tagBody = substr($$text, $ts, $te - $ts + 1);
393 $tagName = $isTagStart ? ($tagBody =~ m/^<([a-z]+)/i)[0] : ($tagBody =~ m/^<\/\s*([a-z]+)/i)[0];
394 $tagName =~ tr/A-Z/a-z/;
395 }
396
397 if($te && $tagName) {
398 $tagBody = substr($$text, $ts, $te - $ts + 1);
399 $tagName = $isTagStart ? ($tagBody =~ m/^<([a-z]+)/i)[0] : ($tagBody =~ m/^<\/\s*([a-z]+)/i)[0];
400 $tagName =~ tr/A-Z/a-z/;
401
402 # =Flags
403 # Detecting whether the tag is single (self-closing) or double
404 $isSingleClosed = 0;
405 $isSingle = 0;
406 if($isTagStart) {
407 if(grep{$tagName eq $_} @singleTags) {
408 $isSingle = 1;
409 } elsif (substr($tagBody, length($tagBody) - 2, 1) eq "/") {
410 $isSingle = 1;
411 $isSingleClosed = 1;
412 }
413 }
414
415 # Detecting wether this is space tag or not
416 $isSpace = (grep{$tagName eq $_} @spaceTags) ? 1 : 0;
417
418 # Detecting wether this is breaking tag or not
419 $isBreaking = (grep{$tagName eq $_} @breakingTags) ? 1 : 0;
420
421 # Tag Length
422 $tl = $te - $ts + 1;
423
424 # Updating the status for tags open
425 if($conf->{checkHTML} && !$isSingle) {
426 if($isTagStart) {
427 push(@tagsOpen, $tagName);
428 } else {
429 if($tagsOpen[$#tagsOpen] ne $tagName) {
430 # HTML error
431 $result->{error} = 1;
432 if($conf->{logErrors}) { push(@{$result->{errorLog}}, {type=>"HTML_Parse", position=>$i, message=>"Found closing tag <$tagName> while waiting tag <" . $tagsOpen[$#tagsOpen] . "> to close!"}); }
433 } else {
434 pop(@tagsOpen);
435 }
436 }
437 }
438
439 # Eating tag content for some tags like <script>
440 $tagContent = "";
441 $cl = 0;
442 if((grep{$tagName eq $_} @tagsToEat) && $isTagStart) {
443 $cl = index($$text, "</$tagName>", $ts + $tl) - $ts - $tl;
444 if($cl > 0) {
445 $tagContent = substr($$text, $ts + $tl, $cl);
446 } else {
447 $cl = 0;
448 $result->{error} = 1;
449 if($conf->{logErrors}) { push(@{$result->{errorLog}}, {type=>"HTML_Parse", position=>$i, message=>"Can't find <$tagName> end!"}); }
450 }
451 }
452
453 # Should I drop all the tags by default?
454 my $dropTag = 0;
455 if($conf->{tagsDenyAll}) { $dropTag = 1; }
456
457 # Checking deny list
458 if(defined($conf->{tagsDeny}) && !$dropTag) {
459 if($conf->{tagsDeny}->{$tagName}) { $dropTag = 1; }
460 }
461
462 # Checking allow list
463 if(defined($conf->{tagsAllow}) && $dropTag) {
464 if($conf->{tagsAllow}->{$tagName}) { $dropTag = 0; }
465 }
466
467 # Nobr tag status
468 if($tagName eq "nobr" && $isTagStart) {
469 $nobrIsOpen = 1;
470 } elsif(($tagName eq "nobr" && !$isTagStart) || (grep{$tagName eq $_} @breakingTags)) {
471 $nobrIsOpen = 0;
472 }
473
474 # =Final part
475 if(!$dropTag) {
476 # =Processing tags
477 # Tag name to lower case
478 if($conf->{tagNamesToLower}) {
479 if($isTagStart) { $tagBody = "<" . $tagName . substr($tagBody, length($tagName) + 1, length($tagBody) - length($tagName) - 1); }
480 else { $tagBody =~ tr/A-Z/a-z/; }
481 }
482 # Tag name to upper case
483 if($conf->{tagNamesToUpper}) {
484 if($isTagStart) { $tagBody = "<" . uc($tagName) . substr($tagBody, length($tagName) + 1, length($tagBody) - length($tagName) - 1); }
485 else { $tagBody =~ tr/a-z/A-Z/; }
486 }
487 # =Tag parameters to lower or upper case
488 if($isTagStart && ($conf->{tagAttributesToLower} || $conf->{tagAttributesToUpper})) {
489 # Regular parameters
490 my $tmp = "";
491 while ($tagBody =~ m/([^\s]*\s*)(?:([a-z\r]+)(\s*)(?==)(=\s*))?/ig ) {
492 $tmp .= $1 if ($1); if($conf->{tagAttributesToLower}) { if($2) { $tmp .= lc($2); } } else { if($2) { $tmp .= uc($2); } } $tmp .= $3 if ($3); $tmp .= $4 if ($4); $tmp .= $5 if ($5);
493 }
494
495 # Single parameters (like <checked>)
496 if($conf->{tagAttributesToLower}) { $tagBody =~ s{(?<!=)( +([a-z]+))}{lc($1)}eig; }
497 elsif($conf->{tagAttributesToUpper}) { $tagBody =~ s{(?<!=)( +([a-z]+))}{uc($1)}eig; }
498 }
499
500 # Simple XSS & tag attributes protection
501 if($isTagStart && ($conf->{simpleXSS} || $conf->{tagsAllow}->{$tagName}->{validAttributes} || $conf->{tagsAllow}->{$tagName}->{invalidAttributes} || $conf->{tagsAllow}->{$tagName}->{denyAllAttributes} || $conf->{tagsDenyAllAttributes})) {
502 $tagBody =~ s{(?<!<)(\s*)([a-z]+)([$sp]*=[$sp]*)("[^"]+"|[^$sp/>]+)} {
503 my ($a, $b, $c, $d) = ($1||'', $2, $3, $4);
504 if($conf->{simpleXSS} && ($b =~ /^on/ig || $d =~ /javascript|expression/ig)) {
505 '';
506 } elsif(($conf->{tagsDenyAllAttributes} || $conf->{tagsAllow}->{$tagName}->{denyAllAttributes} || ($conf->{tagsAllow}->{$tagName}->{invalidAttributes} && $conf->{tagsAllow}->{$tagName}->{invalidAttributes}->{$b}))
507 && !(($conf->{tagsAllow}->{$tagName}->{validAttributes} && $conf->{tagsAllow}->{$tagName}->{validAttributes}->{$b})
508 || $conf->{tagsAllow}->{$tagName}->{allowAllAttributes})
509 ) {
510 '';
511 } elsif($conf->{tagsAllow}->{$tagName}->{validAttributes} && !$conf->{tagsAllow}->{$tagName}->{validAttributes}->{$b}) {
512 '';
513 } else {
514 $a . $b . $c . $d;
515 }
516 }eig;
517 }
518
519 # Close single tag
520 if($conf->{tagCloseSingle} && $isSingle && !$isSingleClosed) {
521 if(substr($tagBody, length($tagBody) - 2, 1) ne "/") {
522 if(substr($tagBody, length($tagBody) - 2, 1) ne " ") { substr($tagBody, length($tagBody) - 2, 1) .= " /"; } else { substr($tagBody, length($tagBody) - 2, 1) .= "/"; }
523 }
524 }
525
526 # Quote attribute values
527 if($conf->{tagQuoteValues} && $isTagStart) {
528 my $tmp = "";
529 # 1 23 4 5 6
530 while($tagBody =~ m/([<a-z >]+)?((=)(\s*)([^ >]+)([ >]+))?/ig) {
531 $tmp .= $1 if($1);
532 if($2) {
533 $tmp .= $3 if($3);
534 $tmp .= $4 if($4);
535 if($5 && substr($5, 0, 1) ne '"' && substr($5, length($5) - 1, 1) ne '"') { $tmp .= "\"$5\""; } else { $tmp .= $5; }
536 $tmp .= $6 if($6);
537 }
538 }
539 $tagBody = $tmp;
540 }
541
542 # Unquote attribute values
543 if($conf->{tagUnQuoteValues}) {
544 $tagBody =~ s/([a-z]+)(\s*)(=)(\s*)"([^\=\s">]+)"/$1$2$3$4$5/ig; #"
545 }
546
547 # Saving the tag
548 push(@$tags, {name=>$tagName, body=>$tagBody, content=>$tagContent});
549
550 # Forming flagSet
551 #
552 # |byte2: _ _ _ _ _ _ isSpace isBreaking| byte1: _ _ _ _ _ p br| byte0: _ _ _ _ nobr isTagStart
553 $flagSet2 = 0;
554 if($isSpace) { $flagSet2 |= 2; }
555 if($isBreaking) { $flagSet2 |= 1; }
556 $flagSet1 = 0;
557 if($tagName eq "br") { $flagSet1 |= 1; }
558 if($tagName eq "p") { $flagSet1 |= 2; }
559 $flagSet0 = 0;
560 if($isTagStart) { $flagSet0 |= 1; }
561 if($nobrIsOpen) { $flagSet0 |= 2; }
562 # Planting the marker
563 $strip .= "\x00\x0F" . chr($flagSet2) . chr($flagSet1) . chr($flagSet0) . "\x0F\x00";
564 }
565
566 # Moving the pointer (tag end position + content length)
567 $i = $te + $cl;
568
569 # Eating crs & lfs after dropped tag
570 if($conf->{tagLf} && $dropTag) {
571 while(1) {
572 if(substr($$text, $i + 1, 1) eq "\r") { $i++; } elsif(substr($$text, $i + 1, 1) eq "\n") { $i++; last; } else { last }
573 }
574 }
575 }
576 } else {
577 # This is not a tag, just add the "<" to result
578 $strip .= $c;
579 }
580
581 last if($i == length($$text));
582 $i++;
583 }
584 }
585
586 # ==Find where tag ends
587 sub tagend($$$) {
588 my ($text, $i) = @_;
589
590 my $gotcha = 0;
591 my $quote = 0;
592
593 $i |= 0;
594
595 while (1) {
596 if (substr($$text, $i, 1) eq '"') { $quote ^= 1; }
597 if (!$quote && substr($$text, $i, 1) eq '>') { $gotcha = $i; }
598 last if ($i == length($$text) || $gotcha);
599 $i++;
600 }
601
602 return $gotcha;
603 }
604
605 # ==Bring everything back to HTML
606 sub planttags($$) {
607 my ($class, $result) = @_;
608 my $i = 0;
609 my $max = length($strip);
610 my $ctag = 0;
611 my $step;
612
613 while (1) {
614 if($i < $max - 2 && substr($strip, $i, 2) eq "\x00\x0F") {
615 $result->{text} .= $$tags[$ctag]->{body};
616 if($$tags[$ctag]->{content}) { $result->{text} .= $$tags[$ctag]->{content}; }
617 $i += $markLength;
618 $ctag++;
619 } else {
620 if($i < $max - 2) { $step = index($strip, "\x00\x0F", $i) - $i; } else { $step = $max - $i; }
621 if($step < 0) { $step = $max - $i; }
622
623 if($step >= 0) {
624 $result->{text} .= substr($strip, $i, $step);
625 $i += $step;
626 }
627 }
628
629 last if($i == $max);
630 }
631 }
632
633 # ==Bring the text to plain mode==
634 sub vanish($$) {
635 my($class, $text) = @_;
636
637 $$text =~ s/&laquo;|&ldquo;|&bdquo;|&lsquo;|&sbquo;|&quot;|&raquo;|&rdquo;|&ldquo;|&rsquo;|&#171;|&#147;|&#132;|&#145;|&#130;|&#34;|&#187;|&#148;|&#146;|�|�|�|�|�|"|�|�|�/"/ig;
638 $$text =~ s/&nbsp;|&#160;|�/ /ig;
639 $$text =~ s/&mdash;|&ndash;|&#151;|&#150;|�|�/-/ig;
640 $$text =~ s/&hellip;|&#133;|�/.../ig;
641 $$text =~ s/&copy;|&#169;|�/(c)/ig;
642 $$text =~ s/&reg;|&#174;|�/(r)/ig;
643 $$text =~ s/&frac14;|&#188;/1\/4/ig;
644 $$text =~ s/&frac12;|&#189;/1\/2/ig;
645 $$text =~ s/&frac34;|&#190;/3\/4/ig;
646 }
647
648 # ==Parse the tagsAllow string advanced format==
649 sub parseTagsAllowString($$) {
650 my($class, $string) = @_;
651
652 return {tagsAllow=>{}, tagsDenyAllAttributes=>0} if(!$string);
653
654 my $tagsAllow = {};
655 my $tagsDenyAllAttributes = 0;
656
657 # Should I deny all tag attributes by default?
658 if(substr($string,0,1) eq '|') {
659 $tagsDenyAllAttributes = 1;
660 substr($string,0,1) = '';
661 };
662
663 # Parsing the Configuration String
664 while($string =~ /([a-z:|]+)/ig) {
665 my $tBody = $1;
666 my ($tagName) = ($tBody =~ /^([a-z]+)/i)[0];
667
668 last if(!$tagName);
669
670 my $attrList = ();
671 $tagsAllow->{$tagName}->{val}=1;
672
673 if($tBody =~ /^$tagName\|$/i) {
674 $tagsAllow->{$tagName}->{denyAllAttributes}=1;
675 } elsif($tBody =~ /^$tagName\:$/i) {
676 $tagsAllow->{$tagName}->{allowAllAttributes}=1;
677 } else {
678 while($tBody =~ /:([a-z]+)/ig) {
679 $tagsAllow->{$tagName}->{validAttributes}->{$1}=1;
680 }
681
682 while($tBody =~ /\|([a-z]+)/ig) {
683 if(!$tagsAllow->{$tagName}->{validAttributes}->{$1}) {
684 $tagsAllow->{$tagName}->{invalidAttributes}->{$1}=1;
685 }
686 }
687 }
688 }
689
690 return {tagsAllow=>$tagsAllow, tagsDenyAllAttributes=>$tagsDenyAllAttributes};
691 }
692
693 # ==Parse the tagsAllow string advanced format==
694 sub parseTagsDenyString($$) {
695 my($class, $string) = @_;
696
697 return {} if(!$string);
698
699 my $tagsDeny = {};
700 while($string =~ /([a-z]+)/ig) {
701 $tagsDeny->{$1}->{val}=1;
702 }
703
704 return $tagsDeny;
705 }
706
707 return 1;