Line # Revision Author
1 3 ahitrov@rambler.ru package Jevix::MakeupUtf;
2
3 # ==========================================
4 #
5 # Jevix Version 0.9.5 (utf-8)
6 #
7 # Developed by Igor Askarov
8 #
9 # Please send all suggestions to
10 # Igor Askarov <juks@juks.ru>
11 # http://www.jevix.ru/
12 #
13 # Release date: 21/01/2008
14 #
15 # === Methods list==========================
16 #
17 # new the constructor
18 # procces entry sub
19 # preset presets selector
20 # makeup makeup the text
21 # quotes quotes processor
22 # cuttags tags processor
23 # tagend looking fo tag end
24 # planttags sub to bring the tags back
25 # vanish sub to remove all the stuff and bring the text to plain mode
26 # parseTagsAllowString parse the tagsAllow string to hash
27 # parseTagsDenyString parse the tagsDeny string to hash
28 #
29 # ==========================================
30
31 use strict;
32 use warnings;
33
34 #UTFMODE#
35 use Encode;
36 use utf8;
37 #UTFMODE#
38
39 my $markLength = 7;
40 my $strip;
41 my $result;
42 my $tags;
43 my $conf;
44
45 my @singleTags = qw/link input spacer img br hr/;
46 my @breakingTags = qw/p td div hr/;
47 my @spaceTags = qw/br/;
48 my @tagsToEat = qw/script style/;
49
50 # ==The constructor
51 sub new {
52 my Jevix::MakeupUtf $class = shift;
53
54 return $class;
55 }
56
57 # ==Here we've got the input
58 sub process($$$) {
59 my($class, $text, $userConfig) = @_;
60
61 #UTFMODE#
62 $$text = decode("utf8", $$text);
63 #UTFMODE#
64
65 $conf = $userConfig ? $userConfig : {presetBasic=>1};
66 $class->preset();
67
68 $strip = "";
69 $tags = [];
70
71 $result = {};
72 $result->{error} = 0;
73 $result->{errorLog} = [];
74
75 if(!$conf->{isHTML}) { $strip = $$text; } else { $class->cuttags($text, $conf, $result); }
76 if($conf->{quotes}) { $class->quotes($conf); }
77 $class->makeup($conf);
78
79 $result->{text} = "";
80 if($conf->{isHTML}) { $class->planttags($result); } else { $result->{text} = $strip; }
81
82 #UTFMODE#
83 Encode::_utf8_off($result->{text});
84 #UTFMODE#
85
86 return $result;
87 }
88
89 # ==Choosing default setup when necessary
90 sub preset($$) {
91 my ($class) = @_;
92
93 if(!$conf || $conf->{presetBasic}) {
94 $conf->{isHTML} = 1 if(!defined($conf->{isHTML})); # HTML mode
95 $conf->{lineBreaks} = 1 if(!defined($conf->{lineBreaks})); # Linebreaks to <br/>
96 $conf->{paragraphs} = 0 if(!defined($conf->{paragraphs})); # Paragraphs
97 $conf->{dashes} = 1 if(!defined($conf->{dashes})); # Replace hyphens with dashes when necessary
98 $conf->{dots} = 1 if(!defined($conf->{dots})); # Replace 3 dots with a symbol
99 $conf->{edgeSpaces} = 1 if(!defined($conf->{edgeSpaces})); # Wipe edge space characters
100 $conf->{multiSpaces} = 1 if(!defined($conf->{multiSpaces})); # Wipe multispaces
101 $conf->{redundantSpaces} = 1 if(!defined($conf->{redundantSpaces})); # Wipe redundant spaces
102 $conf->{compositeWordsLength} = 10 if(!defined($conf->{compositeWordsLength})); # The maximim length of composite word to be put inside <nobr>
103 $conf->{tagLf} = 1 if(!defined($conf->{tagLf})); # Wipe crs and lfs after droppped tag
104 $conf->{nbsp} = 1 if(!defined($conf->{nbsp})); # Insert non-breaking spaces
105 $conf->{quotes} = 1 if(!defined($conf->{quotes})); # Makeup quotes
106 $conf->{qaType} = 0 if(!defined($conf->{qaType})); # Main quotes type
107 $conf->{qbType} = 2 if(!defined($conf->{qbType})); # Nested quotes type
108 $conf->{misc} = 1 if(!defined($conf->{misc})); # Misc substitutions
109 $conf->{codeMode} = 2 if(!defined($conf->{codeMode})); # The way jevix should represent html special characters
110 }
111
112 # If tagsAllow came as a string
113 if(defined($conf->{tagsAllow}) && !ref($conf->{tagsAllow})) {
114 my $tmp = $class->parseTagsAllowString($conf->{tagsAllow});
115 $conf->{tagsAllow} = $tmp->{tagsAllow};
116 $conf->{tagsDenyAllAttributes} = $tmp->{tagsDenyAllAttributes};
117 }
118
119 # If tagsDeny came as a string
120 if(defined($conf->{tagsDeny}) && !ref($conf->{tagsDeny})) {
121 $conf->{tagsDeny} = $class->parseTagsDenyString($conf->{tagsDeny});
122 }
123 }
124
125 # ==Imposing clear text
126 sub makeup($$) {
127 my ($class, $conf) = @_;
128
129 # ==Misc
130 # Prepositions
131 my $prp_rus = "а|без|безо|в|вне|во|да|для|до|за|и|из|изо|или|к|как|на|над|надо|не|ни|но|о|об|обо|около|от|ото|по|под|подо|при|про|с|сквозь|со|у|через";
132 my $prp_eng = "aboard|about|above|absent|across|after|against|along|alongside|amid|amidst|among|amongst|around|as|astride|at|atop|before|behind|below|beneath|beside|besides|between|beyond|but|by|despite|down|during|except|following|for|from|in|inside|into|like|mid|minus|near|nearest|notwithstanding|of|off|on|onto|opposite|out|outside|over|past|re|round|save|since|than|through|throughout|till|to|toward|towards|under|underneath|unlike|until|up|upon|via|with|within|without";
133 my $prp = "$prp_rus|$prp_eng";
134
135 my $letters = "A-Za-zА-Яа-яЁёЙй"; # Characters
136 my $cap_letters = "A-ZА-ЯЁё"; # Capital characters
137
138 my $sp = " \xA0\t"; # space class
139 my $rt = "\r?\n"; # cr class
140
141 my $br = "\x00\x0F.[\x01\x03].\x0F\x00"; # br tag
142 my $pt = "\x00\x0F.[\x02].\x0F\x00"; # Paragraph tag
143 my $ps = "\x00\x0F.[\x02][\x01\x03]\x0F\x00"; # Paragraph start
144 my $pe = "\x00\x0F.[\x02][\x02\x00]\x0F\x00"; # Paragraph end
145 my $to = "\x00\x0F..[\x03\x01]\x0F\x00"; # Opening tag
146 my $tc = "\x00\x0F..[\x02\x00]\x0F\x00"; # Closing tag
147 my $bb = "\x00\x0F..[\x02\x03]\x0F\x00"; # Tag where <nobr> is open
148 my $nb = "\x00\x0F..[\x01\x00]\x0F\x00"; # Tag where no <nobr> open
149 my $ts = "\x00\x0F"; # Tag start
150 my $te = "\x0F\x00"; # Tag end
151
152 my $brt = "<br *\/?>"; # br tag in text mode
153 my $pst = "<p>";
154 my $pet = "</p>";
155
156 # Codes, metasymbols or what ever?
157 my ($cdash, $cnbsp, $cdots, $cfracs, $ccopy, $creg);
158 if(!$conf->{codeMode}) {
159 ($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("—", " ", "…", "©", "®");
160 $cfracs = {'1/4'=>"¼", '1/2'=>"½", '3/4'=>"¾"};
161 } elsif($conf->{codeMode} == 1) {
162 ($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("&#151;", "&#160;", "&#133;", "&#169;", "&#174;");
163 $cfracs = {'1/4'=>"&#188;", '1/2'=>"&#189;", '3/4'=>"&#190;"};
164 } else {
165 ($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("&mdash;", "&nbsp;", "&hellip;", "&copy;", "&reg;");
166 $cfracs = {'1/4'=>"&frac14;", '1/2'=>"&frac12;", '3/4'=>"&frac34;"};
167 }
168
169 # Wiping edge spaces
170 if($conf->{edgeSpaces}) { $strip =~ s/^[$sp\r\n]*(.+?)[$sp\r\n]*$/$1/isg; }
171
172 # Wiping spaces between tags (</td> </tr>)
173 if($conf->{tagSpaces}) { $strip =~ s/($tc)[$sp]($tc)/$1$2/isg; }
174
175 # Wiping multispaces
176 if($conf->{multiSpaces}) { $strip =~ s/([$sp]){2,}/$1/ig; }
177
178 # Wiping redundant spaces
179 if($conf->{redundantSpaces}) { $strip =~ s{([$sp]+(?![:;]-[)(])([;:,.)?!]))|(\()(?<![:%;]-\()[$sp]+}{$1 ? $2 : $3}eig; }
180
181 if($conf->{nbsp}) {
182 # Prepositions with &nbsp;
183 $strip =~ s/(^|\x00|[$sp])($prp)[$sp]([0-9$letters])/$1$2$cnbsp$3/gm;
184
185 # &nbsp; with digits
186 $strip =~ s{($nb|^)(.*?)($bb|$)}{ my ($a, $b, $c) = ($1, $2, $3); $b =~ s/([0-9]+)([$sp]+|&nbsp;|&#160;)(?:(?=[0-9]{2,})|(?=%))/$1$cnbsp/ig; "$a$b$c"; }eisg;
187 }
188
189 # Put composite words inside <nobr>
190 if($conf->{compositeWords}) { $strip =~ s{($nb|^)(.*?)($bb|$)}{ my ($a, $b, $c) = ($1, $2, $3);
191 $b =~ s{(^|[$sp\x00]|&nbsp;)([$letters]+(?:-[$letters]+)+)(?=$|[$sp\x00])}{
192 my $d = !defined($1) ? "" : $1; my $e = !defined($2) ? "" : $2; my $f = !defined($3) ? "" : $3;
193 if(length($e) <= $conf->{compositeWordsLength}) { "$d<nobr>$e<\/nobr>" } else {"$d$e$f"}
194 }eig; "$a$b$c";
195 }eisg; }
196
197 # Dots
198 if($conf->{dots}) { $strip =~ s/\.{3}|…|&hellip;/$cdots/ig; }
199
200 # Dashes
201 if($conf->{dashes}) {
202 # Hyphen
203 $strip =~ s/([^$sp])([$sp]|&#160;|&nbsp;)(-{1,2}|—|&mdash;|&#151;)/$1$cnbsp$cdash/ig;
204 # "Speech" hyphen
205 $strip =~ s/((?:^|$ps|$br|$brt(?:$rt)*|[$rt]))[$sp]*(?:&nbsp;)*(-{1,2}|—|&mdash;|&#151;)[$sp]*(?:&nbsp;)*(.)/$1$cdash$cnbsp$3/ig;
206 }
207
208 # Misc stuff
209 if($conf->{misc}) {
210 # Fracs
211 $strip =~ s{(?:(?<=[$sp\x00])|(?<=^))([13])/([24])(?:(?=[$sp\x00])|(?=$))}{if(defined($cfracs->{"$1/$2"})) { $cfracs->{"$1/$2"} } else { "$1/$2" } }esg;
212 # Copyright & registered
213 $strip =~ s{(?:(?<=[$sp\x00])|(?<=^))(\([cr]\)|&copy;|©)(?:(?=[$sp\x00?!;.,])|(?=$))}{ if((lc($1) eq "(c)") || (lc($1) eq "&copy;") || ($1 eq "©")) {$ccopy} elsif((lc($1) eq "(r)") || (lc($1) eq "&reg;") || ($1 eq "®")) {$creg} else { $2 } }eig;
214 }
215
216 # Paragraphs
217 if($conf->{paragraphs}) { $strip =~ s{(^|$pe(?:$rt$rt)?|$rt$rt)(?!$ps)(.+?)($br)?($brt)?(?<!$pe)(?:(?=$)|(?=$rt$rt)|(?=$ps))}{ my ($a, $b, $c) = ($1,$2,$3||""); (($b =~ /^[ \r\n]+$/) || ($b =~ /^(<br *\/?>|$br)+$/)) ? "$a$b$c" : "$a<p>$b</p>";}eisg; }
218
219 # Line break
220 if($conf->{lineBreaks}) { $strip =~ s/(?<!$pt)(?<!$br)(?<!$br\r)(?<!$pe\r\n\r\n)(?<!$pe\n\n)(?<!$pe\r\n)(?<!$pe\n)(?<!$pe\r)(?<!$pe)(?<!$pet\r\n\r\n)(?<!$pet\r\n\r)(?<!$pet\n\n)(?<!$pet\r\n)(?<!$pet\n)(?<!$pet\r)(?<!$pet)(?<!$pst)($rt)(?!$brt)/<br \/>$1/isg; }
221 }
222
223 # ==impose quotes
224 sub quotes($$) {
225 my ($class, $conf) = @_;
226
227 my $i;
228 my ($a_open, $b_open) = (0,0);
229 my ($cp, $c, $cn, $cn_is_sp, $cp_is_sp) = ('', '', '', 0, 0);
230 my ($qaStart, $qaEnd, $qbStart, $qbEnd);
231 my (@qs, @qe, @qs_ansi, @qe_ansi, @qs_html, @qe_html, @qs_ent, @qe_ent,);
232
233 # space class
234 my $sp =" \t\xA0";
235 # characters
236 my $letters = "A-Za-zА-Яа-яЁёЙй";
237
238 @qs_ansi = ("«", "“", "„", "‘", "‚", '"');
239 @qe_ansi = ("»", "”", "“", "’", "‘", '"');
240 @qs_html = ("&#171;", "&#147;", "&#132;", "&#145;", "&#130;", "&#34;");
241 @qe_html = ("&#187;", "&#148;", "&#147;", "&#146;", "&#145;", "&#34;");
242 # << `` .. ` . "
243 @qs_ent = ("&laquo;", "&ldquo;", "&bdquo;", "&lsquo;", "&sbquo;", "&quot;");
244 # >> '' '' ' ` "
245 @qe_ent = ("&raquo;", "&rdquo;", "&ldquo;", "&rsquo;", "&lsquo;", "&quot;");
246
247 # Quotes collection
248 if(!$conf->{codeMode}) {
249 @qs = @qs_ansi; @qe = @qe_ansi;
250 } elsif ($conf->{codeMode} == 1) {
251 @qs = @qs_html; @qe = @qe_html;
252 } else {
253 @qs = @qs_ent; @qe = @qe_ent;
254 }
255
256 # Getting configuration setting
257 $conf->{qaType} ||= 0;
258 $conf->{qbType} ||= 1;
259 $conf->{qaType} = ($conf->{qaType} >= 0 && $conf->{qaType} <= 5) ? $conf->{qaType} : 0;
260 $conf->{qbType} = ($conf->{qbType} >= 0 && $conf->{qbType} <= 5) ? $conf->{qbType} : 1;
261
262 # Selecting quotes as requested by user
263 ($qaStart, $qaEnd) = ($qs[$conf->{qaType}], $qe[$conf->{qaType}]);
264 ($qbStart, $qbEnd) = ($qs[$conf->{qbType}], $qe[$conf->{qbType}]);
265
266 # Resetting all the quotes inside text to <">
267 my $qa = join('|', @qs_ansi) . '|' . join('|', @qe_ansi) . '|' . join('|', @qs_html) . '|' . join('|', @qe_html) . '|' . join('|', @qs_ent) . '|' . join('|', @qe_ent);
268 $strip =~ s/(?:(?:(?<=[^$letters])|(?<=^))($qa))|(?:($qa)(?:(?=[^$letters])|(?=$)))/\"/ig;
269
270 my $spread = 1;
271 my $mv = 0;
272 my $mvn = 0;
273 my @st;
274 $i = 0;
275 my $skip = 0;
276 my @space; # Space tags flag
277 my @break; # Text break flags
278
279 $st[$_] = '' foreach(0..$spread + 1);
280 $space[$_] = 0 foreach(0 + 1..$spread + 1);
281 $break[$_] = 0 foreach(0 + 1..$spread + 1);
282 $space[0] = 1;
283 $break[0] = 1;
284
285 while(1) {
286 # Skipping tags
287 foreach(0..$spread) {
288 do {
289 $skip = 0;
290 if($i + $_ + $mv <= length($strip)) {
291 if($i + $_ + $mv + 1 < length($strip)) {
292 if((substr($strip, $i + $_ + $mv, 1) eq "\x00") && (substr($strip, $i + $_ + $mv + 1, 1) eq "\x0F")) {
293 $space[$_ + 1] |= (ord(substr($strip, $i + $_ + $mv + 2, 1)) & 2) >> 1;
294 $break[$_ + 1] |= ord(substr($strip, $i + $_ + $mv + 2, 1)) & 1;
295 $mv += $markLength;
296 if(!$_) { $mvn = $mv; }
297 $st[$_ + 1] = "";
298 $skip = 1;
299 }
300 }
301 if(!$skip) { $st[$_ + 1] = substr($strip, $i + $_ + $mv, 1); }
302 }
303 } while($skip);
304 }
305
306 $i += $mvn;
307 $mv = 0;
308 $mvn = 0;
309
310 ($cp, $c, $cn) = ($st[0], $st[1], $st[2]);
311 $cp_is_sp = (($cp =~ /[^0-9$letters]/) || $space[0] || $space[1] || $break[0] || !$i) ? 1 : 0;
312 $cn_is_sp = (($cn =~ /[^0-9$letters]/) || $space[2] || $break[2] || $cn eq '') ? 1 : 0;
313
314 # Reset state if breaking tag appears
315 if($break[1] || $i == length($strip)) {
316 if($a_open || $b_open) {
317 # Log quote error if appears
318 if($conf->{logErrors}) {
319 my $quoteErrSampleLength = 100;
320 my $z = $i - 1;
321 my $y;
322 while(1) {
323 if(substr($strip, $z, 1) eq " " || substr($strip, $z, 1) eq "\xA0" || !$z) { if($i-$z <= $quoteErrSampleLength) {$y = $z}}
324 last if(!$z);
325 $z--;
326 }
327 my $sample = substr($strip, $y, ($i - $y));
328 $sample =~ s/\x00\x0F[^\x0F]+\x0F\x00//g;
329 $sample =~ s/<\/?[a-z]+.*?>//g;
330 push(@{$result->{errorLog}}, {type=>"Quote_error", message=>"Quote mismatch near [$sample]<--"});
331 $result->{error} = 1;
332 }
333 }
334 $a_open = 0;
335 $b_open = 0;
336 }
337
338 if($c eq '"') {
339 if(!$a_open) {
340 $a_open = 1;
341 substr($strip, $i, 1) = $qaStart;
342 $i += length($qaStart) - 1;
343 } elsif ($a_open && (($i == length($strip) - 1) || (!$b_open && $cn_is_sp))) {
344 $a_open = 0;
345 substr($strip, $i, 1) = $qaEnd;
346 $i += length($qaEnd) - 1;
347 } elsif ($a_open && !$b_open) {
348 $b_open = 1;
349 substr($strip, $i, 1) = $qbStart;
350 $i += length($qbStart) - 1;
351 } elsif ($a_open && $b_open) {
352 $b_open = 0;
353 substr($strip, $i, 1) = $qbEnd;
354 $i += length($qbEnd) - 1;
355 }
356 }
357
358 last if($i == length($strip));
359 $st[0] = $st[1];
360 $space[0] = $space[1];
361 $break[0] = $break[1];
362 $space[$_] = 0 foreach(0 + 1..$spread + 1);
363 $break[$_] = 0 foreach(0 + 1..$spread + 1);
364 $i++;
365 }
366 }
367
368 # ==Cutting the tags away
369 sub cuttags($$$$) {
370 my($class, $text, $conf, $result) = @_;
371 my $i = 0; # loop counter
372 my $hop; # Jump length
373 my ($c, $cn); # current & next character
374 my ($tl, $ts, $te, $cl, $tagName, $tagBody, $tagContent); # tag length, tag dimensions, tag name, tag body text, single tag flag, content inside the tag
375 my ($isTag, $isTagStart, $isSingle, $isSingleClosed, $isSpace, $isBreaking, $nobrIsOpen, $flagSet2, $flagSet1, $flagSet0); # some useful flags
376 my @tagsOpen; # an array storing the info about all the tags currently open
377
378 # space class
379 my $sp =" \t\xA0";
380
381 while(1) {
382 $hop = index($$text, "<", $i);
383
384 if($hop < 0) {
385 $strip .= substr($$text, $i, length($$text) - $i);
386 last;
387 } elsif($hop > 0) {
388 $strip .= substr($$text, $i, $hop - $i);
389 $i = $hop;
390 }
391
392 ($c, $cn) = unpack("aa", substr($$text, $i, 2));
393
394 $isTag = 0;
395
396 # =If tag opens
397 $isTagStart = ($cn =~ /!|[a-z]/i) ? 1 : 0;
398 if($isTagStart || ($cn eq "/")) { $isTag = 1; }
399
400 if($isTag) {
401 $ts = $i; # Tag start position
402 $te = $isTagStart ? tagend($text, $ts) : index($$text, ">", $ts); # Tag end position
403
404 if($te) {
405 $tagBody = substr($$text, $ts, $te - $ts + 1);
406 $tagName = $isTagStart ? ($tagBody =~ m/^<([a-z]+)/i)[0] : ($tagBody =~ m/^<\/\s*([a-z]+)/i)[0];
407 $tagName =~ tr/A-Z/a-z/;
408 }
409
410 if($te && $tagName) {
411 $tagBody = substr($$text, $ts, $te - $ts + 1);
412 $tagName = $isTagStart ? ($tagBody =~ m/^<([a-z]+)/i)[0] : ($tagBody =~ m/^<\/\s*([a-z]+)/i)[0];
413 $tagName =~ tr/A-Z/a-z/;
414
415 # =Flags
416 # Detecting whether the tag is single (self-closing) or double
417 $isSingleClosed = 0;
418 $isSingle = 0;
419 if($isTagStart) {
420 if(grep{$tagName eq $_} @singleTags) {
421 $isSingle = 1;
422 } elsif (substr($tagBody, length($tagBody) - 2, 1) eq "/") {
423 $isSingle = 1;
424 $isSingleClosed = 1;
425 }
426 }
427
428 # Detecting wether this is space tag or not
429 $isSpace = (grep{$tagName eq $_} @spaceTags) ? 1 : 0;
430
431 # Detecting wether this is breaking tag or not
432 $isBreaking = (grep{$tagName eq $_} @breakingTags) ? 1 : 0;
433
434 # Tag Length
435 $tl = $te - $ts + 1;
436
437 # Updating the status for tags open
438 if($conf->{checkHTML} && !$isSingle) {
439 if($isTagStart) {
440 push(@tagsOpen, $tagName);
441 } else {
442 if($tagsOpen[$#tagsOpen] ne $tagName) {
443 # HTML error
444 $result->{error} = 1;
445 if($conf->{logErrors}) { push(@{$result->{errorLog}}, {type=>"HTML_Parse", position=>$i, message=>"Found closing tag <$tagName> while waiting tag <" . $tagsOpen[$#tagsOpen] . "> to close!"}); }
446 } else {
447 pop(@tagsOpen);
448 }
449 }
450 }
451
452 # Eating tag content for some tags like <script>
453 $tagContent = "";
454 $cl = 0;
455 if((grep{$tagName eq $_} @tagsToEat) && $isTagStart) {
456 $cl = index($$text, "</$tagName>", $ts + $tl) - $ts - $tl;
457 if($cl > 0) {
458 $tagContent = substr($$text, $ts + $tl, $cl);
459 } else {
460 $cl = 0;
461 $result->{error} = 1;
462 if($conf->{logErrors}) { push(@{$result->{errorLog}}, {type=>"HTML_Parse", position=>$i, message=>"Can't find <$tagName> end!"}); }
463 }
464 }
465
466 # Should I drop all the tags by default?
467 my $dropTag = 0;
468 if($conf->{tagsDenyAll}) { $dropTag = 1; }
469
470 # Checking deny list
471 if(defined($conf->{tagsDeny}) && !$dropTag) {
472 if($conf->{tagsDeny}->{$tagName}) { $dropTag = 1; }
473 }
474
475 # Checking allow list
476 if(defined($conf->{tagsAllow}) && $dropTag) {
477 if($conf->{tagsAllow}->{$tagName}) { $dropTag = 0; }
478 }
479
480 # Nobr tag status
481 if($tagName eq "nobr" && $isTagStart) {
482 $nobrIsOpen = 1;
483 } elsif(($tagName eq "nobr" && !$isTagStart) || (grep{$tagName eq $_} @breakingTags)) {
484 $nobrIsOpen = 0;
485 }
486
487 # =Final part
488 if(!$dropTag) {
489 # =Processing tags
490 # Tag name to lower case
491 if($conf->{tagNamesToLower}) {
492 if($isTagStart) { $tagBody = "<" . $tagName . substr($tagBody, length($tagName) + 1, length($tagBody) - length($tagName) - 1); }
493 else { $tagBody =~ tr/A-Z/a-z/; }
494 }
495 # Tag name to upper case
496 if($conf->{tagNamesToUpper}) {
497 if($isTagStart) { $tagBody = "<" . uc($tagName) . substr($tagBody, length($tagName) + 1, length($tagBody) - length($tagName) - 1); }
498 else { $tagBody =~ tr/a-z/A-Z/; }
499 }
500 # =Tag parameters to lower or upper case
501 if($isTagStart && ($conf->{tagAttributesToLower} || $conf->{tagAttributesToUpper})) {
502 # Regular parameters
503 my $tmp = "";
504 while ($tagBody =~ m/([^\s]*\s*)(?:([a-z\r]+)(\s*)(?==)(=\s*))?/ig ) {
505 $tmp .= $1 if ($1); if($conf->{tagAttributesToLower}) { if($2) { $tmp .= lc($2); } } else { if($2) { $tmp .= uc($2); } } $tmp .= $3 if ($3); $tmp .= $4 if ($4); $tmp .= $5 if ($5);
506 }
507
508 # Single parameters (like <checked>)
509 if($conf->{tagAttributesToLower}) { $tagBody =~ s{(?<!=)( +([a-z]+))}{lc($1)}eig; }
510 elsif($conf->{tagAttributesToUpper}) { $tagBody =~ s{(?<!=)( +([a-z]+))}{uc($1)}eig; }
511 }
512
513 # Simple XSS & tag attributes protection
514 if($isTagStart && ($conf->{simpleXSS} || $conf->{tagsAllow}->{$tagName}->{validAttributes} || $conf->{tagsAllow}->{$tagName}->{invalidAttributes} || $conf->{tagsAllow}->{$tagName}->{denyAllAttributes} || $conf->{tagsDenyAllAttributes})) {
515 $tagBody =~ s{(?<!<)(\s*)([a-z]+)([$sp]*=[$sp]*)("[^"]+"|[^$sp/>]+)} {
516 my ($a, $b, $c, $d) = ($1||'', $2, $3, $4);
517 if($conf->{simpleXSS} && ($b =~ /^on/ig || $d =~ /javascript|expression/ig)) {
518 '';
519 } elsif(($conf->{tagsDenyAllAttributes} || $conf->{tagsAllow}->{$tagName}->{denyAllAttributes} || ($conf->{tagsAllow}->{$tagName}->{invalidAttributes} && $conf->{tagsAllow}->{$tagName}->{invalidAttributes}->{$b}))
520 && !(($conf->{tagsAllow}->{$tagName}->{validAttributes} && $conf->{tagsAllow}->{$tagName}->{validAttributes}->{$b})
521 || $conf->{tagsAllow}->{$tagName}->{allowAllAttributes})
522 ) {
523 '';
524 } elsif($conf->{tagsAllow}->{$tagName}->{validAttributes} && !$conf->{tagsAllow}->{$tagName}->{validAttributes}->{$b}) {
525 '';
526 } else {
527 $a . $b . $c . $d;
528 }
529 }eig;
530 }
531
532 # Close single tag
533 if($conf->{tagCloseSingle} && $isSingle && !$isSingleClosed) {
534 if(substr($tagBody, length($tagBody) - 2, 1) ne "/") {
535 if(substr($tagBody, length($tagBody) - 2, 1) ne " ") { substr($tagBody, length($tagBody) - 2, 1) .= " /"; } else { substr($tagBody, length($tagBody) - 2, 1) .= "/"; }
536 }
537 }
538
539 # Quote attribute values
540 if($conf->{tagQuoteValues} && $isTagStart) {
541 my $tmp = "";
542 # 1 23 4 5 6
543 while($tagBody =~ m/([<a-z >]+)?((=)(\s*)([^ >]+)([ >]+))?/ig) {
544 $tmp .= $1 if($1);
545 if($2) {
546 $tmp .= $3 if($3);
547 $tmp .= $4 if($4);
548 if($5 && substr($5, 0, 1) ne '"' && substr($5, length($5) - 1, 1) ne '"') { $tmp .= "\"$5\""; } else { $tmp .= $5; }
549 $tmp .= $6 if($6);
550 }
551 }
552 $tagBody = $tmp;
553 }
554
555 # Unquote attribute values
556 if($conf->{tagUnQuoteValues}) {
557 $tagBody =~ s/([a-z]+)(\s*)(=)(\s*)"([^\=\s">]+)"/$1$2$3$4$5/ig; #"
558 }
559
560 # Saving the tag
561 push(@$tags, {name=>$tagName, body=>$tagBody, content=>$tagContent});
562
563 # Forming flagSet
564 #
565 # |byte2: _ _ _ _ _ _ isSpace isBreaking| byte1: _ _ _ _ _ p br| byte0: _ _ _ _ nobr isTagStart
566 $flagSet2 = 0;
567 if($isSpace) { $flagSet2 |= 2; }
568 if($isBreaking) { $flagSet2 |= 1; }
569 $flagSet1 = 0;
570 if($tagName eq "br") { $flagSet1 |= 1; }
571 if($tagName eq "p") { $flagSet1 |= 2; }
572 $flagSet0 = 0;
573 if($isTagStart) { $flagSet0 |= 1; }
574 if($nobrIsOpen) { $flagSet0 |= 2; }
575 # Planting the marker
576 $strip .= "\x00\x0F" . chr($flagSet2) . chr($flagSet1) . chr($flagSet0) . "\x0F\x00";
577 }
578
579 # Moving the pointer (tag end position + content length)
580 $i = $te + $cl;
581
582 # Eating crs & lfs after dropped tag
583 if($conf->{tagLf} && $dropTag) {
584 while(1) {
585 if(substr($$text, $i + 1, 1) eq "\r") { $i++; } elsif(substr($$text, $i + 1, 1) eq "\n") { $i++; last; } else { last }
586 }
587 }
588 }
589 } else {
590 # This is not a tag, just add the "<" to result
591 $strip .= $c;
592 }
593
594 last if($i == length($$text));
595 $i++;
596 }
597 }
598
599 # ==Find where tag ends
600 sub tagend($$$) {
601 my ($text, $i) = @_;
602
603 my $gotcha = 0;
604 my $quote = 0;
605
606 $i |= 0;
607
608 while (1) {
609 if (substr($$text, $i, 1) eq '"') { $quote ^= 1; }
610 if (!$quote && substr($$text, $i, 1) eq '>') { $gotcha = $i; }
611 last if ($i == length($$text) || $gotcha);
612 $i++;
613 }
614
615 return $gotcha;
616 }
617
618 # ==Bring everything back to HTML
619 sub planttags($$) {
620 my ($class, $result) = @_;
621 my $i = 0;
622 my $max = length($strip);
623 my $ctag = 0;
624 my $step;
625
626 while (1) {
627 if($i < $max - 2 && substr($strip, $i, 2) eq "\x00\x0F") {
628 $result->{text} .= $$tags[$ctag]->{body};
629 if($$tags[$ctag]->{content}) { $result->{text} .= $$tags[$ctag]->{content}; }
630 $i += $markLength;
631 $ctag++;
632 } else {
633 if($i < $max - 2) { $step = index($strip, "\x00\x0F", $i) - $i; } else { $step = $max - $i; }
634 if($step < 0) { $step = $max - $i; }
635
636 if($step >= 0) {
637 $result->{text} .= substr($strip, $i, $step);
638 $i += $step;
639 }
640 }
641
642 last if($i == $max);
643 }
644 }
645
646 # ==Bring the text to plain mode==
647 sub vanish($$) {
648 my($class, $text) = @_;
649
650 $$text =~ s/&laquo;|&ldquo;|&bdquo;|&lsquo;|&sbquo;|&quot;|&raquo;|&rdquo;|&ldquo;|&rsquo;|&#171;|&#147;|&#132;|&#145;|&#130;|&#34;|&#187;|&#148;|&#146;|«|“|„|‘|‚|"|»|”|’/"/ig;
651 $$text =~ s/&nbsp;|&#160;| / /ig;
652 $$text =~ s/&mdash;|&ndash;|&#151;|&#150;|—|–/-/ig;
653 $$text =~ s/&hellip;|&#133;|…/.../ig;
654 $$text =~ s/&copy;|&#169;|©/(c)/ig;
655 $$text =~ s/&reg;|&#174;|®/(r)/ig;
656 $$text =~ s/&frac14;|&#188;|¼/1\/4/ig;
657 $$text =~ s/&frac12;|&#189;|½/1\/2/ig;
658 $$text =~ s/&frac34;|&#190;|¾/3\/4/ig;
659 }
660
661 # ==Parse the tagsAllow string advanced format==
662 sub parseTagsAllowString($$) {
663 my($class, $string) = @_;
664
665 return {tagsAllow=>{}, tagsDenyAllAttributes=>0} if(!$string);
666
667 my $tagsAllow = {};
668 my $tagsDenyAllAttributes = 0;
669
670 # Should I deny all tag attributes by default?
671 if(substr($string,0,1) eq '|') {
672 $tagsDenyAllAttributes = 1;
673 substr($string,0,1) = '';
674 };
675
676 # Parsing the Configuration String
677 while($string =~ /([a-z:|]+)/ig) {
678 my $tBody = $1;
679 my ($tagName) = ($tBody =~ /^([a-z]+)/i)[0];
680
681 last if(!$tagName);
682
683 my $attrList = ();
684 $tagsAllow->{$tagName}->{val}=1;
685
686 if($tBody =~ /^$tagName\|$/i) {
687 $tagsAllow->{$tagName}->{denyAllAttributes}=1;
688 } elsif($tBody =~ /^$tagName\:$/i) {
689 $tagsAllow->{$tagName}->{allowAllAttributes}=1;
690 } else {
691 while($tBody =~ /:([a-z]+)/ig) {
692 $tagsAllow->{$tagName}->{validAttributes}->{$1}=1;
693 }
694
695 while($tBody =~ /\|([a-z]+)/ig) {
696 if(!$tagsAllow->{$tagName}->{validAttributes}->{$1}) {
697 $tagsAllow->{$tagName}->{invalidAttributes}->{$1}=1;
698 }
699 }
700 }
701 }
702
703 return {tagsAllow=>$tagsAllow, tagsDenyAllAttributes=>$tagsDenyAllAttributes};
704 }
705
706 # ==Parse the tagsAllow string advanced format==
707 sub parseTagsDenyString($$) {
708 my($class, $string) = @_;
709
710 return {} if(!$string);
711
712 my $tagsDeny = {};
713 while($string =~ /([a-z]+)/ig) {
714 $tagsDeny->{$1}->{val}=1;
715 }
716
717 return $tagsDeny;
718 }
719
720 return 1;