Line #	Revision	Author
1	3	ahitrov@rambler.ru	package Jevix::MakeupUtf;
2
3			# ==========================================
4			#
5			# Jevix Version 0.9.5 (utf-8)
6			#
7			# Developed by Igor Askarov
8			#
9			# Please send all suggestions to
10			# Igor Askarov <juks@juks.ru>
11			# http://www.jevix.ru/
12			#
13			# Release date: 21/01/2008
14			#
15			# === Methods list==========================
16			#
17			# new the constructor
18			# procces entry sub
19			# preset presets selector
20			# makeup makeup the text
21			# quotes quotes processor
22			# cuttags tags processor
23			# tagend looking fo tag end
24			# planttags sub to bring the tags back
25			# vanish sub to remove all the stuff and bring the text to plain mode
26			# parseTagsAllowString parse the tagsAllow string to hash
27			# parseTagsDenyString parse the tagsDeny string to hash
28			#
29			# ==========================================
30
31			use strict;
32			use warnings;
33
34			#UTFMODE#
35			use Encode;
36			use utf8;
37			#UTFMODE#
38
39			my $markLength = 7;
40			my $strip;
41			my $result;
42			my $tags;
43			my $conf;
44
45			my @singleTags = qw/link input spacer img br hr/;
46			my @breakingTags = qw/p td div hr/;
47			my @spaceTags = qw/br/;
48			my @tagsToEat = qw/script style/;
49
50			# ==The constructor
51			sub new {
52			my Jevix::MakeupUtf $class = shift;
53
54			return $class;
55			}
56
57			# ==Here we've got the input
58			sub process($$$) {
59			my($class, $text, $userConfig) = @_;
60
61			#UTFMODE#
62			$$text = decode("utf8", $$text);
63			#UTFMODE#
64
65			$conf = $userConfig ? $userConfig : {presetBasic=>1};
66			$class->preset();
67
68			$strip = "";
69			$tags = [];
70
71			$result = {};
72			$result->{error} = 0;
73			$result->{errorLog} = [];
74
75			if(!$conf->{isHTML}) { $strip = $$text; } else { $class->cuttags($text, $conf, $result); }
76			if($conf->{quotes}) { $class->quotes($conf); }
77			$class->makeup($conf);
78
79			$result->{text} = "";
80			if($conf->{isHTML}) { $class->planttags($result); } else { $result->{text} = $strip; }
81
82			#UTFMODE#
83			Encode::_utf8_off($result->{text});
84			#UTFMODE#
85
86			return $result;
87			}
88
89			# ==Choosing default setup when necessary
90			sub preset($$) {
91			my ($class) = @_;
92
93			if(!$conf \|\| $conf->{presetBasic}) {
94			$conf->{isHTML} = 1 if(!defined($conf->{isHTML})); # HTML mode
95			$conf->{lineBreaks} = 1 if(!defined($conf->{lineBreaks})); # Linebreaks to <br/>
96			$conf->{paragraphs} = 0 if(!defined($conf->{paragraphs})); # Paragraphs
97			$conf->{dashes} = 1 if(!defined($conf->{dashes})); # Replace hyphens with dashes when necessary
98			$conf->{dots} = 1 if(!defined($conf->{dots})); # Replace 3 dots with a symbol
99			$conf->{edgeSpaces} = 1 if(!defined($conf->{edgeSpaces})); # Wipe edge space characters
100			$conf->{multiSpaces} = 1 if(!defined($conf->{multiSpaces})); # Wipe multispaces
101			$conf->{redundantSpaces} = 1 if(!defined($conf->{redundantSpaces})); # Wipe redundant spaces
102			$conf->{compositeWordsLength} = 10 if(!defined($conf->{compositeWordsLength})); # The maximim length of composite word to be put inside <nobr>
103			$conf->{tagLf} = 1 if(!defined($conf->{tagLf})); # Wipe crs and lfs after droppped tag
104			$conf->{nbsp} = 1 if(!defined($conf->{nbsp})); # Insert non-breaking spaces
105			$conf->{quotes} = 1 if(!defined($conf->{quotes})); # Makeup quotes
106			$conf->{qaType} = 0 if(!defined($conf->{qaType})); # Main quotes type
107			$conf->{qbType} = 2 if(!defined($conf->{qbType})); # Nested quotes type
108			$conf->{misc} = 1 if(!defined($conf->{misc})); # Misc substitutions
109			$conf->{codeMode} = 2 if(!defined($conf->{codeMode})); # The way jevix should represent html special characters
110			}
111
112			# If tagsAllow came as a string
113			if(defined($conf->{tagsAllow}) && !ref($conf->{tagsAllow})) {
114			my $tmp = $class->parseTagsAllowString($conf->{tagsAllow});
115			$conf->{tagsAllow} = $tmp->{tagsAllow};
116			$conf->{tagsDenyAllAttributes} = $tmp->{tagsDenyAllAttributes};
117			}
118
119			# If tagsDeny came as a string
120			if(defined($conf->{tagsDeny}) && !ref($conf->{tagsDeny})) {
121			$conf->{tagsDeny} = $class->parseTagsDenyString($conf->{tagsDeny});
122			}
123			}
124
125			# ==Imposing clear text
126			sub makeup($$) {
127			my ($class, $conf) = @_;
128
129			# ==Misc
130			# Prepositions
131			my $prp_rus = "а\|без\|безо\|в\|вне\|во\|да\|для\|до\|за\|и\|из\|изо\|или\|к\|как\|на\|над\|надо\|не\|ни\|но\|о\|об\|обо\|около\|от\|ото\|по\|под\|подо\|при\|про\|с\|сквозь\|со\|у\|через";
132			my $prp_eng = "aboard\|about\|above\|absent\|across\|after\|against\|along\|alongside\|amid\|amidst\|among\|amongst\|around\|as\|astride\|at\|atop\|before\|behind\|below\|beneath\|beside\|besides\|between\|beyond\|but\|by\|despite\|down\|during\|except\|following\|for\|from\|in\|inside\|into\|like\|mid\|minus\|near\|nearest\|notwithstanding\|of\|off\|on\|onto\|opposite\|out\|outside\|over\|past\|re\|round\|save\|since\|than\|through\|throughout\|till\|to\|toward\|towards\|under\|underneath\|unlike\|until\|up\|upon\|via\|with\|within\|without";
133			my $prp = "$prp_rus\|$prp_eng";
134
135			my $letters = "A-Za-zА-Яа-яЁёЙй"; # Characters
136			my $cap_letters = "A-ZА-ЯЁё"; # Capital characters
137
138			my $sp = " \xA0\t"; # space class
139			my $rt = "\r?\n"; # cr class
140
141			my $br = "\x00\x0F.[\x01\x03].\x0F\x00"; # br tag
142			my $pt = "\x00\x0F.[\x02].\x0F\x00"; # Paragraph tag
143			my $ps = "\x00\x0F.[\x02][\x01\x03]\x0F\x00"; # Paragraph start
144			my $pe = "\x00\x0F.[\x02][\x02\x00]\x0F\x00"; # Paragraph end
145			my $to = "\x00\x0F..[\x03\x01]\x0F\x00"; # Opening tag
146			my $tc = "\x00\x0F..[\x02\x00]\x0F\x00"; # Closing tag
147			my $bb = "\x00\x0F..[\x02\x03]\x0F\x00"; # Tag where <nobr> is open
148			my $nb = "\x00\x0F..[\x01\x00]\x0F\x00"; # Tag where no <nobr> open
149			my $ts = "\x00\x0F"; # Tag start
150			my $te = "\x0F\x00"; # Tag end
151
152			my $brt = "<br *\/?>"; # br tag in text mode
153			my $pst = "<p>";
154			my $pet = "</p>";
155
156			# Codes, metasymbols or what ever?
157			my ($cdash, $cnbsp, $cdots, $cfracs, $ccopy, $creg);
158			if(!$conf->{codeMode}) {
159			($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("—", " ", "…", "©", "®");
160			$cfracs = {'1/4'=>"¼", '1/2'=>"½", '3/4'=>"¾"};
161			} elsif($conf->{codeMode} == 1) {
162			($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("", " ", "", "©", "®");
163			$cfracs = {'1/4'=>"¼", '1/2'=>"½", '3/4'=>"¾"};
164			} else {
165			($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("—", " ", "…", "©", "®");
166			$cfracs = {'1/4'=>"¼", '1/2'=>"½", '3/4'=>"¾"};
167			}
168
169			# Wiping edge spaces
170			if($conf->{edgeSpaces}) { $strip =~ s/^[$sp\r\n](.+?)[$sp\r\n]$/$1/isg; }
171
172			# Wiping spaces between tags (</td> </tr>)
173			if($conf->{tagSpaces}) { $strip =~ s/($tc)[$sp]($tc)/$1$2/isg; }
174
175			# Wiping multispaces
176			if($conf->{multiSpaces}) { $strip =~ s/([$sp]){2,}/$1/ig; }
177
178			# Wiping redundant spaces
179			if($conf->{redundantSpaces}) { $strip =~ s{([$sp]+(?![:;]-[)(])([;:,.)?!]))\|(\()(?<![:%;]-\()[$sp]+}{$1 ? $2 : $3}eig; }
180
181			if($conf->{nbsp}) {
182			# Prepositions with
183			$strip =~ s/(^\|\x00\|[$sp])($prp)[$sp]([0-9$letters])/$1$2$cnbsp$3/gm;
184
185			#   with digits
186			$strip =~ s{($nb\|^)(.*?)($bb\|$)}{ my ($a, $b, $c) = ($1, $2, $3); $b =~ s/([0-9]+)([$sp]+\| \| )(?:(?=[0-9]{2,})\|(?=%))/$1$cnbsp/ig; "$a$b$c"; }eisg;
187			}
188
189			# Put composite words inside <nobr>
190			if($conf->{compositeWords}) { $strip =~ s{($nb\|^)(.*?)($bb\|$)}{ my ($a, $b, $c) = ($1, $2, $3);
191			$b =~ s{(^\|[$sp\x00]\| )([$letters]+(?:-[$letters]+)+)(?=$\|[$sp\x00])}{
192			my $d = !defined($1) ? "" : $1; my $e = !defined($2) ? "" : $2; my $f = !defined($3) ? "" : $3;
193			if(length($e) <= $conf->{compositeWordsLength}) { "$d<nobr>$e<\/nobr>" } else {"$d$e$f"}
194			}eig; "$a$b$c";
195			}eisg; }
196
197			# Dots
198			if($conf->{dots}) { $strip =~ s/\.{3}\|…\|…/$cdots/ig; }
199
200			# Dashes
201			if($conf->{dashes}) {
202			# Hyphen
203			$strip =~ s/([^$sp])([$sp]\| \| )(-{1,2}\|—\|—\|)/$1$cnbsp$cdash/ig;
204			# "Speech" hyphen
205			$strip =~ s/((?:^\|$ps\|$br\|$brt(?:$rt)\|[$rt]))[$sp](?: )(-{1,2}\|—\|—\|)[$sp](?: )*(.)/$1$cdash$cnbsp$3/ig;
206			}
207
208			# Misc stuff
209			if($conf->{misc}) {
210			# Fracs
211			$strip =~ s{(?:(?<=[$sp\x00])\|(?<=^))([13])/([24])(?:(?=[$sp\x00])\|(?=$))}{if(defined($cfracs->{"$1/$2"})) { $cfracs->{"$1/$2"} } else { "$1/$2" } }esg;
212			# Copyright & registered
213			$strip =~ s{(?:(?<=[$sp\x00])\|(?<=^))($[cr]$\|©\|©)(?:(?=[$sp\x00?!;.,])\|(?=$))}{ if((lc($1) eq "(c)") \|\| (lc($1) eq "©") \|\| ($1 eq "©")) {$ccopy} elsif((lc($1) eq "(r)") \|\| (lc($1) eq "®") \|\| ($1 eq "®")) {$creg} else { $2 } }eig;
214			}
215
216			# Paragraphs
217			if($conf->{paragraphs}) { $strip =~ s{(^\|$pe(?:$rt$rt)?\|$rt$rt)(?!$ps)(.+?)($br)?($brt)?(?<!$pe)(?:(?=$)\|(?=$rt$rt)\|(?=$ps))}{ my ($a, $b, $c) = ($1,$2,$3\|\|""); (($b =~ /^[ \r\n]+$/) \|\| ($b =~ /^(<br *\/?>\|$br)+$/)) ? "$a$b$c" : "$a<p>$b</p>";}eisg; }
218
219			# Line break
220			if($conf->{lineBreaks}) { $strip =~ s/(?<!$pt)(?<!$br)(?<!$br\r)(?<!$pe\r\n\r\n)(?<!$pe\n\n)(?<!$pe\r\n)(?<!$pe\n)(?<!$pe\r)(?<!$pe)(?<!$pet\r\n\r\n)(?<!$pet\r\n\r)(?<!$pet\n\n)(?<!$pet\r\n)(?<!$pet\n)(?<!$pet\r)(?<!$pet)(?<!$pst)($rt)(?!$brt)/<br \/>$1/isg; }
221			}
222
223			# ==impose quotes
224			sub quotes($$) {
225			my ($class, $conf) = @_;
226
227			my $i;
228			my ($a_open, $b_open) = (0,0);
229			my ($cp, $c, $cn, $cn_is_sp, $cp_is_sp) = ('', '', '', 0, 0);
230			my ($qaStart, $qaEnd, $qbStart, $qbEnd);
231			my (@qs, @qe, @qs_ansi, @qe_ansi, @qs_html, @qe_html, @qs_ent, @qe_ent,);
232
233			# space class
234			my $sp =" \t\xA0";
235			# characters
236			my $letters = "A-Za-zА-Яа-яЁёЙй";
237
238			@qs_ansi = ("«", "“", "„", "‘", "‚", '"');
239			@qe_ansi = ("»", "”", "“", "’", "‘", '"');
240			@qs_html = ("«", "", "", "", "", """);
241			@qe_html = ("»", "", "", "", "", """);
242			# << `` .. ` . "
243			@qs_ent = ("«", "“", "&bdquo;", "‘", "&sbquo;", """);
244			# >> '' '' ' ` "
245			@qe_ent = ("»", "”", "“", "’", "‘", """);
246
247			# Quotes collection
248			if(!$conf->{codeMode}) {
249			@qs = @qs_ansi; @qe = @qe_ansi;
250			} elsif ($conf->{codeMode} == 1) {
251			@qs = @qs_html; @qe = @qe_html;
252			} else {
253			@qs = @qs_ent; @qe = @qe_ent;
254			}
255
256			# Getting configuration setting
257			$conf->{qaType} \|\|= 0;
258			$conf->{qbType} \|\|= 1;
259			$conf->{qaType} = ($conf->{qaType} >= 0 && $conf->{qaType} <= 5) ? $conf->{qaType} : 0;
260			$conf->{qbType} = ($conf->{qbType} >= 0 && $conf->{qbType} <= 5) ? $conf->{qbType} : 1;
261
262			# Selecting quotes as requested by user
263			($qaStart, $qaEnd) = ($qs[$conf->{qaType}], $qe[$conf->{qaType}]);
264			($qbStart, $qbEnd) = ($qs[$conf->{qbType}], $qe[$conf->{qbType}]);
265
266			# Resetting all the quotes inside text to <">
267			my $qa = join('\|', @qs_ansi) . '\|' . join('\|', @qe_ansi) . '\|' . join('\|', @qs_html) . '\|' . join('\|', @qe_html) . '\|' . join('\|', @qs_ent) . '\|' . join('\|', @qe_ent);
268			$strip =~ s/(?:(?:(?<=[^$letters])\|(?<=^))($qa))\|(?:($qa)(?:(?=[^$letters])\|(?=$)))/\"/ig;
269
270			my $spread = 1;
271			my $mv = 0;
272			my $mvn = 0;
273			my @st;
274			$i = 0;
275			my $skip = 0;
276			my @space; # Space tags flag
277			my @break; # Text break flags
278
279			$st[$_] = '' foreach(0..$spread + 1);
280			$space[$_] = 0 foreach(0 + 1..$spread + 1);
281			$break[$_] = 0 foreach(0 + 1..$spread + 1);
282			$space[0] = 1;
283			$break[0] = 1;
284
285			while(1) {
286			# Skipping tags
287			foreach(0..$spread) {
288			do {
289			$skip = 0;
290			if($i + $_ + $mv <= length($strip)) {
291			if($i + $_ + $mv + 1 < length($strip)) {
292			if((substr($strip, $i + $_ + $mv, 1) eq "\x00") && (substr($strip, $i + $_ + $mv + 1, 1) eq "\x0F")) {
293			$space[$_ + 1] \|= (ord(substr($strip, $i + $_ + $mv + 2, 1)) & 2) >> 1;
294			$break[$_ + 1] \|= ord(substr($strip, $i + $_ + $mv + 2, 1)) & 1;
295			$mv += $markLength;
296			if(!$_) { $mvn = $mv; }
297			$st[$_ + 1] = "";
298			$skip = 1;
299			}
300			}
301			if(!$skip) { $st[$_ + 1] = substr($strip, $i + $_ + $mv, 1); }
302			}
303			} while($skip);
304			}
305
306			$i += $mvn;
307			$mv = 0;
308			$mvn = 0;
309
310			($cp, $c, $cn) = ($st[0], $st[1], $st[2]);
311			$cp_is_sp = (($cp =~ /[^0-9$letters]/) \|\| $space[0] \|\| $space[1] \|\| $break[0] \|\| !$i) ? 1 : 0;
312			$cn_is_sp = (($cn =~ /[^0-9$letters]/) \|\| $space[2] \|\| $break[2] \|\| $cn eq '') ? 1 : 0;
313
314			# Reset state if breaking tag appears
315			if($break[1] \|\| $i == length($strip)) {
316			if($a_open \|\| $b_open) {
317			# Log quote error if appears
318			if($conf->{logErrors}) {
319			my $quoteErrSampleLength = 100;
320			my $z = $i - 1;
321			my $y;
322			while(1) {
323			if(substr($strip, $z, 1) eq " " \|\| substr($strip, $z, 1) eq "\xA0" \|\| !$z) { if($i-$z <= $quoteErrSampleLength) {$y = $z}}
324			last if(!$z);
325			$z--;
326			}
327			my $sample = substr($strip, $y, ($i - $y));
328			$sample =~ s/\x00\x0F[^\x0F]+\x0F\x00//g;
329			$sample =~ s/<\/?[a-z]+.*?>//g;
330			push(@{$result->{errorLog}}, {type=>"Quote_error", message=>"Quote mismatch near [$sample]<--"});
331			$result->{error} = 1;
332			}
333			}
334			$a_open = 0;
335			$b_open = 0;
336			}
337
338			if($c eq '"') {
339			if(!$a_open) {
340			$a_open = 1;
341			substr($strip, $i, 1) = $qaStart;
342			$i += length($qaStart) - 1;
343			} elsif ($a_open && (($i == length($strip) - 1) \|\| (!$b_open && $cn_is_sp))) {
344			$a_open = 0;
345			substr($strip, $i, 1) = $qaEnd;
346			$i += length($qaEnd) - 1;
347			} elsif ($a_open && !$b_open) {
348			$b_open = 1;
349			substr($strip, $i, 1) = $qbStart;
350			$i += length($qbStart) - 1;
351			} elsif ($a_open && $b_open) {
352			$b_open = 0;
353			substr($strip, $i, 1) = $qbEnd;
354			$i += length($qbEnd) - 1;
355			}
356			}
357
358			last if($i == length($strip));
359			$st[0] = $st[1];
360			$space[0] = $space[1];
361			$break[0] = $break[1];
362			$space[$_] = 0 foreach(0 + 1..$spread + 1);
363			$break[$_] = 0 foreach(0 + 1..$spread + 1);
364			$i++;
365			}
366			}
367
368			# ==Cutting the tags away
369			sub cuttags($$$$) {
370			my($class, $text, $conf, $result) = @_;
371			my $i = 0; # loop counter
372			my $hop; # Jump length
373			my ($c, $cn); # current & next character
374			my ($tl, $ts, $te, $cl, $tagName, $tagBody, $tagContent); # tag length, tag dimensions, tag name, tag body text, single tag flag, content inside the tag
375			my ($isTag, $isTagStart, $isSingle, $isSingleClosed, $isSpace, $isBreaking, $nobrIsOpen, $flagSet2, $flagSet1, $flagSet0); # some useful flags
376			my @tagsOpen; # an array storing the info about all the tags currently open
377
378			# space class
379			my $sp =" \t\xA0";
380
381			while(1) {
382			$hop = index($$text, "<", $i);
383
384			if($hop < 0) {
385			$strip .= substr($$text, $i, length($$text) - $i);
386			last;
387			} elsif($hop > 0) {
388			$strip .= substr($$text, $i, $hop - $i);
389			$i = $hop;
390			}
391
392			($c, $cn) = unpack("aa", substr($$text, $i, 2));
393
394			$isTag = 0;
395
396			# =If tag opens
397			$isTagStart = ($cn =~ /!\|[a-z]/i) ? 1 : 0;
398			if($isTagStart \|\| ($cn eq "/")) { $isTag = 1; }
399
400			if($isTag) {
401			$ts = $i; # Tag start position
402			$te = $isTagStart ? tagend($text, $ts) : index($$text, ">", $ts); # Tag end position
403
404			if($te) {
405			$tagBody = substr($$text, $ts, $te - $ts + 1);
406			$tagName = $isTagStart ? ($tagBody =~ m/^<([a-z]+)/i)[0] : ($tagBody =~ m/^<\/\s*([a-z]+)/i)[0];
407			$tagName =~ tr/A-Z/a-z/;
408			}
409
410			if($te && $tagName) {
411			$tagBody = substr($$text, $ts, $te - $ts + 1);
412			$tagName = $isTagStart ? ($tagBody =~ m/^<([a-z]+)/i)[0] : ($tagBody =~ m/^<\/\s*([a-z]+)/i)[0];
413			$tagName =~ tr/A-Z/a-z/;
414
415			# =Flags
416			# Detecting whether the tag is single (self-closing) or double
417			$isSingleClosed = 0;
418			$isSingle = 0;
419			if($isTagStart) {
420			if(grep{$tagName eq $_} @singleTags) {
421			$isSingle = 1;
422			} elsif (substr($tagBody, length($tagBody) - 2, 1) eq "/") {
423			$isSingle = 1;
424			$isSingleClosed = 1;
425			}
426			}
427
428			# Detecting wether this is space tag or not
429			$isSpace = (grep{$tagName eq $_} @spaceTags) ? 1 : 0;
430
431			# Detecting wether this is breaking tag or not
432			$isBreaking = (grep{$tagName eq $_} @breakingTags) ? 1 : 0;
433
434			# Tag Length
435			$tl = $te - $ts + 1;
436
437			# Updating the status for tags open
438			if($conf->{checkHTML} && !$isSingle) {
439			if($isTagStart) {
440			push(@tagsOpen, $tagName);
441			} else {
442			if($tagsOpen[$#tagsOpen] ne $tagName) {
443			# HTML error
444			$result->{error} = 1;
445			if($conf->{logErrors}) { push(@{$result->{errorLog}}, {type=>"HTML_Parse", position=>$i, message=>"Found closing tag <$tagName> while waiting tag <" . $tagsOpen[$#tagsOpen] . "> to close!"}); }
446			} else {
447			pop(@tagsOpen);
448			}
449			}
450			}
451
452			# Eating tag content for some tags like <script>
453			$tagContent = "";
454			$cl = 0;
455			if((grep{$tagName eq $_} @tagsToEat) && $isTagStart) {
456			$cl = index($$text, "</$tagName>", $ts + $tl) - $ts - $tl;
457			if($cl > 0) {
458			$tagContent = substr($$text, $ts + $tl, $cl);
459			} else {
460			$cl = 0;
461			$result->{error} = 1;
462			if($conf->{logErrors}) { push(@{$result->{errorLog}}, {type=>"HTML_Parse", position=>$i, message=>"Can't find <$tagName> end!"}); }
463			}
464			}
465
466			# Should I drop all the tags by default?
467			my $dropTag = 0;
468			if($conf->{tagsDenyAll}) { $dropTag = 1; }
469
470			# Checking deny list
471			if(defined($conf->{tagsDeny}) && !$dropTag) {
472			if($conf->{tagsDeny}->{$tagName}) { $dropTag = 1; }
473			}
474
475			# Checking allow list
476			if(defined($conf->{tagsAllow}) && $dropTag) {
477			if($conf->{tagsAllow}->{$tagName}) { $dropTag = 0; }
478			}
479
480			# Nobr tag status
481			if($tagName eq "nobr" && $isTagStart) {
482			$nobrIsOpen = 1;
483			} elsif(($tagName eq "nobr" && !$isTagStart) \|\| (grep{$tagName eq $_} @breakingTags)) {
484			$nobrIsOpen = 0;
485			}
486
487			# =Final part
488			if(!$dropTag) {
489			# =Processing tags
490			# Tag name to lower case
491			if($conf->{tagNamesToLower}) {
492			if($isTagStart) { $tagBody = "<" . $tagName . substr($tagBody, length($tagName) + 1, length($tagBody) - length($tagName) - 1); }
493			else { $tagBody =~ tr/A-Z/a-z/; }
494			}
495			# Tag name to upper case
496			if($conf->{tagNamesToUpper}) {
497			if($isTagStart) { $tagBody = "<" . uc($tagName) . substr($tagBody, length($tagName) + 1, length($tagBody) - length($tagName) - 1); }
498			else { $tagBody =~ tr/a-z/A-Z/; }
499			}
500			# =Tag parameters to lower or upper case
501			if($isTagStart && ($conf->{tagAttributesToLower} \|\| $conf->{tagAttributesToUpper})) {
502			# Regular parameters
503			my $tmp = "";
504			while ($tagBody =~ m/([^\s]\s)(?:([a-z\r]+)(\s)(?==)(=\s))?/ig ) {
505			$tmp .= $1 if ($1); if($conf->{tagAttributesToLower}) { if($2) { $tmp .= lc($2); } } else { if($2) { $tmp .= uc($2); } } $tmp .= $3 if ($3); $tmp .= $4 if ($4); $tmp .= $5 if ($5);
506			}
507
508			# Single parameters (like <checked>)
509			if($conf->{tagAttributesToLower}) { $tagBody =~ s{(?<!=)( +([a-z]+))}{lc($1)}eig; }
510			elsif($conf->{tagAttributesToUpper}) { $tagBody =~ s{(?<!=)( +([a-z]+))}{uc($1)}eig; }
511			}
512
513			# Simple XSS & tag attributes protection
514			if($isTagStart && ($conf->{simpleXSS} \|\| $conf->{tagsAllow}->{$tagName}->{validAttributes} \|\| $conf->{tagsAllow}->{$tagName}->{invalidAttributes} \|\| $conf->{tagsAllow}->{$tagName}->{denyAllAttributes} \|\| $conf->{tagsDenyAllAttributes})) {
515			$tagBody =~ s{(?<!<)(\s)([a-z]+)([$sp]=[$sp]*)("[^"]+"\|[^$sp/>]+)} {
516			my ($a, $b, $c, $d) = ($1\|\|'', $2, $3, $4);
517			if($conf->{simpleXSS} && ($b =~ /^on/ig \|\| $d =~ /javascript\|expression/ig)) {
518			'';
519			} elsif(($conf->{tagsDenyAllAttributes} \|\| $conf->{tagsAllow}->{$tagName}->{denyAllAttributes} \|\| ($conf->{tagsAllow}->{$tagName}->{invalidAttributes} && $conf->{tagsAllow}->{$tagName}->{invalidAttributes}->{$b}))
520			&& !(($conf->{tagsAllow}->{$tagName}->{validAttributes} && $conf->{tagsAllow}->{$tagName}->{validAttributes}->{$b})
521			\|\| $conf->{tagsAllow}->{$tagName}->{allowAllAttributes})
522			) {
523			'';
524			} elsif($conf->{tagsAllow}->{$tagName}->{validAttributes} && !$conf->{tagsAllow}->{$tagName}->{validAttributes}->{$b}) {
525			'';
526			} else {
527			$a . $b . $c . $d;
528			}
529			}eig;
530			}
531
532			# Close single tag
533			if($conf->{tagCloseSingle} && $isSingle && !$isSingleClosed) {
534			if(substr($tagBody, length($tagBody) - 2, 1) ne "/") {
535			if(substr($tagBody, length($tagBody) - 2, 1) ne " ") { substr($tagBody, length($tagBody) - 2, 1) .= " /"; } else { substr($tagBody, length($tagBody) - 2, 1) .= "/"; }
536			}
537			}
538
539			# Quote attribute values
540			if($conf->{tagQuoteValues} && $isTagStart) {
541			my $tmp = "";
542			# 1 23 4 5 6
543			while($tagBody =~ m/([<a-z >]+)?((=)(\s*)([^ >]+)([ >]+))?/ig) {
544			$tmp .= $1 if($1);
545			if($2) {
546			$tmp .= $3 if($3);
547			$tmp .= $4 if($4);
548			if($5 && substr($5, 0, 1) ne '"' && substr($5, length($5) - 1, 1) ne '"') { $tmp .= "\"$5\""; } else { $tmp .= $5; }
549			$tmp .= $6 if($6);
550			}
551			}
552			$tagBody = $tmp;
553			}
554
555			# Unquote attribute values
556			if($conf->{tagUnQuoteValues}) {
557			$tagBody =~ s/([a-z]+)(\s)(=)(\s)"([^\=\s">]+)"/$1$2$3$4$5/ig; #"
558			}
559
560			# Saving the tag
561			push(@$tags, {name=>$tagName, body=>$tagBody, content=>$tagContent});
562
563			# Forming flagSet
564			#
565			# \|byte2: _ _ _ _ _ _ isSpace isBreaking\| byte1: _ _ _ _ _ p br\| byte0: _ _ _ _ nobr isTagStart
566			$flagSet2 = 0;
567			if($isSpace) { $flagSet2 \|= 2; }
568			if($isBreaking) { $flagSet2 \|= 1; }
569			$flagSet1 = 0;
570			if($tagName eq "br") { $flagSet1 \|= 1; }
571			if($tagName eq "p") { $flagSet1 \|= 2; }
572			$flagSet0 = 0;
573			if($isTagStart) { $flagSet0 \|= 1; }
574			if($nobrIsOpen) { $flagSet0 \|= 2; }
575			# Planting the marker
576			$strip .= "\x00\x0F" . chr($flagSet2) . chr($flagSet1) . chr($flagSet0) . "\x0F\x00";
577			}
578
579			# Moving the pointer (tag end position + content length)
580			$i = $te + $cl;
581
582			# Eating crs & lfs after dropped tag
583			if($conf->{tagLf} && $dropTag) {
584			while(1) {
585			if(substr($$text, $i + 1, 1) eq "\r") { $i++; } elsif(substr($$text, $i + 1, 1) eq "\n") { $i++; last; } else { last }
586			}
587			}
588			}
589			} else {
590			# This is not a tag, just add the "<" to result
591			$strip .= $c;
592			}
593
594			last if($i == length($$text));
595			$i++;
596			}
597			}
598
599			# ==Find where tag ends
600			sub tagend($$$) {
601			my ($text, $i) = @_;
602
603			my $gotcha = 0;
604			my $quote = 0;
605
606			$i \|= 0;
607
608			while (1) {
609			if (substr($$text, $i, 1) eq '"') { $quote ^= 1; }
610			if (!$quote && substr($$text, $i, 1) eq '>') { $gotcha = $i; }
611			last if ($i == length($$text) \|\| $gotcha);
612			$i++;
613			}
614
615			return $gotcha;
616			}
617
618			# ==Bring everything back to HTML
619			sub planttags($$) {
620			my ($class, $result) = @_;
621			my $i = 0;
622			my $max = length($strip);
623			my $ctag = 0;
624			my $step;
625
626			while (1) {
627			if($i < $max - 2 && substr($strip, $i, 2) eq "\x00\x0F") {
628			$result->{text} .= $$tags[$ctag]->{body};
629			if($$tags[$ctag]->{content}) { $result->{text} .= $$tags[$ctag]->{content}; }
630			$i += $markLength;
631			$ctag++;
632			} else {
633			if($i < $max - 2) { $step = index($strip, "\x00\x0F", $i) - $i; } else { $step = $max - $i; }
634			if($step < 0) { $step = $max - $i; }
635
636			if($step >= 0) {
637			$result->{text} .= substr($strip, $i, $step);
638			$i += $step;
639			}
640			}
641
642			last if($i == $max);
643			}
644			}
645
646			# ==Bring the text to plain mode==
647			sub vanish($$) {
648			my($class, $text) = @_;
649
650			$$text =~ s/«\|“\|&bdquo;\|‘\|&sbquo;\|"\|»\|”\|“\|’\|«\|\|\|\|\|"\|»\|\|\|«\|“\|„\|‘\|‚\|"\|»\|”\|’/"/ig;
651			$$text =~ s/ \| \| / /ig;
652			$$text =~ s/—\|–\|\|\|—\|–/-/ig;
653			$$text =~ s/…\|\|…/.../ig;
654			$$text =~ s/©\|©\|©/(c)/ig;
655			$$text =~ s/®\|®\|®/(r)/ig;
656			$$text =~ s/¼\|¼\|¼/1\/4/ig;
657			$$text =~ s/½\|½\|½/1\/2/ig;
658			$$text =~ s/¾\|¾\|¾/3\/4/ig;
659			}
660
661			# ==Parse the tagsAllow string advanced format==
662			sub parseTagsAllowString($$) {
663			my($class, $string) = @_;
664
665			return {tagsAllow=>{}, tagsDenyAllAttributes=>0} if(!$string);
666
667			my $tagsAllow = {};
668			my $tagsDenyAllAttributes = 0;
669
670			# Should I deny all tag attributes by default?
671			if(substr($string,0,1) eq '\|') {
672			$tagsDenyAllAttributes = 1;
673			substr($string,0,1) = '';
674			};
675
676			# Parsing the Configuration String
677			while($string =~ /([a-z:\|]+)/ig) {
678			my $tBody = $1;
679			my ($tagName) = ($tBody =~ /^([a-z]+)/i)[0];
680
681			last if(!$tagName);
682
683			my $attrList = ();
684			$tagsAllow->{$tagName}->{val}=1;
685
686			if($tBody =~ /^$tagName\\|$/i) {
687			$tagsAllow->{$tagName}->{denyAllAttributes}=1;
688			} elsif($tBody =~ /^$tagName\:$/i) {
689			$tagsAllow->{$tagName}->{allowAllAttributes}=1;
690			} else {
691			while($tBody =~ /:([a-z]+)/ig) {
692			$tagsAllow->{$tagName}->{validAttributes}->{$1}=1;
693			}
694
695			while($tBody =~ /\\|([a-z]+)/ig) {
696			if(!$tagsAllow->{$tagName}->{validAttributes}->{$1}) {
697			$tagsAllow->{$tagName}->{invalidAttributes}->{$1}=1;
698			}
699			}
700			}
701			}
702
703			return {tagsAllow=>$tagsAllow, tagsDenyAllAttributes=>$tagsDenyAllAttributes};
704			}
705
706			# ==Parse the tagsAllow string advanced format==
707			sub parseTagsDenyString($$) {
708			my($class, $string) = @_;
709
710			return {} if(!$string);
711
712			my $tagsDeny = {};
713			while($string =~ /([a-z]+)/ig) {
714			$tagsDeny->{$1}->{val}=1;
715			}
716
717			return $tagsDeny;
718			}
719
720			return 1;

Repository List / Contenido / koi8 / core / ports / all / Jevix / MakeupUTF.pm @ HEAD