package Jevix::MakeupUtf; # ========================================== # # Jevix Version 0.9.5 (utf-8) # # Developed by Igor Askarov # # Please send all suggestions to # Igor Askarov # http://www.jevix.ru/ # # Release date: 21/01/2008 # # === Methods list========================== # # new the constructor # procces entry sub # preset presets selector # makeup makeup the text # quotes quotes processor # cuttags tags processor # tagend looking fo tag end # planttags sub to bring the tags back # vanish sub to remove all the stuff and bring the text to plain mode # parseTagsAllowString parse the tagsAllow string to hash # parseTagsDenyString parse the tagsDeny string to hash # # ========================================== use strict; use warnings; #UTFMODE# use Encode; use utf8; #UTFMODE# my $markLength = 7; my $strip; my $result; my $tags; my $conf; my @singleTags = qw/link input spacer img br hr/; my @breakingTags = qw/p td div hr/; my @spaceTags = qw/br/; my @tagsToEat = qw/script style/; # ==The constructor sub new { my Jevix::MakeupUtf $class = shift; return $class; } # ==Here we've got the input sub process($$$) { my($class, $text, $userConfig) = @_; #UTFMODE# $$text = decode("utf8", $$text); #UTFMODE# $conf = $userConfig ? $userConfig : {presetBasic=>1}; $class->preset(); $strip = ""; $tags = []; $result = {}; $result->{error} = 0; $result->{errorLog} = []; if(!$conf->{isHTML}) { $strip = $$text; } else { $class->cuttags($text, $conf, $result); } if($conf->{quotes}) { $class->quotes($conf); } $class->makeup($conf); $result->{text} = ""; if($conf->{isHTML}) { $class->planttags($result); } else { $result->{text} = $strip; } #UTFMODE# Encode::_utf8_off($result->{text}); #UTFMODE# return $result; } # ==Choosing default setup when necessary sub preset($$) { my ($class) = @_; if(!$conf || $conf->{presetBasic}) { $conf->{isHTML} = 1 if(!defined($conf->{isHTML})); # HTML mode $conf->{lineBreaks} = 1 if(!defined($conf->{lineBreaks})); # Linebreaks to
$conf->{paragraphs} = 0 if(!defined($conf->{paragraphs})); # Paragraphs $conf->{dashes} = 1 if(!defined($conf->{dashes})); # Replace hyphens with dashes when necessary $conf->{dots} = 1 if(!defined($conf->{dots})); # Replace 3 dots with a symbol $conf->{edgeSpaces} = 1 if(!defined($conf->{edgeSpaces})); # Wipe edge space characters $conf->{multiSpaces} = 1 if(!defined($conf->{multiSpaces})); # Wipe multispaces $conf->{redundantSpaces} = 1 if(!defined($conf->{redundantSpaces})); # Wipe redundant spaces $conf->{compositeWordsLength} = 10 if(!defined($conf->{compositeWordsLength})); # The maximim length of composite word to be put inside $conf->{tagLf} = 1 if(!defined($conf->{tagLf})); # Wipe crs and lfs after droppped tag $conf->{nbsp} = 1 if(!defined($conf->{nbsp})); # Insert non-breaking spaces $conf->{quotes} = 1 if(!defined($conf->{quotes})); # Makeup quotes $conf->{qaType} = 0 if(!defined($conf->{qaType})); # Main quotes type $conf->{qbType} = 2 if(!defined($conf->{qbType})); # Nested quotes type $conf->{misc} = 1 if(!defined($conf->{misc})); # Misc substitutions $conf->{codeMode} = 2 if(!defined($conf->{codeMode})); # The way jevix should represent html special characters } # If tagsAllow came as a string if(defined($conf->{tagsAllow}) && !ref($conf->{tagsAllow})) { my $tmp = $class->parseTagsAllowString($conf->{tagsAllow}); $conf->{tagsAllow} = $tmp->{tagsAllow}; $conf->{tagsDenyAllAttributes} = $tmp->{tagsDenyAllAttributes}; } # If tagsDeny came as a string if(defined($conf->{tagsDeny}) && !ref($conf->{tagsDeny})) { $conf->{tagsDeny} = $class->parseTagsDenyString($conf->{tagsDeny}); } } # ==Imposing clear text sub makeup($$) { my ($class, $conf) = @_; # ==Misc # Prepositions my $prp_rus = "а|без|безо|в|вне|во|да|для|до|за|и|из|изо|или|к|как|на|над|надо|не|ни|но|о|об|обо|около|от|ото|по|под|подо|при|про|с|сквозь|со|у|через"; my $prp_eng = "aboard|about|above|absent|across|after|against|along|alongside|amid|amidst|among|amongst|around|as|astride|at|atop|before|behind|below|beneath|beside|besides|between|beyond|but|by|despite|down|during|except|following|for|from|in|inside|into|like|mid|minus|near|nearest|notwithstanding|of|off|on|onto|opposite|out|outside|over|past|re|round|save|since|than|through|throughout|till|to|toward|towards|under|underneath|unlike|until|up|upon|via|with|within|without"; my $prp = "$prp_rus|$prp_eng"; my $letters = "A-Za-zА-Яа-яЁёЙй"; # Characters my $cap_letters = "A-ZА-ЯЁё"; # Capital characters my $sp = " \xA0\t"; # space class my $rt = "\r?\n"; # cr class my $br = "\x00\x0F.[\x01\x03].\x0F\x00"; # br tag my $pt = "\x00\x0F.[\x02].\x0F\x00"; # Paragraph tag my $ps = "\x00\x0F.[\x02][\x01\x03]\x0F\x00"; # Paragraph start my $pe = "\x00\x0F.[\x02][\x02\x00]\x0F\x00"; # Paragraph end my $to = "\x00\x0F..[\x03\x01]\x0F\x00"; # Opening tag my $tc = "\x00\x0F..[\x02\x00]\x0F\x00"; # Closing tag my $bb = "\x00\x0F..[\x02\x03]\x0F\x00"; # Tag where is open my $nb = "\x00\x0F..[\x01\x00]\x0F\x00"; # Tag where no open my $ts = "\x00\x0F"; # Tag start my $te = "\x0F\x00"; # Tag end my $brt = "
"; # br tag in text mode my $pst = "

"; my $pet = "

"; # Codes, metasymbols or what ever? my ($cdash, $cnbsp, $cdots, $cfracs, $ccopy, $creg); if(!$conf->{codeMode}) { ($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("—", " ", "…", "©", "®"); $cfracs = {'1/4'=>"¼", '1/2'=>"½", '3/4'=>"¾"}; } elsif($conf->{codeMode} == 1) { ($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("—", " ", "…", "©", "®"); $cfracs = {'1/4'=>"¼", '1/2'=>"½", '3/4'=>"¾"}; } else { ($cdash, $cnbsp, $cdots, $ccopy, $creg) = ("—", " ", "…", "©", "®"); $cfracs = {'1/4'=>"¼", '1/2'=>"½", '3/4'=>"¾"}; } # Wiping edge spaces if($conf->{edgeSpaces}) { $strip =~ s/^[$sp\r\n]*(.+?)[$sp\r\n]*$/$1/isg; } # Wiping spaces between tags ( ) if($conf->{tagSpaces}) { $strip =~ s/($tc)[$sp]($tc)/$1$2/isg; } # Wiping multispaces if($conf->{multiSpaces}) { $strip =~ s/([$sp]){2,}/$1/ig; } # Wiping redundant spaces if($conf->{redundantSpaces}) { $strip =~ s{([$sp]+(?![:;]-[)(])([;:,.)?!]))|(\()(?{nbsp}) { # Prepositions with   $strip =~ s/(^|\x00|[$sp])($prp)[$sp]([0-9$letters])/$1$2$cnbsp$3/gm; #   with digits $strip =~ s{($nb|^)(.*?)($bb|$)}{ my ($a, $b, $c) = ($1, $2, $3); $b =~ s/([0-9]+)([$sp]+| | )(?:(?=[0-9]{2,})|(?=%))/$1$cnbsp/ig; "$a$b$c"; }eisg; } # Put composite words inside
if($conf->{compositeWords}) { $strip =~ s{($nb|^)(.*?)($bb|$)}{ my ($a, $b, $c) = ($1, $2, $3); $b =~ s{(^|[$sp\x00]| )([$letters]+(?:-[$letters]+)+)(?=$|[$sp\x00])}{ my $d = !defined($1) ? "" : $1; my $e = !defined($2) ? "" : $2; my $f = !defined($3) ? "" : $3; if(length($e) <= $conf->{compositeWordsLength}) { "$d$e<\/nobr>" } else {"$d$e$f"} }eig; "$a$b$c"; }eisg; } # Dots if($conf->{dots}) { $strip =~ s/\.{3}|…|…/$cdots/ig; } # Dashes if($conf->{dashes}) { # Hyphen $strip =~ s/([^$sp])([$sp]| | )(-{1,2}|—|—|—)/$1$cnbsp$cdash/ig; # "Speech" hyphen $strip =~ s/((?:^|$ps|$br|$brt(?:$rt)*|[$rt]))[$sp]*(?: )*(-{1,2}|—|—|—)[$sp]*(?: )*(.)/$1$cdash$cnbsp$3/ig; } # Misc stuff if($conf->{misc}) { # Fracs $strip =~ s{(?:(?<=[$sp\x00])|(?<=^))([13])/([24])(?:(?=[$sp\x00])|(?=$))}{if(defined($cfracs->{"$1/$2"})) { $cfracs->{"$1/$2"} } else { "$1/$2" } }esg; # Copyright & registered $strip =~ s{(?:(?<=[$sp\x00])|(?<=^))(\([cr]\)|©|©)(?:(?=[$sp\x00?!;.,])|(?=$))}{ if((lc($1) eq "(c)") || (lc($1) eq "©") || ($1 eq "©")) {$ccopy} elsif((lc($1) eq "(r)") || (lc($1) eq "®") || ($1 eq "®")) {$creg} else { $2 } }eig; } # Paragraphs if($conf->{paragraphs}) { $strip =~ s{(^|$pe(?:$rt$rt)?|$rt$rt)(?!$ps)(.+?)($br)?($brt)?(?|$br)+$/)) ? "$a$b$c" : "$a

$b

";}eisg; } # Line break if($conf->{lineBreaks}) { $strip =~ s/(?$1/isg; } } # ==impose quotes sub quotes($$) { my ($class, $conf) = @_; my $i; my ($a_open, $b_open) = (0,0); my ($cp, $c, $cn, $cn_is_sp, $cp_is_sp) = ('', '', '', 0, 0); my ($qaStart, $qaEnd, $qbStart, $qbEnd); my (@qs, @qe, @qs_ansi, @qe_ansi, @qs_html, @qe_html, @qs_ent, @qe_ent,); # space class my $sp =" \t\xA0"; # characters my $letters = "A-Za-zА-Яа-яЁёЙй"; @qs_ansi = ("«", "“", "„", "‘", "‚", '"'); @qe_ansi = ("»", "”", "“", "’", "‘", '"'); @qs_html = ("«", "“", "„", "‘", "‚", """); @qe_html = ("»", "”", "“", "’", "‘", """); # << `` .. ` . " @qs_ent = ("«", "“", "„", "‘", "‚", """); # >> '' '' ' ` " @qe_ent = ("»", "”", "“", "’", "‘", """); # Quotes collection if(!$conf->{codeMode}) { @qs = @qs_ansi; @qe = @qe_ansi; } elsif ($conf->{codeMode} == 1) { @qs = @qs_html; @qe = @qe_html; } else { @qs = @qs_ent; @qe = @qe_ent; } # Getting configuration setting $conf->{qaType} ||= 0; $conf->{qbType} ||= 1; $conf->{qaType} = ($conf->{qaType} >= 0 && $conf->{qaType} <= 5) ? $conf->{qaType} : 0; $conf->{qbType} = ($conf->{qbType} >= 0 && $conf->{qbType} <= 5) ? $conf->{qbType} : 1; # Selecting quotes as requested by user ($qaStart, $qaEnd) = ($qs[$conf->{qaType}], $qe[$conf->{qaType}]); ($qbStart, $qbEnd) = ($qs[$conf->{qbType}], $qe[$conf->{qbType}]); # Resetting all the quotes inside text to <"> my $qa = join('|', @qs_ansi) . '|' . join('|', @qe_ansi) . '|' . join('|', @qs_html) . '|' . join('|', @qe_html) . '|' . join('|', @qs_ent) . '|' . join('|', @qe_ent); $strip =~ s/(?:(?:(?<=[^$letters])|(?<=^))($qa))|(?:($qa)(?:(?=[^$letters])|(?=$)))/\"/ig; my $spread = 1; my $mv = 0; my $mvn = 0; my @st; $i = 0; my $skip = 0; my @space; # Space tags flag my @break; # Text break flags $st[$_] = '' foreach(0..$spread + 1); $space[$_] = 0 foreach(0 + 1..$spread + 1); $break[$_] = 0 foreach(0 + 1..$spread + 1); $space[0] = 1; $break[0] = 1; while(1) { # Skipping tags foreach(0..$spread) { do { $skip = 0; if($i + $_ + $mv <= length($strip)) { if($i + $_ + $mv + 1 < length($strip)) { if((substr($strip, $i + $_ + $mv, 1) eq "\x00") && (substr($strip, $i + $_ + $mv + 1, 1) eq "\x0F")) { $space[$_ + 1] |= (ord(substr($strip, $i + $_ + $mv + 2, 1)) & 2) >> 1; $break[$_ + 1] |= ord(substr($strip, $i + $_ + $mv + 2, 1)) & 1; $mv += $markLength; if(!$_) { $mvn = $mv; } $st[$_ + 1] = ""; $skip = 1; } } if(!$skip) { $st[$_ + 1] = substr($strip, $i + $_ + $mv, 1); } } } while($skip); } $i += $mvn; $mv = 0; $mvn = 0; ($cp, $c, $cn) = ($st[0], $st[1], $st[2]); $cp_is_sp = (($cp =~ /[^0-9$letters]/) || $space[0] || $space[1] || $break[0] || !$i) ? 1 : 0; $cn_is_sp = (($cn =~ /[^0-9$letters]/) || $space[2] || $break[2] || $cn eq '') ? 1 : 0; # Reset state if breaking tag appears if($break[1] || $i == length($strip)) { if($a_open || $b_open) { # Log quote error if appears if($conf->{logErrors}) { my $quoteErrSampleLength = 100; my $z = $i - 1; my $y; while(1) { if(substr($strip, $z, 1) eq " " || substr($strip, $z, 1) eq "\xA0" || !$z) { if($i-$z <= $quoteErrSampleLength) {$y = $z}} last if(!$z); $z--; } my $sample = substr($strip, $y, ($i - $y)); $sample =~ s/\x00\x0F[^\x0F]+\x0F\x00//g; $sample =~ s/<\/?[a-z]+.*?>//g; push(@{$result->{errorLog}}, {type=>"Quote_error", message=>"Quote mismatch near [$sample]<--"}); $result->{error} = 1; } } $a_open = 0; $b_open = 0; } if($c eq '"') { if(!$a_open) { $a_open = 1; substr($strip, $i, 1) = $qaStart; $i += length($qaStart) - 1; } elsif ($a_open && (($i == length($strip) - 1) || (!$b_open && $cn_is_sp))) { $a_open = 0; substr($strip, $i, 1) = $qaEnd; $i += length($qaEnd) - 1; } elsif ($a_open && !$b_open) { $b_open = 1; substr($strip, $i, 1) = $qbStart; $i += length($qbStart) - 1; } elsif ($a_open && $b_open) { $b_open = 0; substr($strip, $i, 1) = $qbEnd; $i += length($qbEnd) - 1; } } last if($i == length($strip)); $st[0] = $st[1]; $space[0] = $space[1]; $break[0] = $break[1]; $space[$_] = 0 foreach(0 + 1..$spread + 1); $break[$_] = 0 foreach(0 + 1..$spread + 1); $i++; } } # ==Cutting the tags away sub cuttags($$$$) { my($class, $text, $conf, $result) = @_; my $i = 0; # loop counter my $hop; # Jump length my ($c, $cn); # current & next character my ($tl, $ts, $te, $cl, $tagName, $tagBody, $tagContent); # tag length, tag dimensions, tag name, tag body text, single tag flag, content inside the tag my ($isTag, $isTagStart, $isSingle, $isSingleClosed, $isSpace, $isBreaking, $nobrIsOpen, $flagSet2, $flagSet1, $flagSet0); # some useful flags my @tagsOpen; # an array storing the info about all the tags currently open # space class my $sp =" \t\xA0"; while(1) { $hop = index($$text, "<", $i); if($hop < 0) { $strip .= substr($$text, $i, length($$text) - $i); last; } elsif($hop > 0) { $strip .= substr($$text, $i, $hop - $i); $i = $hop; } ($c, $cn) = unpack("aa", substr($$text, $i, 2)); $isTag = 0; # =If tag opens $isTagStart = ($cn =~ /!|[a-z]/i) ? 1 : 0; if($isTagStart || ($cn eq "/")) { $isTag = 1; } if($isTag) { $ts = $i; # Tag start position $te = $isTagStart ? tagend($text, $ts) : index($$text, ">", $ts); # Tag end position if($te) { $tagBody = substr($$text, $ts, $te - $ts + 1); $tagName = $isTagStart ? ($tagBody =~ m/^<([a-z]+)/i)[0] : ($tagBody =~ m/^<\/\s*([a-z]+)/i)[0]; $tagName =~ tr/A-Z/a-z/; } if($te && $tagName) { $tagBody = substr($$text, $ts, $te - $ts + 1); $tagName = $isTagStart ? ($tagBody =~ m/^<([a-z]+)/i)[0] : ($tagBody =~ m/^<\/\s*([a-z]+)/i)[0]; $tagName =~ tr/A-Z/a-z/; # =Flags # Detecting whether the tag is single (self-closing) or double $isSingleClosed = 0; $isSingle = 0; if($isTagStart) { if(grep{$tagName eq $_} @singleTags) { $isSingle = 1; } elsif (substr($tagBody, length($tagBody) - 2, 1) eq "/") { $isSingle = 1; $isSingleClosed = 1; } } # Detecting wether this is space tag or not $isSpace = (grep{$tagName eq $_} @spaceTags) ? 1 : 0; # Detecting wether this is breaking tag or not $isBreaking = (grep{$tagName eq $_} @breakingTags) ? 1 : 0; # Tag Length $tl = $te - $ts + 1; # Updating the status for tags open if($conf->{checkHTML} && !$isSingle) { if($isTagStart) { push(@tagsOpen, $tagName); } else { if($tagsOpen[$#tagsOpen] ne $tagName) { # HTML error $result->{error} = 1; if($conf->{logErrors}) { push(@{$result->{errorLog}}, {type=>"HTML_Parse", position=>$i, message=>"Found closing tag <$tagName> while waiting tag <" . $tagsOpen[$#tagsOpen] . "> to close!"}); } } else { pop(@tagsOpen); } } } # Eating tag content for some tags like