Repository List / Contenido / utf8 / core / lib / Contenido / Parser / Util.pm @ r98

Line #	Revision	Author
1	98	ahitrov	package Contenido::Parser::Util;
2
3			use strict;
4
# clean_invalid_chars(\$content)
#
# Strips, in place, the characters excluded by the XML 1.0 "Char"
# production (http://www.w3.org/TR/REC-xml/#NT-Char) from the string
# that $cont_ref points to:
#   - C0 control characters other than TAB (\x09), LF (\x0A) and CR (\x0D)
#   - the surrogate range U+D800..U+DFFF
#   - the code points U+FFFE and U+FFFF
#
# Argument: a reference to the scalar to clean (the scalar is modified).
# Returns:  the result of the substitution - the number of characters
#           removed, or a false value when nothing was removed.
sub clean_invalid_chars {
    my ($cont_ref) = shift;

    # NOTE: no '|' separators inside the bracketed class. Within [...] a
    # pipe is an ordinary member, so listing it would also delete every
    # literal '|' from the content.
    return $$cont_ref =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F\x{D800}-\x{DFFF}\x{FFFE}\x{FFFF}]//g;
}
9
# text_cleanup($text [, $delim])
#
# Normalizes free-form text into a list of trimmed paragraphs and joins
# them back into a single string.
#
# Arguments:
#   $text  - the text to clean up.
#   $delim - optional string placed between the resulting paragraphs;
#            defaults to "\n\n" (a false value such as '' or '0' also
#            selects the default).
#
# Returns the cleaned text; an empty string when $text is undefined or
# contains nothing but whitespace.
sub text_cleanup {
    my $text  = shift;
    my $delim = shift || "\n\n";

    return '' unless defined $text;

    # Trim the whole text, then normalize CRLF line endings.
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\r\n/\n/g;

    # If the text contains blank-line separated paragraphs, split on those
    # only (single newlines stay inside a paragraph); otherwise treat every
    # line as a paragraph of its own.
    my @paragfs = $text =~ /\n{2,}/
        ? split(/\n{2,}/, $text)
        : split(/\n+/,    $text);

    for (@paragfs) {
        s/^\s+//mg;             # trim leading whitespace of every line
        s/\s+$//mg;             # trim trailing whitespace of every line
        s/[[:blank:]]+/ /g;     # collapse runs of spaces/tabs
    }

    # Drop empty paragraphs and join with the requested delimiter
    # (previously "\n\n" was hard-coded here and $delim was ignored).
    return join $delim, grep { length $_ } @paragfs;
}
28
29			1;