1 |
98 |
ahitrov |
package Contenido::Parser::Util; |
2 |
|
|
|
3 |
|
|
use strict; |
4 |
|
|
|
5 |
|
|
# Remove characters that are not permitted in XML 1.0 documents from a string,
# modifying the string in place.
# See http://www.w3.org/TR/REC-xml/#NT-Char
#
# Removed code points:
#   - C0 control characters other than TAB (0x09), LF (0x0A) and CR (0x0D);
#   - the surrogate range U+D800 .. U+DFFF;
#   - the non-characters U+FFFE and U+FFFF.
#
# Argument: a reference to the scalar holding the text to clean.
# Returns:  the result of the substitution, i.e. the number of characters
#           removed, or a false value when nothing had to be removed.
sub clean_invalid_chars {
    my ($cont_ref) = @_;

    # There are deliberately no '|' separators inside the bracketed class:
    # within [...] a pipe is a literal character, so listing it would delete
    # every '|' from the text together with the invalid characters.
    $$cont_ref =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F\x{D800}-\x{DFFF}\x{FFFE}\x{FFFF}]//g;
}
9 |
|
|
|
10 |
|
|
# Normalise free-form text into a list of tidy paragraphs joined by a
# delimiter.
#
# Steps:
#   1. trim leading/trailing whitespace of the whole text;
#   2. convert CRLF line endings to LF;
#   3. split into paragraphs: on runs of two or more newlines when the text
#      contains any, otherwise on every newline;
#   4. inside each paragraph trim every line and collapse runs of spaces/tabs
#      into a single space;
#   5. drop empty paragraphs and join the rest with the delimiter.
#
# Arguments:
#   $text  - the text to clean up;
#   $delim - optional string placed between paragraphs in the result;
#            defaults to "\n\n" (also used when a false value is passed).
# Returns: the cleaned-up text; an empty string when there is nothing left.
sub text_cleanup {
    my $text  = shift;
    my $delim = shift || "\n\n";

    # Nothing to clean: the result is the same empty string the steps below
    # would produce, without operating on an undefined value.
    return '' unless defined $text;

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\r\n/\n/g;

    # Blank lines mark paragraph boundaries when present; without any blank
    # line every single line is treated as a paragraph of its own.
    my $separator  = $text =~ /\n{2,}/ ? qr/\n{2,}/ : qr/\n+/;
    my @paragraphs = split $separator, $text;

    for my $paragraph (@paragraphs) {
        $paragraph =~ s/^\s+//mg;           # trim the start of every line
        $paragraph =~ s/\s+$//mg;           # trim the end of every line
        $paragraph =~ s/[[:blank:]]+/ /g;   # collapse spaces and tabs
    }

    # Join with the requested delimiter (previously "\n\n" was hard-coded
    # here and the $delim argument was silently ignored).
    return join $delim, grep { length } @paragraphs;
}
28 |
|
|
|
29 |
|
|
1; |