package Contenido::Parser::Util;

use strict;
use warnings;

# Code points that are not allowed by the XML 1.0 "Char" production
# (http://www.w3.org/TR/REC-xml/#NT-Char):
#   - C0 control characters other than TAB (0x09), LF (0x0A) and CR (0x0D)
#   - the surrogate range U+D800..U+DFFF
#   - the non-characters U+FFFE and U+FFFF
# NB: this is a bracketed character class, so no "|" separators are used
# between the ranges -- inside [...] a "|" would be a literal pipe and
# would cause pipes to be stripped from the content.
my $INVALID_XML_CHARS_RE
    = qr/[\x00-\x08\x0B\x0C\x0E-\x1F\x{D800}-\x{DFFF}\x{FFFE}\x{FFFF}]/;

# clean_invalid_chars(\$content)
#
# Removes, in place, every character that XML 1.0 does not allow from the
# string referenced by the argument.
#
# Parameters:
#   $cont_ref - reference to the scalar holding the text to clean.
#
# Returns the number of characters removed (a false value when nothing was
# removed). Does nothing when the reference or the referenced value is
# undefined.
sub clean_invalid_chars {
    my ($cont_ref) = @_;

    return unless defined $cont_ref && defined ${$cont_ref};

    return ${$cont_ref} =~ s/$INVALID_XML_CHARS_RE//g;
}

# text_cleanup($text [, $delim])
#
# Normalizes free-form text into a list of trimmed paragraphs and joins
# them back into a single string.
#
# Parameters:
#   $text  - the text to clean up.
#   $delim - string placed between the resulting paragraphs; defaults to
#            "\n\n" when omitted or empty.
#
# Processing steps:
#   1. Leading/trailing whitespace of the whole text is removed and CRLF
#      line endings are converted to LF.
#   2. If the text contains a run of two or more newlines it is split on
#      such runs only; otherwise every newline starts a new paragraph.
#   3. Each paragraph has the whitespace at the start and end of each of
#      its lines removed and runs of blanks collapsed to a single space.
#   4. Paragraphs that ended up empty are dropped.
#
# Returns the cleaned paragraphs joined with $delim ('' for undefined or
# blank input).
sub text_cleanup {
    my ($text, $delim) = @_;

    return '' unless defined $text;
    $delim = "\n\n" unless defined $delim && length $delim;

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\r\n/\n/g;

    # Prefer blank-line separated paragraphs; fall back to one paragraph
    # per line when the text has no blank lines at all.
    my $separator_re = $text =~ /\n{2,}/ ? qr/\n{2,}/ : qr/\n+/;
    my @paragraphs   = split $separator_re, $text;

    for my $paragraph (@paragraphs) {
        $paragraph =~ s/^\s+//mg;            # trim the start of every line
        $paragraph =~ s/\s+$//mg;            # trim the end of every line
        $paragraph =~ s/[[:blank:]]+/ /g;    # collapse runs of blanks
    }

    return join $delim, grep { length } @paragraphs;
}

1;