libqbit-perl/lib/qbit/StringUtils.pm
=head1 Name
qbit::StringUtils - Functions to manipulate strings.
=cut
package qbit::StringUtils;
{
$qbit::StringUtils::VERSION = '0.6';
}
use strict;
use warnings;
use utf8;
use qbit::GetText;
use qbit::Exceptions;
use base qw(Exporter);
use HTML::Entities;
use URI::Escape qw(uri_escape_utf8);
use Net::LibIDN qw(idn_to_unicode idn_to_ascii);
use JSON::XS ();
use POSIX qw(locale_h);
BEGIN {
our (@EXPORT, @EXPORT_OK);
@EXPORT = qw(
html_encode html_decode uri_escape check_email idn_to_unicode get_domain to_json from_json format_number
);
@EXPORT_OK = @EXPORT;
}
my $RFC822PAT = <<'EOF';
[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\
xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xf
f\n\015()]*)*\)[\040\t]*)*(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\x
ff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"[^\\\x80-\xff\n\015
"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015"]*)*")[\040\t]*(?:\([^\\\x80-\
xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80
-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*
)*(?:\.[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\
\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\
x80-\xff\n\015()]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x8
0-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"[^\\\x80-\xff\n
\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015"]*)*")[\040\t]*(?:\([^\\\x
80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^
\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040
\t]*)*)*@[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([
^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\
\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\
x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-
\xff\n\015\[\]]|\\[^\x80-\xff])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()
]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\
x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:\.[\04
0\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\
n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\
015()]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?!
[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\
]]|\\[^\x80-\xff])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\
x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\01
5()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*)*|(?:[^(\040)<>@,;:".
\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]
)|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015"]*)*")[^
()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\037]*(?:(?:\([^\\\x80-\xff\n\0
15()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][
^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)|"[^\\\x80-\xff\
n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015"]*)*")[^()<>@,;:".\\\[\]\
x80-\xff\000-\010\012-\037]*)*<[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?
:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-
\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:@[\040\t]*
(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015
()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()
]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\0
40)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\
[^\x80-\xff])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\
xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*
)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:\.[\040\t]*(?:\([^\\\x80
-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x
80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t
]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\
\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])
*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x
80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80
-\xff\n\015()]*)*\)[\040\t]*)*)*(?:,[\040\t]*(?:\([^\\\x80-\xff\n\015(
)]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\
\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*@[\040\t
]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\0
15()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015
()]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(
\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|
\\[^\x80-\xff])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80
-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()
]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:\.[\040\t]*(?:\([^\\\x
80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^
\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040
\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".
\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff
])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\
\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x
80-\xff\n\015()]*)*\)[\040\t]*)*)*)*:[\040\t]*(?:\([^\\\x80-\xff\n\015
()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\
\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*)?(?:[^
(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-
\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\
n\015"]*)*")[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|
\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))
[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:\.[\040\t]*(?:\([^\\\x80-\xff
\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\x
ff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(
?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\
000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\
xff\n\015"]*)*")[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\x
ff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)
*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*)*@[\040\t]*(?:\([^\\\x80-\x
ff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-
\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)
*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\
]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]
)[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-
\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\x
ff\n\015()]*)*\)[\040\t]*)*(?:\.[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(
?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80
-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:[^(\040)<
>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x8
0-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])[\040\t]*(?:
\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]
*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)
*\)[\040\t]*)*)*>)
EOF
$RFC822PAT =~ s/\n//g;
my $DOMAIN_PART_RE = '[^:\s\/\.!@#$%^&*()\[\]\{\}\?\+;\'"`\\\\]+';
=head1 Functions
=head2 html_encode
B<Arguments:>
=over
=item
B<$str> - string.
=back
B<Return value:> string with encoded HTML entities.
=cut
sub html_encode($) {
return defined($_[0]) ? encode_entities($_[0]) : '';
}
=head2 html_decode
B<Arguments:>
=over
=item
B<$str> - string.
=back
B<Return value:> string with decoded HTML entities.
=cut
sub html_decode($) {
return defined($_[0]) ? decode_entities($_[0]) : '';
}
=head2 uri_escape
B<Arguments:>
=over
=item
B<$str> - string.
=back
B<Return value:> string with escaped URI entities.
=cut
sub uri_escape($) {
return uri_escape_utf8($_[0]);
}
=head2 check_email
B<Arguments:>
=over
=item
B<$email> - string, E-Mail.
=back
B<Return value:> boolean, TRUE if email is valid.
=cut
sub check_email($) {
my ($email) = @_;
return $email =~ /^$RFC822PAT$/;
}
=head2 get_domain
B<Arguments:>
=over
=item
B<$url> - string, URL;
=item
B<%opts> - additional arguments:
=over
=item
B<ascii> - boolean, convert unicode chars to ascii;
=item
B<www> - boolean, save 'www.'.
=back
=back
B<Return value:> string if domain valid, else nothing.
=cut
sub get_domain($;%) {
my ($url, %opts) = @_;
my $www = $opts{'www'} ? '' : '(?:www\.)?';
$url = lc($url);
$url =~ s/(^\s+)|(\s+$)//g;
if ($url =~ /^(?:https?:\/\/)?$www((?:$DOMAIN_PART_RE\.)*$DOMAIN_PART_RE)\.?($|\/|:\d+|\?)/) {
my $res = $opts{'ascii'} ? idn_to_ascii($1, 'utf-8') : idn_to_unicode($1, 'utf-8');
utf8::decode($res);
return $res;
} else {
return;
}
}
=head2 to_json
B<Arguments:>
=over
=item
B<$data> - scalar.
=back
B<Return value:> string, C<$data> as JSON.
=cut
sub to_json($;%) {
my ($data, %opts) = @_;
my $res;
if ($opts{'pretty'}) {
$res = JSON::XS->new->utf8->allow_nonref->pretty->encode($data);
} else {
$res = JSON::XS->new->utf8->allow_nonref->encode($data);
}
utf8::decode($res);
return $res;
}
=head2 from_json
B<Arguments:>
=over
=item
B<$text> - string, JSON.
=back
B<Return value:> scalar, perl structure from JSON.
=cut
sub from_json($) {
my ($text) = @_;
utf8::encode($text);
return JSON::XS->new->utf8->allow_nonref->decode($text);
}
=head2 format_number
B<Arguments:>
=over
=item
B<$number> - number;
=item
B<%args> - hash, additional arguments:
=over
=item
B<precision>: number, needed precision, if missed then frac will return as is;
B<thousands_sep>: string, thousands separator, default gets from locale;
B<decimal_point>: string, decimal point, default gets from locale.
=back
=back
B<Return value:> string, formatted number.
=cut
sub format_number($%) {
my ($number, %opts) = @_;
my $fmt_precision = ($opts{'precision'} || 0) + 1;
$number = sprintf("%.${fmt_precision}f", $number)
if $number =~ /^(-?[\d.]+)e([+-]\d+)$/; # Convert exponent notation
setlocale(LC_NUMERIC, "$ENV{'LC_ALL'}.utf8") if defined($ENV{'LC_ALL'});
my $localeconv = localeconv();
my $half = 0.50000000000008;
foreach my $opt (qw(thousands_sep decimal_point)) {
unless (defined($opts{$opt})) {
$opts{$opt} = $localeconv->{$opt};
utf8::decode($opts{$opt});
}
}
my ($int, $frac_zero, $frac) =
$number =~ /^(\d+)(?:[^\d](0*)(\d*))?$/
? (int($1), $2, int($3 || 0))
: throw Exception::BadArguments gettext('Invalid number "%s"', $number);
$frac_zero = '' unless defined($frac_zero);
if (defined($opts{'precision'})) {
if ($opts{'precision'} == 0) {
++$int if substr($frac, 0, 1) >= 5;
$frac = '';
} else {
$frac = int("0.$frac_zero$frac" * (10**$opts{'precision'}) + $half);
$frac = substr("$frac_zero$frac", 0, $opts{'precision'});
$frac = "$opts{'decimal_point'}$frac" . ('0' x ($opts{'precision'} - length($frac)));
}
} else {
$frac = $frac == 0 ? '' : "$opts{'decimal_point'}$frac_zero$frac";
}
if (length($int) > 3) {
$int = reverse($int);
$int =~ s/(\d\d\d)(?!$)/$1$opts{'thousands_sep'}/g;
$int = reverse($int);
}
return "$int$frac";
}
1;