Group
Extension

Lingua-PT-Capitalizer/lib/Lingua/PT/Capitalizer.pm

package Lingua::PT::Capitalizer;

use common::sense;
use English qw[-no_match_vars];

use base qw[ Exporter ];
our @EXPORT = q(capitalize);

our $VERSION = '0.001'; # VERSION

our %lc_always = (
    q(a)   => 1,
    q(á)   => 1,
    q(à)   => 1,
    q(aos) => 1,
    q(às)  => 1,
    q(d')  => 1,    # elisão
    q(da)  => 1,
    q(das) => 1,
    q(de)  => 1,
    q(di)  => 1,
    q(do)  => 1,
    q(dos) => 1,
    q(e)   => 1,
    q(na)  => 1,
    q(nas) => 1,
    q(no)  => 1,
    q(nos) => 1,
    q(o)   => 1,
    q(os)  => 1,
);

sub capitalize {
    my ( $self, $text, $preserve_caps )
        = ref $_[0] eq __PACKAGE__
        ? @_
        : ( undef, @_ );

    $text //= $_;

    return unless defined $text;

    my @token = _pt_word_tokenizer($text);

    foreach ( my $i = 0; $i <= $#token; $i++ ) {
        foreach ( $token[$i] ) {
            if ( $preserve_caps ~~ q(1)
                && m{^\p{Lu}+(?:\N{APOSTROPHE})?$}msx )
            {
                next;
            }

            my $lc = lc;
            when ( $i == 0 ) { $_ = ucfirst $lc; }
            when ( substr( $_, -1 ) eq q(.) ) { $_ = ucfirst $lc; }
            when ( exists $lc_always{$lc} ) { $_ = $lc; }
            default {
                $_ = ucfirst $lc;
            }
        } ## end foreach ( $token[$i] )
    } ## end foreach ( my $i = 0; $i <= ...)

    $text = join q(), @token;

    return $text;
} ## end sub capitalize

sub _pt_word_tokenizer {
    my $text = shift;

    my $re = qr{(\p{L}+(?:\N{APOSTROPHE}|\p{L}{0,2}))}msx;

    my @token = split $re, $text;
    @token = grep { defined && $_ ne q() } @token;

    return @token;
}

sub new {
    my $class = shift;
    my $self  = {
        capitalize => \&capitalize,
        lc_always  => \%lc_always,
    };
    return bless $self, $class;
}

1;

__END__

=pod

=encoding utf8

=head1 NAME

Lingua::PT::Capitalizer - Simple text capitalize.

=head1 VERSION

version 0.001

=head1 SYNOPSIS

=head2 Procedural Interface

   use Lingua::PT::Capitalizer;
   my $text = q(ESCRITOR, JORNALISTA, CONTISTA E POETA JOAQUIM MARIA MACHADO DE ASSIS);
   say capitalize($text);
   # Output:
   # Escritor, Jornalista, Contista e Poeta Joaquim Maria Machado de Assis

   $text = q(comprehensive perl archive network (CPAN));
   say capitalize($text, 1);
   # Output:
   # Comprehensive Perl Archive Network (CPAN)

=head2 OO Interface

    use Lingua::PT::Capitalizer ();

    my $capitalizer = Lingua::PT::Capitalizer->new();
    my $text = q(ESCRITOR, JORNALISTA, CONTISTA E POETA JOAQUIM MARIA MACHADO DE ASSIS);
    say $capitalizer->capitalize($text);
    # Output:
    # Escritor, Jornalista, Contista e Poeta Joaquim Maria Machado de Assis

    $text = q(comprehensive perl archive network (CPAN));
    say $capitalizer->capitalize($text, 1);
    # Output:
    # Comprehensive Perl Archive Network (CPAN)

=head1 DESCRIPTION

This  module  format   strings  in  title-case mode   using  common typographic
rules for proper names in S<B<Portuguese Language>>.

=head1 SUBROUTINES/METHODS

=head2 capitalize

Receive  one  or  two  arguments  and   return  a  capitalized  string.  If the
second  argument is C<1>, the upper  case words won't be affected.

=head2 lc_always

A  data  structure  with  some  articles  and  prepositions  that  normaly stay
in lower case.

=head1 EXAMPLES

=head2 Using files as input

If F<text.txt>  is a file with  names/titles to capitalize, the  easiest way to
capitalize all is:

    perl -MLingua::PT::Capitalizer -wpE'$_=capitalize' text.txt

=head2 Getting lc_always hash

   # Procedural Interface
   my %lc_always = %{Lingua::PT::Capitalizer::lc_always}

   # OO Interface
   my %lc_always = $capitalizer->{lc_always}

=head1 AUTHOR

Ronaldo Ferreira de Lima aka jimmy <jimmy at gmail>.

=head1 SEE ALSO

Text::Capitalize.

=cut


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.