Yet Another CPAN Grep

DTA-CAB/CAB/Analyzer/LangId/Simple.pm

## -*- Mode: CPerl -*-
##
## File: DTA::CAB::Analyzer::LangId::Simple.pm
## Author: Bryan Jurish <moocow@cpan.org>
## Description: language identification using stopword lists

##==============================================================================
package DTA::CAB::Analyzer::LangId::Simple;
use DTA::CAB::Analyzer::Dict::Json;
use Carp;
use strict;
our @ISA = qw(DTA::CAB::Analyzer::Dict::Json);

## $obj = CLASS_OR_OBJ->new(%args)
##  + object structure: see DTA::CAB::Analyzer::Dict::Json
sub new {
  my $that = shift;
  my $lid = $that->SUPER::new(
			      ##-- analysis selection
			      label      => 'lang',
			      #slabel     => 'lang', ##-- sentence-level label
			      #vlabel     => 'lang_counts', ##-- DEBUG: verbose sentence-level counts, empty or undef for none
			      defaultLang => 'de',
			      defaultCount => 0.1,  ##-- bonus count for default lang (characters)
			      minSentLen   => 2,    ##-- minimum number of tokens in sentence required before guessing
			      minSentChars => 8,    ##-- minimum number of text characters in sentence required begore guessing

			      ##-- user args
			      @_
			     );
  return $lid;
}

##==============================================================================
## Methods: Prepare

## $bool = $dic->ensureLoaded()
##  + ensures analyzer data is loaded from default files
sub ensureLoaded {
  my $lid = shift;
  return $lid->SUPER::ensureLoaded(@_) && $lid->decodeDictValues();
}


##==============================================================================
## Methods: Analysis

##------------------------------------------------------------------------
## Methods: Analysis: v1.x: API

## $doc = $anl->analyzeTypes($doc,\%types,\%opts)
##  + perform type-wise analysis of all (text) types in $doc->{types}
sub analyzeTypes {
  my ($lid,$doc,$types,$opts) = @_;

  ##-- common vars
  my $label  = $lid->{label} || $lid->defaultLabel;
  my $slabel = $lid->{slabel} || $label;
  my $swd    = $lid->{ttd}{dict};
  my $allow_re = defined($lid->{allowRegex}) ? qr($lid->{allowRegex}) : undef;
  my $l0     = $lid->{defaultLang};
  my (@l);

  ##-- word-wise analysis
  my ($l,$prev);
  foreach (values %$types) {
    next if (defined($allow_re) && $_->{text} !~ $allow_re);

    ##-- list check
    @l = (defined($l=$swd->{lc($_->{text})}) ? @$l : qw());

    ##-- local analysis check(s)
    if (!$_->{xlit} || !$_->{xlit}{isLatinExt}) {
      if    ($_->{text} =~ /^\p{Greek}{2,}$/)  { push(@l, 'el'); }
      elsif ($_->{text} =~ /^\p{Hebrew}{2,}$/) { push(@l, 'he'); }
      elsif ($_->{text} =~ /^\p{Arabic}{2,}$/) { push(@l, 'ar'); }
      elsif ($_->{text} =~ /[[:alpha:]]{2,}/ && $_->{text} !~ /\p{Latin}/) { push(@l,'xy'); } ##-- combination of latin and non-latin characters
    }
    if    ($_->{text} =~ /[\p{InMathematicalOperators}]/) { push(@l,'xy'); }
    elsif ($_->{text} =~ /[[:alpha:]](?:.?)[[:digit:]]/ && $_->{text} !~ m{^[a-zA-Z]+://}) {
      ##-- don't treat links as 'xy' specials
      push(@l,'xy');
    }

    ##-- latin: use {mlatin}, but don't count known NE; workaround for mantis bug #6737
    push(@l, 'la') if ($_->{mlatin} && (!$_->{morph} || !grep {$_->{hi} =~ /\[_NE\]/} @{$_->{morph}}));

    ##-- default language: use {morph} and {msafe}, disregarding NE,FM
    push(@l, $l0) if ($l0 && $_->{morph} && ($_->{msafe}//1) && grep {$_->{hi} !~ /\[_(?:FM|NE)\]/} @{$_->{morph}});

    ##-- exlex language (not really)
    #push(@l, $l0, 'exlex') if (($_->{exlex} && $_->{exlex} ne $_->{text}));

    ##-- make unique
    if (@l) {
      $prev = '';
      $_->{$label} = [map {$prev eq $_ ? qw() : ($prev=$_)} sort @l];
    } else {
      $_->{$label} = undef;
    }
  }

  return $doc;
}


## $doc = $anl->analyzeSentences($doc,\%opts)
sub analyzeSentences {
  my ($lid,$doc,$opts) = @_;

  ##-- common vars
  my $label  = $lid->{label} || $lid->defaultLabel;
  my $slabel = $lid->{slabel} || $label;
  my $vlabel = $lid->{vlabel};
  my $l0     = $lid->{defaultLang} // '';
  my $n0     = $l0 ? ($lid->{defaultCount}//0) : 0;
  my $minlen = $lid->{minSentLen} // 0;
  my $minchrs= $lid->{minSentChars} // 0;
  my $nil    = [];

  ##-- ye olde loope
  my (%ln,$s,$nchrs,$l,$n,$w);
  foreach $s (@{$doc->{body}}) {
    ##-- check minimum sentence length in tokens
    next if (@{$s->{tokens}} < $minlen);

    ##-- count number of stopword-CHARACTERS per language
    %ln = ($l0=>$n0);
    $nchrs = 0;
    foreach $w (@{$s->{tokens}}) {
      $nchrs  += length($w->{text});
      $ln{$_} += length($w->{text}) foreach (@{$w->{$label}//$nil});
    }
    next if ($nchrs < $minchrs);

    ##-- get top-ranked language for this sentence
    ($l,$n) = ($l0,$n0);
    foreach (sort keys %ln) {
      ($l,$n)=($_,$ln{$_}) if ($n < $ln{$_});
    }
    $s->{$slabel} = $l;
    $s->{$vlabel} = {%ln} if ($vlabel); ##-- DEBUG
  }

  return $doc;
}


1; ##-- be happy


__END__
##========================================================================
## POD DOCUMENTATION, auto-generated by podextract.perl, edited

##========================================================================
## NAME
=pod

=encoding utf8

=head1 NAME

DTA::CAB::Analyzer::LangId::Simple - simple language guesser using stopword lists

=cut

##========================================================================
## SYNOPSIS
=pod

=head1 SYNOPSIS

 ##========================================================================
 ## PRELIMINARIES
 
 use DTA::CAB::Analyzer::LangId::Simple;
 
 ##========================================================================
 ## Methods: Prepare
 
 $bool = $lid->ensureLoaded();
 
 ##========================================================================
 ## Methods: Analysis: v1.x: API
 
 $doc = $anl->analyzeTypes($doc,\%types,\%opts);
 $doc = $anl->analyzeSentences($doc,\%opts);
 

=cut

##========================================================================
## DESCRIPTION
=pod

=head1 DESCRIPTION

=cut

##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Analyzer::LangId::Simple: Methods: Constructors etc.
=pod

=head2 Methods: Constructors etc.

=over 4

=item new

 $obj = CLASS_OR_OBJ->new(%args)

Creates a new simple language-guesser object, which inherits
from L<DTA::CAB::Analyzer::Dict::Json|DTA::CAB::Analyzer::Dict::Json>.
Known options in %args:

 ##-- analysis selection
 label      => 'lang', ##-- analyzer label
 defaultLang => 'de',  ##-- default language (if e.g. known by 'morph')
 defaultCount => 0.1,  ##-- bonus count for default lang (characters)
 minSentLen   => 2,    ##-- minimum number of tokens in sentence required before guessing
 minSentChars => 8,    ##-- minimum number of text characters in sentence required begore guessing

=back

=cut


##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Analyzer::LangId::Simple: Methods: Prepare
=pod

=head2 Methods: Prepare

=over 4

=item ensureLoaded

 $bool = $lid->ensureLoaded();

ensures analyzer data is loaded from default files.

=back

=cut

##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Analyzer::LangId::Simple: Methods: Analysis: v1.x: API
=pod

=head2 Methods: Analysis: v1.x: API

=over 4

=item analyzeTypes

 $doc = $anl->analyzeTypes($doc,\%types,\%opts);

perform type-wise analysis of all (text) types in $doc-E<gt>{types}

=item analyzeSentences

 $doc = $anl->analyzeSentences($doc,\%opts);

perform sentence-wise analysis of all sentences in $doc-E<gt>{body}.

=back

=cut

##========================================================================
## END POD DOCUMENTATION, auto-generated by podextract.perl

##======================================================================
## Footer
##======================================================================
=pod

=head1 AUTHOR

Bryan Jurish E<lt>moocow@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2013-2019 by Bryan Jurish

This package is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.24.1 or,
at your option, any later version of Perl 5 you may have available.

=head1 SEE ALSO

L<dta-cab-analyze.perl(1)|dta-cab-analyze.perl>,
L<dta-cab-convert.perl(1)|dta-cab-convert.perl>,
L<dta-cab-http-server.perl(1)|dta-cab-http-server.perl>,
L<dta-cab-http-client.perl(1)|dta-cab-http-client.perl>,
L<dta-cab-xmlrpc-server.perl(1)|dta-cab-xmlrpc-server.perl>,
L<dta-cab-xmlrpc-client.perl(1)|dta-cab-xmlrpc-client.perl>,
L<DTA::CAB::Server(3pm)|DTA::CAB::Server>,
L<DTA::CAB::Client(3pm)|DTA::CAB::Client>,
L<DTA::CAB::Format(3pm)|DTA::CAB::Format>,
L<DTA::CAB(3pm)|DTA::CAB>,
L<perl(1)|perl>,
...

=cut
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.