Lingua-Guess/lib/Lingua/Guess.pm
package Lingua::Guess;
use strict;
use warnings;
require 5.008;
use Carp;
use File::Spec::Functions 'catfile';
use Unicode::Normalize qw/NFC/;
use Unicode::UCD 'charinfo';
use JSON::Parse 'read_json';
our $VERSION = '0.03';
# Maximum distance, used by __distance.
our $MAX = 300;
our @BASIC_LATIN = qw/English cebuano hausa somali pig_latin klingon
indonesian hawaiian welsh latin swahili/;
our @EXOTIC_LATIN = qw/Czech Polish Croatian Romanian Slovak Slovene
Turkish Hungarian Azeri Lithuanian Estonian/;
our @ACCENTED_LATIN = (qw/Albanian Spanish French German Dutch Italian
Danish Icelandic Norwegian Swedish Finnish
Latvian Portuguese /, @EXOTIC_LATIN);
our @ALL_LATIN = ( @BASIC_LATIN, @EXOTIC_LATIN, @ACCENTED_LATIN);
our @CYRILLIC = qw/Russian Ukrainian Belarussian Kazakh Uzbek
Mongolian Serbian Macedonian Bulgarian Kyrgyz/;
our @ARABIC = qw/Arabic Farsi Jawi Kurdish Pashto Sindhi Urdu/;
our @DEVANAGARI = qw/Bhojpuri Bihari Hindi Kashmiri Konkani Marathi
Nepali Sanskrit/;
our @SINGLETONS = qw/Armenian Hebrew Bengali Gurumkhi Greek Gujarati
Oriya Tamil Telugu Kannada Malayalam Sinhala
Thai Lao Tibetan Burmese Georgian Mongolian/;
my $dir = __FILE__;
$dir =~ s!\.pm$!!;
my $lang2codes = read_json ("$dir/lang.json");
sub make_ret
{
my ($lang, $score) = @_;
my %ret;
$lang = lc $lang;
my $codes = $lang2codes->{$lang};
if ($codes) {
$ret{code2} = $codes->[0];
$ret{code3} = $codes->[1];
}
$ret{score} = $score;
$ret{name} = $lang;
return \%ret;
}
sub new
{
my ($class, %params) = @_;
if (! $params{modeldir}) {
my $md = "$dir/train";
$params{modeldir} = $md;
}
if (! -d $params{modeldir}) {
croak "Model directory '$params{modeldir}' does not exist";
}
my $self = bless { %params }, $class;
return $self;
}
sub guess
{
my ($self, $string) = @_;
unless (defined $self->{models}) {
$self->load_models ();
}
my @runs = find_runs($string);
my %scripts;
for my $run (@runs) {
$scripts{$run->[1]}++;
}
return $self->identify ($string, %scripts);
}
sub simple_guess
{
my ($self, $string) = @_;
my $got = $self->guess ($string);
return $got->[0]{name};
}
sub load_models
{
my ($self) = @_;
opendir my $dh, $self->{modeldir} or die "Unable to open dir:$!";
my %models;
while (my $f = readdir $dh) {
unless ($f =~ /\.train$/) {
next;
}
my ($name) = $f =~ m|(.*)\.|;
my $path = catfile ($self->{modeldir}, $f);
open my $fh, "<:encoding(utf8)", $path or die "Failed to open file: $!";
my %model;
while (my $line = <$fh>) {
chomp $line;
my ($k, $v) = $line =~ m|(.{3})\s+(.*)|;
unless (defined $k) {
next;
}
$model{$k} = $v;
}
$models{$name} = \%model;
}
$self->{models} = \%models;
}
sub find_runs
{
my ($raw) = @_;
my @chars = split m//, $raw;
my $prev = '';
my @c;
my @runs;
my @run_types;
my $current_run = -1;
for my $c (@chars) {
my $is_alph = $c =~ /[[:alpha:]]/o;
my $inf = get_charinfo ($c);
if ($is_alph and ! ($inf->{block} eq $prev)) {
$prev = $inf->{block};
@c = ();
$current_run++;
$run_types[$current_run] = $prev;
}
push @c, $c;
if ($current_run > -1) {
push @{ $runs[$current_run] }, $c;
}
}
my ($newruns, $newtypes) = reconcile_latin (\@runs, \@run_types);
my $counter = 0;
my @result;
for my $r (@$newruns) {
push @result, [ $r, $newtypes->[$counter]];
$counter++;
}
return @result;
}
# Cached lookups from charinfo
my %cache;
# Look up characters using charinfo, but with a cache to save repeated
# lookups.
sub get_charinfo
{
my ($char) = @_;
my $known = $cache{$char};
if ($known) {
return $known;
}
my $inf = charinfo (ord ($char));
$cache{$char} = $inf;
return $inf;
}
sub reconcile_latin
{
my ($runs, $types) = @_;
my @types = @$types;
my (@new_runs, @new_types);
my $last_type = '';
my $upgrade;
if (has_supplemental_latin (@$types)) {
$upgrade = 'Accented Latin';
}
if (has_extended_latin (@$types)) {
$upgrade = 'Exotic Latin' ;
}
if (has_latin_extended_additional (@$types)) {
$upgrade = 'Superfreak Latin';
}
unless ($upgrade) {
return ($runs, $types);
}
my $run_count = -1;
for my $r (@$runs) {
my $type = shift @types;
if ($type =~ /Latin/) {
$type = $upgrade;
}
unless ($type eq $last_type) {
$run_count++;
}
push @{$new_runs[$run_count]}, @$r;
$new_types[$run_count] = $type;
$last_type = $type;
}
return (\@new_runs, \@new_types);
}
sub has_extended_latin
{
my (@types) = @_;
return scalar grep { /Latin Extended-A/ } @types;
}
sub has_supplemental_latin
{
my (@types) = @_;
return scalar grep { /Latin-1 Supplement/ } @types;
}
sub has_latin_extended_additional
{
my (@types) = @_;
return scalar grep { /Latin Extended Additional/ } @types;
}
sub identify
{
my ($self, $sample, %scripts) = @_;
# Check for Korean
if (exists $scripts{'Hangul Syllables'} ||
exists $scripts{'Hangul Jamo'} ||
exists $scripts{'Hangul Compatibility Jamo'} ||
exists $scripts{'Hangul'}) {
return [make_ret ('korean', 1)];
}
if (exists $scripts{'Greek and Coptic'}) {
return [make_ret ('greek', 1)];
}
if (exists $scripts{'Katakana'} ||
exists $scripts{'Hiragana'} ||
exists $scripts{'Katakana Phonetic Extensions'}) {
return [make_ret ('japanese', 1)];
}
if (exists $scripts{'CJK Unified Ideographs'} ||
exists $scripts{'Bopomofo'} ||
exists $scripts{'Bopomofo Extended'} ||
exists $scripts{'KangXi Radicals'}) {
return [make_ret ('chinese', 1)];
}
if (exists $scripts{'Cyrillic'}) {
return $self->check ($sample, @CYRILLIC);
}
if (exists $scripts{'Arabic'} ||
exists $scripts{'Arabic Presentation Forms-A'} ||
exists $scripts{'Arabic Presentation Forms-B'}) {
return $self->check ($sample, @ARABIC);
}
if (exists $scripts{'Devanagari'}) {
return $self->check ($sample, @DEVANAGARI);
}
# Try languages with unique scripts
for my $s (@SINGLETONS) {
if (exists $scripts{$s}) {
return [make_ret (lc ($s), 1)];
}
}
if (exists $scripts{'Superfreak Latin'}) {
return [make_ret ('vietnamese', 1)];
}
if (exists $scripts{'Exotic Latin'}) {
return $self->check ($sample, @EXOTIC_LATIN);
}
if (exists $scripts{'Accented Latin'}) {
return $self->check ($sample, @ACCENTED_LATIN);
}
if (exists $scripts{'Basic Latin'}) {
return $self->check ($sample, @ALL_LATIN);
}
return [{ name => "unknown script: '". (join ", ", keys %scripts)."'",
score => 1}];
}
sub check
{
my ($self, $sample, @langs) = @_;
my $mod = __make_model ($sample);
my $num_tri = scalar keys %$mod;
my %scores;
for my $key (@langs) {
my $l = lc ($key);
unless (exists $self->{models}{$l}) {
next;
}
my $score = __distance ($mod, $self->{models}{$l});
$scores{$l} = $score;
}
my @sorted = sort { $scores{$a} <=> $scores{$b} } keys %scores;
my @out;
$num_tri ||=1;
for my $s (@sorted) {
my $norm = $scores{$s}/$num_tri;
push @out, make_ret ($s, $norm);
}
my $total = 0.0;
for (@out) {
$total += $_->{score}
}
for (@out) {
$_->{score} /= $total;
}
return \@out;
}
sub __distance
{
my ($m1, $m2) = @_;
my $dist = 0;
for my $k (keys %$m1) {
$dist += (exists $m2->{$k} ? abs($m2->{$k} - $m1->{$k}) : $MAX);
}
return $dist;
}
sub __make_model
{
my ($content) = @_;
my %trigrams;
$content = NFC ($content); # normal form C
# Substitute all non-word characters with spaces
$content =~ s/[^[:alpha:]']/ /g;
for (my $i = 0; $i < length ($content) - 2; $i++) {
my $tri = lc (substr ($content, $i, 3));
$trigrams{$tri}++;
}
my @sorted = sort { $trigrams{$b} == $trigrams{$a} ?
$a cmp $b :
$trigrams{$b} <=> $trigrams{$a} }
grep { !/\s\s/o } keys %trigrams;
my @trimmed = splice (@sorted, 0, 300);
my $counter = 0;
my %res;
for my $t (@trimmed) {
$res{$t} = $counter++;
}
return \%res;
}
1;