Group
Extension

Text-TEI-Collate/scripts/show_columns.pl

#!/usr/bin/perl -w

use strict;
use lib 'lib';
use Data::Dumper;
use Getopt::Long;
use Storable qw( store_fd retrieve );
#use Text::WagnerFischer::Armenian;
use Text::TEI::Collate;
#use Words::Armenian;

binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
eval { no warnings; binmode $DB::OUT, ":utf8"; };

# Default option values
my( $col_width, $fuzziness, $language ) = ( 25, 50, 'Default' );
my( $CSV, $storable, $outfile, $infile, $text, $cx, $json, $debug, %argspec );

GetOptions( 'csv' => \$CSV,
	    'width=i' => \$col_width,
	    'storable' => \$storable,
	    'outfile=s' => \$outfile,
	    'store=s' => \$infile,
	    'text' => \$text,
	    'cx' => \$cx,
	    'json' => \$json,
	    'debug:i' => \$debug,
	    'argspec=s' => \%argspec,
	    'fuzziness=i' => \$fuzziness,
	    'l|language=s' => \$language,
    );

## Option checking
if( defined $debug ) {
    # If it's defined but false, no level was passed.  Use default 1.
    $debug = 1 unless $debug;
} else {
    $debug = 0;
}
if( $storable && !$outfile ) {
    warn( "Cannot output Storable data without an output file target" );
    exit;
}
if( $storable && $infile ) {
    warn( "You probably don't want to store a Storable.  Try again." );
    exit;
}
if( $outfile ) {
    open( OUT, ">$outfile" ) or die "Cannot open $outfile for writing";
}
my $fuzzy_hash = { 'short' => 6, 'shortval' => 50 };
$fuzzy_hash->{'val'} = $fuzziness;

unless( keys %argspec ) {
    %argspec = ( 'word' => 'shorter', 'sliding' => 0, 'inclusive' => 0 );
}

## Get busy. 
my( @files ) = @ARGV;
if( $json ) {  # The 'file' is the JSON string.
    my @lines;
    while( <> ) {
	push( @lines, $_ );
    }
    @files = ( join ( '', @lines ) );
}

my $aligner = Text::TEI::Collate->new( 'fuzziness_sub' => \&fuzzy_match,
				       'debug' => $debug,
				       'language' => $language,
    );


my @mss;
if( $infile ) {
    no warnings 'once'; 
    $Storable::Eval = 1;
    my $savedref = retrieve( $infile );
    @mss = @$savedref;
    @files = map { $_->sigil } @mss;
} else {
    foreach ( @files ) {
	push( @mss, $aligner->read_source( $_ ) );
    }
    $aligner->align( @mss );
    if( $cx || $json ) {
	# We didn't have filenames to display; use sigla instead.
	@files = map { $_->sigil } @mss;
    }
}

if( $storable ) {
    # Store the array.
    no warnings 'once';
    $Storable::Deparse = 1;
    store_fd( \@mss, \*OUT );
    exit;
} 

# Print the array.
print_fnames() if $CSV;
foreach my $i ( 0 .. $#{$mss[0]->words } ) {
    my $output_str;
    if( $CSV ) {
	$output_str = join( ',', map { '"' . $_->words->[$i]->printable . '"' } @mss ) . "\n";
    } else {
	my $format = '%-' . $col_width . "s";
	$output_str = sprintf( "%-4d:", $i ) . join( '| ', map { sprintf( $format, $_->words->[$i]->printable ) } @mss ) . "\n";
    }
    # print_fnames( $i ) unless ( $CSV || $i % 100 );
    print $output_str;
}


print "Done.\n";


sub open_file {
    my( $file ) = shift;
    local *FH;
    open( FH, "<:utf8", $file ) or die "Could not open file $file\n";
    return *FH;
}

sub print_fnames {
    print "\n\tLine $_[0]\n" if $_[0];
    my @titles = @files;
    if( $cx ) {
	@titles = ( 0 .. $#files );
    } elsif( $json ) {
	@titles = map { $_->sigil } @mss;
    }
    if( $CSV ) {
	print join( ',', @titles ) . "\n";
    } else {
	print join( '| ', map { sprintf( "%-25s", $_ ) } @titles ) . "\n";
    }
}

sub fuzzy_match {
    my ( $str1, $str2, $dist ) = @_;
    my $ref_str;
    if( $argspec{'word'} eq 'first' ) {
	$ref_str = $str1;
    } elsif( $argspec{'word'} eq 'longer' ) {
	 $ref_str = length( $str1 ) > length( $str2 ) ? $str1 : $str2;
    } elsif( $argspec{'word'} eq 'shorter' ) {
	 $ref_str = length( $str1 ) < length( $str2 ) ? $str1 : $str2;
    }
    
    my $fuzz = $fuzzy_hash->{'val'};
    if( length( $ref_str ) <= $fuzzy_hash->{'short'} 
	&& $argspec{'sliding'} ) {
	$fuzz = $fuzzy_hash->{'shortval'};
    }
    if( $argspec{'inclusive'} ) {
	return( $dist <= ( length( $ref_str ) * $fuzz / 100 ) );
    } else {
	return( $dist < ( length( $ref_str ) * $fuzz / 100 ) );
    }
}

sub normalize_unicode {
    my $word = shift;
    my @normalized;
    my @letters = split( '', lc( $word ) );
    foreach my $l ( @letters ) {
	my $d = chr( ord( NFKD( $l ) ) );
	push( @normalized, $d );
    }
    return join( '', @normalized );
}


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.