Text-TEI-Collate/lib/Text/TEI/Collate.pm
package Text::TEI::Collate;
use strict;
use warnings;
use 5.010;
use vars qw( $VERSION );
use Moose;
use Encode qw( decode_utf8 );
use File::Temp;
use Graph::Easy;
use IPC::Run qw( run binary );
use JSON qw( decode_json );
use Module::Load;
use Text::CSV_XS;
use Text::TEI::Collate::Diff;
use Text::TEI::Collate::Error;
use Text::TEI::Collate::Word;
use Text::TEI::Collate::Manuscript;
use TryCatch;
use XML::LibXML;
$VERSION = "2.1";
eval { no warnings; binmode $DB::OUT, ":utf8" };
### Instance attributes
has 'debuglevel' => (
is => 'ro',
isa => 'Int',
default => 0,
);
has 'title' => (
is => 'rw',
isa => 'Str',
default => 'An nCritic collation',
);
has 'language' => (
is => 'rw',
isa => 'Str',
default => 'Default',
);
has 'fuzziness' => (
is => 'rw',
isa => 'HashRef[Int]',
default => sub{ { 'val' => 40, 'short' => 6, 'shortval' => 50 } },
);
has 'binmode' => (
is => 'ro',
isa => 'Str',
default => 'utf8',
predicate => 'has_binmode',
);
has 'distance_sub' => (
is => 'rw',
isa => 'CodeRef',
);
has 'fuzziness_sub' => (
is => 'rw',
isa => 'CodeRef',
predicate => 'has_fuzziness_sub',
);
=head1 NAME
Text::TEI::Collate - a collation program for variant manuscript texts
=head1 SYNOPSIS
use Text::TEI::Collate;
my $aligner = Text::TEI::Collate->new( 'language' => 'Armenian' );
# Read from strings.
my @manuscripts;
foreach my $str ( @strings_to_collate ) {
push( @manuscripts, $aligner->read_source( $str ) );
}
$aligner->align( @manuscripts; );
# Read from files. Also works for XML::LibXML::Document objects.
@manuscripts = ();
foreach my $xml_file ( @TEI_files_to_collate ) {
push( @manuscripts, $aligner->read_source( $xml_file ) )
}
$aligner->align( @manuscripts );
# Read from a JSON input.
@manuscripts = $aligner->read_source( $JSON_string );
$aligner->align( @manuscripts );
=head1 DESCRIPTION
Text::TEI::Collate is a collation program for multiple (transcribed)
manuscript copies of a known text. It is an object-oriented interface,
mostly for the convenience of the author and for the ability to have global
settings.
The object is the alignment engine, or "aligner". The methods that a user
will care about are "read_source" and "align", as well as the various
output methods; the other methods in this file are public in case a user
needs a subset of this package's functionality.
An aligner takes two or more texts; the texts can be strings, filenames, or
XML::LibXML::Document objects. It returns two or more Manuscript objects --
one for each text input -- in which identical and similar words are lined
up with each other, via empty-string padding.
Please see the documentation for L<Text::TEI::Collate::Manuscript> and
L<Text::TEI::Collate::Word> for more information about the manuscript and
word objects.
=head1 METHODS
=head2 new
Creates a new aligner object. Takes a hash of options; available
options are listed.
=over 4
=item B<debuglevel> - Default 0. The higher the number (between 0 and 3), the
more the debugging output.
=item B<title> - Display title for the collation output results, should those
results need a display title (e.g. TEI or JSON output).
=item B<language> - Specify the language module we should use from those
available in Text::TEI::Collate::Lang. Default is 'Default'.
=item B<fuzziness> - The maximum allowable word distance for an approximate
match, expressed as a percentage of word distance / word length. It can
also be expressed as a hashref with keys 'val', 'short', and 'shortval', if
you want to increase the tolerance for short words (defined as at or below the
value of 'short').
=item B<binmode> - If STDERR should be using something other than UTF-8, you
can set it here. You are probably in for a world of hurt anyway though.
=back
=begin testing
use Text::TEI::Collate;
my $aligner = Text::TEI::Collate->new();
is( ref( $aligner ), 'Text::TEI::Collate', "Got a Collate object from new()" );
=end testing
=cut
# Set the options. Main option is a pointer to the fuzzy matching algorithm
# that the user wishes to use.
around BUILDARGS => sub {
my $orig = shift;
my $class = shift;
my %args = @_;
# Support a single 'fuzziness' argument
if( exists $args{'fuzziness'} && !ref( $args{'fuzziness'} ) ) {
my $fuzz = $args{'fuzziness'};
$args{'fuzziness'} = { val => $fuzz, short => '0', shortval => $fuzz };
}
return $class->$orig( %args );
};
sub BUILD {
my $self = shift;
if( $self->has_binmode ) {
my $b = $self->binmode;
binmode STDERR, ":$b";
}
$self->_use_language( $self->language );
}
around 'language' => sub {
my $orig = shift;
my $self = shift;
if( @_ ) {
# Check that we can use this language.
$self->_use_language( @_ );
}
# We didn't throw an exception? Good.
$self->$orig( @_ );
};
=begin testing
use Text::TEI::Collate;
use TryCatch;
my $aligner = Text::TEI::Collate->new();
is( $aligner->distance_sub, \&Text::TEI::Collate::Lang::Default::distance, "Have correct default distance sub" );
my $ok = eval { $aligner->language( 'Armenian' ); };
ok( $ok, "Used existing language module" );
is( $aligner->distance_sub, \&Text::TEI::Collate::Lang::Armenian::distance, "Set correct distance sub" );
$aligner->language( 'default' );
is( $aligner->distance_sub, \&Text::TEI::Collate::Lang::Default::distance, "Back to default distance sub" );
# TODO test Throwable object
try {
$aligner->language( 'Klingon' );
} catch( Text::TEI::Collate::Error $e ) {
is( $e->ident, 'bad language module', "Caught the lang module error we expected" );
} catch {
ok( 0, "FAILED to catch expected exception" );
}
=end testing
=cut
sub _use_language {
my( $self, $lang ) = @_;
# Are we reverting to a default?
if( !$lang || $lang =~ /default/i ) {
# Use the default.
$lang = 'Default';
}
# Is the given language module defined, and does it have all the
# required subroutines?
my $mod = 'Text::TEI::Collate::Lang::' . $lang;
try {
load( $mod );
} catch {
throw( ident => 'bad language module',
message => "Could not load $lang module: $@" );
}
foreach my $langsub ( qw/ distance canonizer comparator / ) {
unless( $mod->can( $langsub ) ) {
throw( ident => 'bad language module',
message => "Language module $lang has no $langsub function" );
}
}
$self->distance_sub( $mod->can( 'distance' ) );
}
=head2 read_source
Pass in a word source (a plaintext file, a TEI XML file, or a JSON structure)
and a set of options, and get back one or more manuscript objects that can be
collated. Options include:
=over
=item B<encoding> - The encoding of the word source if we are reading from a file.
Defaults to utf-8.
=item B<sigil> - The sigil that should be assigned to this manuscript in the collation
output. Should be a valid XML attribute value. This can also be read from a
TEI XML source.
=item B<identifier> - A string to identify this manuscript (e.g. library, MS number).
Can also be read from a TEI <msdesc/> element.
=back
=begin testing
use XML::LibXML;
my $aligner = Text::TEI::Collate->new();
$aligner->language( 'Armenian' );
# Test a manuscript with a plaintext source, filename
my @mss = $aligner->read_source( 't/data/plaintext/test1.txt',
'identifier' => 'plaintext 1',
);
is( scalar @mss, 1, "Got a single object for a plaintext file");
my $ms = pop @mss;
is( ref( $ms ), 'Text::TEI::Collate::Manuscript', "Got manuscript object back" );
is( $ms->sigil, 'A', "Got correct sigil A");
is( scalar( @{$ms->words}), 181, "Got correct number of words in A");
# Test a manuscript with a plaintext source, string
open( T2, "t/data/plaintext/test2.txt" ) or die "Could not open test file";
my @lines = <T2>;
close T2;
@mss = $aligner->read_source( join( '', @lines ),
'identifier' => 'plaintext 2',
);
is( scalar @mss, 1, "Got a single object for a plaintext string");
$ms = pop @mss;
is( ref( $ms ), 'Text::TEI::Collate::Manuscript', "Got manuscript object back" );
is( $ms->sigil, 'B', "Got correct sigil B");
is( scalar( @{$ms->words}), 183, "Got correct number of words in B");
is( $ms->identifier, 'plaintext 2', "Got correct identifier for B");
# Test two manuscripts with a JSON source
open( JS, "t/data/json/testwit.json" ) or die "Could not read test JSON";
@lines = <JS>;
close JS;
@mss = $aligner->read_source( join( '', @lines ) );
is( scalar @mss, 2, "Got two objects from the JSON string" );
is( ref( $mss[0] ), 'Text::TEI::Collate::Manuscript', "Got manuscript object 1");
is( ref( $mss[1] ), 'Text::TEI::Collate::Manuscript', "Got manuscript object 2");
is( $mss[0]->sigil, 'MsAJ', "Got correct sigil for ms 1");
is( $mss[1]->sigil, 'MsBJ', "Got correct sigil for ms 2");
is( scalar( @{$mss[0]->words}), 182, "Got correct number of words in ms 1");
is( scalar( @{$mss[1]->words}), 263, "Got correct number of words in ms 2");
is( $mss[0]->identifier, 'JSON 1', "Got correct identifier for ms 1");
is( $mss[1]->identifier, 'JSON 2', "Got correct identifier for ms 2");
# Test a manuscript with an XML source
@mss = $aligner->read_source( 't/data/xml_plain/test3.xml' );
is( scalar @mss, 1, "Got a single object from XML file" );
$ms = pop @mss;
is( ref( $ms ), 'Text::TEI::Collate::Manuscript', "Got manuscript object back" );
is( $ms->sigil, 'BL5260', "Got correct sigil BL5260");
is( scalar( @{$ms->words}), 178, "Got correct number of words in MsB");
is( $ms->identifier, 'London OR 5260', "Got correct identifier for MsB");
my $parser = XML::LibXML->new();
my $doc = $parser->parse_file( 't/data/xml_plain/test3.xml' );
@mss = $aligner->read_source( $doc );
is( scalar @mss, 1, "Got a single object from XML object" );
$ms = pop @mss;
is( ref( $ms ), 'Text::TEI::Collate::Manuscript', "Got manuscript object back" );
is( $ms->sigil, 'BL5260', "Got correct sigil BL5260");
is( scalar( @{$ms->words}), 178, "Got correct number of words in MsB");
is( $ms->identifier, 'London OR 5260', "Got correct identifier for MsB");
## The mss we will test the rest of the tests with.
$aligner->language( 'Greek' );
@mss = $aligner->read_source( 't/data/cx/john18-2.xml' );
is( scalar @mss, 28, "Got correct number of mss from CX file" );
my %wordcount = (
'base' => 57,
'P60' => 20,
'P66' => 55,
'w1' => 58,
'w11' => 57,
'w13' => 58,
'w17' => 58,
'w19' => 57,
'w2' => 58,
'w21' => 58,
'w211' => 54,
'w22' => 57,
'w28' => 57,
'w290' => 46,
'w3' => 56,
'w30' => 59,
'w32' => 58,
'w33' => 57,
'w34' => 58,
'w36' => 58,
'w37' => 56,
'w38' => 57,
'w39' => 58,
'w41' => 58,
'w44' => 56,
'w45' => 58,
'w54' => 57,
'w7' => 57,
);
foreach( @mss ) {
is( scalar @{$_->words}, $wordcount{$_->sigil}, "Got correct number of words for " . $_->sigil );
}
=end testing
=cut
sub read_source {
my( $self, $wordsource, %options ) = @_;
my @docroots; # Holds an array of { sigil, source }
my $format;
if( !ref( $wordsource ) ) { # Assume it's a filename.
my $parser = XML::LibXML->new();
my $doc;
eval { local $SIG{__WARN__} = sub { 1 }; $doc = $parser->parse_file( $wordsource ); };
if( $doc ) {
( $format, @docroots) = _get_xml_roots( $doc );
return unless @docroots;
} else {
# It's not an XML document filename. Determine plaintext
# filename, plaintext string, or JSON string.
my $encoding = delete $options{'binmode'};
$encoding ||= 'utf8';
my $binmode = "<:" . $encoding;
my $rc = open( INFILE, $binmode, $wordsource );
$format = 'plaintext';
if( $rc ) {
# It is a filename, thus plaintext.
my @lines = <INFILE>;
close INFILE;
@docroots = ( { source => join( '', @lines ) } );
} else {
my $json;
eval { $json = decode_json( $wordsource ) };
if( $json ) {
# It is a JSON string.
$format = 'json';
push( @docroots, map { { source => $_ } } @{$json->{'witnesses'}} );
} else {
# Assume plain old string input.
@docroots = ( { source => $wordsource } );
}
}
}
} elsif ( ref( $wordsource ) eq 'XML::LibXML::Document' ) { # A LibXML object
( $format, @docroots ) = _get_xml_roots( $wordsource );
} else {
throw( ident => 'bad source',
message => "Unrecognized object $wordsource; reading no words" );
}
# Add any language-specific canonizer / comparator that we have defined.
$options{'language'} = $self->language;
# We have the representations of the manuscript(s). Initialize our object(s).
my @ms_objects;
foreach my $doc ( @docroots ) {
push( @ms_objects, Text::TEI::Collate::Manuscript->new(
'sourcetype' => $format,
%options,
%$doc,
) );
}
return @ms_objects;
}
sub _get_xml_roots {
my( $xmldoc ) = @_;
my( @docroots, $format );
if( $xmldoc->documentElement->nodeName =~ /^examples/i ) {
# It is CollateX simple input format. Read the text
# strings and then treat it as plaintext.
my @collationtexts = $xmldoc->documentElement->getChildrenByTagName( 'example' );
if( @collationtexts ) {
# Use the first text example in the file; we do not handle multiple
# collation runs on different texts.
my @witnesses = $collationtexts[0]->getChildrenByTagName( 'witness' );
@docroots = map { { sigil => $_->getAttribute( 'id' ),
source => $_->textContent } } @witnesses;
$format = 'plaintext';
} else {
throw( ident => 'bad source',
message => "Found no example elements in CollateX XML" );
}
} else {
# Assume that it is TEI format. We will throw an error later if not.
@docroots = ( { source => $xmldoc->documentElement } );
$format = 'xmldesc';
}
return( $format, @docroots );
}
=head2 align
The meat of the program. Takes a list of Text::TEI::Collate::Manuscript
objects (created by new_manuscript above.) Returns the same objects with
their wordlists collated.
=begin testing
my $aligner = Text::TEI::Collate->new();
my @mss = $aligner->read_source( 't/data/cx/john18-2.xml' );
my @orig_wordlists = map { $_->words } @mss;
$aligner->align( @mss );
my $cols = 75;
foreach( @mss ) {
is( scalar @{$_->words}, $cols, "Got correct collated columns for " . $_->sigil);
}
foreach my $i ( 0 .. $#mss ) {
my $ms = $mss[$i];
my @old_words = map { $_->canonical_form } @{$orig_wordlists[$i]};
my @real_words = map { $_->canonical_form } grep { !$_->invisible } @{$ms->words};
is( scalar @old_words, scalar @real_words, "Manuscript " . $ms->sigil . " has an unchanged word total" );
foreach my $j ( 0 .. $#old_words ) {
my $rw = $j < scalar @real_words ? $real_words[$j] : '';
is( $rw, $old_words[$j], "...word at index $j is correct" );
}
}
=end testing
=cut
sub align {
my( $self, @manuscripts ) = @_;
if( scalar( @manuscripts ) == 1 ) {
# That was easy then.
return @manuscripts;
}
# At this point we have an array of arrays. Each member array
# contains a hash object for each word, describing its
# characteristics. These are the uncollated texts, now in the
# object form that we will eventually return.
# The first file becomes the base, for now.
# SOMEDAY: Work parsimony info into the choosing of a base
my @ms_texts = map { $_->words } @manuscripts;
my $base_text = shift @ms_texts;
for ( 0 .. $#ms_texts ) {
my $text = $ms_texts[$_];
$self->debug( "Beginning run of build_array for text " . ($_+2) );
my( $result1, $result2 ) = $self->build_array( $base_text, $text );
# Are the resulting arrays the same length?
if( scalar( @$result1 ) != scalar( @$result2 ) ) {
throw( ident => 'bad collation',
message => "Result arrays for text $_ are not the same length!" );
}
# Generate the new base by flattening result2 onto the back of result1,
# filling in all the gaps.
$base_text = $self->generate_base( $result1, $result2 );
}
# $base_text now holds all the words, linked in one way or another.
# Make a result array from this.
my @result_array = map { [] } @manuscripts;
my %ridx;
foreach( 0 .. $#manuscripts ) {
$ridx{ $manuscripts[$_]->sigil } = $_;
}
foreach my $word ( @$base_text ) {
my %unseen;
map { $unseen{$_->sigil} = 1 } @manuscripts;
my @row_words;
push( @row_words, $word, $word->links );
foreach ( $word->variants ) {
push( @row_words, $_, $_->links );
}
foreach my $r ( @row_words ) {
push( @{$result_array[$ridx{$r->ms_sigil}]}, $r );
delete $unseen{$r->ms_sigil};
}
foreach my $s ( keys %unseen ) {
push( @{$result_array[$ridx{$s}]}, $self->empty_word );
}
}
# Take the contents of @result_array and put them back into the
# manuscripts.
foreach my $i ( 0 .. $#result_array ) {
$manuscripts[$i]->replace_words( $result_array[$i] );
}
# Top and tail each array.
$self->begin_end_mark( @manuscripts );
return @manuscripts;
}
# Small utility to get a string out of an array of word objects.
sub _stripped_words {
my $text = shift;
my @words = map { $_->comparison_form } @$text;
return @words;
}
sub empty_word {
my $self = shift;
unless( defined $self->{'null_word'}
&& ref( $self->{'null_word'} ) eq 'Text::TEI::Collate::Word' ) {
# Make a null word and save it.
$self->{'null_word'} = Text::TEI::Collate::Word->new( empty => 1 );
}
return $self->{'null_word'};
}
# Given two collections of word objects, return two collated collections of
# word objects. Pass a ref to the whole array so far so that we can consult
# it if necessary. That array should *not* be written to here below.
sub build_array {
my $self = shift;
my( $base_text, $text ) = @_;
my( @base_result, @new_result ); # All the good things we'll return.
# Generate our fuzzy-match lookup table.
$self->make_fuzzy_matches( $base_text, $text );
# Do the diff.
my $diff = Text::TEI::Collate::Diff->new( $base_text, $text, $self );
while( my $diffpos = $diff->Next ) {
if( $diff->Same ) {
$self->_handle_diff_same( $diff, $base_text, $text, \@base_result, \@new_result );
} elsif( !scalar( $diff->Range( 1 ) ) ) { # Addition
$self->_handle_diff_interpolation( $diff, 2, $text, \@new_result, \@base_result );
} elsif( !scalar( $diff->Range( 2 ) ) ) { # Deletion
$self->_handle_diff_interpolation( $diff, 1, $base_text, \@base_result, \@new_result );
} else { # No fuzzy matching here.
$self->debug( "Diff: collating words "
. join( '.', map { $_->comparison_form } $diff->Items( 1 ) ) . " / "
. join( '.', map { $_->comparison_form } $diff->Items( 2 ) ), 1 );
# Grab the word sets from each text.
my @base_wlist = @{$base_text}[$diff->Range( 1 )];
my @new_wlist = @{$text}[$diff->Range( 2 )];
# Does the base have variants against which we can collate the
# new words? If so, try running against the variants, and
# collate according to the result.
my @var_wlist;
my %base_idx;
map { push( @var_wlist, $_->variants ) } @base_wlist;
my $matched_variants;
my( $b, $n );
if( scalar @var_wlist ) {
# Keep track of which base index each variant is at
foreach my $i ( 0 .. $#base_wlist ) {
foreach my $v ( $base_wlist[$i]->variants ) {
$base_idx{$v} = $i;
}
}
# Get the last variant(s) of the previous hunk
if( @base_result ) {
unshift( @var_wlist, $base_result[-1]->variants );
foreach my $v ( $base_result[-1]->variants ) {
$base_idx{$v} = -1;
}
}
# Get the first variant(s) of the next hunk
if( $diff->Next && $diff->Items(1) ) {
my @next = $diff->Items(1);
push( @var_wlist, $next[0]->variants );
foreach my $v ( $next[0]->variants ) {
$base_idx{$v} = scalar @base_wlist;
}
}
# Put the diff back where it was.
$diff->Reset( $diffpos );
# Collate against the variants
my @match_sets = $self->_match_variants( \@var_wlist, \@new_wlist, \%base_idx );
if( @match_sets ) {
$matched_variants = 1;
( $b, $n ) = $self->_add_variant_matches( \@match_sets, \@base_wlist, \@new_wlist, \%base_idx );
}
}
unless( $matched_variants ) {
( $b, $n ) = ( \@base_wlist, \@new_wlist );
$self->_balance_arrays( $b, $n );
}
push( @base_result, @$b );
push( @new_result, @$n );
}
}
return( \@base_result, \@new_result );
}
sub _balance_arrays {
my( $self, $base, $new, $nolink ) = @_;
my $difflen = @$base - @$new;
my $shorter = $difflen > 0 ? $new : $base;
push( @$shorter, ( $self->empty_word ) x abs( $difflen ) ) if $difflen;
# Set variant links.
unless( $nolink ) {
foreach my $i ( 0 .. $#{$base} ) {
next if $base->[$i] eq $self->empty_word;
next if $new->[$i] eq $self->empty_word;
$base->[$i]->add_variant( $new->[$i] );
}
}
return( $base, $new );
}
=begin testing
use Text::TEI::Collate;
my @test = (
'the black dog had his day',
'the white dog had her day',
'the bright red dog had his day',
'the bright white cat had her day',
);
my $aligner = Text::TEI::Collate->new();
my @mss = map { $aligner->read_source( $_ ) } @test;
$aligner->align( @mss );
my $base = $aligner->generate_base( @mss );
# Get rid of the specials
pop @$base;
shift @$base;
is( scalar @$base, 8, "Got right number of words" );
is( $base->[0]->word, 'the', "Got correct first word" );
is( scalar $base->[0]->links, 3, "Got 3 links" );
is( scalar $base->[0]->variants, 0, "Got 0 variants" );
is( $base->[1]->word, 'black', "Got correct second word" );
is( scalar $base->[1]->links, 0, "Got 0 links" );
is( scalar $base->[1]->variants, 1, "Got 1 variant" );
is( $base->[1]->get_variant(0)->word, 'bright', "Got correct first variant" );
is( scalar $base->[1]->get_variant(0)->links, 1, "Got a variant link" );
is( $base->[2]->word, 'white', "Got correct second word" );
is( scalar $base->[2]->links, 1, "Got 1 links" );
is( scalar $base->[2]->variants, 0, "Got 0 variants" );
is( $base->[3]->word, 'red', "Got correct third word" );
is( scalar $base->[3]->links, 0, "Got 0 links" );
is( scalar $base->[3]->variants, 1, "Got a variant" );
is( $base->[3]->get_variant(0)->word, 'cat', "Got correct second variant" );
is( scalar $base->[3]->get_variant(0)->links, 0, "Variant has no links" );
is( $base->[4]->word, 'dog', "Got correct fourth word" );
is( scalar $base->[4]->links, 2, "Got 2 links" );
is( scalar $base->[4]->variants, 0, "Got 0 variants" );
is( $base->[5]->word, 'had', "Got correct fifth word" );
is( scalar $base->[5]->links, 3, "Got 3 links" );
is( scalar $base->[5]->variants, 0, "Got 0 variants" );
is( $base->[6]->word, 'his', "Got correct sixth word" );
is( scalar $base->[6]->links, 1, "Got 1 link" );
is( scalar $base->[6]->variants, 1, "Got 1 variant" );
is( scalar $base->[6]->get_variant(0)->links, 1, "Got 1 variant link" );
is( $base->[6]->get_variant(0)->word, 'her', "Got correct third variant");
is( $base->[7]->word, 'day', "Got correct seventh word" );
is( scalar $base->[7]->links, 3, "Got 3 links" );
is( scalar $base->[7]->variants, 0, "Got 0 variants" );
=end testing
=cut
sub _match_variants {
my( $self, $variants, $new, $base_idx ) = @_;
my @match_sets;
my $last_idx_matched = -1;
my %variant_matched;
foreach my $n_idx ( 0 .. $#{$new} ) {
my $n = $new->[$n_idx];
foreach my $v ( @$variants ) {
next if $base_idx->{$v} < $last_idx_matched;
next if exists $variant_matched{$v};
if( $self->{fuzzy_matches}->{$n->comparison_form}
eq $self->{fuzzy_matches}->{$v->comparison_form} ) {
$v->add_link( $n );
$variant_matched{$v} = 1;
push( @match_sets, [ $base_idx->{$v}, $n_idx, $v ] );
$last_idx_matched = $base_idx->{$v};
last; # N is matched, stop looking at Vs.
}
}
}
return @match_sets;
}
=begin testing
use Text::TEI::Collate;
use Text::TEI::Collate::Word;
my $aligner = Text::TEI::Collate->new();
# Set up the base: 'and|B(very|D) white|B(green|C/special|D)'
my @base;
foreach my $w ( qw/ and white / ) {
push( @base, Text::TEI::Collate::Word->new( 'string' => $w, 'ms_sigil' => 'B' ) );
}
my $v1 = Text::TEI::Collate::Word->new( 'string' => 'very', 'ms_sigil' => 'D' );
$base[0]->add_variant( $v1 );
my $v2 = Text::TEI::Collate::Word->new( 'string' => 'green', 'ms_sigil' => 'C' );
my $v3 = Text::TEI::Collate::Word->new( 'string' => 'special', 'ms_sigil' => 'D' );
$v2->add_variant( $v3 );
$base[1]->add_variant( $v2 );
# Set up the new: 'not very special'
my @new;
foreach my $w ( qw/ not very special / ) {
push( @new, Text::TEI::Collate::Word->new( 'string' => $w, 'ms_sigil' => 'E' ) );
}
# Set up the base_idx
my $base_idx = { $v1 => 0, $v2 => 1, $v3 => 1 };
# Get the right matches in the first place
$aligner->make_fuzzy_matches( [ @base, $v1, $v2, $v3 ], \@new );
my @matches = $aligner->_match_variants( [ $v1, $v2, $v3 ], \@new, $base_idx );
is( scalar @matches, 2, "Got two matches from constructed case" );
is_deeply( $matches[0], [ 0, 1, $v1 ], "First match is correct" );
is_deeply( $matches[1], [ 1, 2, $v3 ], "Second match is correct" );
# Now do the real testing
my( $nb, $nn ) = $aligner->_add_variant_matches( \@matches, \@base, \@new, $base_idx );
is( scalar @$nb, 3, "Got three base words" );
is( scalar @$nn, 3, "Got three new words" );
is( $nb->[0], $aligner->empty_word, "Empty word at front of base" );
=end testing
=cut
sub _add_variant_matches {
my( $self, $match_sets, $base, $new, $base_idx ) = @_;
my( $base_wlist, $new_wlist ) = ( [], [] );
my( $last_b, $last_n ) = ( -1, -1 );
my %seen_base_indices;
foreach my $p ( @$match_sets ) {
my( $b_idx, $n_idx, $v ) = @$p;
# Balance the arrays up to the indices we have.
my( @tb, @tn );
if( $b_idx > $last_b+1
&& $b_idx < scalar @$base ) {
@tb = @{$base}[ ( $last_b < 0 ? 0 : $last_b ) .. $b_idx-1];
}
if( $n_idx > $last_n+1 ) {
@tn = @{$new}[ ( $last_n < 0 ? 0 : $last_n ) .. $n_idx-1];
}
$self->_balance_arrays( \@tb, \@tn );
push( @$base_wlist, @tb ) if @tb;
push( @$new_wlist, @tn ) if @tn;
# If this is the first occurrence of $b_idx, push the pair.
# If it is not the first occurrence, we have more than one 'new'
# match on one 'base' plus variants. Unlink the subsequent
# variant into its own column and then push the pair.
if( $seen_base_indices{$b_idx}
|| $b_idx == -1
|| $b_idx == scalar( @$base ) ) {
# Unlink variant from base, push as extra.
$v->variant_of->unlink_variant( $v );
# Push the variant.
push( @$base_wlist, $v );
} else {
# Just push the base.
push( @$base_wlist, $base->[$b_idx] );
}
# Either way, push the new.
push( @$new_wlist, $new->[$n_idx] );
$seen_base_indices{$b_idx} = 1;
# Save the index pair we were just working on.
( $last_b, $last_n ) = ( $b_idx, $n_idx );
}
# Now push whatever remains of each array.
my( @tb, @tn );
if( scalar @$base > $last_b+1 ) {
@tb = @{$base}[$last_b+1 .. $#{$base}];
}
if( scalar @$new > $last_n+1 ) {
@tn = @{$new}[$last_n+1 .. $#{$new}];
}
$self->_balance_arrays( \@tb, \@tn );
push( @$base_wlist, @tb ) if @tb;
push( @$new_wlist, @tn ) if @tn;
# ...and return the whole.
return( $base_wlist, $new_wlist );
}
=begin testing
use Test::More::UTF8;
use Text::TEI::Collate;
use Text::TEI::Collate::Word;
use Text::WagnerFischer;
my $base_word = Text::TEI::Collate::Word->new( ms_sigil => 'A', string => 'հարիւրից' );
my $variant_word = Text::TEI::Collate::Word->new( ms_sigil => 'A', string => 'զ100ից' );
my $match_word = Text::TEI::Collate::Word->new( ms_sigil => 'A', string => 'զհարիւրից' );
my $new_word = Text::TEI::Collate::Word->new( ms_sigil => 'A', string => '100ից' );
my $different_word = Text::TEI::Collate::Word->new( ms_sigil => 'A', string => 'անգամ' );
# not really Greek, but we want Text::WagnerFischer::distance here
my $aligner = Text::TEI::Collate->new( 'language' => 'Greek' );
$base_word->add_variant( $variant_word );
is( $aligner->word_match( $base_word, $match_word), $base_word, "Matched base word" );
is( $aligner->word_match( $base_word, $new_word), $variant_word, "Matched variant word" );
is( $aligner->word_match( $base_word, $different_word), undef, "Did not match irrelevant words" );
my( $ms1 ) = $aligner->read_source( 'Jn bedwange harde swaer Doe riepen si op gode met sinne' );
my( $ms2 ) = $aligner->read_source( 'Jn bedvanghe harde suaer. Doe riepsi vp gode met sinne.' );
$aligner->make_fuzzy_matches( $ms1->words, $ms2->words );
is( scalar keys %{$aligner->{fuzzy_matches}}, 15, "Got correct number of vocabulary words" );
my %unique;
map { $unique{$_} = 1 } values %{$aligner->{fuzzy_matches}};
is( scalar keys %unique, 11, "Got correct number of fuzzy matching words" );
=end testing
=cut
# TODO This doesn't match against base variants - does that matter?
sub make_fuzzy_matches {
my( $self, $base, $other ) = @_;
my %frequency;
map { $frequency{$_->comparison_form}++ } @$base;
map { $frequency{$_->comparison_form}++ } @$other;
my $fm = $self->{fuzzy_matches};
unless( $fm ) {
$fm = {};
$self->{fuzzy_matches} = $fm;
}
my @all_words = sort { $frequency{$b} <=> $frequency{$a} } keys %frequency;
while( @all_words ) {
my $w = shift @all_words;
# Skip it if we already have a fuzzy match for $w.
next if exists $fm->{$w};
# $w matches itself if nothing else.
$fm->{$w} = $w;
# What else does $w match?
foreach my $x ( @all_words ) {
if( $self->_is_near_word_match( $w, $x ) ) {
# If $x already exists, it was probably more popular. Use
# it instead.
if( exists $fm->{$x} ) {
$fm->{$w} = $x;
last;
} else {
# Otherwise make $x match $w.
$fm->{$x} = $w;
}
}
}
}
}
# A key generation function for our Diff module. Always return the comparison
# string for the base text word; if the non-base word is in $a and it doesn't
# match the base (which is therefore in $b), return its own comparison string.
sub diff_key {
my( $self, $word ) = @_;
return $self->{fuzzy_matches}->{$word->comparison_form};
}
sub word_match {
# A and B are word objects. We want to match if b matches a,
# but also if b matches a variant of a.
my( $self, $a, $b, $use_diffkey ) = @_;
my $a_key = $a->comparison_form;
$a_key = $self->diff_key( $a ) if $self->diff_key( $a ) && $use_diffkey;
my $b_key = $b->comparison_form;
$b_key = $self->diff_key( $b ) if $self->diff_key( $b ) && $use_diffkey;
if( $self->_is_near_word_match( $a_key, $b_key ) ) {
return $a;
}
foreach my $v ( $a->variants ) {
my $v_key = $v->comparison_form;
$v_key = $self->diff_key( $v ) if $self->diff_key( $v ) && $use_diffkey;
if( $self->_is_near_word_match( $v_key, $b_key ) ) {
return $v;
}
}
return undef;
}
=begin testing
use Test::More::UTF8;
use Text::TEI::Collate;
my $aligner = Text::TEI::Collate->new();
ok( $aligner->_is_near_word_match( 'Արդ', 'Արդ' ), "matched exact string" );
ok( $aligner->_is_near_word_match( 'հաւասն', 'զհաւասն' ), "matched near-exact string" );
ok( !$aligner->_is_near_word_match( 'հարիւրից', 'զ100ից' ), "did not match differing string" );
ok( !$aligner->_is_near_word_match( 'ժամանակական', 'զշարագրական' ), "did not match differing string 2" );
ok( $aligner->_is_near_word_match( 'ընթերցողք', 'ընթերցողսն' ), "matched near-exact string 2" );
ok( $aligner->_is_near_word_match( 'պատմագրացն', 'պատգամագրացն' ), "matched pretty close string" );
ok( $aligner->_is_near_word_match( 'αι̣τια̣ν̣', 'αιτιαν' ), "matched string one direction" );
ok( $aligner->_is_near_word_match( 'αιτιαν', 'αι̣τια̣ν̣' ), "matched string other direction" );
=end testing
=cut
sub _is_near_word_match {
my $self = shift;
my( $word1, $word2 ) = @_;
# Find our distance routine in case we need it.
unless( ref $self->distance_sub ) {
throw( ident => 'bad language module',
message => "No word comparison algorithm specified." );
}
my $dist = $self->distance_sub->( $word1, $word2 );
# Now see if the distance is low enough to be a match.
my $answer;
if( $self->has_fuzziness_sub ) {
$answer = $self->fuzziness_sub->( $word1, $word2, $dist );
} else {
my $ref_str = length( $word1 ) < length( $word2 ) ? $word1 : $word2;
my $fuzz = length( $ref_str ) > $self->fuzziness->{short}
? $self->fuzziness->{val} : $self->fuzziness->{shortval};
$answer = $dist <= ( length( $ref_str ) * $fuzz / 100 );
}
# $self->debug( "Words $word1 and $word2 " . ( $answer ? 'matched' : 'did not match' ), 3 );
return $answer;
}
## Diff handling functions. Used in build_array and in match_and_align_words.
## Thanks to our array-substitution trickery in match_and_align_words, we may
## not assume that the $diff object has the actual items we want. Only the
## indices are meaningful.
sub _handle_diff_same {
my $self = shift;
my( $diff, $base_text, $new_text, $base_result, $new_result ) = @_;
# Get the index range.
my @rbase = $diff->Range( 1 );
my @rnew = $diff->Range( 2 );
my @base_wlist = @{$base_text}[@rbase];
my @new_wlist = @{$new_text}[@rnew];
my $msg_words = join( ' ', _stripped_words( \@base_wlist ) );
$msg_words .= ' / ' . join( ' ', _stripped_words( \@new_wlist ) );
$self->debug( "Diff: pushing matched words $msg_words", 2 );
foreach my $i ( 0 .. $#base_wlist ) {
# Link the word to its match. This means having to compare
# the words again, grr argh. Use the diff key this time because
# we used it when finding these 'same'.
my $matched = $self->word_match( $base_wlist[$i], $new_wlist[$i], 1 );
$DB::single = 1 if !$matched;
$matched->add_link( $new_wlist[$i] );
}
push( @$base_result, @base_wlist );
push( @$new_result, @new_wlist );
}
sub _handle_diff_interpolation {
my $self = shift;
my( $diff, $which, $from_text, $from_result, $to_result ) = @_;
# $which has either 1 or 2, stating which array in $diff has the items.
# $from_result corresponds to $which.
my $op = $which == 1 ? 'deletion' : 'addition';
my @range = $diff->Range( $which );
my @wlist = @{$from_text}[@range];
$self->debug( "DBrecord: pushing $op "
. join( ' ', _stripped_words( \@wlist ) ), 2 );
push( @$to_result, ( $self->empty_word ) x scalar( @wlist ) );
push( @$from_result, @wlist );
}
# generate_base: Take an array of text arrays and flatten them. There
# should not be a blank element in the resulting base. Currently
# used for only two input arrays at a time.
sub generate_base {
my $self = shift;
my @texts = @_;
my @word_arrays;
foreach( @texts ) {
push( @word_arrays,
ref( $_ ) eq 'Text::TEI::Collate::Manuscript' ? $_->words : $_ );
}
# Error checking: are they all the same length?
my $width = scalar @word_arrays;
my $length = scalar @{$word_arrays[0]};
foreach my $t ( @word_arrays ) {
throw( ident => 'bad result',
message => 'Word arrays differ in length: ' . scalar @$t . "vs. $length" )
unless @$t == $length;
}
# Get busy. Take a word from T0 if it's there; otherwise take a word
# from T1, otherwise T2, etc.
my @new_base;
foreach my $idx ( 0 .. $length-1 ) {
my $word = $self->empty_word; # We should never end up using this
# word, but just in case there is a
# gap, it should be the right object.
foreach my $col ( 0 .. $width - 1 ) {
if( $word_arrays[$col]->[$idx]->comparison_form ne '' ) {
$word = $word_arrays[$col]->[$idx];
$word->is_base( 1 );
last;
}
}
# Disabled due to BEGIN shenanigans
# warn( "No word found in any column at index $idx!" )
# if( $word eq $self->empty_word );
push( @new_base, $word );
}
return \@new_base;
}
# Helper function for begin_end_mark
sub _wordlist_slice {
my $self = shift;
my( $list, $entry, $replace ) = @_;
my( $toss, $size, $idx ) = split( /_/, $entry );
if( $replace ) {
my @repl_array;
if( $replace eq 'empty' ) {
@repl_array = ( $self->empty_word ) x $size;
} elsif( ref $replace eq 'ARRAY' ) {
@repl_array = @$replace;
}
splice( @$list, $idx-$size+1, $size, @repl_array );
} else {
return @{$list}[ ($idx-$size+1) .. $idx ];
}
}
=begin testing
use Text::TEI::Collate;
my $aligner = Text::TEI::Collate->new();
my( $base ) = $aligner->read_source( 'The black cat' );
my( $other ) = $aligner->read_source( 'The black and white little cat' );
$aligner->align( $base, $other );
# Check length
is( scalar @{$base->words}, 8, "Got six columns plus top and tail" );
is( scalar @{$other->words}, 8, "Got six columns plus top and tail" );
# Check contents
is( $base->words->[-1]->special, 'END', "Got ending mark at end" );
is( $base->words->[0]->special, 'BEGIN', "Got beginning mark at start" );
is( $other->words->[-1]->special, 'END', "Got ending mark at end" );
is( $other->words->[0]->special, 'BEGIN', "Got beginning mark at start" );
# Check empty spaces
my $base_exp = [ 'BEGIN', 'the', 'black', '', '', '', 'cat', 'END' ];
my $other_exp = [ 'BEGIN', 'the', 'black', 'and', 'white', 'little', 'cat', 'END' ];
my @base_str = map { $_->printable } @{$base->words};
my @other_str = map { $_->printable } @{$other->words};
is_deeply( \@base_str, $base_exp, "Right sequence of words in base" );
is_deeply( \@other_str, $other_exp, "Right sequence of words in other" );
my @test = (
'The black dog chases a red cat.',
'A red cat chases the black dog.',
'A red cat chases the yellow dog<',
);
my @mss = map { $aligner->read_source( $_ ) } @test;
$aligner->align( @mss );
$base = $mss[0];
$other = $mss[2];
is( scalar @{$base->words}, 13, "Got 11 columns plus top and tail" );
is( scalar @{$other->words}, 13, "Got 11 columns plus top and tail" );
$base_exp = [ 'BEGIN', 'the', 'black', 'dog', 'chases', 'a', 'red', 'cat', 'END', '', '', '', '' ];
$other_exp = [ '', '', '', '', 'BEGIN', 'a', 'red', 'cat', 'chases', 'the', 'yellow', 'dog', 'END' ];
@base_str = map { $_->printable } @{$base->words};
@other_str = map { $_->printable } @{$other->words};
is_deeply( \@base_str, $base_exp, "Right sequence of words in base" );
is_deeply( \@other_str, $other_exp, "Right sequence of words in other" );
is( $base->words->[-5]->special, 'END', "Got ending mark at end for base" );
is( $base->words->[0]->special, 'BEGIN', "Got beginning mark at start for base" );
is( $other->words->[-1]->special, 'END', "Got ending mark at end for other" );
is( $other->words->[4]->special, 'BEGIN', "Got beginning mark at start for other" );
=end testing
=cut
# begin_end_mark: Note, with special words spliced in, where each
# text actually begins and ends.
my $GAP_MIN_SIZE = 18;
sub begin_end_mark {
my $self = shift;
my @manuscripts = @_;
foreach my $text( @manuscripts ) {
my $wordlist = $text->words;
my $sigil = $text->sigil;
my $first_word_idx = -1;
my $last_word_idx = -1;
my $gap_start = -1;
my $gap_end = -1;
foreach my $idx ( 0 .. $#{$wordlist} ) {
my $word_obj = $wordlist->[$idx];
if( $first_word_idx > -1 ) {
# We have found and coped with the first word;
# now we are looking for substantive gaps.
if ( !$word_obj->is_empty ) {
$last_word_idx = $idx;
if( $gap_start > 0 &&
( $gap_end - $gap_start ) > $GAP_MIN_SIZE ) {
# Put in the gap start & end markers. Here we are
# replacing a blank, rather than adding to the array.
# This should be okay as we are not changing the index
# of the rest of the word elements.
foreach( $gap_start, $gap_end ) {
my $tag = $_ < $gap_end ? 'BEGINGAP' : 'ENDGAP';
my $gapdesc = $tag . "_1_$_";
$self->_wordlist_slice( $wordlist, $gapdesc,
[ _special( $tag, $sigil ) ] );
}
}
# Either way we are not now in a gap. Reset the counters.
$gap_end = $gap_start = -1;
# else empty space; have we found a gap?
} elsif( $gap_start < 0 ) {
$gap_start = $idx;
# else we know we are in a gap; push the end forward.
} else {
$gap_end = $idx;
}
# else we are still looking for the first non-blank word.
} elsif( !$word_obj->is_empty ) {
# We have found the first real word. Note where the begin
# marker should go.
$first_word_idx = $idx;
} # else it's a blank before the first word.
} ## end foreach
# Splice in the BEGIN element before the $first_word_idx.
my $slicedesc = join( '_', 'begin', 0, $first_word_idx-1 );
$self->_wordlist_slice( $wordlist, $slicedesc, [ _special( 'BEGIN', $sigil ) ] );
# Now put in the END element after the last word found.
# First account for the fact that we just spliced a BEGIN into the array.
$slicedesc = join( '_', 'end', 0, $last_word_idx + 1 );
$self->_wordlist_slice( $wordlist, $slicedesc,
[ _special( 'END', $sigil ) ] );
}
}
# Helper function for begin_end_mark, to create a mark
sub _special {
my( $mark, $sigil ) = @_;
return Text::TEI::Collate::Word->new( special => $mark,
ms_sigil => $sigil );
}
=head1 OUTPUT METHODS
=head2 to_json
Takes a list of aligned manuscripts and returns a data structure suitable for
JSON encoding; documented at L<http://gregor.middell.net/collatex/api/collate>
=begin testing
my $aligner = Text::TEI::Collate->new();
my @mss = $aligner->read_source( 't/data/cx/john18-2.xml' );
$aligner->align( @mss );
my $jsondata = $aligner->to_json( @mss );
ok( exists $jsondata->{alignment}, "to_json: Got alignment data structure back");
my @wits = @{$jsondata->{alignment}};
is( scalar @wits, 28, "to_json: Got correct number of witnesses back");
# Without the beginning and end marks, we have 75 word spots.
my $columns = 73;
foreach ( @wits ) {
is( scalar @{$_->{tokens}}, $columns, "to_json: Got correct number of words back for witness")
}
=end testing
=cut
sub to_json {
my( $self, @mss ) = @_;
my $result = { 'title' => $self->title, 'alignment' => [] };
my @invisible_row;
# Leave out the rows with no actual word tokens.
foreach my $i ( 0 .. $#{$mss[0]->words} ) {
my @rowitems = map { $_->words->[$i] } @mss;
push( @invisible_row, $i )
unless grep { $_ && !$_->invisible } @rowitems;
}
foreach my $ms ( @mss ) {
push( @{$result->{'alignment'}},
{ 'witness' => $ms->sigil,
'tokens' => $ms->tokenize_as_json( @invisible_row )->{'tokens'}, } );
}
return $result;
}
=head2 to_csv
Takes a list of aligned Manuscript objects and returns a CSV file, one
column per Manuscript. The first row contains the manuscript sigla; the
subsequent rows contain the aligned text.
=begin testing
use IO::String;
use Text::CSV_XS;
use Test::More::UTF8;
my $aligner = Text::TEI::Collate->new();
my @mss = $aligner->read_source( 't/data/cx/john18-2.xml' );
$aligner->align( @mss );
my $csvstring = $aligner->to_csv( @mss );
ok( $csvstring, "Got a CSV string returned" );
# Parse the CSV data and test that it parsed
my $io = IO::String->new( $csvstring );
my $csv = Text::CSV_XS->new( { binary => 1 } );
# Test the number of columns in the first row
my $sigilrow = $csv->getline( $io );
ok( $sigilrow, "Got a row" );
is( scalar @$sigilrow, 28, "Got the correct number of witnesses" );
# Test the number of rows in the table
my $rowctr = 0;
while( my $row = $csv->getline( $io ) ) {
is( scalar @$row, 28, "Got a reading for all columns" );
$rowctr++;
if( $rowctr == 1 ) {
# Test that we are getting our encoding right
is( $row->[0], "λέγει", "Got the right first word" );
}
}
is( $rowctr, 73, "Got expected number of rows in CSV" );
=end testing
=cut
sub to_csv {
my( $self, @mss ) = @_;
my @out;
my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
# First get the witness sigla.
my @sigla = map { $_->sigil } @mss;
$csv->combine( @sigla );
push( @out, decode_utf8( $csv->string ) );
# Now go through the aligned text, leaving out invisible-only rows.
my $length = scalar @{$mss[0]->words};
foreach my $i ( 0 .. $length-1 ) {
my @words = map { $_->words->[$i] } @mss;
next unless grep { $_ && !$_->invisible } @words;
my $status = $csv->combine( map { $_ ? $_->word : undef } @words );
throw( ident => 'output error',
message => "Could not convert " . $csv->error_input . " to CSV" )
unless $status;
push( @out, decode_utf8( $csv->string ) );
}
return join( "\n", @out );
}
=head2 to_tei
Takes a list of aligned Manuscript objects and returns a fairly simple TEI
XML document in parallel segmentation format, with the words lexically marked
as such. At the moment returns a single paragraph, with the original div and
paragraph breaks for each witness marked as a <witDetail/> in the apparatus.
=begin testing
use Text::TEI::Collate;
use XML::LibXML::XPathContext;
# Get an alignment to test with
my $testdir = "t/data/xml_plain";
opendir( XF, $testdir ) or die "Could not open $testdir";
my @files = readdir XF;
my @mss;
my $aligner = Text::TEI::Collate->new(
'fuzziness' => '50',
'language' => 'Armenian',
'title' => 'Test Armenian collation',
);
foreach ( sort @files ) {
next if /^\./;
push( @mss, $aligner->read_source( "$testdir/$_" ) );
}
$aligner->align( @mss );
my $doc = $aligner->to_tei( @mss );
is( ref( $doc ), 'XML::LibXML::Document', "Made TEI document header" );
my $xpc = XML::LibXML::XPathContext->new( $doc->documentElement );
$xpc->registerNs( 'tei', $doc->documentElement->namespaceURI );
# Test the creation of a document header from TEI files
my @witdesc = $xpc->findnodes( '//tei:witness/tei:msDesc' );
is( scalar @witdesc, 5, "Found five msdesc nodes");
my $title = $xpc->findvalue( '//tei:titleStmt/tei:title' );
is( $title, $aligner->title, "TEI doc title set correctly" );
# Test the creation of apparatus entries
my @apps = $xpc->findnodes( '//tei:app' );
is( scalar @apps, 107, "Got the correct number of app entries");
my @words_not_in_app = $xpc->findnodes( '//tei:body/tei:div/tei:p/tei:w' );
is( scalar @words_not_in_app, 175, "Got the correct number of matching words");
my @details = $xpc->findnodes( '//tei:witDetail' );
my @detailwits;
foreach ( @details ) {
my $witstr = $_->getAttribute( 'wit' );
push( @detailwits, split( /\s+/, $witstr ));
}
is( scalar @detailwits, 13, "Found the right number of witness-detail wits");
# TODO test the reconstruction of witnesses from the parallel-seg.
=end testing
=cut
## Block for to_tei logic
{
## Counter variables
my $app_id_ctr = 0; # for xml:id of <app/> tags
my $word_id_ctr = 0; # for xml:id of <w/> tags that have witDetails
## Constants
my $ns_uri = 'http://www.tei-c.org/ns/1.0';
# Local globals
my ( $doc, $body );
sub to_tei {
my( $self, @mss ) = @_;
( $doc, $body ) = _make_tei_doc( $self->title, @mss );
## Generate a base by flattening all the results
my $initial_base = $self->generate_base( map { $_->words } @mss );
foreach my $idx ( 0 .. $#{$initial_base} ) {
my %seen;
map { $seen{$_->sigil} = 0 } @mss;
_make_tei_app( $initial_base->[$idx], %seen );
}
return $doc;
}
sub _make_tei_doc {
my $title = shift;
my @mss = @_;
my $doc = XML::LibXML->createDocument( '1.0', 'UTF-8' );
my $root = $doc->createElementNS( $ns_uri, 'TEI' );
# Make the header
my $teiheader = $root->addNewChild( $ns_uri, 'teiHeader' );
my $filedesc = $teiheader->addNewChild( $ns_uri, 'fileDesc' );
$filedesc->addNewChild( $ns_uri, 'titleStmt' )->
addNewChild( $ns_uri, 'title' )->
appendText( $title );
$filedesc->addNewChild( $ns_uri, 'publicationStmt' )->
addNewChild( $ns_uri, 'p' )->
appendText( 'Created by nCritic' );
my $witnesslist = $filedesc->addNewChild( $ns_uri, 'sourceDesc')->
addNewChild( $ns_uri, 'listWit' );
foreach my $m ( @mss ) {
my $wit = $witnesslist->addNewChild( $ns_uri, 'witness' );
$wit->setAttribute( 'xml:id', $m->sigil );
if( $m->has_msdesc ) {
my $local_msdesc = $m->msdesc->cloneNode( 1 );
$local_msdesc->removeAttribute( 'xml:id' );
$wit->appendChild( $local_msdesc );
} else {
$wit->appendText( $m->identifier );
}
}
# Make the body element
my $body_p = $root->addNewChild( $ns_uri, 'text' )->
addNewChild( $ns_uri, 'body' )->
addNewChild( $ns_uri, 'div' )->
addNewChild( $ns_uri, 'p' ); # TODO maybe this should be lg?
# Set the root...
$doc->setDocumentElement( $root );
# ...and return the doc and the body
return( $doc, $body_p );
}
sub _make_tei_app {
my( $word_obj, %seen ) = @_;
my @all_words = ( $word_obj, $word_obj->links, $word_obj->variants );
foreach( $word_obj->variants ) {
push( @all_words, $_->links );
}
# Do we have the exact same word across all manuscripts with no pesky
# placeholders? And which manuscripts have words?
my $variation = 0;
foreach( @all_words ) {
$variation = 1 if $_->original_form ne $word_obj->original_form;
# We need an <app/> tag if there is a placeholder to record too.
$variation = 1 if $_->placeholders;
$seen{$_->ms_sigil} = 1 if $_->ms_sigil;
}
# If we do have variation, we create an <app/> element to describe
# it. If we don't, we create a <w/> element to hold the common word.
if( $variation ) {
my $app_el = $body->addNewChild( $ns_uri, 'app');
$app_el->setAttribute( 'xml:id', 'app'.$app_id_ctr++ );
# We want only one reading per unique original_form.
my %forms;
foreach my $rdg ( @all_words ) {
my $rdgkey = $rdg->original_form;
next unless $rdgkey;
push( @{$forms{$rdgkey}}, $rdg );
}
# Now for each form, go through and get the reading witnesses and
# placeholders.
foreach my $form ( keys %forms ) {
my $rdg_el = $app_el->addNewChild( $ns_uri, 'rdg' );
# Set the witness string.
my $wit_str = join( ' ', map { '#'.$_->ms_sigil } @{$forms{$form}});
$rdg_el->setAttribute( 'wit', $wit_str );
# Set the word element within the reading.
my $w_el = $rdg_el->addNewChild( $ns_uri, 'w' );
$w_el->setAttribute( 'xml:id', 'w'.$word_id_ctr++ );
# Arbitrarily use the first reading of this form to get the punctuation.
_wrap_punct( $w_el, $forms{$form}->[0] );
# Add the placeholder information as <witDetail/> elements.
my $witDetails;
foreach my $rdg ( @{$forms{$form}} ) {
foreach my $pl ( $rdg->placeholders ) {
push( @{$witDetails->{'#'.$w_el->getAttribute( 'xml:id' )}->{$pl}}, '#'.$rdg->ms_sigil );
}
}
foreach my $wd ( keys %$witDetails ) {
foreach my $type ( keys %{$witDetails->{$wd}} ) {
my $wd_el = $app_el->addNewChild( $ns_uri, 'witDetail' );
$wd_el->setAttribute( 'target', $wd );
$wd_el->setAttribute( 'wit', join( ' ', @{$witDetails->{$wd}->{$type}}) );
$wd_el->appendText( $type );
}
}
}
my @empty = grep { $seen{$_} == 0 } keys( %seen );
if( @empty ) {
my $rdg_el = $app_el->addNewChild( $ns_uri, 'rdg' );
my $wit_str = join( ' ', map { '#'.$_ } @empty );
$rdg_el->setAttribute( 'wit', $wit_str );
}
} else {
# No variation across manuscripts, just make a <w/> and use the initial
# $word_obj to represent all mss.
my $w_el = $body->addNewChild( $ns_uri, 'w');
$w_el->setAttribute( 'xml:id', 'w'.$word_id_ctr++ );
_wrap_punct( $w_el, $word_obj );
}
}
sub _wrap_punct {
my( $w_el, $word_obj ) = @_;
my $str = $word_obj->original_form;
my @punct = $word_obj->punctuation;
my $last_pos = -1;
foreach my $p ( @punct ) {
my @letters = split( '', $str );
if( $p->{char} eq $letters[$p->{pos}] ) {
my @wordpart = @letters[$last_pos+1..$p->{pos}-1];
$w_el->appendText( join( '', @wordpart ) );
my $char = $w_el->addNewChild( $ns_uri, 'c');
$char->setAttribute( "type", "punct" );
$char->appendText( $p->{char} );
$last_pos = $p->{pos};
} else {
throw( ident => 'data inconsistency',
message => "Punctuation mismatch: "
. join( '/', $p->{char}, $p->{pos} ) . " on " . $str );
}
}
# Now append what is left of the word after the last punctuation.
if( $last_pos < length( $str ) - 1 ) {
my @letters = split( '', $str );
my @wordpart = @letters[$last_pos+1..$#letters];
$w_el->appendText( join( '', @wordpart ) );
}
return $w_el;
}
}
=head2 to_graphml
Takes a list of aligned manuscript objects and returns a GraphML document that
represents the collation as a variant graph. Words in the same location with
the same canonized form are treated as the same node.
=cut
sub to_graphml {
my( $self, @manuscripts ) = @_;
my $graph = $self->to_graph( @manuscripts );
# Make the XML doc
my $GMLNS = 'http://graphml.graphdrawing.org/xmlns';
my $graphml = XML::LibXML::Document->new('1.0', 'UTF-8');
my $root = $graphml->createElementNS( $GMLNS, 'graphml' );
$root->setNamespace( 'http://www.w3.org/2001/XMLSchema-instance', 'xsi', 0 );
$root->setAttribute( 'xsi:schemaLocation', 'http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd');
# Make the interminable graph header
my $graph_el = $root->addNewChild( $GMLNS, 'graph' );
$graph_el->setAttribute( 'id', 'G' );
$graph_el->setAttribute( 'edgedefault', 'directed' );
my $nkey = $graph_el->addNewChild( $GMLNS, 'key' );
$nkey->setAttribute( 'attr.name', 'number' );
$nkey->setAttribute( 'attr.type', 'string' );
$nkey->setAttribute( 'for', 'node' );
$nkey->setAttribute( 'id', 'd0' );
my $tkey = $graph_el->addNewChild( $GMLNS, 'key' );
$tkey->setAttribute( 'attr.name', 'token' );
$tkey->setAttribute( 'attr.type', 'string' );
$tkey->setAttribute( 'for', 'node' );
$tkey->setAttribute( 'id', 'd1' );
my $ms_ctr = 0;
my %ms_key;
foreach my $ms ( @manuscripts ) {
my $wkey = $graph_el->addNewChild( $GMLNS, 'key' );
$wkey->setAttribute( 'attr.name', $ms->sigil );
$wkey->setAttribute( 'attr.type', 'string' );
$wkey->setAttribute( 'for', 'edge' );
$wkey->setAttribute( 'id', 'w'.$ms_ctr++ );
$ms_key{$ms->sigil} = $wkey->getAttribute( 'id' );
}
# Whew. Now add all the nodes
foreach my $n ( $graph->nodes ) {
my $node_el = $graph_el->addNewChild( $GMLNS, 'node' );
$node_el->setAttribute( 'id', $n->name );
my $id_el = $node_el->addNewChild( $GMLNS, 'data' );
$id_el->setAttribute( 'key', 'd0' );
$id_el->appendText( $n->name );
my $token_el = $node_el->addNewChild( $GMLNS, 'data' );
$token_el->setAttribute( 'key', 'd1' );
$token_el->appendText( $n->label );
}
# Finally, add the edges.
my $edge_ctr = 0;
foreach my $n ( $graph->nodes ) {
foreach my $succ ( $n->successors() ) {
my $edge_el = $graph_el->addNewChild( $GMLNS, 'edge' );
$edge_el->setAttribute( 'id', 'e'.$edge_ctr++ );
$edge_el->setAttribute( 'source', $n->name );
$edge_el->setAttribute( 'target', $succ->name );
foreach my $edge ( $n->edges_to( $succ ) ) {
# The edge label is the sigil. Add a data key for that sigil.
my $sig = $edge->name;
my $sig_el = $edge_el->addNewChild( $GMLNS, 'data' );
$sig_el->setAttribute( 'key', $ms_key{$sig} );
$sig_el->appendText( $sig );
}
}
}
$graphml->setDocumentElement( $root );
return $graphml;
}
=head2 to_svg
Takes a list of aligned manuscript objects and returns an SVG representation
of the variant graph, as described for the to_graphml method.
=cut
sub to_svg {
my( $self, @mss ) = @_;
my $graph = $self->to_graph( @mss );
$graph->set_attribute( 'node', 'shape', 'ellipse' );
_combine_edges( $graph );
my $dot = File::Temp->new();
binmode( $dot, ':utf8' );
print $dot $graph->as_graphviz();
close $dot;
my @cmd = qw/dot -Tsvg/;
push( @cmd, $dot->filename );
my( $svg, $err );
run( \@cmd, ">", binary(), \$svg, '2>', \$err );
throw( ident => 'output error',
message => 'SVG output failed: $err' )
if $err;
return $svg;
}
sub _combine_edges {
my $graph = shift;
foreach my $n ( $graph->nodes ) {
foreach my $s ( $n->successors ) {
my @edges = $n->edges_to( $s );
my $new_edge = join( ', ', sort( map { $_->name } @edges ) );
map { $graph->del_edge( $_ ) } @edges;
$graph->add_edge( $n, $s, $new_edge );
}
}
}
=head2 to_graph
Base method for graph-based output - create the (Graph::Easy) graph that will
be used to generate graphml or svg.
=begin testing
use lib 't/lib';
use Text::TEI::Collate;
use XML::LibXML::XPathContext;
eval 'require Graph::Easy;';
unless( $@ ) {
# Get an alignment to test with
my $testdir = "t/data/xml_plain";
opendir( XF, $testdir ) or die "Could not open $testdir";
my @files = readdir XF;
my @mss;
my $aligner = Text::TEI::Collate->new(
'fuzziness' => '50',
'language' => 'Armenian',
);
foreach ( sort @files ) {
next if /^\./;
push( @mss, $aligner->read_source( "$testdir/$_" ) );
}
$aligner->align( @mss );
my $graph = $aligner->to_graph( @mss );
is( ref( $graph ), 'Graph::Easy', "Got a graph object from to_graph" );
is( scalar( $graph->nodes ), 380, "Got the right number of nodes" );
is( scalar( $graph->edges ), 992, "Got the right number of edges" );
}
=end testing
=cut
sub to_graph {
my( $self, @manuscripts ) = @_;
my $graph = Graph::Easy->new();
# All manuscripts run from START to END.
my $start_node = $graph->add_node( 'n0' );
$start_node->set_attribute( 'label', '#START#');
my $end_node = $graph->add_node( 'n1' );
$end_node->set_attribute( 'label', '#END#');
my $textlen = $#{$manuscripts[0]->words};
my $paths = {}; # A list of nodes per manuscript sigil.
my $node_counter = 2; # We've used n0 and n1 already
foreach my $idx ( 0..$textlen ) {
my $unique_words;
my @location_words = map { $_->words->[$idx] } @manuscripts;
foreach my $w ( @location_words ) {
if( $w->special && $w->special eq 'BEGIN' ) {
$paths->{$w->ms_sigil} = [ $start_node ];
} elsif( $w->special && $w->special eq 'END' ) {
push( @{$paths->{$w->ms_sigil}}, $end_node );
} elsif( !$w->is_empty && !$w->special ) {
push( @{$unique_words->{$w->canonical_form}}, $w->ms_sigil )
}
}
foreach my $w ( keys %$unique_words ) {
# Make the node.
my $n = $graph->add_node( 'n'.$node_counter++ );
$n->set_attribute( 'label', $w );
foreach my $sig ( @{$unique_words->{$w}} ) {
push( @{$paths->{$sig}}, $n );
}
}
}
# Have the nodes, now make the edges.
foreach my $sig ( keys %$paths ) {
my $from = shift @{$paths->{$sig}};
foreach my $to ( @{$paths->{$sig}} ) {
$graph->add_edge( $from, $to, $sig );
$from = $to;
}
}
return $graph;
}
## Print a debugging message.
sub debug {
my $self = shift;
my( $msg, $lvl, $no_newline ) = @_;
$lvl = 0 unless $lvl;
print STDERR 'DEBUG ' . ($lvl+1) . ": $msg"
. ( $no_newline ? '' : "\n" )
if $self->debuglevel > $lvl;
}
## Utility function for exception handling
sub throw {
Text::TEI::Collate::Error->throw( @_ );
}
## Utility function for debugging
sub show_links {
my( $self, $base ) = @_;
foreach my $w ( @$base ) {
_show_word_with_links( $w, 1 );
}
}
sub _show_word_with_links {
my( $w, $tab ) = @_;
my $prefix = "\t" x $tab;
print STDERR $w->printable . " " . $w->ms_sigil . "\n";
foreach my $l ( $w->links ) {
print STDERR $prefix . "L: " . $l->printable . " " . $l->ms_sigil . "\n";
}
foreach my $v ( $w->variants ) {
print STDERR $prefix . "Variant: ";
_show_word_with_links( $v, $tab+1 );
}
}
1;
=head1 AUTHOR
Tara L Andrews E<lt>aurum@cpan.orgE<gt>