DTA-CAB/CAB/Format.pm
## -*- Mode: CPerl -*-
##
## File: DTA::CAB::Format.pm
## Author: Bryan Jurish <moocow@cpan.org>
## Description: Base class for datum I/O
package DTA::CAB::Format;
use DTA::CAB::Format::Registry; ##-- registry
use DTA::CAB::Utils;
use DTA::CAB::Persistent;
use DTA::CAB::Logger;
use DTA::CAB::Datum;
use DTA::CAB::Token;
use DTA::CAB::Sentence;
use DTA::CAB::Document;
use IO::File;
use IO::Handle;
use File::Map qw();
use Carp;
use strict;
##==============================================================================
## Globals
##==============================================================================
## @ISA
## + base classes: persistence API (DTA::CAB::Persistent) and logging API (DTA::CAB::Logger)
our @ISA = qw(DTA::CAB::Persistent DTA::CAB::Logger);
## $CLASS_DEFAULT
## + default format class for newFormat()
## + also instantiated directly by newReader() and newWriter() in this file
##   whenever the registry lookup returns undef
our $CLASS_DEFAULT = 'DTA::CAB::Format::TT';
## $REG
## + global format registry, a DTA::CAB::Format::Registry object
## + created in a BEGIN block, i.e. as soon as this part of the file has been compiled
##   (presumably so that subclasses can register themselves at compile time -- TODO confirm)
our ($REG);
BEGIN {
$REG = DTA::CAB::Format::Registry->new();
}
## isa(), can()
## + package-local aliases for UNIVERSAL::isa() and UNIVERSAL::can(), so that both
##   can be called as plain functions in this package, e.g. isa($ref,'DTA::CAB::Token')
BEGIN {
*isa = \&UNIVERSAL::isa;
*can = \&UNIVERSAL::can;
}
## $LL_BLK_DEBUG
## + log level handed to vlog() by blockAppend(); undef by default
our $LL_BLK_DEBUG = undef; ##-- log level for block debugging
##==============================================================================
## Constructors etc.
##==============================================================================
## $fmt = CLASS_OR_OBJ->new(%args)
## + object structure: assumed HASH
## {
## ##-- DTA::CAB::IO: common
## utf8 => $bool, ##-- use UTF-8 I/O, where applicable; default=1
##
## ##-- DTA::CAB::IO: input parsing
## #(none)
##
## ##-- DTA::CAB::IO: output formatting
## level => $formatLevel, ##-- formatting level, where applicable
## outbuf => $stringBuffer, ##-- output buffer, where applicable
## }
sub new {
  ##-- constructor: works as a class method or as an instance method (same class as the instance)
  my ($that,@user_args) = @_;
  ##-- built-in defaults come first so that caller-supplied pairs override them
  my %obj = (
    utf8 => 1,     ##-- DTA::CAB::IO: common: use UTF-8 I/O where applicable
    @user_args,    ##-- user args (e.g. level, outbuf)
  );
  my $class = ref($that) || $that;
  return bless(\%obj, $class);
}
## undef = $fmt->DESTROY()
## + destructor
## + default implementation calls close()
sub DESTROY {
  ##-- destructor: releases the current I/O source by calling close()
  ##-- localize the global status variables first (see perlobj, "Destructors"):
  ##   close() may run evals or perform I/O, and without this the destructor could
  ##   overwrite $@ / $! / $? belonging to the code which triggered destruction
  local ($., $@, $!, $^E, $?);
  $_[0]->close();
}
## $fmt = CLASS->newFormat($class_or_short_or_class_suffix, %opts)
## + wrapper for DTA::CAB::Format::Registry::newFormat(); accepts %opts qw(class file)
sub newFormat {
  ##-- delegate to the global registry $REG; the invocant itself is not used
  my ($that,$class,@rest) = @_;
  my %opts = @rest;
  return $REG->newFormat($class, %opts);
}
## $fmt = CLASS->newReader(%opts)
## + wrapper for DTA::CAB::Format::Registry::newReader; accepts %opts qw(class file)
sub newReader {
  my $that = shift;
  my %opts = @_;
  ##-- ask the registry first ...
  my $reader = $REG->newReader(%opts);
  return $reader if (defined($reader));
  ##-- ... and fall back to a plain instance of the default class if it returned undef
  return $CLASS_DEFAULT->new(%opts);
}
## $fmt = CLASS->newWriter(%opts)
## + wrapper for DTA::CAB::Format::Registry::newWriter; accepts %opts qw(class file)
sub newWriter {
  my $that = shift;
  my %opts = @_;
  ##-- ask the registry first ...
  my $writer = $REG->newWriter(%opts);
  return $writer if (defined($writer));
  ##-- ... and fall back to a plain instance of the default class if it returned undef
  return $CLASS_DEFAULT->new(%opts);
}
##==============================================================================
## Methods: Global Format Registry
## \%registered = $CLASS_OR_OBJ->registerFormat(%opts)
## + wrapper for DTA::CAB::Format::Registry::register()
sub registerFormat {
  my $that = shift;
  my %opts = @_;
  ##-- the registered name defaults to the invocant's class name
  $opts{name} //= (ref($that) || $that);
  return $REG->register(%opts);
}
## \%registered_or_undef = $CLASS_OR_OBJ->guessFilenameFormat($filename)
## + wrapper for DTA::CAB::Format::Registry::guessFilenameFormat()
sub guessFilenameFormat {
  ##-- pure delegation to the global registry; the invocant is ignored
  my (undef,$filename) = @_;
  return $REG->guessFilenameFormat($filename);
}
## $readerClass_or_undef = $CLASS_OR_OBJ->fileReaderClass($filename)
## + wrapper for DTA::CAB::Format::Registry::fileReaderClass()
sub fileReaderClass {
  ##-- pure delegation to the global registry; the invocant is ignored
  my (undef,$filename) = @_;
  return $REG->fileReaderClass($filename);
}
## $writerClass_or_undef = $CLASS_OR_OBJ->fileWriterClass($filename)
## + wrapper for DTA::CAB::Format::Registry::fileWriterClass()
sub fileWriterClass {
  ##-- pure delegation to the global registry; the invocant is ignored
  my (undef,$filename) = @_;
  return $REG->fileWriterClass($filename);
}
## $registered_or_undef = $CLASS_OR_OBJ->short2reg($shortname)
## + wrapper for DTA::CAB::Format::Registry::short2reg()
sub short2reg {
  ##-- pure delegation to the global registry; the invocant is ignored
  my (undef,$shortname) = @_;
  return $REG->short2reg($shortname);
}
## $registered_or_undef = $CLASS_OR_OBJ->base2reg($basename)
## + wrapper for DTA::CAB::Format::Registry::base2reg()
sub base2reg {
  ##-- pure delegation to the global registry; the invocant is ignored
  my (undef,$basename) = @_;
  return $REG->base2reg($basename);
}
##==============================================================================
## Methods: Persistence
##==============================================================================
## @keys = $class_or_obj->noSaveKeys()
## + returns list of keys not to be saved
## + default returns qw(outbuf fh tmpfh)
sub noSaveKeys {
  ##-- keys holding transient I/O state: output buffer and filehandles
  return ('outbuf', 'fh', 'tmpfh');
}
## $loadedObj = $CLASS_OR_OBJ->loadPerlRef($ref)
## + default inherited from DTA::CAB::Persistent
##==============================================================================
## Methods: I/O : Generic
##==============================================================================
## $fmt = $fmt->close()
## $fmt = $fmt->close($savetmp)
## + close current input source, if any
## + default calls $fmt->{tmpfh}->close() if available and $savetmp is false (default)
## + always deletes $fmt->{fh} and $fmt->{doc}
sub close {
  my ($fmt,$savetmp) = @_;
  ##-- the temporary handle is closed and forgotten unless the caller asked to keep it
  my $tmpfh = $fmt->{tmpfh};
  if ($tmpfh && !$savetmp) {
    $tmpfh->close() if ($tmpfh->opened);
    delete($fmt->{tmpfh});
  }
  ##-- {fh} is only forgotten, never closed here; any cached {doc} is dropped too
  delete($fmt->{$_}) foreach (qw(fh doc));
  return $fmt;
}
## @layers = $fmt->iolayers()
## + returns PerlIO layers to use for I/O handles
## + default returns ':utf8' if $fmt->{utf8} is true, otherwise ':raw'
sub iolayers {
  my ($fmt) = @_;
  ##-- one PerlIO layer, selected by the object's utf8 flag
  if ($fmt->{utf8}) {
    return ':utf8';
  }
  return ':raw';
}
## $fmt = $fmt->setLayers()
## $fmt = $fmt->setLayers($fh)
## $fmt = $fmt->setLayers($fh,@layers)
## + wrapper for binmode($fh,$_) foreach (@layers ? @layers : $fmt->iolayers)
sub setLayers {
  my ($fmt,$fh,@layers) = @_;
  ##-- default handle is the currently selected one; nothing to do without a handle
  $fh //= $fmt->{fh};
  return $fmt unless (defined($fh));
  ##-- default layers come from the (overridable) iolayers() method
  @layers = $fmt->iolayers if (!@layers);
  foreach my $layer (@layers) {
    binmode($fh, $layer);
  }
  return $fmt;
}
##==============================================================================
## Methods: I/O: Block-wise
##==============================================================================
##--------------------------------------------------------------
## Methods: I/O: Block-wise: Generic
## %blockOpts = $CLASS_OR_OBJECT->blockDefaults()
## + returns default block options as for blockOptions()
## + default implementation just returns (bsize=>(128*1024), eob=>'s')
sub blockDefaults {
  ##-- 128 KiB minimum block size, blocks end on sentence boundaries
  return (bsize => 131072, eob => 's');
}
## %blockOpts = $CLASS_OR_OBJECT->blockOptions($block_spec)
## + parses $block_spec as a block-boundary spec, which is a string of the form
## MIN_BYTES[{k,M,G,T}][@EOB]
## where:
## - MIN_BYTES[{k,M,G,T}] is the minimum block size in bytes, with optional SI suffix
## - EOB indicates desired block boundary: either 's' (sentence) or 'w' (word)
## + returns a hash with 'bsize' and 'eob' keys (defaults from blockDefaults())
## + pukes if not parseable
sub blockOptions {
  ##-- parse a block specification "MIN_BYTES[{b,k,M,G,T}][@EOB]" (':' is accepted in place of '@')
  ##   + returns blockDefaults() overridden by (bsize=>$bytes) and/or (eob=>$eob) where specified
  ##   + an undefined, empty or zero specification yields the plain defaults
  ##   + calls logconfess() if the specification cannot be parsed
  my ($fmt,$bspec) = @_;
  my $spec = ($bspec || '');
  ##-- multipliers for the optional (case-insensitive) size suffix
  my %mult = (b=>1, k=>2**10, m=>2**20, g=>2**30, t=>2**40);
  if ($spec =~ /^([0-9\.]*)([bkmgt])?(?:[:\@](.*))?$/i) {
    my ($n,$suff,$eob) = ($1, lc($2 // ''), $3);
    ##-- force a numeric size: an empty or non-numeric size (e.g. a bare '.') counts as "unspecified"
    $n = ($n eq '' ? 0 : $n+0);
    $n *= $mult{$suff} if ($suff ne '');
    return ($fmt->blockDefaults(), ($n ? (bsize=>$n) : qw()), ($eob ? (eob=>$eob) : qw()));
  }
  $fmt->logconfess("blockOptions(): could not parse block specification '$spec'");
  return $fmt->blockDefaults(); ##-- only reached if logconfess() returns
}
##--------------------------------------------------------------
## Methods: I/O: Block-wise: Input
## \@blocks = $fmt->blockScan($infile, %opts)
## + scans $infile for block boundaries according to %opts, which may contain:
## (
## bsize => $bytes, ##-- minimum block-size in bytes
## eob => $eob, ##-- block boundary type; either 's' (sentence) or 'w' (word); default='w'
## )
## + sets local keys in %opts passed to sub-methods blockScan{Head,Body,Foot}()
## (
## ifile => $infile, ##-- (in) input filename
## ifsize => $bytes, ##-- (in) total size of $infile in bytes (-s $infile)
## ihead => [$off,$len], ##-- (in) offset, length of header in $infile
## ifoot => [$off,$len], ##-- (in) offset, length of footer in $infile
## ibody => \@iblocks, ##-- (in) blocks computed by blockScanBody()
## ohead => [$off,$len], ##-- (out) offset, length of header in $$odata
## ofoot => [$off,$len], ##-- (out) offset, length of footer in $$odata
## )
## + returns an ARRAY ref of block specifications \@blocks = [$blk1,$blk2,...]
## where each $blk \in @blocks is a HASH-ref containing at least the following keys:
## {
## ifile => $infile, ##-- (in) input filename
## #isize => $bytes, ##-- (in) total size of $filename in bytes (-s $filename)
## ioff => $offset, ##-- (in) byte-offset of block beginning in $infile
## ilen => $len, ##-- (in) byte-length of block in $infile
## id => [$i,$N] ##-- (in/out) indices s.t. $blk=$blocks[$i], $N=$#blocks
## }
## + additionally, $blk may contain the following keys:
## {
## ihead => [$off,$len], ##-- (in) set by blockScanHead() for $infile
## ifoot => [$off,$len], ##-- (in) set by blockScanFoot() for $infile
## ibody => \@iblocks, ##-- (in) blocks computed by blockScanBody()
## eos => $bool, ##-- (in/out) true if block ends on a sentence boundary (for TT, TJ)
## odata => \$odata, ##-- (out) block data octets (for blockAppend())
## ohead => [$off,$len], ##-- (out) set by blockScanHead() for $odata
## ofoot => [$off,$len], ##-- (out) set by blockScanFoot() for $odata
## ofile => $ofilename, ##-- (out) output filename (for Queue::Server::addblock())
## ofmt => $class, ##-- (out) output formatter class or short name (for Queue::Server::addblock())
## }
## + default implementation here calls $fmt->blockScanHead(), $fmt->blockScanBody(), $fmt->blockScanFoot();
## then sets @$blk{qw(ifile ihead ifoot id)} for each body block
sub blockScan {
  ##-- scan $infile for block boundaries; returns the ARRAY-ref of body blocks computed by
  ##   blockScanBody(), each completed with the keys qw(id ifile ihead ifoot) unless already set
  my ($fmt,$infile,%opts) = @_;
  $opts{bsize} = 128*1024 if (!defined($opts{bsize}));
  $opts{eob} = 'w' if (!defined($opts{eob}));
  $fmt->vlog('trace', "blockScan(size=$opts{bsize}, eob=$opts{eob}, file=$infile)");

  ##-- mmap file
  $opts{ifile} = $infile;
  $opts{ifsize} = (-s $infile);
  $fmt->logconfess("blockScan(): could not get size of input file '$infile': $!")
    if (!defined($opts{ifsize}));
  my ($buf);
  ##-- map exactly {ifsize} bytes; this used to read the never-set key {fsize}, so the
  ##   length argument was always undef and the mapping length was left to File::Map
  File::Map::map_file($buf, $infile,'<',0,$opts{ifsize});

  ##-- scan blocks into head, body, foot
  my $ihead = $opts{ihead} = $fmt->blockScanHead(\$buf,'i',\%opts);
  my $ibody = $opts{ibody} = $fmt->blockScanBody(\$buf, \%opts);
  my $ifoot = $opts{ifoot} = $fmt->blockScanFoot(\$buf,'i',\%opts);

  ##-- adopt 'id', 'ifile', 'ihead', 'ifoot' keys into body blocks (without overriding existing values)
  my ($blk);
  my $llBlockScan = undef; ##-- log level for per-block debug messages
  $fmt->vlog($llBlockScan, "blockScan: $infile \[head]: $ihead->[0] +$ihead->[1] =".($ihead->[0]+$ihead->[1])." <$opts{ifsize}");
  foreach (0..$#$ibody) {
    $blk = $ibody->[$_];
    $blk->{id} = [$_,$#$ibody] if (!defined($blk->{id}));
    $blk->{ifile} = $infile if (!defined($blk->{ifile}));
    $blk->{ihead} = $ihead if (!defined($blk->{ihead}));
    $blk->{ifoot} = $ifoot if (!defined($blk->{ifoot}));
    $fmt->vlog($llBlockScan, "blockScan: $infile \[".($_+1)."/".($#$ibody+1)."]: $blk->{ioff} +$blk->{ilen} =".($blk->{ioff}+$blk->{ilen})." <$opts{ifsize}");
  }
  $fmt->vlog($llBlockScan, "blockScan: $infile \[foot]: $ifoot->[0] +$ifoot->[1] =".($ifoot->[0]+$ifoot->[1])." <$opts{ifsize}");

  ##-- cleanup & return
  File::Map::unmap($buf);
  return $ibody;
}
## \@head = $fmt->blockScanHead(\$buf,$io,\%opts)
## + scans for block header (${io}head); returns [$offset,$length] for block header in (mmaped) \$buf
## + default implementation just returns [0,0] (empty header)
sub blockScanHead {
  ##-- base class: no header; arguments ($fmt,\$buf,$io,\%opts) are ignored
  my @empty_span = (0,0); ##-- (offset, length)
  return \@empty_span;
}
## \@foot = $fmt->blockScanFoot(\$buf,$io,\%opts)
## + scans for block footer (${io}foot); returns [$offset,$length] for block footer in (mmaped) \$buf
## + may adjust contents of $opts{${io}body}
## + default implementation just returns [0,0] (empty footer)
sub blockScanFoot {
  ##-- base class: no footer; arguments ($fmt,\$buf,$io,\%opts) are ignored
  my @empty_span = (0,0); ##-- (offset, length)
  return \@empty_span;
}
## \@blocks = $fmt->blockScanBody(\$buf,\%opts)
## + guts for blockScan(); input only
## + default implementation just dies
sub blockScanBody {
  ##-- abstract: subclasses supporting block-wise input must override this
  my $fmt = shift;
  return $fmt->logconfess("blockScanBody(): method not implemented in abstract base class ", __PACKAGE__);
}
## \$buf = $fmt->blockReadChunk($fh,$f_off,$f_len, \$buf, $b_off=length($buf))
## + append a string of $f_len bytes starting from $f_off in file $fh to buffer \$buf at $b_off
sub blockReadChunk {
  my ($fmt, $fh,$off,$len, $bufr,$boff) = @_;
  ##-- default target position: append at the current end of the buffer
  if (!defined($boff)) {
    $boff = defined($$bufr) ? length($$bufr) : 0;
  }
  ##-- unbuffered seek + read; anything other than exactly $len bytes read is fatal
  if (!sysseek($fh, $off, SEEK_SET)) {
    $fmt->logconfess("blockReadChunk(): sysseek($off) failed: $!");
  }
  my $nread = sysread($fh, $$bufr, $len, $boff);
  if (!($nread == $len)) {
    $fmt->logconfess("blockReadChunk(): sysread() failed for chunk of length $len: $!");
  }
  return $bufr;
}
## \$buf = $fmt->blockRead(\%blk)
## \$buf = $fmt->blockRead(\%blk,\$buf)
## + reads block input data for \%blk into \$bufr
## + default implementation just appends raw bytes for:
## - block header @{$blk{ihead}}
## - block body @blk{qw(ioff ilen)}
## - block footer @{$blk{ifoot}}
sub blockRead {
  my ($fmt,$blk,$bufr) = @_;
  ##-- use a fresh buffer unless the caller supplied one; either way it starts out empty
  if (!defined($bufr)) {
    my $buf;
    $bufr = \$buf;
  }
  $$bufr = '';

  my $infile = ($blk->{ifile} || $blk->{file});
  my $infh = IO::File->new("<$infile");
  $fmt->logconfess("blockRead(): open failed for '$infile': $!") if (!$infh);
  binmode($infh,':raw');

  ##-- concatenate header, body and footer; zero-length parts are skipped
  my ($ihead,$ifoot) = @$blk{qw(ihead ifoot)};
  if ($ihead && $ihead->[1]) {
    $fmt->blockReadChunk($infh, @$ihead, $bufr); ##-- head
  }
  if ($blk->{ilen}) {
    $fmt->blockReadChunk($infh, $blk->{ioff}, $blk->{ilen}, $bufr); ##-- body
  }
  if ($ifoot && $ifoot->[1]) {
    $fmt->blockReadChunk($infh, @$ifoot, $bufr); ##-- foot
  }
  $infh->close();
  return $bufr;
}
## $doc = $fmt->parseBlock(\%blk)
## + parses a block into a DTA::CAB::Document
## + wrapper for blockRead(), parseString(), close()
sub parseBlock {
  my $fmt = shift;
  my $blk = shift;
  ##-- read the raw block data, parse it as a string, then release the input source
  my $doc = $fmt->parseString( scalar($fmt->blockRead($blk)) );
  $fmt->close();
  return $doc;
}
##--------------------------------------------------------------
## Methods: I/O: Block-wise: Output
## $blk = $fmt->blockStore(\$odata,$blk,\%bopt={})
## + store output buffer \$buf in $blk->{odata}
## + additionally store keys qw(ofmt ohead odata ofoot) relative to $blk->{odata}
## + default calls blockScanHead(), blockScanFoot() with dummy options only if not already set in $blk
sub blockStore {
  my ($fmt,$bufr,$blk,$bopt) = @_;
  $bopt //= {};
  $blk->{id} //= [0,0];

  ##-- header span: the first block keeps its header in the body ([0,0]); later blocks are scanned
  if (!defined($blk->{ohead})) {
    $blk->{ohead} = ($blk->{id}[0]==0)
      ? [0,0]
      : $fmt->blockScanHead($bufr,'o',{%$blk,%$bopt});
  }

  ##-- NOTE(review): $blk->{obody} refers back to $blk itself (reference cycle)
  $blk->{obody} //= [$blk];

  ##-- footer span: the final block keeps its footer in the body ([0,0]); earlier blocks are scanned
  if (!defined($blk->{ofoot})) {
    $blk->{ofoot} = ($blk->{id}[0]==$blk->{id}[1])
      ? [0,0]
      : $fmt->blockScanFoot($bufr,'o',{%$blk,%$bopt});
  }

  ##-- output data & format name, unless already set by the caller
  $blk->{odata} //= $bufr;
  if (!defined($blk->{ofmt})) {
    $blk->{ofmt} = $fmt->shortName;
  }
  return $blk;
}
## $fmt = $fmt->putDocumentBlock($doc,$blk)
## + wrapper for $fmt->toString(\(my $buf))->putDocumentRaw()->flush()->blockStore(\$buf,$blk)
sub putDocumentBlock {
  my ($fmt,$doc,$blk) = @_;
  my $buf = '';
  ##-- format $doc into a local string buffer; each step runs on the object returned by the previous one
  my $writer = $fmt->toString(\$buf);
  $writer = $writer->putDocumentRaw($doc);
  $writer = $writer->flush();
  ##-- attach the formatted data to the block
  $writer->blockStore(\$buf,$blk);
  return $fmt;
}
## $fmt_or_undef = $fmt->blockAppend($blk)
## $fmt_or_undef = $fmt->blockAppend($blk,$ofile)
## + append a block $block to a file $ofile (default=$blk->{ofile})
## + $block is a HASH-ref as returned by blockScan()
## + default implementation just dumps $blk->{odata} to $filename;
## modulo @$blk{qw(ohead ofoot)} as appropriate
sub blockAppend {
  ##-- append the output data of block $blk to $ofile (default: $blk->{ofile})
  ##   + the initial block (id 0) truncates the file, all later blocks are appended
  ##   + header bytes are written for the initial block only, footer bytes for the final block only
  ##   + {ohead} and {ofoot} are [$offset,$length] spans into $blk->{odata}, as documented for blockScan()
  ##   + returns $fmt; calls logconfess() on I/O errors
  my ($fmt,$blk,$ofile) = @_;
  $ofile = $blk->{ofile} if (!defined($ofile));

  ##-- common variables
  use bytes; ##-- all offsets & lengths below are byte-based
  my $bufr = $blk->{odata};
  my $id = $blk->{id} || [0,0];
  my $ohead = $blk->{ohead} || [0,0];
  my $ofoot = $blk->{ofoot} || [0,0];
  my $blkid = "${ofile}:$id->[0]/$id->[1]";

  ##-- byte spans: header=[hoff,hlen], footer=[foff,flen], body=everything between them
  ##   + lengths used to be computed as (length-offset) and the body offset as hlen alone,
  ##     which is only correct for offset==0 and so never wrote a footer correctly
  my ($hoff,$hlen) = @$ohead[0,1];
  my ($foff,$flen) = @$ofoot[0,1];
  my $boff = $hoff+$hlen;
  my $blen = ($foff||length($$bufr)) - $boff; ##-- a zero footer offset means "no footer": body runs to end of data

  ##-- NOTE(review): single-string (2-arg style) open kept on purpose, in case callers rely on
  ##   special filenames such as '-' -- TODO confirm before switching to an explicit mode argument
  my $outfh = IO::File->new(($id->[0]==0 ? '>' : '>>').$ofile)
    or $fmt->logconfess("blockAppend(): open failed for '$ofile': $!");
  binmode($outfh, utf8::is_utf8($$bufr) ? ':utf8' : ':raw');
  $fmt->vlog($LL_BLK_DEBUG, "blockAppend($blkid): begin: pos=", $outfh->tell, "; buflen=", bytes::length($$bufr));
  $fmt->vlog($LL_BLK_DEBUG, "blockAppend($blkid): ohead=[$ohead->[0],$ohead->[1]]; ofoot=[$ofoot->[0],$ofoot->[1]]");

  ##-- dump: header (initial block only)
  if ($id->[0]==0 && $hlen>0) {
    $outfh->print(substr($$bufr, $hoff, $hlen))
      or $fmt->logconfess("blockAppend(): print failed to '$ofile' for initial-block header: $!");
    $fmt->vlog($LL_BLK_DEBUG, "blockAppend($blkid): wrote ", $hlen, " header bytes\n");
  }

  ##-- dump: body
  $outfh->print(substr($$bufr, $boff, $blen))
    or $fmt->logconfess("blockAppend(): print failed to '$ofile' for block body: $!");
  $fmt->vlog($LL_BLK_DEBUG, "blockAppend($blkid): wrote ", $blen, " data bytes");

  ##-- dump: footer (final block only)
  if ($id->[0]==$id->[1] && $flen>0) {
    $outfh->print(substr($$bufr, $foff, $flen))
      or $fmt->logconfess("blockAppend(): print failed to '$ofile' for final-block footer: $!");
    $fmt->vlog($LL_BLK_DEBUG, "blockAppend($blkid): wrote ", $flen, " footer bytes");
  }

  ##-- cleanup & return: buffered write errors only surface on close(), so check it
  $fmt->vlog($LL_BLK_DEBUG, "blockAppend($blkid): finished, pos=", $outfh->tell);
  $outfh->close
    or $fmt->logconfess("blockAppend(): close failed for '$ofile': $!");
  return $fmt;
}
##==============================================================================
## Methods: Input
##==============================================================================
##--------------------------------------------------------------
## Methods: Input: Input selection
## $fmt = $fmt->from(fh=>$fh)
## $fmt = $fmt->from(file=>$file)
## $fmt = $fmt->from(string=>$str)
## $fmt = $fmt->from(string=>\$str)
## + open $fmt for input from specified source
## + wraps fromFh(), fromFile(), fromString()
sub from {
  my ($fmt,$which,@args) = @_;
  ##-- dispatch on the source type; anything other than 'fh' or 'file' is treated as a string source
  if ($which eq 'fh') {
    return $fmt->fromFh(@args);
  }
  elsif ($which eq 'file') {
    return $fmt->fromFile(@args);
  }
  return $fmt->fromString(@args);
}
## $fmt = $fmt->fromString( $string)
## $fmt = $fmt->fromString(\$string)
## + select input from string $string
## + default calls $fmt->fromFh($fmt->{tmpfh}=$new_fh)
sub fromString {
  my $fmt = shift;
  $fmt->close;
  ##-- accept either a string or a reference to one; a plain string is read in place via its alias in @_
  my $strref = ref($_[0]) ? $_[0] : \$_[0];
  my $fh = IO::Handle->new();
  if (!CORE::open($fh, '<', $strref)) {
    $fmt->logconfess("fromString(): open failed for string input: $!");
  }
  ##-- remember the in-memory handle as {tmpfh}, then select it as the input source
  $fmt->{tmpfh} = $fh;
  return $fmt->fromFh($fh);
}
## $fmt = $fmt->fromFile($filename)
## + select input from file $filename
## + default calls $fmt->fromFh($fmt->{tmpfh}=$new_fh)
sub fromFile {
  ##-- select input from $file, which is either a filename or an already-open handle
  ##   + returns the result of $fmt->fromFh(); calls logconfess() if the file cannot be opened
  my ($fmt,$file) = @_;
  ##-- release any previously selected input source first
  $fmt->close;
  ##-- a reference is taken to be an open handle and adopted as-is; anything else is opened for
  ##   reading (single-string IO::File->new() form, kept unchanged for compatibility)
  my $fh = (ref($file) ? $file : IO::File->new("<$file"))
    or $fmt->logconfess("fromFile(): open failed for '$file': $!"); ##-- report the OS error, as fromString() does
  ##-- the handle is stored as {tmpfh}, so a later close() will close it
  return $fmt->fromFh($fmt->{tmpfh}=$fh);
}
## $fmt = $fmt->fromFh($fh)
## + select input from open filehandle $fh
## + default implementation just calls $fmt->close(1) and sets $fmt->{fh}=$fh
sub fromFh {
  my ($fmt,$fh) = @_;
  ##-- sanity check: die if the invocant resolves fromFh() to this very implementation
  my $impl = $fmt->can('fromFh');
  if ($impl eq \&fromFh) {
    $fmt->logconfess("fromFh(): abstract method called for object instance");
  }
  ##-- drop the previous source but keep $fmt->{tmpfh} for auto-close
  $fmt->close(1);
  ##-- save this handle for later use
  $fmt->{fh} = $fh;
  return $fmt;
}
## $fmt = $fmt->fromFh_str($fh)
## + alternate fromFh() implementation which slurps contents of $fh and calls $fmt->fromString(\$str)
sub fromFh_str {
  my $fmt = shift;
  my $fh = shift;
  ##-- register $fh via the base-class implementation, then apply the I/O layers to it
  $fmt->DTA::CAB::Format::fromFh($fh);
  $fmt->setLayers();
  ##-- slurp mode; stays in effect until this method returns, i.e. also during fromString()
  local $/ = undef;
  my $content = <$fh>;
  return $fmt->fromString(\$content);
}
##--------------------------------------------------------------
## Methods: Input: Generic API
## $doc = $fmt->parseDocument()
## + parse document from currently selected input source
sub parseDocument {
  ##-- abstract: every format able to read input must override this
  my ($fmt) = @_;
  return $fmt->logconfess("parseDocument() not implemented in abstract base class ", __PACKAGE__ );
}
## $doc = $fmt->parseString( $str)
## $doc = $fmt->parseString(\$str)
## + wrapper for $fmt->fromString(\$str)->parseDocument()
sub parseString {
  my $fmt = $_[0];
  ##-- accept either a string or a reference to one; always hand a reference to fromString()
  my $strref = ref($_[1]) ? $_[1] : \$_[1];
  my $doc = $fmt->fromString($strref)->parseDocument;
  ##-- release the temporary input source before returning the parsed document
  $fmt->close();
  return $doc;
}
## $doc = $fmt->parseFile($filename_or_fh)
## + wrapper for $fmt->fromFile($filename_or_fh)->parseDocument()
sub parseFile {
  my ($fmt,$filename_or_fh) = @_;
  ##-- select the file as input, parse it, then release the input source
  my $doc = $fmt->fromFile($filename_or_fh)->parseDocument;
  $fmt->close();
  return $doc;
}
## $doc = $fmt->parseFh($fh)
## + wrapper for $fmt->fromFh($filename_or_fh)->parseDocument()
sub parseFh {
  my ($fmt,$fh) = @_;
  ##-- select the handle as input, parse it, then release the input source
  my $doc = $fmt->fromFh($fh)->parseDocument;
  $fmt->close();
  return $doc;
}
##--------------------------------------------------------------
## Methods: Input: Utilities
## $doc = $fmt->forceDocument($reference)
## + attempt to tweak $reference into a DTA::CAB::Document
## + a slightly more in-depth version of DTA::CAB::Datum::toDocument()
sub forceDocument {
  my ($fmt,$any) = @_;

  ##-- local helpers: wrap a sentence-like / token-like value in a document-shaped HASH
  ##   (only the outermost HASH is blessed below; nested sentences & tokens stay as they are)
  my $wrap_sentence = sub { return {body=>[ $_[0] ]}; };
  my $wrap_token = sub { return {body=>[ {tokens=>[ $_[0] ]} ]}; };

  if (!ref($any)) {
    ##-- plain string: text of a single token
    $any = $wrap_token->({text=>$any});
  }
  elsif (isa($any,'DTA::CAB::Document')) {
    ##-- already a document: nothing to do
  }
  elsif (isa($any,'DTA::CAB::Sentence')) {
    $any = $wrap_sentence->($any);
  }
  elsif (isa($any,'DTA::CAB::Token')) {
    $any = $wrap_token->($any);
  }
  elsif (ref($any) eq 'HASH') {
    ##-- unblessed HASH: guess its role from the keys it contains
    if (exists($any->{body})) {
      ##-- document-like: blessed below
    }
    elsif (exists($any->{tokens})) {
      ##-- sentence-like
      $any = $wrap_sentence->($any);
    }
    elsif (exists($any->{text})) {
      ##-- token-like
      $any = $wrap_token->($any);
    }
  }
  elsif (ref($any) eq 'ARRAY') {
    if (!ref($any->[0])) {
      ##-- assumedly an array of token strings: non-reference elements are replaced IN PLACE
      foreach my $elt (@$any) {
        $elt = {text=>$elt} if (!ref($elt));
      }
      $any = {body=>[ {tokens=>$any} ]};
    }
  }
  else {
    ##-- any other kind of reference: warn and hand it back untouched
    $fmt->warn("forceDocument(): cannot massage non-document '".(ref($any)||$any)."'");
    return $any;
  }

  ##-- whatever survived is blessed into DTA::CAB::Document unless it already is one
  return isa($any,'DTA::CAB::Document') ? $any : bless($any,'DTA::CAB::Document');
}
##==============================================================================
## Methods: Output
##==============================================================================
##--------------------------------------------------------------
## Methods: Output: Generic
## $type = $fmt->mimeType()
## + default returns text/plain
sub contentType {
  ##-- alias: forwards all arguments to the (overridable) mimeType() method
  my $fmt = shift;
  return $fmt->mimeType(@_);
}
sub mimeType {
  ##-- base-class default MIME type
  my $type = 'text/plain';
  return $type;
}
## $ext = $fmt->defaultExtension()
## + returns default filename extension for this format (default='.cab')
sub defaultExtension {
  ##-- base-class default filename extension, including the leading dot
  my $ext = '.cab';
  return $ext;
}
## $short = $fmt->shortName()
## + returns "official" short name for this format
## + default just returns package suffix
sub shortName {
  my ($that) = @_;
  my $name = ref($that) || $that;
  ##-- classes below DTA::CAB::Format:: lose that prefix and all remaining colons;
  ##   any other class is reduced to its final package component
  my $is_format_subclass = ($name =~ s/^DTA::CAB::Format:://);
  if ($is_format_subclass) {
    $name =~ s/://g;
  }
  else {
    $name =~ s/^.*\:\://;
  }
  return lc($name);
}
## $lvl = $fmt->formatLevel()
## $fmt = $fmt->formatLevel($level)
## + set output formatting level
sub formatLevel {
  my $fmt = shift;
  my $level = shift;
  ##-- setter: a defined level is stored and the object returned (for chaining)
  if (defined($level)) {
    $fmt->{level} = $level;
    return $fmt;
  }
  ##-- getter
  return $fmt->{level};
}
## $fmt = $fmt->flush()
## + flush any buffered output to selected output source
## + default implementation deletes $fmt->{outbuf} and calls $fmt->{fh}->flush()
sub flush {
  my ($fmt) = @_;
  ##-- forget the output buffer, then flush the selected handle (if any)
  delete($fmt->{outbuf});
  my $fh = $fmt->{fh};
  $fh->flush() if (defined($fh));
  return $fmt;
}
##--------------------------------------------------------------
## Methods: Output: output selection
## $fmt = $fmt->to(fh=>$fh)
## $fmt = $fmt->to(file=>$file)
## $fmt = $fmt->to(string=>\$str)
## + open $fmt for output to specified destination
## + wraps toFh(), toFile(), toString()
sub to {
  my ($fmt,$which,@args) = @_;
  ##-- dispatch on the destination type; anything other than 'fh' or 'file' is treated as a string destination
  if ($which eq 'fh') {
    return $fmt->toFh(@args);
  }
  elsif ($which eq 'file') {
    return $fmt->toFile(@args);
  }
  return $fmt->toString(@args);
}
## $fmt = $fmt->toString(\$str, $level)
## + select output to $str
## + default implementation just wraps $fmt->toFh($fmt->{tmpfh}=$new_fh, $level)
sub toString {
  my $fmt = shift;
  $fmt->close;
  ##-- accept either a reference to the target string or the string itself (written via its alias in @_)
  my $strref = ref($_[0]) ? $_[0] : \$_[0];
  my $fh = IO::Handle->new();
  if (!CORE::open($fh, '>', $strref)) {
    $fmt->logconfess("toString(): open failed for string output: $!");
  }
  ##-- remember the in-memory handle as {tmpfh}, then select it as the output destination
  $fmt->{tmpfh} = $fh;
  return $fmt->toFh($fh, $_[1]);
}
## $fmt_or_undef = $fmt->toFile($filename, $formatLevel)
## + select output to named file $filename
## + default implementation just wraps $fmt->toFh($fmt->{tmpfh}=$new_fh, $level)
sub toFile {
  ##-- select output to $file, which is either a filename or an already-open handle
  ##   + $level is passed on to toFh() as the formatting level
  ##   + returns the result of $fmt->toFh(); calls logconfess() if the file cannot be opened
  my ($fmt,$file,$level) = @_;
  ##-- release any previously selected I/O source first
  $fmt->close;
  ##-- a reference is taken to be an open handle and adopted as-is; anything else is opened for
  ##   writing (single-string IO::File->new() form, kept unchanged for compatibility)
  my $fh = (ref($file) ? $file : IO::File->new(">$file"))
    or $fmt->logconfess("toFile(): open failed for '$file': $!"); ##-- report the OS error, as toString() does
  ##-- the handle is stored as {tmpfh}, so a later close() will close it
  return $fmt->toFh($fmt->{tmpfh}=$fh, $level);
}
## $fmt_or_undef = $fmt->toFh($fh,$level)
## + select output to an open filehandle $fh
## + default implementation just calls $fmt->formatLevel($level) and sets $fmt->{fh}=$fh
sub toFh {
  my $fmt = shift;
  my ($fh,$level) = @_;
  ##-- a defined $level updates the formatting level; otherwise the current level is kept
  if (defined($level)) {
    $fmt->formatLevel($level);
  }
  ##-- select $fh as the output destination (no layers are set, nothing is written yet)
  $fmt->{fh} = $fh;
  return $fmt;
}
## $fmt_or_undef = $fmt->buf2fh(\$inbuf=\$fmt->{outbuf},$fh)
## + low-level utility which dumps $$buf to $fh
## + may call utf8::encode() or utf8::upgrade() on $inbuf
sub buf2fh {
  my ($fmt,$bufr,$fh) = @_;
  $bufr = \$fmt->{outbuf} if (!defined($bufr));
  ##-- compare the buffer's internal UTF-8 flag with the presence of a 'utf8' layer on $fh
  my $buf_is_utf8 = utf8::is_utf8($$bufr);
  my $fh_is_utf8 = scalar(grep {$_ eq 'utf8'} PerlIO::get_layers($fh));
  if ($buf_is_utf8) {
    ##-- utf8 -> bytes: encode buffer (modifies the caller's buffer in place)
    utf8::encode($$bufr) if (!$fh_is_utf8);
  }
  elsif ($fh_is_utf8) {
    ##-- bytes -> utf8: upgrade buffer (modifies the caller's buffer in place)
    utf8::upgrade($$bufr);
  }
  $fh->print($$bufr);
  return $fmt;
}
## $fmt_or_undef = $fmt->buf2bytes(\$inbuf=\$fmt->{outbuf},\$outbuf)
## + low-level utility which copies raw bytes of $$inbuf to $$outbuf
sub buf2bytes {
  my ($fmt,$ibufr,$obufr) = @_;
  ##-- default source is the object's own output buffer
  $ibufr //= \$fmt->{outbuf};
  ##-- copy first, then encode the copy: the source buffer is left untouched
  $$obufr = $$ibufr;
  if (utf8::is_utf8($$obufr)) {
    utf8::encode($$obufr);
  }
  return $fmt;
}
## $fmt_or_undef = $fmt->toFh_buf($fh,$formatLevel)
## + toFh() implementation which dumps $fmt->{outbuf} to $fmt->{fh}=$fh
#sub toFh_buf {
# my $fmt = shift;
# $fmt->DTA::CAB::Format::toFh($fmt,@_); ##-- set $fmt->{level}, $fmt->{fh}
# binmode($fmt->{fh}, (utf8::is_utf8($fmt->{outbuf}) ? ':utf8' : ':raw'));
# $fmt->{fh}->print($fmt->{outbuf});
# return $fmt;
#}
##--------------------------------------------------------------
## Methods: Output: Recommended API
## $fmt = $fmt->putToken($tok)
## + default implementations of other methods assume output is concatenated onto $fmt->{outbuf}
sub putTokenRaw {
  ##-- base class: no raw variant, simply forwards to putToken()
  my ($fmt,$tok) = @_;
  return $fmt->putToken($tok);
}
sub putToken {
  ##-- abstract: formats which write tokens must override this
  my ($fmt) = @_;
  $fmt->logconfess("putToken() not implemented!");
  return undef; ##-- only reached if logconfess() returns
}
## $fmt = $fmt->putSentence($sent)
## + default implementation just iterates $fmt->putToken() & appends 1 additional "\n" to $fmt->{outbuf}
sub putSentenceRaw {
  ##-- base class: no raw variant, simply forwards to putSentence()
  my ($fmt,$sent) = @_;
  return $fmt->putSentence($sent);
}
sub putSentence {
  my $fmt = shift;
  my $sent = shift;
  ##-- write each token in order ...
  foreach my $tok (@{toSentence($sent)->{tokens}}) {
    $fmt->putToken($tok);
  }
  ##-- ... then terminate the sentence with one extra newline in the output buffer
  $fmt->{outbuf} .= "\n";
  return $fmt;
}
##--------------------------------------------------------------
## Methods: Output: Required API
## $fmt = $fmt->putDocument($doc)
## + default implementation just iterates $fmt->putSentence()
## + should be non-destructive for $doc
sub putDocument {
  my $fmt = shift;
  my $doc = shift;
  ##-- write each sentence of the document body in order
  foreach my $sent (@{toDocument($doc)->{body}}) {
    $fmt->putSentence($sent);
  }
  return $fmt;
}
## $fmt = $fmt->putDocumentRaw($doc)
## + may copy plain $doc reference
sub putDocumentRaw {
  ##-- base class: no raw variant, simply forwards to putDocument()
  my ($fmt,$doc) = @_;
  return $fmt->putDocument($doc);
}
## $fmt = $fmt->putData($data)
## + put arbitrary raw data (e.g. for YAML, JSON, XmlPerl)
sub putData {
  ##-- abstract: only formats able to serialize arbitrary data override this
  my $fmt = shift;
  return $fmt->logconfess("putData() not implemented!");
}
1; ##-- be happy
__END__
##========================================================================
## POD DOCUMENTATION, auto-generated by podextract.perl, & edited
##========================================================================
## NAME
=pod
=head1 NAME
DTA::CAB::Format - Base class for DTA::CAB::Datum I/O
=cut
##========================================================================
## SYNOPSIS
=pod
=head1 SYNOPSIS
use DTA::CAB::Format;
##========================================================================
## Constructors etc.
$fmt = $CLASS_OR_OBJ->new(%args);
$fmt = $CLASS->newFormat($class_or_class_suffix, %opts);
$fmt = $CLASS->newReader(%opts);
$fmt = $CLASS->newWriter(%opts);
##========================================================================
## Methods: Global Format Registry
\%classReg_or_undef = $CLASS_OR_OBJ->registerFormat(%classRegOptions);
\%classReg_or_undef = $CLASS_OR_OBJ->guessFilenameFormat($filename);
$readerClass_or_undef = $CLASS_OR_OBJ->fileReaderClass($filename);
$writerClass_or_undef = $CLASS_OR_OBJ->fileWriterClass($filename);
$class_or_undef = $CLASS_OR_OBJ->shortReaderClass($shortname);
$class_or_undef = $CLASS_OR_OBJ->shortWriterClass($shortname);
$registered_or_undef = $CLASS_OR_OBJ->short2reg($shortname);
$registered_or_undef = $CLASS_OR_OBJ->base2reg($basename);
##========================================================================
## Methods: Persistence
@keys = $class_or_obj->noSaveKeys();
##========================================================================
## Methods: MIME
$short = $fmt->shortName();
$type = $fmt->mimeType();
$ext = $fmt->defaultExtension();
##========================================================================
## Methods: Input
$fmt = $fmt->close();
$fmt = $fmt->fromString(\$string);
$fmt = $fmt->fromFile($filename);
$fmt = $fmt->fromFh($fh);
$doc = $fmt->parseDocument();
$doc = $fmt->parseString(\$str);
$doc = $fmt->parseFile($filename);
$doc = $fmt->parseFh($fh);
$doc = $fmt->forceDocument($reference);
##========================================================================
## Methods: Output
$lvl = $fmt->formatLevel();
$fmt = $fmt->flush();
$fmt_or_undef = $fmt->toString(\$str, $formatLevel);
$fmt_or_undef = $fmt->toFile($filename_or_handle, $formatLevel);
$fmt_or_undef = $fmt->toFh($fh, $formatLevel);
$fmt = $fmt->putDocument($doc);
$fmt = $fmt->putDocumentRaw($doc);
=cut
##========================================================================
## DESCRIPTION
=pod
=head1 DESCRIPTION
DTA::CAB::Format is an abstract base class and API specification
for objects implementing an I/O format for the
L<DTA::CAB::Datum|DTA::CAB::Datum> subhierarchy in general,
and for L<DTA::CAB::Document|DTA::CAB::Document> objects in particular.
Each I/O format (subclass) has a characteristic abstract `base class' as well as optional
`reader' and `writer' subclasses which perform the actual I/O (although in
the current implementation, all reader/writer classes are identical with
their respective base classes). Individual formats may be invoked
either directly by their respective classes (SUBCLASS-E<gt>new(), etc.),
or by means of the global L<DTA::CAB::Format::Registry|DTA::CAB::Format::Registry>
object $REG (L</registerFormat>, L</newFormat>, L</newReader>, L</newWriter>, etc.).
See L</SUBCLASSES> for a list of common built-in formats and their registry data.
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Format: Globals
=pod
=head2 Globals
=over 4
=item @ISA
DTA::CAB::Format inherits from
L<DTA::CAB::Persistent|DTA::CAB::Persistent>
and
L<DTA::CAB::Logger|DTA::CAB::Logger>.
=item $CLASS_DEFAULT
Default class returned by L</newFormat>()
if no known class is specified.
=item Variable: $REG
Default global format registry used,
a L<DTA::CAB::Format::Registry|DTA::CAB::Format::Registry> object
used by L</registerFormat>, L</newFormat>, etc.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Format: Constructors etc.
=pod
=head2 Constructors etc.
=over 4
=item new
$fmt = CLASS_OR_OBJ->new(%args);
Constructor.
%args, %$fmt:
##-- DTA::CAB::Format: common
utf8 => $bool, ##-- use UTF-8 I/O, where applicable; default=1
##
##-- DTA::CAB::Format: input parsing
#(none)
##
##-- DTA::CAB::Format: output formatting
level => $formatLevel, ##-- formatting level, where applicable
outbuf => $stringBuffer, ##-- output buffer, where applicable
=item newFormat
$fmt = CLASS->newFormat($class_or_class_suffix, %opts);
Wrapper for L<DTA::CAB::Format::Registry::newFormat|DTA::CAB::Format::Registry/newFormat>(),
which allows short names and class suffixes to be passed in as format names.
=item newReader
$fmt = CLASS->newReader(%opts);
Wrapper for L<DTA::CAB::Format::Registry::newReader|DTA::CAB::Format::Registry/newReader>
which accepts %opts:
class => $class, ##-- classname or DTA::CAB::Format:: suffix
file => $filename, ##-- attempt to guess format from filename
=item newWriter
$fmt = CLASS->newWriter(%opts);
Wrapper for L<DTA::CAB::Format::Registry::newWriter|DTA::CAB::Format::Registry/newWriter>
which accepts %opts:
class => $class, ##-- classname or DTA::CAB::Format:: suffix
file => $filename, ##-- attempt to guess format from filename
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Format: Methods: Global Format Registry
=pod
=head2 Methods: Global Format Registry
The global format registry lives in the package variable $REG.
The following methods are backwards-compatible wrappers for
method calls to this registry object.
=over 4
=item registerFormat
\%registered = $CLASS_OR_OBJ->registerFormat(%opts);
Registers a new format subclass;
wrapper for
L<DTA::CAB::Format::Registry::register|DTA::CAB::Format::Registry/register>().
=item guessFilenameFormat
\%registered_or_undef = $CLASS_OR_OBJ->guessFilenameFormat($filename);
Returns registration record for most recently registered format subclass
whose C<filenameRegex> matches $filename.
Wrapper for L<DTA::CAB::Format::Registry::guessFilenameFormat|DTA::CAB::Format::Registry/guessFilenameFormat>().
=item fileReaderClass
$readerClass_or_undef = $CLASS_OR_OBJ->fileReaderClass($filename);
Attempts to guess reader class name from $filename.
Wrapper for
L<DTA::CAB::Format::Registry::fileReaderClass|DTA::CAB::Format::Registry/fileReaderClass>().
=item fileWriterClass
$writerClass_or_undef = $CLASS_OR_OBJ->fileWriterClass($filename);
Attempts to guess writer class name from $filename.
Wrapper for
L<DTA::CAB::Format::Registry::fileWriterClass|DTA::CAB::Format::Registry/fileWriterClass>().
=item short2reg
$registered_or_undef = $CLASS_OR_OBJ->short2reg($shortname);
Gets the most recent subclass registry HASH ref for the short class name $shortname.
Wrapper for
L<DTA::CAB::Format::Registry::short2reg|DTA::CAB::Format::Registry/short2reg>().
=item base2reg
$registered_or_undef = $CLASS_OR_OBJ->base2reg($basename);
Gets the most recent subclass registry HASH ref for the class basename $basename.
Wrapper for
L<DTA::CAB::Format::Registry::base2reg|DTA::CAB::Format::Registry/base2reg>().
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Format: Methods: Persistence
=pod
=head2 Methods: Persistence
=over 4
=item noSaveKeys
@keys = $class_or_obj->noSaveKeys();
Returns list of keys not to be saved.
This implementation ignores the key C<outbuf>,
which is used by many writer subclasses.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Format: Methods: MIME
=pod
=head2 Methods: MIME
=over 4
=item shortName
$short = $fmt->shortName();
Get short name for $fmt. Default just returns lower-cased DTA::CAB::Format:: class suffix.
Short names are all lower-case by default.
=item mimeType
$type = $fmt->mimeType();
Returns MIME type for $fmt.
Default returns 'text/plain'.
=item defaultExtension
$ext = $fmt->defaultExtension();
Returns default filename extension for $fmt (default='.cab').
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Format: Methods: Input
=pod
=head2 Methods: Input
=over 4
=item close
$fmt = $fmt->close();
$fmt = $fmt->close($savetmp);
Close current input source, if any.
Default implementation calls $fmt-E<gt>{tmpfh}->close() iff available and $savetmp is false (default).
Always deletes @$fmt{qw(fh doc)}.
=item fromString
$fmt = $fmt->fromString(\$string);
Select input from the string $string.
Default implementation calls L<$fmt-E<gt>fromFh($fmt-E<gt>{tmpfh}=$new_fh)|/fromFh>.
=item fromFile
$fmt = $fmt->fromFile($filename);
Select input from file $filename.
Default implementation calls L<$fmt-E<gt>fromFh($fmt-E<gt>{tmpfh}=$new_fh)|/fromFh>().
=item fromFh
$fmt = $fmt->fromFh($fh);
Select input from open filehandle $fh.
Default implementation just calls L<$fmt-E<gt>close(1)|/close> and sets $fmt->{fh}=$fh.
=item fromFh_str
$fmt = $fmt->fromFh_str($handle);
Alternate fromFh() implementation which slurps contents of $handle and calls L<$fmt-E<gt>fromString(\$str)|/fromString>.
=item parseDocument
$doc = $fmt->parseDocument();
Parse document from currently selected input source.
=item parseString
$doc = $fmt->parseString($str);
Wrapper for $fmt-E<gt>fromString($str)-E<gt>parseDocument().
=item parseFile
$doc = $fmt->parseFile($filename_or_fh);
Wrapper for $fmt-E<gt>fromFile($filename_or_fh)-E<gt>parseDocument()
=item parseFh
$doc = $fmt->parseFh($fh);
Wrapper for $fmt-E<gt>fromFh($fh)-E<gt>parseDocument().
=item forceDocument
$doc = $fmt->forceDocument($reference);
Attempt to tweak $reference into a L<DTA::CAB::Document|DTA::CAB::Document>.
This is
a slightly more in-depth version of L<DTA::CAB::Datum::toDocument()|DTA::CAB::Datum/item_toDocument>.
Current supported $reference forms are:
=over 4
=item L<DTA::CAB::Document|DTA::CAB::Document> object
returned literally
=item L<DTA::CAB::Sentence|DTA::CAB::Sentence> object
returns a new document
with a single sentence $reference.
=item L<DTA::CAB::Token|DTA::CAB::Token> object
returns a new document
with a single token $reference.
=item non-reference
returns a new document with a single token
whose 'text' key is $reference.
=item HASH reference with 'body' key
returns a bless()ed $reference as a L<DTA::CAB::Document|DTA::CAB::Document>.
=item HASH reference with 'tokens' key
returns a new document with the single
sentence $reference
=item HASH reference with 'text' key
returns a new document with the single
token $reference
=item ARRAY reference with non-reference initial element
returns a new document with a single sentence
whose 'tokens' field is set to $reference.
=item ... anything else
will cause a warning to be emitted and $reference to be
returned as-is.
=back
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Format: Methods: Output
=pod
=head2 Methods: Output
=over 4
=item formatLevel
$lvl = $fmt->formatLevel();
$fmt = $fmt->formatLevel($level)
Get/set output formatting level.
=item flush
$fmt = $fmt->flush();
Flush any buffered output to selected output source.
Default implementation deletes $fmt-E<gt>{outbuf} and calls $fmt-E<gt>{fh}->flush() if available.
=item toString
$fmt = $fmt->toString(\$str);
$fmt = $fmt->toString(\$str,$formatLevel)
Select output to byte-string $str.
Default implementation just wraps $fmt-E<gt>toFh($fmt-E<gt>{tmpfh}=$new_fh, $level).
=item toString_buf
$fmt_or_undef = $fmt->toString_buf(\$str)
Alternate toString() implementation which sets $str=$fmt->{outbuf}.
=item toFile
$fmt_or_undef = $fmt->toFile($filename_or_handle, $formatLevel);
Select output to named file $filename.
Default implementation just wraps L<$fmt-E<gt>toFh($fmt-E<gt>{tmpfh}=$new_fh, $level)|/toFh>.
=item toFh
$fmt_or_undef = $fmt->toFh($fh,$formatLevel);
Select output to an open filehandle $fh.
Default implementation just calls $fmt-E<gt>formatLevel($level) and sets $fmt-E<gt>{fh}=$fh.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DTA::CAB::Format: Methods: Output: Recommended API
=pod
=head2 Methods: Output: Recommended API
=over 4
=item putToken
$fmt = $fmt->putToken($tok);
Append a token to the selected output sink.
Should be non-destructive for $tok.
No default implementation,
but default implementations of other methods assume output is concatenated onto $fmt-E<gt>{outbuf}.
=item putTokenRaw
$fmt = $fmt->putTokenRaw($tok)
Copy-by-reference version of L</putToken>.
Default implementation just calls L<$fmt-E<gt>putToken($tok)|/putToken>.
=item putSentence
$fmt = $fmt->putSentence($sent)
Append a sentence to the selected output sink.
Should be non-destructive for $sent.
Default implementation just iterates $fmt->putToken() & appends 1 additional "\n" to $fmt->{outbuf}.
=item putSentenceRaw
$fmt = $fmt->putSentenceRaw($sent)
Copy-by-reference version of L</putSentence>.
Default implementation just calls L</putSentence>.
=item putDocument
$fmt = $fmt->putDocument($doc);
Append document contents to the selected output sink.
Should be non-destructive for $doc.
Default implementation just iterates $fmt-E<gt>putSentence()
=item putDocumentRaw
$fmt = $fmt->putDocumentRaw($doc);
Copy-by-reference version of L</putDocument>.
=back
=cut
##========================================================================
## END POD DOCUMENTATION, auto-generated by podextract.perl
##======================================================================
## See Also
##======================================================================
=pod
=head1 SUBCLASSES
The following formats are provided by the default distribution.
In some cases, external dependencies are also required which
may not be available on all systems.
=over 4
=item L<DTA::CAB::Format::Builtin|DTA::CAB::Format::Builtin>
Just a convenience package: load all built-in DTA::CAB::Format subclasses.
=item L<DTA::CAB::Format::ExpandList|DTA::CAB::Format::ExpandList>
Formatter for runtime term expansion, for use e.g. with
DDC L<Cab Expander|http://odo.dwds.de/~moocow/software/ddc/ddc_opt.html#Cab>,
registered as:
name=>__PACKAGE__, short=>'xl', filenameRegex=>qr/\.(?i:xl|xlist|l|lst)$/
=item L<DTA::CAB::Format::CONLLU|DTA::CAB::Format::CONLLU>
Datum parser|formatter for "vertical" text conforming to the L<C<CONLL-U>|https://universaldependencies.org/format.html>
format, with optional special handling for additional C<MISC> fields, including
C<json=JSON> for embedding L<DTA::CAB::Format::TJ|DTA::CAB::Format::TJ> CAB-token structure.
Registered as:
name=>__PACKAGE__, filenameRegex=>qr/\.(?i:conllu|conll[_-]u|cab[\.-]connlu|cab[\.-]conll[\.-]u)$/
Aliases: C<conllu conll-u cab-conllu cab-conll-u>
=item L<DTA::CAB::Format::JSON|DTA::CAB::Format::JSON>
Abstract datum parser|formatter for JSON I/O.
Transparently wraps one of the
L<DTA::CAB::Format::JSON::XS|DTA::CAB::Format::JSON::XS>
or
L<DTA::CAB::Format::JSON::Syck|DTA::CAB::Format::JSON::Syck>
classes, depending on the availability of the underlying Perl modules
(L<JSON::XS|JSON::XS> and L<JSON::Syck|JSON::Syck>, respectively).
If you have the L<JSON::XS|JSON::XS> module installed, this module provides
the fastest I/O of all available human-readable format classes.
Registered as:
name=>__PACKAGE__, short=>'json', filenameRegex=>qr/\.(?i:json|jsn)$/
=item L<DTA::CAB::Format::LemmaList|DTA::CAB::Format::LemmaList>
Formatter for runtime term lemmatization, for use e.g. with
DDC L<Cab Expander|http://odo.dwds.de/~moocow/software/ddc/ddc_opt.html#Cab>.
By default, returns all lemmata for function word input tokens (whose tag matches
the regex C</^(?:[CKP\$]|A[PR]|V[AM])/>), otherwise only the "best" lemma.
Registered as:
(name=>__PACKAGE__, short=>$_, filenameRegex=>qr/\.(?i:ll|llist|lemmas|lemmata)/)
foreach (qw(LemmaList llist ll lemma))
A variant which returns all known lemmata for each input token is registered as:
(name=>__PACKAGE__, short=>$_, opts=>{cctagre=>''})
foreach (qw(LemmaListAll LemmasAll llist-all ll-all lla lemmas lemmata))
=item L<DTA::CAB::Format::Null|DTA::CAB::Format::Null>
Null-op parser/formatter for debugging and testing purposes.
Registered as:
name=>__PACKAGE__
=item L<DTA::CAB::Format::Perl|DTA::CAB::Format::Perl>
Datum parser|formatter: perl code via Data::Dumper, eval().
Registered as:
name=>__PACKAGE__, filenameRegex=>qr/\.(?i:prl|pl|perl|dump)$/
=item L<DTA::CAB::Format::Raw|DTA::CAB::Format::Raw>
Abstract only format for reading raw untokenized text and
writing simple flat list of canonical forms;
wraps L<DTA::CAB::Format::Raw::Waste|DTA::CAB::Format::Raw::Waste> by default.
Registered as:
name=>__PACKAGE__, filenameRegex=>qr/\.(?i:raw)$/
=item L<DTA::CAB::Format::Raw::HTTP|DTA::CAB::Format::Raw::HTTP>
Input-only format for reading raw untokenized text and analyzing it
over HTTP using a remote WASTE FastCGI interface, registered as:
name=>__PACKAGE__, short=>'raw-http', filenameRegex=>qr/\.(?i:raw-http|txt-http)$/
=item L<DTA::CAB::Format::Raw::Perl|DTA::CAB::Format::Raw::Perl>
Input-only format for reading raw untokenized text and analyzing it
using simple pure-perl heuristics. Registered as:
name=>__PACKAGE__, short=>'raw-perl', filenameRegex=>qr/\.(?i:raw-perl|txt-perl)$/
=item L<DTA::CAB::Format::Raw::Waste|DTA::CAB::Format::Raw::Waste>
Input-only format for reading raw untokenized text and analyzing it
using the L<Moot::Waste|Moot::Waste> module, registered as:
name=>__PACKAGE__, short=>'raw-waste', filenameRegex=>qr/\.(?i:raw-waste|txt-waste)$/
=begin comment text
=item L<DTA::CAB::Format::SQLite|DTA::CAB::Format::SQLite>
=end comment
=item L<DTA::CAB::Format::Storable|DTA::CAB::Format::Storable>
Binary datum parser|formatter using the L<Storable|Storable> module.
Very fast, but neither human-readable nor easily portable beyond Perl.
Registered as:
name=>__PACKAGE__, filenameRegex=>qr/\.(?i:sto|bin)$/
=item L<DTA::CAB::Format::SynCoPe::CSV|DTA::CAB::Format::SynCoPe::CSV>
Datum parser|formatter for SynCoPe named entity recognizer C<-tab_input> mode.
Registered as:
name=>__PACKAGE__, short=>'syncope-csv', filenameRegex=>qr/\.(?i:syn(?:cope)?[-\.](?:csv|tsv|tab)|)$/
=item L<DTA::CAB::Format::TCF|DTA::CAB::Format::TCF>
Datum parser|formatter for CLARIN-D TCF XML.
Handles annotation layers tokens, sentences, orthography, postags, and lemmas.
Registered as:
(name=>__PACKAGE__, filenameRegex=>qr/\.(?i:(?:tcf[\.\-_]?xml)|(?:tcf))$/)
(name=>__PACKAGE__, short=>$_, opts=>{tcflayers=>'tokens sentences orthography'}) foreach (qw(tcf-orth tcf-web))
(name=>__PACKAGE__, short=>$_, opts=>{tcflayers=>'tokens sentences orthography postags lemmas'}) foreach (qw(tcf tcf-xml tcfxml full-tcf xtcf))
=item L<DTA::CAB::Format::TEI|DTA::CAB::Format::TEI>
Datum parser|formatter: for raw un-tokenized TEI XML (with or without //c elements) using L<DTA::TokWrap|DTA::TokWrap>.
Any //s or //w elements in the input will be B<IGNORED> and input will be (re-)tokenized.
Output files are themselves parseable by L<DTA::CAB::Format::TEIws|DTA::CAB::Format::TEIws>.
Registered as:
(name=>__PACKAGE__, filenameRegex=>qr/\.(?i:(?:c|chr|txt|tei(?:[\.\-_]?p[45])?)[\.\-_]xml|xml)$/)
(name=>__PACKAGE__, short=>$_) foreach (qw(chr-xml c-xml cxml tei-xml teixml tei xml))
By default, this module uses L<DTA::CAB::Format::XmlTokWrap|DTA::CAB::Format::XmlTokWrap> to format the low-level
document data, and splices the result back into the original TEI document.
The following additional aliases are provided for using the L<DTA::CAB::Format::XmlTokWrapFast|DTA::CAB::Format::XmlTokWrapFast>
module to format the low-level flat token data (faster but not as flexible as the default):
(name=>__PACKAGE__, short=>$_, opts=>{txmlfmt=>'DTA::CAB::Format::XmlTokWrapFast'})
foreach (qw(fast-tei-xml ftei-xml fteixml ftei))
Additionally, the following aliases are provided for using the L<DTA::CAB::Format::XmlLing|DTA::CAB::Format::XmlLing>
to format the low-level flat token data using TEI att.linguistic conventions:
(name=>__PACKAGE__, short=>$_, opts=>{'att.linguistic'=>1})
foreach (qw(ling-tei-xml ltei-xml lteixml ltei tei-ling tei+ling teiling))
=item L<DTA::CAB::Format::TEIws|DTA::CAB::Format::TEIws>
Datum parser|formatter: for TEI XML pre-tokenized into (possibly fragmented) //w and //s elements, as output by DTA::TokWrap.
Registered as:
(name=>__PACKAGE__, filenameRegex=>qr/\.(?i:(?:spliced|tei[\.\-\+]?ws?|wst?)[\.\-]xml)$/)
(name=>__PACKAGE__, short=>$_) foreach (qw(tei-ws tei+ws tei+w tei-w teiw wst-xml wstxml teiws-xml));
By default, this module uses L<DTA::CAB::Format::XmlTokWrap|DTA::CAB::Format::XmlTokWrap> to format the low-level
document data, and splices the result back into the original TEI document.
The following aliases are provided for using the L<DTA::CAB::Format::XmlLing|DTA::CAB::Format::XmlLing>
to format the low-level flat token data using TEI att.linguistic conventions:
(name=>__PACKAGE__, short=>$_, opts=>{'att.linguistic'=>1})
foreach (qw(lteiws teilws teiwsl ltei-ws ltei+ws tei+w ltei-w lteiw lwst-xml lwstxml lteiws-xml),
qw(ling-tei-ws tei+ling+ws tei+ws+ling teiws-ling-xml teiws+ling-xml))
=item L<DTA::CAB::Format::Text|DTA::CAB::Format::Text>
Datum parser|formatter: verbose human-readable text
Registered as:
name=>__PACKAGE__, filenameRegex=>qr/\.(?i:txt|text|cab\-txt|cab\-text)$/
=item L<DTA::CAB::Format::TJ|DTA::CAB::Format::TJ>
Datum parser|formatter: "vertical" text, one token per line, with a single TAB-separated
attribute field encoding token data as JSON.
Registered as:
(name=>__PACKAGE__, filenameRegex=>qr/\.(?i:tj|tjson|cab\-tj|cab\-tjson)$/);
=item L<DTA::CAB::Format::TT|DTA::CAB::Format::TT>
Datum parser|formatter: "vertical" text, one token per line, TAB-separated attribute fields
with conventional attribute-name prefixes.
Registered as:
name=>__PACKAGE__, filenameRegex=>qr/\.(?i:t|tt|ttt|cab\-t|cab\-tt|cab\-ttt)$/
=item L<DTA::CAB::Format::YAML|DTA::CAB::Format::YAML>
Abstract datum parser|formatter for YAML I/O.
Transparently wraps one of the
L<DTA::CAB::Format::YAML::XS|DTA::CAB::Format::YAML::XS>,
L<DTA::CAB::Format::YAML::Syck|DTA::CAB::Format::YAML::Syck>,
or
L<DTA::CAB::Format::YAML::Lite|DTA::CAB::Format::YAML::Lite>
classes, depending on the availability of the underlying Perl modules
(L<YAML::XS|YAML::XS>, L<YAML::Syck|YAML::Syck>, and L<YAML::Lite|YAML::Lite>, respectively).
Registered as:
name=>__PACKAGE__, short=>'yaml', filenameRegex=>qr/\.(?i:yaml|yml)$/
=item L<DTA::CAB::Format::XmlCommon|DTA::CAB::Format::XmlCommon>
Datum parser|formatter: XML: abstract base class.
=item L<DTA::CAB::Format::XmlLing|DTA::CAB::Format::XmlLing>
Datum parser|formatter: minimalistic flat TokWrap-like XML using only TEI att.linguistic attributes.
Based on L<DTA::CAB::Format::XmlTokWrapFast|DTA::CAB::Format::XmlTokWrapFast>,
the L<XmlLing|DTA::CAB::Format::XmlLing> parser reads and writes only IDs and the TEI att.linguistic attributes
(L<http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-att.linguistic.html>).
Registered as:
(name=>__PACKAGE__, filenameRegex=>qr/(?:\.(?i:(?:ling|l[tuws])(?:\.?)xml))$/)
(name=>__PACKAGE__, short=>$_) foreach (qw(ltxml lxml ling-xml lt-xml ltwxml ltw-xml))
=item L<DTA::CAB::Format::XmlNative|DTA::CAB::Format::XmlNative>
Datum parser|formatter: XML (native).
Nearly compatible with C<.t.xml> files as created by L<dta-tokwrap.perl(1)|dta-tokwrap.perl>.
Registered as:
name=>__PACKAGE__, filenameRegex=>qr/\.(?i:xml\-native|xml\-dta\-cab|(?:dta[\-\._]cab[\-\._]xml)|xml)$/
and aliased as:
name=>__PACKAGE__, short=>'xml'
=item L<DTA::CAB::Format::XmlPerl|DTA::CAB::Format::XmlPerl>
Datum parser|formatter: XML (perl-like). Not really recommended.
Registered as:
name=>__PACKAGE__, filenameRegex=>qr/\.(?i:xml(?:\-?)perl|perl(?:[\-\.]?)xml)$/
=item L<DTA::CAB::Format::XmlRpc|DTA::CAB::Format::XmlRpc>
Datum parser|formatter: XML-RPC data structures using RPC::XML. Much too bloated
to be of any real practical use.
Registered as:
name=>__PACKAGE__, filenameRegex=>qr/\.(?i:xml(?:\-?)rpc|rpc(?:[\-\.]?)xml)$/
=item L<DTA::CAB::Format::XmlTokWrap|DTA::CAB::Format::XmlTokWrap>
Datum parser|formatter(s): XML as read/written by L<DTA::TokWrap>.
(name=>__PACKAGE__, filenameRegex=>qr/\.(?i:[tuws]\.?xml)$/)
(name=>__PACKAGE__, short=>$_) foreach (qw(txml t-xml twxml tw-xml))
=item L<DTA::CAB::Format::XmlTokWrapFast|DTA::CAB::Format::XmlTokWrapFast>
Datum parser|formatter(s): XML as read/written by L<DTA::TokWrap>.
Unlike the C<XmlTokWrap> format,
the L<XmlTokWrapFast|DTA::CAB::Format::XmlTokWrapFast> class does not read and/or write the full document structure,
but rather restricts itself to a finite hard-coded subset of the most commonly
used document-, sentence-, and token-level attributes. The input parser
uses the expat-based L<XML::Parser|XML::Parser> module, which usually results in much faster
and memory-friendlier document parsing than offered by the L<XmlTokWrap|DTA::CAB::Format::XmlTokWrap> class.
Registered as:
(name=>__PACKAGE__, filenameRegex=>qr/(?:\.(?i:f[tuws](?:\.?)xml))$/);
(name=>__PACKAGE__, short=>$_) foreach (qw(ftxml ft-xml ftwxml ftw-xml))
=back
=cut
##======================================================================
## Footer
##======================================================================
=pod
=head1 AUTHOR
Bryan Jurish E<lt>moocow@cpan.orgE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2009-2020 by Bryan Jurish
This package is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.24.1 or,
at your option, any later version of Perl 5 you may have available.
=cut