DiaColloDB/DiaColloDB.pod
##========================================================================
## POD DOCUMENTATION, auto-generated by podextract.perl
##========================================================================
## NAME
=pod
=head1 NAME
DiaColloDB - diachronic collocation database, top-level
=cut
##========================================================================
## SYNOPSIS
=pod
=head1 SYNOPSIS
##========================================================================
## PRELIMINARIES
use DiaColloDB;
##========================================================================
## Constructors etc.
$coldb = CLASS_OR_OBJECT->new(%args);
##========================================================================
## I/O: open/close
$coldb_or_undef = $coldb->open($dbdir,%opts);
@dbkeys = $coldb->dbkeys();
$coldb_or_undef = $coldb->close();
$bool = $coldb->opened();
@files = $coldb->diskFiles();
##========================================================================
## I/O: header
@keys = $coldb->headerKeys();
$bool = $coldb->loadHeaderData();
##========================================================================
## create/compile (see DiaColloDB::methods::compile)
$bool = $coldb->create($corpus,%opts);
$coldb = $CLASS_OR_OBJECT->union(\@coldbs_or_dbdirs,%opts);
##========================================================================
## Export/Import (see DiaColloDB::methods::export)
$bool = $coldb->dbexport();
$coldb = $coldb->dbimport();
##========================================================================
## Info
\%info = $coldb->dbinfo();
##========================================================================
## Profiling: Utils
$relname = $coldb->relname($rel);
$obj_or_undef = $coldb->relation($rel);
\@ids = $coldb->enumIds($enum,$req,%opts);
($dfilter,$sliceLo,$sliceHi,$dateLo,$dateHi)
= $coldb->parseDateRequest($dateRequest='', $sliceRequest=0, $fill=0, $ddcMode=0);
$compiler = $coldb->qcompiler();
$cquery_or_undef = $coldb->qparse($ddc_query_string);
$cquery = $coldb->parseQuery([[$attr1,$val1],...], %opts) ##-- compat: ARRAY-of-ARRAYs
\@aqs = $coldb->queryAttributes($cquery,%opts);
\@aqs = $coldb->parseRequest($request, %opts);
\%groupby = $coldb->groupby($groupby_request, %opts);
$cqfilter = $coldb->query2filter($attr,$cquery,%opts);
($CQCountKeyExprs,\$CQRestrict,\@CQFilters)
= $coldb->parseGroupBy($groupby_string_or_request,%opts);
##========================================================================
## Profiling: Generic
$mprf = $coldb->profile($relation, %opts);
$mprf = $coldb->extend($relation,%opts);
\%opts = $CLASS_OR_OBJECT->profileOptions(\%opts);
##========================================================================
## Profiling: Comparison (diff)
$mprf = $coldb->compare($relation, %opts);
\%opts = $CLASS_OR_OBJECT->compareOptions(\%opts);
=cut
##========================================================================
## DESCRIPTION
=pod
=head1 DESCRIPTION
The DiaColloDB package is the top-level module
for the DiaColloDB diachronic collocation database distribution.
As a Perl class, a DiaColloDB object can be used to create
or query a local native database instance.
=cut
##----------------------------------------------------------------
## DESCRIPTION: DiaColloDB: Globals & Constants
=pod
=head2 Globals & Constants
=over 4
=item Variable: $VERSION
Package version.
=item Variable: @ISA
L<DiaColloDB|DiaColloDB> inherits from L<DiaColloDB::Client|DiaColloDB::Client>,
and provides the low-level basis for the L<DiaColloDB::Client|DiaColloDB::Client> API.
=item Variables: Corpus Filters
Default corpus-content filter values are
imported from L<DiaColloDB::Corpus::Filters|DiaColloDB::Corpus::Filters>.
=item Variable: $TDF_MGOOD_DEFAULT
Default positive meta-field regex for document parsing (tdf only).
Default = C<q/^(?:author|pnd|title|basename|collection|flags|textClass|genre)$/>.
=item Variable: $TDF_MBAD_DEFAULT
Default negative meta-field regex for document parsing (tdf only).
Default = C<q/_$/>.
=item Variable: $ECLASS
Enum class.
Default = 'DiaColloDB::EnumFile::MMap'.
=item Variable: $XECLASS
fixed-length enum class.
Default = 'DiaColloDB::EnumFile::FixedLen'
=item Variable: $MMCLASS
multimap class.
Default = 'DiaColloDB::MultiMapFile'
=item Variable: %TDF_OPTS
Default options for L<DiaColloDB::Relation::TDF-E<gt>new()|DiaColloDB::Relation::TDF/new>.
Default:
mgood => $TDF_MGOOD_DEFAULT, ##-- positive filter regex for metadata attributes
mbad => $TDF_MBAD_DEFAULT, ##-- negative filter regex for metadata attributes
##
minFreq=>undef, ##-- minimum total term-frequency for model inclusion (default=from $coldb->{tfmin})
minDocFreq=>4, ##-- minimum "doc-frequency" (#/docs per term) for model inclusion
minDocSize=>4, ##-- minimum doc size (#/tokens per doc) for model inclusion (default=8; formerly $coldb->{vbnmin})
maxDocSize=>'inf', ##-- maximum doc size (#/tokens per doc) for model inclusion (default=inf; formerly $coldb->{vbnmax})
##
vtype=>'float', ##-- store compiled values as 32-bit floats
itype=>'long', ##-- store compiled indices as 32-bit integers
=item Variable: $NJOBS
Number of parallel jobs (threads) for various compile-time operations.
Setting this variable to C<0> (zero) should cause all DiaColloDB operations
to run in pure serial.
On UNIX/Linux systems with a parseable F</proc/cpuinfo> pseudo-file,
setting this to C<-1> will run as many threads as there are CPU cores
on the system; on other systems, negative values behave like C<0>.
Default: -1.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DiaColloDB: Constructors etc.
=pod
=head2 Constructors etc.
=over 4
=item new
$coldb = CLASS_OR_OBJECT->new(%args);
%args, object structure:
(
##-- options
dbdir => $dbdir, ##-- database directory; REQUIRED
flags => $fcflags, ##-- fcntl flags or open()-style mode string; default='r'
attrs => \@attrs, ##-- index attributes (input as space-separated or array; compiled to array); default=undef (==>['l'])
## + each attribute can be token-attribute qw(w p l) or a document metadata attribute "doc.ATTR"
## + document "date" attribute is always indexed
info => \%info, ##-- additional data to return in dbinfo() method (e.g. collection, maintainer)
pack_id => $fmt, ##-- pack-format for IDs (default='N')
pack_f => $fmt, ##-- pack-format for frequencies (default='N')
pack_date => $fmt, ##-- pack-format for dates (default='n')
pack_off => $fmt, ##-- pack-format for file offsets (default='N')
pack_len => $fmt, ##-- pack-format for string lengths (default='n')
dmax => $dmax, ##-- maximum distance for collocation-frequencies and implicit ddc near() queries (default=5)
cfmin => $cfmin, ##-- minimum co-occurrence frequency for Cofreqs and ddc queries (default=2)
tfmin => $tfmin, ##-- minimum global term-frequency WITHOUT date component (default=2)
fmin_${a} => $fmin, ##-- minimum independent frequency for value of attribute ${a} (default=undef:from $tfmin)
keeptmp => $bool, ##-- keep temporary files? (default=0)
index_xf => $bool, ##-- xf: create/use unigram index (default=1)
index_cof => $bool, ##-- cof: create/use co-frequency index (default=1)
index_tdf => $bool, ##-- tdf: create/use (term x document) frequency matrix index? (default=undef: if available)
dbreak => $dbreak, ##-- tdf: use break-type $dbreak for tdf index (default=undef: files)
tdfopts => \%tdfopts, ##-- tdf: options for DiaColloDB::Relation::TDF->new(); default=undef (all inherited from %TDF_OPTS)
##
##-- runtime ddc relation options
ddcServer => $server, ##-- server for ddc relation ("$host:$port")
ddcTimeout => $secs, ##-- timeout for ddc relation
##
##-- source filtering (for create() - see DiaColloDB::Corpus::Filters)
pgood => $regex, ##-- positive filter regex for part-of-speech tags
pbad => $regex, ##-- negative filter regex for part-of-speech tags
wgood => $regex, ##-- positive filter regex for word text
wbad => $regex, ##-- negative filter regex for word text
lgood => $regex, ##-- positive filter regex for lemma text
lbad => $regex, ##-- negative filter regex for lemma text
##
##-- logging
logOpen => $level, ##-- log-level for open/close (default='info')
logCreate => $level, ##-- log-level for create messages (default='info')
logCorpusFile => $level, ##-- log-level for corpus file-parsing (default='trace')
logCorpusFileN => $N, ##-- log corpus file-parsing only for every N files (0 for none; default:undef ~ $corpus->size()/100)
logExport => $level, ##-- log-level for export messages (default='info')
logProfile => $level, ##-- log-level for verbose profiling messages (default='trace')
logRequest => $level, ##-- log-level for request-level profiling messages (default='debug')
logCompat => $level, ##-- log-level for compatibility warnings (default='warn')
##
##-- runtime limits
maxExpand => $size, ##-- maximum number of elements in query expansions (default=65535)
##
##-- administrivia
version => $version, ##-- DiaColloDB version of stored db (==$DiaColloDB::VERSION)
upgraded=>\@upgraded, ##-- optional administrative information about auto-magic upgrades
##
##-- attribute data
${a}enum => $aenum, ##-- attribute enum: $aenum : ($dbdir/${a}_enum.*) : $astr<=>$ai : A*<=>N
## e.g. lemmata: $lenum : ($dbdir/l_enum.* ) : $lstr<=>$li : A*<=>N
${a}2t => $a2t, ##-- attribute multimap: $a2t : ($dbdir/${a}_2t.*) : $ai=>@tis : N=>N*
pack_t$a => $fmt, ##-- pack format: extract attribute-id $ai from a packed tuple-string $ts ; $ai=unpack($coldb->{"pack_t$a"},$ts)
##
##-- tuple data (-dates)
## + as of v0.10.000, packed term tuples EXCLUDING dates ("t-tuples") are mapped by $coldb->{tenum}
## + prior to v0.10.000, term tuples INCLUDING dates ("x-tuples") were mapped by $coldb->{xenum}, now obsolete
tenum => $tenum, ##-- enum: tuples ($dbdir/tenum.*) : \@ais<=>$ti : N*<=>N
pack_t => $fmt, ##-- symbol pack-format for $tenum : "${pack_id}[Nattrs]"
xenum => $xenum, ##-- enum: tuples ($dbdir/xenum.*) : [@ais,$di]<=>$xi : N*n<=>N
pack_t => $fmt, ##-- symbol pack-format for $tenum : "${pack_id}[Nattrs]"
xdmin => $xdmin, ##-- minimum date (>= v0.04)
xdmax => $xdmax, ##-- maximum date (>= v0.04)
##
##-- relation data
xf => $xf, ##-- ug: [$ti, $date] => f($ti, $date)
cof => $cof, ##-- cf: [$ti1,$date,$ti2] => f($ti1,$date,$ti2)
ddc => $ddc, ##-- ddc client relation
tdf => $tdf, ##-- tdf: (term x document) frequency matrix relation
)
=item promote
$cli_or_undef = $cli->promote($class,%opts);
DiaColloDB::Client method override: unsupported.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DiaColloDB: I/O: open/close
=pod
=head2 I/O: open/close
=over 4
=item open
$coldb_or_undef = $coldb->open($dbdir,%opts);
$coldb_or_undef = $coldb->open();
Open the DB.
=item dbkeys
@dbkeys = $coldb->dbkeys();
Returns list of %$coldb keys whose values are expected to be sub-objects.
=item close
$coldb_or_undef = $coldb->close();
Close current DB, if opened.
=item opened
$bool = $coldb->opened();
Returns true iff db is opened.
=item diskFiles
@files = $coldb->diskFiles();
Returns list of disk files for $coldb.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DiaColloDB: I/O: header
=pod
=head2 I/O: header
Largely inherited from DiaColloDB::Persistent.
=over 4
=item headerKeys
@keys = $coldb->headerKeys();
keys to save as header
=item loadHeaderData
$bool = $coldb->loadHeaderData();
$bool = $coldb->loadHeaderData($data)
loads header data.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DiaColloDB: create
=pod
=head2 create/compile
See L<DiaColloDB::methods::compile>.
=cut
##----------------------------------------------------------------
## DESCRIPTION: DiaColloDB: Export/Import
=pod
=head2 Export/Import
See L<DiaColloDB::methods::export>.
=cut
##----------------------------------------------------------------
## DESCRIPTION: DiaColloDB: Info
=pod
=head2 Info
=over 4
=item dbinfo
\%info = $coldb->dbinfo();
get db info
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DiaColloDB: Profiling: Utils
=pod
=head2 Profiling: Utils
=over 4
=item relname
$relname = $coldb->relname($rel);
Returns an appropriate relation name for profile() and friends:
=over 4
=item *
returns $rel if $coldb-E<gt>{$rel} supports a profile() method
=item *
otherwise heuristically parses $rel as /xf|f?1|ug/ or /f1?2|c/
=back
=item relation
$obj_or_undef = $coldb->relation($rel);
returns an appropriate relation-like object for profile() and friends;
really just wraps C<$coldb-E<gt>{$coldb-E<gt>relname($rel)}>.
=item relations
@relnames = $coldb->relations();
gets list of relation names supported by $coldb.
=item enumIds
\@ids = $coldb->enumIds($enum,$req,%opts);
parses enum IDs for $req, which is one of:
=over 4
=item *
a DDC::XS::CQTokExact, ::CQTokInfl, ::CQTokSet, ::CQTokSetInfl, or ::CQTokRegex : interpreted
=item *
an ARRAY-ref : list of literal symbol-values
=item *
a Regexp ref : regexp for target strings, passed to $enum-E<gt>re2i()
=item *
a string /REGEX/ : regexp for target strings, passed to $enum-E<gt>re2i()
=item *
another string : space-, comma-, or C<|>-separated list of literal values
=back
%opts:
logLevel => $logLevel, ##-- logging level (default=undef)
logPrefix => $prefix, ##-- logging prefix (default="enumIds(): fetch ids")
=item parseDateRequest
($dfilter,$sliceLo,$sliceHi,$dateLo,$dateHi) = $coldb->parseDateRequest($dateRequest='', $sliceRequest=0, $fill=0, $ddcMode=0);
\%dateRequest = $coldb->parseDateRequest($dateRequest='', $sliceRequest=0, $fill=0, $ddcMode=0);
low-level parsing for date (slice) requests. Returns limit and filter information as a list if called
in list context (first form) or as a HASH-ref C<\%dateRequest> if called in scalar context (second form).
Returned C<\%dateRequest> has keys corresponding to the list-elements returned in list context:
dfilter => $dfilter, ##-- filter-sub, called as: $wanted=$dfilter->($date); undef for none
slo => $sliceLo, ##-- minimum slice (inclusive)
shi => $sliceHi, ##-- maximum slice (inclusive)
dlo => $dateLo, ##-- minimum date (inclusive); undef for none, always defined if $fill is true
dhi => $dateHi, ##-- maximum date (inclusive); undef for none, always defined if $fill is true
Accepted formats for input parameter C<$dateRequest>:
=over 4
=item Empty Date
An empty string or a string containing only whitespace and asterisk (C<*>) characters
is ignored (C<$dlo=$dhi=undef>); this
should be interpreted by the caller as requesting the full indexed date range.
=item Date Regex
A date request C</REGEX/> enclosed in slashes is treated as a regular expression
matching all and only the desired dates. Throws an error if
C<$ddcMode> is true, since DDC currently does not support date regexes.
=item Date Range
A date request of the form C<MIN:MAX> matches all dates in the range
I<[MIN..MAX]> (inclusive). For convenience, either or both of I<MIN>
and I<MAX> may be an asterisk (C<*>) to indicate the minimum (rsp. maximum)
date stored in the index.
=item Date List
A whitespace-, comma-, or C<|>-separated list of values is treated as a literal list of target dates.
Throws an error if C<$ddcMode> is true.
=item Date Value
Any other value is treated as a literal single target date.
=back
=item qcompiler
$compiler = $coldb->qcompiler();
get DDC::XS::CQueryCompiler for this object (cached in $coldb-E<gt>{_qcompiler})
=item qparse
$cquery_or_undef = $coldb->qparse($ddc_query_string);
wraps parse in an eval {...} block and sets $coldb-E<gt>{error} on failure
=item parseQuery
$cquery = $coldb->parseQuery([[$attr1,$val1],...], %opts) ##-- compat: ARRAY-of-ARRAYs;
$cquery = $coldb->parseQuery(["$attr1:$val1",...], %opts) ##-- compat: ARRAY-of-requests
$cquery = $coldb->parseQuery({$attr1=>$val1, ...}, %opts) ##-- compat: HASH
$cquery = $coldb->parseQuery("$attr1=$val1, ...", %opts) ##-- compat: string
$cquery = $coldb->parseQuery($ddcQueryString, %opts) ##-- ddc string (with shorthand ","->WITH, "&&"->WITH)
Guts for parsing user target and groupby requests;
returns a L<DDC::XS::CQuery|DDC::XS::CQuery> object representing the request.
Index-only items "$l" are mapped to $l=*
%opts:
warn => $level, ##-- log-level for unknown attributes (default: 'warn')
logas => $reqtype, ##-- request type for warnings
default => $attr, ##-- default attribute (for query requests)
mapand => $bool, ##-- map CQAnd to CQWith? (default=true unless '&&' occurs in query string)
ddcmode => $bool, ##-- force ddc query mode? (default=false)
If the first argument is a reference, it is parsed as a native query request.
Otherwise, it is assumed to be a string either in the "native" (backwards-compatible)
single-token request-notation
or a valid L<DDC query|http://odo.dwds.de/~moocow/software/ddc/ddc_query.html>.
If the request looks like a simple request, it is parsed into a L<DDC::XS::CQuery|DDC::XS::CQuery> object
using local heuristics; DDC queries are parsed directly. The query syntax for "native"
DiaColloDB queries is:
q_native ::= qn_clause ((" "|",") qn_clause)*
qn_clause ::= ("$"? qn_attr "=")? qn_value
qn_attr ::= STRING
qn_value ::= qn_regex | qn_words
qn_regex ::= "/" REGEX "/" qn_regmod
qn_regmod ::= ("g"|"i"|"m"|"s"|"a"|"l"|"u"|"x")*
qn_words ::= qn_word ("|" qn_word)*
qn_word ::= STRING
Native request clauses are parsed into queries of type
L<CQTokSet|DDC::XS::CQuery>,
L<CQTokExact|DDC::XS::CQuery>,
L<CQTokRegex|DDC::XS::CQuery>,
or L<CQTokAny|DDC::XS::CQuery>,
and the returned query object conjoins multiple native request clauses
using L<CQWith|DDC::XS::CQuery>.
DDC queries are much more flexible,
but not all L<DiaColloDB::Relation|DiaColloDB::Relation> types support the full range
of the DDC query syntax.
In particular, the default relation classes
L<DiaColloDB::Relation::Cofreqs|DiaColloDB::Relation::Cofreqs>
and
L<DiaColloDB::Relation::Unigrams|DiaColloDB::Relation::Unigrams>
support only those query types
accepted by
the L<queryAttributes()|/queryAttributes> method.
=item queryAttributes
\@aqs = $coldb->queryAttributes($cquery,%opts);
Utility for decomposing DDC queries into attribute-wise requests;
returns an ARRAY-ref [[$attr1,$val1], ...].
Each value $vali is empty or undef (all values),
a L<CQTokSet|DDC::XS::CQuery>,
a L<CQTokExact|DDC::XS::CQuery>,
a L<CQTokRegex|DDC::XS::CQuery>,
or a L<CQTokAny|DDC::XS::CQuery>.
Chokes on unsupported query types or filters.
%opts:
warn => $level, ##-- log-level for unknown attributes (default: 'warn')
logas => $reqtype, ##-- request type for warnings
default => $attr, ##-- default attribute (for query requests)
allowUnknown => $bool, ##-- allow unknown attributes? (default: 0)
=item parseRequest
\@aqs = $coldb->parseRequest($request, %opts);
Guts for parsing user target and groupby requests into attribute-wise ARRAY-ref C<[[$attr1,$val1], ...]>,
used by native profiling methods.
See L<parseQuery()|/parseQuery> method for supported C<$request> formats and C<%opts>.
Wraps C<$coldb-E<gt>queryAttributes($coldb-E<gt>parseQuery($request,%opts))>.
=item groupby
\%groupby = $coldb->groupby($groupby_request, %opts);
\%groupby = $coldb->groupby(\%groupby, %opts);
Parse a user groupby request, used by native profiling methods.
See L<parseRequest()|/parseRequest> for details on syntax of C<$groupby_request>.
Unlike "query" request parsing, native query-request B<attributes> are obligatory and B<values> are optional
in "groupby" parsing mode:
q_groupby ::= qg_clause ((" "|",") qg_clause)*
qg_clause ::= "$"? qn_attr ("=" qn_value)?
Returns a HASH-ref of the form:
req => $request, ##-- save request
ti2g => \&ti2g, ##-- group-tuple extraction code ($ti => $gtuple) : $g_packed = $ti2g->($ti)
ts2g => \&ts2g, ##-- group-tuple extraction code ($ts => $gtuple) : $g_packed = $ts2g->($ts)
g2s => \&g2s, ##-- stringification object suitable for DiaColloDB::Profile::stringify() [CODE,enum, or undef]
g2txt => \&g2txt, ##-- backwards-compatible join()-string stringification sub: join("\t",unpack($pack_g,$g_packed))
tpack => \@tpack, ##-- group-attribute-wise pack-templates, given @ttuple
gpack => \@gpack, ##-- group-attribute-wise pack-templates, given @gtuple
areqs => \@areqs, ##-- parsed attribute requests ([$attr,$ahaving],...)
attrs => \@attrs, ##-- like $coldb->attrs($groupby_request), modulo "having" parts
titles => \@titles, ##-- like map {$coldb->attrTitle($_)} @attrs
Options %opts:
warn => $level, ##-- log-level for unknown attributes (default: 'warn')
relax => $bool, ##-- allow unsupported attributes (default=0)
tenum => $tenum, ##-- enum to use for \&t2g and \&t2s (default: $coldb->{tenum})
=item query2filter
$cqfilter = $coldb->query2filter($attr,$cquery,%opts);
Converts a CQToken to a CQFilter, for ddc parsing.
%opts:
logas => $logas, ##-- log-prefix for warnings
=item parseGroupBy
($CQCountKeyExprs,\$CQRestrict,\@CQFilters) = $coldb->parseGroupBy($groupby_string_or_request,%opts);
%opts:
date => $date,
slice => $slice,
matchid => $matchid, ##-- default match-id
ddc-mode groupby parsing utility.
In addition to the native groupby syntax supported by
the L<groupby()|/groupby> method,
ddc-mode parsing
also allows specification of a literal DDC L<count-key list|http://odo.dwds.de/~moocow/software/ddc/ddc_query.html#rule_l_countkeys>
by enclosing it in square brackets:
ddc_groupby ::= q_groupby | ("#BY"? "[" l_countkeys "]")
This is mainly useful in conjunction with
user-defined L<match-ids|http://odo.dwds.de/~moocow/software/ddc/ddc_query.html#rule_matchid>
in the corresponding parsed L<query|DDC::XS::CQuery>,
document metadata attributes,
and/or server-side regex key transformations;
see L<http://odo.dwds.de/~moocow/software/ddc/ddc_query.html#rule_count_key> for details.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DiaColloDB: Profiling: Generic
=pod
=head2 Profiling: Generic
=over 4
=item profile
$mprf = $coldb->profile($relation, %opts);
Get a relation profile for selected items as a L<DiaColloDB::Profile::Multi|DiaColloDB::Profile::Multi> object.
%opts:
##-- selection parameters
query => $query, ##-- target request ATTR:REQ...
date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all
##
##-- aggregation parameters
slice => $slice, ##-- date slice (default=1, 0 for global profile)
groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method
##
##-- scoring and trimming parameters
eps => $eps, ##-- smoothing constant (default=0)
score => $func, ##-- scoring function (f,fm,lf,lfm,mi,ld) : default="f"
kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all
cutoff => $cutoff, ##-- minimum score
global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0)
##
##-- profiling and debugging parameters
strings => $bool, ##-- do/don't stringify (default=do)
fill => $bool, ##-- if true, returned multi-profile will have null profiles inserted for missing slices
onepass => $bool, ##-- if true, use old, fast, incorrect 1-pass method (default=0)
Sets default %opts and wraps $coldb-E<gt>L<relation|/relation>($rel)-E<gt>L<profile|DiaColloDB::Relation/profile>($coldb, %opts).
=item extend
$mprf = $coldb->extend($relation, %opts);
Get independent f2 frequencies for C<$opts{slice2keys}>, which is B<EITHER>
a HASH-ref C<{$sliceLabel1=E<gt>\@sliceKeys1, ...}>,
B<OR>
a JSON-string encoding such a HASH-ref.
Options C<%opts> are as for the L<profile()|/profile> method (mostly ignored),
and also:
slice2keys => \%slice2keys, ##-- target f2-items or JSON-string (REQUIRED)
Returns a L<DiaColloDB::Profile::Multi|DiaColloDB::Profile::Multi> object
containing the appropriate f2 entries. Used by L<list-clients|DiaColloDB::Client::list>
to ensure correct f2 counts for "missing" collocate items;
see L<DiaColloDB::Client::list/"Incorrect Independent Collocate Frequencies"> for details.
=item profileOptions
\%opts = $CLASS_OR_OBJECT->profileOptions(\%opts);
Instantiates default options for L<profile()|/profile> method.
May be used e.g. by L<DiaColloDB::Client|DiaColloDB::Client> subclasses.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DiaColloDB: Profiling: Comparison (diff)
=pod
=head2 Profiling: Comparison (diff)
=over 4
=item compare
$mprf = $coldb->compare($relation, %opts);
Get a relation comparison profile for selected items as a L<DiaColloDB::Profile::MultiDiff|DiaColloDB::Profile::MultiDiff> object.
%opts:
##-- selection parameters
(a|b)?query => $query, ##-- target query as for parseRequest()
(a|b)?date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all
##
##-- aggregation parameters
groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method
(a|b)?slice => $slice, ##-- date slice (default=1, 0 for global profile)
##
##-- scoring and trimming parameters
eps => $eps, ##-- smoothing constant (default=0)
score => $func, ##-- scoring function (f,fm,lf,lfm,mi,ld) : default="f"
kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all
cutoff => $cutoff, ##-- minimum score (UNUSED for comparison profiles)
global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0)
diff => $diff, ##-- low-level score-diff operation (diff|adiff|sum|min|max|avg|havg|gavg|lavg); default='adiff'
##
##-- profiling and debugging parameters
strings => $bool, ##-- do/don't stringify (default=do)
Sets default %opts and wraps $coldb-E<gt>L<relation|/relation>($rel)-E<gt>L<compare|DiaColloDB::Relation/compare>($coldb, %opts)
=item compareOptions
\%opts = $CLASS_OR_OBJECT->compareOptions(\%opts);
Instantiates default options for L<compare()|/compare> method.
May be used e.g. by L<DiaColloDB::Client|DiaColloDB::Client> subclasses.
=back
=cut
##======================================================================
## Footer
##======================================================================
=pod
=head1 AUTHOR
Bryan Jurish E<lt>moocow@cpan.orgE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2015-2020 by Bryan Jurish
This package is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.14.2 or,
at your option, any later version of Perl 5 you may have available.
=head1 SEE ALSO
L<DiaColloDB::methods::compile(3pm)|DiaColloDB::methods::compile>,
L<DiaColloDB::methods::export(3pm)|DiaColloDB::methods::export>,
L<DiaColloDB::Client(3pm)|DiaColloDB::Client>,
L<DiaColloDB::Corpus(3pm)|DiaColloDB::Corpus>,
L<DiaColloDB::Document(3pm)|DiaColloDB::Document>,
L<DiaColloDB::Persistent(3pm)|DiaColloDB::Persistent>,
L<DiaColloDB::Profile(3pm)|DiaColloDB::Profile>,
L<DiaColloDB::Relation(3pm)|DiaColloDB::Relation>,
L<DiaColloDB::Temp(3pm)|DiaColloDB::Temp>,
L<DiaColloDB::Utils(3pm)|DiaColloDB::Utils>,
L<dcdb-create.perl(1)|dcdb-create.perl>,
L<dcdb-query.perl(1)|dcdb-query.perl>,
L<dcdb-info.perl(1)|dcdb-info.perl>,
L<dcdb-export.perl(1)|dcdb-export.perl>,
L<dcdb-dump.perl(1)|dcdb-dump.perl>,
L<DiaColloDB::WWW(3pm)|DiaColloDB::WWW>,
L<perl(1)|perl>,
...
=cut