Yet Another CPAN Grep

PICA-Data/lib/PICA/Schema/Builder.pm

package PICA::Schema::Builder;
use v5.14.1;

our $VERSION = '2.12';

use Scalar::Util qw(reftype);
use Storable qw(dclone);

use parent 'PICA::Schema';

sub new {
    my $class = shift;
    bless {fields => {}, records => 0, @_}, $class;
}

sub add {
    my ($self, $record) = @_;

    $record = $record->{record} if reftype $record eq 'HASH';
    my $fields = $self->{fields};

    my %field_identifiers;
    foreach (@$record) {
        my ($tag, $occ, @content) = @$_;

        my $id = $self->field_identifier($_);

        # check whether field is repeated within this record
        if ($field_identifiers{$id}) {
            $fields->{$id}{repeatable} = \1;
        }
        else {
            $field_identifiers{$id} = 1;
        }

        # field has not been seen in any record
        if (!$fields->{$id}) {
            next if $self->{ignore_unknown};
            $fields->{$id}
                = {total => 0, records => 0, tag => $tag, subfields => {},};
            $fields->{$id}{occurrence} = $occ if $occ > 0 && length $id gt 4;
            $fields->{$id}{required} = \1 unless $self->{records};
        }

        my $subfields = $fields->{$id}{subfields};
        my %subfield_codes;
        pop @content if @content % 2;    # remove annotation
        while (@content) {
            my ($code, $value) = splice(@content, 0, 2);

            # check whether subfield is repeated within this field
            if ($subfield_codes{$code}) {
                $subfields->{$code}{repeatable} = \1;
            }
            else {
                $subfield_codes{$code} = 1;
            }

            if (!$subfields->{$code}) {
                $subfields->{$code} = {code => $code};
                $subfields->{$code}{required} = \1
                    unless $fields->{$id}{total};
            }
        }

        # subfields not given in this field are not required
        for (grep {!$subfield_codes{$_}} keys %$subfields) {
            delete $subfields->{$_}{required};
        }

        # count total number of this field
        $fields->{$id}{total}++;
    }

    for (grep {exists $fields->{$_}} keys %field_identifiers) {
        $fields->{$_}{records}++;
    }

    # fields not given in this record are not required
    for (grep {!$field_identifiers{$_}} keys %$fields) {
        delete $fields->{$_}{required};
    }

    $self->{records}++;
}

sub schema {
    my $schema = dclone($_[0]->TO_JSON);
    my $fields = $schema->{fields};

    delete $fields->{$_} for grep {!$fields->{$_}{total}} keys %$fields;

    return PICA::Schema->new($schema);
}

1;
__END__

=head1 NAME

PICA::Schema::Builder - Create Avram Schema from examples

=head1 SYNOPSIS

  my $builder = PICA::Schema::Builder->new( title => 'My Schema' );

  while (my $record = get_some_pica_record()) {
      $builder->add($record);
  }

  $schema = $builder->schema;

=head1 DESCRIPTION

An L<Avram Schema|https://format.gbv.de/schema/avram/specification> can be
created automatically from L<PICA::Data> records. The result contains a list of
field that have been used in any of the inspected records. The schema can tell:

=over

=item

the number of inspected C<records>

=item

which C<fields> and their C<subfields> have been found in records

=item

the C<total> number each field has been found and the number of C<records>
with each field.

=item

which of the fields occurr in all records (C<required>)

=item

whether a field has been repeated in a record (C<repeatable>)

=back

And information about requiredness and repeatability for subfields.
Subfield order is not taken into account.

This class is a subclass of L<PICA::Schema>.

=head1 CONSTRUCTOR

The builder can be initialized with information of an existing builder or
schema, in particular C<fields>, C<total>, and C<records>. Option
C<ignore_unknown> will ignore fields not already specified in C<fields>.

=head1 METHODS

=head2 add( $record )

Analyse an additional PICA record.

=head2 schema

Return a L<PICA::Schema> that all analyzed records conform to. This methods
creates a deep copy and removes all fields with C<total> zero.

=head1 SEE ALSO

L<Catmandu::Breaker> can analyze PICA data and create additional statistics.

=cut
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.