Group
Extension

Apache-Tika-Async/lib/Apache/Tika/Async.pm

package Apache::Tika::Async;
use strict;
use Moo 2;
use JSON::XS qw(decode_json);
use File::Temp 'tempfile';

our $VERSION = '0.11';

=head1 NAME

Apache::Tika::Async - connect to Apache Tika

=head1 SYNOPSIS

  use Apache::Tika::Async;

  my $tika= Apache::Tika::Async->new;

  my $fn= shift;

  use Data::Dumper;
  my $info = $tika->get_all( $fn );
  print Dumper $info->meta($fn);
  print $info->content($fn);
  # <html><body>...
  print $info->meta->{"meta:language"};
  # en

=head1 ACCESSORS

=cut

=head2 B<jarfile>

  jarfile => '/opt/tika/tika-standard-2.9.9.jar',

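Sets the Tika jar file to be used. The default is the value of the
C<PERL_APACHE_TIKA_PATH> environment variable or, if that is not set, the
C<tika-server-*.jar> file with the highest version number found in the
directory C<jar/> below the current directory.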

=cut

has jarfile => (
    is      => 'rw',
    #isa => 'Str',
    default => sub {
        # A path supplied through the environment takes precedence over
        # anything found on disk.
        my $from_env = $ENV{PERL_APACHE_TIKA_PATH};
        return $from_env if $from_env;

        # Otherwise pick among the server jars below ./jar, whose names look
        # like tika-server-1.24.1.jar or tika-server-standard-2.3.0.jar
        my @candidates = glob 'jar/tika-server-*.jar';
        return __PACKAGE__->best_jar_file( @candidates );
    },
);

=head2 B<tika_args>

  tika_args => [],

Additional Tika command line options.

=cut


# Extra arguments that cmdline() appends after the Tika options.
# Every object starts out with its own empty array reference.
has 'tika_args' => (
    #isa => 'Array',
    default => sub { return [] },
    is      => 'rw',
);

=head2 B<java>

  java => '/opt/openjdk-11-jre/bin/java',

Sets the Java executable to be used.

=cut

# Name or path of the Java executable; the bare name 'java' is used
# unless another value is supplied.
has 'java' => (
    #isa => 'Str',
    default => 'java',
    is      => 'rw',
);

=head2 B<java_args>

  java_args => [],

Sets the Java options to be used.

=cut

# Options handed to the Java executable before '-jar'.
has 'java_args' => (
    is      => 'rw',
    #isa => 'Array',
    builder => sub {
        my @jvm_options = (
            # So that Tika can re-read some problematic PDF files better
            '-Dorg.apache.pdfbox.baseParser.pushBackSize=1000000',
        );
        return \@jvm_options;
    },
);

# Renders a Tika configuration document as a single-line XML string.
#
# %entries maps element names to their text content. Each pair becomes
# <name>value</name> inside <server><params>, ordered by element name.
# Values are inserted verbatim, without any XML escaping.
sub _tika_config_xml {
    my $self    = shift;
    my %entries = @_;

    my $params = '';
    for my $tag ( sort keys %entries ) {
        $params .= join '', "<$tag>", $entries{ $tag }, "</$tag>";
    }

    return '<?xml version="1.0" encoding="UTF-8"?>'
         . '<properties>'
         . '<!-- <parsers etc.../> -->'
         . '<server>'
         . '<params>'
         . $params
         . '</params>'
         . '</server>'
         . '</properties>';
}

# Returns the Tika configuration XML for this object.
#
# The object's loglevel is passed as the 'logLevel' entry. Any %entries
# supplied by the caller follow it in the list, so a caller-supplied
# 'logLevel' replaces the object's value.
sub tika_config {
    my $self = shift;

    my @settings = ( logLevel => $self->loglevel, @_ );
    return $self->_tika_config_xml( @settings );
}

# Writes the Tika configuration (see tika_config) to a fresh temporary file
# and returns the name of that file.
#
# %entries is handed through to tika_config unchanged.
#
# Dies if the configuration cannot be written completely.
sub tika_config_temp_file {
    my( $self, %entries ) = @_;

    # When the filename is requested, tempfile() keeps the file around by
    # default, so every call would leave another file behind. UNLINK => 1
    # has File::Temp remove it when the program exits.
    my( $fh, $name ) = tempfile( UNLINK => 1 );
    binmode $fh;
    print {$fh} $self->tika_config(%entries)
        or die "Couldn't write Tika configuration to '$name': $!";
    # Buffered write errors only surface when the handle is closed
    close $fh
        or die "Couldn't close Tika configuration file '$name': $!";

    return $name;
}

# Returns the file with the highest Tika server version among @files.
#
# The version is read from names such as tika-server-1.24.1.jar or
# tika-server-standard-2.3.0.jar and compared numerically as
# major, minor, patch. A missing patch level counts as 0, and a name
# without a recognisable version counts as 0.0.0. Files with equal
# versions keep their input order.
#
# In list context an empty @files gives an empty list; in scalar
# context it gives undef.
sub best_jar_file {
    my( $package, @files ) = @_;

    # Parse every name once, then sort on the parsed numbers (descending)
    my @ranked =
        sort {
               $b->[1] <=> $a->[1]
            || $b->[2] <=> $a->[2]
            || $b->[3] <=> $a->[3]
        }
        map {
            my @version = /\bserver-(?:standard-|)(\d+)\.(\d+)(?:\.(\d+))?/
                ? ( $1, $2, defined $3 ? $3 : 0 )
                : ( 0, 0, 0 );
            [ $_, @version ]
        } @files;

    return ( map { $_->[0] } @ranked )[0];
}

# Assembles the command line used to launch Tika, in this order: the Java
# executable, the Java options, '-jar' with the jar file, '--config' with a
# configuration file, and finally the additional Tika arguments.
#
# Each call asks tika_config_temp_file for a configuration file name.
sub cmdline {
    my $self = shift;

    return (
        $self->java,
        @{ $self->java_args },
        '-jar'     => $self->jarfile,
        '--config' => $self->tika_config_temp_file,
        @{ $self->tika_args },
    );
}

# Runs Tika as a local process and returns everything it prints to
# standard output as one string ('' when it prints nothing).
#
# Options:
#   type     - appended to the command line after the arguments from cmdline
#   filename - appended after the type
#
# Dies if the process cannot be started. The exit status of the process is
# not checked here; it remains available in $? after the call.
sub fetch {
    my( $self, %options )= @_;
    my @cmd= $self->cmdline;
    push @cmd, $options{ type };
    push @cmd, $options{ filename };

    # readpipe() takes a single command string, not a list, and wrapping
    # each argument in double quotes does not protect file names containing
    # quotes, '$' or backticks. The list form of a pipe open passes every
    # argument to the program as-is, without involving a shell.
    open my $pipe, '-|', @cmd
        or die "Couldn't launch [@cmd]: $!";
    my $output = do { local $/; <$pipe> };
    close $pipe;

    return defined $output ? $output : '';
}

# Pulls the double-quoted fields out of $line.
#
# In list context this returns the text between each pair of double quotes.
# Fields must be non-empty and cannot contain a double quote themselves.
sub decode_csv {
    my $self = shift;
    my $line = shift;

    return $line =~ m!"([^"]+)"!g;
}

# Fetches $file with type 'meta' and returns the metadata of the result.
#
# As in the other get_* methods, ->get is called on what fetch returns.
# ->meta is then called on that result, the same way the SYNOPSIS calls it
# on the value returned by get_all.
sub get_meta {
    my( $self, $file )= @_;
    # The fetch result has to be resolved with ->get first; ->meta belongs
    # to the resolved object, not to the value fetch hands back.
    return $self->fetch( filename => $file, type => 'meta' )->get->meta;
};

# Fetches $file with type 'text' and returns what ->get on the fetch
# result yields.
sub get_text {
    my $self = shift;
    my( $file ) = @_;

    my $pending = $self->fetch( filename => $file, type => 'text' );
    return $pending->get;
}

# Fetches $file with type 'test' and returns what ->get on the fetch
# result yields.
sub get_test {
    my $self = shift;
    my( $file ) = @_;

    my $pending = $self->fetch( filename => $file, type => 'test' );
    return $pending->get;
}

# Fetches $file with type 'all' and returns what ->get on the fetch
# result yields. The SYNOPSIS reads ->meta and ->content from that value.
sub get_all {
    my $self = shift;
    my( $file ) = @_;

    my $pending = $self->fetch( filename => $file, type => 'all' );
    return $pending->get;
}

# Fetches $file with type 'language' and returns what ->get on the fetch
# result yields.
sub get_language {
    my $self = shift;
    my( $file ) = @_;

    my $pending = $self->fetch( filename => $file, type => 'language' );
    return $pending->get;
}

# ->detect_stream wants not a file but the input bytes
# sub detect_stream {
#     my( $self, $file )= @_;
#     return $self->fetch( filename => $file, type => 'all' )->get;
# };

# __PACKAGE__->meta->make_immutable;

1;

=head1 ENVIRONMENT

To specify the Tika jar file from the outside, you can set the
C<PERL_APACHE_TIKA_PATH> environment variable.

=head1 REPOSITORY

The public repository of this module is
L<https://github.com/Corion/Apache-Tika-Async>.

=head1 SUPPORT

The public support forum of this module is
L<https://perlmonks.org/>.

=head1 BUG TRACKER

Please report bugs in this module via the RT CPAN bug queue at
L<https://rt.cpan.org/Public/Dist/Display.html?Name=Apache-Tika-Async>
or via mail to L<apache-tika-async-Bugs@rt.cpan.org>.

=head1 AUTHOR

Max Maischein C<corion@cpan.org>

=head1 COPYRIGHT (c)

Copyright 2014-2019 by Max Maischein C<corion@cpan.org>.

=head1 LICENSE

This module is released under the same terms as Perl itself.

=cut


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.