Group
Extension

PLN-PT/lib/PLN/PT.pm

package PLN::PT;
# ABSTRACT: interface for the http://pln.pt web service
$PLN::PT::VERSION = '0.008';
use strict;
use warnings;

use JSON::XS;
use CHI;
use Digest::MD5 qw/md5_base64/;
use LWP::UserAgent;
use Encode;
use utf8::all;

# Constructor.
#
# Arguments:
#   $url - base endpoint of the web service; stored as-is in the object
#
# Returns a new object holding the base URL, an LWP::UserAgent instance used
# for the HTTP requests, and a CHI cache (Memory driver, global => 1) used
# to store responses.
sub new {
  my $class = shift;
  my ($url) = @_;

  my $ua    = LWP::UserAgent->new;
  my $cache = CHI->new( driver => 'Memory', global => 1 );

  return bless { url => $url, ua => $ua, cache => $cache }, $class;
}

# Tokenize a text.
#
# Arguments:
#   $text - text to send to the service
#   $opts - optional hash reference of options; they are appended to the URL
#           as a query string and also handed to _post
#
# Returns whatever _post returns for the 'tokenizer' endpoint.
sub tokenizer {
  my $self = shift;
  my ($text, $opts) = @_;

  # <base>/tokenizer?<options>
  my $endpoint = $self->_cat('tokenizer') . '?' . $self->_args($opts);

  return $self->_post($endpoint, $text, $opts);
}

# Morphological analysis of a single word.
#
# Arguments:
#   $word - the word to analyse; it becomes a path segment of the URL
#   $opts - optional hash reference of options; they are appended to the URL
#           as a query string and also handed to _get
#
# Returns whatever _get returns for the 'morph_analyzer' endpoint.
sub morph_analyzer {
  my $self = shift;
  my ($word, $opts) = @_;

  # a slash would add extra path segments to the URL, so every '/' in the
  # word is replaced by a space
  $word =~ s{/}{ }g;

  # <base>/morph_analyzer/<word>?<options>
  my $endpoint = $self->_cat('morph_analyzer', $word) . '?' . $self->_args($opts);

  return $self->_get($endpoint, $opts);
}

# Part-of-speech tagging of a text.
#
# Arguments:
#   $text - text to send to the service
#   $opts - optional hash reference of options; they are appended to the URL
#           as a query string and also handed to _post
#
# Returns whatever _post returns for the 'tagger' endpoint.
sub tagger {
  my $self = shift;
  my ($text, $opts) = @_;

  # <base>/tagger?<options>
  my $endpoint = $self->_cat('tagger') . '?' . $self->_args($opts);

  return $self->_post($endpoint, $text, $opts);
}

# Dependency parsing of a text.
#
# Arguments:
#   $text - text to send to the service
#   $opts - optional hash reference of options; they are appended to the URL
#           as a query string and also handed to _post
#
# Returns whatever _post returns for the 'dep_parser' endpoint.
sub dep_parser {
  my $self = shift;
  my ($text, $opts) = @_;

  # <base>/dep_parser?<options>
  my $endpoint = $self->_cat('dep_parser') . '?' . $self->_args($opts);

  return $self->_post($endpoint, $text, $opts);
}

# Term frequency of a text.
#
# Arguments:
#   $text - text to send to the service
#   $opts - optional hash reference of options; they are appended to the URL
#           as a query string and also handed to _post
#
# Returns whatever _post returns for the 'tf' endpoint.
sub tf {
  my $self = shift;
  my ($text, $opts) = @_;

  # <base>/tf?<options>
  my $endpoint = $self->_cat('tf') . '?' . $self->_args($opts);

  return $self->_post($endpoint, $text, $opts);
}

# Retrieve stop words.
#
# Arguments:
#   $opts - optional hash reference of options; they are appended to the URL
#           as a query string and also handed to _get
#
# Returns whatever _get returns for the 'stopwords' endpoint.
sub stopwords {
  my $self = shift;
  my ($opts) = @_;

  # <base>/stopwords?<options>
  my $endpoint = $self->_cat('stopwords') . '?' . $self->_args($opts);

  return $self->_get($endpoint, $opts);
}

# POST a text to the service and return the (possibly cached) result.
#
# Arguments:
#   $url  - full request URL, query string included
#   $text - text sent as the request body, UTF-8 encoded; undef is sent as
#           an empty body
#   $opts - optional hash reference; only the 'output' entry is read here
#
# Returns the response body as a string when $opts->{output} is 'raw',
# otherwise the data structure obtained by decoding the body as JSON.
# On an HTTP error a message is printed to STDERR and undef is returned.
#
# Responses are stored in the object's cache under a key made of the URL and
# an MD5 digest of the text, so an identical request is answered without
# contacting the service again.
sub _post {
  my ($self, $url, $text, $opts) = @_;

  $opts ||= {};
  $text = '' unless defined $text;

  # encode once: the same bytes feed both the cache key and the request body
  my $payload = Encode::encode_utf8($text);
  my $key     = $url . '-' . md5_base64($payload);
  my $data    = $self->{cache}->get($key);

  # test for a non-empty value rather than truth, so that a cached payload
  # of "0" counts as a hit
  unless (defined $data and length $data) {
    my $req = HTTP::Request->new(POST => $url);
    $req->header('Content-Type', 'text/html; charset=UTF-8');
    $req->content($payload);

    my $res = $self->{ua}->request($req);
    unless ($res->is_success) {
      print STDERR "HTTP POST error: ", $res->code, " - ", $res->message, "\n";
      return undef;
    }

    # Ask for the body with any content encoding undone but without charset
    # decoding: the UTF-8 decoding is done right below, and doing it twice
    # would corrupt non-ASCII text.
    $data = $res->decoded_content(charset => 'none');
    $data = $res->content unless defined $data and length $data;
    $data = Encode::decode_utf8($data);
    $self->{cache}->set($key, $data);
  }

  return $data if ($opts->{output} and $opts->{output} eq 'raw');
  return JSON::XS->new->decode($data);
}

# GET a resource from the service and return the (possibly cached) result.
#
# Arguments:
#   $url  - full request URL, query string included
#   $opts - optional hash reference; its values take part in the cache key
#           and its 'output' entry selects the return format
#
# Returns the response body as a string when $opts->{output} is 'raw',
# otherwise the data structure obtained by decoding the body as JSON.
# On an HTTP error a message is printed to STDERR and undef is returned.
#
# Responses are stored in the object's cache under a key made of the URL and
# an MD5 digest of the option values.
sub _get {
  my ($self, $url, $opts) = @_;

  $opts ||= {};

  # The digest must not depend on hash ordering, and md5_base64() croaks on
  # characters above 0xFF, so the values are taken in key order and encoded
  # as UTF-8 before hashing.
  my $signature = join('',
    map { defined $opts->{$_} ? $opts->{$_} : '' } sort keys %$opts);
  my $key  = $url . '-' . md5_base64(Encode::encode_utf8($signature));
  my $data = $self->{cache}->get($key);

  # test for a non-empty value rather than truth, so that a cached payload
  # of "0" counts as a hit
  unless (defined $data and length $data) {
    my $req = HTTP::Request->new(GET => $url);

    my $res = $self->{ua}->request($req);
    unless ($res->is_success) {
      print STDERR "HTTP GET error: ", $res->code, " - ", $res->message, "\n";
      return undef;
    }

    # Ask for the body with any content encoding undone but without charset
    # decoding: the UTF-8 decoding is done right below, and doing it twice
    # would corrupt non-ASCII text.
    $data = $res->decoded_content(charset => 'none');
    $data = $res->content unless defined $data and length $data;
    $data = Encode::decode_utf8($data);
    $self->{cache}->set($key, $data);
  }

  return $data if ($opts->{output} and $opts->{output} eq 'raw');
  return JSON::XS->new->decode($data);
}

# Build a URL by joining the base URL stored in the object and the given
# path segments with '/'.
#
# Arguments:
#   @args - path segments, appended in the order given
#
# Returns the joined string; with no segments it is the base URL alone.
sub _cat {
  my $self = shift;

  return join('/', $self->{url}, @_);
}

# Build the query string for a request from an options hash.
#
# Arguments:
#   $opts - hash reference of option name => value pairs (may be undef)
#
# Returns the pairs as "name=value" joined with '&', or the empty string
# when there are no options. Names and values are inserted verbatim (no
# percent-encoding is applied here).
#
# Names are emitted in sorted order so that the same options always produce
# the same string: the resulting URL is part of the cache key built in _post
# and _get, and hash order is not guaranteed to be stable.
sub _args {
  my ($self, $opts) = @_;

  $opts ||= {};

  my @pairs;
  foreach my $name (sort keys %$opts) {
    # an undefined value is sent as an empty one ("name=") without raising
    # an "uninitialized value" warning
    my $value = defined $opts->{$name} ? $opts->{$name} : '';
    push @pairs, join('=', $name, $value);
  }

  return join('&', @pairs);
}

1;

__END__

=pod

=encoding UTF-8

=head1 NAME

PLN::PT - interface for the http://pln.pt web service

=head1 VERSION

version 0.008

=head1 SYNOPSIS

    # using as a lib
    my $pln = PLN::PT->new('http://api.pln.pt');
    my $data = $pln->tagger($txt);  # [['A','o','DA0FS0','0.675415'], ...

    # using the pln-pt tool from the command line
    $ echo "A Maria tem razão . " | pln-pt tagger
    A o DA0FS0 0.675415
    Maria maria NCFS000 1
    tem ter VMIP3S0 0.999287
    razão razão NCFS000 0.65
    . . Fp 1

=head1 DESCRIPTION

This module implements an interface for the Natural Language Processing
(NLP) web service provided by L<http://pln.pt>.

=head1 METHODS

=head2 new

Create new object, given as argument the base endpoint for the web service.

Once the object is created, a set of methods described below can be used to
access several operations in the API. All the methods return a data
structure with the corresponding result, typically a list of tokens with
some extra information depending on the operation used.

=head2 tokenizer

Tokenize the text given as argument, i.e. split the text into tokens (words
by default). For more information on the tokenization operation
visit L<http://pln.pt/api>.

    $ echo "A Maria tem razão ." | pln-pt tokenizer
    A
    Maria
    tem
    razão
    .

=head2 morph_analyzer

Morphological analysis for a single word, given as argument. For more
information on the morphological analysis operation visit L<http://pln.pt/api>.

    $ echo cavalo | pln-pt morph
    cavalo cavalo NCMS000 0.914011 cavalar VMIP1S0 0.0859891

=head2 tagger

Part-of-speech tagging of the tokens in the text given as argument. For more
information on the tagging operation visit L<http://pln.pt/api>.

    $ echo "A Maria tem razão ." | pln-pt tagger
    A o DA0FS0 0.675415
    Maria maria NCFS000 1
    tem ter VMIP3S0 0.999287
    razão razão NCFS000 0.65
    . . Fp 1

=head2 dep_parser

Build a dependency tree for the text given as argument. For more information
on the dependency tree visit L<http://pln.pt/api>.

    $ echo "A Maria tem razão ." | pln-pt dep_parser
    1	A	_	DET	art|<artd>|F|S	(...)	2	det	_	_
    2	Maria	_	PROPN	prop|F|S	(...)	3	nsubj	_	_
    3	tem	_	VERB	v-fin|PR|3S|IND	(...)	0	ROOT	_	_
    4	razão	_	NOUN	n|F|S	(...)	3	dobj	_	_
    5	.	_	PUNCT	punc	(...)	3	punct	_

=head2 tf

Compute term frequency given a text.

=head2 stopwords

Retrieve lists of stop words.

=head1 ACKNOWLEDGEMENTS

This work is partially supported by the project “SmartEGOV: Harnessing EGOV for Smart Governance (Foundations, methods, Tools) / NORTE-01-0145-FEDER-000037”,
supported by Norte Portugal Regional Operational Programme (NORTE 2020),
under the PORTUGAL 2020 Partnership Agreement, through the European Regional
Development Fund (ERDF).

=head1 AUTHORS

=over 4

=item *

Nuno Carvalho <smash@cpan.org>

=item *

Alberto Simões <ambs@cpan.org>

=back

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2016 - 2017 by Nuno Carvalho.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.