Group
Extension

PMLTQ-Commands/lib/PMLTQ/Command/query.pm

package PMLTQ::Command::query;
our $AUTHORITY = 'cpan:MATY';
$PMLTQ::Command::query::VERSION = '2.0.3';
# ABSTRACT: WIP: Executes query on treebank

use PMLTQ::Base 'PMLTQ::Command';
use PMLTQ;
use Cwd;
use File::Spec;

use Treex::PML;
use Treex::PML::Instance;
use Treex::PML::Schema;
use Getopt::Long qw(GetOptionsFromArray);
use PMLTQ::Common ':tredmacro';
use HTTP::Request::Common;
use LWP::UserAgent;
use File::Temp;
use Encode;
use Pod::Usage 'pod2usage';
use JSON;

my $extension_dir;
my %opts;

has usage => sub { shift->extract_usage };

sub run {
  my $self = shift;
  my @args = @_;
  GetOptionsFromArray(\@args, \%opts,
  'debug|D',
  'server|s=s',
  'command|c=s',

  'ntred|N',
  'jtred|J',
  'btred|B',
  'sql|S',
  'shared-dir|d=s',
  'keep-tmp-files',
  'filelist|l=s',

  'username=s',
  'password=s',
  'auth-id=s',

  'pmltq-extension-dir|X=s',

  'stdin',
  'query|Q=s',
  'query-id|i=s',
  'query-file|f=s',
  'query-pml-file|p=s',
  'filters|F=s',
  'no-filters',

  'old-api',
  'output-json',

  'netgraph-query|G=s',

  'print-servers|P',
  'config-file|c=s',

  'node-types|n',
  'relations|r',

  'limit|L=i',
  'timeout|t=i',
  'history|H',

  'quiet|q',
  'help|h=s@',
  'usage|u',
  'version|V',
  'man' ) || die "invalid options";
  Treex::PML::AddResourcePath(
       PMLTQ->resources_dir,
       File::Spec->catfile(${FindBin::RealBin},'config'),
       $ENV{HOME}.'/.tred.d'
      );
  Treex::PML::AddBackends(qw(Storable PMLBackend PMLTransformBackend));

  if ($opts{stdin}) {
    local $/;
    $opts{query} = <STDIN>;
  }

  $opts{$1}=1 if defined($opts{server}) and $opts{server}=~s{^[nbj]tred://}{};
  $extension_dir =
    $opts{'pmltq-extension-dir'} ||
    File::Spec->catfile($ENV{HOME},'.tred.d','extensions', 'pmltq');
  Treex::PML::AddResourcePath(File::Spec->catfile($extension_dir,'resources'));
  if ($opts{ntred}) {
    ntred_search();
  } elsif ($opts{jtred}) {
    jtred_search();
  } elsif ($opts{btred}) {
    btred_search(@args);
  } else {
    $self->pmltq_http_search();
  }
}

my %auth;
sub pmltq_http_search {
  my $self = shift;
  my @args = @_;
  my $query;
  if ($opts{query} and !@args) {
    $query = $opts{query};
  } elsif (!$query and @args) {
    $query=join ' ',@args;
  } elsif ($opts{'query-pml-file'}) {
    my $query_file = Treex::PML::Factory->createDocumentFromFile($opts{'query-pml-file'});
    die "Failed to open PML query file $opts{'query-pml-file'}: $Treex::PML::FSError\n" if $Treex::PML::FSError or !$query_file or !$query_file->trees;
    $query = first {
      !$opts{'query-id'} or $_->{id} and $_->{id} eq $opts{'query-id'}
    } $query_file->trees;
    die "Didn't find query $opts{'query-id'} in query file $opts{'query-pml-file'}!" unless $query;
    $query = encode('UTF-8',PMLTQ::Common::as_text($query));
  } elsif ($opts{'query-file'}) {
    local $/;
    open my $fh, $opts{'query-file'}
      or die "Cannot open query file '$opts{'query-file'}': $!";
    $query = <$fh>;
    if ($opts{'query-id'}) {
      $query=~s/#\s*==\s*query:\s*\Q$opts{'query-id'}\E\s* ==(.*?)(?:#\s*==\s*query:\s*\w+\s*==.*|$)/$1/s;
    }
  } elsif (!$opts{'node-types'} and !$opts{'relations'} and !$opts{'print-servers'}) {
    pod2usage(-msg => 'pmltq');
    exit 1;
  }
  if (!$opts{'query-pml-file'} and $opts{'netgraph-query'}) {
    require PMLTQ::NG2PMLTQ;
    $query = PMLTQ::NG2PMLTQ::ng2pmltq($query,{type=>$opts{'netgraph-query'}});
  }

  if (!$opts{'node-types'} and !$opts{'relations'} and !$opts{'print-servers'}) {
    die "Query is empty!" unless $query;

    my $filters = $opts{'filters'};
    if ($filters and $filters=~/\S/) {
      $filters='>> '.$filters unless $filters =~ /^\s*>>/;
      $query .= $filters;
    }
  }

  $opts{'config-file'} ||= Treex::PML::FindInResources('treebase.conf');
  if ($opts{debug}) {
    print STDERR "Reading configuration from $opts{'config-file'}\n";
  }
  my $configs = (-f $opts{'config-file'}) ?
      Treex::PML::Factory->createPMLInstance({ filename=>$opts{'config-file'} })->get_root->{configurations}
  : undef;

  my $id = $opts{'server'};
  $id ||= 'default' unless $opts{'print-servers'};
  my ($conf,$type) = $id ? get_server_conf($configs,$id, $opts{'old-api'}) : ();
  %auth = (
    username => $opts{username},
    password => $opts{password},
   );
  if ($opts{'auth-id'}) {
    my ($auth) = get_server_conf($configs,$opts{'auth-id'}, $opts{'old-api'});
    if ($auth) {
      $auth{$_} ||= $auth->{$_} for qw(username password);
    } else {
      die "Didn't find auth-id configuration: $opts{'auth-id'}\n";
    }
  }
  if ($conf) {
    $auth{$_} ||= $conf->{$_} for qw(username password baseurl);
  }


  if ($opts{'print-servers'}) {
    if ($opts{'server'}) {
      unless ($type eq 'http') {
  die "Cannot query available services on a $type server";
      }
      my $result='';
      $self->http_search($conf->{url},$query,{ other=>1,
          callback => sub { $result.=$_[0] },
          debug=>$opts{debug},
          %auth,
          'baseurl' => $conf->{baseurl}
               });
      my @services = split /\n/,$result;
      for my $srv (@services) {
  my %srv = map { split(':',$_,2) } split /\t/, $srv;
  print $srv{id},"\t",$srv{service},"\t",$srv{title},"\n";
      }
      exit;
    }
    my @types = qw(dbi http);
    my %columns = (
      dbi => [qw(driver host port database username sources)],
      http => [qw(url username cached_description/title)],
     );
    my %configs = (
      map { my $type = $_; ($_ =>[map $_->value, grep { $_->name eq $type } SeqV($configs)]) }
  @types
       );
    for my $type (@types) {
      my $confs = $configs{$type};
      if (@$confs) {
  print uc($type)." configurations:\n";
  print (("-"x60)."\n");
  no warnings;
  for my $c (@$confs) {
    print $c->{id}.": ".(join(", ", map "$_->[0]=$_->[1]",
            grep length($_->[1]),
            map [m{/(.*)} ? $1 : $_,Treex::PML::Instance::get_data($c,$_)], @{$columns{$type}})."\n");
  }
      }
      print "\n";
    }
    exit;
  }



  print STDERR $query,"\n" if $opts{debug};



  if ($type eq 'http') {
    #if($opts{'old-api'}){
    $self->http_search($conf->{url},$query,{ 'node-types'=>$opts{'node-types'},
              'relations'=>$opts{'relations'},
              debug=>$opts{debug},
              %auth,
              'old-api'=>$opts{'old-api'},
              'output-json'=>$opts{'output-json'},
              'baseurl' => $conf->{baseurl}
             });
    #} else { ## NEW API
    #  print STDERR "TODO NEW API\n";
    #}
  } else {
    require PMLTQ::SQLEvaluator;
    my $evaluator = PMLTQ::SQLEvaluator->new(undef,{connect => $conf, debug=>$opts{debug},
               %auth
              });
    $evaluator->connect();
    if ($opts{'node-types'}) {
      print join "\n", @{$evaluator->get_node_types};
    } elsif ($opts{'relations'}) {
      print join "\n", @{$evaluator->get_specific_relations};
    } else {
      search($evaluator,$query);
    }
    $evaluator->{dbi}->disconnect() if $evaluator->{dbi};
  }
}

sub get_server_conf {
  my ($configs,$id, $oldapi)=@_;
  my ($conf,$type);
  if ($id =~ /^https?:/) {
    $type = 'http';
    $conf = {url => $id};
    unless($oldapi) {
      $conf->{baseurl} = $id;
      $conf->{baseurl} =~ s@api/treebanks.*?$@@;
    }
  } else {
    my $conf_el = first { $_->value->{id} eq $id }  SeqV($configs);
    die "Didn't find server configuration named '$id'!\nUse $0 --print-servers and then $0 --server <config-id|URL>\n"
      unless $conf_el;
    $conf = $conf_el->value;
    unless($oldapi) {
      $conf->{baseurl} = $conf->{url};
      $conf->{url} = URI::WithBase->new('/',$conf->{url});
      $conf->{url}->path_segments('api', 'treebanks', $conf->{treebank});
      $conf->{url} = $conf->{url}->abs->as_string;
    }
    $type = $conf_el->name;
  }
  return ($conf,$type);
}

sub http_search {
  my ($self,$url,$query,$opts)=@_;
  $opts||={};
  my $tmp = File::Temp->new( TEMPLATE => 'pmltq_XXXXX',
           TMPDIR => 1,
           UNLINK => 1,
           SUFFIX => '.txt' );
  my $ua;
  if($opts->{'old-api'}) {
    $ua = LWP::UserAgent->new;
    $ua->credentials(URI->new($url)->host_port,'PMLTQ',
         $auth{username}, $auth{password})
      if $opts->{username};
  } else {
    $ua = $self->ua;
    $ua->agent("PMLTQ/1.0 ");
    $self->login($ua,\%auth) if $opts->{username};
  }
  $url.='/' unless $url=~m{^https?://.+/$};
  my $METHOD = \&POST;
  if ($opts->{'node-types'}) {
    $url = $opts->{'old-api'} ? qq{${url}nodetypes} : qq{${url}node-types};
    $METHOD = \&GET unless $opts->{'old-api'};
    $query = '';
  } elsif ($opts->{'relations'}) {
    $url = qq{${url}relations};
    $METHOD = \&GET unless $opts->{'old-api'};
    $query = '';
  } elsif ($opts->{'other'}) {
    $url = qq{${url}other};
     die "Unknown option --other in new api\n" unless $opts->{'old-api'};
    $query = '';
  } else {
    $url = qq{${url}query};
  }
  $ua->timeout($opts{timeout}+2) if $opts{timeout};
  my $q = $query; Encode::_utf8_off($q);
  binmode STDOUT;
  my $sub = $opts->{callback} || sub { print $opts{'output-json'} ? ($_[0]) : (map {join("\t",@$_)."\n"} @{JSON::from_json($_[0])->{results}}) };
  my $res = $ua->request($METHOD->($url, 
    $opts->{'old-api'} ?
      ([
        query => $q,
        format => 'text',
        limit => $opts{limit},
        row_limit => $opts{limit},
        timeout => $opts{timeout},
       ])
       :
       (
        Content_Type => 'application/json;charset=UTF-8',
        User_Agent => 'PML-TQ CLI',
        Content => JSON->new->utf8->encode({
          query => $q,
          limit => $opts{limit},
          # row_limit => $opts{limit}, #TODO: currently not working
          timeout => $opts{timeout},
          nohistory => !!$opts{history}
        })
       )
     ),$sub ,1024*8 );
  unless ($res->is_success) {
    die $res->status_line."\n".$res->content."\n";
  }
}



sub search {
  my ($evaluator,$query)=@_;
  my $results;
  eval {
    $evaluator->prepare_query($query); # count=>1
    $results = $evaluator->run({
      node_limit => $opts{limit},
      row_limit => $opts{limit},
      timeout => $opts{timeout},
      timeout_callback => sub {
  print STDERR "Evaluation of query timed out\n";
  exit 2;
      },
    });
  };
  warn $@ if $@;
  if ($results) {
    for my $r (@$results) {
      print join("\t",@$r)."\n";
    }
    print STDERR $#$results+1," result(s)\n" unless $opts{quiet};
  }
}

sub quote_cmdline {
  my $quoted;
  join ' ', map {
    my $arg = $_;
    $arg =~ s{'}{'\\''}g;
    qq{'$arg'}
  } @_;
}

sub ntred_search {
  my @args = @_;
  my ($host,$port)= $opts{server} ? split(/:/,$opts{server}) : ();
  my $command = $opts{command} || 'ntred';

  my $shared_dir=File::Spec->rel2abs($opts{'shared-dir'} || '.');
  my $filter_file="$shared_dir/pmltq_ntred_filter.$$.pl";

  my @script_flags=('--filter-code-out', $filter_file);
  foreach (qw(query query-id query-file query-pml-file filters netgraph-query)) {
    push @script_flags, '--'.$_, (/file/ ? File::Spec->rel2abs($opts{$_}) : $opts{$_})
      if defined($opts{$_}) and length($opts{$_});
  }

  $command .= ' '.quote_cmdline(
    ((defined($host) and length($host)) ? ('--hub',$host) : ()),
    ((defined($port) and length($port)) ? ('--port',$port) : ()),
    '-q',
    '-I', File::Spec->catfile($extension_dir,qw(contrib pmltq pmltq.ntred)),
    ($opts{filelist} ? ('-l', File::Spec->rel2abs($opts{filelist})) : (@args ? ('-L', '--', @args) : ())),
    '--', @script_flags
  );
  open(my $pipe, $command.' | ') || die "Failed to start ntred client: $!";
  apply_filter($pipe, $filter_file);
  close($pipe);
  unlink $filter_file if -f $filter_file and !$opts{'keep-tmp-files'};
}

sub jtred_search {
  my @args = @_;
  my $command = $opts{command} || 'jtred';

  my $jobname="pmltq_jtred_$$";
  if ($opts{server}) {
    $jobname.="-".$ENV{HOSTNAME};
  }

  my $shared_dir=File::Spec->rel2abs($opts{'shared-dir'} || '.');
  my $filter_file="$shared_dir/$jobname.pl";
  my $filelist;
  if ($opts{filelist}) {
    my ($vol,$dir) = File::Spec->splitpath($opts{filelist});
    my $base = File::Spec->catpath($vol,$dir);
    open my $fh, '<', $opts{filelist} or die "Cannot open filelist $opts{filelist}: $!";
    $filelist = "$shared_dir/$jobname.fl";
    open my $out_fh, '>', $filelist or die "Cannot create temporary filelist $filelist: $!";
    print STDERR "Resolving filelist files to $base...\n" unless $opts{quiet};
    while(<$fh>) {
      chomp;
      print $out_fh File::Spec->rel2abs($_,$base),"\n";
    }
    print STDERR "done.\n" unless $opts{quiet};
    close $fh;
    close $out_fh;
  }
  my @script_flags=('--filter-code-out', $filter_file);
  foreach (qw(query query-id query-file query-pml-file filters netgraph-query)) {
    push @script_flags, '--'.$_, (/file/ ? File::Spec->rel2abs($opts{$_}) : $opts{$_})
      if defined($opts{$_}) and length($opts{$_});
  }
  my @command = (
    $command,
    ($opts{'shared-dir'} ? ('-jw', $shared_dir) : ()),
    '-jn', $jobname,
    ($opts{quiet} ? '-jq' : ()),
    ($filelist ? ('-l', $filelist) : @args),
    '-jb',
    '-q',
    '-I', File::Spec->catfile($extension_dir,qw(contrib pmltq pmltq.ntred)),
    '-o',  @script_flags, '--'
  );
  my $pipe;
  if ($opts{server}) {
    my $cwd = quote_cmdline(getcwd());
    open($pipe, '-|', 'ssh', $opts{server}, <<"SCRIPT".quote_cmdline(@command))
if [ -f ~/.bash_profile ]; then
   . ~/.bash_profile 2>/dev/null 1>&2
elif [ -f ~/.profile ]; then
   . ~/.profile 2>/dev/null 1>&2
fi
cd $cwd;
SCRIPT
      || die "Failed to start jtred on host $opts{server} over ssh: $!"
  } else {
    open($pipe, '-|',@command)
      || die "Failed to start jtred: $!";
  }
  apply_filter($pipe, $filter_file);
  close($pipe);
  unlink $filter_file if -f $filter_file and !$opts{'keep-tmp-files'};
  unlink $filelist if $filelist and !$opts{'keep-tmp-files'};
}

sub btred_search {
  my @args = @_;
  my $command = $opts{command} || 'btred';

  my $jobname="pmltq_btred_$$";
  if ($opts{server}) {
    $jobname.="-".$ENV{HOSTNAME};
  }

  my $shared_dir=File::Spec->rel2abs($opts{'shared-dir'} || '.');
  my $filter_file="$shared_dir/$jobname.pl";

  my @script_flags=('--filter-code-out', $filter_file);
  foreach (qw(query query-id query-file query-pml-file filters netgraph-query)) {
    push @script_flags, '--'.$_, (/file/ ? File::Spec->rel2abs($opts{$_}) : $opts{$_})
      if defined($opts{$_}) and length($opts{$_});
  }
  for (qw(node-types relations)) {
    if ($opts{$_}) {
      push @script_flags, '--info', $_;
      last;
    }
  }

  $command .= ' '.quote_cmdline(
    ($opts{quiet} ? '-Q' : '-q'),
    '-I', File::Spec->catfile($extension_dir,qw(contrib pmltq pmltq.ntred)),
    '-o', '--apply-filters', @script_flags, '--',
    ($opts{filelist} ? ('-l', $opts{filelist}) : @args),
  );
  if ($opts{server}) {
    my $cwd = quote_cmdline(getcwd());
    system('ssh', $opts{server}, <<"SCRIPT");
if [ -f ~/.bash_profile ]; then
   . ~/.bash_profile
elif [ -f ~/.profile ]; then
   . ~/.profile;
fi
cd $cwd
$command
SCRIPT
  } else {
    print STDERR "$command\n";
    system($command);
  }
  unlink $filter_file if -f $filter_file and !$opts{'keep-tmp-files'};
}


sub round {
  my ($value, $precision) = @_;
  my $rounding = ($value >= 0 ? 0.5 : -0.5);
  my $decimalscale = 10**int($precision || 0);
  my $scaledvalue = int($value * $decimalscale + $rounding);
  return $scaledvalue / $decimalscale;
}

sub trunc {
  my ($self, $num, $digits) = @_;
  $digits = int $digits;
  my $decimalscale = 10**abs($digits);
  if ($digits >= 0) {
    return int($num * $decimalscale) / $decimalscale;
  } else {
    return int($num / $decimalscale) * $decimalscale;
  }
}

sub apply_filter {
  my ($input, $filter_file)=@_;
  my $filters;
  my $filter;
  my $first = 1;
  use POSIX qw(ceil floor);

  if ($opts{'no-filters'}) {
    print while (<$input>);
    return;
  }

  my $output_filter = {
    init => sub { },
    process_row => sub {
      my ($self,$row)=@_;
      print(join("\t",@$row)."\n");
    },
    finish => sub { }
   };

  while (<$input>) {
    chomp;
    unless ($filter) {
      if (-f $filter_file and -s $filter_file) {
  open my $fh, "<", $filter_file or
    die "Cannot open $filter_file: $!";
  my $filter_code;
  {
    local $/;
    $filter_code = <$fh>;
  }
  eval "use utf8;\n".$filter_code;
  if ($@) {
    print STDERR $filter_code;
    print STDERR "\n";
    die "Running filter $filter_file failed!";
  }
  my @filters = map {
    my @local_filters = map eval, @{$_->{local_filters_code}};
    my $sub = eval($_->{code});
    die $@ if $@;
    $sub
  } @$filters;

  # connect filters
  my $prev;
  for my $filter (@filters) {
    $prev->{output}=$filter if $prev;
    $prev = $filter;
  }
  if ($prev) {
    $prev->{output} = $output_filter;
  }
  $filter = $filters[0] || die "First filter is empty!";
  $filter->{init}->($filter);
      } else {
  $filter = $output_filter;
      }
    }
    $filter->{process_row}->($filter,[split /\t/,$_]);
  }
  $filter->{finish}->($filter) if $filter;
}

=head1 SYNOPSIS

  pmltq query [--server <URL_or_server_ID> ] [ <options> ] [ --stdin | --query-file <filename> | --query <query> | <query> ]
  pmltq query --btred [ <options> ] [ --stdin | --query-file <filename> | --query <query> ] [ -l <filelist> |  <file(s)> ]
  pmltq query --ntred [ <options> ] [ --stdin | --query-file <filename> | --query <query> ] [ -l <filelist> |  <file(s)> ]
  pmltq query --jtred [ <options> ] [ --stdin | --query-file <filename> | --query <query> ] [ -l <filelist> |  <file(s)> ]

or

  pmltq query [options] [ --print-servers|-P | --node-types | --relations ]


=head1 DESCRIPTION

Run the query.

=head1 OPTIONS

=over 5

=item B<--sql|-S>

Use SQL-based query engine (default).

=item B<--btred|-B>

Query given files or filelist using btred.

=item B<--ntred|-N>

Query given files or filelist using ntred (ntred servers
must be already up and running).

=item B<--jtred|-J>

Run query query over given files/filelist using jtred (multiple btred
instances distributed over an SGE cluster).

=item B<--server|-s> URL_or_ID

If used with SQL-based engine, this option can be used to specify a
URL (http://hostname/APIpath/treebanks/treebankID) to a pmltq http server, or an ID of a
pre-configured SQL or HTTP server (use B<--print-servers> to get a
list).

If used with btred or jtred, it can be used to specify a server to run
btred/jtred on using SSH.

If used with ntred, it can be used to specify a hostname and port
(hostname:port) for the ntred hub.

=item B<--stdin>

Read query from the standard input.

=item B<--query|-Q> string

Specify PML-TQ query on the command-line.

=item B<--query-file> filename

Read PML-TQ query from a given (utf-8 encoded text) file

=item B<--query-pml-file> filename

Read PML-TQ query from a given PML file

=item B<--query-id> ID

Use query with a given ID. If the input is a text file, it can contain more than one
query. In that case, each query must start with a line of the following form:

  # == query: ID ==

where ID is a unique identifier of the query. This option can be used
to select a single query from the input.

If the input is a PML file, then the ID is just the id of the query tree.

=item B<--filelist|-l> filename

This flag can be used with B<--btred>, B<--ntred>, or B<--jtred> to
spedify a file containing a list of files to search, each on a
separate line.

Note that for B<--ntred>, the files must be already loaded on the
B<ntred> servers and this flag simply allows you to specify a subcorpus.

=item B<--auth-id> URL_or_ID

Use username/password stored in the configuration for a given service
(spcified by URL or config-file ID) on the serice specified using --server.

=item B<--username> username

Username for a HTTP or SQL PML-TQ service.

=item B<--password> password

Password for a HTTP or SQL PML-TQ service.

=item B<--limit|-L> number

Only applicable to SQL-based engine.
Specify maximum number of results (i.e. rows printed by pmltq).

=item B<--history|-H>

Sets whether should be query logged to users query history on server.

=item B<--timeout|-t> seconds

Only applicable to SQL-based engine.
Specify a timeout for the query. If the query evaluation takes longer
than a given number of B<seconds>, pmltq terminates the connection
with the server and returns with a message "Evaluation of query timed
out" and exit code 2.

=item B<--config-file|-c> filename

Specify a configuration file. The configuration file is a XML file (in
fact, a PML instance conforming to the treebase_conf_schema.xml) that
lists available SQL engine configurations. If this option is not
provided, B<pmltq> attempts to find a file named treebase.conf in the
resource paths (namely in ~/.tred.d).

=item B<--node-types>

List available node types and exit.

=item B<--netgraph-query|-N> type_name

Assume the query is in NetGraph syntax and translate it to PMLTQ,
using a given node type as the default type.

=item B<--debug|-D>

Print some extended information (e.g. evaluation benchmarks).

=back

=cut

1;


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.