Yet Another CPAN Grep

Search-OpenSearch-Engine-Lucy/lib/Search/OpenSearch/Engine/Lucy.pm

package Search::OpenSearch::Engine::Lucy;
use Moose;
use Carp;
extends 'Search::OpenSearch::Engine';
use Types::Standard qw( Bool Str );
use Dezi::Lucy::Indexer;
use Dezi::Lucy::Searcher;
use Dezi::Indexer::Doc;
use Lucy::Object::BitVector;
use Lucy::Search::Collector::BitCollector;
use Data::Dump qw( dump );
use Scalar::Util qw( blessed );
use Class::Load;
use Path::Class::Dir;
use Search::Tools;
use Try::Tiny;

our $VERSION = '0.400';

# Name of the aggregator class. BUILD loads it, and
# _massage_rest_req_into_doc() instantiates it to filter incoming docs.
has 'aggregator_class' =>
    ( is => 'rw', isa => Str, default => sub {'Dezi::Aggregator'} );
# When true (the default) PUT, POST and DELETE each finish their own
# indexer; when false, writes stay pending until COMMIT or ROLLBACK.
has 'auto_commit' => ( is => 'rw', isa => Bool, default => sub {1} );

# Engine type identifier: always the string 'Lucy'.
sub type {
    return 'Lucy';
}

# Construction hook: loads the class named by aggregator_class so it can
# be instantiated later, then returns the engine itself.
sub BUILD {
    my ($self) = @_;
    my $aggregator_class = $self->aggregator_class;
    Class::Load::load_class($aggregator_class);
    return $self;
}

# Builds the Dezi::Lucy::Searcher for the configured index list.
# Entries from searcher_config are passed last, after invindex and debug.
# If the engine has no fields set yet, they are taken from the searcher's
# get_propnames(). Dies (confess) when index is not defined.
sub init_searcher {
    my ($self) = @_;

    my $index = $self->index;
    confess "index not defined" unless $index;

    my @searcher_args = (
        invindex => [ @{$index} ],  # copy so that suggester can use strings
        debug    => $self->debug,
        %{ $self->searcher_config },
    );
    my $searcher = Dezi::Lucy::Searcher->new(@searcher_args);

    $self->fields( $searcher->get_propnames ) unless $self->fields;

    return $searcher;
}

# Builds a LucyX::Suggester over the engine's indexes.
# suggester_config supplies the constructor arguments; its optional
# spellcheck_config entry is removed and used to build the spellchecker.
# Returns nothing when LucyX::Suggester or the spellchecker cannot be set up.
sub init_suggester {
    my ($self) = @_;

    my %suggester_args = %{ $self->suggester_config };
    my $spell_args     = delete $suggester_args{spellcheck_config} || {};
    $spell_args->{query_parser}
        = Search::Tools->parser( %{ $self->parser_config } );

    # Text::Aspell is optional, so verify we have it
    # before claiming to have a Suggester.
    my $available = try {
        require LucyX::Suggester;
        $suggester_args{spellcheck}
            = Search::Tools->spellcheck( %{$spell_args} );

        # test hook: pretend the spellchecker is unavailable
        die "testing missing spellcheck" if $ENV{TEST_SPELLCHECK_MISSING};
        1;
    }
    catch {
        $self->logger->log("Failed to load LucyX::Suggester: $_")
            if $self->debug and $self->logger;
        0;
    };
    return unless $available;

    my $suggester = LucyX::Suggester->new(
        indexes => $self->index,
        debug   => $self->debug,
        %suggester_args,
    );
    return $suggester;
}

# Counts facet values across the documents matching a query.
#
# Params:
#   $query   - query, stringified and parsed by the searcher's query parser
#   $results - required, but not otherwise used here
# Returns: hash ref of facet name => array ref of { term, count } hashes,
#          or nothing when the engine has no facets configured.
# Dies (confess) when $query is undefined or $results is false.
sub build_facets {
    my ( $self, $query, $results ) = @_;
    confess "query required"   unless defined $query;
    confess "results required" unless $results;

    $self->logger->log( "build_facets check for self->facets="
            . ( $self->facets || 'undef' ) )
        if $self->debug and $self->logger;

    my $facet_conf = $self->facets;
    return unless $facet_conf;

    my @names = @{ $facet_conf->names };
    my $limit = $facet_conf->sample_size || 0;
    $self->logger->log( "building facets for "
            . dump( \@names )
            . " with sample_size=$limit" )
        if $self->debug and $self->logger;

    # collect every matching doc id into a bit vector
    my $dezi_searcher = $self->searcher;
    my $lucy          = $dezi_searcher->{lucy};
    my $parser        = $dezi_searcher->{qp};
    my $hit_bits
        = Lucy::Object::BitVector->new( capacity => $lucy->doc_max + 1 );
    my $collector = Lucy::Search::Collector::BitCollector->new(
        bit_vector => $hit_bits );
    $lucy->collect(
        query     => $parser->parse("$query")->as_lucy_query(),
        collector => $collector,
    );

    # find the facets
    my %tally;
    my $sampled = 0;
    my $loops   = 0;
    my $doc_id  = 0;
HIT: while (1) {
        $loops++;
        $doc_id = $hit_bits->next_hit( $doc_id + 1 );
        last HIT if $doc_id == -1;

        # a sample_size of 0 means no cap on the docs inspected
        last HIT if $limit and ++$sampled > $limit;

        my $doc = $lucy->fetch_doc($doc_id);
        for my $name (@names) {
            my $raw = defined $doc->{$name} ? $doc->{$name} : '';

            # unique-ify: a value counts once per doc however often it repeats
            my %seen;
            for my $value ( split( m/\003/, $raw ) ) {
                next if $seen{$value}++;
                $tally{$name}->{$value}++;
            }
        }
    }

    $self->logger->log(
        "got " . scalar( keys %tally ) . " facets in $loops loops" )
        if $self->debug and $self->logger;

    # turn the struct inside out a bit, esp for XML
    my %facet_struct;
    for my $name ( keys %tally ) {
        my $counts = $tally{$name};
        for my $term ( keys %{$counts} ) {
            push @{ $facet_struct{$name} },
                { term => $term, count => $counts->{$term} };
        }
    }
    return \%facet_struct;
}

# This engine implements the REST methods defined in this file; always 1.
sub has_rest_api {
    return 1;
}

# Returns the list (not an array ref) of supported method names.
# COMMIT and ROLLBACK are only offered when auto_commit is off.
sub get_allowed_http_methods {
    my ($self) = @_;
    return qw( GET POST PUT DELETE COMMIT ROLLBACK )
        unless $self->auto_commit;
    return qw( GET POST PUT DELETE );
}

# Turns a REST request into a filtered Dezi::Indexer::Doc.
#
# $req is either an unblessed hash ref of Doc constructor arguments, or an
# object that should act like a HTTP::Request (uri, content,
# content_length and content_type are read from it).
# The new doc is run through the aggregator's swish_filter() and returned.
sub _massage_rest_req_into_doc {
    my $self = shift;
    my $req  = shift;

    my $doc;
    if ( blessed($req) ) {

        # $req should act like a HTTP::Request object.
        my %from_request = (
            version => 3,
            url     => $req->uri->path,        # TODO test
            content => $req->content,
            size    => $req->content_length,
            type    => $req->content_type,
        );
        $doc = Dezi::Indexer::Doc->new(%from_request);
    }
    else {

        # plain hash ref: its keys are passed through as-is
        $doc = Dezi::Indexer::Doc->new( version => 3, %{$req} );
    }

    # use set_parser_from_type so that SWISH::3 does the Right Thing
    # instead of looking at the original mime-type.
    my $filter = $self->aggregator_class->new( set_parser_from_type => 1 );
    $filter->swish_filter($doc);

    return $doc;
}

# Builds a Dezi::Lucy::Indexer for one entry of index().
#
# Params:
#   $idx - optional zero-based position in the index() array; defaults to 0.
# Returns: a new Dezi::Lucy::Indexer built from that invindex, the engine's
#          debug flag and indexer_config.
# Dies (confess) when $idx contains a non-digit character, or when a
# non-zero $idx is past the last element of index().
sub init_indexer {
    my $self = shift;
    my $idx = shift || 0;

    if ( $idx =~ m/\D/ ) {
        confess "idx must be an integer for reading into array of index()";
    }

    # Valid positions are 0 .. size-1, so an idx equal to the size is
    # already out of range.
    # NOTE(review): idx 0 with an empty index() was accepted before and is
    # still passed through as an undefined invindex -- confirm whether
    # Dezi::Lucy::Indexer is expected to cope with that.
    my $index_count = scalar @{ $self->index };
    if ( $idx > 0 and $idx >= $index_count ) {
        confess sprintf( "idx %d is out of range for index array of size %d",
            $idx, $index_count );
    }

    # unlike a Searcher, which has an array of invindex objects,
    # the Indexer wants only one. We take the first by default,
    # but a subclass could do more subtle logic here.

    my $indexer = Dezi::Lucy::Indexer->new(
        invindex => $self->index->[$idx],
        debug    => $self->debug,
        %{ $self->indexer_config },
    );
    return $indexer;
}

# PUT only if it does not yet exist
# note PUT operates only on first index if there are multiple.
#
# Params:
#   $req - hash ref of doc arguments or a request object; see
#          _massage_rest_req_into_doc(). Required.
# Returns a hash ref:
#   { code => 409, msg }        the document's url is already in the index
#   { code => 202, total => 1 } auto_commit is off; the doc is pending
#   { code => 201, total, doc } the doc was written and can be fetched
#   { code => 500, msg }        the doc could not be fetched after writing
# Dies (confess) when $req is missing or index is not defined.
sub PUT {
    my $self = shift;
    my $req  = shift or confess "request required";
    my $doc  = $self->_massage_rest_req_into_doc($req);
    my $uri  = $doc->url;

    # edge case: index might not yet exist.
    my $index = $self->index or confess "index not defined";
    if (   -d $index->[0]
        && -s Dezi::Lucy::InvIndex->new( $index->[0] . "" )->header_file )
    {
        my $existing = $self->GET($uri);
        if ( $existing->{code} == 200 ) {
            return { code => 409, msg => "Document $uri already exists" };
        }
    }

    # Same lookup as POST and DELETE: with auto_commit off this re-creates
    # the indexer after COMMIT or ROLLBACK has invalidated it, instead of
    # calling process() on an undefined value.
    my $indexer = $self->_get_indexer;
    $indexer->process($doc);

    if ( !$self->auto_commit ) {
        return { code => 202, total => 1, };
    }

    my $total  = $indexer->finish();
    my $stored = $self->GET( $doc->url );
    if ( $stored->{code} != 200 ) {
        return { code => 500, msg => 'Failed to PUT doc' };
    }
    return { code => 201, total => $total, doc => $stored->{doc} };
}

# Returns the indexer to write with. Any arguments are passed straight
# to init_indexer().
#
# With auto_commit on, every call builds a brand-new indexer, since we
# want to invalidate and re-create after each write. With auto_commit
# off, the engine's stored indexer is reused, and built first when it is
# missing or was invalidated.
sub _get_indexer {
    my ( $self, @init_args ) = @_;

    return $self->init_indexer(@init_args) if $self->auto_commit;

    $self->indexer( $self->init_indexer(@init_args) )
        unless $self->indexer;
    return $self->indexer;
}

# POST allows new and updates
# note POST operates only on first index if there are multiple
#
# Params:
#   $req - hash ref of doc arguments or a request object; see
#          _massage_rest_req_into_doc(). Required.
# Returns a hash ref:
#   { code => 202, total => 1 } auto_commit is off; the doc is pending
#   { code => 200, total, doc } the doc was written and can be fetched
#   { code => 500, msg }        the doc could not be fetched after writing
# Dies (confess) when $req is missing.
sub POST {
    my $self    = shift;
    my $req     = shift or confess "request required";
    my $doc     = $self->_massage_rest_req_into_doc($req);
    my $indexer = $self->_get_indexer;
    $indexer->process($doc);

    if ( !$self->auto_commit ) {
        return { code => 202, total => 1, };
    }

    my $total  = $indexer->finish();
    my $stored = $self->GET( $doc->url );

    if ( $stored->{code} != 200 ) {
        return { code => 500, msg => 'Failed to POST doc' };
    }
    return { code => 200, total => $total, doc => $stored->{doc} };
}

# Concludes a manual-commit transaction.
#
# Returns a hash ref:
#   { code => 400 }        auto_commit is on, so there is no transaction
#   { code => 204 }        the indexer reports a count of zero
#   { code => 200, total } the pending docs were finished; total is the
#                          indexer's count taken before finish()
# Dies (confess) when the engine has no indexer.
sub COMMIT {
    my ($self) = @_;
    return { code => 400 } if $self->auto_commit;

    # is it possible to get here? confess just in case.
    my $indexer = $self->indexer();
    confess "Can't call COMMIT on an undefined indexer" unless $indexer;

    my $pending = $indexer->count();
    return { code => 204 } unless $pending;

    $indexer->finish();

    # MUST invalidate current indexer
    $self->indexer(undef);

    return { code => 200, total => $pending };
}

# Aborts a manual-commit transaction.
#
# Returns a hash ref:
#   { code => 400 }        auto_commit is on, so there is no transaction
#   { code => 200, total } the indexer was aborted and invalidated; total
#                          is the indexer's count taken before abort()
# When the engine has no indexer (for example right after COMMIT or a
# previous ROLLBACK invalidated it) there is nothing to abort, and the
# result is { code => 200, total => 0 }.
sub ROLLBACK {
    my $self = shift;
    if ( $self->auto_commit ) {
        return { code => 400 };
    }

    # Guard against an invalidated indexer instead of calling count()
    # on an undefined value.
    my $indexer = $self->indexer;
    if ( !$indexer ) {
        return { code => 200, total => 0 };
    }

    my $reverted = $indexer->count;
    $indexer->abort();

    # MUST invalidate current indexer
    $self->indexer(undef);

    return { code => 200, total => $reverted };
}

# Deletes the document stored under $uri.
#
# Returns a hash ref:
#   { code => 404, msg } GET cannot find $uri
#   { code => 202 }      auto_commit is off; the delete is pending
#   { code => 200 }      the delete was finished on every indexer
# Dies (confess) when $uri is missing.
#
# NOTE(review): with auto_commit off, _get_indexer() hands back the stored
# indexer whatever position is passed -- confirm that pending deletes
# reach every index.
sub DELETE {
    my ( $self, $uri ) = @_;
    confess "uri required" unless $uri;

    my $found = $self->GET($uri);
    if ( $found->{code} != 200 ) {
        return {
            code => 404,
            msg  => "$uri cannot be deleted because it does not exist"
        };
    }

    # one indexer per position in index()
    my $last_position = $#{ $self->index };
    for my $position ( 0 .. $last_position ) {
        my $indexer = $self->_get_indexer($position);
        $indexer->get_lucy->delete_by_term(
            field => 'swishdocpath',
            term  => $uri,
        );
        $indexer->finish() if $self->auto_commit;
    }

    return { code => 202 } unless $self->auto_commit;
    return { code => 200, };
}

# Returns the analyzer of the query parser's 'swishdocpath' field, or 0
# when the parser does not know that field. The answer is looked up once
# and cached on the engine under the _uri_analyzer key.
sub _get_swishdocpath_analyzer {
    my ($self) = @_;

    if ( !exists $self->{_uri_analyzer} ) {
        my $parser = $self->searcher->{qp};   # TODO expose this as accessor?
        my $field  = $parser->get_field('swishdocpath');

        # no field: it is not defined as a MetaName, just a PropertyName,
        # so we do not analyze it. Cache 0 so the key exists but is false.
        $self->{_uri_analyzer} = $field ? $field->analyzer : 0;
    }
    return $self->{_uri_analyzer};
}

# Splits $uri into the terms used to look it up in swishdocpath.
# Without an analyzer the uri is returned unchanged; with one, the
# analyzer's split() output is returned minus undefined and empty terms.
sub _analyze_uri_string {
    my $self = shift;
    my $uri  = shift;

    my $analyzer = $self->_get_swishdocpath_analyzer();
    return $uri unless $analyzer;

    my $terms = $analyzer->split($uri);
    return grep { defined($_) && length($_) } @{$terms};
}

# Fetches the document stored under $uri.
#
# Params:
#   $uri    - document path, matched as a phrase against swishdocpath
#   $params - optional hash ref; a true 'q' entry is parsed as a query and
#             highlighted in the returned values
# Returns a hash ref: { code => 404 } when nothing matches, otherwise
# { code => 200, doc => \%doc }. %doc holds every name in fields() as an
# array ref of values, plus title, summary and mtime as plain values.
# Dies (confess) when $uri is missing.
sub GET {
    my ( $self, $uri, $params ) = @_;    # $params: undef ok
    confess "uri required" unless $uri;

    # use internal Lucy searcher directly to avoid needing MetaName defined
    my @terms  = $self->_analyze_uri_string($uri);
    my $phrase = Lucy::Search::PhraseQuery->new(
        field => 'swishdocpath',
        terms => \@terms,
    );

    my $lucy = $self->searcher->get_lucy();
    my $hits = $lucy->hits( query => $phrase );

    # only the first hit is used
    my $hit = $hits->next;
    return { code => 404, } unless $hit;

    # get all fields; stored values are split on \003 into lists
    my %doc;
    my $field_names = $self->fields;
    for my $name (@$field_names) {
        my $raw = $hit->{$name};
        $doc{$name} = [ split( m/\003/, defined $raw ? $raw : "" ) ];
    }
    $doc{title}   = $hit->{swishtitle};
    $doc{summary} = $hit->{swishdescription};
    $doc{mtime}   = $hit->{swishlastmodified};

    # highlight query string if present
    if ( $params and $params->{q} ) {
        my %hiliter_config = %{ $self->hiliter_config };
        my %parser_config  = %{ $self->parser_config };
        my $parsed
            = Search::Tools->parser(%parser_config)->parse( $params->{q} );
        my $hiliter
            = Search::Tools->hiliter( query => $parsed, %hiliter_config );

        for my $key ( keys %doc ) {
            next if $self->no_hiliting($key);

            # list values are highlighted item by item
            $doc{$key}
                = ref( $doc{$key} )
                ? [ map { $hiliter->light($_) } @{ $doc{$key} } ]
                : $hiliter->light( $doc{$key} );
        }
    }

    return { code => 200, doc => \%doc };
}

1;

__END__

=head1 NAME

Search::OpenSearch::Engine::Lucy - Lucy server with OpenSearch results

=head1 SYNOPSIS

 use Search::OpenSearch::Engine::Lucy;
 my $engine = Search::OpenSearch::Engine::Lucy->new(
    index       => [qw( path/to/index1 path/to/index2 )],
    facets      => {
        names       => [qw( color size flavor )],
        sample_size => 10_000,
    },
    fields      => [qw( color size flavor )],   # result attributes in response
    indexer_config  => {
        somekey => somevalue,
    },
    searcher_config => {
        anotherkey => anothervalue,
    },
    suggester_config => {
        spellcheck_config => {
            lang => 'en_US',
        },
        limit => 10,
    },
    aggregator_class => 'MyAggregator', # defaults to Dezi::Aggregator
    cache           => CHI->new(
        driver           => 'File',
        dir_create_mode  => 0770,
        file_create_mode => 0660,
        root_dir         => "/tmp/opensearch_cache",
    ),
    cache_ttl       => 3600,
    do_not_hilite   => [qw( color )],
    snipper_config  => { as_sentences => 1, strip_markup => 1, }, # see Search::Tools::Snipper
    hiliter_config  => { class => 'h', tag => 'b' }, # see Search::Tools::HiLiter
    parser_config   => {},                           # see Search::Query::Parser
    
 );
 my $response = $engine->search(
    q           => 'quick brown fox',   # query
    s           => 'score desc',        # sort order
    o           => 0,                   # offset
    p           => 25,                  # page size
    h           => 1,                   # highlight query terms in results
    c           => 0,                   # count total only (same as f=0 r=0)
    L           => 'field|low|high',    # limit results to inclusive range
    f           => 1,                   # include facets
    r           => 1,                   # include results
    t           => 'XML',               # or JSON
    u           => 'http://yourdomain.foo/opensearch/',
    b           => 'AND',               # or OR
 );
 print $response;

=head1 METHODS

=head2 type

Returns C<Lucy>.

=head2 aggregator_class

Passed as param to new(). This class is used for filtering
incoming docs via the aggregator's swish_filter() method.

=head2 auto_commit( 0 | 1 )

Set this in new().

If true, a new indexer is spawned via init_indexer() for
each POST, PUT or DELETE.

If false, the same indexer is re-used in POST, PUT or DELETE
calls, until COMMIT or ROLLBACK is called.

Default is true (on).

=head2 BUILD

Overrides base method to load the I<aggregator_class> and other
Engine-specific construction tasks.

=head2 init_searcher

Returns a Dezi::Lucy::Searcher object.

=head2 init_indexer

Returns a Dezi::Lucy::Indexer object (used by the REST API).

=head2 init_suggester

Returns a LucyX::Suggester object. You can configure it as
described in the SYNOPSIS.

=head2 build_facets( I<query>, I<results> )

Returns hash ref of facets from I<results>. See Search::OpenSearch::Engine.

=head2 process_result( I<args> )

Overrides base method to preserve multi-value fields as arrays.

=head2 has_rest_api

Returns true.

=head2 get_allowed_http_methods

Returns array (not an array ref) of supported HTTP method names.
These correspond to the UPPERCASE method names below.

B<NOTE> that COMMIT and ROLLBACK are not official HTTP/1.1 method
names.

=head2 PUT( I<doc> )

Writes I<doc> to the first index defined. I<doc> must not already exist; if it does, a 409 response is returned.

=head2 POST( I<doc> )

Writes I<doc> to the first index defined. I<doc> may be new or already exist.

=head2 DELETE( I<uri> )

Deletes I<uri> from all indexes.

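=head2 GET( I<uri> [, I<params> ] )

Fetches I<uri> from all indexes. Returns a hash ref with C<code> 200 and
the C<doc>, or C<code> 404 if no document matches. If the optional
I<params> hash ref has a true C<q> value, that query is highlighted in the
returned field values.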

=head2 COMMIT

If auto_commit is false, use this method to conclude a transaction.

=head2 ROLLBACK

If auto_commit is false, use this method to abort a transaction.

=head1 AUTHOR

Peter Karman, C<< <karman at cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests to C<bug-search-opensearch-engine-lucy at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Search-OpenSearch-Engine-Lucy>.  I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Search::OpenSearch::Engine::Lucy


You can also look for information at:

=over 4

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Search-OpenSearch-Engine-Lucy>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Search-OpenSearch-Engine-Lucy>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Search-OpenSearch-Engine-Lucy>

=item * Search CPAN

L<http://search.cpan.org/dist/Search-OpenSearch-Engine-Lucy/>

=back

=head1 COPYRIGHT & LICENSE

Copyright 2010 Peter Karman.

This program is free software; you can redistribute it and/or modify it
under the terms of either: the GNU General Public License as published
by the Free Software Foundation; or the Artistic License.

See http://dev.perl.org/licenses/ for more information.


=cut