Yet Another CPAN Grep

Dancer-SearchApp/lib/Dancer/SearchApp.pm

package Dancer::SearchApp;
use strict;
use File::Basename 'basename';
use Dancer;
use Search::Elasticsearch::Async;
use URI::Escape 'uri_unescape';
use URI::file;
#use Search::Elasticsearch::TestServer;

use Dancer::SearchApp::Defaults 'get_defaults';

use Dancer::SearchApp::Entry;
use Dancer::SearchApp::HTMLSnippet;    

use vars qw($VERSION $es %indices);
$VERSION = '0.06';

=head1 NAME

Dancer::SearchApp - A simple local search engine

=head1 SYNOPSIS

=head1 QUICKSTART

Also see L<Dancer::SearchApp::Installation>.

  cpanm --look Dancer::SearchApp
  
  # Install prerequisites
  cpanm --installdeps .

  # Install Elasticsearch https://www.elastic.co/downloads/elasticsearch
  # Start Elasticsearch
  # Install Apache Tika from https://tika.apache.org/download.html into jar/

  # Launch the web frontend
  plackup --host 127.0.0.1 -p 8080 -Ilib -a bin\app.pl

  # Edit filesystem configuration
  cat >>fs-import.yml
  fs:
    directories:
        - folder: "C:\\Users\\Corion\\Projekte\\App-StarTraders"
          recurse: true
          exclude:
             - ".git"
        - folder: "t\\documents"
          recurse: true

  # Collect some content
  perl -Ilib -w bin/index-filesystem.pl -f

  # Search in your browser

=head1 CONFIGURATION

Configuration happens through config.yml

  elastic_search:
    home: "./elasticsearch-2.1.1/"
    index: "dancer-searchapp"

The Elasticsearch instance to use can also be passed in C<%ENV>
as C<SEARCHAPP_ES_NODES>.

=cut

# Data::Dumper is only referenced by commented-out debugging statements in
# this file; sorted keys keep any dump output stable between runs.
use Data::Dumper;
$Data::Dumper::Sortkeys = 1;

# Effective configuration for this module, built once at load time from the
# process environment and the Dancer configuration.
# NOTE(review): each entry in "names" presumably reads as
#   [ result key => config key => environment variable, fallback default ]
# - confirm against Dancer::SearchApp::Defaults.
# The rest of this file reads $config->{elastic_search}->{index} and
# $config->{elastic_search}->{nodes}.
my $config = get_defaults(
    env      => \%ENV,
    config   => config(),
    #defaults => \%
    names => [
        ['elastic_search/index' => 'elastic_search/index' => 'SEARCHAPP_ES_INDEX', 'searchapp'],
        ['elastic_search/nodes' => 'elastic_search/nodes' => 'SEARCHAPP_ES_NODES', 'localhost:9200'],
    ],
);

# Mark the current response as an API response: the two headers below ask
# the browser to treat the body as a download and to not sniff its content
# type, so that API output is not rendered as a page.
sub add_api_headers {
    header(
        'Content-Disposition',
        'attachment; filename="1.txt"',
    );
    header(
        'X-Content-Type-Options',
        'nosniff',
    );
};

# Return the shared Elasticsearch client, creating it on first use.
# The node list comes from the configuration; a plain string is treated as
# a comma-separated list because the config system has no list support.
sub search {
    return $es
        if $es;

    my $node_spec = $config->{elastic_search}->{nodes};
    my $node_list = ref $node_spec
                  ? $node_spec
                  : [ split /,/, $node_spec ];

    $es = Search::Elasticsearch->new( nodes => $node_list );

    return $es;
};

# Override the package variable $Template::Stash::PRIVATE for the templates
# rendered by this application. The value is assigned twice, presumably to
# avoid Perl's "used only once" warning for a variable this file never
# reads - TODO confirm.
$Template::Stash::PRIVATE = $Template::Stash::PRIVATE = 1;

# Main page: renders the search form and, if a query "q" was given, the
# search results.
#
# Request parameters:
#   q     - search term, passed to Elasticsearch as a query_string query
#   from  - offset of the first hit (digits only, defaults to 0)
#   size  - number of hits per page (digits only, defaults to 25)
#   type  - optional MIME type ("major/minor") to restrict the results to
#   lucky - if present and there is at least one hit, redirect to / serve
#           the first hit instead of rendering the result list
get '/' => sub {
    # Later, separate out the code paths between
    # search and index page only, to serve the index
    # page as a static file

    my $results;

    my $from = params->{'from'} || 0;
    $from =~ s!\D!!g;
    # A value without any digits ends up empty; fall back to the default
    # instead of sending an empty string to Elasticsearch
    $from = 0 unless length $from;

    my $size = params->{'size'} || 25;
    $size =~ s!\D!!g;
    $size = 25 unless length $size;

    my $search_term = params->{'q'};

    if( defined $search_term) {

        #warning "Reading ES indices\n";
        %indices = %{ search->indices->get({index => ['*']}) };
        #warning $_ for sort keys %indices;

        my @restrict_type;
        my $type;
        if( $type = params->{'type'} and $type =~ m!([a-z0-9+-]+)/[a-z0-9+-]+!i) {
            #warn "Filtering for '$type'";
            @restrict_type = (filter => { term => { mime_type => $type }});
        };

        # NOTE(review): earlier code computed a copy of the search term with
        # ':' and '\' escaped but never used it; the raw term is what gets
        # sent to Elasticsearch. Confirm whether escaping was intended.

        # Move this to an async query, later
        my $index = $config->{elastic_search}->{index};
        $results = search->search(
            # We search across all language indices
            index => [ grep { /^\Q$index\E/ } sort keys %indices ],
            body => {
                from => $from,
                size => $size,
                query => {
                    # multi_match => { ... grep for the non-autocomplete stuff, and include the boosters
                    bool => {
                        must => {
                            query_string => {
                                query => $search_term,
                                fields => ['title','folder','content', 'author'] #'creation_date']
                            },
                        },
                        @restrict_type,
                    },
                },
                sort => {
                    _score => { order => 'desc' },
                },
               "highlight" => {
                    "pre_tags" => '<b>',
                    "post_tags" => '</b>',
                    "fields" => {
                    # we want the whole content so we can strip it down
                    # ourselves:
                        "content" => {"number_of_fragments" => 0},
                        #"content" => {}
                    }
                }
            }
        );

        #warn Dumper $results->{hits};
    } else {
        # Update the statistics
        #$statistics = search->search(
        #    search_type => 'count',
        #    index => config->{index},
        #    body        => {
        #        query       => {
        #            match_all => {}
        #        }
        #    }
        #);
        #warn Dumper $statistics;
    };

    # Collect the hits once. Looking at ->[0]->{highlight} directly on the
    # result structure would autovivify an empty hit into an empty result
    # list, which the template would then display as a (bogus) result.
    my @hits;
    if( $results and $results->{hits} and ref( $results->{hits}->{hits} ) eq 'ARRAY' ) {
        @hits = @{ $results->{hits}->{hits} };
    };

    for my $hit (@hits) {
        $hit->{source} = Dancer::SearchApp::Entry->from_es( $hit );
        for my $key ( qw( id index type )) {
            $hit->{$key} = $hit->{"_$key"}; # thanks, Template::Toolkit
        };
    };

    # "I'm feeling lucky": go straight to the first hit, if there is one.
    # Without any hit we fall through and render the (empty) result page.
    if( @hits and exists params->{lucky}) {
        my $first = $hits[0];
        my( $index, $type, $id ) = @{$first}{qw(index type id)};
        warn "Redirecting/reproxying first document";
        if( $type eq 'http' ) {
            return redirect $id;
        } else {
            my $doc = $first->{source};
            my $local = URI::file->new( $id )->file;
            return reproxy( $doc, $local, 'Inline',
                index => $index,
                type => $type,
            );
        }
    };

    # Rework the result snippets to show only the highlighted stuff, together
    # with the appropriate page number if available. Hits without highlighted
    # content (for example a match in the title only) are left untouched.
    for my $document (@hits) {
        my $highlight = $document->{highlight};
        next unless $highlight and ref( $highlight->{content} ) eq 'ARRAY';

        my $html = $highlight->{content}->[0];
        next unless defined $html;

        my @show = Dancer::SearchApp::HTMLSnippet->extract_highlights(
            html => $html,
            max_length => 300,
        );

        # Find the PDF page numbers from Tika: count the page markers that
        # occur before the start of each snippet
        for my $s (@show) {
            $s->{page} = () = (substr($html,0,$s->{start}) =~ /<div class="page"/g);
        };

        $highlight->{content} =
            [map {
                   +{ snippet => substr( $html, $_->{start}, $_->{length} ),
                     page     => $_->{page},
                   }
                 } @show
            ];
    };

    return template 'index', {
            results => ($results ? $results->{hits} : undef ),
            params => {
                q    => $search_term,
                from => $from,
                size => $size,
            },
    };
};

# Show (cached) elements
#
# Looks the document up in the search index and renders it through the
# 'view_document' template. Responds with status 404 if the document cannot
# be retrieved from the index.
get '/cache/:index/:type/:id' => sub {
    my $index = params->{index};
    my $type = params->{type};
    my $id = uri_unescape( params->{id} );
    my $document = retrieve($index,$type,$id);
    #warn $document->basic_mime_type;

    if( $document ) {
        # Only decorate the document once we know it exists. Assigning
        # through an undefined $document would autovivify a hash and make
        # the "not found" branch below unreachable.
        $document->{type} = $type;
        $document->{index} = $index;

        return template 'view_document', {
            result => $document,
            backlink => scalar( request->referer ),
            # we should also save the page offset...
        }
    } else {
        status 404;
        return <<SORRY
        That file does (not) exist anymore in the index.
SORRY
        # We could delete that item from the index here...
        # or schedule reindexing of the resource?
    }
};

# Reproxy elements from disk
#
# Parameters:
#   $document    - the index entry for the file; must be true for the file to
#                  be served, and its ->mime_type becomes the content type
#   $local       - path of the file on the local disk (may be undef)
#   $disposition - value for the Content-Disposition header, the callers in
#                  this file pass 'Inline' or 'Attachment'
#   %options     - accepted for the callers' convenience, currently unused
#
# Returns the raw file content with status 200, or an explanatory text with
# status 404 if the document is missing or the file is not a plain file.
# Dies if the file exists but cannot be opened.
sub reproxy {
    my( $document, $local, $disposition, %options ) = @_;

    # Now, if the file exists both in the index and locally, let's reproxy the content
    if( $document and defined $local and -f $local) {
        # Open the file before touching the response, so that a failure
        # here does not leave half-configured headers behind
        open my $fh, '<', $local
            or die "Couldn't read local file '$local': $!";
        binmode $fh;
        # Slurp in scalar context, regardless of the context we are called in
        my $content = do { local $/; <$fh> };
        close $fh;

        status 200;
        content_type( $document->mime_type );
        header( "Content-Disposition" => sprintf '%s; filename="%s"', $disposition, basename $local);

        return $content;

    } else {
        status 404; # sorry
        return <<SORRY
        That file does (not) exist anymore or is currently unreachable
        for this webserver. We'll need to implement 
        cleaning up the index from dead items.
SORRY
        # We could delete that item from the index here...
        # Or schedule reindexing of the resource?
    }
};

# Fetch a single document from the Elasticsearch index and wrap it in a
# Dancer::SearchApp::Entry. If the lookup throws, the error is emitted as a
# warning and undef is returned.
sub retrieve {
    my( $index, $type, $id ) = @_;

    my $hit;
    my $fetched = eval {
        $hit = search->get(index => $index, type => $type, id => $id);
        1
    };

    if( ! $fetched ) {
        warn "$@";
        # Not found in the Elasticsearch index
        return undef
    };

    my $entry = Dancer::SearchApp::Entry->from_es($hit);
    return $entry
}

# Open the original of an indexed document.
#
# Documents of type 'http' are opened by redirecting to their id (the URL);
# everything else is served from the local disk as an attachment.
get '/open/:index/:type/:id' => sub {
    my $index = params->{index};
    my $type = params->{type};
    my $id = uri_unescape( params->{id} );
    my $document = retrieve($index,$type,$id);
    if( $type eq 'http' ) {
        # The id comes straight from the request. Only redirect to it if it
        # really is a document in our index, otherwise this route could be
        # used to redirect visitors to arbitrary URLs.
        if( ! $document ) {
            status 404;
            return "That document does not exist in the index.\n";
        };
        return
            redirect $id
    } else {
        my $local = URI::file->new( $id )->file;
        return
        reproxy( $document, $local, 'Attachment',
            index => $index,
            type => $type,
        );
    }
};

# Show an indexed document inline.
#
# Documents of type 'http' have no local file, so the content stored in the
# index is returned; everything else is served from the local disk with an
# inline content disposition.
get '/inline/:index/:type/:id' => sub {
    my $index = params->{index};
    my $type = params->{type};
    my $id = uri_unescape( params->{id} );
    my $document = retrieve($index,$type,$id);

    if( 'http' eq $type ) {
        # Without this check, a missing document would make the method call
        # below die instead of producing a proper "not found" response
        if( ! $document ) {
            status 404;
            return "That document does not exist in the index.\n";
        };
        # Previously the content was fetched but thrown away, and the
        # request then always ended in the 404 branch of reproxy()
        return $document->content;
    };

    my $local = URI::file->new( $id )->file;

    return reproxy( $document, $local, 'Inline',
        index => $index,
        type => $type,
    );

};

# This is likely a really bad design choice which I will regret later.
# Most likely, manually encoding to JSON would be the saner approach
# instead of globally setting a serializer for all routes.
# The '/suggest/:query.json' route below relies on this setting: it returns
# plain Perl array references as its response.
set 'serializer' => 'JSON';

# Autocompletion endpoint.
#
# Asks the Elasticsearch completion suggester (field 'title_suggest', fuzzy
# with an edit distance of 2) for completions of the given query and returns
# a reference to an array of
#   { tokens => [characters of the text], value => text, url => ... }
# ordered by descending score. An empty array is returned if the query
# contains no non-whitespace character.
get '/suggest/:query.json' => sub {
    my( $q ) = params->{query};
    #warn "Completing '$q'";

    # Test for definedness instead of truth so that a query of "0" is
    # completed like any other query
    return [] unless defined $q and $q =~ /\S/;

    # Strip leading/trailing whitespace, Justin Case
    $q =~ s!^\s+!!;
    $q =~ s!\s+$!!;

    # Reinitialize indices
    # Some day, we could cache that/not refresh them all the time
    %indices = %{ search->indices->get({index => ['*']}) };

    # NOTE(review): earlier code parsed a "type" parameter into a MIME type
    # filter and prepared a phrase suggester query here, but neither was
    # ever sent to Elasticsearch. Only the completion suggester is used.

    # Move this to an async query, later
    my $index = $config->{elastic_search}->{index};
    my $results = search->suggest(
        index => [ grep { /^\Q$index\E/ } sort keys %indices ],
        body    => {
            foo => {
                text  => $q,
                completion => {
                    field => 'title_suggest',
                    "fuzzy" => { "fuzziness" => 2 }, # edit distance of 2
                }
            }
        }
    );

    #warn Dumper $results;

    my @res = map {; +{
                  tokens => [split //, $_->{text}],
                  value => $_->{text},
                  url   => $_->{_source}->{url},
              } }
              # Sort by score, then asciibetically by text, both descending.
              # The tiebreak has to compare the texts; comparing $a and $b
              # themselves would compare stringified hash references.
              sort { $b->{_score} <=> $a->{_score} || $b->{text} cmp $a->{text} }
              map { $_->{options} ? @{ $_->{options} } : () } # unwrap again
              map { @$_ } # unwrap
              grep { ref $_ eq 'ARRAY' } values %$results
              ;

    add_api_headers();
    return \@res;
};

# A module file has to end with a true value
true;

__END__

=head1 SECURITY CONSIDERATIONS

=head2 Dancer::SearchApp

This web front end can serve not only the extracted content but also
the original files from your hard disk. Configure the file system crawler
to index only data that you are comfortable with sharing with whoever
gets access to the web server.

Consider making the web server only respond on requests originating from
127.0.0.1:

  plackup --host 127.0.0.1 -p 8080 -Ilib -a bin\app.pl

=head2 Elasticsearch

Elasticsearch has a long history of vulnerabilities and has little to no
concept of information segregation. This basically means that anything that
can reach Elasticsearch can read all the data you stored in it.

Configure Elasticsearch to only respond to localhost or to queries from
within a trusted network, like your home network.

Note that leaking a copy of the Elasticsearch search index is almost as
bad as leaking a copy of the original data. This is especially true if you
look at backups.

=head1 REPOSITORY

The public repository of this module is
L<https://github.com/Corion/dancer-searchapp>.

=head1 SUPPORT

The public support forum of this module is
L<https://perlmonks.org/>.

=head1 TALKS

I've given a talk about this module at Perl conferences:

L<German Perl Workshop 2016, German|http://corion.net/talks/dancer-searchapp/dancer-searchapp.de.html>

L<Video on Youtube, German|https://www.youtube.com/watch?v=X4j5LgmfgZY>

L<YAPC::Europe 2016, Cluj, English|http://corion.net/talks/dancer-searchapp/dancer-searchapp.en.html>

L<Video on Youtube, English|https://www.youtube.com/watch?v=TXSM-Izmia8>

=for html
<iframe title="YouTube video player" class="youtube-player" type="text/html" 
width="640" height="390" src="http://www.youtube.com/embed/TXSM-Izmia8"
frameborder="0" allowFullScreen></iframe>

=head1 BUG TRACKER

Please report bugs in this module via the RT CPAN bug queue at
L<https://rt.cpan.org/Public/Dist/Display.html?Name=Dancer-SearchApp>
or via mail to L<dancer-searchapp-Bugs@rt.cpan.org>.

=head1 AUTHOR

Max Maischein C<corion@cpan.org>

=head1 COPYRIGHT (c)

Copyright 2014-2016 by Max Maischein C<corion@cpan.org>.

=head1 LICENSE

This module is released under the same terms as Perl itself.

=cut
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.