Yet Another CPAN Grep

Search-OpenSearch-Federated/lib/Search/OpenSearch/Federated.pm

package Search::OpenSearch::Federated;
use Moo;

our $VERSION = '0.007';

has 'debug' => ( is => 'rw' );
has 'fields' => (
    is      => 'rw',
    default => sub { [qw( title id author link summary tags modified )] }
);
has 'urls'             => ( is => 'rw' );
has 'total'            => ( is => 'rw' );
has 'facets'           => ( is => 'rw' );
has 'subtotals'        => ( is => 'rw' );
has 'timeout'          => ( is => 'rw' );
has 'normalize_scores' => ( is => 'rw' );
has 'version'          => ( is => 'rw', default => sub {$VERSION} );

use Carp;
use Data::Dump qw( dump );
use Parallel::Iterator qw( iterate_as_array );
use JSON;
use LWP::UserAgent;
use Scalar::Util qw( blessed );
use Search::Tools::XML;
use Data::Transformer;
use Normalize;

# we do not use WWW::OpenSearch because we need to pull out
# some non-standard data from the XML.
# we do use XML::Feed to parse XML responses.
use XML::Simple;
use XML::Feed;

my $OS_NS = 'http://a9.com/-/spec/opensearch/1.1/';

my $XMLer = Search::Tools::XML->new();

my $XML_ESCAPER = Data::Transformer->new(
    normal => sub { local ($_) = shift; $$_ = $XMLer->escape($$_); } );

sub search {
    my $self = shift;

    my $urls     = $self->{urls} or croak "no urls defined";
    my $num_urls = scalar @$urls;
    my @done     = iterate_as_array(
        sub {
            $self->_fetch( $_[1] );
        },
        $urls,
    );

    return $self->_aggregate( \@done );
}

sub _aggregate {
    my $self      = shift;
    my $responses = shift;
    my $results   = [];
    my $fields    = $self->fields;
    my $total     = 0;
    my %subtotals = ();
    my %facets    = ();

RESP: for my $resp (@$responses) {

        my $req_uri     = $resp->request->uri;
        my $resp_status = $resp->code;
        $self->debug
            and warn
            sprintf( "response for %s = %s\n", $req_uri, $resp_status );
        next RESP unless $resp_status =~ m/^2/;

        # temporary buffer to allow for normalizing scores
        my @resp_results  = ();
        my $highest_score = 0;

        if ( $resp->content_type eq 'application/json' ) {
            my $r = decode_json( $resp->content );
            if ( $r->{results} ) {
                @resp_results = @{ $r->{results} };
            }

            # must turn facets inside out in order
            # to aggregate counts correctly
            if ( $r->{facets} ) {
                for my $name ( keys %{ $r->{facets} } ) {
                    for my $facet ( @{ $r->{facets}->{$name} } ) {
                        $facets{$name}->{ $facet->{term} } += $facet->{count};
                    }
                }
            }
            $total += $r->{total} || 0;
            $subtotals{$req_uri} = $r->{total};
        }
        elsif ( $resp->content_type eq 'application/xml' ) {
            my $xml = $resp->content;

            #warn $xml;
            my $feed = XML::Feed->parse( \$xml );

            if ( !$feed ) {
                warn XML::Feed->errstr;
                next RESP;
            }

            #dump $feed;

            #
            # we must re-escape the XML content since the feed parser
            # and XML::Simple will escape values automatically
            #
            my @entries;
            for my $item ( $feed->entries ) {
                my $e = {};
                for my $f (@$fields) {
                    $e->{$f} = $item->$f;
                    if ( blessed( $e->{$f} ) ) {

                        #dump( $e->{$f} );
                        if ( $e->{$f}->isa('XML::Feed::Content') ) {
                            $e->{$f} = $XMLer->escape( $e->{$f}->body );
                        }
                        elsif ( $e->{$f}->isa('DateTime') ) {
                            $e->{$f} = $e->{$f}->epoch;
                        }
                    }
                    else {
                        $e->{$f} = $XMLer->escape( $e->{$f} );
                    }
                }

                #dump $e;
                my $content = $item->content;
                my $fields = XMLin( $content->body, NoAttr => 1 );

                #dump $fields;

                for my $f ( keys %$fields ) {
                    $e->{$f} = $fields->{$f};
                    if ( ref $e->{$f} ) {
                        $XML_ESCAPER->traverse( $e->{$f} );
                    }
                    else {
                        $e->{$f} = $XMLer->escape( $e->{$f} );
                    }
                }

                # massage some field names
                $e->{mtime} = delete $e->{modified};
                $e->{uri}   = delete $e->{id};

                #dump $content;
                #dump $e;
                push @entries, $e;

            }

            # facets require digging into the raw xml
            my $xml_feed = XMLin( $feed->as_xml, NoAttr => 1 );

            #dump($xml_feed);

            # must turn facets inside out in order
            # to aggregate counts correctly
            if ( $xml_feed->{category}->{sos}->{facets} ) {
                my $facet_feed = $xml_feed->{category}->{sos}->{facets};
                for my $name ( keys %$facet_feed ) {
                    if ( ref $facet_feed->{$name}->{$name} eq 'ARRAY' ) {
                        for my $facet ( @{ $facet_feed->{$name}->{$name} } ) {
                            $facets{$name}->{ $facet->{term} }
                                += $facet->{count};
                        }
                    }
                    elsif ( ref $facet_feed->{$name}->{$name} eq 'HASH' ) {
                        my $facet = $facet_feed->{$name}->{$name};
                        $facets{$name}->{ $facet->{term} } = $facet->{count};
                    }

                }
            }

            my $atom = $feed->{atom};
            my $this_total = $atom->get( $OS_NS, 'totalResults' );
            $total += $this_total;
            $subtotals{$req_uri} = $this_total;
            push @resp_results, @entries;
        }
        else {
            croak sprintf( "Unsupported response type '%s' for %s\n",
                scalar $resp->content_type, $req_uri );
        }

        # normalize scores
        if ( $self->normalize_scores ) {
            my $normalizer = Normalize->new( 'round_to' => 0.001 );
            my %normalized = ();
            my $i          = 0;

            # compute
            for my $r (@resp_results) {
                $normalized{ $i++ } = $r->{score};
            }
            $normalizer->normalize_to_max( \%normalized );

            # apply
            for my $idx ( keys %normalized ) {
                $resp_results[$idx]->{score} = ( $normalized{$idx} * 1000 );
            }
        }

        # aggregate
        push @$results, @resp_results;

    }

    # transform facets back into arrays of count/term pairs
    my %facets_norm;
    for my $name ( keys %facets ) {
        my @diads = ();
        for my $term ( keys %{ $facets{$name} } ) {
            push @diads, { term => $term, count => $facets{$name}->{$term} };
        }
        $facets_norm{$name} = [@diads];
    }
    $self->{facets}    = \%facets_norm;
    $self->{total}     = $total;
    $self->{subtotals} = \%subtotals;
    return [ sort { $b->{score} <=> $a->{score} } @$results ];
}

sub _fetch {
    my $self = shift;
    my $url  = shift or croak "url required";
    my $ua   = LWP::UserAgent->new();
    $ua->agent( 'sos-fedsearch ' . $VERSION );
    $ua->timeout( $self->{timeout} ) if $self->{timeout};

    my $response = $ua->get($url);

    $self->debug and warn "got response for $url: " . $response->status_line;
    return $response;
}

1;

__END__

=head1 NAME

Search::OpenSearch::Federated - aggregate OpenSearch results

=head1 SYNOPSIS

 my $ms = Search::OpenSearch::Federated->new(
    urls    => [
        'http://some-site.org/search?q=foo',
        'http://some-other-site.org/search?q=foo',
    ],
    timeout => 10,  # very generous
 );

 my $results = $ms->search();
 for my $r (@$results) {
     printf("title=%s", $r->title);
     printf("uri=%s",   $r->uri);
     print "\n";
 }

=head1 DESCRIPTION

Search::OpenSearch::Federated is for aggregating multiple OpenSearch responses
into a single result set. Use it as a client for Search::OpenSearch::Engine-powered
servers or for any server that provides OpenSearch-style results.

=head1 METHODS

Search::OpenSearch::Federated isa Search::Tools::Object.

=head2 new( I<args> )

Constructor. I<args> should include key C<urls> with value of
an array reference. Supported I<args> keys are:

=over

=item urls I<arrayref>

=item timeout I<n>

=item fields I<arrayref>

=item debug 0|1

=item normalize_scores 0|1

If true, all result scores are run through the L<Normalize> module to (hopefully)
help create parity amongst the result sets.

=item version

Defaults to $VERSION package var.

=back

=head2 search

Execute the search. Returns array ref of results sorted by score.

=head2 fields

Returns fields set in new().

=head2 total

Return total hits.

=head2 subtotals

Returns hash ref of subtotal for each URL, keys being
the values of urls().

=head2 facets

Returns hash ref of aggregated facets for all URLs.

=head1 COPYRIGHT

Copyright 2013 - American Public Media Group

=head1 AUTHOR

Peter Karman, C<< <karman at cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests to C<bug-search-opensearch-federated at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Search-OpenSearch-Federated>.  I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.


=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Search::OpenSearch::Federated

You can also look for information at:

=over 4

=item * RT: CPAN's request tracker (report bugs here)

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Search-OpenSearch-Federated>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Search-OpenSearch-Federated>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Search-OpenSearch-Federated>

=item * Search CPAN

L<http://search.cpan.org/dist/Search-OpenSearch-Federated/>

=back


=head1 ACKNOWLEDGEMENTS

Thanks to American Public Media and the state of Minnesota for sponsoring the 
development of this module.

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.