Yet Another CPAN Grep

WWW-CPANRatings/lib/WWW/CPANRatings.pm

package WWW::CPANRatings;
use strict;
use warnings;
our $VERSION = '0.03';
use utf8;
use List::Util qw(sum);
use LWP::UserAgent;
use DateTime::Format::DateParse;
use HTML::TokeParser::Simple;
use URI;
use Web::Scraper;
use JSON::XS;
use Text::CSV_PP;
use feature 'say';

sub new { 
    my $class = shift;
    my $args = shift || {};
    my $self = bless $args,$class;
    $self->setup_request(sub{
        my $url = shift;
        my $ua = LWP::UserAgent->new;
        my $response = $ua->get( $url );
        return $response->decoded_content;
    });
    return $self;
}

sub setup_request {
    my ($self,$cb) = @_;
    $self->{requester} = $cb;
}

sub request {
    my ($self,$url) = @_;
    return $self->{requester}->( $url );
}

sub fetch_ratings {
    my $self = shift;
    my $arg = shift;

    # if it's file
    my $text;
    if( $arg && -e $arg ) {
        open my $fh , "<" , $arg;
        local $/;
        $text = <$fh>;
        close $fh;
    }
    elsif( $arg && $arg =~ /^http/ ) {
        $text = get( $arg );
    }

    unless ( $text ) {
        $text = $self->request('http://cpanratings.perl.org/csv/all_ratings.csv');
    }

    my @lines = split /\n/,$text;
    my $csv = Text::CSV_PP->new();     # create a new object

    # drop first 2 lines
    splice @lines,0,2;
    my %rating_data;

    for my $line ( @lines ) {
        chomp($line);
        my $status  = $csv->parse($line);
        die 'csv file parse failed.' unless $status;
        my ($dist,$rating,$review_count) = $csv->fields();

        # say $dist, $rating, $review_count;
        $rating_data{ $dist } = {
            dist => $dist,
            rating => $rating,
            review_cnt => $review_count,
        };
    }
    return $self->{rating_data} = \%rating_data;
}

sub rating_data { 
    my $self = shift;
    $self->fetch_ratings unless $self->{rating_data};
    return $self->{rating_data};
}

sub get_ratings {
    my ($self,$distname) = @_;
    $distname =~ s/::/-/g;
    return $self->rating_data->{ $distname };
}

# dist_name format 
sub get_reviews {
    my ($self,$modname) = @_;
    my $distname = $modname;
    $distname =~ s/::/-/g;
    my $base_url = "http://cpanratings.perl.org/dist/";
    my $url = $base_url . $distname;
    my $content = $self->request($url);
    return unless $content;
    return unless $content =~ /$modname reviews/;
    my $result = $self->parse_review_page($content);
    return @{ $result->{reviews} };
}


# returned structure,
#     $VAR1 = {
#        'reviews' => [
#                 {
#                   'body' => ' Moose got me laid. Could you ask anything more of a CPAN module? ',
#                   'user_link' => bless( do{\(my $o = 'http://cpanratings.perl.org/user/funguy')}, 'URI::http' ),
#                   'attrs' => 'Fun Guy - 2011-04-12T14:30:46 ',
#                   'user' => 'Fun Guy',
#                   'dist' => ' Moose',
#                   'dist_link' => bless( do{\(my $o = 'http://search.cpan.org/dist/Moose/')}, 'URI::http' )
#                 },



sub rating_scraper { 
    my $self = shift;
    return scraper {
        process '.review' => 'reviews[]' => scraper {
            process '.review_header a', 
                    dist_link => '@href',
                    dist => 'TEXT';

            process '.review_header',
                    header => 'TEXT';

            process '.review_header img',
                    ratings => '@alt';

            process '.review_text', body => 'TEXT';

            process '.review_attribution' ,
                'attrs' => 'TEXT';
            process '.review_attribution a' , 
                'user' => 'TEXT',
                'user_link' => '@href';
        };
    };
}

sub parse_review_page {
    my ($self,$content) = @_;

    my $rating_scraper = $self->rating_scraper;
    my $res = $rating_scraper->scrape( $content );

    # post process

    for my $review ( @{ $res->{reviews} } ) {
        if( $review->{header} =~ m{^\s*([a-zA-Z:]+)\s+\(([0-9.]+)\)\s*$} ) {
            $review->{version} = $2;
        }

        $review->{dist} =~ s{^\s*}{};
        $review->{dist} =~ s{\s*$}{};

        if( $review->{attrs} =~ m{([0-9-T:]+)\s*$} ) {
            $review->{created_on} = 
                DateTime::Format::DateParse->parse_datetime( $1 );
        }

        delete $review->{attrs};
    }
    return $res;
}


sub get_all_reviews {
    my $self = shift;
    my $all_ratings = $self->rating_data;
    while( my( $distname,$ratings) = each %$all_ratings ) {
        # $ratings->{review_cnt};
        # $ratings->{dist};
        # $ratings->{rating};
        $ratings->{reviews} = [ $self->get_reviews( $ratings->{dist} ) ];
    }
    return $all_ratings;
}

1;
__END__

=head1 NAME

WWW::CPANRatings - parsing CPANRatings data

=head1 SYNOPSIS

    use WWW::CPANRatings;

    my $r = WWW::CPANRatings->new;

    my $all_ratings = $r->rating_data;  # get rating data.

    my $ratings = $r->get_ratings( 'Moose' );  # get Moose rating scores.

    my @reviews = $r->get_reviews( 'Moose' );  # parse review text from cpanratings.perl.org.




    for my $r ( @reviews ) {
        $r->{dist};
        $r->{dist_link};
        $r->{version}
        $r->{user};
        $r->{user_link};
        $r->{created_on};  # DateTime object.
        $r->{ratings};
    }

=head1 DESCRIPTION

=head1 METHODS

=head2 $r->fetch_ratings()

Download/Parse csv rating data.

=head2 AllRatingData | HashRef = $r->rating_data()

Get csv rating data.

=head2 RatingData | HashRef = $r->get_ratings( DistName | String )

Get rating data of a distribution

=head2 Reviews | Array = $r->get_reviews( DistName | String )

Get distribution reviews (including text, user, timestamp)

=head2 $r->get_all_reviews

Get reviews from all distributions.

=head1 AUTHOR

Yo-An Lin E<lt>cornelius.howl {at} gmail.comE<gt>

=head1 SEE ALSO

L<WWW::CPANRatings::RSS>

=head1 LICENSE

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.