Group
Extension

Fancazzista-Scrap/lib/Fancazzista/Scrap/WebsiteScrapper.pm

package Fancazzista::Scrap::WebsiteScrapper;

use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request;
use Mojo::DOM;
use JSON;
use Encode qw(encode);

our $VERSION = '1.00';

# Constructor.
#
# Takes the class name (supplied by the method call) and returns a new
# object backed by an empty hash; no arguments are read and no state is
# stored.
sub new {
    my ($class) = @_;

    return bless {}, $class;
}

# Scrape every website listed in the configuration.
#
# Parameters:
#   $config - hash ref whose "websites" entry is an array ref of website
#             descriptions; each description is passed unchanged to
#             extractArticles() and its "name" and "url" are copied into
#             the result.
#
# Returns a list with one hash ref per configured website, holding the keys
# name, url, articles (array ref of whatever extractArticles() returned) and
# from_website (always 1).
sub scrap {
    my ( $self, $config ) = @_;

    my @scraped;

    for my $site ( @{ $config->{websites} } ) {
        my %entry = (
            name         => $site->{name},
            url          => $site->{url},
            articles     => [ $self->extractArticles($site) ],
            from_website => 1,
        );

        push @scraped, \%entry;
    }

    return @scraped;
}

# Download a page over HTTP and return its body as decoded text.
#
# Parameters:
#   $url - address of the page to fetch with a GET request.
#
# Returns the response body as given by decoded_content(). When the request
# does not succeed, a warning is emitted and an empty string is returned so
# the caller parses an empty document instead of an error page. An
# undecodable body (decoded_content() returning undef) also yields an empty
# string.
sub getWebsiteHtml {
    my ( $self, $url ) = @_;

    # Direct class-method calls; the indirect-object form
    # ("new LWP::UserAgent") is ambiguous to the parser and discouraged.
    my $ua = LWP::UserAgent->new;

    # Prefix the default agent string with the running program's name.
    $ua->agent( "$0/0.1 " . $ua->agent );

    my $req = HTTP::Request->new( 'GET' => $url );
    $req->header( 'Accept' => 'text/html' );

    my $res = $ua->request($req);

    unless ( $res->is_success ) {
        warn "Failed to fetch $url: " . $res->status_line . "\n";
        return '';
    }

    my $html = $res->decoded_content;

    return defined $html ? $html : '';
}

# Fetch a website and extract its articles using the configured selectors.
#
# Parameters:
#   $resource - hash ref describing the website:
#                 url          - page to download via getWebsiteHtml()
#                 selector     - CSS selector matching one element per article
#                 textSelector - selector, relative to an article element,
#                                for the element holding the headline text
#                 linkSelector - selector, relative to an article element,
#                                for the element carrying the href attribute
#
# Returns a list of hash refs with the keys "text" (whitespace-trimmed,
# encoded as utf8 bytes) and "link" (the raw href attribute value).
# Article elements missing either the text or the link element are skipped
# rather than aborting the whole extraction.
sub extractArticles {
    my ( $self, $resource ) = @_;

    my $content = $self->getWebsiteHtml( $resource->{url} );

    # Nothing was downloaded, so there is nothing to parse.
    return () unless defined $content;

    my $dom = Mojo::DOM->new($content);

    my @articles;

    for my $item ( $dom->find( $resource->{selector} )->each ) {

        # at() yields the first matching descendant or undef, so a missing
        # element can be detected instead of calling a method on undef.
        my $text_node = $item->at( $resource->{textSelector} );
        my $link_node = $item->at( $resource->{linkSelector} );

        next unless defined $text_node && defined $link_node;

        my $text = $text_node->text;
        $text = '' unless defined $text;
        $text =~ s/^\s+|\s+$//g;

        # Fetched into a scalar first so the hash below always receives
        # exactly one value for "link", even when the attribute is absent.
        my $link = $link_node->attr('href');

        push @articles,
          {
            text => encode( 'utf8', $text ),
            link => $link,
          };
    }

    return @articles;
}

1;


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.