Group
Extension

WWW-Flatten/lib/WWW/Flatten.pm

package WWW::Flatten;
use strict;
use warnings;
use utf8;
use 5.010;
use Mojo::Base 'WWW::Crawler::Mojo';
use Mojo::Util qw(md5_sum);
use Mojo::File;
use Mojo::Log;
use WWW::Crawler::Mojo::ScraperUtil qw{resolve_href guess_encoding};
use Encode;
our $VERSION = '0.09';

# Content types that are parsed and have their references rewritten (HTML/XML family).
my $html_type_regex = qr{^(text|application)/(html|xml|xhtml)};
# Content type for stylesheets, whose url(...) references are rewritten.
my $css_type_regex = qr{^text/css$};
# MIME type => file extension map, used to pick an extension for saved assets
# when the server reports a Content-Type.
my $types = {
    'application/atom+xml'      => 'atom',
    'application/font-woff'     => 'woff',
    'application/javascript'    => 'js',
    'application/json'          => 'json',
    'application/pdf'           => 'pdf',
    'application/rss+xml'       => 'rss',
    'application/x-gzip'        => 'gz',
    'application/xml'           => 'xml',
    'application/zip'           => 'zip',
    'audio/mpeg'                => 'mp3',
    'audio/ogg'                 => 'ogg',
    'image/gif'                 => 'gif',
    'image/jpeg'                => 'jpg',
    'image/png'                 => 'png',
    'image/svg+xml'             => 'svg',
    'image/x-icon'              => 'ico',
    'text/cache-manifest'       => 'appcache',
    'text/css'                  => 'css',
    'text/html'                 => 'html',
    'text/plain'                => 'txt',
    'text/xml'                  => 'xml',
    'video/mp4'                 => 'mp4',
    'video/ogg'                 => 'ogv',
    'video/webm'                => 'webm',
};

# Maximum crawl depth; jobs deeper than this are ignored. Defaults to 10.
has depth => 10;
# URL => local filename map. Grows during crawling; may be pre-seeded.
has filenames => sub { {} };
# Output directory for saved files.
has 'basedir';
# Predicate deciding whether a job is a flatten target. Defaults to accept all.
has is_target => sub { sub { 1 } };
# Optional callback that normalizes URLs before lookup/enqueue.
has 'normalize';
# Closure that generates local asset names. Defaults to 6-digit numbering.
has asset_name => sub { asset_number_generator(6) };
# Per-URL (md5-keyed) retry counters used by the error handler.
has _retrys => sub { {} };
# Maximum retry attempts for a failing URL. Defaults to 3.
has max_retry => 3;
# Log file name (appended to basedir); enables logging when set.
has 'log_name';
# Mojo::Log instance; created in init() when log_name is set.
has 'log';

# Build a closure that hands out sequential, zero-padded asset numbers.
# The counter lives in the closure, so each generator counts independently.
sub asset_number_generator {
    my $width = shift || 6;
    my $counter = 0;
    return sub { sprintf '%0*d', $width, $counter++ };
}

# Build a closure that derives asset names from the MD5 of the job URL.
# Names start at $min_len characters and grow one character at a time
# until they no longer collide with a previously issued name.
sub asset_hash_generator {
    my $min_len = shift || 6;
    my %issued;
    return sub {
        my $digest = md5_sum(shift->url);
        my $take = $min_len;
        my $name;
        do { $name = substr($digest, 0, $take++) } while exists $issued{$name};
        $issued{$name} = undef;
        return $name;
    };
}

# Set up logging, seed the queue from the pre-defined filenames table, and
# install the "res" (response) and "error" event handlers before delegating
# to the parent class's init.
sub init {
    my ($self) = @_;
    
    # Create the log file under basedir when a log name was configured.
    $self->log(Mojo::Log->new(
        path => $self->basedir. $self->log_name)) if $self->log_name;
    
    # Enqueue every pre-defined URL, and re-key plain-string entries of the
    # filenames table as Mojo::URL objects so later lookups are consistent.
    for (keys %{$self->filenames}) {
        $self->enqueue($_);
        if (!ref $_) {
            my $val = $self->filenames->{$_};
            delete $self->filenames->{$_};
            $self->filenames->{Mojo::URL->new($_)} = $val;
        }
    }
    
    $self->on(res => sub {
        my ($self, $scrape, $job, $res) = @_;

        $self->say_progress;
        
        # Only successful responses are scraped and saved.
        return unless $res->code == 200;
        
        # Walk every reference (link/asset) scraped out of this response.
        for my $job2 ($scrape->()) {
            
            next unless ($self->is_target->($job2));
            next unless ($job2->depth <= $self->depth);
            
            my $url = $job2->url;
            
            # Apply the user-supplied URL normalization, if any.
            if (my $cb = $self->normalize) {
                $job2->url($url = $cb->($url));
            }
            
            # Already has a local filename assigned: nothing to do.
            next if ($self->filenames->{$url});
            
            # Assign a local name; the extension comes from a HEAD request's
            # Content-Type when recognized, otherwise from the URL path.
            my $new_id = $self->asset_name->($job2);
            my $type = $self->ua->head($url)->res->headers->content_type || '';
            my $ext1 = $types->{ lc(($type =~ /([^;]*)/)[0]) };
            my $ext2 = ($url->path =~ qr{\.(\w+)$})[0];
            my $ext = $ext1 || $ext2;
            $new_id .= ".$ext" if $ext;
            $self->filenames->{$url} = $new_id;
            
            # Skip re-fetching plain assets that already exist on disk;
            # HTML/CSS are always fetched because their links need rewriting.
            next if $type !~ $html_type_regex && $type !~ $css_type_regex
                                                && -f $self->basedir. $new_id;
            
            $self->enqueue($job2);
        }
        
        my $url = $job->url;
        my $type = $res->headers->content_type;
        my $original = $job->original_url;
        my $save_file = $self->filenames->{$original};
        
        if ($type && $type =~ $html_type_regex) {
            # HTML: decode, rewrite every reference, then save.
            my $encode = guess_encoding($res) || 'UTF-8';
            my $cont = Mojo::DOM->new(Encode::decode($encode, $res->body));
            my $base = $url;
            
            # Honor a <base> tag when resolving relative links.
            if (my $base_tag = $cont->at('base')) {
                $base = resolve_href($base, $base_tag->attr('href'));
            }
            
            $self->flatten_html($cont, $base, $save_file);
            $cont = $cont->to_string;
            $self->save($original, $cont, $encode);
        } elsif ($type && $type =~ $css_type_regex) {
            # CSS: rewrite url(...) references, then save.
            my $encode = guess_encoding($res) || 'UTF-8';
            my $cont = $self->flatten_css($res->body, $url, $save_file);
            $self->save($original, $cont, $encode);
        } else {
            # Any other asset is saved verbatim.
            $self->save($original, $res->body);
        }
        
        $self->log->info(
            sprintf('created: %s => %s ', $save_file, $original)) if $self->log;
    });
    
    $self->on(error => sub {
        my ($self, $msg, $job) = @_;
        
        $self->log->error("$msg: ". $job->url) if $self->log;
        
        # Re-schedule the failed job until max_retry attempts are exhausted.
        my $md5 = md5_sum($job->url->to_string);
        if (++$self->_retrys->{$md5} < $self->max_retry) {
            $self->requeue($job);
            $self->log->warn("Re-scheduled: ". $job->url) if $self->log;
        } else {
            my $times = $self->max_retry;
            $self->log->error("Failed $times times: ". $job->url) if $self->log;
        }
    });
    
    $self->SUPER::init;
}

# Map an href found in a document to its flattened replacement: the relative
# path of the locally saved file when the URL is known, or the absolute URL
# otherwise. Any #fragment is preserved and re-attached.
sub get_href {
    my ($self, $base, $url, $ref_path) = @_;

    # Capture the fragment so it survives the mapping.
    # NOTE(review): assumes resolve_href() drops the fragment from $abs,
    # otherwise it would be duplicated below — TODO confirm.
    my $fragment = ($url =~ qr{(#.+)})[0] || '';
    my $abs = resolve_href($base, $url);
    if (my $cb = $self->normalize) {
        $abs = $cb->($abs);
    }

    # Unknown URL: leave it pointing at the live, absolute location.
    my $file = $self->filenames->{$abs};
    return $abs. $fragment unless $file;

    # Known URL: link to the saved file, relative to the referring file.
    my $path = Mojo::File->new($file)->to_rel(
                                Mojo::File->new($ref_path || '')->dirname);
    $path =~ s{\\}{/}g if $^O eq 'MSWin32';
    # Was "return ... if ($file)": the guard could never be false here, and
    # if it had been, the sub would have fallen off the end returning undef.
    return $path. $fragment;
}

# Rewrite every reference in a Mojo::DOM tree (link attributes, meta refresh,
# inline <style> blocks and style="" attributes) to point at the flattened
# local files, and drop any <base> tag since links are now resolved.
sub flatten_html {
    my ($self, $dom, $base, $ref_path) = @_;
    
    # Fetch per call: the previous `state` initializer ran only once per
    # process, pinning the FIRST instance's handler set for every later
    # WWW::Flatten instance.
    my $handlers = $self->html_handlers();
    $dom->find(join(',', keys %{$handlers}))->each(sub {
        my $dom = shift;
        for ('href', 'ping','src','data') {
            $dom->{$_} = $self->get_href($base, $dom->{$_}, $ref_path) if ($dom->{$_});
        }
    });
    
    # Rewrite the URL= target of <meta http-equiv="Refresh" content="...">.
    $dom->find('meta[content]')->each(sub {
        if ($_[0] =~ qr{http\-equiv="?Refresh"?}i && $_[0]->{content}) {
            $_[0]->{content} =~
                            s{URL=(.+)}{ 'URL='. $self->get_href($base, $1, $ref_path) }e;
        }
    });

    # All links are resolved now, so <base> would only mislead the browser.
    $dom->find('base')->each(sub {shift->remove});
    
    # Inline stylesheets.
    $dom->find('style')->each(sub {
        my $dom = shift;
        my $cont = $dom->content;
        $dom->content($self->flatten_css($cont, $base, $ref_path));
    });
    
    # style="" attributes.
    $dom->find('[style]')->each(sub {
        my $dom = shift;
        $dom->{style} = $self->flatten_css($dom->{style}, $base, $ref_path);
    });
    return $dom;
}

# Rewrite every url(...) reference in a CSS string through get_href so it
# points at the flattened local file (or the absolute URL when unknown).
sub flatten_css {
    my ($self, $cont, $base, $ref_path) = @_;
    $cont =~ s{url\((.+?)\)}{
        my $target = $1;
        # Strip a surrounding pair of matching quotes, if present.
        if ($target =~ s/^(['"])//) {
            my $quote = $1;
            $target =~ s/$quote$//;
        }
        'url('. $self->get_href($base, $target, $ref_path). ')';
    }egi;
    return $cont;
}

# Write response content to the local file mapped from $url in filenames,
# creating intermediate directories and encoding the text when requested.
sub save {
    my ($self, $url, $content, $encode) = @_;
    my $dest = Mojo::File->new($self->basedir. $self->filenames->{$url});
    my $dir = $dest->dirname;
    $dir->make_path unless -d $dir;
    $content = Encode::encode($encode, $content) if $encode;
    $dest->spurt($content);
}

# Print the start-up banner to the console and, when logging is enabled,
# append it to the log and tell the user how to tail the log file.
sub say_start {
    my $self = shift;
    
    my $banner = <<"EOF";
----------------------------------------
Crawling is starting with @{[ $self->queue->next->url ]}
Max Connection  : @{[ $self->max_conn ]}
User Agent      : @{[ $self->ua_name ]}
----------------------------------------
EOF
    say $banner;
    
    return unless $self->log;
    
    $self->log->append($banner);
    
    my $path = Mojo::File->new($self->log->path)->to_abs;
    
    say <<EOF;
This could take a while. You can run the following command on another shell to track the status:

  tail -f $path
EOF
}

# Overwrite the current console line with the number of jobs remaining.
sub say_progress {
    local $| = 1;    # unbuffered, so the \r status line updates in place
    my $self = shift;
    printf '%s jobs left%s%s', $self->queue->length, ' ' x 30, "\r";
}

1;

=head1 NAME

WWW::Flatten - Flatten web pages deeply and make them portable

=head1 SYNOPSIS

    use strict;
    use warnings;
    use utf8;
    use 5.010;
    use WWW::Flatten;
    
    my $basedir = './github/';
    mkdir($basedir);
    
    my $bot = WWW::Flatten->new(
        basedir => $basedir,
        max_conn => 1,
        max_conn_per_host => 1,
        depth => 3,
        filenames => {
            'https://github.com' => 'index.html',
        },
        is_target => sub {
            my $uri = shift->url;
            
            if ($uri =~ qr{\.(css|png|gif|jpeg|jpg|pdf|js|json)$}i) {
                return 1;
            }
            
            if ($uri->host eq 'assets-cdn.github.com') {
                return 1;
            }
            
            return 0;
        },
        normalize => sub {
            my $uri = shift;
            ...
            return $uri;
        }
    );
    
    $bot->crawl;

=head1 DESCRIPTION

WWW::Flatten is a web crawling tool for freezing web pages into a standalone form.

This software is considered to be alpha quality and isn't recommended for regular usage.

=head1 ATTRIBUTES

=head2 depth

Depth limitation. Defaults to 10.

    $ua->depth(10);

=head2 filenames

URL-Filename mapping table. This will automatically be extended during crawling
but you can pre-define some entries beforehand.

    $bot->filenames({
        'http://example.com/index.html' => 'index.html',
        'http://example.com/index2.html' => 'index2.html',
    })

=head2 basedir

A directory path for output files.

    $bot->basedir('./out');

=head2 is_target

Set the condition which indicates whether the job is a flatten target or not.

    $bot->is_target(sub {
        my $job = shift;
        ...
        return 1 # or 0
    });

=head2 normalize

A code reference which perform normalization for URLs. The callback will take
L<Mojo::URL> instance.

    $bot->normalize(sub {
        my $url = shift;
        my $modified = ...;
        return $modified;
    });

=head2 asset_name

A code reference that generates asset names. Defaults to the preset generator
asset_number_generator, which generates a 6-digit number. Another option,
asset_hash_generator, generates a 6-character hash.

    $bot->asset_name(WWW::Flatten::asset_hash_generator(6));

=head2 max_retry

Maximum number of retry attempts in case the server is unresponsive. Defaults to 3.

=head1 METHODS

=head2 asset_number_generator

Numeric file name generating closure with self-contained storage. See also the
L<asset_name> attribute.

    $bot->asset_name(WWW::Flatten::asset_number_generator(3));

=head2 asset_hash_generator

Hash-based file name generating closure with self-contained storage. See also
the L<asset_name> attribute. This function automatically avoids name collisions
by extending the given length.

If you want the names as short as possible, use the following setting.

    $bot->asset_name(WWW::Flatten::asset_hash_generator(1));

=head2 init

Initialize the crawler

=head2 get_href

Generate new href with old one.

=head2 flatten_html

Replace URLs in a Mojo::DOM instance, according to filenames attribute.

=head2 flatten_css

Replace URLs in a CSS string, according to filenames attribute.

=head2 save

Save HTTP response into a file.

=head1 AUTHOR

Sugama Keita, E<lt>sugama@jamadam.comE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) jamadam

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.