Group
Extension

Metabrik-Repository/lib/Metabrik/Client/Www.pm

#
# $Id$
#
# client::www Brik
#
package Metabrik::Client::Www;
use strict;
use warnings;

use base qw(Metabrik);

sub brik_properties {
   return {
      revision => '$Revision$',
      tags => [ qw(unstable browser http javascript screenshot) ],
      author => 'GomoR <GomoR[at]metabrik.org>',
      license => 'http://opensource.org/licenses/BSD-3-Clause',
      attributes => {
         uri => [ qw(uri) ],
         username => [ qw(username) ],
         password => [ qw(password) ],
         ignore_content => [ qw(0|1) ],
         user_agent => [ qw(user_agent) ],
         ssl_verify => [ qw(0|1) ],
         datadir => [ qw(datadir) ],
         timeout => [ qw(0|1) ],
         rtimeout => [ qw(timeout) ],
         add_headers => [ qw(http_headers_hash) ],
         do_javascript => [ qw(0|1) ],
         do_redirects => [ qw(0|1) ],
         src_ip => [ qw(ip_address) ],
         max_redirects => [ qw(count) ],
         client => [ qw(object) ],
         _last => [ qw(object|INTERNAL) ],
         _last_code => [ qw(code|INTERNAL) ],
      },
      attributes_default => {
         ssl_verify => 0,
         ignore_content => 0,
         timeout => 0,
         rtimeout => 10,
         add_headers => {},
         do_javascript => 0,
         do_redirects => 1,
         max_redirects => 10,
      },
      commands => {
         install => [ ], # Inherited
         create_user_agent => [ ],
         reset_user_agent => [ ],
         get => [ qw(uri|OPTIONAL username|OPTIONAL password|OPTIONAL) ],
         cat => [ qw(uri|OPTIONAL username|OPTIONAL password|OPTIONAL) ],
         post => [ qw(content_hash uri|OPTIONAL username|OPTIONAL password|OPTIONAL) ],
         patch => [ qw(content_hash uri|OPTIONAL username|OPTIONAL password|OPTIONAL) ],
         put => [ qw(content_hash uri|OPTIONAL username|OPTIONAL password|OPTIONAL) ],
         head => [ qw(uri|OPTIONAL username|OPTIONAL password|OPTIONAL) ],
         delete => [ qw(uri|OPTIONAL username|OPTIONAL password|OPTIONAL) ],
         options => [ qw(uri|OPTIONAL username|OPTIONAL password|OPTIONAL) ],
         code => [ ],
         content => [ ],
         get_content => [ qw(uri|OPTIONAL username|OPTIONAL password|OPTIONAL) ],
         post_content => [ qw(content_hash uri|OPTIONAL username|OPTIONAL password|OPTIONAL) ],
         save_content => [ qw(output) ],
         headers => [ ],
         get_response_headers => [ ],
         delete_request_header => [ qw(header) ],
         get_response_header => [ qw(header) ],
         set_request_header => [ qw(header value|value_list) ],
         forms => [ ],
         links => [ ],
         trace_redirect => [ qw(uri|OPTIONAL) ],
         screenshot => [ qw(uri output) ],
         eval_javascript => [ qw(js) ],
         info => [ qw(uri|OPTIONAL) ],
         mirror => [ qw(url|$url_list output|OPTIONAL datadir|OPTIONAL) ],
         parse => [ qw(html) ],
         get_last => [ ],
         get_last_code => [ ],
      },
      require_modules => {
         'IO::Socket::SSL' => [ ],
         'Progress::Any::Output' => [ ],
         'Progress::Any::Output::TermProgressBarColor' => [ ],
         'Data::Dumper' => [ ],
         'HTML::TreeBuilder' => [ ],
         'LWP::UserAgent' => [ ],
         'LWP::UserAgent::ProgressAny' => [ ],
         'HTTP::Request' => [ ],
         'HTTP::Request::Common' => [ ],
         'WWW::Mechanize' => [ ],
         'Mozilla::CA' => [ ],
         'HTML::Form' => [ ],
         'Metabrik::File::Write' => [ ],
         'Metabrik::System::File' => [ ],
         'Metabrik::Network::Address' => [ ],
      },
      need_packages => {
         freebsd => [ qw(p5-LWP-Protocol-https) ],
         ubuntu => [ qw(liblwp-protocol-https-perl) ],
         debian => [ qw(liblwp-protocol-https-perl) ],
         kali => [ qw(liblwp-protocol-https-perl) ],
      },
      optional_modules => {
         'WWW::Mechanize::PhantomJS' => [ ],
      },
      optional_binaries => {
         phantomjs => [ ],
      },
   };
}

sub create_user_agent {
   my $self = shift;
   my ($uri, $username, $password) = @_;

   $self->log->debug("create_user_agent: creating agent");

   $uri ||= $self->uri;

   # Use IO::Socket::SSL which supports timeouts among other things.
   $ENV{PERL_NET_HTTPS_SSL_SOCKET_CLASS} = 'IO::Socket::SSL';

   my $ssl_verify = $self->ssl_verify
      ? IO::Socket::SSL::SSL_VERIFY_PEER()
      : IO::Socket::SSL::SSL_VERIFY_NONE();

   my %args = (
      stack_depth => 0,  # Default is infinite, and will eat-up whole memory.
                         # 0 means completely turn off the feature.
      autocheck => 0,  # Do not throw on error by checking HTTP code. Let us do it.
      timeout => $self->rtimeout,
      ssl_opts => {
         verify_hostname => $self->ssl_verify,
         SSL_verify_mode => $ssl_verify,
         SSL_ca_file => Mozilla::CA::SSL_ca_file(),
         # SNI support - defaults to PeerHost
         # SSL_hostname => 'hostname',
      },
   );

   my $mechanize = 'WWW::Mechanize';
   if ($self->do_javascript) {
      if ($self->brik_has_module('WWW::Mechanize::PhantomJS')
      &&  $self->brik_has_binary('phantomjs')) {
         $mechanize = 'WWW::Mechanize::PhantomJS';
      }
      else {
         return $self->log->error("create_user_agent: module [WWW::Mechanize::PhantomJS] not found, cannot do_javascript");
      }
   }
   if ((! $self->do_redirects) && $mechanize eq 'WWW::Mechanize::PhantomJS') {
      $self->log->warning("create_user_agent: module [WWW::Mechanize::PhantomJS] does ".
         "not support do_redirects, won't use it.");
   }
   elsif ($self->do_redirects) {
      $args{max_redirect} = $self->max_redirects;
   }
   else {  # Follow redirects not wanted
      $args{max_redirect} = 0;
   }

   my $src_ip = $self->src_ip;
   if (defined($src_ip)) {
      my $na = Metabrik::Network::Address->new_from_brik_init($self) or return;
      if (! $na->is_ip($src_ip)) {
         return $self->log->error("create_user_agent: src_ip [$src_ip] is invalid");
      }
      $args{local_address} = $src_ip;
   }

   my $mech = $mechanize->new(%args);
   if (! defined($mech)) {
      return $self->log->error("create_user_agent: unable to create WWW::Mechanize object");
   }

   if ($self->user_agent) {
      $mech->agent($self->user_agent);
   }
   else {
      # Some WWW::Mechanize::* modules can't do that
      if ($mech->can('agent_alias')) {
         $mech->agent_alias('Linux Mozilla');
      }
   }

   $username = defined($username) ? $username : $self->username;
   $password = defined($password) ? $password : $self->password;
   if (defined($username) && defined($password)) {
      $self->log->debug("create_user_agent: using Basic authentication");
      $mech->cookie_jar({});
      $mech->credentials($username, $password);
   }

   if ($self->log->level > 2) {
      $mech->add_handler("request_send",  sub { shift->dump; return });
      $mech->add_handler("response_done", sub { shift->dump; return });
   }

   return $mech;
}

sub reset_user_agent {
   my $self = shift;

   $self->client(undef);

   return 1;
}

sub _method {
   my $self = shift;
   my ($uri, $username, $password, $method, $data) = @_;

   $uri ||= $self->uri;
   $self->brik_help_run_undef_arg($method, $uri) or return;

   $self->timeout(0);

   $username = defined($username) ? $username : $self->username;
   $password = defined($password) ? $password : $self->password;
   my $client = $self->client;
   if (! defined($self->client)) {
      $client = $self->create_user_agent($uri, $username, $password) or return;
      $self->client($client);
   }

   my $add_headers = $self->add_headers;
   if (defined($add_headers)) {
      for my $k (keys %$add_headers) {
         my $v = $add_headers->{$k};
         if (ref($v) eq 'ARRAY') {
            my $this = join('; ', @$v);
            $client->add_header($k => $this);
         }
         else {
            $client->add_header($k => $v);
         }
      }
   }

   $self->log->verbose("$method: $uri");

   my $response;
   eval {
      if ($method ne 'get' && ref($client) eq 'WWW::Mechanize::PhantomJS') {
         return $self->log->error("$method: method not supported by WWW::Mechanize::PhantomJS");
      }
      if ($method eq 'post' || $method eq 'put') {
         $response = $client->$method($uri, Content => $data);
      }
      elsif ($method eq 'patch') {
         # https://stackoverflow.com/questions/23910962/how-to-send-a-http-patch-request-with-lwpuseragent
         my $req = HTTP::Request::Common::PATCH($uri, [ %$data ]);
         $response = $client->request($req);
      }
      elsif ($method eq 'options' || $method eq 'patch') {
         my $req = HTTP::Request->new($method, $uri, $add_headers);
         $response = $client->request($req);
      }
      else {
         $response = $client->$method($uri);
      }
   };
   if ($@) {
      chomp($@);
      if ($@ =~ /read timeout/i) {
         $self->timeout(1);
      }
      return $self->log->error("$method: unable to use method [$method] to uri [$uri]: $@");
   }

   $self->_last($response);

   my %r = ();
   $r{code} = $response->code;
   if (! $self->ignore_content) {
      if ($self->do_javascript) {
         # decoded_content method is available in WWW::Mechanize::PhantomJS
         # but is available in HTTP::Request response otherwise.
         $r{content} = $client->decoded_content;
      }
      else {
         $r{content} = $response->decoded_content;
      }
   }

   # Error messages seen from IO::Socket::SSL module.
   if ($r{content} =~ /^Can't connect to .+Connection timed out at /is) {
      $self->timeout(1);
      return $self->log->error("$method: $uri: connection timed out");
   }
   elsif ($r{content} =~ /^Can't connect to .+?\n\n(.+?) at /is) {
      return $self->log->error("$method: $uri: ".lcfirst($1));
   }
   elsif ($r{content} =~ /^Connect failed: connect: Interrupted system call/i) {
      return $self->log->error("$method: $uri: connection interrupted by syscall");
   }

   my $headers = $response->headers;
   $r{headers} = { map { $_ => $headers->{$_} } keys %$headers };
   delete $r{headers}->{'::std_case'};

   return \%r;
}

sub get {
   my $self = shift;
   my ($uri, $username, $password) = @_;

   return $self->_method($uri, $username, $password, 'get');
}

sub cat {
   my $self = shift;
   my ($uri, $username, $password) = @_;

   $self->_method($uri, $username, $password, 'get') or return;
   return $self->content;
}

sub post {
   my $self = shift;
   my ($href, $uri, $username, $password) = @_;

   $self->brik_help_run_undef_arg('post', $href) or return;

   return $self->_method($uri, $username, $password, 'post', $href);
}

sub put {
   my $self = shift;
   my ($href, $uri, $username, $password) = @_;

   $self->brik_help_run_undef_arg('put', $href) or return;

   return $self->_method($uri, $username, $password, 'put', $href);
}

sub patch {
   my $self = shift;
   my ($href, $uri, $username, $password) = @_;

   $self->brik_help_run_undef_arg('patch', $href) or return;

   return $self->_method($uri, $username, $password, 'patch', $href);
}

sub delete {
   my $self = shift;
   my ($uri, $username, $password) = @_;

   return $self->_method($uri, $username, $password, 'delete');
}

sub options {
   my $self = shift;
   my ($uri, $username, $password) = @_;

   return $self->_method($uri, $username, $password, 'options');
}

sub head {
   my $self = shift;
   my ($uri, $username, $password) = @_;

   return $self->_method($uri, $username, $password, 'head');
}

sub code {
   my $self = shift;

   my $last = $self->_last;
   if (! defined($last)) {
      return $self->log->error("code: you have to execute a request first");
   }

   return $last->code;
}

sub content {
   my $self = shift;

   my $last = $self->_last;
   if (! defined($last)) {
      return $self->log->error("content: you have to execute a request first");
   }

   if ($self->do_javascript) {
      # decoded_content method is available in WWW::Mechanize::PhantomJS
      # but is available in HTTP::Request response otherwise.
      my $client = $self->client;
      return $client->decoded_content;
   }

   return $last->decoded_content;
}

sub get_content {
   my $self = shift;
   my @args = @_;

   $self->get(@args) or return;
   return $self->content;
}

sub post_content {
   my $self = shift;
   my @args = @_;

   $self->post(@args) or return;
   return $self->content;
}

sub save_content {
   my $self = shift;
   my ($output) = @_;

   my $last = $self->_last;
   if (! defined($last)) {
      return $self->log->error("save_content: you have to execute a request first");
   }

   eval {
      $self->client->save_content($output);
   };
   if ($@) {
      chomp($@);
      return $self->log->error("save_content: unable to save content: $@");
   }

   return 1;
}

sub headers {
   my $self = shift;

   my $last = $self->_last;
   if (! defined($last)) {
      return $self->log->error("headers: you have to execute a request first");
   }

   return $last->headers;
}

#
# Alias for headers Command
#
sub get_response_headers {
   my $self = shift;

   return $self->headers;
}

#
# Remove one header for next request.
#
sub delete_request_header {
   my $self = shift;
   my ($header) = @_;

   $self->brik_help_run_undef_arg('delete_header', $header) or return;

   my $headers = $self->add_headers;
   my $value = $headers->{$header} || 'undef';
   delete $headers->{$header};

   return $value;
}

#
# Return one header from last response.
#
sub get_response_header {
   my $self = shift;
   my ($header) = @_;

   $self->brik_help_run_undef_arg('get_header', $header) or return;

   my $headers = $self->headers or return;
   if (exists($headers->{$header})) {
      return $headers->{$header};
   }

   $self->log->verbose("get_header: header [$header] not found");

   return 0;
}

#
# Set header for next request.
#
sub set_request_header {
   my $self = shift;
   my ($header, $value) = @_;

   $self->brik_help_run_undef_arg('set_request_header', $header) or return;
   $self->brik_help_run_undef_arg('set_request_header', $value) or return;

   my $headers = $self->add_headers;
   $headers->{$header} = $value;

   return $value;
}

sub links {
   my $self = shift;

   my $last = $self->_last;
   if (! defined($last)) {
      return $self->log->error("links: you have to execute a request first");
   }

   my @links = ();
   for my $l ($self->client->links) {
      push @links, $l->url;
      $self->log->verbose("links: found link [".$l->url."]");
   }

   return \@links;
}

sub forms {
   my $self = shift;

   my $last = $self->_last;
   if (! defined($last)) {
      return $self->log->error("forms: you have to execute a request first");
   }

   my $client = $self->client;

   if ($self->log->level > 2) {
      print Data::Dumper::Dumper($last->headers)."\n";
   }

   # We use our own "manual" way to get access to content:
   # WWW::Mechanize::PhantomJS is clearly broken, and we have to support
   # WWW::Mechanize also. At some point, we should write a good WWW::Mechanize::PhantomJS
   # module.
   #my @forms = $client->forms;
   my $content = $self->content or return;
   my @forms = HTML::Form->parse($content, $client->base);

   my @result = ();
   for my $form (@forms) {
      my $name = $form->{attr}{name} || 'undef';
      my $action = $form->{action};
      my $method = $form->{method} || 'undef';

      my $h = {
         action => $action->as_string,
         method => $method,
      };

      for my $input (@{$form->{inputs}}) {
         my $type = $input->{type} || '';
         my $name = $input->{name} || '';
         my $value = $input->{value} || '';
         if ($type ne 'submit') {
            $h->{input}{$name} = $value;
         }
      }

      push @result, $h;
   }

   return \@result;
}

sub trace_redirect {
   my $self = shift;
   my ($uri, $username, $password) = @_;

   $uri ||= $self->uri;
   $self->brik_help_run_undef_arg('trace_redirect', $uri) or return;

   my $prev = $self->do_redirects;
   $self->do_redirects(0);

   my @results = ();

   my $location = $uri;
   # Max 20 redirects
   for (1..20) {
      $self->log->verbose("trace_redirect: $location");

      my $response;
      eval {
         $response = $self->get($location);
      };
      if ($@) {
         chomp($@);
         return $self->log->error("trace_redirect: unable to get uri [$uri]: $@");
      }

      my $this = {
         uri => $location,
         code => $self->code,
      };
      push @results, $this;

      if ($this->{code} != 302 && $this->{code} != 301) {
         last;
      }

      $location = $this->{location} = $self->headers->{location};
   }

   $self->do_redirects($prev);

   return \@results;
}

sub screenshot {
   my $self = shift;
   my ($uri, $output) = @_;

   $self->brik_help_run_undef_arg('screenshot', $uri) or return;
   $self->brik_help_run_undef_arg('screenshot', $output) or return;

   if ($self->brik_has_module('WWW::Mechanize::PhantomJS')
   &&  $self->brik_has_binary('phantomjs')) {
      my $mech = WWW::Mechanize::PhantomJS->new
         or return $self->log->error("screenshot: PhantomJS failed");

      my $get = $mech->get($uri)
         or return $self->log->error("screenshot: get uri [$uri] failed");

      my $data = $mech->content_as_png
         or return $self->log->error("screenshot: content_as_png failed");

      my $write = Metabrik::File::Write->new_from_brik_init($self) or return;
      $write->encoding('ascii');
      $write->overwrite(1);
      $write->append(0);

      $write->open($output) or return $self->log->error("screenshot: open failed");
      $write->write($data) or return $self->log->error("screenshot: write failed");
      $write->close;

      return $output;
   }

   return $self->log->error("screenshot: optional module [WWW::Mechanize::PhantomJS] and optional binary [phantomjs] are not available");
}

sub eval_javascript {
   my $self = shift;
   my ($js) = @_;

   $self->brik_help_run_undef_arg('eval_javascript', $js) or return;

   # Perl module Wight may also be an option.

   if ($self->brik_has_module('WWW::Mechanize::PhantomJS')
   &&  $self->brik_has_binary('phantomjs')) {
      my $mech = WWW::Mechanize::PhantomJS->new(launch_arg => ['ghostdriver/src/main.js'])
         or return $self->log->error("eval_javascript: PhantomJS failed");

      return $mech->eval_in_page($js);
   }

   return $self->log->error("eval_javascript: optional module [WWW::Mechanize::PhantomJS] ".
      "and optional binary [phantomjs] are not available");
}

sub info {
   my $self = shift;
   my ($uri) = @_;

   $uri ||= $self->uri;
   $self->brik_help_run_undef_arg('info', $uri) or return;

   my $r = $self->get($uri) or return;
   my $headers = $r->{headers};

   # Taken from apps.json from Wappalyzer
   my @headers = qw(
      IBM-Web2-Location
      X-Drupal-Cache
      X-Powered-By
      X-Drectory-Script
      Set-Cookie
      X-Powered-CMS
      X-KoobooCMS-Version
      X-ATG-Version
      User-Agent
      X-Varnish
      X-Compressed-By
      X-Firefox-Spdy
      X-ServedBy
      MicrosoftSharePointTeamServices
      Set-Cookie
      Generator
      X-CDN
      Server
      X-Tumblr-User
      X-XRDS-Location
      X-Content-Encoded-By
      X-Ghost-Cache-Status
      X-Umbraco-Version
      X-Rack-Cache
      Liferay-Portal
      X-Flow-Powered
      X-Swiftlet-Cache
      X-Lift-Version
      X-Spip-Cache
      X-Wix-Dispatcher-Cache-Hit
      COMMERCE-SERVER-SOFTWARE
      X-AMP-Version
      X-Powered-By-Plesk
      X-Akamai-Transformed
      X-Confluence-Request-Time
      X-Mod-Pagespeed
      Composed-By
      Via
   );

   if ($self->log->level > 2) {
      print Data::Dumper::Dumper($headers)."\n";
   }

   my %info = ();
   for my $hdr (@headers) {
      my $this = $headers->header(lc($hdr));
      $info{$hdr} = $this if defined($this);
   }

   my $title = $r->{title};
   if (defined($title)) {
      print "Title: $title\n";
   }

   for my $k (sort { $a cmp $b } keys %info) {
      print "$k: ".$info{$k}."\n";
   }

   return 1;
}

sub mirror {
   my $self = shift;
   my ($url, $output, $datadir) = @_;

   $datadir ||= $self->datadir;
   $self->brik_help_run_undef_arg('mirror', $url) or return;
   my $ref = $self->brik_help_run_invalid_arg('mirror', $url, 'SCALAR', 'ARRAY') or return;

   my @files = ();
   if ($ref eq 'ARRAY') {
      $self->brik_help_run_empty_array_arg('mirror', $url) or return;

      for my $this (@$url) {
         my $file = $self->mirror($this, $output) or next;
         push @files, @$file;
      }
   }
   else {
      if ($url !~ /^https?:\/\// && $url !~ /^ftp:\/\//) {
         return $self->log->error("mirror: invalid URL [$url]");
      }

      my $sf = Metabrik::System::File->new_from_brik_init($self) or return;
      if (! defined($output)) {
         my $filename = $sf->basefile($url) or return;
         $output = $datadir.'/'.$filename;
      }
      else { # $output is defined
         if (! $sf->is_absolute($output)) {  # We want default datadir for output file
            $output = $datadir.'/'.$output;
         }
      }

      $self->log->debug("mirror: url[$url] output[$output]");

      my $mech = $self->create_user_agent or return;
      LWP::UserAgent::ProgressAny::__add_handlers($mech);
      Progress::Any::Output->set("TermProgressBarColor");

      my $rc;
      eval {
         $rc = $mech->mirror($url, $output);
      };
      if ($@) {
         chomp($@);
         return $self->log->error("mirror: mirroring URL [$url] to local file [$output] failed: $@");
      }
      my $code = $rc->code;
      $self->_last_code($code);
      if ($code == 200) {
         push @files, $output;
         $self->log->verbose("mirror: downloading URL [$url] to local file [$output] done");
      }
      elsif ($code == 304) { # Not modified
         $self->log->verbose("mirror: file [$output] not modified since last check");
      }
      else {
         return $self->log->error("mirror: error while mirroring URL [$url] with code: [$code]");
      }
   }

   return \@files;
}

sub parse {
   my $self = shift;
   my ($html) = @_;

   $self->brik_help_run_undef_arg('parse', $html) or return;

   return HTML::TreeBuilder->new_from_content($html);
}

sub get_last {
   my $self = shift;

   return $self->_last;
}

sub get_last_code {
   my $self = shift;

   return $self->_last_code;
}

1;

__END__

=head1 NAME

Metabrik::Client::Www - client::www Brik

=head1 COPYRIGHT AND LICENSE

Copyright (c) 2014-2022, Patrice E<lt>GomoRE<gt> Auffret

You may distribute this module under the terms of The BSD 3-Clause License.
See LICENSE file in the source distribution archive.

=head1 AUTHOR

Patrice E<lt>GomoRE<gt> Auffret

=cut


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.