IMDB-JSON/lib/IMDB/JSON.pm
=head1 PACKAGE
IMDB::JSON
=head1 DESCRIPTION
Search IMDB for a specific title, process the result and extract the JSON script within. Process the JSON script and return a hash reference.
=cut
package IMDB::JSON;
$IMDB::JSON::VERSION = "0.05";
use strict;
use HTML::TokeParser;
use LWP::Simple qw($ua get);
use IO::Socket::SSL;
use JSON::XS;
=head1 SYNOPSIS
use IMDB::JSON;
use Data::Dumper;
my $IMDB = IMDB::JSON->new;
print Dumper($IMDB->search("The Thing", 1982));
exit;
=head1 METHODS
=head2 new(opt => value);
Create a new IMDB::JSON object, options can be passed to the object by specifying them
=head3 OPTIONS
=over
=item base_url
The base URL to start from. This is usually https://www.imdb.com
=item raw_json
If true, returns only raw JSON text, it's not processed into an hash reference
=item user_agent
Set the User-Agent you want to send with the request
=item debug
If true, print debug messages to STDERR
=back
=cut
sub new {
my ($CLASS, %o) = @_;
return bless {
base_url => ($o{base_url} ? $o{base_url} : 'https://www.imdb.com'),
raw_json => ($o{raw_json} ? 1 : 0),
user_agent => $o{user_agent},
debug => $o{debug}
};
}
sub _get {
my ($self, $URL) = @_;
$ENV{'PERL_LWP_SSL_VERIFY_HOSTNAME'} = 0;
my $ua = LWP::UserAgent->new(
ssl_opts => {
verify_hostname => 0,
SSL_verify_mode => IO::Socket::SSL::SSL_VERIFY_NONE,
}
);
# Set the user agent to something
$ua->agent($self->{user_agent}) if $self->{user_agent};
print STDERR "DEBUG: fetch URL: $URL\n" if $self->{debug};
my $req = HTTP::Request->new( GET => $URL);
my $response = $ua->request($req);
return $response->content;
}
#URI encoding
sub _enc {
my ($self, $data) = @_;
$data =~ s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/eg;
return $data;
}
# Process IMDB search results
sub _result {
my ($self, $title, $year) = @_;
my $data = $self->_get($self->{base_url} . '/search/title?title=' . $self->_enc($title) . '&release_date=' . $year . '-01-01,' . $year . '-12-31&view=simple');
print STDERR "DEBUG: " . length($data) . " bytes of data received\n" if $self->{debug};
return if !$data;
my $url;
# Process the results data (must be reference scalar!)
my $p = HTML::TokeParser->new(\$data);
$self->{_cur_id} = '';
# <meta property="imdb:pageConst" content="tt11125620"/>
while(my $t = $p->get_tag('meta')){
if($t->[1]->{content} =~ /(tt\d+)/){
$self->{_cur_id} = $1;
last;
} else {
use Data::Dumper; print Dumper($t);
}
}
$p = HTML::TokeParser->new(\$data);
# Walk down to the results section
while(my $t = $p->get_tag('div')){
last if($t->[1]->{class} eq 'lister-item mode-simple');
}
# Walk through the results and match the correct one
while(my $t = $p->get_tag('span')){
# Found a results chunk
if($t->[1]->{class} eq "lister-item-header"){
# Grab the href and text
my $t = $p->get_tag('a');
$url = $t->[1]->{href};
# Grab the title and year
my $txt = $p->get_trimmed_text;
$t = $p->get_tag('span');
my $yr = $p->get_trimmed_text;
# Check and see if they match
if($title eq $txt && "($year)" eq $yr){
return $url;
} elsif($self->{debug}){
print STDERR "DEBUG: result miss: $txt / $yr\n";
}
}
}
return;
}
=head1 search(tilte, year)
Attempt to return IMDB data for the movie / show B<tilte> published in B<year>
=cut
sub search {
my ($self, $title, $year) = @_;
my $url = $self->_result($title, $year);
return if !$url;
my $data = $self->_get($url =~ /^https?:\/\// ? $url : $self->{base_url} . ($url =~ /^\// ? $url : '/' . $url));
print STDERR "DEBUG: " . length($data) . " bytes of data received\n" if $self->{debug};
return if !$data;
return $self->_get_json($data);
}
sub _get_json {
my ($self, $data) = @_;
my $p = HTML::TokeParser->new(\$data);
while(my $t = $p->get_tag('script')){
last if($t->[1]->{type} eq "application/ld+json");
}
my $json = $p->get_text;
if(!$json){
$@ = "JSON script not found!";
print STDERR "DEBUG: $@\n" if $self->{debug};
return;
} else {
my $jsn = decode_json($json);
$jsn->{id} = $self->{_cur_id};
return ($self->{raw_json} ? $json : $jsn);
}
}
=head1 byid(imdb_id)
Returns JSON results for B<imdb_id>
=cut
sub byid {
my ($self, $id) = @_;
my $data = $self->_get($self->{base_url} . '/title/' . $id . '/');
return if !$data;
return $self->_get_json($data);
}
1;
=head1 AUTHOR
Colin Faber <cfaber@fpsn.net>
=head1 BUGS
Report all bugs on https://rt.cpan.org OR email me directly
=head1 COPYRIGHT
IMDB::JSON is Copyright (C) 2018, by Colin Faber.
It is free software; you can redistribute it and/or modify it under the terms of either:
a) the GNU General Public License as published by the Free Software Foundation; either version 1, or (at your option) any later version, or
b) the "Perl Artistic License".