Yet Another CPAN Grep

Genealogy-ChroniclingAmerica/lib/Genealogy/ChroniclingAmerica.pm

package Genealogy::ChroniclingAmerica;

# OLD API
# https://chroniclingamerica.loc.gov/search/pages/results/?date1=1912&state=Indiana&format=json&andtext=ralph%3Dbixler

# TODO: NEW API
# https://libraryofcongress.github.io/data-exploration/loc.gov%20JSON%20API/Chronicling_America/README.html
# https://libraryofcongress.github.io/data-exploration/loc.gov%20JSON%20API/Chronicling_America/ChronAm-download_results.html
# https://www.loc.gov/collections/chronicling-america/?dl=page&end_date=1912-12-31&ops=PHRASE&qs=ralph+bixler&searchType=advanced&start_date=1912-01-01&fo=json

use warnings;
use strict;

use Carp;
use CHI;
use LWP::UserAgent;
use JSON::MaybeXS;
use Object::Configure;
use Params::Get 0.13;
use Scalar::Util;
use Return::Set 0.02;
use URI;

=head1 NAME

Genealogy::ChroniclingAmerica - Find URLs for a given person on the Library of Congress Newspaper Records

=head1 VERSION

Version 0.07

=cut

our $VERSION = '0.07';

=head1 SYNOPSIS

    use HTTP::Cache::Transparent;  # be nice
    use Genealogy::ChroniclingAmerica;

    HTTP::Cache::Transparent::init({
	BasePath => '/tmp/cache'
    });
    my $loc = Genealogy::ChroniclingAmerica->new({
	firstname => 'John',
	lastname => 'Smith',
	state => 'Indiana',
	date_of_death => 1862
    });

    while(my $url = $loc->get_next_entry()) {
	print "$url\n";
    }

=head1 DESCRIPTION

The B<Genealogy::ChroniclingAmerica> module allows users to search for historical newspaper records from the B<Chronicling America> archive,
maintained by the Library of Congress.
By providing a person's first name,
last name,
and state,
the module constructs and executes search queries,
retrieving URLs to relevant newspaper pages in JSON format.
It supports additional filters like date of birth and date of death,
enforces B<rate-limiting> to comply with API request limits,
local caching,
and includes robust error handling and validation.
Ideal for genealogy research,
this module streamlines access to historical newspaper archives with an easy-to-use interface.

=over 4

=item * Rate-Limiting

A minimum interval between successive API calls can be enforced to ensure that the API is not overwhelmed and to comply with any request throttling requirements.

Rate-limiting is implemented using L<Time::HiRes>.
A minimum interval between API
calls can be specified via the C<min_interval> parameter in the constructor.
Before making an API call,
the module checks how much time has elapsed since the
last request and,
if necessary,
sleeps for the remaining time.

=back

=head1 SUBROUTINES/METHODS

=head2 new

Creates a Genealogy::ChroniclingAmerica object.

It takes three mandatory arguments:

=over 4

=item * C<firstname>

=item * C<lastname>

=item * C<state> - Must be the full name,
not an abbreviation.

=back

Accepts the following optional arguments:

=over 4

=item * C<cache>

A caching object.
If not provided,
an in-memory cache is created with a default expiration of one hour.

=item * C<middlename>

=item * C<date_of_birth>

=item * C<date_of_death>

=item * C<host> - The domain of the site to search, the default is L<https://chroniclingamerica.loc.gov>.

=item * C<ua> - An object that understands get and env_proxy messages,
such as L<LWP::UserAgent::Throttled>.

=item * C<min_interval> - Amount to rate limit.
Defaults to 3 seconds,
inline with L<https://libraryofcongress.github.io/data-exploration/loc.gov%20JSON%20API/Chronicling_America/README.html#rate-limits>

=back

=cut

sub new {
	my $class = shift;

	return unless(defined($class));

	# Handle hash or hashref arguments
	my $params = Params::Get::get_params(undef, \@_) || {};

	if(Scalar::Util::blessed($class)) {
		# If $class is an object, clone it with new arguments
		return bless { %{$class}, %{$params} }, ref($class);
	}

	unless($params->{'firstname'}) {
		Carp::croak('Firstname is not optional');
		return;	# Don't know why this is needed, but it is
	}

	# Fail when the input is just a set of numbers
	if($params->{'firstname'} !~ /\D/) {
		Carp::croak('Usage: ', __PACKAGE__, ": invalid input to new(), $params->{firstname}");
		return;
	}

	unless(defined($params->{'lastname'})) {
		Carp::croak('Lastname is not optional');
		return;
	}

	# Fail when the input is just a set of numbers
	if($params->{'lastname'} !~ /\D/) {
		Carp::croak('Usage: ', __PACKAGE__, ": invalid input to new(), $params->{lastname}");
		return;
	}

	unless($params->{'state'}) {
		Carp::croak('State is not optional');
		return;
	}

	if(length($params->{'state'}) == 2) {
		Carp::croak('State needs to be the full name');
		return;
	}

	# Fail when the input contains a number
	if($params->{'state'} =~ /\d/) {
		Carp::croak('Usage: ', __PACKAGE__, ": invalid input to new(), $params->{state}");
		return;
	}

	my $ua = $params->{'ua'};
	if(!defined($ua)) {
		my $ssl_opts;
		if(-r '/etc/ssl/certs/ca-certificates.crt') {	# Linux
			$ssl_opts = {
				'SSL_ca_file' => '/etc/ssl/certs/ca-certificates.crt',
				verify_hostname => 1
			}
		} elsif(-r '/opt/homebrew/etc/ca-certificates/cert.pem') {	# MacOS
			$ssl_opts = {
				'SSL_ca_file' => '/opt/homebrew/etc/ca-certificates/cert.pem',
				verify_hostname => 1
			}
		} else {
			$ssl_opts = { verify_hostname => 0 };
		}
		$ua = LWP::UserAgent->new(
			ssl_opts => $ssl_opts,
			agent => __PACKAGE__ . "/$VERSION"
		);
		$ua->env_proxy(1);
	}

	$params = Object::Configure::configure($class, $params);

	# Set up caching (default to an in-memory cache if none provided)
	my $cache = $params->{cache} || CHI->new(
		driver => 'Memory',
		global => 1,
		expires_in => '1 hour',
	);

	# Set up rate-limiting: minimum interval between requests (in seconds)
	# From https://libraryofcongress.github.io/data-exploration/loc.gov%20JSON%20API/Chronicling_America/README.html#rate-limits
	# Burst Limit: 20 requests per 1 minute, Block for 5 minutes
	my $min_interval = $params->{min_interval} || 4;	# default: four second delay

	my $rc = {
		%{$params},
		min_interval => $min_interval,
		ua => $ua,
		host => $params->{'host'} || 'www.loc.gov',
		path => 'collections/chronicling-america',
		cache => $cache,
	};

	my %query_parameters = ( 'fo' => 'json', 'location_state' => ucfirst(lc($params->{'state'})), 'ops' => 'PHRASE', 'searchType' => 'advanced' );
	if($query_parameters{'location_state'} eq 'District of columbia') {
		$query_parameters{'location_state'} = 'District of Columbia';
	}
	my $name = $params->{'firstname'};
	if($params->{'middlename'}) {
		$rc->{'name'} = "$name $params->{middlename} $params->{lastname}";
		$name .= '+' . $params->{middlename};
	} else {
		$rc->{'name'} = "$name $params->{lastname}";
	}
	$name .= "+$params->{lastname}";

	$name =~ s/\s/+/g;

	$query_parameters{'qs'} = $name;
	if($params->{'date_of_birth'}) {
		$query_parameters{'start_date'} = $params->{'date_of_birth'};
	}
	if($params->{'date_of_death'}) {
		$query_parameters{'end_date'} = $params->{'date_of_death'};
	}

	# Just scanning for one year
	$query_parameters{'start_date'} ||= $params->{'date_of_death'};
	$query_parameters{'end_date'} ||= $params->{'date_of_birth'};

	$query_parameters{'start_date'} .= '-01-01' if($query_parameters{'start_date'});
	$query_parameters{'end_date'} .= '-12-31' if($query_parameters{'end_date'});

	my $uri = URI->new("https://$rc->{host}/$rc->{path}");
	$uri->query_form(%query_parameters);
	my $url = $uri->as_string();
	# ::diag(">>>>$url = ", $rc->{'name'});
	# print ">>>>$url = ", $rc->{'name'}, "\n";

	my $items = _get_items($ua, $url);

	# Update last_request timestamp
	$rc->{'last_request'} = time();

	if(scalar(@{$items})) {
		# Add 'fo=json' to the end of each row
		my @rc;
		for my $item (@{$items}) {
			unless($item->{'id'} =~ /&fo=json$/) {
				$item->{'id'} .= '&fo=json';
			}
			push @rc, $item;
		}
		$rc->{'items'} = \@rc;
		$rc->{'index'} = 0;
		$rc->{'matches'} = scalar(@rc);
	} else {
		$rc->{'matches'} = 0;
	}

	return bless $rc, $class;
}

=head2 get_next_entry

Returns the next match as a URL.

=cut

sub get_next_entry
{
	my $self = shift;

	# Exit if no matches or index out of bounds
	return if($self->{'matches'} == 0) || ($self->{'index'} >= $self->{'matches'});

	# Retrieve the next entry and increment index
	my $entry = $self->{'items'}->[$self->{'index'}++];

	# ::diag(Data::Dumper->new([$entry])->Dump());

	# Create a cache key based on the location, date and time zone (might want to use a stronger hash function if needed)
	my $cache_key = "loc:$entry->{id}";
	if(my $cached = $self->{cache}->get($cache_key)) {
		return $cached;
	}

	# Enforce rate-limiting: ensure at least min_interval seconds between requests.
	my $now = time();
	my $elapsed = $now - $self->{last_request};
	if($elapsed < $self->{min_interval}) {
		Time::HiRes::sleep($self->{min_interval} - $elapsed);
	}

	# Make the API request
	# ::diag(__LINE__);
	# ::diag(Data::Dumper->new([$entry])->Dump());
	# ::diag($entry->{'id'});
	my $resp = $self->{'ua'}->get($entry->{'id'});
	# ::diag(__LINE__);
	# ::diag(Data::Dumper->new([$resp])->Dump());

	# Update last_request timestamp
	$self->{last_request} = time();

	# Handle error responses
	if($resp->is_error()) {
		# print 'got: ', $resp->content(), "\n";
		Carp::carp("get_next_entry: API returned error on $entry->{id}: ", $resp->status_line()) unless($resp->code() == 404);
		return;
	}

	unless($resp->is_success()) {
		Carp::croak($resp->status_line());
	}

	my $data = decode_json($resp->decoded_content());

	my $full_text = $data->{'full_text'};
	if(!defined($full_text)) {
		return $self->get_next_entry();
	}

	$full_text =~ s/[\r\n]/ /g;
        if($full_text !~ /$self->{'name'}/ims) {
                return $self->get_next_entry();
        }

	# ::diag(__LINE__);
	# ::diag($data->{full_text});
	foreach my $page(@{$data->{'page'}}) {
		if($page->{'mimetype'} eq 'application/pdf') {
			# Cache the result before returning it
			$self->{'cache'}->set($cache_key, $page->{'url'});
			return Return::Set::set_return($page->{'url'}, { type => 'string', 'min' => 5, matches => qr/\.pdf$/ });
		}
	}
}

# This is the sample code at https://libraryofcongress.github.io/data-exploration/loc.gov%20JSON%20API/Chronicling_America/ChronAm-download_results.html
#	translated into Perl

# Run P1 search and get a list of results
sub _get_items
{
	my ($ua, $url, $items_ref, $conditional, $depth) = @_;

	$items_ref ||= [];
	$conditional ||= 'True';
	$depth ||= 0;

	# Check that the query URL is not an item or resource link
	my @exclude = ('loc.gov/item', 'loc.gov/resource');
	for my $string (@exclude) {
		if (index($url, $string) != -1) {
			Carp::croak('Your URL points directly to an item or ',
			  'resource page (you can tell because "item" ',
			  'or "resource" is in the URL). Please use ',
			  'a search URL instead. For example, instead ',
			  'of "https://www.loc.gov/item/2009581123/", ',
			  'try "https://www.loc.gov/maps/?q=2009581123".');
		}
	}

	# Create URI object and add parameters
	my $uri = URI->new($url);
	$uri->query_form(
		$uri->query_form(),
		fo => 'json',
		c => 100,
		at => 'results,pagination'
	);

	# Make HTTP request
	# ::diag(__LINE__);
	# ::diag($uri);
	my $response = $ua->get($uri);

	# Check that the API request was successful
	if($response->is_success() && $response->header('Content-Type') && ($response->header('Content-Type') =~ /json/)) {
		my $data = decode_json($response->decoded_content());
		my $results = $data->{results};

		for my $result(@{$results}) {
			# Filter out anything that's a collection or web page
			my $original_format = $result->{original_format} || [];
			my $filter_out = 0;

			# Check if original_format contains "collection" or "web page"
			for my $format (@$original_format) {
				if ($format =~ /collection/i || $format =~ /web page/i) {
					$filter_out = 1;
					last;
				}
			}

			# Evaluate conditional (simplified - assumes 'True' means true)
			if ($conditional ne 'True') {
				$filter_out = 1;
			}

			unless ($filter_out) {
				# Get the link to the item record
				if (my $item = $result->{id}) {
					# Filter out links to Catalog or other platforms
					if ($item =~ /^http:\/\/www\.loc\.gov\/resource/) {
						# my $resource = $item; # Assign item to resource
						# push @$items_ref, $resource;
						push @$items_ref, $result;
					}
					if ($item =~ /^http:\/\/www\.loc\.gov\/item/) {
						push @$items_ref, $result;
					}
				}
			}
		}

		# Repeat the loop on the next page, unless we're on the last page
		# Put the $depth in case the end of list code doesn't work
		if(($depth <= 10) && defined(my $next_url = $data->{pagination}->{next})) {
			_get_items($ua, $next_url, $items_ref, $conditional, $depth + 1);
		}

		return $items_ref;
	}
	Carp::carp($url, ': ', $response->status_line());
	return $items_ref;
}

=head1 AUTHOR

Nigel Horne, C<< <njh at nigelhorne.com> >>

=head1 BUGS

If a middle name is given and no match is found,
it should search again without the middle name.

=head1 SEE ALSO

=item * L<https://github.com/nigelhorne/gedcom>

=item * L<https://chroniclingamerica.loc.gov>

=item * L<https://github.com/LibraryOfCongress/data-exploration>

=head1 SUPPORT

This module is provided as-is without any warranty.

Please report any bugs or feature requests to C<bug-genealogy-chroniclingamerica at rt.cpan.org>,
or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Genealogy-ChroniclingAmerica>.
I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.

You can find documentation for this module with the perldoc command.

    perldoc Genealogy::ChroniclingAmerica

You can also look for information at:

=over 4

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Genealogy-ChroniclingAmerica>

=item * Search CPAN

L<https://metacpan.org/release/Genealogy-ChroniclingAmerica>

=back

=head1 LICENSE AND COPYRIGHT

Copyright 2018-2025 Nigel Horne.

This program is released under the following licence: GPL2

=cut

1; # End of Genealogy::ChroniclingAmerica
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.