Genealogy-Obituary-Parser/lib/Genealogy/Obituary/Parser.pm
package Genealogy::Obituary::Parser;
use strict;
use warnings;
use Carp;
use DateTime::Format::Text;
use Exporter 'import';
use JSON::MaybeXS;
use Params::Get 0.13;
use Return::Set 0.03;
use Params::Validate::Strict;
our @EXPORT_OK = qw(parse_obituary);
our $geocoder;
# TODO: use Lingua::EN::Tagger;
# TODO: add more general code, e.g. where it looks for father, also look for mother
# TODO: parse https://funeral-notices.co.uk/notice/adams/5244000
=head1 NAME
Genealogy::Obituary::Parser - Extract structured family relationships from obituary text
=head1 VERSION
Version 0.04
=cut
our $VERSION = '0.04';
=head1 SYNOPSIS
use Genealogy::Obituary::Parser qw(parse_obituary);
my $text = 'She is survived by her husband Paul, daughters Anna and Lucy, and grandchildren Jake and Emma.';
my $data = parse_obituary($text);
# $data = {
# spouse => ['Paul'],
# children => ['Anna', 'Lucy'],
# grandchildren => ['Jake', 'Emma'],
# };
=head1 DESCRIPTION
This module parses freeform obituary text and extracts structured family relationship data
for use in genealogical applications.
It parses obituary text and extract structured family relationship data, including details about children, parents, spouse, siblings, grandchildren, and other relatives.
=head1 FUNCTIONS
=head2 parse_obituary($text)
The routine processes the obituary content to identify and organize relevant family information into a clear, structured hash.
It returns a hash reference containing structured family information,
with each family member's data organized into distinct categories such as children, spouse, parents, siblings, etc.
Takes a string, or a ref to a string.
=head3 API SPECIFICATION
=head4 INPUT
{
'text' => {
'type' => 'string', # or stringref
'min' => 1,
'max' => 10000
}, 'geocoder' => { # used to geocode locations to verify they exist
'type' => 'object',
'can' => 'geocode',
'optional' => 1,
}
}
=head4 OUTPUT
=over 4
=item * No matches: undef
=back
{
type => 'hashref',
'min' => 1,
'max' => 10
}
=cut
sub parse_obituary
{
my $params = Params::Validate::Strict::validate_strict({
args => Params::Get::get_params('text', \@_),
schema => {
'text' => {
'type' => 'string',
'min' => 1,
'max' => 10000
}, 'geocoder' => {
'type' => 'object',
'can' => 'geocode',
'optional' => 1,
}
}
});
my $text = $params->{'text'};
if(my $geo = $params->{'geocoder'}) {
$geocoder = $geo;
}
Carp::croak(__PACKAGE__, ': Usage: parse_obituary($text)') unless defined($text);
if(ref($text) eq 'SCALAR') {
$text = ${$text};
}
# Quick scan to get started
sub parse_obituary_quick {
my $text = shift;
my %data;
my @patterns = (
[ qr/\bdaughters?\s+([^.,;]+)/i, 'children' ],
[ qr/\bsons?\s+([^.,;]+)/i, 'children' ],
[ qr/\bchildren\s+([^.,;]+)/i, 'children' ],
[ qr/\bgrandchildren\s+([^.;]+)/i, 'grandchildren' ],
[ qr/\bwife\s+([^.,;]+)/i, 'spouse' ],
[ qr/\bhusband\s+([^.,;]+)/i, 'spouse' ],
[ qr/\bhis parents were\s+([^.,;]+)/i,'parents' ],
[ qr/\bhis father was\s+([^.,;]+)/i, 'parents' ],
[ qr/\bhis mother was\s+([^.,;]+)/i, 'parents' ],
[ qr/\bsister(?:s)?\s+([^.,;]+)/i, 'siblings' ],
[ qr/\bbrother(?:s)?\s+([^.,;]+)/i, 'siblings' ],
[ qr/\bsiblings\s+([^.,;]+)/i, 'siblings' ],
);
for my $p (@patterns) {
my ($re, $field) = @$p;
while ($text =~ /$re/g) {
my $list = $1;
next unless $list;
# Robust splitting on commas and "and"
my @names = grep { length } map { s/^\s+|\s+$//gr } split /\s*(?:,|(?:\band\b))\s*/i, $list;
push @{$data{$field}}, map { { 'name' => $_ } } @names;
}
}
return \%data;
}
# my %family = %{parse_obituary_quick($text)};
my %family;
# Helper to extract people from a specific section and remove empty entries
sub extract_people_section {
my $section = shift;
return unless $section;
$section =~ s/\s+and\s+/, /g; # Ensure "and" is treated as a separator
$section =~ s/([A-Za-z]+),\s+([A-Z]{2})/$1<<COMMA>>$2/g;
my @entries = split /\s*,\s*/, $section;
my @people;
foreach my $entry (@entries) {
$entry =~ s/<<COMMA>>/, /g;
my ($name, $spouse, $location) = ('', '', '');
# Match "Ian (Terry) Girvan of Surrey, BC"
if ($entry =~ /^(\w+)\s+\(([^)]+)\)\s+(\w+)\s+of\s+(.+)$/) {
$name = "$1 $3"; $spouse = $2; $location = $4;
}
# Match "Gwen Steeves (Leslie) of Riverview, NB"
elsif ($entry =~ /^(.+?)\s+\(([^)]+)\)\s+of\s+(.+)$/) {
$name = $1; $spouse = $2; $location = $3;
}
# Match "Carol Girvan of Dartmouth, NS"
elsif ($entry =~ /^(.+?)\s+of\s+(.+)$/) {
$name = $1; $location = $2;
} else {
# Match names only (e.g. for siblings)
$name = $entry;
}
next if !$name; # Skip if name is empty
next if($name =~ /^father-in-law\sto\s/); # Skip follow ons
last if($name =~ /^devoted\s/i);
last if($name =~ /^loved\s/i);
# Create a hash and filter out blank fields
my %person = (
name => $name,
spouse => $spouse,
location => $location,
);
# Remove blank fields
%person = map { $_ => $person{$_} } grep { defined $person{$_} && $person{$_} ne '' } keys %person;
push @people, \%person;
}
return \@people;
}
sub extract_names_from_phrase {
my $phrase = shift;
my @names;
$phrase =~ s/[.;]//g;
# Case: "Christopher, Thomas, and Marsha Cloud"
if ($phrase =~ /^((?:\w+\s*,\s*)+\w+),?\s*and\s+(\w+)\s+(\w+)$/) {
my ($pre, $last_first, $last) = ($1, $2, $3);
my @firsts = split(/\s*,\s*/, $pre);
push @firsts, $last_first;
push @names, map { "$_ $last" } @firsts;
return @names;
}
# Case: "Christopher and Thomas Cloud"
if ($phrase =~ /^([\w\s]+?)\s+and\s+(\w+)\s+(\w+)$/) {
my ($first_part, $second_first, $last) = ($1, $2, $3);
my @firsts = split(/\s*,\s*|\s+and\s+/, $first_part);
push @names, map { "$_ $last" } (@firsts, $second_first);
return @names;
}
# Fallback: Split by comma or 'and'
$phrase =~ s/, and grandchildren.+//; # Handle "Anna and Lucy, and grandchildren Jake and Emma"
my @parts = split /\s*(?:,|and)\s*/, $phrase;
push @names, grep { defined($_) && $_ ne '' } @parts;
return @names;
}
# Correct extraction of children (skipping "his/her")
if ($text =~ /survived by (his|her) children\s*([^\.;]+)/i) {
my $children_text = $2;
$family{children} = extract_people_section($children_text);
} elsif ($text =~ /Loving mum to\s*([^\.;]+)/i) { # Look for the phrase "Loving mum to"
my $children_text = $1;
$family{children} = extract_people_section($children_text);
} elsif ($text =~ /Loving father of\s*([^\.;]+)/i) { # Look for the phrase "Loving father of"
my $children_text = $1;
$family{children} = extract_people_section($children_text);
} elsif($text =~ /mother of\s*([^\.;]+)?,/i) { # Look for the phrase "mother of"
my $children_text = $1;
$children_text =~ s/, grandmother.+//;
$family{children} = extract_people_section($children_text);
} elsif($text =~ /sons,?\s*([a-z]+)\s+and\s+([a-z]+)/i) {
my @children;
my @grandchildren;
push @children, { name => $1, sex => 'M' }, { name => $2, sex => 'M' };
if($text =~ /\bdaughter,?\s([a-z]+)/i) {
push @children, { 'name' => $1, 'sex' => 'F' }
}
if($text =~ /\bgranddaughter,?\s([a-z]+)/i) {
push @grandchildren, { 'name' => $1, 'sex' => 'F' };
}
$family{children} = \@children if @children;
$family{grandchildren} = \@grandchildren if @grandchildren;
} elsif($text =~ /Surviving are (?:a )?daughters?,\s*Mrs\.\s+(\w+)\s+\(([^)]+)\)\s+(\w+),\s+([^;]+?);/i) {
# Handle "Surviving are a daughter, Mrs. Walter (Ruth Ann) Gerke, Fort Wayne"
my @children;
my $spouse_first = $1;
my $daughter_name = $2;
my $spouse_last = $3;
my $location = $4;
$location =~ s/,\s*$//;
push @children, {
name => $daughter_name,
location => $location,
sex => 'F',
spouse => {
name => "$spouse_first $spouse_last",
sex => 'M'
}
};
$family{children} = \@children;
} else {
my @children;
# my $tagger = Lingua::EN::Tagger->new(longest_noun_phrase => 0);
# my $tagged = $tagger->add_tags($text);
if($text =~ /\ssons,\s*(.*?);/s) {
my $sons_text = $1;
if($sons_text =~ /, all of (.+)$/) {
my $location = $1;
while($sons_text =~ /([\w. ]+?),\s/g) {
my $son = $1;
if($son =~ /(\w+)\s+and\s+(\w+)/) {
push @children, {
name => $1,
location => $location,
sex => 'M',
}, {
name => $2,
location => $location,
sex => 'M',
};
last;
} else {
push @children, {
name => $son,
location => $location,
sex => 'M',
};
}
}
} else {
while($sons_text =~ /([\w. ]+?),\s*([\w. ]+?)(?:\s+and|\z)/g) {
push @children, {
name => $1,
location => $2,
sex => 'M',
};
}
}
}
if($text =~ /\sdaughters?,\s*Mrs\.\s+(.+?)\s+(\w+),\s+([^;]+)\sand/) {
push @children, {
name => $1,
location => $3,
sex => 'F',
spouse => { 'name' => $2, sex => 'M' }
};
} elsif($text =~ /one daughter,\s*(.+?),\s*(.+?);/) {
my $name = $1;
my $location = $2;
if($name =~ /(\w+)\s+(\w+)/) {
push @children, {
name => $1,
location => $location,
sex => 'F',
spouse => { name => $2, sex => 'M' }
};
} else {
push @children, {
name => $1,
location => $location,
sex => 'F',
};
}
}
$family{children} = \@children if @children;
if(!$family{'children'}) {
while($text =~ /\b(son|daughter)s?,\s*([A-Z][a-z]+(?:\s+\([A-Z][a-z]+\))?)\s*(?:and their children ([^.;]+))?/g) {
my $sex = $1 eq 'son' ? 'M' : 'F';
my $child = $2;
my $grandkids = $3;
if(my @grandchildren = $grandkids ? split /\s*,\s*|\s+and\s+/, $grandkids : ()) {
push @children, {
name => $child,
sex => $sex,
grandchildren => \@grandchildren,
};
} elsif(($sex eq 'F') && ($child =~ /(.+)\s+\((.+)\)/)) {
push @children, { name => $1, sex => 'F', spouse => { name => $2, sex => 'M' } }
} elsif($child ne 'Mrs') {
push @children, { name => $child, sex => $sex }
}
}
}
$family{children} = \@children if @children;
}
if(!$family{'children'}) {
if($text =~ /\ssons?[,\s]\s*(.+?)[;\.]/) {
my $raw = $1;
$raw =~ s/\sand their .+//;
my @children = extract_names_from_phrase($raw);
push @{$family{children}}, map { { name => $_, sex => 'M' } } @children;
}
if($text =~ /\sdaughters?[,\s]\s*(.+?)[;\.]/) {
my $raw = $1;
$raw =~ s/\sand their .+//;
my @children = extract_names_from_phrase($raw);
push @{$family{children}}, map { { name => $_, sex => 'F' } } @children;
}
}
# Extract grandchildren
if(!$family{'grandchildren'}) {
if($text =~ /grandchildren\s+([^\.;]+)/i) {
my @grandchildren = split /\s*(?:,|and)\s*/i, $1;
if(scalar(@grandchildren)) {
$family{'grandchildren'} = [ map { { 'name' => $_ } } grep { defined $_ && $_ ne '' } @grandchildren ];
}
}
}
if($family{'grandchildren'} && scalar @{$family{grandchildren}}) {
while((exists $family{'grandchildren'}->[0]) && (length($family{'grandchildren'}->[0]) == 0)) {
shift @{$family{'grandchildren'}};
}
if($family{'grandchildren'}->[0] =~ /brothers/) {
if(!exists $family{'brothers'}) {
shift @{$family{'grandchildren'}};
$family{'brothers'} = extract_people_section(join(', ', @{$family{'grandchildren'}}));
}
delete $family{grandchildren};
}
} else {
delete $family{grandchildren};
}
if((!defined($family{'grandchildren'})) || (($#{$family{'grandchildren'}}) <= 0)) {
# handle devoted Grandma to Tom, Dick, and Harry and loved Mother-in-law to Jack and Jill"
my ($grandchildren_str) = $text =~ /Grandma to (.*?)(?: and loved|$)/;
# Normalize and split into individual names
my @grandchildren;
if($grandchildren_str) {
@grandchildren = split /,\s*|\s+and\s+/, $grandchildren_str;
}
if(scalar(@grandchildren)) {
$family{'grandchildren'} = \@grandchildren;
} elsif($text =~ /grandm\w+\s/) {
my $t = $text;
$t =~ s/.+(grandm\w+\s+.+?\sand\s[\w\.;,]+).+/$1/;
$family{grandchildren} = [ split /\s*(?:,|and)\s*/i, ($t =~ /grandm\w+\sto\s+([^\.;]+)/i)[0] || '' ];
}
}
# Extract siblings (sisters and brothers) correctly, skipping "her" or "his"
if($text =~ /predeceased by (his|her) sisters?\s*([^;\.]+);?/i) {
my $sisters_text = $2;
$sisters_text =~ s/^,\s+//;
$family{sisters} = extract_people_section($sisters_text);
} else {
while($text =~ /\bsister[,\s]\s*([A-Z][a-z]+(?:\s+[A-Z][a-z.]+)*)(?:,\s*([A-Z][a-z]+))?/g) {
my $name = $1;
$family{'sisters'} ||= [];
if($name eq 'Mrs') {
if($text =~ / sister,\s*Mrs\.\s+([A-Z][a-zA-Z]+\s+[A-Z][a-zA-Z]+)/) {
$name = $1;
} else {
undef $name;
}
}
if($name) {
push @{$family{sisters}}, {
name => $name,
sex => 'F',
status => ($text =~ /\bpredeceased by.*?$name/i) ? 'deceased' : 'living',
};
}
}
if(!exists($family{'sisters'})) {
if($text =~ /\stwo\ssisters,\s*(.*?)\sand\s(.*?)[;:]/s) {
my($first, $second) = ($1, $2);
foreach my $sister($first, $second) {
if($sister =~ /Mrs\.\s(.+?),\s(.+)/) {
my $name = $1;
my $location = $2;
$location =~ s/,$//;
if($name =~ /(\w+)\s+(\w+)/) {
push @{$family{sisters}}, {
name => $1,
location => $location,
sex => 'F',
spouse => { 'name' => $2, 'sex' => 'M' }
};
} else {
push @{$family{sisters}}, {
name => $name,
location => $location,
sex => 'F',
};
}
} else {
push @{$family{sisters}}, {
name => $sister,
sex => 'F',
};
}
}
}
}
if($family{'sisters'}) {
# Deduplicate by serializing hashes for comparison
my %seen;
my @sisters = grep {
my $key = JSON::MaybeXS->new->canonical(1)->encode($_);
!$seen{$key}++
} @{$family{sisters}};
$family{sisters} = \@sisters;
}
}
if($text =~ /predeceased by (his|her) brothers?\s*([^;\.]+);?/i) {
my $brothers_text = $2;
$brothers_text =~ s/^,\s+//;
$family{brothers} = extract_people_section($brothers_text);
# TODO: mark all statuses to deceased
} else {
while ($text =~ /\bbrother,\s*([A-Z][a-z]+(?:\s+[A-Z][a-z.]+)*)(?:,\s*([A-Z][a-z]+))?/g) {
$family{'brothers'} ||= [];
push @{$family{brothers}}, {
name => $1,
status => ($text =~ /\bpredeceased by.*?$1/i) ? 'deceased' : 'living',
};
}
if((!$family{'brothers'}) && (!$family{'sisters'}) && (!$family{'siblings'})) {
if($text =~ /sister of ([a-z]+) and ([a-z]+)/i) {
push @{$family{'siblings'}},
{ 'name' => $1 },
{ 'name' => $2 }
}
}
if(!exists($family{'brothers'})) {
if($text =~ /\sbrothers,\s*(.*?)[;\.]/s) {
my $brothers_text = $1;
if($brothers_text =~ /, all of (.+)$/) {
my $location = $1;
while($brothers_text =~ /([\w. ]+?),\s/g) {
my $son = $1;
if($son =~ /(\w+)\s+and\s+(\w+)/) {
push @{$family{brothers}}, {
name => $1,
location => $location,
sex => 'M',
}, {
name => $2,
location => $location,
sex => 'M',
};
last;
} else {
push @{$family{brothers}}, {
name => $son,
location => $location,
sex => 'M',
};
}
}
} else {
while($brothers_text =~ /([\w. ]+?),\s*([\w. ]+?)(?:\s+and|\z)/g) {
push @{$family{brothers}}, {
name => $1,
location => $2,
sex => 'M',
};
}
}
}
}
}
if(!exists($family{'brothers'}) && $text =~ /\b(?:two|three|four)\s+brothers?,\s*(.+?)(?:,\s*a\s+(?:sister|half-sister)|;)/i) {
# Pattern for "two brothers, Name and Name"
my $brothers_text = $1;
my @brothers;
# Handle "Charles F. Harris and Berton Harris"
if($brothers_text =~ /\band\b/) {
my @names = split /\s+and\s+/, $brothers_text;
foreach my $name (@names) {
$name =~ s/^\s+|\s+$//g;
$name =~ s/,\s*$//;
push @brothers, {
name => $name,
sex => 'M',
status => 'living'
};
}
}
$family{brothers} = \@brothers if(scalar @brothers);
}
# Detect nieces/nephews
$family{nieces_nephews} = ($text =~ /as well as several nieces and nephews/i) ? ['several nieces and nephews'] : [];
# Extract parents and clean the names by removing unnecessary details
if($text =~ /(son|daughter) of the late\s+(.+?)\s+and\s+(.+?)\./i) {
my $father = $2;
my $mother = $3;
# Remove anything after the first comma in each parent's name
$father =~ s/,.*//;
$mother =~ s/,.*//;
if($mother =~ /(.+)\s+\((.+)\)\s+(.+)/) {
$mother = "$1 $2";
}
$family{parents} = {
father => { name => $father },
mother => { name => $mother },
};
} elsif($text =~ /parents were (\w+) and (\w+)/i) {
$family{parents} = {
father => { name => $1 },
mother => { name => $2 },
};
}
# Extract spouse's death year and remove the "(year)" from the name
if($text =~ /(wife|husband) of the late\s+([\w\s]+)\s+\((\d{4})\)/) {
my $name = $2;
my $death_year = $3;
$family{'spouse'} ||= [];
# Remove the death year part from the spouse's name
$name =~ s/\s*\(\d{4}\)//;
push @{$family{'spouse'}}, {
name => $name,
death_year => $death_year
}
} elsif($text =~ /\bmarried ([^,]+),.*?\b(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})(?:.*?\b(?:at|in)\s+([^.,]+))?/i) {
$family{'spouse'} ||= [];
push @{$family{'spouse'}}, {
name => $1,
married => {
date => $2,
place => $3 // '',
}
};
} elsif($text =~ /husband (?:to|of) the late\s([\w\s]+)[\s\.]/i) {
$family{'spouse'} ||= [];
push @{$family{'spouse'}}, { name => $1, status => 'deceased' }
} elsif($text =~ /\b(?:wife|husband) of ([^.,;]+)/i) {
$family{'spouse'} ||= [];
push @{$family{'spouse'}}, { name => $1 }
} elsif($text =~ /\bsurvived by her husband ([^.,;]+)/i) {
push @{$family{'spouse'}}, { name => $1, 'status' => 'living', 'sex' => 'M' }
} elsif($text =~ /\bsurvived by his wife[,\s]+([^.,;]+)/i) {
push @{$family{'spouse'}}, { name => $1, 'status' => 'living', 'sex' => 'F' }
}
# Ensure spouse location is properly handled
if(exists $family{spouse} && (ref $family{'spouse'} eq 'HASH') && defined $family{spouse}[0]{location} && $family{spouse}[0]{location} eq 'the late') {
delete $family{spouse}[0]{location};
}
# Extract the funeral information
if($text =~ /funeral service.*?at\s+(.+?),?\s+on\s+(.*?),?\s+at\s+(.+?)\./) {
$family{funeral} = {
location => $1,
date => $2,
time => $3,
};
} elsif($text =~ /funeral service.*?at\s+([^\n]+?)\s+on\s+([^\n]+)\s+at\s+([^\n]+)/i) {
$family{funeral} = {
location => $1,
date => $2,
time => $3,
};
if($family{'funeral'}->{'date'} =~ /(.+?)\.\s{2,}/) {
$family{'funeral'}->{'date'} = $1;
if($family{'funeral'}->{'date'} =~ /(.+?)\sat\s(.+)/) {
# Wednesday 9th March at 1.15pm. Friends etc. etc.
$family{'funeral'}->{'date'} = $1;
$family{'funeral'}->{'time'} = $2;
}
}
} elsif($text =~ /funeral services.+\sat\s(.+)\sat\s(.+),\swith\s/i) {
$family{funeral} = {
time => $1,
location => $2
};
} elsif($text =~ /funeral services.+\sat\s(.+),\swith\s/i) {
$family{funeral} = { location => $1 }
} elsif($text =~ /services.+\sat\s(.+),\swith\s/i) {
$family{funeral} = { location => $1 }
}
# Extract father-in-law and mother-in-law information (if present)
if($text =~ /father-in-law to\s+([A-Za-z\s]+)/) {
my $father_in_law = $1;
$family{children_in_law} = [{ name => $father_in_law }];
} elsif($text =~ /mother-in-law to\s+([A-Za-z\s]+)/i) {
my $mother_in_law = $1;
$family{children_in_law} = [ split /\s*(?:,|and)\s*/i, ($text =~ /mother-in-law to\s+([^\.;]+)/i)[0] || '' ];
if(scalar($family{children_in_law} == 0)) {
$family{children_in_law} = [{ name => $mother_in_law }];
}
}
# Extract aunt information
if($text =~ /niece of\s+([A-Za-z]+)/) {
my $aunt = $1;
$family{aunt} = [{ 'name' => $aunt }];
}
# Birth info
if($text =~ /[^\b]Born in ([^,]+),.*?\b(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})/i) {
$family{birth} = {
place => $1,
date => $2,
}
} elsif($text =~ /[^\b]Born in ([a-z,\.\s]+)\s+on\s+(.+)/i) {
$family{'birth'}->{'place'} = $1;
if(my $location = _extract_location($1)) {
$family{'birth'}->{'location'} = $location;
}
if(my $dt = _extract_date($2)) {
$family{'birth'}->{date} = $dt->ymd('/');
}
$family{'birth'}->{'place'} =~ s/\s+$//;
} elsif($text =~ /S?he was born (.+)\sin ([a-z,\.\s]+)\s+to\s+(.+?)\sand\s(.+?)\./i) {
$family{'birth'}->{'place'} = $2;
my $father = $3;
my $mother = $4;
eval {
if(my $dt = DateTime::Format::Text->parse_datetime($1)) {
$family{'birth'}->{date} = $dt->ymd('/');
}
};
# TODO
# if($verbose && $@) {
# Carp::carp($@);
# }
if($mother =~ /(.+)\s+\((.+)\)\s+(.+)/) {
$mother = "$1 $2";
}
if($father =~ /(.+?)\.\s\s/) {
$father = $1;
}
$family{parents} = {
father => { name => $father },
mother => { name => $mother }
};
if($text =~ /survived by (his|her) (father|mother)[\s,;]/i) {
$family{parents}->{$2}->{'status'} = 'living';
}
} elsif($text =~ /[^\b]S?he was born\s*(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})[,\s]+(?:in\s+)([^,]+)?/i) {
if(my $dt = _extract_date($1)) {
$family{'birth'}->{date} = $dt->ymd('/');
}
if($2) {
$family{'birth'}->{'location'} = $2;
}
}
# Date of death
if($text =~ /\bpassed away\b.*?\b(?:on\s+)?([A-Z]+ \d{1,2}, \d{4})/i) {
$family{death}->{date} = $1;
$family{death}->{datetime} = _extract_date($1);
}
# Age at death
if($text =~ /,\s(\d{1,3}), of\s/) {
if($1 < 110) {
$family{'death'}->{'age'} = $1;
}
}
# Place of death
if($text =~ /\b(?:passed away|died)\b([a-z0-9\s,]+)\sat\s+(.+?)\./i) {
my $place = $2;
if($place =~ /(.+)\s+on\s+([A-Z]+ \d{1,2}, \d{4})/i) {
$place = $1;
$family{death}->{date} = $2;
} elsif($place =~ /(.+)\son\s(.+)/) {
$place = $1;
if(my $dt = _extract_date($2)) {
$family{death}->{date} = $dt->ymd('/');
}
}
$place =~ s/^\bthe residence,\s//;
$place =~ s/\bafter a.*$//;
$place =~ s/,\s+$//;
$family{death}->{place} = $place;
}
# Remove blank fields from the main family hash
%family = map { $_ => $family{$_} } grep { defined $family{$_} && $family{$_} ne '' } keys %family;
# Remove empty arrays the family hash
foreach my $key (keys %family) {
if(ref($family{$key}) eq 'ARRAY') {
$family{$key} = [ grep { /\S/ } @{$family{$key}} ];
if(@{$family{$key}} == 0) {
delete $family{$key};
}
}
}
return if(!scalar keys(%family));
return Return::Set::set_return(\%family, { type => 'hashref', 'min' => 1, 'max' => 10 });
}
sub _extract_date
{
my $text = shift;
my $parser = DateTime::Format::Text->new();
my $dt;
eval { $dt = $parser->parse_datetime($text); };
return $dt if $dt && !$@;
return undef;
}
sub _extract_location {
my $place_text = shift;
unless($geocoder) {
eval { require Geo::Coder::Free };
if($@) {
Carp::carp(__PACKAGE__, ' (', __LINE__, "): geocoding locations disabled: $@");
return;
}
$geocoder = Geo::Coder::Free->new();
}
my @locations = $geocoder->geocode(location => $place_text); # Use array to improve caching
return unless scalar(@locations);
my $result = $locations[0];
if(ref($result)) {
return {
raw => $place_text,
# city => $result->{components}{city} || $result->{components}{town},
# region => $result->{components}{state},
# country => $result->{components}{country},
latitude => $result->latitude(),
longitude => $result->longitude()
};
}
return {
raw => $place_text,
# city => $result->{components}{city} || $result->{components}{town},
# region => $result->{components}{state},
# country => $result->{components}{country},
latitude => $result->{'latitude'},
longitude => $result->{'longitude'}
};
}
=head1 AUTHOR
Nigel Horne, C<< <njh at nigelhorne.com> >>
=head1 SEE ALSO
Test coverage report: L<https://nigelhorne.github.io/Genealogy-Obituary-Parser/coverage/>
=head1 SUPPORT
This module is provided as-is without any warranty.
=head1 LICENSE
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
=cut
1;