Convert-Pheno/lib/Convert/Pheno/Mapping.pm
package Convert::Pheno::Mapping;
use strict;
use warnings;
use autodie;
#use Carp qw(confess);
use feature qw(say);
use utf8;
use Data::Dumper;
use JSON::XS;
use Time::HiRes qw(gettimeofday);
use POSIX qw(strftime);
use Scalar::Util qw(looks_like_number);
use List::Util qw(first);
use Cwd qw(cwd);
use Sys::Hostname;
use Convert::Pheno::SQLite;
use Convert::Pheno::Default qw(get_defaults);
use Exporter 'import';
our @EXPORT =
qw(map_ontology_term dotify_and_coerce_number iso8601_time _map2iso8601 map_reference_range map_reference_range_csv map_age_range map2redcap_dict map2ohdsi convert2boolean find_age randStr map_operator_concept_id map_info_field map_omop_visit_occurrence dot_date2iso validate_format get_metaData get_info);
my $DEFAULT = get_defaults();
use constant DEVEL_MODE => 0;
# Global hash
my %seen = ();
#############################
#############################
# SUBROUTINES FOR MAPPING #
#############################
#############################
sub map_ontology_term {
# Most of the execution time goes to this subroutine
# We will adopt two estragies to gain speed:
# 1 - Prepare once, excute often (almost no gain in speed :/ )
# 2 - Create a global hash with "seen" queries (+++huge gain)
#return { id => 'dummy', label => 'dummy' } # test speed
# We will return quickly when nothing has to be done
my $query = $_[0]->{query};
# Skipping numbers
return $DEFAULT->{ontology_term} if looks_like_number($query);
# If the ontology term is an object we assume it comes
# from "terminology" property in the mapping file
return $query if ref $query eq 'HASH';
# Checking for existance in %seen
say "Skipping searching for <$query> as it already exists"
if DEVEL_MODE && exists $seen{$query};
# return if terms has already been searched and exists
# Not a big fan of global stuff...
# ¯\_(ツ)_/¯
# Premature return
return $seen{$query} if exists $seen{$query}; # global
# return something if we know 'a priori' that the query won't exist
#return { id => 'NCIT:NA000', label => $query } if $query =~ m/xx/;
# Ok, now it's time to start the subroutine
my $arg = shift;
my $column = $arg->{column};
my $ontology = $arg->{ontology};
my $self = $arg->{self};
my $databases = $self->{databases};
my $search = $self->{search};
my $print_hidden_labels = $self->{print_hidden_labels};
my $text_similarity_method = $self->{text_similarity_method};
my $min_text_similarity_score = $self->{min_text_similarity_score};
# DEVEL_MODE
say "searching for query <$query> in ontology <$ontology>" if DEVEL_MODE;
# Die if user wants OHDSI w/o flag -ohdsi-db
die
"Could not find the concept_id:<$query> in the provided <CONCEPT> table.\nPlease use the flag <--ohdsi-db> to enable searching at Athena-OHDSI database\n"
if ( $ontology eq 'ohdsi' && !$self->{ohdsi_db} );
# Perform query
my ( $id, $label ) = get_ontology_terms(
{
sth_column_ref => $self->{sth}{$ontology}{$column},
query => $query,
ontology => $ontology,
databases => $databases,
column => $column,
search => $search,
text_similarity_method => $text_similarity_method,
min_text_similarity_score => $min_text_similarity_score
}
);
# Add result to global %seen
$seen{$query} = { id => $id, label => $label }; # global
# id and label come from <db> _label is the original string (can change on partial matches)
return $print_hidden_labels
? { id => $id, label => $label, _label => $query }
: { id => $id, label => $label };
}
sub dotify_and_coerce_number {
my $val = shift;
# Premature return
return undef unless ( defined $val && $val ne '' );
# looks_like_number does not work with commas so we must tr first
( my $tr_val = $val ) =~ tr/,/./;
#print "#$val#$tr_val#\n";
# coercing to number $tr_val
return looks_like_number($tr_val)
? 0 + $tr_val
: $val;
}
sub iso8601_time {
# Standard modules (gmtime()===>Coordinated Universal Time(UTC))
# NB: The T separates the date portion from the time-of-day portion.
# The Z on the end means UTC (that is, an offset-from-UTC of zero hours-minutes-seconds).
# - The Z is pronounced “Zulu”.
my $now = time();
return strftime( '%Y-%m-%dT%H:%M:%SZ', gmtime($now) );
}
sub _map2iso8601 {
my ( $date, $time ) = split /\s+/, shift;
# UTC
return $date
. ( ( defined $time && $time =~ m/^T(.+)Z$/ ) ? $time : 'T00:00:00Z' );
}
sub map_reference_range {
my $arg = shift;
my $field = $arg->{field};
my $redcap_dict = $arg->{redcap_dict};
my $unit = $arg->{unit};
my %hash = ( low => 'Text Validation Min', high => 'Text Validation Max' );
my $hashref = {
unit => $unit,
map { $_ => undef } qw(low high)
}; # Initialize low,high to undef
for my $range (qw (low high)) {
$hashref->{$range} =
dotify_and_coerce_number( $redcap_dict->{$field}{ $hash{$range} } );
}
return $hashref;
}
sub map_reference_range_csv {
my ( $unit, $range ) = @_;
$range->{unit} = $unit;
return $range;
}
sub map_age_range {
my $str = shift;
# Premature return if not range
return { age =>
{ iso8601duration => 'P' . dotify_and_coerce_number($str) . 'Y' } }
unless $str =~ m/\-|\+/;
# if range
$str =~ s/\+/\-999/; # from '70+' '70-999'
my ( $start, $end ) = split /\-/, $str;
return {
ageRange => {
start => {
iso8601duration => 'P' . dotify_and_coerce_number($start) . 'Y'
},
end =>
{ iso8601duration => 'P' . dotify_and_coerce_number($end) . 'Y' }
}
};
}
sub map2redcap_dict {
my $arg = shift;
my ( $redcap_dict, $participant, $field, $labels ) = (
$arg->{redcap_dict}, $arg->{participant},
$arg->{field}, $arg->{labels}
);
# Options:
# labels = 1
# _labels
# labels = 0
# 'Field Note'
# NB: Some numeric fields will get stringified at $participant->{$field}
return $labels
? $redcap_dict->{$field}{_labels}{ $participant->{$field} }
: $redcap_dict->{$field}{'Field Note'};
}
sub map2ohdsi {
my $arg = shift;
my ( $ohdsi_dict, $concept_id, $self ) =
( $arg->{ohdsi_dict}, $arg->{concept_id}, $arg->{self} );
#######################
# OPTION A: <CONCEPT> #
#######################
# NB1: Here we don't win any speed over using %seen as ...
# .. we are already searching in a hash
# NB2: $concept_id is stringified by hash
my ( $data, $id, $label, $vocabulary ) = ( (undef) x 4 );
if ( exists $ohdsi_dict->{$concept_id} ) {
$id = $ohdsi_dict->{$concept_id}{concept_code};
$label = $ohdsi_dict->{$concept_id}{concept_name};
$vocabulary = $ohdsi_dict->{$concept_id}{vocabulary_id};
$data = { id => qq($vocabulary:$id), label => $label };
}
######################
# OPTION B: External #
######################
else {
$data = map_ontology_term(
{
query => $concept_id,
column => 'concept_id',
ontology => 'ohdsi',
self => $self
}
);
}
return $data;
}
sub convert2boolean {
my $val = lc(shift);
return
( $val eq 'true' || $val eq 'yes' ) ? JSON::XS::true
: ( $val eq 'false' || $val eq 'no' ) ? JSON::XS::false
: undef; # unknown = undef
}
sub find_age {
# Not using any CPAN module for now
# Adapted from https://www.perlmonks.org/?node_id=9995
# Assuming $birth_month is 0..11
my $arg = shift;
my $birth = $arg->{birth_day};
my $date = $arg->{date};
# Not a big fan of premature return, but it works here...
# ¯\_(ツ)_/¯
return unless ( $birth && $date );
my ( $birth_year, $birth_month, $birth_day ) =
( split /\-|\s+/, $birth )[ 0 .. 2 ];
my ( $year, $month, $day ) = ( split /\-/, $date )[ 0 .. 2 ];
#my ($day, $month, $year) = (localtime)[3..5];
#$year += 1900;
my $age = $year - $birth_year;
$age--
unless sprintf( "%02d%02d", $month, $day ) >=
sprintf( "%02d%02d", $birth_month, $birth_day );
return $age . 'Y';
}
sub randStr {
#https://www.perlmonks.org/?node_id=233023
return join( '',
map { ( 'a' .. 'z', 'A' .. 'Z', 0 .. 9 )[ rand 62 ] } 0 .. shift );
}
sub map_operator_concept_id {
my $arg = shift;
my $id = $arg->{operator_concept_id};
my $val = $arg->{value_as_number};
my $unit = $arg->{unit};
# Define hash for possible values
my %operator_concept_id = ( 4172704 => 'GT', 4172756 => 'LT' );
# 4172703 => 'EQ';
# $hasref will be used for return
my $hashref = undef;
# Only for GT || LT
if ( exists $operator_concept_id{$id} ) {
$hashref = {
unit => $unit,
map { $_ => undef } qw(low high)
}; # Initialize low,high to undef
if ( $operator_concept_id{$id} eq 'GT' ) {
$hashref->{high} = dotify_and_coerce_number($val);
}
else {
$hashref->{low} = dotify_and_coerce_number($val);
}
}
return $hashref;
}
sub map_omop_visit_occurrence {
# key eq 'visit_occurrence_id'
# { '85' =>
# {
# 'admitting_source_concept_id' => 0,
# 'admitting_source_value' => undef,
# 'care_site_id' => '\\N',
# 'discharge_to_concept_id' => 0,
# 'discharge_to_source_value' => undef,
# 'person_id' => 1,
# 'preceding_visit_occurrence_id' => 82,
# 'provider_id' => '\\N',
# 'visit_concept_id' => 9201,
# 'visit_end_date' => '1981-08-19',
# 'visit_end_datetime' => '1981-08-19 00:00:00',
# 'visit_occurrence_id' => 85,
# 'visit_source_concept_id' => 0,
# 'visit_source_value' => '7879d5b2-1af2-49a7-a801-121de124c6af',
# 'visit_start_date' => '1981-08-18',
# 'visit_start_datetime' => '1981-08-18 00:00:00',
# 'visit_type_concept_id' => 44818517
# }
# }
my $arg = shift;
my $self = $arg->{self};
my $ohdsi_dict = $arg->{ohdsi_dict};
my $person_id = $arg->{person_id};
my $visit_occurrence_id = $arg->{visit_occurrence_id};
my $visit_occurrence = $self->{visit_occurrence};
# Premature return
return undef if $visit_occurrence_id eq '\\N'; # perlcritic Severity: 5
# *** IMPORTANT ***
# EUNOMIA instance has mismatches between the person_id -- visit_occurrence_id
# For instance, person_id = 1 has only visit_occurrence_id = 85, but on tables it has:
# 82, 84, 42, 54, 41, 25, 76 and 81
# warn if we don't have $visit_occurrence_id in VISIT_OCCURRENCE
unless ( exists $visit_occurrence->{$visit_occurrence_id} ) {
warn
"Sorry, but <visit_occurrence_id:$visit_occurrence_id> does not exist for <person_id:$person_id>\n"
if DEVEL_MODE;
# Premature return
return undef; # perlcritic Severity: 5
}
# Getting pointer to the hash element
my $hashref = $visit_occurrence->{$visit_occurrence_id};
my $concept = map2ohdsi(
{
ohdsi_dict => $ohdsi_dict,
concept_id => $hashref->{visit_concept_id},
self => $self
}
);
# *** IMPORTANT ***
# Ad hoc to avoid using --ohdsi-db while we find a solution to EUNOMIA not being self-contained
my $ad_hoc_44818517 = {
id => "Visit Type:OMOP4822465",
label => "Visit derived from encounter on claim"
};
my $type =
$hashref->{visit_type_concept_id} == 44818517
? $ad_hoc_44818517
: map2ohdsi(
{
ohdsi_dict => $ohdsi_dict,
concept_id => $hashref->{visit_type_concept_id},
self => $self
}
);
my $start_date = _map2iso8601( $hashref->{visit_start_date} );
my $end_date = _map2iso8601( $hashref->{visit_end_date} );
my $info = { VISIT_OCCURRENCE => { OMOP_columns => $hashref } };
return {
_info => $info,
id => $visit_occurrence_id,
concept => $concept,
type => $type,
start_date => $start_date,
end_date => $end_date,
occurrence_id => $hashref->{visit_occurrence_id}
};
}
sub dot_date2iso {
# We can get
# '', '1990.12.25', '1990-12-25'
my $date = shift // '';
# Premature returns
return '1900-01-01' if $date eq '';
return $date if $date =~ m/^(\d{4})\-(\d{2})\-(\d{2})$/;
# Split '1990.12.25'
my ( $d, $m, $y ) = split /\./, $date;
# YYYYMMDD
return qq/$y-$m-$d/;
}
sub is_multidimensional {
return ref shift ? 1 : 0;
}
sub validate_format {
my ( $data, $format ) = @_;
my $result;
# PXF
if ( $format eq 'pxf' ) {
$result = exists $data->{subject} ? 1 : 0;
# BFF
}
else {
$result = !exists $data->{subject} ? 1 : 0;
}
return $result;
}
sub get_info {
my $self = shift;
# Detecting the number of logical CPUs across different OSes
my $os = $^O;
chomp(
my $ncpuhost =
lc($os) eq 'darwin' || lc($os) eq 'freebsd' ? qx{sysctl -n hw.ncpu}
: $os eq 'MSWin32' ? qx{wmic cpu get NumberOfLogicalProcessors}
: qx{/usr/bin/nproc} // 1
);
# For the Windows command, the result will also contain the string
# "NumberOfLogicalProcessors" which is the header of the output.
# So we need to extract the actual number from it:
if ( $os eq 'MSWin32' ) {
($ncpuhost) = $ncpuhost =~ /(\d+)/;
}
$ncpuhost = 0 + $ncpuhost; # coercing it to be a number
return {
user => $ENV{'LOGNAME'}
|| $ENV{'USER'}
|| $ENV{'USERNAME'}
|| 'dummy-user',
username => $self->{username},
ncpuhost => $ncpuhost,
cwd => cwd,
id => $self->{id},
hostname => hostname,
version => $::VERSION
};
}
sub get_metaData {
my $self = shift;
# Setting a few variables
my $username = $self->{username};
# Setting resources
my $resources = [
{
id => 'icd10',
name =>
'International Statistical Classification of Diseases and Related Health Problems 10th Revision',
url => 'https://icd.who.int/browse10/2019/en#',
version => '2019',
namespacePrefix => 'ICD10',
iriPrefix => 'https://icd.who.int/browse10/2019/en#/'
},
{
id => 'ncit',
name => 'NCI Thesaurus',
url => 'http://purl.obolibrary.org/obo/ncit.owl',
version => '22.03d',
namespacePrefix => 'NCIT',
iriPrefix => 'http://purl.obolibrary.org/obo/NCIT_'
},
{
id => 'athena-ohdsi',
name => 'Athena-OHDSI',
url => 'https://athena.ohdsi.org',
version => 'v5.3.1',
namespacePrefix => 'OHDSI',
iriPrefix => 'http://www.fakeurl.com/OHDSI_'
},
{
id => 'hp',
name => 'Human Phenotype Ontology',
url => 'http://purl.obolibrary.org/obo/hp.owl',
version => '2023-04-05',
namespacePrefix => 'HP',
iriPrefix => 'http://purl.obolibrary.org/obo/HP_'
},
{
id => 'omim',
name => 'Online Mendelian Inheritance in Man',
url => 'https://www.omim.org',
version => '2023-05-22',
namespacePrefix => 'OMIM',
iriPrefix => 'http://omim.org/entry/'
},
{
id => 'cdisc-terminology',
name => 'CDISC Terminology',
url =>
'https://www.cdisc.org/standards/terminology/controlled-terminology',
version => '2023-01-24',
namespacePrefix => 'CDISC',
iriPrefix => 'http://www.fakeurl.com/CDISC_'
}
];
return {
created => iso8601_time(),
createdBy => $username,
submittedBy => $username,
phenopacketSchemaVersion => '2.0',
resources => $resources,
externalReferences => [
{
id => 'PMID: 26262116',
reference =>
'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4815923',
description =>
'Observational Health Data Sciences and Informatics (OHDSI): Opportunities for Observational Researchers'
}
]
};
}
1;