Yet Another CPAN Grep

Convert-Pheno/lib/Convert/Pheno/Utils/Mapping.pm

package Convert::Pheno::Utils::Mapping;

use strict;
use warnings;
use autodie;
use feature qw(say);
use utf8;
use Data::Dumper;
use JSON::XS;
use Time::HiRes qw(gettimeofday);
use POSIX       qw(strftime);
use DateTime::Format::ISO8601;
use Scalar::Util qw(looks_like_number);
use List::Util   qw(first);
use Cwd          qw(cwd);
use Sys::Hostname;
use Convert::Pheno::DB::SQLite;
use Convert::Pheno::Utils::Default qw(get_defaults);
use Exporter 'import';
use open qw(:std :encoding(UTF-8));

our @EXPORT =
  qw(map_ontology_term dotify_and_coerce_number get_current_utc_iso8601_timestamp map_iso8601_date2timestamp map_iso8601_timestamp2date get_date_component map_reference_range map_reference_range_csv map_age_range map2redcap_dict map2ohdsi convert2boolean get_age_from_date_and_birthday get_date_at_age generate_random_alphanumeric_string map_operator_concept_id map_info_field map_omop_visit_occurrence convert_date_to_iso8601 validate_format get_metaData get_info merge_omop_tables convert_label_to_days string2number number2string);

my $DEFAULT = get_defaults();
use constant DEVEL_MODE => 0;

# Global hash
my %SEEN = ();

#############################
#############################
#  HELPER SUBS FOR MAPPING  #
#############################
#############################

sub map_ontology_term {
    my ($arg)    = @_;
    my $query    = $arg->{query};
    my $ontology = $arg->{ontology};
    my $self     = $arg->{self};

    # 1) Skip pure numbers
    return $DEFAULT->{ontology_term} if looks_like_number($query);

    # 2) If already an object, assume pre‑mapped
    return $query if ref $query eq 'HASH';

    # 3) Fast return on cache hit
    if ( exists $SEEN{$ontology}{$query} ) {
        say "Skipping searching for <$query> in <$ontology> (cached)"
          if DEVEL_MODE;
        return $SEEN{$ontology}{$query};
    }

    # 4) --ohdsi-db
    if ( $ontology eq 'ohdsi' && !$self->{ohdsi_db} ) {

        #If -iomop and term not found in RAM <CONCEPT> die unless --ohdsi-db
        if ( $self->{method} =~ /^omop2bff/ ) {
            die "Could not find concept_id:<$query> in provided CONCEPT table. "
              . "Use --ohdsi-db to enable Athena‑OHDSI lookup.\n";
        }

        # Any search that involves 'ohdsi' as an ontology (e.g., mapping file)
        else {
            die "You have to use --ohdsi-db to perform Athena‑OHDSI lookups.\n";
        }
    }

    # 5) Perform the lookup
    say "Searching for <$query> in <$ontology>…" if DEVEL_MODE;
    my ( $id, $label, $concept_id ) = get_ontology_terms(
        {
            sth_column_ref         => $self->{sth}{$ontology}{ $arg->{column} },
            query                  => $query,
            ontology               => $ontology,
            databases              => $self->{databases},
            column                 => $arg->{column},
            search                 => $self->{search},
            text_similarity_method => $self->{text_similarity_method},
            min_text_similarity_score => $self->{min_text_similarity_score},
            levenshtein_weight        => $self->{levenshtein_weight},
        }
    );

    # 6) Store in cache
    my $entry =
      $arg->{require_concept_id}
      ? { id => $id, label => $label, concept_id => $concept_id }
      : { id => $id, label => $label };

    $SEEN{$ontology}{$query} = $entry;

    # 7) Return (with optional hidden‐label)
    return $arg->{print_hidden_labels}
      ? { %$entry, _label => $query }
      : $entry;
}

sub dotify_and_coerce_number {
    my $val = shift;

    # Premature return
    return undef unless ( defined $val && $val ne '' );

    # looks_like_number does not work with commas so we must tr first
    ( my $tr_val = $val ) =~ tr/,/./;

    #print "#$val#$tr_val#\n";

    # coercing to number $tr_val
    return looks_like_number($tr_val)
      ? 0 + $tr_val
      : $val;
}

sub get_current_utc_iso8601_timestamp {

    # Standard modules (gmtime()===>Coordinated Universal Time(UTC))
    # NB: The T separates the date portion from the time-of-day portion.
    #     The Z on the end means UTC (that is, an offset-from-UTC of zero hours-minutes-seconds).
    #     - The Z is pronounced “Zulu”.
    my $now = time();
    return strftime( '%Y-%m-%dT%H:%M:%SZ', gmtime($now) );
}

sub map_iso8601_date2timestamp {
    my $iso_str = shift;

    # Parse the ISO string into a DateTime object
    $iso_str =~ s/ /T/;
    my $dt = DateTime::Format::ISO8601->parse_datetime($iso_str);

    # Format it to the standardized ISO8601 timestamp,
    # ensuring that if no time was provided, a default is used.
    return $dt->strftime('%Y-%m-%dT%H:%M:%SZ');
}

sub map_iso8601_timestamp2date {
    my $iso_str = shift;
    $iso_str =~ s/\s+/T/;

    # split on 'T' and take the date portion
    my ($date) = split /T/, $iso_str;
    return $date;
}

sub get_date_component {
    my ( $date, $component ) = @_;
    $component //= 'year';
    $date =~ s/T.*//;    # get rid of 'T00:00:00Z'

    my @parts   = split /-/, $date;
    my %indexes = ( year => 0, month => 1, day => 2 );

    # Return the requested component if valid; otherwise, warn and return the year.
    return exists $indexes{$component}
      ? $parts[ $indexes{$component} ]
      : do {
        warn
"Invalid component <$component> requested. Returning year by default.\n";
        $parts[ $indexes{'year'} ];
      };
}

sub map_reference_range {
    my $arg         = shift;
    my $field       = $arg->{field};
    my $redcap_dict = $arg->{redcap_dict};
    my $unit        = $arg->{unit};
    my %hash = ( low => 'Text Validation Min', high => 'Text Validation Max' );
    my $hashref = {
        unit => $unit,
        map { $_ => undef } qw(low high)
    };    # Initialize low,high to undef
    for my $range (qw (low high)) {
        $hashref->{$range} =
          dotify_and_coerce_number( $redcap_dict->{$field}{ $hash{$range} } );
    }

    return $hashref;
}

sub map_reference_range_csv {
    my ( $unit, $range ) = @_;
    $range->{unit} = $unit;
    return $range;
}

sub map_age_range {
    my $str = shift;

    # Premature return if not range
    return { age =>
          { iso8601duration => 'P' . dotify_and_coerce_number($str) . 'Y' } }
      unless $str =~ m/\-|\+/;

    # if range
    $str =~ s/\+/\-999/;    # from '70+' '70-999'
    my ( $start, $end ) = split /\-/, $str;

    return {
        ageRange => {
            start => {
                iso8601duration => 'P' . dotify_and_coerce_number($start) . 'Y'
            },
            end =>
              { iso8601duration => 'P' . dotify_and_coerce_number($end) . 'Y' }
        }
    };
}

sub map2redcap_dict {
    my $arg = shift;
    my ( $redcap_dict, $participant, $field, $labels ) = (
        $arg->{redcap_dict}, $arg->{participant},
        $arg->{field},       $arg->{labels}
    );

    # Options:
    #  labels = 1
    #     _labels
    #  labels = 0
    #    'Field Note'

    # NB: Some numeric fields will get stringified at $participant->{$field}
    return $labels
      ? $redcap_dict->{$field}{_labels}{ $participant->{$field} }
      : $redcap_dict->{$field}{'Field Note'};
}

sub map2ohdsi {
    my $arg = shift;
    my ( $ohdsi_dict, $concept_id, $self ) =
      ( $arg->{ohdsi_dict}, $arg->{concept_id}, $arg->{self} );

    #######################
    # OPTION A: <CONCEPT> #
    #######################

    # NB1: Here we don't win any speed over using %SEEN as ...
    # .. we are already searching in a hash
    # NB2: $concept_id is stringified by hash
    my ( $data, $id, $label, $vocabulary ) = ( (undef) x 4 );
    if ( exists $ohdsi_dict->{$concept_id} ) {
        $id         = $ohdsi_dict->{$concept_id}{concept_code};
        $label      = $ohdsi_dict->{$concept_id}{concept_name};
        $vocabulary = $ohdsi_dict->{$concept_id}{vocabulary_id};
        $data       = { id => qq($vocabulary:$id), label => $label };
    }

    ######################
    # OPTION B: External #
    ######################

    else {
        $data = map_ontology_term(
            {
                query    => $concept_id,
                column   => 'concept_id',
                ontology => 'ohdsi',
                self     => $self
            }
        );
    }
    return $data;
}

sub convert2boolean {
    my $val = lc(shift);
    return
        ( $val eq 'true'  || $val eq 'yes' ) ? JSON::XS::true
      : ( $val eq 'false' || $val eq 'no' )  ? JSON::XS::false
      :                                        undef;            # unknown = undef

}

sub get_age_from_date_and_birthday {
    my $arg          = shift;
    my $birth_date   = $arg->{birth_day} or return;
    my $current_date = $arg->{date}      or return;

    # Assuming both dates are in a format like "YYYY-MM-DD" (or with spaces instead of a dash separator for the birth date)
    # Split the dates into year, month, and day.
    my ( $birth_year, $birth_month, $birth_day ) = split /[-\s]+/, $birth_date;
    my ( $current_year, $current_month, $current_day ) = split /-/,
      $current_date;

    # Calculate age based on year difference.
    my $age = $current_year - $birth_year;

    # If the current month/day is before the birthday month/day, subtract one year.
    if ( $current_month < $birth_month
        or ( $current_month == $birth_month && $current_day < $birth_day ) )
    {
        $age--;
    }

    # Return the age in ISO8601 duration format (e.g. "P31Y").
    return "P${age}Y";
}

sub get_date_at_age {
    my ( $duration_iso, $birthdate_iso ) = @_;

    # Parse the birth date using ISO8601 format.
    my $birthdate = DateTime::Format::ISO8601->parse_datetime($birthdate_iso);

    # Here we only handle durations expressed solely in years.
    # For a string like "P31Y", extract the number 31.
    my $years;
    if ( $duration_iso =~ /^P(\d+)Y/ ) {
        $years = $1;
    }
    else {
        warn
"Unsupported duration format: $duration_iso. Only durations in full years (P<number>Y) are supported.";
    }

    # Create a duration object for the extracted number of years.
    my $duration = DateTime::Duration->new( years => $years );

    # Add the duration to the birth date.
    my $date_at_age = $birthdate->clone->add_duration($duration);

    # Return the result in ISO format (YYYY-MM-DD)
    return $date_at_age->ymd;
}

sub generate_random_alphanumeric_string {

    #https://www.perlmonks.org/?node_id=233023
    return join( '',
        map { ( 'a' .. 'z', 'A' .. 'Z', 0 .. 9 )[ rand 62 ] } 0 .. shift );
}

sub map_operator_concept_id {
    my $arg  = shift;
    my $id   = $arg->{operator_concept_id};
    my $val  = $arg->{value_as_number};
    my $unit = $arg->{unit};

    # Define hash for possible values
    my %operator_concept_id = ( 4172704 => 'GT', 4172756 => 'LT' );

    #  4172703 => 'EQ';

    # $hasref will be used for return
    my $hashref = undef;

    # Only for GT || LT
    if ( exists $operator_concept_id{$id} ) {
        $hashref = {
            unit => $unit,
            map { $_ => undef } qw(low high)
        };    # Initialize low,high to undef
        if ( $operator_concept_id{$id} eq 'GT' ) {
            $hashref->{high} = dotify_and_coerce_number($val);
        }
        else {
            $hashref->{low} = dotify_and_coerce_number($val);
        }
    }
    return $hashref;
}

sub map_omop_visit_occurrence {

    # key eq 'visit_occurrence_id'
    # { '85' =>
    #    {
    #          'admitting_source_concept_id' => 0,
    #          'admitting_source_value' => undef,
    #          'care_site_id' => '\\N',
    #          'discharge_to_concept_id' => 0,
    #          'discharge_to_source_value' => undef,
    #          'person_id' => 1,
    #          'preceding_visit_occurrence_id' => 82,
    #          'provider_id' => '\\N',
    #          'visit_concept_id' => 9201,
    #          'visit_end_date' => '1981-08-19',
    #          'visit_end_datetime' => '1981-08-19 00:00:00',
    #          'visit_occurrence_id' => 85,
    #          'visit_source_concept_id' => 0,
    #          'visit_source_value' => '7879d5b2-1af2-49a7-a801-121de124c6af',
    #          'visit_start_date' => '1981-08-18',
    #          'visit_start_datetime' => '1981-08-18 00:00:00',
    #          'visit_type_concept_id' => 44818517
    #        }
    # }

    my $arg                 = shift;
    my $self                = $arg->{self};
    my $ohdsi_dict          = $arg->{ohdsi_dict};
    my $person_id           = $arg->{person_id};
    my $visit_occurrence_id = $arg->{visit_occurrence_id};
    my $visit_occurrence    = $self->{visit_occurrence};

    # Premature return
    return undef if $visit_occurrence_id eq '\\N';    # perlcritic Severity: 5

    # *** IMPORTANT ***
    # EUNOMIA instance has mismatches between the person_id -- visit_occurrence_id
    # For instance, person_id = 1 has only visit_occurrence_id = 85, but on tables it has:
    # 82, 84, 42, 54, 41, 25, 76 and 81

    # warn if we don't have $visit_occurrence_id in VISIT_OCCURRENCE
    unless ( exists $visit_occurrence->{$visit_occurrence_id} ) {
        warn
"Sorry, but <visit_occurrence_id:$visit_occurrence_id> does not exist for <person_id:$person_id>\n"
          if DEVEL_MODE;

        # Premature return
        return undef;    # perlcritic Severity: 5
    }

    # Getting pointer to the hash element
    my $hashref = $visit_occurrence->{$visit_occurrence_id};

    my $concept = map2ohdsi(
        {
            ohdsi_dict => $ohdsi_dict,
            concept_id => $hashref->{visit_concept_id},
            self       => $self

        }
    );

    # *** IMPORTANT ***
    # Ad hoc to avoid using --ohdsi-db while we find a solution to EUNOMIA not being self-contained
    my $ad_hoc_44818517 = {
        id    => "Visit Type:OMOP4822465",
        label => "Visit derived from encounter on claim"
    };
    my $type =
        $hashref->{visit_type_concept_id} == 44818517
      ? $ad_hoc_44818517
      : map2ohdsi(
        {
            ohdsi_dict => $ohdsi_dict,
            concept_id => $hashref->{visit_type_concept_id},
            self       => $self

        }
      );
    my $start_date = map_iso8601_date2timestamp( $hashref->{visit_start_date} );
    my $end_date   = map_iso8601_date2timestamp( $hashref->{visit_end_date} );
    my $info       = { VISIT_OCCURRENCE => { OMOP_columns => $hashref } };

    return {
        _info         => $info,
        id            => $visit_occurrence_id,
        concept       => $concept,
        type          => $type,
        start_date    => $start_date,
        end_date      => $end_date,
        occurrence_id => $hashref->{visit_occurrence_id}
    };
}

sub convert_date_to_iso8601 {
    my $date = shift // '';

    # Trim any accidental whitespace
    $date =~ s/^\s+|\s+$//g;

    # Return default if input is empty
    return '1900-01-01' if $date eq '';

    # If already in ISO format (YYYY-MM-DD), return as-is
    if ( $date =~ /^\d{4}-\d{2}-\d{2}$/ ) {
        return $date;
    }

    # If dot-separated format with four-digit first element (YYYY.MM.DD)
    if ( $date =~ /^(\d{4})\.(\d{2})\.(\d{2})$/ ) {
        return "$1-$2-$3";
    }

    # If dot-separated format with two-digit first element (DD.MM.YYYY)
    if ( $date =~ /^(\d{2})\.(\d{2})\.(\d{4})$/ ) {
        return "$3-$2-$1";
    }

    # Optionally, handle any other unexpected format gracefully
    warn "Invalid date format: $date";
}

sub is_multidimensional {
    return ref shift ? 1 : 0;
}

sub validate_format {
    my ( $data, $format ) = @_;
    return ( $format eq 'pxf' )
      ? !!( exists $data->{subject} )
      : !( exists $data->{subject} );
}

sub get_info {
    my $self = shift;

    # Detecting the number of logical CPUs across different OSes
    my $os = $^O;
    chomp(
        my $threadshost =
          lc($os) eq 'darwin' ? qx{/usr/sbin/sysctl -n hw.logicalcpu}
        : lc($os) eq 'freebsd' ? qx{sysctl -n hw.ncpu}
        : $os eq 'MSWin32'     ? qx{wmic cpu get NumberOfLogicalProcessors}
        :                        qx{/usr/bin/nproc} // 1
    );

    # For the Windows command, the result will also contain the string
    # "NumberOfLogicalProcessors" which is the header of the output.
    # So we need to extract the actual number from it:
    if ( $os eq 'MSWin32' ) {
        ($threadshost) = $threadshost =~ /(\d+)/;
    }
    $threadshost = 0 + $threadshost;    # coercing it to be a number

    return {
        user => $ENV{'LOGNAME'}
          || $ENV{'USER'}
          || $ENV{'USERNAME'}
          || 'dummy-user',
        username    => $self->{username},
        threadshost => $threadshost,
        cwd         => cwd,
        id          => $self->{id},
        hostname    => hostname,
        version     => $::VERSION
    };
}

sub get_metaData {
    my $self = shift;

    # Setting a few variables
    my $username = $self->{username};

    # Setting resources
    my $resources = [
        {
            id   => 'icd10',
            name =>
'International Statistical Classification of Diseases and Related Health Problems 10th Revision',
            url             => 'https://icd.who.int/browse10/2019/en#',
            version         => '2019',
            namespacePrefix => 'ICD10',
            iriPrefix       => 'https://icd.who.int/browse10/2019/en#/'
        },
        {
            id              => 'ncit',
            name            => 'NCI Thesaurus',
            url             => 'http://purl.obolibrary.org/obo/ncit.owl',
            version         => '22.03d',
            namespacePrefix => 'NCIT',
            iriPrefix       => 'http://purl.obolibrary.org/obo/NCIT_'
        },
        {
            id              => 'athena-ohdsi',
            name            => 'Athena-OHDSI',
            url             => 'https://athena.ohdsi.org',
            version         => 'v5.3.1',
            namespacePrefix => 'OHDSI',
            iriPrefix       => 'http://www.fakeurl.com/OHDSI_'
        },
        {
            id              => 'hp',
            name            => 'Human Phenotype Ontology',
            url             => 'http://purl.obolibrary.org/obo/hp.owl',
            version         => '2023-04-05',
            namespacePrefix => 'HP',
            iriPrefix       => 'http://purl.obolibrary.org/obo/HP_'
        },
        {
            id              => 'omim',
            name            => 'Online Mendelian Inheritance in Man',
            url             => 'https://www.omim.org',
            version         => '2023-05-22',
            namespacePrefix => 'OMIM',
            iriPrefix       => 'http://omim.org/entry/'
        },
        {
            id   => 'cdisc-terminology',
            name => 'CDISC Terminology',
            url  =>
'https://www.cdisc.org/standards/terminology/controlled-terminology',
            version         => '2023-01-24',
            namespacePrefix => 'CDISC',
            iriPrefix       => 'http://www.fakeurl.com/CDISC_'
        }
    ];
    return {
        created                  => get_current_utc_iso8601_timestamp(),
        createdBy                => $username,
        submittedBy              => $username,
        phenopacketSchemaVersion => '2.0',
        resources                => $resources,
        externalReferences       => [
            {
                id        => 'PMID: 26262116',
                reference =>
                  'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4815923',
                description =>
'Observational Health Data Sciences and Informatics (OHDSI): Opportunities for Observational Researchers'
            }
        ]
    };
}

sub merge_omop_tables {
    my $individuals = shift;    # Expect an arrayref of individual OMOP structures
    die "Expected an array reference" unless ref($individuals) eq 'ARRAY';

    my %merged;
    foreach my $ind (@$individuals) {

        # Ensure each individual record is a hashref.
        next unless ref($ind) eq 'HASH';

        # For each table in this individual...
        foreach my $table ( keys %$ind ) {

            # If the table is stored as an arrayref, merge its rows.
            if ( ref( $ind->{$table} ) eq 'ARRAY' ) {
                push @{ $merged{$table} }, @{ $ind->{$table} };
            }
            else {
                # If it's a single hash (one row), add it as a single element.
                push @{ $merged{$table} }, $ind->{$table};
            }
        }
    }
    return \%merged;
}

sub convert_label_to_days {
    my ( $label, $count ) = @_;

    # return undef on missing args
    return undef
      unless defined $label && defined $count && looks_like_number($count);

    my $key = lc $label;

    # normalize plural to singular
    $key =~ s/s$//;

    my %mult = (
        day   => 1,
        week  => 7,
        month => 30,
        year  => 365,
    );

    # lookup multiplier
    my $factor = $mult{$key};
    return undef unless defined $factor;

    return $factor * $count;
}

# hex‑encoding the bytes, then parsing that hex as a BigInt.
sub string2number {
    my $str = shift;

    # Do nothing if we already have integer
    return $str if is_strict_integer($str);

    # 1) turn "Hello" into "48656c6c6f"
    my $hex = unpack( 'H*', $str ); 

    # 2) parse that hex as a BigInt 
    my $big = Math::BigInt->from_hex("0x$hex");

    # 3) return its decimal string 
    return $big->bstr;
}

sub is_strict_integer {
    my ($val) = @_;
    return 0 unless looks_like_number($val);
    return $val == int($val);
}

# Turn the decimal BigInt back into the original string
sub number2string {
    my $num = shift;

    # 1) lift into a BigInt
    my $big = Math::BigInt->new($num);

    # 2) get back the hex digits, e.g. "0x48656c6c6f"
    my $hex = $big->as_hex;
    
    # 3) strip the "0x" and unpack back into raw bytes
    $hex =~ s/^0[xX]//;
    return pack( 'H*', $hex );
}

1;