Yet Another CPAN Grep

Pheno-Ranker/lib/Pheno/Ranker/Compare.pm

package Pheno::Ranker::Compare;

use strict;
use warnings;
use autodie;
use feature    qw(say);
use List::Util qw(any shuffle first);
use Data::Dumper;
use Sort::Naturally qw(nsort);
use MIME::Base64;
use Compress::Zlib qw(compress);
use Hash::Fold fold => { array_delimiter => ':' };
use Pheno::Ranker::Metrics;

use Exporter 'import';
our @EXPORT =
  qw(check_format cohort_comparison compare_and_rank create_glob_and_ref_hashes randomize_variables remap_hash create_binary_digit_string parse_hpo_json);

use constant DEVEL_MODE => 0;

our %nomenclature = ();

sub check_format {
    my $data = shift;
    return exists $data->[0]{subject} ? 'PXF' : 'BFF';
}

sub cohort_comparison {
    my ( $ref_binary_hash, $self ) = @_;
    my $out_file          = $self->{out_file};
    my $similarity_metric = $self->{similarity_metric_cohort};

    # Define limit #items for switching to whole matrix calculation
    my $max_items = $self->{max_matrix_records_in_ram};

    # Inform about the start of the comparison process
    say "Performing COHORT comparison"
      if ( $self->{debug} || $self->{verbose} );

    # Define the subroutine to be used
    my %similarity_function = (
        'hamming' => \&hd_fast,
        'jaccard' => \&jaccard_similarity_formatted
    );

    # Define values for diagonal elements depending on metric
    my %similarity_diagonal = (
        'hamming' => 0,
        'jaccard' => 1
    );

    # Use previous hashes to define stuff
    my $metric              = $similarity_function{$similarity_metric};
    my $similarity_diagonal = $similarity_diagonal{$similarity_metric};

    # Sorting keys of the hash
    my @sorted_keys_ref_binary_hash = nsort( keys %{$ref_binary_hash} );
    my $num_items                   = scalar @sorted_keys_ref_binary_hash;

    # Define $switch for going from RAM to all calculations
    my $switch = $num_items > $max_items ? 1 : 0;

    say "RAM efficient mode is: "
      . ( $switch ? "on" : "off" )
      . " (max_matrix_records_in_ram: $max_items)"
      if ( $self->{debug} || $self->{verbose} );

    # Opening file for output
    open( my $fh, '>:encoding(UTF-8)', $out_file );
    say $fh "\t", join "\t", @sorted_keys_ref_binary_hash;

    # Initialize matrix for storing similarity
    my @matrix;

    # Iterate over items (I elements)
    for my $i ( 0 .. $#sorted_keys_ref_binary_hash ) {
        say "Calculating <"
          . $sorted_keys_ref_binary_hash[$i]
          . "> against the cohort..."
          if $self->{verbose};
        my $str1 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$i] }
          {binary_digit_string_weighted};

        # Print first column (w/o \t)
        print $fh $sorted_keys_ref_binary_hash[$i];

        # Iterate for pairwise comparisons (J elements)
        for my $j ( 0 .. $#sorted_keys_ref_binary_hash ) {
            my $str2 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] }
              {binary_digit_string_weighted};
            my $similarity;

            if ($switch) {

                # For large datasets compute upper and lower triange
                my $str2 =
                  $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] }
                  {binary_digit_string_weighted};
                $similarity =
                  $i == $j ? $similarity_diagonal : $metric->( $str1, $str2 );
            }
            else {
                if ( $i == $j ) {

                    # Similarity for diagonal elements
                    $similarity = $similarity_diagonal;
                }
                elsif ( $j > $i ) {

                    # Compute similarity for large cohorts or upper triangle
                    $similarity = $metric->( $str1, $str2 );
                    $matrix[$i][$j] = $similarity;
                }
                else {
                    # Use precomputed similarity from lower triangle
                    $similarity = $matrix[$j][$i];
                }
            }

            # Print a tab before each similarity
            print $fh "\t", $similarity;
        }

        print $fh "\n";
    }

    # Close the file handle
    close $fh;

    # Inform about the completion of the matrix computation
    say "Matrix saved to <$out_file>" if ( $self->{debug} || $self->{verbose} );
    return 1;
}

sub compare_and_rank {
    my $arg             = shift;
    my $glob_hash       = $arg->{glob_hash};
    my $ref_hash        = $arg->{ref_hash};
    my $tar_hash        = $arg->{tar_hash};
    my $ref_binary_hash = $arg->{ref_binary_hash};
    my $tar_binary_hash = $arg->{tar_binary_hash};
    my $weight          = $arg->{weight};
    my $self            = $arg->{self};
    my $sort_by         = $self->{sort_by};
    my $align           = $self->{align};
    my $max_out         = $self->{max_out};

    say "Performing COHORT(REF)-PATIENT(TAR) comparison"
      if ( $self->{debug} || $self->{verbose} );

    # Hash for compiling metrics
    my $score;

    # Hash for stats
    my $stat;

    # Load TAR binary string
    my ($tar) = keys %{$tar_binary_hash};
    my $tar_str_weighted =
      $tar_binary_hash->{$tar}{binary_digit_string_weighted};

    # Load TAR number of vars
    my $target_vars = keys %{ $tar_hash->{$tar} };

    for my $key ( keys %{$ref_binary_hash} ) {    # No need to sort

        # Load REF binary string
        my $ref_str_weighted =
          $ref_binary_hash->{$key}{binary_digit_string_weighted};
        say "Comparing <id:$key> --- <id:$tar>" if $self->{verbose};
        say "REF:$ref_str_weighted\nTAR:$tar_str_weighted\n"
          if ( defined $self->{debug} && $self->{debug} > 1 );

        # Hamming
        $score->{$key}{hamming} =
          hd_fast( $ref_str_weighted, $tar_str_weighted );

        # Intersect and Jaccard
        ( $score->{$key}{jaccard}, $score->{$key}{intersect} ) =
          jaccard_similarity( $ref_str_weighted, $tar_str_weighted );

        # Load REF number of vars
        $score->{$key}{reference_vars} = keys %{ $ref_hash->{$key} };

        # Add values
        push @{ $stat->{hamming_data} }, $score->{$key}{hamming};
        push @{ $stat->{jaccard_data} }, $score->{$key}{jaccard};
    }

    # Stats are only computed once (no overhead)
    $stat->{hamming_stats} = add_stats( $stat->{hamming_data} );
    $stat->{jaccard_stats} = add_stats( $stat->{jaccard_data} );

    # Initialize a few variables
    my @headers = (
        'RANK',              'REFERENCE(ID)',
        'TARGET(ID)',        'FORMAT',
        'LENGTH',            'WEIGHTED',
        'HAMMING-DISTANCE',  'DISTANCE-Z-SCORE',
        'DISTANCE-P-VALUE',  'DISTANCE-Z-SCORE(RAND)',
        'JACCARD-INDEX',     'JACCARD-Z-SCORE',
        'JACCARD-P-VALUE',   'REFERENCE-VARS',
        'TARGET-VARS',       'INTERSECT',
        'INTERSECT-RATE(%)', 'COMPLETENESS(%)'
    );
    my $header  = join "\t", @headers;
    my @results = $header;
    my %info;
    my $length_align = length($tar_str_weighted);
    my $weight_bool  = $weight ? 'True' : 'False';
    my @alignments_ascii;
    my $alignment_str_csv;
    my @alignments_csv = join ';',
      qw/id ref indicator tar weight hamming-distance json-path label/;

    # The dataframe will have two header lines
    # *** IMPORTANT ***
    # nsort does not yield same results as canonical from JSON::XS
    # NB: we're sorting here and in create_binary_digit_string()
    my @sort_keys_glob_hash = sort keys %{$glob_hash};

    # Creating @labels array from {id,label}, or guess_label()
    my @labels =
      map { exists $nomenclature{$_} ? $nomenclature{$_} : guess_label($_) }
      @sort_keys_glob_hash;

    # Die if #elements in arrays differ
    die "Mismatch between variables and labels"
      if @sort_keys_glob_hash != @labels;

    # Labels
    # NB: there is a comma in 'Serum Glutamic Pyruvic Transaminase, CTCAE'
    my @dataframe = join ';', 'Id', @labels;

    # Variables (json path)
    push @dataframe, join ';', 'Id', @sort_keys_glob_hash;

    # 0/1
    push @dataframe, join ';', qq/T|$tar/,
      ( split //, $tar_binary_hash->{$tar}{binary_digit_string} );  # Add Target

    # Sort %score by value and load results
    my $count = 1;
    $max_out++;    # to be able to start w/ ONE

    # Start loop
    for my $key (
        sort {
            $sort_by eq 'jaccard'           #
              ? $score->{$b}{$sort_by}
              <=> $score->{$a}{$sort_by}    # 1 to 0 (similarity)
              : $score->{$a}{$sort_by}
              <=> $score->{$b}{$sort_by}    # 0 to N (distance)
        } keys %$score
      )
    {

        say "$count: Creating alignment <id:$key>" if $self->{verbose};

        # Create ASCII alignment
        # NB: We need it here to get $n_00
        my ( $n_00, $alignment_str_ascii, $alignment_arr_csv ) =
          create_alignment(
            {
                ref_key          => $key,
                ref_str_weighted =>
                  $ref_binary_hash->{$key}{binary_digit_string_weighted},
                tar_str_weighted      => $tar_str_weighted,
                glob_hash             => $glob_hash,
                sorted_keys_glob_hash => \@sort_keys_glob_hash,
                labels                => \@labels
            }
          );

     # *** IMPORTANT ***
     # The LENGTH of the alignment is based on the #variables in the REF-COHORT
     # Compute estimated av and dev for binary_string of L = length_align - n_00
     # Corrected length_align L = length_align - n_00
        my $length_align_corrected = $length_align - $n_00;

        #$estimated_average, $estimated_std_dev
        ( $stat->{hamming_stats}{mean_rnd}, $stat->{hamming_stats}{sd_rnd} ) =
          estimate_hamming_stats($length_align_corrected);

        # Compute a few stats
        my $hamming_z_score = z_score(
            $score->{$key}{hamming},
            $stat->{hamming_stats}{mean},
            $stat->{hamming_stats}{sd}
        );
        my $hamming_z_score_from_random = z_score(
            $score->{$key}{hamming},
            $stat->{hamming_stats}{mean_rnd},
            $stat->{hamming_stats}{sd_rnd}
        );

        #my $hamming_p_value =
        #  p_value( $score->{$key}{hamming}, $length_align_corrected );
        my $hamming_p_value_from_z_score =
          p_value_from_z_score($hamming_z_score);
        my $jaccard_z_score = z_score(
            $score->{$key}{jaccard},
            $stat->{jaccard_stats}{mean},
            $stat->{jaccard_stats}{sd}
        );
        my $jaccard_p_value_from_z_score =
          p_value_from_z_score( 1 - $jaccard_z_score );

        # Compute Intersect-Rate (I/T) * 100
        #         Completeness   (T/R) * 100
        my $reference_vars = $score->{$key}{reference_vars};
        my $intersect      = $score->{$key}{intersect};
        my $intersect_rate =
          ( $target_vars == 0 ) ? 0 : ( $intersect / $target_vars ) * 100;
        my $completeness =
          ( $reference_vars == 0 ) ? 0 : ( $intersect / $reference_vars ) * 100;

        # Create a hash with formats
        my $format = {
            'RANK'          => { value => $count,          format => undef },
            'REFERENCE(ID)' => { value => $key,            format => undef },
            'TARGET(ID)'    => { value => $tar,            format => undef },
            'FORMAT'        => { value => $self->{format}, format => undef },
            'WEIGHTED'      => { value => $weight_bool,    format => undef },
            'LENGTH' => { value => $length_align_corrected, format => '%6d' },
            'HAMMING-DISTANCE' =>
              { value => $score->{$key}{hamming}, format => '%4d' },
            'DISTANCE-Z-SCORE' =>
              { value => $hamming_z_score, format => '%7.3f' },
            'DISTANCE-P-VALUE' =>
              { value => $hamming_p_value_from_z_score, format => '%12.7f' },
            'DISTANCE-Z-SCORE(RAND)' =>
              { value => $hamming_z_score_from_random, format => '%8.4f' },
            'JACCARD-INDEX' =>
              { value => $score->{$key}{jaccard}, format => '%7.3f' },
            'JACCARD-Z-SCORE' =>
              { value => $jaccard_z_score, format => '%7.3f' },
            'JACCARD-P-VALUE' =>
              { value => $jaccard_p_value_from_z_score, format => '%12.7f' },
            'REFERENCE-VARS' => { value => $reference_vars, format => '%6d' },
            'TARGET-VARS'    => { value => $target_vars,    format => '%6d' },
            'INTERSECT'      => { value => $intersect,      format => '%6d' },
            'INTERSECT-RATE(%)' =>
              { value => $intersect_rate, format => '%8.2f' },
            'COMPLETENESS(%)' => { value => $completeness, format => '%8.2f' }
        };

        # Serialize results
        my $tmp_str = join "\t", map {
            defined $format->{$_}{format}
              ? sprintf( $format->{$_}{format}, $format->{$_}{value} )
              : $format->{$_}{value}
        } @headers;
        push @results, $tmp_str;

        # To save memory only load if --align
        if ( defined $align ) {

            # Add all of the above to @alignments_ascii
            my $sep = ('-') x 80;
            push @alignments_ascii,
              qq/#$header\n$tmp_str\n$sep\n$$alignment_str_ascii/;

            # Add all of the above to @alignments_csv
            push @alignments_csv, @$alignment_arr_csv;

            # Add data to dataframe
            push @dataframe, join ';', qq/R|$key/,
              ( split //, $ref_binary_hash->{$key}{binary_digit_string} );

            # Add values to info
            $info{$key} = {
                  weighted => $weight_bool eq 'True'
                ? JSON::XS::true
                : JSON::XS::false,
                reference_id            => $key,
                target_id               => $tar,
                reference_binary_string =>
                  $ref_binary_hash->{$key}{binary_digit_string},
                target_binary_string =>
                  $tar_binary_hash->{$key}{binary_digit_string},
                reference_binary_string_weighted =>
                  $ref_binary_hash->{$key}{binary_digit_string_weighted},
                target_binary_string_weighted =>
                  $tar_binary_hash->{$key}{binary_digit_string_weighted},
                alignment_length   => $length_align_corrected,
                hamming_distance   => $score->{$key}{hamming},
                hamming_z_score    => $hamming_z_score,
                hamming_p_value    => $hamming_p_value_from_z_score,
                jaccard_similarity => $score->{$key}{jaccard},
                jaccard_z_score    => $jaccard_z_score,
                jaccard_p_value    => $jaccard_p_value_from_z_score,
                jaccard_distance   => 1 - $score->{$key}{jaccard},
                format             => $self->{format},
                alignment          => $$alignment_str_ascii,
                reference_vars     => $reference_vars,
                target_vars        => $target_vars,
                intersect          => $intersect,
                intersect_rate     => $intersect_rate,
                completeness       => $completeness
            };

        }

        $count++;
        last if $count == $max_out;
    }

    return \@results, \%info, \@alignments_ascii, \@dataframe, \@alignments_csv;
}

sub create_alignment {

    # NB: The alignment will use the weighted string
    my $arg                   = shift;
    my $ref_key               = $arg->{ref_key};
    my $binary_string1        = $arg->{ref_str_weighted};
    my $binary_string2        = $arg->{tar_str_weighted};
    my $sorted_keys_glob_hash = $arg->{sorted_keys_glob_hash};
    my $labels                = $arg->{labels};
    my $glob_hash             = $arg->{glob_hash};
    my $length1               = length($binary_string1);
    my $length2               = length($binary_string2);

    # Check that l1 = l2
    die "The binary strings must have the same length"
      if ( $length1 != $length2 );

    # Expand array to have weights as N-elements
    my $recreated_array = recreate_array( $glob_hash, $sorted_keys_glob_hash );

    # Initialize some variables
    my $out_ascii = "REF -- TAR\n";
    my @out_csv;
    my $cum_distance = 0;
    my $n_00         = 0;

    # For loop with 2 variables
    # i index for weighted
    # j the number of variables
    my ( $i, $j ) = ( 0, 0 );
    for ( $i = 0 ; $i < $length1 ; $i++, $j++ ) {

        # Load key and value
        my $key = $recreated_array->[$i];
        my $val = sprintf( "%3d", $glob_hash->{$key} );

        # We have to keep track with $j
        my $sorted_key = $sorted_keys_glob_hash->[$j];
        my $label      = $labels->[$j];

        # Load chars
        my $char1 = substr( $binary_string1, $i, 1 );
        my $char2 = substr( $binary_string2, $i, 1 );
        $n_00++ if ( $char1 == 0 && $char2 == 0 );

        # Correct $i index by adding weights
        $i = $i + $glob_hash->{$key} - 1;

        # Load metrics
        $cum_distance += $glob_hash->{$key} if $char1 ne $char2;
        my $cum_distance_pretty = sprintf( "%3d", $cum_distance );
        my $distance            = $char1 eq $char2 ? 0 : $glob_hash->{$key};
        my $distance_pretty     = sprintf( "%3d", $distance );

        # w = weight, d = distance, cd = cumul distance
        my %format = (
            '11' => '-----',
            '10' => 'xxx--',
            '01' => '--xxx',
            '00' => '     '
        );
        $out_ascii .=
qq/$char1 $format{ $char1 . $char2 } $char2 | (w:$val|d:$distance_pretty|cd:$cum_distance_pretty|) $sorted_key ($label)\n/;
        push @out_csv,
qq/$ref_key;$char1;$format{ $char1 . $char2 };$char2;$glob_hash->{$key};$distance;$sorted_key;$label/;

#REF(107:week_0_arm_1);indicator;TAR(125:week_0_arm_1);weight;hamming-distance;json-path
#0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P16Y
#0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P24Y
#1;-----;1;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P39Y
    }
    return $n_00, \$out_ascii, \@out_csv;
}

sub recreate_array {
    my ( $glob_hash, $sorted_keys_glob_hash ) = @_;
    my @recreated_array;
    foreach my $key (@$sorted_keys_glob_hash) {
        for ( my $i = 0 ; $i < $glob_hash->{$key} ; $i++ ) {
            push @recreated_array, $key;
        }
    }
    return \@recreated_array;
}

sub create_glob_and_ref_hashes {
    my ( $array, $weight, $self ) = @_;
    my $primary_key = $self->{primary_key};
    my $glob_hash   = {};
    my $ref_hash_flattened;

    my $count = 1;
    for my $element ( @{$array} ) {

        # For consistency, we obtain the primary_key for both BFF/PXF
        # from $_->{id} (not from subject.id)
        my $id = $element->{$primary_key}
          or die
"Sorry but the JSON document [$count] does not have the primary_key <$primary_key> defined\n";

        # Remapping hash
        say "Flattening and remapping <id:$id> ..." if $self->{verbose};

        my $ref_hash = remap_hash(
            {
                hash   => $element,
                weight => $weight,
                self   => $self
            }
        );

        # *** IMPORTANT ***
        # We eliminate keys with weight = 0 if defined $weight;
        prune_keys_with_weight_zero($ref_hash) if defined $weight;

        # Load big hash ref_hash_flattened
        $ref_hash_flattened->{$id} = $ref_hash;

        # The idea is to create a $glob_hash with unique key-values
        # Duplicates will be automatically merged
        $glob_hash = { %$glob_hash, %$ref_hash };

        # To keep track of array element indexes
        $count++;
    }
    return ( $glob_hash, $ref_hash_flattened );
}

sub randomize_variables {
    my ( $glob_hash, $self ) = @_;
    my $max  = $self->{max_number_vars};
    my $seed = $self->{seed};

    # set random seed
    srand($seed);

    # Randomize
    # NB:keys have to be sorted for srand to work!!!!
    # perl -MList::Util=shuffle -E 'srand 123; say shuffle 1 .. 5'
    my @items = ( shuffle sort keys %$glob_hash )[ 0 .. $max - 1 ];

    # Create a new hash with a hash slice
    my %new_glob_hash;
    @new_glob_hash{@items} = @{$glob_hash}{@items};

    # return reference
    return \%new_glob_hash;
}

sub prune_excluded_included {
    my ( $hash, $self ) = @_;
    my @included = @{ $self->{include_terms} };
    my @excluded = @{ $self->{exclude_terms} };

    # Die if we have both options at the same time
    die "Sorry, <--include> and <--exclude> are mutually exclusive\n"
      if ( @included && @excluded );

    # *** IMPORTANT ***
    # Original $hash is modified

    # INCLUDED
    if (@included) {
        for my $key ( keys %$hash ) {
            delete $hash->{$key} unless any { $_ eq $key } @included;
        }
    }

    # EXCLUDED
    if (@excluded) {
        for my $key (@excluded) {
            delete $hash->{$key} if exists $hash->{$key};
        }
    }

    # We will do nothing if @included = @excluded = [] (DEFAULT)
    return 1;
}

sub set_excluded_phenotypicFeatures {
    my ( $hash, $switch, $format ) = @_;

    # Ensure phenotypicFeatures exist before processing
    return 1 unless exists $hash->{phenotypicFeatures};

    foreach my $feature ( @{ $hash->{phenotypicFeatures} } ) {

        # Skip if 'excluded' is not set or false
        next unless $feature->{excluded};

        # NB: remaining phenotypicFeatures:1.excluded
        #     will be discarded by $exclude_variables_regex_qr later
        if ($switch) {

            # Determine the correct ID field based on the format
            my $id_field = $format eq 'BFF' ? 'featureType' : 'type';

            # Append '_excluded' to the appropriate ID
            $feature->{$id_field}{id} .= '_excluded';
        }
        else {
            # Remove the feature by setting it to undef
            $feature = undef;

  # Due to properties being set to undef, it's possible for the coverage file to
  # report phenotypicFeatures as 100% by all "excluded" = true
        }
    }

    return 1;
}

sub remap_hash {
    my $arg    = shift;
    my $hash   = $arg->{hash};
    my $weight = $arg->{weight};
    my $self   = $arg->{self};                                 # $self from $arg
    my $nodes  = $self->{nodes};
    my $edges  = $self->{edges};
    my $format = $self->{format};
    my $switch = $self->{retain_excluded_phenotypicFeatures};
    my %out_hash;

    # Do some pruning excluded / included
    prune_excluded_included( $hash, $self );

    # *** IMPORTANT ***
    # The user may include a term that:
    # 1 - may not exist in any individual
    # 2 - does not exist in some individuals
    # If the term does not exist in a individual
    #  - a) -include-terms contains ANOTHER TERM THAT EXISTS
    #        %$hash will contain keys => OK
    #  - b) -include-terms only includes the term/terms not present
    #        %$hash  = 0 , then we return {}, to avoid trouble w/ Fold.pm
    #print Dumper $hash;
    return {} unless %$hash;

# A bit more pruning plus folding
# NB: Hash::Fold keeps white spaces on keys
#
# Options for 1D-array folding:
# A) Array to Hash then Fold
# B) Fold then Regex <=== CHOSEN
#  - Avoids the need for deep cloning
#  - Works across any JSON data structure (without specific key requirements)
#  - BUT profiling shows it's ~5-10% slower than 'Array to Hash then Fold'
#  - Does not accommodate specific remappings like 'interpretations.diagnosis.genomicInterpretations'
    set_excluded_phenotypicFeatures( $hash, $switch, $format );
    $hash = fold($hash);

    # Load the hash that points to the hierarchy for ontology-term-id
    #  *** IMPORTANT ***
    # - phenotypicFeatures.featureType.id => BFF
    # - phenotypicFeatures.type.id        => PXF
    my $id_correspondence = $self->{id_correspondence};

    # Load values for the for loop
    my $exclude_variables_regex_qr = $self->{exclude_variables_regex_qr};
    my $misc_regex_qr =
      qr/1900-01-01|NA0000|NCIT:C126101|P999Y|P9999Y|phenopacket_id/;

# Pre-compile a list of fixed scalar values to exclude into a hash for quick lookup
    my %exclude_values =
      map { $_ => 1 }
      ( 'NA', 'NaN', 'Fake', 'None:No matching concept', 'Not Available' );

    # Now we proceed for each key
    for my $key ( keys %{$hash} ) {

        # To see which ones were discarded
        #say $key if !defined $hash->{$key};

        # Discard undefined
        next unless defined $hash->{$key};

# Discarding lines with 'low quality' keys (Time of regex profiled with :NYTProf: ms time)
# Some can be "rescued" by adding the ontology as ($1)
# NB1: We discard _labels too!!
# NB2: info|metaData are always discarded
        next
          if ( defined $exclude_variables_regex_qr
            && $key =~ $exclude_variables_regex_qr );

        # The user can turn on age related values
        next
          if ( ( $format eq 'PXF' || $format eq 'BFF' )
            && $key =~ m/\.age(?!nt)|onset/i
            && !$self->{age} );    # $self->{age} [0|1]

        # Load values
        my $val = $hash->{$key};

  # Discarding lines with unsupported val (Time profiled with :NYTProf: ms time)
        next
          if (
            ( ref($val) eq 'HASH'
                && !keys %{$val} )   # Discard {} (e.g.,subject.vitalStatus: {})
            || ( ref($val) eq 'ARRAY' && !@{$val} )    # Discard []
            || exists $exclude_values{$val}
            || $val =~ $misc_regex_qr
          );

        # Add IDs to key
        my $id_key = add_id2key( $key, $hash, $self );

        # Finally add value to id_key
        my $tmp_key_at_variable_level = $id_key . '.' . $val;

        # Add HPO ascendants
        if ( defined $edges && $val =~ /^HP:/ ) {
            my $ascendants =
              add_hpo_ascendants( $tmp_key_at_variable_level, $nodes, $edges );
            $out_hash{$_} = 1 for @$ascendants;    # weight 1 for now
        }

        ##################
        # Assign weights #
        ##################

   # NB: mrueda (04-12-23) - it's ok if $weight == undef => NO AUTOVIVIFICATION!
   # NB: We don't warn if user selection does not exist, just assign 1

        my $tmp_key_at_term_level = $tmp_key_at_variable_level;

        # If variable has . then capture $1
        if ( $tmp_key_at_term_level =~ m/\./ ) {

            # NB: For long str regex is faster than (split /\./, $foo)[0]
            $tmp_key_at_term_level =~ m/^(\w+)\./;
            $tmp_key_at_term_level = $1;
        }

        if ( defined $weight ) {

            # *** IMPORTANT ***
            # ORDER MATTERS !!!!
            # We allow for assigning weights by TERM (e.g., 1D)
            # but VARIABLE level takes precedence to TERM

            $out_hash{$tmp_key_at_variable_level} =

              # VARIABLE LEVEL
              # NB: exists stringifies the weights
              exists $weight->{$tmp_key_at_variable_level}
              ? $weight->{$tmp_key_at_variable_level} + 0   # coercing to number

              # TERM LEVEL
              : exists $weight->{$tmp_key_at_term_level}
              ? $weight->{$tmp_key_at_term_level} + 0       # coercing to number

              # NO WEIGHT
              : 1;

        }
        else {

            # Assign a weight of 1 if no users weights
            $out_hash{$tmp_key_at_variable_level} = 1;

        }

        ##############
        # label Hash #
        ##############

        # Finally we load the Nomenclature hash
        my $label = $key;
        $label =~ s/id/label/;
        $nomenclature{$tmp_key_at_variable_level} = $hash->{$label}
          if defined $hash->{$label};
    }

    # *** IMPORTANT ***
    # We have to return an object {} when undef
    return \%out_hash // {};
}

sub add_hpo_ascendants {
    my ( $key, $nodes, $edges ) = @_;

    # First we obtain the ontology (0000539) from HP:0000539
    $key =~ m/HP:(\w+)$/;
    my $ontology = $1;

    # We'll use it to build a string equivalent to a key from $edges
    my $hpo_url = 'http://purl.obolibrary.org/obo/HP_';
    my $hpo_key = $hpo_url . $ontology;

    # We will include all ascendants in an array
    my @ascendants;
    for my $parent_id ( @{ $edges->{$hpo_key} } ) {

        # We have to create a copy to not modify the original $parent_id
        # as it can appear in multiple individuals
        my $copy_parent_id = $parent_id;
        $copy_parent_id =~ m/\/(\w+)$/;
        $copy_parent_id = $1;
        $copy_parent_id =~ tr/_/:/;

# *** IMPORTANT ***
# We cannot add any label to the ascendants, otherwise they will
# not be matched by an indv down the tree
# Myopia
# Mild Myopia
# We want that 'Mild Myopia' matches 'Myopia', thus we can not add a label from 'Mild Myopia'
# Use the labels only for debug
        my $asc_key = DEVEL_MODE ? $key . '.HPO_asc_DEBUG_ONLY' : $key;
        $asc_key =~ s/HP:$ontology/$copy_parent_id/g;
        push @ascendants, $asc_key;

        # We finally add the label to %nomenclature
        my $hpo_asc_str = $hpo_url
          . $copy_parent_id;    # 'http://purl.obolibrary.org/obo/HP_HP:0000539
        $hpo_asc_str =~ s/HP://;    # 0000539
        $nomenclature{$asc_key} = $nodes->{$hpo_asc_str}{lbl};
    }
    return \@ascendants;
}

sub add_id2key {
    my ( $key, $hash, $self ) = @_;
    my $id_correspondence    = $self->{id_correspondence}{ $self->{format} };
    my $array_regex_qr       = $self->{array_regex_qr};
    my $array_terms_regex_qr = $self->{array_terms_regex_qr};

    #############
    # OBJECTIVE #
    #############

# This subroutine is important as it replaces the index (numeric) for a given
# array element by a selected ontology. It's done for all subkeys on that element

    #"interventionsOrProcedures" : [
    #     {
    #        "bodySite" : {
    #           "id" : "NCIT:C12736",
    #           "label" : "intestine"
    #        },
    #        "procedureCode" : {
    #           "id" : "NCIT:C157823",
    #           "label" : "Colon Resection"
    #        }
    #     },
    #   {
    #        "bodySite" : {
    #           "id" : "NCIT:C12736",
    #           "label" : "intestine"
    #        },
    #        "procedureCode" : {
    #           "id" : "NCIT:C86074",
    #           "label" : "Hemicolectomy"
    #        }
    #     },
    #]
    #
    # Will become:
    #
    #"interventionsOrProcedures.NCIT:C157823.bodySite.id.NCIT:C12736" : 1,
    #"interventionsOrProcedures.NCIT:C157823.procedureCode.id.NCIT:C157823" : 1,
    #"interventionsOrProcedures.NCIT:C86074.bodySite.id.NCIT:C12736" : 1,
    #"interventionsOrProcedures.NCIT:C86074.procedureCode.id.NCIT:C86074" : 1,
    #
    # To make the replacement we use $id_correspondence, then we perform a regex
    # to fetch the key parts

    # Only proceed if $key is one of the array_terms
    if ( $key =~ $array_terms_regex_qr ) {

        # Now we use $array_regex_qr to capture $1, $2 and $3 for BFF/PXF
        # NB: For others (e.g., MXF) we will have only $1 and $2
        $key =~ $array_regex_qr;

        #say "$array_regex_qr -- [$key] <$1> <$2> <$3>"; # $3 can be undefined

        my ( $tmp_key, $val );

        # Normal behaviour for BFF/PXF
        if ( defined $3 ) {

# If id_correspondence is an array (e.g., medicalActions) we have to grep the right match
            my $correspondence;
            if ( ref $id_correspondence->{$1} eq ref [] ) {

                #       $1         $2                 $3
                # <medicalActions> <0> <treatment.routeOfAdministration.id>
                my $subkey = ( split /\./, $3 )[0];    # treatment
                $correspondence =
                  first { $_ =~ m/^$subkey/ }
                  @{ $id_correspondence->{$1} };       # treatment.agent.id
            }
            else {
                $correspondence = $id_correspondence->{$1};
            }

            # Now that we know which is the term we use to find key-val in $hash
            $tmp_key =
                $1 . ':'
              . $2 . '.'
              . $correspondence;    # medicalActions.0.treatment.agent.id
            $val = $hash->{$tmp_key};    # DrugCentral:257
            $key = join '.', $1, $val, $3
              ; # medicalActions.DrugCentral:257.treatment.routeOfAdministration.id
        }

        # MXF or similar (...we haven't encountered other regex yet)
        else {

            $tmp_key = $1 . ':' . $2;
            $val     = $hash->{$tmp_key};
            $key     = $1;
        }
    }

    # $key = 'Bar:1' means that we have array but the user either:
    #  a) Made a mistake in the config
    #  b) Is not using the right config file
    else {
        die
"<$1> contains array elements but is not defined as an array in <$self->{config_file}>. Please check your syntax and configuration file.\n"
          if $key =~ m/^(\w+):/;
    }

    return $key;
}

sub create_binary_digit_string {
    my ( $export, $weight, $glob_hash, $cmp_hash ) = @_;
    my %out_hash;

    # *** IMPORTANT ***
    # Being a nested for, keys %{$glob_hash} does not need sorting
    # BUT, we sort to follow the same order as serialized (sorted)
    my @sorted_keys_glob_hash = sort keys %{$glob_hash};

    # IDs of each indidividual
    for my $individual_id ( keys %{$cmp_hash} ) {    # no need to sort

        # One-hot encoding = Representing categorical data as numerical
        my ( $binary_str, $binary_str_weighted ) = ('') x 2;

        for my $key (@sorted_keys_glob_hash) {
            my $has_value = exists $cmp_hash->{$individual_id}{$key};
            $binary_str .= $has_value ? '1' : '0';
            if ( defined $weight ) {
                $binary_str_weighted .=
                  $has_value
                  ? ( '1' x $glob_hash->{$key} )
                  : ( '0' x $glob_hash->{$key} );
            }
        }

        # If weight is not defined, simply assign the unweighted string once.
        $binary_str_weighted = $binary_str unless defined $weight;

        $out_hash{$individual_id}{binary_digit_string} = $binary_str;
        $out_hash{$individual_id}{binary_digit_string_weighted} =
          $binary_str_weighted;

        if ( defined $export ) {

            # Convert string => raw bytes > zlib-compres => Base64
            $out_hash{$individual_id}{zlib_base64_binary_digit_string} =
              binary_to_base64($binary_str);

            $out_hash{$individual_id}{zlib_base64_binary_digit_string_weighted}
              = defined $weight
              ? binary_to_base64($binary_str_weighted)
              : $out_hash{$individual_id}{zlib_base64_binary_digit_string};
        }

    }
    return \%out_hash;
}

sub parse_hpo_json {
    my $data = shift;

# The <hp.json> file is a structured representation of the Human Phenotype Ontology (HPO) in JSON format.
# The HPO is structured into a directed acyclic graph (DAG)
# Here's a brief overview of the structure of the hpo.json file:
# - graphs: This key contains an array of ontology graphs. In the case of HPO, there is only one graph. The graph has two main keys:
# - nodes: An array of objects, each representing an HPO term. Each term object has the following keys:
# - id: The identifier of the term (e.g., "HP:0000118").
# - lbl: The label (name) of the term (e.g., "Phenotypic abnormality").
# - meta: Metadata associated with the term, including definition, synonyms, and other information.
# - type: The type of the term, usually "CLASS".
# - edges: An array of objects, each representing a relationship between two HPO terms. Each edge object has the following keys:
# - sub: The subject (child) term ID (e.g., "HP:0000924").
# - obj: The object (parent) term ID (e.g., "HP:0000118").
# - pred: The predicate that describes the relationship between the subject and object terms, typically "is_a" in HPO.
# - meta: This key contains metadata about the HPO ontology as a whole, such as version information, description, and other details.

    my $graph = $data->{graphs}->[0];
    my %nodes = map { $_->{id} => $_ } @{ $graph->{nodes} };
    my %edges = ();

    for my $edge ( @{ $graph->{edges} } ) {
        my $child_id  = $edge->{sub};
        my $parent_id = $edge->{obj};
        push @{ $edges{$child_id} }, $parent_id;
    }
    return \%nodes, \%edges;
}

sub prune_keys_with_weight_zero {
    my $hash_ref = shift;

    # Iterate over the keys of the hash
    foreach my $key ( keys %{$hash_ref} ) {

        # Delete the key if its value is 0
        delete $hash_ref->{$key} if $hash_ref->{$key} == 0;
    }
}

sub guess_label {
    my $input_string = shift;

    if (
        $input_string =~ /\.      # Match a literal dot
                       ([^\.]+)  # Match and capture everything except a dot
                       $        # Anchor to the end of the string
                      /x
      )
    {
        return $1;
    }

    # If no dot is found, return the original string
    return $input_string;
}

sub binary_to_base64 {
    my $binary_string = shift;

    # Convert binary string (e.g. "0010...") to raw bytes
    my $raw_data = pack( "B*", $binary_string );

  # Compress the raw data (note: compressing very short data may not save space)
    my $compressed = compress($raw_data);

    # Base64 encode the compressed data, without any newline breaks
    return encode_base64( $compressed, "" );
}

sub _base64_to_binary {
    my ( $b64_string, $original_length ) = @_;

    # Decode the Base64 encoded compressed data
    my $compressed_data = decode_base64($b64_string);

    # Decompress the data back to raw bytes
    my $raw_data = uncompress($compressed_data)
      or die "Decompression failed: $Compress::Zlib::gzerrno\n";

    # Convert the raw bytes back into a binary string (sequence of 0s and 1s)
    my $binary_string = unpack( "B*", $raw_data );

    # Trim the binary string to the original length to remove any padded bits
    return substr( $binary_string, 0, $original_length );
}

1;