Group
Extension

Pheno-Ranker/share/diseases/hpo/scripts/hpo_disease_converter.pl

#!/usr/bin/perl
#
#   A script that creates a disease-based
#   BFF/PXF JSON array from HPO tab-separated (TSV) data
#
#   Last Modified: Jan/15/2024
#
#   $VERSION taken from Pheno::Ranker
#
#   Copyright (C) 2023-2025 Manuel Rueda - CNAG (manuel.rueda@cnag.eu)
#
#   License: Artistic License 2.0
#
#   If this program helps you in your research, please cite.

use strict;
use warnings;
use Text::CSV_XS;
use JSON::XS;
use Getopt::Long qw(:config no_ignore_case);

# Command-line arguments
GetOptions(
    'i|input=s'  => \my $input_file,
    'f|format=s' => \my $format,
) or die "Usage: $0 -i input.csv -f [pxf|bff]\n";

# Both options are mandatory; the format selects the output schema.
die "Input file is required.\nUsage: $0 -i input.csv -f [pxf|bff]\n" unless $input_file;
die "Format must be 'pxf' or 'bff'.\n" unless $format && ($format eq 'pxf' || $format eq 'bff');

# Set a fixed seed for reproducibility (the sex of each entry is drawn with rand)
srand(12345);

# Initialize the parser; despite the "CSV" naming the input is tab-separated
my $csv = Text::CSV_XS->new({
    binary    => 1,
    sep_char  => "\t",
    auto_diag => 1,
});

# Open the input file.
# Handles both: <genes_to_phenotype.txt>  <phenotype_to_genes.txt>
# The handle decodes UTF-8 so that the JSON encoder (which is configured with
# ->utf8 and therefore expects character strings) does not double-encode
# non-ASCII labels.
open my $fh, '<:encoding(UTF-8)', $input_file or die "Could not open '$input_file': $!";

# Read the header; an empty file yields no header line at all
my $header = $csv->getline($fh)
    or die "Could not read a header line from '$input_file'.\n";
$csv->column_names(@$header);

# Fail early, with a clear message, when a column this script reads is absent
my %is_column = map { $_ => 1 } @$header;
foreach my $required (qw(hpo_id hpo_name disease_id)) {
    die "Missing required column '$required' in '$input_file'.\n"
        unless $is_column{$required};
}

# Data structures to hold diseases categorized by type
my %diseases = (
    OMIM  => {},
    ORPHA => {},
);

# Process each row of the input
while (my $row = $csv->getline_hr($fh)) {
    my $hpo_id     = $row->{'hpo_id'};
    my $hpo_name   = $row->{'hpo_name'};
    my $disease_id = $row->{'disease_id'};

    # Disease IDs look like "<TYPE>:<identifier>"; split on the first colon only
    my ($disease_type, $disease_identifier) =
        defined $disease_id ? split(':', $disease_id, 2) : ();

    # Short rows or IDs without a colon cannot be assigned to a disease
    unless (defined $disease_type && defined $disease_identifier) {
        my $shown = defined $disease_id ? $disease_id : '';
        warn "Malformed or missing disease ID '$shown'. Skipping.\n";
        next;
    }

    unless (exists $diseases{$disease_type}) {
        warn "Unknown disease type '$disease_type' for ID '$disease_id'. Skipping.\n";
        next;
    }

    # First time this disease is seen: create its record
    unless (exists $diseases{$disease_type}{$disease_identifier}) {
        $diseases{$disease_type}{$disease_identifier} = initialize_entry($disease_id, $format);
    }

    add_feature($diseases{$disease_type}{$disease_identifier}, $hpo_id, $hpo_name, $format);
}

# getline_hr returns false both at end of file and on a parse error; only the
# former means the whole input was consumed.
die "Parsing of '$input_file' stopped before the end of the file.\n" unless $csv->eof;

close $fh;

# Convert disease hashes to arrays for JSON encoding.
# Hash iteration order is not stable between runs, so the records are emitted
# sorted by identifier to keep the output files reproducible.
my %output_data;
foreach my $type (sort keys %diseases) {
    my $entry_for = $diseases{$type};
    $output_data{$type} = [ map { $entry_for->{$_} } sort keys %$entry_for ];
}

# Initialize JSON encoder
my $json = JSON::XS->new->utf8->pretty->canonical;

# Write one JSON file per disease type, remembering what was written
my @written_files;
foreach my $type (sort keys %output_data) {
    my $filename = lc($type) . "." . lc($format) . ".json";
    write_json($output_data{$type}, $filename, $json);
    push @written_files, $filename;
}

print "Successfully wrote:\n", map { "$_\n" } @written_files;

# Build the empty record for one disease.
#
#   $disease_id - full disease ID; stored as the record's "id"
#   $format     - 'pxf' or 'bff'; decides where the randomly drawn sex goes
#
# For 'pxf' the record gets a "subject" hash (id, sex, vitalStatus); for 'bff'
# it gets a top-level "sex". Any other format adds neither. Returns a hash
# reference that always carries an empty "phenotypicFeatures" array.
sub initialize_entry {
    my ($disease_id, $format) = @_;

    my %record = (id => $disease_id);

    if ($format eq 'bff') {
        $record{sex} = get_random_sex_bff();
    }
    elsif ($format eq 'pxf') {
        my %subject = (
            id          => $disease_id,
            sex         => get_random_sex_pxf(),
            vitalStatus => { status => "ALIVE" },
        );
        $record{subject} = \%subject;
    }

    $record{phenotypicFeatures} = [];

    return \%record;
}

# Append one HPO term to an entry's "phenotypicFeatures", unless a feature
# with the same HPO ID is already present.
#
#   $entry    - record hash reference as built by initialize_entry
#   $hpo_id   - HPO term ID, used as the duplicate key
#   $hpo_name - HPO term label
#   $format   - 'pxf' stores the term under "type", 'bff' under "featureType";
#               any other value leaves the feature list unchanged
sub add_feature {
    my ($entry, $hpo_id, $hpo_name, $format) = @_;

    # Name of the wrapper key that holds { id, label } in each schema
    my %wrapper_for = (
        pxf => 'type',
        bff => 'featureType',
    );
    my $wrapper = $wrapper_for{$format};

    my $features = $entry->{phenotypicFeatures} ||= [];
    return unless defined $wrapper;

    # Linear scan: the list itself is the only record of what was added
    for my $known (@{$features}) {
        return if $known->{$wrapper}{id} eq $hpo_id;
    }

    return push @{$features}, { $wrapper => { id => $hpo_id, label => $hpo_name } };
}

# Draw a sex for a pxf subject: returns the string "MALE" or "FEMALE".
# Consumes exactly one rand() value, so results follow the script's srand seed.
sub get_random_sex_pxf {
    my $draw = int(rand(2));
    return $draw ? "FEMALE" : "MALE";
}

# Draw a sex for a bff record: returns a new hash reference with "id" and
# "label", either NCIT:C20197 / Male or NCIT:C16576 / Female.
# Consumes exactly one rand() value, so results follow the script's srand seed.
sub get_random_sex_bff {
    my @ncit_ids = ("NCIT:C20197", "NCIT:C16576");
    my @labels   = ("Male",        "Female");

    my $pick = int(rand(scalar @ncit_ids));

    return { id => $ncit_ids[$pick], label => $labels[$pick] };
}

# Serialize a data structure and write it to a file, replacing any existing
# content.
#
#   $data_ref     - reference to the structure to serialize
#   $filename     - path of the file to create or overwrite
#   $json_encoder - object whose encode() method returns the text to write
#
# Returns 1 on success. Dies if the file cannot be opened, written or closed;
# buffered write errors (for example a full disk) only surface at print or
# close time, so both are checked.
sub write_json {
    my ($data_ref, $filename, $json_encoder) = @_;

    open my $out_fh, '>', $filename
        or die "Could not open '$filename' for writing: $!";

    print {$out_fh} $json_encoder->encode($data_ref)
        or die "Could not write to '$filename': $!";

    close $out_fh
        or die "Could not close '$filename' after writing: $!";

    return 1;
}


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.