Yet Another CPAN Grep

Convert-Pheno/lib/Convert/Pheno/IO/CSVHandler.pm

package Convert::Pheno::IO::CSVHandler;

use strict;
use warnings;
use autodie;
use feature qw(say);
use File::Basename;
use Text::CSV_XS           qw(csv);
use Sort::Naturally        qw(nsort);
use List::Util             qw(any);
use File::Spec::Functions  qw(catdir);
use IO::Compress::Gzip     qw($GzipError);
use IO::Uncompress::Gunzip qw($GunzipError);

use Data::Dumper;
use Devel::Size qw(total_size);
use Convert::Pheno;
use Convert::Pheno::IO::FileIO;
use Convert::Pheno::OMOP::Definitions;
use Convert::Pheno::OMOP;
use Convert::Pheno::Utils::Schema;
use Convert::Pheno::Utils::Mapping;
use Exporter 'import';
our @EXPORT =
  qw(read_csv read_csv_stream read_redcap_dict_file read_mapping_file read_sqldump read_sqldump_stream sqldump2csv transpose_omop_data_structure write_csv open_filehandle load_exposures get_headers convert_table_aoh_to_hoh);

use constant DEVEL_MODE => 0;

#########################
#########################
#  SUBROUTINES FOR CSV  #
#########################
#########################

sub read_redcap_dictionary {
    my $filepath = shift;

    # Define split record separator from file extension
    my ( $separator, $encoding ) = define_separator( $filepath, undef );

    # We'll create an HoH using as 1D-key the 'Variable / Field Name'
    my $key = 'Variable / Field Name';

    # We'll be adding the key <_labels>. See sub add_labels
    my $labels = 'Choices, Calculations, OR Slider Labels';

    # Loading data directly from Text::CSV_XS
    # NB1: We want HoH and sub read_csv returns AoH
    # NB2: By default the Text::CSV module treats all fields in a CSV file as strings, regardless of their actual data type.
    # NB3: csv function ~ x2  RAM. It's ok here.
    my $hoh = csv(
        in       => $filepath,
        sep_char => $separator,

        #binary    => 1, # default
        auto_diag => 1,
        encoding  => $encoding,
        key       => $key,
        on_in     => sub { $_{_labels} = add_labels( $_{$labels} ) }
    );

    return $hoh;
}

sub add_labels {
    my $value = shift;

    # *** IMPORTANT ***
    # This sub can return undef, i.e., $_{labels} = undef
    # That's OK as we won't perform exists $_{_label}
    # Note that in $hoh (above) empty columns are  key = ''.

    # Premature return if empty ('' = 0)
    return undef unless $value;    # perlcritic Severity: 5

    # We'll skip values that don't provide even number of key-values
    my @tmp = map { s/^\s//; s/\s+$//; $_; } ( split /\||,/, $value );    # perlcritic Severity: 5

    # Return undef for non-valid entries
    return @tmp % 2 == 0 ? {@tmp} : undef;
}

sub read_redcap_dict_file {
    my $arg = shift;

    # Read and load REDCap CSV dictionary
    return read_redcap_dictionary( $arg->{redcap_dictionary} );
}

sub read_mapping_file {
    my $arg = shift;

    # Read and load mapping file
    my $data_mapping_file =
      io_yaml_or_json( { filepath => $arg->{mapping_file}, mode => 'read' } );

    # Validate mapping file against JSON schema
    my $jv = Convert::Pheno::Utils::Schema->new(
        {
            data        => $data_mapping_file,
            debug       => $arg->{self_validate_schema},
            schema_file => $arg->{schema_file}
        }
    );
    $jv->json_validate;

    # Remap for quick looukup
    remap_assignTermIdFromHeader($data_mapping_file);

    # Return if succesful
    return $data_mapping_file;
}

sub read_sqldump {
    my $arg      = shift;
    my $filepath = $arg->{in};
    my $self     = $arg->{self};

    # Before resorting to writting this subroutine I performed an exhaustive search on CPAN:
    # - Tested MySQL::Dump::Parser::XS but I could not make it work...
    # - App-MysqlUtils-0.022 has a CLI utility (mysql-sql-dump-extract-tables)
    # - Of course one can always use *nix tools (sed, grep, awk, etc) or other programming languages....
    # Anyway, I ended up writting the parser myself...

    # Define variables that modify what we load
    my $max_lines_sql = $self->{max_lines_sql};
    my @omop_tables   = @{ $self->{omop_tables} };

    #COPY "OMOP_cdm_eunomia".attribute_definition (attribute_definition_id, attribute_name, attribute_description, attribute_type_concept_id, attribute_syntax) FROM stdin;
    # ......
    # \.

    # Verbose
    say "Reading the SQL dump...\n" if $self->{verbose};

    # Start reading the SQL dump
    my $fh = open_filehandle( $filepath, 'r' );

    # Determine the print interval based on file size
    my $print_interval = get_print_interval($filepath);

    # We'll store the data in the hashref $data
    my $data = {};

    # Now we we start processing line by line
    my $switch      = 0;
    my $local_count = 0;
    my $total_count = 0;
    my @headers;
    my $headers_data_structure;
    my $table_name;

    while ( my $line = <$fh> ) {

        if ( $line =~ m/^COPY/ ) {

            chomp $line;

            # First line contains the headers
            #COPY "OMOP_cdm_eunomia".attribute_definition (attribute_definition_id, attribute_name, ..., attribute_syntax) FROM stdin;

            # Create an array to hold the column names for this table
            $line =~ s/[\(\),]//g;    # getting rid of (),
            @headers = split /\s+/, $line;

            $table_name = uc( ( split /\./, $headers[1] )[1] );    # ATTRIBUTE_DEFINITI

            # Discarding non @$omop_tables:
            # This step improves RAM consumption
            next unless any { $_ eq $table_name } @omop_tables;

            # Discarding headers which are not terms/variables
            @headers = @headers[ 2 .. $#headers - 2 ];

            # Turning on the switch for later
            $switch = 1;

            # Reset count
            $local_count = 0;

            # Initializing $data>key as empty arrayref
            $data->{$table_name} = [];

            # Loading headers
            $headers_data_structure->{$table_name} = [@headers];

            # Jump one line
            $line = <$fh>;

            # Say if verbose
            say "Loading <$table_name> in memory..."
              if $self->{verbose};

        }

        # Loading the data if $switch
        if ($switch) {

            chomp $line;

            # Order matters. We exit before loading
            if ( $local_count == $max_lines_sql || $line =~ /^\\\.$/ ) {
                $switch = 0;
                print "==============\nRows read(total): $local_count\n\n"
                  if $self->{verbose};
                next;
            }
            $local_count++;
            $total_count++;

            # Columns are separated by \t
            # NB: 'split' and 'Text::CSV' split to strings
            # We go with 'split'. Coercing a posteriori
            my @fields = split /\t/, $line;

            # Loading the fields like this:
            #
            #  $VAR1 = {
            #  'PERSON' => [  # NB: This is the table name
            #             {
            #              'person_id' => 123,
            #               'test' => 'abc'
            #             },
            #             {
            #               'person_id' => 456,
            #               'test' => 'def'
            #             }
            #           ]
            #         };

            # Using tmp hashref to load all fields at once with slice
            my $hash_slice;
            @{$hash_slice}{@headers} =
              map { dotify_and_coerce_number($_) } @fields;

            # Adding them as an array element (AoH)
            push @{ $data->{$table_name} }, $hash_slice;

            # Print
            say "Rows read: $local_count"
              if ( $self->{verbose} && $local_count % $print_interval == 0 );
        }
    }
    close $fh;

    # Print if verbose
    print
"==========================\nRows read (sqldump-total): $total_count\n==========================\n\n"
      if $self->{verbose};

    # RAM Usage
    say ram_usage_str( 'read_sqldump', $data )
      if ( DEVEL_MODE || $self->{verbose} );

    return ( $data, $headers_data_structure );
}

sub read_sqldump_stream {
    my $arg           = shift;
    my $filein        = $arg->{in};
    my $self          = $arg->{self};
    my $person        = $arg->{person};
    my $fileout       = $self->{out_file};
    my $table_name    = $self->{omop_tables}[0];
    my $table_name_lc = lc($table_name);

    # Define variables that modify what we load
    my $max_lines_sql = $self->{max_lines_sql};

    # Open filehandles
    my $fh_in  = open_filehandle( $filein,  'r' );
    my $fh_out = open_filehandle( $fileout, 'a' );

    # Determine the print interval based on file size
    my $print_interval = get_print_interval($filein);

    # Start printing the array
    #say $fh_out "[";

    # Now we we start processing line by line
    my $count  = 0;
    my $switch = 0;
    my @headers;

    while ( my $line = <$fh_in> ) {

        # Only parsing $table_name_lc and discarding others
        # Note that double quotes are optional
        # - COPY "OMOP_cdm_eunomia".person
        # . COPY omop_cdm_eunomia_2.person
        if ( $line =~ /^COPY \"?(\w+)\"?\.$table_name_lc / ) {
            chomp $line;

            # Create an array to hold the column names for this table
            $line =~ s/[\(\),]//g;    # getting rid of (),
            @headers = split /\s+/, $line;

            # Discarding headers which are not terms/variables
            @headers = @headers[ 2 .. $#headers - 2 ];

            # Turning on the switch for later
            $switch++;

            # Jump one line
            $line = <$fh_in>;

        }

        # Loading the data if $switch
        if ($switch) {

            # get rid of \n
            chomp $line;

            # Order matters. We exit before loading
            last if $line =~ /^\\\.$/;

            # Splitting by tab, it's ok
            my @fields = split /\t/, $line;

            # Using tmp hashref to load all fields at once with slice
            my $hash_slice;
            @{$hash_slice}{@headers} =
              map { dotify_and_coerce_number($_) } @fields;

            # Error related to -max-lines-sqlError related to -max-lines-sql
            die
"We could not find person_id:$hash_slice->{person_id}. Try increasing the #lines with --max-lines-sql\n"
              unless exists $person->{ $hash_slice->{person_id} };

            # Increase counter
            $count++;

            # Encode data
            my $encoded_data =
              encode_omop_stream( $table_name, $hash_slice, $person, $count,
                $self );

            # Only after encoding we are able to discard 'null'
            say $fh_out $encoded_data if $encoded_data ne 'null';

            # adhoc filter to speed-up development
            last if $count == $max_lines_sql;

            # Print if verbose
            say "Rows processed: $count"
              if ( $self->{verbose} && $count % $print_interval == 0 );
        }
    }
    say "==============\nRows processed(total): $count\n" if $self->{verbose};

    #say $fh_out "]"; # not needed

    # Closing filehandles
    close $fh_in;
    close $fh_out;
    return 1;
}

sub encode_omop_stream {
    my ( $table_name, $hash_slice, $person, $count, $self ) = @_;

    # *** IMPORTANT ***
    # Table PERSON only has 1 individual
    my $person_id = $hash_slice->{person_id};
    my $data      = {
        $table_name => [$hash_slice],
        PERSON      => $count == 1
        ? $person->{$person_id}
        : {
            map { $_ => $person->{$person_id}{$_} }
              qw(person_id gender_concept_id birth_datetime)
        }
    };

    # Obtain
    my $stream = Convert::Pheno::omop2bff_stream_processing( $self, $data );

    # Return JSON string
    #  - canonical has some overhead but needed for t/)
    #  - $fh is already utf-8, no need to encode again here
    return JSON::XS->new->canonical->encode($stream);
}

sub sqldump2csv {
    my ( $data, $dir, $sql_headers_data_structure ) = @_;

    # CSV separator (tab character)
    my $sep = "\t";

    # Natural sort flag: set to 0 (off) by default.
    my $sort = 0;

    # Iterate over each table in the data hash.
    for my $table ( keys %{$data} ) {

        # Build the file path for the CSV file.
        my $filepath = catdir( $dir, "$table.csv" );

        # Retrieve header fields for the current table.
        my $table_headers = $sql_headers_data_structure->{$table};
        die "No header data found for table '$table'"
          unless defined $table_headers && @$table_headers;

        # Determine header order: natural sort if enabled, or as stored.
        my $headers =
          $sort
          ? [ nsort( @{$table_headers} ) ]
          : $table_headers;

        # Write the CSV file using the specified separator, headers, and data.
        write_csv(
            {
                sep      => $sep,
                filepath => $filepath,
                headers  => $headers,
                data     => $data->{$table},
            }
        );
    }
    return 1;
}

sub transpose_omop_data_structure {
    my ( $self, $data ) = @_;

    # The situation is the following, $data comes in format:
    #
    #$VAR1 = {
    #          'MEASUREMENT' => [
    #                             {
    #                               'measurement_concept_id' => 1,
    #                               'person_id' => 666
    #                             },
    #                             {
    #                               'measurement_concept_id' => 2,
    #                               'person_id' => 666
    #                             }
    #                           ],
    #          'PERSON' => [
    #                        {
    #                          'person_id' => 666
    #                        },
    #                        {
    #                          'person_id' => 1
    #                        }
    #                      ]
    #        };

    # where all 'person_id' are together inside the TABLE_NAME.
    # But, BFF "ideally" works at the individual level so we are going to
    # transpose the data structure to end up into something like this
    # NB: MEASUREMENT and OBSERVATION (among others, i.e., CONDITION_OCCURRENCE, PROCEDURE_OCCURRENCE)
    #     can have multiple values for one 'person_id' so they will be loaded as arrays
    #
    #
    #$VAR1 = {
    #          '1' => {
    #                     'PERSON' => {
    #                                   'person_id' => 1
    #                                 }
    #                   },
    #          '666' => {
    #                     'MEASUREMENT' => [
    #                                        {
    #                                          'measurement_concept_id' => 1,
    #                                          'person_id' => 666
    #                                        },
    #                                        {
    #                                          'measurement_concept_id' => 2,
    #                                          'person_id' => 666
    #                                        }
    #                                      ],
    #                     'PERSON' => {
    #                                   'person_id' => 666
    #                                 }
    #                   }
    #        };

    # Debug messages
    say "> Transposing OMOP data..." if $self->{debug};
    my $omop_person_id = {};

    # Only performed for $omop_main_table
    for my $table ( @{ $omop_main_table->{$omop_version} } ) {    # global

        # We void the table when reading to avoid data duplication in RAM
        while ( my $item = shift @{ $data->{$table} } ) {         # We want to keep order (!pop)

            if ( exists $item->{person_id} && $item->{person_id} ) {
                my $person_id = $item->{person_id};

                # {person_id} can have multiple rows in @omop_array_tables
                if ( any { $_ eq $table } @omop_array_tables ) {
                    push @{ $omop_person_id->{$person_id}{$table} }, $item;    # array
                }

                # {person_id} only has one value in a given table
                else {
                    $omop_person_id->{$person_id}{$table} = $item;             # scalar
                }
            }
        }
    }

    # To get any unused memory back to Perl
    $data = undef;

    # Finally we get rid of the 'person_id' key and return values as an array
    #
    #$VAR1 = [
    #          {
    #            'PERSON' => {
    #                          'person_id' => 1
    #                        }
    #          },
    # ------------------------------------------------
    #          {
    #            'MEASUREMENT' => [
    #                               {
    #                                 'measurement_concept_id' => 1,
    #                                 'person_id' => 666
    #                               },
    #                               {
    #                                 'measurement_concept_id' => 2,
    #                                 'person_id' => 666
    #                               }
    #                             ],
    #            'PERSON' => {
    #                          'person_id' => 666
    #                        }
    #          }
    #        ];
    # NB: We nsort keys to always have the same result but it's not needed
    say "> Sorting OMOP data by <person_id>..." if $self->{debug};

    my $aoh;
    for my $key ( nsort keys %{$omop_person_id} ) {
        push @{$aoh}, $omop_person_id->{$key};
        delete $omop_person_id->{$key};    # To avoid data duplication
    }

    # RAM usage
    say ram_usage_str( 'transpose_omop_data_structure', $aoh )
      if ( DEVEL_MODE || $self->{verbose} );

    return $aoh;
}

sub read_csv {
    my $arg      = shift;
    my $filepath = $arg->{in};
    my $sep      = $arg->{sep};
    my $self     = exists $arg->{self} ? $arg->{self} : { verbose => 0 };

    # Define split record separator from file extension
    my ( $separator, $encoding ) = define_separator( $filepath, $sep );

    # *** IMPORTANT ***
    # Text::CSV_XS functional interface
    # duplicates RAM <= DEPRECATED
    #my $aoh = csv(
    #    in       => $filepath,
    #    sep_char => $separator,
    #    headers  => "auto",
    #    encoding  => $encoding,
    #    auto_diag => 1
    #);

    # Create a new Text::CSV_XS object
    my $csv = Text::CSV_XS->new(
        {
            sep_char  => $separator,
            binary    => 1,
            auto_diag => 1,
        }
    );

    # Open fh
    my $fh = open_filehandle( $filepath, 'r' );

    # Determine the print interval based on file size
    my $print_interval = get_print_interval($filepath);

    # Get headers
    my $headers = $csv->getline($fh);
    $csv->column_names(@$headers);

    # Check for too many occurrences of separators
    die
      "Are you sure you are using the right --sep <$separator> for your data?\n"
      if is_separator_incorrect($headers);

    # Load data
    my @aoh;
    my $count = 0;
    while ( my $row = $csv->getline_hr($fh) ) {
        push @aoh, $row;
        $count++;

        say "Rows read: $count"
          if ( $self->{verbose} && $count % $print_interval == 0 );

    }

    # Close fh
    close $fh;

    # Print if verbose
    print
"==========================\nRows read (total): $count\n==========================\n\n"
      if $self->{verbose};

    # Coercing the data before returning it
    for my $item (@aoh) {
        for my $key ( @{$headers} ) {
            $item->{$key} = dotify_and_coerce_number( $item->{$key} );
        }
    }

    # RAM usage
    say ram_usage_str( "read_csv($filepath)", \@aoh )
      if ( DEVEL_MODE || $self->{verbose} );

    # Return data
    return \@aoh;
}

sub is_separator_incorrect {
    my $keys           = shift;
    my $max_delimiters = 5;

    # Count the number of delimiters (comma, semicolon, or tab) in the first key
    my $sep_count = ( $keys->[0] =~ tr/,;\t// );

    # Return true (1) if the number of delimiters exceeds the maximum allowed, otherwise false (0)
    return $sep_count > $max_delimiters ? 1 : 0;
}

sub read_csv_stream {
    my $arg     = shift;
    my $filein  = $arg->{in};
    my $self    = $arg->{self};
    my $sep     = $arg->{sep};
    my $person  = $arg->{person};
    my $fileout = $self->{out_file};

    # Define split record separator
    my ( $separator, $encoding, $table_name ) =
      define_separator( $filein, $sep );
    my $table_name_lc = lc($table_name);

    # Create a new Text::CSV_XS object
    my $csv = Text::CSV_XS->new(
        {
            sep_char  => $separator,
            binary    => 1,
            auto_diag => 1,
        }
    );

    # Open filehandles
    my $fh_in  = open_filehandle( $filein,  'r' );
    my $fh_out = open_filehandle( $fileout, 'a' );

    # Get headers
    my $headers = $csv->getline($fh_in);

    # *** IMPORTANT ***
    # On Feb-19-2023 I tested Parallel::ForkManager and:
    # 1 - The performance was by far slower than w/o it
    # 2 - We hot SQLite errors for concurring fh
    # Thus, it was not implemented

    my $count = 0;

    while ( my $row = $csv->getline($fh_in) ) {

        # Load the values as a hash slice
        my %hash_slice;
        @hash_slice{@$headers} = map { dotify_and_coerce_number($_) } @$row;

        # Encode data
        my $encoded_data =
          encode_omop_stream( $table_name, \%hash_slice, $person, $count,
            $self );

        # Only after encoding we are able to discard 'null'
        say $fh_out $encoded_data if $encoded_data ne 'null';

        # Increment $count
        $count++;

        # Verbose logging every 10,000 rows
        if ( $self->{verbose} && $count % 10_000 == 0 ) {
            say "Rows processed: $count";
        }
    }
    say "==============\nRows total:     $count\n" if $self->{verbose};

    close $fh_in;
    close $fh_out;
    return 1;
}

sub write_csv {
    my $arg      = shift;
    my $sep      = $arg->{sep};
    my $data     = $arg->{data};
    my $filepath = $arg->{filepath};
    my $headers  = $arg->{headers};

    # Ensure $data is an array reference of hashes
    if ( ref $data eq 'HASH' ) {
        $data = [$data];    # Convert to an array reference containing one hash
    }

    my @exts = qw(.csv .tsv);
    my $msg =
      qq(Can't recognize <$filepath> extension. Extensions allowed are: )
      . ( join ',', @exts ) . "\n";
    my ( undef, undef, $ext ) = fileparse( $filepath, @exts );
    die $msg unless any { $_ eq $ext } @exts;

    # Use Text::CSV_XS to write to CSV, ensuring $data is always an AoH
    csv(
        in       => $data,       # This now can be an AoH or a single hash converted to AoH
        out      => $filepath,
        sep_char => $sep,
        eol      => "\n",
        encoding => 'UTF-8',
        headers  => $headers     # Ensure headers are defined or auto-detection is enabled
    );
    return 1;
}

sub open_filehandle {
    my ( $filepath, $mode ) = @_;
    my $handle = $mode eq 'a' ? '>>' : $mode eq 'w' ? '>' : '<';
    my $fh;
    if ( $filepath =~ /\.gz$/ ) {
        if ( $mode eq 'a' || $mode eq 'w' ) {
            $fh = IO::Compress::Gzip->new( $filepath,
                Append => ( $mode eq 'a' ? 1 : 0 ) );
        }
        else {
            $fh = IO::Uncompress::Gunzip->new( $filepath, MultiStream => 1 );
        }
        binmode( $fh, ":encoding(UTF-8)" );
    }
    else {
        open $fh, qq($handle:encoding(UTF-8)), $filepath;
    }
    return $fh;
}

sub define_separator {
    my ( $filepath, $sep ) = @_;

    # Define split record separator from file extension
    my @exts = map { $_, $_ . '.gz' } qw(.csv .tsv .sql .txt);
    my ( $table_name, undef, $ext ) = fileparse( $filepath, @exts );

    # Defining separator character
    my $separator =
        $sep
      ? $sep
      : $ext eq '.csv'    ? ';'     # Note we don't use comma but semicolon
      : $ext eq '.csv.gz' ? ';'     # idem
      : $ext eq '.tsv'    ? "\t"
      : $ext eq '.tsv.gz' ? "\t"
      :                     "\t";

    my $encoding =
      $ext =~ m/\.gz/ ? ':gzip:encoding(utf-8)' : 'encoding(utf-8)';

    # Return 3 but some get only 2
    return ( $separator, $encoding, $table_name );
}

sub to_gb {
    my $bytes = shift;

    # base 2 => 1,073,741,824
    my $gb = $bytes / 1_073_741_824;
    return sprintf( '%9.4f', $gb ) . ' GB';
}

sub ram_usage_str {
    my ( $func, $data ) = @_;
    return qq/***RAM Usage***($func):/ . to_gb( total_size($data) ) . "\n";
}

sub load_exposures {
    my $data = read_csv( { in => shift, sep => "\t" } );

    # We will only use the key 'concept_id' and discard the rest
    #$VAR1 = {
    #      '4138352' => 1
    #    };
    my %hash = map { $_->{concept_id} => 1 } @$data;

    # Returning hashref
    return \%hash;
}

sub get_headers {
    my $data = shift;

    # Ensure $data is an array reference, wrap it in an array if it's a hash reference.
    $data = [$data] unless ref $data eq 'ARRAY';

    # Step 1 & 2: Collect all unique keys from all hashes, ignoring hash references.
    my %all_keys;
    foreach my $row (@$data) {
        foreach my $key ( keys %$row ) {

            # Skip any key where the value is a reference (including hash references)
            # Why?
            # In pxf2csv I encountered HASH(foobarbaz) as header. This is actually
            # a deeper issue I have to investigate
            next if ref $row->{$key};
            $all_keys{$key} = ();
        }
    }

    # Step 3: Sort keys for consistency.
    my @headers = sort keys %all_keys;
    return \@headers;
}

sub remap_assignTermIdFromHeader {
    my $hash = shift;
    for my $key (%$hash) {
        if ( exists $hash->{$key}{assignTermIdFromHeader} ) {
            $hash->{$key}{assignTermIdFromHeader_hash} =
              array_ref_to_hash( $hash->{$key}{assignTermIdFromHeader} );
        }
    }
    return 1;
}

sub array_ref_to_hash {
    my $array_ref = shift;

    # Check if the input is an array reference
    die "Expected an array reference at <assignTermIdFromHeader>"
      unless ref($array_ref) eq 'ARRAY';

    my %hash;

    # Iterate over the elements of the array reference
    foreach my $element ( @{$array_ref} ) {
        $hash{$element} = 1;
    }
    return \%hash;
}

sub convert_table_aoh_to_hoh {
    my ( $data, $table, $self ) = @_;

    my %table_cursor =
      map { $_ => $data->{$_} } qw(CONCEPT PERSON VISIT_OCCURRENCE);
    my %table_id = (
        CONCEPT          => 'concept_id',
        PERSON           => 'person_id',
        VISIT_OCCURRENCE => 'visit_occurrence_id'
    );
    my $array_ref = $table_cursor{$table};
    my $id        = $table_id{$table};

    ###########
    # CONCEPT #
    ###########

    # $VAR1 = [
    #          {
    #            'concept_class_id' => '4-char billing code',
    #            'concept_code' => 'K92.2',
    #            'concept_id' => 35208414,
    #            'concept_name' => 'Gastrointestinal hemorrhage, unspecified',
    #            'domain_id' => 'Condition',
    #            'invalid_reason' => undef,
    #            'standard_concept' => undef,
    #            'valid_end_date' => '2099-12-31',
    #            'valid_start_date' => '2007-01-01',
    #            'vocabulary_id' => 'ICD10CM'
    #          },
    #
    # and we convert it to hash to allow for quick searches by 'concept_id'
    #
    # $VAR1 = {
    #          '1107830' => {
    #                         'concept_class_id' => 'Ingredient',
    #                         'concept_code' => 28889,
    #                         'concept_id' => 1107830,
    #                         'concept_name' => 'Loratadine',
    #                         'domain_id' => 'Drug',
    #                         'invalid_reason' => undef,
    #                         'standard_concept' => 'S',
    #                         'valid_end_date' => '2099-12-31',
    #                         'valid_start_date' => '1970-01-01',
    #                         'vocabulary_id' => 'RxNorm'
    #                         },
    #
    # NB: We store all columns yet we'll use 4:
    # 'concept_id', 'concept_code', 'concept_name', 'vocabulary_id'

    ####################
    # VISIT_OCCURRENCE #
    ####################

    # Going from
    #$VAR1 = [
    #        {
    #          'admitting_source_concept_id' => 0,
    #          'visit_occurrence_id' => 85,
    #          ...
    #        }
    #      ];

    # To
    #$VAR1 = {
    #        '85' => {
    #                  'admitting_source_concept_id' => 0,
    #                  'visit_occurrence_id' => 85,
    #                  ...
    #                }
    #      };

    # Initialize the new hash for transformed data
    my $hoh = {};

    # Iterate over the array and build the hash while clearing the array
    while ( my $item = pop @{$array_ref} ) {    #faster than shift (order irrelevant here)
        my $key = $item->{$id};                 # avoid stringfication
        $hoh->{$key} = $item;
    }

    # The original array @{ $self->{data}{$table} is now empty
    # RAM Usage
    say ram_usage_str( "convert_table_aoh_to_hoh($table)", $hoh )
      if ( $self->{verbose} || DEVEL_MODE );

    return $hoh;
}

sub get_print_interval {
    my $filepath = shift;

    # Determine file size
    my $file_size = -s $filepath;

    # Set print interval based on file size (threshold: 10 MB)
    my $print_interval = $file_size > 10 * 1024 * 1024 ? 10_000 : 1_000;

    return $print_interval;
}

1;