Yet Another CPAN Grep

Mashtree/lib/Mashtree.pm

#!/usr/bin/env perl
package Mashtree;
use strict;
use warnings;
use Exporter qw(import);
use File::Basename qw/fileparse basename dirname/;
use Data::Dumper;
use List::Util qw/shuffle/;
use Scalar::Util qw/looks_like_number/;

use threads;
use threads::shared;

use lib dirname($INC{"Mashtree.pm"});
use Bio::Matrix::IO;
use Bio::TreeIO;

our @EXPORT_OK = qw(
           logmsg openFastq _truncateFilename distancesToPhylip createTreeFromPhylip sortNames treeDist mashDist mashHashes raw_mash_distance raw_mash_distance_unequal_sizes
           @fastqExt @fastaExt @bamExt @vcfExt @richseqExt @mshExt
           $MASHTREE_VERSION
         );

local $0=basename $0;

=pod

=head1 NAME Mashtree

=head1 SYNOPSIS

Helps run a mashtree analysis to make rapid trees for genomes.
Please see github.com/lskatz/Mashtree for more information.

=over

=item mashtree executables

This document covers the Mashtree library, but the highlight
the mashtree package is the executable `mashtree`.
See github.com/lskatz/Mashtree for more information.

Fast method:

    mashtree --numcpus 12 *.fastq.gz [*.fasta] > mashtree.dnd

More accurate method:

    mashtree --mindepth 0 --numcpus 12 *.fastq.gz [*.fasta] > mashtree.dnd

Bootstrapping and jackknifing

    mashtree_bootstrap.pl --reps 100 --numcpus 12 *.fastq.gz -- --min-depth 0 > mashtree.jackknife.dnd
    mashtree_jackknife.pl --reps 100 --numcpus 12 *.fastq.gz -- --min-depth 0 > mashtree.jackknife.dnd

=back

=head1 VARIABLES

=over

=item $VERSION

=item $MASHTREE_VERSION (same value as $VERSION)

=item @fastqExt = qw(.fastq.gz .fastq .fq .fq.gz)

=item @fastaExt = qw(.fasta .fna .faa .mfa .fas .fsa .fa)

=item @bamExt = qw(.sorted.bam .bam)

=item @vcfExt = qw(.vcf.gz .vcf)

=item @mshExt = qw(.msh)

=item @richseqExt = qw(.gb .gbank .genbank .gbk .gbs .gbf .embl .ebl .emb .dat .swiss .sp)

=item $fhStick :shared 

Used to mark whether a file is being read, so that Mashtree limits disk I/O

=back

=cut

######
# CONSTANTS

our $VERSION = "1.4.6";
our $MASHTREE_VERSION=$VERSION;
our @fastqExt=qw(.fastq.gz .fastq .fq .fq.gz);
our @fastaExt=qw(.fasta .fna .faa .mfa .fas .fsa .fa);
our @bamExt=qw(.sorted.bam .bam);
our @vcfExt=qw(.vcf.gz .vcf);
our @mshExt=qw(.msh);
# Richseq extensions were obtained mostly from bioperl under
# the genbank, embl, and swissprot entries, under
# the source for Bio::SeqIO
our @richseqExt=qw(.gb .gbank .genbank .gbk .gbs .gbf .embl .ebl .emb .dat .swiss .sp);

# Helpful things
my $fhStick :shared;  # A thread can only open a fastq file if it has the talking stick.

=head1 METHODS

=over

=item $SIG{'__DIE__'}

Remakes how `die` works, so that it references the caller

=cut

#################################################
### COMMON SUBS/TOOLS (not object subroutines) ##
#################################################
# Redefine how scripts die
$SIG{'__DIE__'} = sub {
  local $0=basename($0);
  my $e = $_[0] || ""; 
  my $callerSub=(caller(1))[3] || (caller(0))[3] || "UnknownSub";

  $e =~ s/(at [^\s]+? line \d+\.$)/\nStopped $1/; 
  die("$0: $callerSub: $e"); 
};

=pod

=item logmsg

Prints a message to STDERR with the thread number and the program name, with a trailing newline.

=cut

# Centralized logmsg
#sub logmsg {print STDERR "$0: ".(caller(1))[3].": @_\n";}
sub logmsg {
  local $0=basename $0;
  my $parentSub=(caller(1))[3] || (caller(0))[3];
  $parentSub=~s/^main:://;

  # Find the thread ID and stringify it
  my $tid=threads->tid;
  $tid=($tid) ? "(TID$tid)" : "";

  my $msg="$0: $parentSub$tid: @_\n";

  print STDERR $msg;
}

=pod

=item openFastq

 Opens a fastq file in a thread-safe way.

=cut

sub openFastq{
  my($fastq,$settings)=@_;

  my $fh;

  lock($fhStick);

  my @fastqExt=qw(.fastq.gz .fastq .fq.gz .fq);
  my($name,$dir,$ext)=fileparse($fastq,@fastqExt);
  if($ext =~/\.gz$/){
    open($fh,"zcat $fastq | ") or die "ERROR: could not open $fastq for reading!: $!";
  } else {
    open($fh,"<",$fastq) or die "ERROR: could not open $fastq for reading!: $!";
  }
  return $fh;
}

=pod

=item _truncateFilename

 Removes fastq extension, removes directory name,

=cut

sub _truncateFilename{
  my($file,$settings)=@_;
  # strip off msh and any other known extentions
  my $name=$file;
  my $oldName="";
  # Strip until we get convergence
  while($name ne $oldName){
    $oldName = $name;
    $name = basename($name,@mshExt,@fastqExt,@richseqExt,@fastaExt);
  }
  return $name;
}

=pod

=item distancesToPhylip

1. Read the mash distances
2. Create a phylip file

Arguments: hash of distances, output directory, settings hash

=cut

sub distancesToPhylip{
  my($distances,$outdir,$settings)=@_;

  my $phylip = "$outdir/distances.phylip"; 
  # NOTE: need to regenerate the combined distances each time
  # because I need to allow variation in the input samples.
  #return $phylip if(-e $phylip);

  # The way phylip is, I need to know the genome names
  # a priori
  my %name;
  open(MASHDIST,"<",$distances) or die "ERROR: could not open $distances for reading: $!";
  while(<MASHDIST>){
    next if(/^#/);
    my($name)=split(/\t/,$_);
    $name=~s/^\s+|\s+$//g;  # whitespace trim before right-padding is added
    $name{_truncateFilename($name,$settings)}=1;
  }
  close MASHDIST;
  my @name=sortNames([keys(%name)],$settings);
  # Index the array
  my $columnIndex=0;
  for(@name){
    $name{$_}=$columnIndex++;
  }

  # Load up the matrix object
  logmsg "Reading the distances file at $distances";
  open(MASHDIST,"<",$distances) or die "ERROR: could not open $distances for reading: $!";
  my $query="UNKNOWN"; # Default ID in case anything goes wrong
  my @m;
  while(<MASHDIST>){
    chomp;
    if(/^#query\s+(.+)/){
      $query=_truncateFilename($1,$settings);
    } else {
      my ($reference,$distance)=split(/\t/,$_);
      $reference=_truncateFilename($reference,$settings);
      $distance=sprintf("%0.10f",$distance);
      $m[$name{$query}][$name{$reference}]=$distance;
      $m[$name{$reference}][$name{$query}]=$distance;
    }
  }
  close MASHDIST;
  #my $matrixObj=Bio::Matrix::Generic->new(-rownames=>\@name,-colnames=>\@name,-values=>\@m);

  # taking this method from write_matrix in http://cpansearch.perl.org/src/CJFIELDS/BioPerl-1.6.924/Bio/Matrix/IO/phylip.pm
  my $str;
  $str.=(" " x 4) . scalar(@name)."\n";
  for(my $i=0;$i<@name;$i++){
    $str.=$name[$i];
    my $count=0;
    for(my $j=0;$j<@name;$j++){
      if($count < $#name){
        $str.=$m[$i][$j]. "  ";
      } else {
        $str.=$m[$i][$j];
      }
      $count++;
    }
    $str.="\n";
  }
  open(PHYLIP,">",$phylip) or die "ERROR: could not write to $phylip: $!";
  print PHYLIP $str;
  close PHYLIP;
  return $phylip;
}

=pod

=item sortNames

Sorts names.

Arguments: 

1. $name - array of names
2. $settings - options
  * $$settings{'sort-order'} is either "abc", "random", "input-order"

=cut

sub sortNames{
  my($name,$settings)=@_;
  my @sorted;
  if($$settings{'sort-order'} =~ /^(abc|alphabet)$/){
    @sorted=sort { $a cmp $b } @$name;
  } elsif($$settings{'sort-order'}=~/^rand(om)?/){
    @sorted=shuffle(@$name);
  } elsif($$settings{'sort-order'} eq 'input-order'){
    @sorted=@$name;
  } else {
    die "ERROR: I don't understand sort-order $$settings{'sort-order'}";
  }
  return @sorted;
}

=pod

=item createTreeFromPhylip($phylip, $outdir, $settings)

 Create tree file with Quicktree but bioperl 
 as a backup.

=cut

sub createTreeFromPhylip{
  my($phylip,$outdir,$settings)=@_;

  my $treeObj;

  my $quicktreePath=`which quicktree 2>/dev/null`;
  # bioperl if there was an error with which quicktree
  if($?){
    logmsg "DEPRECATION WARNING: CANNOT FIND QUICKTREE IN YOUR PATH. I will use BioPerl to make the tree this time, but it will be removed in the next version.";
    #logmsg "Creating tree with BioPerl";
    my $dfactory = Bio::Tree::DistanceFactory->new(-method=>"NJ");
    my $matrix   = Bio::Matrix::IO->new(-format=>"phylip", -file=>$phylip)->next_matrix;
    $treeObj = $dfactory->make_tree($matrix);
    open(TREE,">","$outdir/tree.dnd") or die "ERROR: could not open $outdir/tree.dnd: $!";
    print TREE $treeObj->as_text("newick");
    print TREE "\n";
    close TREE;
  }
  # quicktree
  else {
    #logmsg "Creating tree with QuickTree";
    system("quicktree -in m $phylip > $outdir/tree.dnd.tmp");
    die "ERROR with quicktree" if $?;
    $treeObj=Bio::TreeIO->new(-file=>"$outdir/tree.dnd.tmp")->next_tree;
    open(my $treeFh, ">", "$outdir/tree.dnd") or die "ERROR: could not write to $outdir/tree.dnd: $!";
    print $treeFh $treeObj->as_text("newick")."\n";
    #my $outtree=Bio::TreeIO->new(-file=>">$outdir/tree.dnd", -format=>"newick");
    #$outtree->write_tree($treeObj);

    unlink("$outdir/tree.dnd.tmp");
  }

  return $treeObj;

}

=pod

=item treeDist($treeObj1, $treeObj2)

 Lee's implementation of a tree distance. The objective
 is to return zero if two trees are the same.

=cut

sub treeDist{
  my($treeObj1,$treeObj2)=@_;

  # If the tree objects are really strings, then make Bio::Tree::Tree objects
  if(!ref($treeObj1)){
    if(-e $treeObj1){ # if this is a file, get the contents
      $treeObj1=`cat $treeObj1`;
    }
    $treeObj1=Bio::TreeIO->new(-string=>$treeObj1)->next_tree;
  }
  if(!ref($treeObj2)){
    if(-e $treeObj2){ # if this is a file, get the contents
      $treeObj2=`cat $treeObj2`;
    }
    $treeObj2=Bio::TreeIO->new(-string=>$treeObj2)->next_tree;
  }
  for($treeObj1,$treeObj2){
    #$_->force_binary;
  }
  
  # Get all leaf nodes so that they can be compared
  my @nodes1=sort {$a->id cmp $b->id} grep{$_->is_Leaf} $treeObj1->get_nodes;
  my @nodes2=sort {$a->id cmp $b->id} grep{$_->is_Leaf} $treeObj2->get_nodes;
  my $numNodes=@nodes1;

  # Test 1: are these the same nodes?
  my $nodeString1=join(" ",map{$_->id} @nodes1);
  my $nodeString2=join(" ",map{$_->id} @nodes2);
  if($nodeString1 ne $nodeString2){
    # TODO print out the differing nodes?
    logmsg "ERROR: nodes are not the same in both trees!\n  $nodeString1\n  $nodeString2";
    return ~0; #largest int
  }

  # Find the number of branches it takes to get to each node.
  # Turn it into a Euclidean distance
  my $euclideanDistance=0;
  for(my $i=0;$i<$numNodes;$i++){
    for(my $j=$i+1;$j<$numNodes;$j++){
      my ($numBranches1,$numBranches2);

      my $lca1=$treeObj1->get_lca($nodes1[$i],$nodes1[$j]);
      my $lca2=$treeObj2->get_lca($nodes2[$i],$nodes2[$j]);
      
      # Distance in tree1
      my $distance1=0;
      my @ancestory1=reverse $treeObj1->get_lineage_nodes($nodes1[$i]);
      my @ancestory2=reverse $treeObj1->get_lineage_nodes($nodes1[$j]);
      for my $currentNode(@ancestory1){
        $distance1++;
        last if($currentNode eq $lca1);
      }
      for my $currentNode(@ancestory2){
        $distance1++;
        last if($currentNode eq $lca1);
      }
      
      # Distance in tree2
      my $distance2=0;
      my @ancestory3=reverse $treeObj2->get_lineage_nodes($nodes2[$i]);
      my @ancestory4=reverse $treeObj2->get_lineage_nodes($nodes2[$j]);
      for my $currentNode(@ancestory3){
        $distance2++;
        last if($currentNode eq $lca2);
      }
      for my $currentNode(@ancestory4){
        $distance2++;
        last if($currentNode eq $lca2);
      }

      if($distance1 != $distance2){
        logmsg "These two nodes do not have the same distance between trees: ".$nodes1[$i]->id." and ".$nodes1[$j]->id;
      }

      # Add up the Euclidean distance
      $euclideanDistance+=($distance1 - $distance2) ** 2;
    }
  }
  $euclideanDistance=sqrt($euclideanDistance);
  return $euclideanDistance;
}

=pod

=item mashDist($file1, $file2, $k, $settings)

Find the distance between two mash sketch files
Alternatively: two hash lists.

=cut

sub mashDist{
  my($file1, $file2, $k, $settings)=@_;

  my($hashes1, $hashes2, $kmer1, $kmer2);
  if(ref($file1) eq 'ARRAY'){
    $hashes1 = $file1;
    $kmer1 = -1;
  } else {
    ($hashes1, $kmer1) = mashHashes($file1);
  }
  if(ref($file2) eq 'ARRAY'){
    $hashes2 = $file2;
    $kmer2 = -1;
  } else {
    ($hashes2, $kmer2) = mashHashes($file2);
  }

  if($kmer1 ne $kmer2){
    die "ERROR: kmer lengths do not match($kmer1 vs $kmer2)";
  }

  # Set the default kmer length and perform sanity check
  $k ||= $kmer1;
  if(!looks_like_number($k)){
    die "ERROR: k was not set to an integer";
  }
  if($k < 1){
    die "ERROR: k was undefined or set to less than 1";
  }

  my($common, $total) = (0,0);
  if(scalar(@$hashes1) != scalar(@$hashes2)){
    ($common, $total) = raw_mash_distance_unequal_sizes($hashes1, $hashes2);
  } else {
    ($common, $total) = raw_mash_distance($hashes1, $hashes2);
  }
  my $jaccard = $common/$total;
  my $mash_distance = -1/$k * log(2*$jaccard / (1+$jaccard));
  #logmsg "========== $mash_distance = -1/$k * log(2*$jaccard / (1+$jaccard)) ==============";

  return $mash_distance;
}

=pod

=item mashHashes($sketch)

Return an array of hashes, the kmer length, and the genome estimated length

=cut

sub mashHashes{
  my($sketch)=@_;
  my @hash;
  my $length = 0;
  my $kmer   = 0;

  if(!-e $sketch){
    die "ERROR: file not found: $sketch";
  }

  my $fh;
  if($sketch =~ /\.msh$/){
    open($fh, "mash info -d $sketch | ") or die "ERROR: could not run mash info -d on $sketch: $!";
  } elsif($sketch =~ /\.json$/){
    open($fh, $sketch) or die "ERROR: could not read $sketch: $!";
  }
  while(<$fh>){
    if(/kmer\D+(\d+)/){
      $kmer = $1;
    }
    elsif(/length\D+(\d+)/){
      $length = $1;
    }
    elsif(/hashes/){
      while(<$fh>){
        last if(/\]/);
        next if(!/\d/);
        s/\D+//g;
        s/^\s+|\s+$//g;
        push(@hash, $_);
      }
    }
  }
  if(!@hash){
    die "ERROR: no hashes found in $sketch";
  }
  return (\@hash, $kmer, $length);
}

=pod

=item raw_mash_distance_unequal_sizes($hashes1, $hashes2)

Compare unequal sized hashes. Treat the first
set of hashes as the reference (denominator)
set.

=cut

sub raw_mash_distance_unequal_sizes{
  my($hashes1, $hashes2) = @_;

  my (%sketch1,%sketch2);
  @sketch1{@$hashes1} = (1) x scalar(@$hashes1);
  @sketch2{@$hashes2} = (1) x scalar(@$hashes2);

  my %union;
  for my $h(@$hashes1){
    if($sketch2{$h}){
      $union{$h}++;
    }
  }

  my $common = scalar(keys(%union));
  my $total  = scalar(@$hashes1);

  return($common,$total);
}

=pod

=item raw_mash_distance($hashes1, $hashes2)

Return the number of kmers in common and the number compared total. inspiration from
https://github.com/onecodex/finch-rs/blob/master/src/distance.rs#L34

=cut

sub raw_mash_distance{
  my($hashes1, $hashes2) = @_;

  my @sketch1 = sort {$a <=> $b} @$hashes1;
  my @sketch2 = sort {$a <=> $b} @$hashes2;

  my $i      = 0;
  my $j      = 0;
  my $common = 0;
  my $total  = 0;

  my $sketch_size = @sketch1;
  while($total < $sketch_size && $i < @sketch1 && $j < @sketch2){
    my $ltgt = ($sketch1[$i] <=> $sketch2[$j]); # -1 if sketch1 is less than, +1 if sketch1 is greater than

    if($ltgt == -1){
      $i += 1;
    } elsif($ltgt == 1){
      $j += 1;
    } elsif($ltgt==0) {
      $i += 1;
      $j += 1;
      $common += 1;
    } else {
      die "Internal error";
    }

    $total += 1;
  }

  if($total < $sketch_size){
    if($i < @sketch1){
      $total += @sketch1 - 1;
    }

    if($j < @sketch2){
      $total += @sketch2 - 1;
    }

    if($total > $sketch_size){
      $total = $sketch_size;
    }
  }

  return ($common, $total);
}

# Calculates the Transfer Bootstrap Expectation (TBE) for internal nodes based on
# the methods outlined in Lemoine et al, Nature, 2018.
# Currently experimental.
# I entered this sub into bioperl > v1.7.5 but it is worth
# having locally here to help maintain compatibility.
# The only difference is that it isn't an object method
# and that it is called without an OO implementation.

=item transfer_bootstrap_expectation

 Title   : transfer_bootstrap_expectation
 Usage   : my $tree_with_bs = transfer_bootstrap_expectation(\@bs_trees,$guide_tree);
 Function: Calculates the Transfer Bootstrap Expectation (TBE) for internal nodes based on 
           the methods outlined in Lemoine et al, Nature, 2018.
           Currently experimental.
 Returns : L<Bio::Tree::TreeI>
 Args    : Arrayref of L<Bio::Tree::TreeI>s
           Guide tree, L<Bio::Tree::TreeI>s

=back

=cut

sub transfer_bootstrap_expectation{
  my ($bs_trees,$guide_tree) = @_;

  if(!defined($bs_trees) || ref($bs_trees) ne 'ARRAY'){
    die "ERROR: second parameter in assess_bootstrap() must be a list";
  }
  my $num_bs_trees = scalar(@$bs_trees);
  if($num_bs_trees < 1){
    die "ERROR: no bootstrap trees were passed to ".(caller(0))[3];
  }

  # internal nodes are defined by their children
  my %internal = ();
  my %leafNameId = ();
  my @idLookup = ();
  my @internalLookup = ();
  my @tree = ($guide_tree, @$bs_trees);
  my $numTrees = scalar(@tree);
  for(my $i = 0; $i < $numTrees; $i++){ # guide tree's index is $i==0
    # Do this as a top down approach, can probably be
    # improved by caching internal node states, but not going
    # to worry about it right now.

    my @allnodes = $tree[$i]->get_nodes;
    my @internalnodes = grep { ! $_->is_Leaf } @allnodes;
    for my $node ( @internalnodes ) {
      my @tips = sort map { $_->id } 
                      grep { $_->is_Leaf() } $node->get_all_Descendents;
      my $id = join(",", @tips);
      # Map the concatenated-leaf ID to the internal ID on the guide tree
      if( $i == 0 ) {
        $internal{$id} = $node->internal_id;
        $leafNameId{$node->internal_id} = $id;
      }

      # Record the tips for each tree's internal node
      # ID lookup (concatenated string of leaf names)
      $idLookup[$i]{$id} = \@tips;
      # Internal ID lookup
      $internalLookup[$i]{$internal{$id}} = \@tips;
    }
  }

  # Find the average distance from branch b to all
  # bootstrap trees' branches b*
  my @id = sort keys %internal;
  my $numIds = @id;
  # Loop through all internal nodes of the guide tree
  for(my $j=0; $j<$numIds; $j++){
    my $refNode = $guide_tree->find_node(-internal_id => $internal{$id[$j]});
    my $refNodeId = $refNode->internal_id;
    my $refJoinId = $leafNameId{$refNodeId};
    my $refLeaves = $idLookup[0]{$refJoinId};
    my %refLeafIndex = map{$_=>1} @$refLeaves;
    #next if(!defined($refLeaves));

    # For each internal node, start calculating for
    # an average TBE distance.
    my $nodeTbeTotal = 0;
  
    # Loop through all bootstrap trees, skipping the 0th
    # tree which is the guide tree.
    for(my $i=1;$i<$numTrees;$i++){

      # Find the right branch to bootstrap with. The right
      # branch will be the one that has the smallest
      # TBE distance.
      my @bsNode   = grep {!$_->is_Leaf} $tree[$i]->get_nodes;
      my $numBsIds = scalar(@bsNode);
      my $minDistance = ~0; # large int
      for(my $k=0;$k<$numBsIds;$k++){
        my @queryLeaves = sort map { $_->id }
                      grep { $_->is_Leaf() } $bsNode[$k]->get_all_Descendents;

        my %queryLeafIndex = map{$_=>1} @queryLeaves;

        # How many moves does it take to go from query to ref?
        my $dist=0;
        for my $queryLeaf(@queryLeaves){
          if(!$refLeafIndex{$queryLeaf}){
            $dist++;
          }
        }
        for my $refLeaf(@$refLeaves){
          if(!$queryLeafIndex{$refLeaf}){
            $dist++;
          }
        }

        if($dist < $minDistance){
          $minDistance = $dist;
        }
      }
      $nodeTbeTotal += $minDistance;
    }
    my $avgTbe = $nodeTbeTotal / $numTrees;
    
    # Calculate the average of all b to b* distances
    # But it is also 1 - average.
    my $numRefLeaves = scalar(@$refLeaves);
    my $nodeTbe = 1 - $avgTbe/$numRefLeaves;
    # Round to an integer
    $refNode->bootstrap(sprintf("%0.0f",100 * $nodeTbe));
  }
   
  return $guide_tree;
}

1;


__END__