Group
Extension

Bio-Phylo-Forest-DBTree/lib/Bio/Phylo/Forest/DBTree.pm

package Bio::Phylo::Forest::DBTree;
use strict;
use warnings;
use DBI;
use Bio::Phylo::Factory;
use Bio::Phylo::Util::Exceptions 'throw';
use base 'DBIx::Class::Schema';
use base 'Bio::Phylo::Forest::Tree';

__PACKAGE__->load_namespaces;

my $SINGLETON;
my $DBH;
my $fac = Bio::Phylo::Factory->new;
use version 0.77; our $VERSION = qv("v0.1.2");

=head1 NAME

Bio::Phylo::Forest::DBTree - Phylogenetic database as a tree object

=head1 SYNOPSIS

 use Bio::Phylo::Forest::DBTree;
 
 # connect to the Green Genes tree
 my $file = 'gg_13_5_otus_99_annotated.db';
 my $dbtree = Bio::Phylo::Forest::DBTree->connect($file);

 # $dbtree can be used as a Bio::Phylo::Forest::Tree object,
 # and the node objects that are returned can be used as
 # Bio::Phylo::Forest::Node objects
 my $root = $dbtree->get_root;

=head1 DESCRIPTION

This package provides the functionality to handle very large phylogenies (examples: the
NCBI taxonomy, the Green Genes tree) as if they are L<Bio::Phylo> tree objects, with all 
the possibilities for traversal, computation, serialization, and visualization, but stored
in a SQLite database. These databases are single files, so that they can be easily shared.
Some useful database files are available here: 
https://figshare.com/account/home#/projects/18808

To make new tree databases, a number of scripts are provided with the distribution of this
package:

=over

=item * C<megatree-loader> Loads a very large Newick tree into a database.

=item * C<megatree-ncbi-loader> Loads the NCBI taxonomy dump into a database.

=item * C<megatree-phylotree-loader> Loads a tree in the format of L<http://phylotree.org>
into a database.

=back

As an example of interacting with a database tree, the script C<megatree-pruner> can be
used to extract subtrees from a database.

=head1 DATABASE METHODS

The following methods deal with the database as a whole: creating a new database, 
connecting to an existing one, persisting a tree in a database and extracting one as a
mutable, in-memory object.

=head2 create()

Creates a SQLite database file in the provided location. Usage:

  use Bio::Phylo::Forest::DBTree;
  
  # second argument is optional
  Bio::Phylo::Forest::DBTree->create( $file, '/opt/local/bin/sqlite3' );

The first argument is the location where the database file is going to be created. The
second argument is optional, and provides the location of the C<sqlite3> executable that
is used to create the database. By default, the C<sqlite3> is simply found on the 
C<$PATH>, but if it is installed in a non-standard location that location can be provided
here. The database schema that is created corresponds to the following SQL statements:

 create table node(
   id int not null,
   parent int,
   left int,
   right int,
   name varchar(20),
   length float,
   height float,
   primary key(id)
 );
 create index parent_idx on node(parent);
 create index left_idx on node(left);
 create index right_idx on node(right);
 create index name_idx on node(name);

=cut

sub create {
	my $class = shift;
	my $file  = shift;
	my $sqlite3 = shift || 'sqlite3';
	my $command = do { local $/; <DATA> };
	system("echo '$command' | sqlite3 '$file'") == 0 or die 'Create failed!';
}

=head2 connect()

Connects to a SQLite database file, returns the connection as a 
C<Bio::Phylo::Forest::DBTree> object. Usage:

 use Bio::Phylo::Forest::DBTree;
 my $dbtree = Bio::Phylo::Forest::DBTree->connect($file);

The argument is a file name. If the file exists, a L<DBD::SQLite> database handle to that
file is returned. If the file does not exist, a new database is created in that location,
and subsequently the handle to that newly created database is returned. The creation of 
the database is handled by the C<create()> method (see below).

=cut

sub connect {
	my $class = shift;
	my $file  = shift;
	if ( not $SINGLETON ) {
		
		# create if not exist
		if ( not -e $file ) {
			$class->create($file);
		}
	
		# fuck it, let's just hardcode it here - Yeehaw!
		my $dsn  = "dbi:SQLite:dbname=$file";
		$DBH = DBI->connect($dsn,'','');
		$DBH->{'RaiseError'} = 1;
		$SINGLETON = $class->SUPER::connect( sub { $DBH } );
	}
	return $SINGLETON;
}

=head2 persist()

Persist a phylogenetic tree object (a subclass of L<Bio::Phylo::Forest::Tree>) into a 
newly created database file. Usage:

  use Bio::Phylo::Forest::DBTree;  
  my $dbtree = Bio::Phylo::Forest::DBTree->persist(
      -file => $file,
      -tree => $tree,
  );

This method first create a database at the location specified by C<$file> by making a call
to the C<create()> method. Subsequently, the C<$tree> object is traversed from root to 
tips and inserted in the newly created database. Finally, the handle to this database is
returned, i.e. a C<Bio::Phylo::Forest::DBTree> object.

=cut

sub persist {
	my ( $class, %args ) = @_;
	
	# need a file argument to write to
	if ( not $args{'-file'} ) {
		throw 'BadArgs' => "Need -file argument!";
	}
	
	# need a tree argument to persis
	if ( not $args{'-tree'} ) {
		throw 'BadArgs' => "Need -tree argument!";
	}
	
	# create a new database, prepare statement handler
	$class->create( $args{'-file'} );
	my $dsn = 'dbi:SQLite:dbname=' . $args{'-file'};
	my $dbh = DBI->connect($dsn,'','');
	$dbh->{'RaiseError'} = 1;
	my $db = $class->SUPER::connect( sub { $dbh } );		
	my $sth = $dbh->prepare("insert into node values(?,?,?,?)");
	
	# start traversing
	my $counter = 2;
	my %idmap;
	$args{'-tree'}->visit_depth_first(
		'-pre' => sub {
			my $node    = shift;
			my $id      = $node->get_id;
			$idmap{$id} = $counter++;
			
			# get the parent id, or "1" if root
			my $parent_id;
			if ( my $parent = $node->get_parent ) {
				my $pid = $parent->get_id;
				$parent_id = $idmap{$pid};
			}
			else {
				$parent_id = 1;
			}
			
			# do the insertion
			$sth->execute(
				$idmap{$id},               # primary key
				$parent_id,                # self-joining foreign key
				undef,                     # not indexed yet
				undef,                     # not indexed yet
				$node->get_internal_name,  # node label or taxon name
				$node->get_branch_length,  # branch length
				undef                      # not computed yet
			);
		}
	);
	my $i = 0;
	$db->get_root->_index(\$i,0);
	return $db;
}

=head2 extract()

Extracts a tree from a database. The returned tree is an in-memory object. Hence, this is
an expensive operation that is best avoided as much as possible. Usage:

 my $tree = $dbtree->extract;

=cut

sub extract {
	my $self = shift;
	my $tree = $fac->create_tree;
	my $root = $self->get_root;
	_clone_mutable(
		$fac->create_node(
			'-name'          => $root->get_name,
			'-branch_length' => $root->get_branch_length,
		),
		$root,
		$tree
	);
	return $tree;
}

{
	no warnings 'recursion';
	sub _clone_mutable {
		my ( $parent, $template, $tree ) = @_;
		$tree->insert($parent);
		for my $child ( @{ $template->get_children } ) {
			_clone_mutable( 
				$fac->create_node(
					'-name'          => $child->get_name,
					'-branch_length' => $child->get_branch_length,
					'-parent'        => $parent,
				),
				$child,
				$tree
			);
		}
	}
}

=head2 dbh()

Returns the underlying handle through which SQL statements can be executed directly on the
database. This is a L<DBD::SQLite> object. Usage:

 my $dbh = $dbtree->dbh;

=cut

sub dbh { $DBH }

=head1 TREE METHODS

The following methods are implemented here to override methods of the same name in the
L<Bio::Phylo> hierarchy so that the tree database is accessed more efficiently than
otherwise would be the case.

=head2 get_root()

Returns the root of the tree, i.e. a L<Bio::Phylo::Forest::DBTree::Result::Node> object,
which is a subclass of L<Bio::Phylo::Forest::Node>. Usage:

 my $root = $dbtree->get_root;

=cut

sub get_root { 
	shift->_rs->search(
		{ 'parent' => 1 },
		{
			'order_by' => 'id',
			'rows'     => 1,
		}
	)->single 
}

=head2 get_id()

Returns a dummy ID, an integer. Usage:

 my $id = $dbtree->get_id;

=cut

sub get_id { 0 }

=head2 get_by_name()

Returns the first node object that has the provided name. Usage:

 my $node = $dbtree->get_by_name( 'Homo sapiens' );

=cut

sub get_by_name {
	my ( $self, $name ) = @_;
	return $self->_rs->search({ 'name' => $name })->single;
}

=head2 visit()

Given a code reference, visits all the nodes in the tree and executes the code on the 
focal node. Usage:

 $dbtree->visit(sub{
     my $node = shift;
     print $node->name, "\n"; 
 });

=cut

sub visit {
	my ( $self, $code ) = @_;
	my $rs = $self->_rs;
	while( my $node = $rs->next ) {
		$code->($node);
	}
	return $self;
}

sub _rs { shift->resultset('Node') }

1;

__DATA__
create table node(id int not null,parent int,left int,right int,name varchar(20),length float,height float,primary key(id));
create index parent_idx on node(parent);
create unique index left_idx on node(left asc);
create unique index right_idx on node(right asc);
create index name_idx on node(name);


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.