Group
Extension

Bio-Phylo/lib/Bio/Phylo/Matrices/MatrixRole.pm

package Bio::Phylo::Matrices::MatrixRole;
use strict;
use warnings;
use Data::Dumper;
use base qw'Bio::Phylo::Matrices::TypeSafeData Bio::Phylo::Taxa::TaxaLinker';
use Bio::Phylo::Models::Substitution::Binary;
use Bio::Phylo::Models::Substitution::Dna;
use Bio::Phylo::Util::OptionalInterface 'Bio::Align::AlignI';
use Bio::Phylo::Util::CONSTANT qw':objecttypes /looks_like/';
use Bio::Phylo::Util::Exceptions qw'throw';
use Bio::Phylo::NeXML::Writable;
use Bio::Phylo::Matrices::Datum;
use Bio::Phylo::IO qw(parse unparse);
use Bio::Phylo::Factory;
my $LOADED_WRAPPERS = 0;
{
    my $CONSTANT_TYPE      = _MATRIX_;
    my $CONSTANT_CONTAINER = _MATRICES_;
    my $logger             = __PACKAGE__->get_logger;
    my $factory            = Bio::Phylo::Factory->new;

=head1 NAME

Bio::Phylo::Matrices::MatrixRole - Extra behaviours for a character state matrix

=head1 SYNOPSIS

 use Bio::Phylo::Factory;
 my $fac = Bio::Phylo::Factory->new;

 # instantiate taxa object
 my $taxa = $fac->create_taxa;
 for ( 'Homo sapiens', 'Pan paniscus', 'Pan troglodytes' ) {
     $taxa->insert( $fac->create_taxon( '-name' => $_ ) );
 }

 # instantiate matrix object, 'standard' data type. All categorical
 # data types follow semantics like this, though with different
 # symbols in lookup table and matrix
 my $standard_matrix = $fac->create_matrix(
     '-type'   => 'STANDARD',
     '-taxa'   => $taxa,
     '-lookup' => {
         '-' => [],
         '0' => [ '0' ],
         '1' => [ '1' ],
         '?' => [ '0', '1' ],
     },
     '-labels' => [ 'Opposable big toes', 'Opposable thumbs', 'Not a pygmy' ],
     '-matrix' => [
         [ 'Homo sapiens'    => '0', '1', '1' ],
         [ 'Pan paniscus'    => '1', '1', '0' ],
         [ 'Pan troglodytes' => '1', '1', '1' ],
     ],
 );

 # note: complicated constructor for mixed data!
 my $mixed_matrix = Bio::Phylo::Matrices::Matrix->new(

    # if you want to create 'mixed', value for '-type' is array ref...
    '-type' =>  [

        # ...with first field 'mixed'...
        'mixed',

        # ...second field is an array ref...
        [

            # ...with _ordered_ key/value pairs...
            'dna'      => 10, # value is length of type range
            'standard' => 10, # value is length of type range

            # ... or, more complicated, value is a hash ref...
            'rna'      => {
                '-length' => 10, # value is length of type range

                # ...value for '-args' is an array ref with args
                # as can be passed to 'unmixed' datatype constructors,
                # for example, here we modify the lookup table for
                # rna to allow both 'U' (default) and 'T'
                '-args'   => [
                    '-lookup' => {
                        'A' => [ 'A'                     ],
                        'C' => [ 'C'                     ],
                        'G' => [ 'G'                     ],
                        'U' => [ 'U'                     ],
                        'T' => [ 'T'                     ],
                        'M' => [ 'A', 'C'                ],
                        'R' => [ 'A', 'G'                ],
                        'S' => [ 'C', 'G'                ],
                        'W' => [ 'A', 'U', 'T'           ],
                        'Y' => [ 'C', 'U', 'T'           ],
                        'K' => [ 'G', 'U', 'T'           ],
                        'V' => [ 'A', 'C', 'G'           ],
                        'H' => [ 'A', 'C', 'U', 'T'      ],
                        'D' => [ 'A', 'G', 'U', 'T'      ],
                        'B' => [ 'C', 'G', 'U', 'T'      ],
                        'X' => [ 'G', 'A', 'U', 'T', 'C' ],
                        'N' => [ 'G', 'A', 'U', 'T', 'C' ],
                    },
                ],
            },
        ],
    ],
 );

 # prints 'mixed(Dna:1-10, Standard:11-20, Rna:21-30)'
 print $mixed_matrix->get_type;

=head1 DESCRIPTION

This module defines a container object that holds
L<Bio::Phylo::Matrices::Datum> objects. The matrix
object inherits from L<Bio::Phylo::Listable>, so the
methods defined there apply here.

=head1 METHODS

=head2 CONSTRUCTOR

=over

=item new()

Matrix constructor.

 Type    : Constructor
 Title   : new
 Usage   : my $matrix = Bio::Phylo::Matrices::Matrix->new;
 Function: Instantiates a Bio::Phylo::Matrices::Matrix
           object.
 Returns : A Bio::Phylo::Matrices::Matrix object.
 Args    : -type   => optional, but if used must be FIRST argument,
                      defines datatype, one of dna|rna|protein|
                      continuous|standard|restriction|[ mixed => [] ]

           -taxa   => optional, link to taxa object
           -lookup => character state lookup hash ref
           -labels => array ref of character labels
           -matrix => two-dimensional array, first element of every
                      row is label, subsequent are characters

=cut

    sub new : Constructor {

        # could be child class
        my $class = shift;

        # notify user
        $logger->info("constructor called for '$class'");
        if ( not $LOADED_WRAPPERS ) {
            eval do { local $/; <DATA> };
            die $@ if $@;
            $LOADED_WRAPPERS++;
        }

        # go up inheritance tree, eventually get an ID
        my $self = $class->SUPER::new(
            '-characters' => $factory->create_characters,
            '-listener'   => \&_update_characters,
            @_
        );
        return $self;
    }

=item new_from_bioperl()

Matrix constructor from Bio::Align::AlignI argument.

 Type    : Constructor
 Title   : new_from_bioperl
 Usage   : my $matrix =
           Bio::Phylo::Matrices::Matrix->new_from_bioperl(
               $aln
           );
 Function: Instantiates a
           Bio::Phylo::Matrices::Matrix object.
 Returns : A Bio::Phylo::Matrices::Matrix object.
 Args    : An alignment that implements Bio::Align::AlignI

=cut

    sub new_from_bioperl {
        my ( $class, $aln, @args ) = @_;
        if ( looks_like_instance( $aln, 'Bio::Align::AlignI' ) ) {
            $aln->unmatch;
            $aln->map_chars( '\.', '-' );
            my @seqs = $aln->each_seq;
            my ( $type, $missing, $gap, $matchchar );
            if ( $seqs[0] ) {
                $type =
                     $seqs[0]->alphabet
                  || $seqs[0]->_guess_alphabet
                  || 'dna';
            }
            else {
                $type = 'dna';
            }
            my $self = $factory->create_matrix(
                '-type'            => $type,
                '-special_symbols' => {
                    '-missing'   => $aln->missing_char || '?',
                    '-matchchar' => $aln->match_char   || '.',
                    '-gap'       => $aln->gap_char     || '-',
                },
                @args
            );

			# XXX create raw getter/setter pairs for annotation,
			# accession, consensus_meta source
            for my $field ( qw(description accession id annotation consensus_meta score source) ) {
                $self->$field( $aln->$field );
            }
            my $to = $self->get_type_object;
            for my $seq (@seqs) {
                my $datum = Bio::Phylo::Matrices::Datum->new_from_bioperl( $seq,
                    '-type_object' => $to );
                $self->insert($datum);
            }
            return $self;
        }
        else {
            throw 'ObjectMismatch' => 'Not a bioperl alignment!';
        }
    }

=back

=head2 MUTATORS

=over

=item set_special_symbols

Sets three special symbols in one call

 Type    : Mutator
 Title   : set_special_symbols
 Usage   : $matrix->set_special_symbols(
 		       -missing   => '?',
 		       -gap       => '-',
 		       -matchchar => '.'
 		   );
 Function: Assigns state labels.
 Returns : $self
 Args    : Three args (with distinct $x, $y and $z):
  		       -missing   => $x,
 		       -gap       => $y,
 		       -matchchar => $z
 Notes   : This method is here to ensure
           you don't accidentally use the
           same symbol for missing AND gap

=cut

    sub set_special_symbols {
        my $self = shift;
        my %args;
        if (   ( @_ == 1 && ref( $_[0] ) eq 'HASH' && ( %args = %{ $_[0] } ) )
            || ( %args = looks_like_hash @_ ) )
        {
            if (   !defined $args{'-missing'}
                || !defined $args{'-gap'}
                || !defined $args{'-matchchar'} )
            {
                throw 'BadArgs' =>
'Need -missing => $x, -gap => $y, -matchchar => $z arguments, not '
                  . "@_";
            }
            my %values = map { $_ => 1 } values %args;
            my @values = keys %values;
            if ( scalar @values < 3 ) {
                throw 'BadArgs' => 'Symbols must be distinct, not '
                  . join( ', ', values %args );
            }
            my %old_special_symbols = (
                ( $self->get_missing   || '?' ) => 'set_missing',
                ( $self->get_gap       || '-' ) => 'set_gap',
                ( $self->get_matchchar || '.' ) => 'set_matchchar',
            );
            my %new_special_symbols = (
                $args{'-missing'}   => 'set_missing',
                $args{'-gap'}       => 'set_gap',
                $args{'-matchchar'} => 'set_matchchar',
            );
            my %dummies;
            while (%new_special_symbols) {
                for my $sym ( keys %new_special_symbols ) {
                    if ( not $old_special_symbols{$sym} ) {
                        my $method = $new_special_symbols{$sym};
                        $self->$method($sym);
                        delete $new_special_symbols{$sym};
                    }
                    elsif ( $old_special_symbols{$sym} eq
                        $new_special_symbols{$sym} )
                    {
                        delete $new_special_symbols{$sym};
                    }
                    else {
                      DUMMY: for my $dummy (qw(! @ $ % ^ & *)) {
                            if (   !$new_special_symbols{$dummy}
                                && !$old_special_symbols{$dummy}
                                && !$dummies{$dummy} )
                            {
                                my $method = $old_special_symbols{$sym};
                                $self->$method($dummy);
                                $dummies{$dummy} = 1;
                                delete $old_special_symbols{$sym};
                                $old_special_symbols{$dummy} = $method;
                                last DUMMY;
                            }
                        }
                    }
                }
            }
        }
        return $self;
    }

=item set_charlabels()

Sets argument character labels.

 Type    : Mutator
 Title   : set_charlabels
 Usage   : $matrix->set_charlabels( [ 'char1', 'char2', 'char3' ] );
 Function: Assigns character labels.
 Returns : $self
 Args    : ARRAY, or nothing (to reset);

=cut

    sub set_charlabels {
        my ( $self, $charlabels ) = @_;

        # it's an array ref, but what about its contents?
        if ( looks_like_instance( $charlabels, 'ARRAY' ) ) {
            for my $label ( @{$charlabels} ) {
                if ( ref $label ) {
                    throw 'BadArgs' =>
                      "charlabels must be an array ref of scalars";
                }
            }
        }

        # it's defined but not an array ref
        elsif ( defined $charlabels
            && !looks_like_instance( $charlabels, 'ARRAY' ) )
        {
            throw 'BadArgs' => "charlabels must be an array ref of scalars";
        }

		# there might be more labels than currently existing characters.
		# here we add however more are needed
		my $characters = $self->get_characters;
		my $missing = scalar(@{$charlabels}) - scalar(@{$characters->get_entities});
		for my $i ( 1 .. $missing ) {
			$characters->insert($factory->create_character);
		}

        # it's either a valid array ref, or nothing, i.e. a reset
        my @characters = @{ $characters->get_entities };
        for my $i ( 0 .. $#{$charlabels} ) {
            $characters[$i]->set_name( $charlabels->[$i] );
        }
        return $self;
    }

=item set_raw()

Set contents using two-dimensional array argument.

 Type    : Mutator
 Title   : set_raw
 Usage   : $matrix->set_raw( [ [ 'taxon1' => 'acgt' ], [ 'taxon2' => 'acgt' ] ] );
 Function: Syntax sugar to define $matrix data contents.
 Returns : $self
 Args    : A two-dimensional array; first dimension contains matrix rows,
           second dimension contains taxon name / character string pair.

=cut

    sub set_raw {
        my ( $self, $raw ) = @_;
        if ( defined $raw ) {
            if ( looks_like_instance( $raw, 'ARRAY' ) ) {
                my @rows;
                for my $row ( @{$raw} ) {
                    if ( defined $row ) {
                        if ( looks_like_instance( $row, 'ARRAY' ) ) {
                            my $matrixrow = $factory->create_datum(
                                '-type_object' => $self->get_type_object,
                                '-name'        => $row->[0],
                                '-char' => join( ' ', @$row[ 1 .. $#{$row} ] ),
                            );
                            push @rows, $matrixrow;
                        }
                        else {
                            throw 'BadArgs' =>
                              "Raw matrix row must be an array reference";
                        }
                    }
                }
                $self->clear;
                $self->insert($_) for @rows;
            }
            else {
                throw 'BadArgs' => "Raw matrix must be an array reference";
            }
        }
        return $self;
    }

=back

=head2 ACCESSORS

=over

=item get_special_symbols()

Retrieves hash ref for missing, gap and matchchar symbols

 Type    : Accessor
 Title   : get_special_symbols
 Usage   : my %syms = %{ $matrix->get_special_symbols };
 Function: Retrieves special symbols
 Returns : HASH ref, e.g. { -missing => '?', -gap => '-', -matchchar => '.' }
 Args    : None.

=cut

    sub get_special_symbols {
        my $self = shift;
        return {
            '-missing'   => $self->get_missing,
            '-matchchar' => $self->get_matchchar,
            '-gap'       => $self->get_gap
        };
    }

=item get_charlabels()

Retrieves character labels.

 Type    : Accessor
 Title   : get_charlabels
 Usage   : my @charlabels = @{ $matrix->get_charlabels };
 Function: Retrieves character labels.
 Returns : ARRAY
 Args    : None.

=cut

    sub get_charlabels {
        my $self = shift;
        my @labels =
          map { $_->get_name } @{ $self->get_characters->get_entities };
        return \@labels;
    }

=item get_nchar()

Calculates number of characters.

 Type    : Accessor
 Title   : get_nchar
 Usage   : my $nchar = $matrix->get_nchar;
 Function: Calculates number of characters (columns) in matrix (if the matrix
           is non-rectangular, returns the length of the longest row).
 Returns : INT
 Args    : none

=cut

    sub get_nchar {
        my $self  = shift;
        my $nchar = 0;
        for my $row ( @{ $self->get_entities } ) {
            my $offset    = ( $row->get_position || 1 ) - 1;
            my $rowlength = scalar @{ $row->get_entities };
            $rowlength += $offset;
            $nchar = $rowlength if $rowlength > $nchar;
        }
        return $nchar;
    }

=item get_ntax()

Calculates number of taxa (rows) in matrix.

 Type    : Accessor
 Title   : get_ntax
 Usage   : my $ntax = $matrix->get_ntax;
 Function: Calculates number of taxa (rows) in matrix
 Returns : INT
 Args    : none

=cut

    sub get_ntax { scalar @{ shift->get_entities } }

=item get_raw()

Retrieves a 'raw' (two-dimensional array) representation of the matrix's contents.

 Type    : Accessor
 Title   : get_raw
 Usage   : my $rawmatrix = $matrix->get_raw;
 Function: Retrieves a 'raw' (two-dimensional array) representation
           of the matrix's contents.
 Returns : A two-dimensional array; first dimension contains matrix rows,
           second dimension contains taxon name and characters.
 Args    : NONE

=cut

    sub get_raw {
        my $self = shift;
        my @raw;
        for my $row ( @{ $self->get_entities } ) {
            my @row;
            push @row, $row->get_name;
            my @char = $row->get_char;
            push @row, @char;
            push @raw, \@row;
        }
        return \@raw;
    }

=item get_ungapped_columns()

 Type    : Accessor
 Title   : get_ungapped_columns
 Usage   : my @ungapped = @{ $matrix->get_ungapped_columns };
 Function: Retrieves the zero-based column indices of columns without gaps
 Returns : An array reference with zero or more indices (i.e. integers)
 Args    : NONE

=cut

    sub get_ungapped_columns {
    	my $self  = shift;
		my $raw   = $self->get_raw;
		my $nchar = $self->get_nchar;
		my @ungapped;
		my $gap = $self->get_gap;
		for my $i ( 1 .. $nchar ) {
			my %seen;
			for my $row ( @$raw ) {
				my $c = $row->[$i];
				$seen{$c}++;
			}
			push @ungapped, $i - 1 unless $seen{$gap};
		}
		return \@ungapped;
    }

=item get_invariant_columns()

 Type    : Accessor
 Title   : get_invariant_columns
 Usage   : my @invariant = @{ $matrix->get_invariant_columns };
 Function: Retrieves the zero-based column indices of invariant columns
 Returns : An array reference with zero or more indices (i.e. integers)
 Args    : Optional:
           -gap     => if true, counts the gap symbol (probably '-') as a variant
           -missing => if true, counts the missing symbol (probably '?') as a variant

=cut

    sub get_invariant_columns {
    	my ( $self, %args ) = @_;
    	my $raw   = $self->get_raw;
    	my $nchar = $self->get_nchar;
    	my $gap   = $self->get_gap;
    	my $miss  = $self->get_missing;
    	my @invariant;
    	for my $i ( 1 .. $nchar ) {
    		my %seen;
    		for my $row ( @$raw ) {
    			my $c = uc $row->[$i];
    			$seen{$c}++;
    		}
    		my @states = keys %seen;
    		@states = grep { $_ ne $gap  } @states unless $args{'-gap'};
    		@states = grep { $_ ne $miss } @states unless $args{'-missing'};
    		push @invariant, $i - 1 if @states and @states == 1;
    	}
    	return \@invariant;
    }


=back

=head2 CALCULATIONS

=over

=item calc_indel_sizes()

Calculates size distribution of insertions or deletions

 Type    : Calculation
 Title   : calc_indel_sizes
 Usage   : my %sizes = %{ $matrix->calc_indel_sizes };
 Function: Calculates the size distribution of indels.
 Returns : HASH
 Args    : Optional:
           -trim       => if true, disregards indels at start and end
           -insertions => if true, counts insertions, if false, counts deletions

=cut

    sub calc_indel_sizes {
    	my ($self,%args) = @_;
    	my $ntax  = $self->get_ntax;
    	my $nchar = $self->get_nchar;
    	my $raw   = $self->get_raw;
    	my $gap   = $self->get_gap;
    	my @indels;

    	# iterate over rows
    	for my $row ( @$raw ) {
    		my $previous;
    		my @row_indels;

    		# iterate over columns
    		for my $i ( 1 .. $nchar ) {

    			# focal cell is a gap
    			if ( $row->[$i] eq $gap ) {
					if ( $previous ) {

						# we're extending an indel
						if ( $previous eq $gap ) {
							$row_indels[-1]->{'end'} = $i;
						}

						# we're starting an indel
						else {
							push @row_indels, { 'start' => $i };
						}
					}

					# sequence starts with an indel
					else {
						push @row_indels, { 'start' => $i };
					}
    			}
				else {
					# gap of length 1 is closed: start==end
					if ( $previous and $previous eq $gap ){
						$row_indels[-1]->{'end'} = $i;
					}
				}
    			$previous = $row->[$i];
    		}

    		# flatten the lists
    		push @indels, @row_indels;
    	}

    	# remove tops and tails
    	if ( $args{'-trim'} ) {
    		@indels = grep { $_->{'start'} > 1 } @indels;
    		@indels = grep { $_->{'end'} < $nchar } @indels;
    	}

    	# count sizes
    	my %sizes;
    	for my $i ( 1 .. $nchar ) {
    		my @starting_here = grep { $_->{'start'} == $i } @indels;
    		if ( @starting_here ) {
    			for my $j ( $i + 1 .. $nchar ) {
    				my @ending_here = grep { $_->{'end'} == $j } @starting_here;
    				if ( @ending_here ) {
    					my $size = $j - $i;

    					# more taxa have an indel here than don't. crudely, we
    					# say this is probably an insertion
    					if ( scalar(@ending_here) > ( $ntax / 2 ) ) {
    						$sizes{$size}++ if $args{'-insertions'};
    					}
    					# it's a deletion
    					else {
    						$sizes{$size}++ if not $args{'-insertions'};
    					}
    				}
    			}
    		}
    	}
    	return \%sizes;
    }


=item calc_prop_invar()

Calculates proportion of invariant sites.

 Type    : Calculation
 Title   : calc_prop_invar
 Usage   : my $pinvar = $matrix->calc_prop_invar;
 Function: Calculates proportion of invariant sites.
 Returns : Scalar: a number
 Args    : Optional:
           # if true, counts missing (usually the '?' symbol) as a state
	       # in the final tallies. Otherwise, missing states are ignored
           -missing => 1
           # if true, counts gaps (usually the '-' symbol) as a state
	       # in the final tallies. Otherwise, gap states are ignored
	       -gap => 1

=cut

    sub calc_prop_invar {
        my $self  = shift;
        my $raw   = $self->get_raw;
        my $ntax  = $self->get_ntax;
        my $nchar = $self->get_nchar || 1;
        my %args  = looks_like_hash @_;
        my @symbols_to_ignore;
        for my $sym (qw(missing gap)) {
            if ( not exists $args{"-${sym}"} ) {
                my $method = "get_${sym}";
                push @symbols_to_ignore, $self->$method;
            }
        }
        my $invar_count;
        for my $i ( 1 .. $nchar ) {
            my %seen;
            for my $j ( 0 .. ( $ntax - 1 ) ) {
                $seen{ $raw->[$j]->[$i] }++;
            }
            delete @seen{@symbols_to_ignore};
            my @symbols_in_column = keys %seen;
            $invar_count++ if scalar(@symbols_in_column) <= 1;
        }
        return $invar_count / $nchar;
    }

=item calc_state_counts()

Calculates occurrences of states.

 Type    : Calculation
 Title   : calc_state_counts
 Usage   : my %counts = %{ $matrix->calc_state_counts };
 Function: Calculates occurrences of states.
 Returns : Hashref: keys are states, values are counts
 Args    : Optional - one or more states to focus on

=cut

    sub calc_state_counts {
        my $self = shift;
        my %totals;
        for my $row ( @{ $self->get_entities } ) {
            my $counts = $row->calc_state_counts(@_);
            for my $state ( keys %{$counts} ) {
                $totals{$state} += $counts->{$state};
            }
        }
        return \%totals;
    }

=item calc_state_frequencies()

Calculates the frequencies of the states observed in the matrix.

 Type    : Calculation
 Title   : calc_state_frequencies
 Usage   : my %freq = %{ $object->calc_state_frequencies() };
 Function: Calculates state frequencies
 Returns : A hash, keys are state symbols, values are frequencies
 Args    : Optional:
           # if true, counts missing (usually the '?' symbol) as a state
	       # in the final tallies. Otherwise, missing states are ignored
           -missing => 1
           # if true, counts gaps (usually the '-' symbol) as a state
	       # in the final tallies. Otherwise, gap states are ignored
	       -gap => 1
 Comments: Throws exception if matrix holds continuous values

=cut

    sub calc_state_frequencies {
        my $self = shift;
        my %result;
        for my $row ( @{ $self->get_entities } ) {
            my $freqs = $row->calc_state_frequencies(@_);
            for my $state ( keys %{$freqs} ) {
                $result{$state} += $freqs->{$state};
            }
        }
        my $total = 0;
        $total += $_ for values %result;
        if ( $total > 0 ) {
            for my $state ( keys %result ) {
                $result{$state} /= $total;
            }
        }
        return \%result;
    }

=item calc_distinct_site_patterns()

Identifies the distinct distributions of states for all characters and
counts their occurrences. Returns an array-of-arrays, where the first cell
of each inner array holds the occurrence count, the second cell holds the
pattern, i.e. an array of states. For example, for a matrix like this:

 taxon1 GTGTGTGTGTGTGTGTGTGTGTG
 taxon2 AGAGAGAGAGAGAGAGAGAGAGA
 taxon3 TCTCTCTCTCTCTCTCTCTCTCT
 taxon4 TCTCTCTCTCTCTCTCTCTCTCT
 taxon5 AAAAAAAAAAAAAAAAAAAAAAA
 taxon6 CGCGCGCGCGCGCGCGCGCGCGC
 taxon7 AAAAAAAAAAAAAAAAAAAAAAA

The following data structure will be returned:

 [
	[ 12, [ 'G', 'A', 'T', 'T', 'A', 'C', 'A' ] ],
	[ 11, [ 'T', 'G', 'C', 'C', 'A', 'G', 'A' ] ]
 ]

The patterns are sorted from most to least frequently occurring, the states
for each pattern are in the order of the rows in the matrix. (In other words,
the original matrix can more or less be reconstructed by inverting the patterns,
and multiplying them by their occurrence, although the order of the columns
will be lost.)

 Type    : Calculation
 Title   : calc_distinct_site_patterns
 Usage   : my $patterns = $object->calc_distinct_site_patterns;
 Function: Calculates distinct site patterns.
 Returns : A multidimensional array, see above.
 Args    : NONE
 Comments:

=cut

    sub calc_distinct_site_patterns {
        my ( $self, $indices ) = @_;
        my $raw   = $self->get_raw;
        my $nchar = $self->get_nchar;
        my $ntax  = $self->get_ntax;
        my ( %pattern, %index );
        for my $i ( 1 .. $nchar ) {
            my @column;
            for my $j ( 0 .. ( $ntax - 1 ) ) {
                push @column, $raw->[$j]->[$i];
            }
            my $col_pattern = join ' ', @column;
            $pattern{$col_pattern}++;
            $index{$col_pattern} = [] if not $index{$col_pattern};
            push @{ $index{$col_pattern} }, $i - 1;
        }
        my @pattern_array;
        for my $key ( keys %pattern ) {
            my @column = split / /, $key;
            if ( $indices ) {
            	push @pattern_array, [ $pattern{$key}, \@column, $index{$key} ];
            }
            else {
            	push @pattern_array, [ $pattern{$key}, \@column ];
            }
        }
        return [ sort { $b->[0] <=> $a->[0] } @pattern_array ];
    }

=item calc_gc_content()

Calculates the G+C content as a fraction on the total

 Type    : Calculation
 Title   : calc_gc_content
 Usage   : my $fraction = $obj->calc_gc_content;
 Function: Calculates G+C content
 Returns : A number between 0 and 1 (inclusive)
 Args    : Optional:
           # if true, counts missing (usually the '?' symbol) as a state
	       # in the final tallies. Otherwise, missing states are ignored
           -missing => 1
           # if true, counts gaps (usually the '-' symbol) as a state
	       # in the final tallies. Otherwise, gap states are ignored
	       -gap => 1
 Comments: Throws 'BadArgs' exception if matrix holds anything other than DNA
           or RNA. The calculation also takes the IUPAC symbol S (which is C|G)
	       into account, but no other symbols (such as V, for A|C|G);

=cut

    sub calc_gc_content {
        my $self = shift;
        my $type = $self->get_type;
        if ( $type !~ /^(?:d|r)na/i ) {
            throw 'BadArgs' => "Matrix doesn't contain nucleotides";
        }
        my $freq  = $self->calc_state_frequencies;
        my $total = 0;
        for (qw(c C g G s S)) {
            $total += $freq->{$_} if exists $freq->{$_};
        }
        return $total;
    }

=item calc_median_sequence()

Calculates the median character sequence of the matrix

 Type    : Calculation
 Title   : calc_median_sequence
 Usage   : my $seq = $obj->calc_median_sequence;
 Function: Calculates median sequence
 Returns : Array in list context, string in scalar context
 Args    : Optional:
           -ambig   => if true, uses ambiguity codes to summarize equally frequent
                       states for a given character. Otherwise picks a random one.
           -missing => if true, keeps the missing symbol (probably '?') if this
                       is the most frequent for a given character. Otherwise strips it.
           -gaps    => if true, keeps the gap symbol (probably '-') if this is the most
                       frequent for a given character. Otherwise strips it.
 Comments: The intent of this method is to provide a crude approximation of the most
           commonly occurring sequences in an alignment, for example as a starting
           sequence for a sequence simulator. This gives you something to work with if
           ancestral sequence calculation is too computationally intensive and/or not
           really necessary.

=cut

    sub calc_median_sequence {
    	my ( $self, %args ) = @_;
    	my $to = $self->get_type_object;
    	my $type = $self->get_type;
    	if ( $type =~ /continuous/i ) {
			throw 'BadArgs' => 'No median sequence calculation for continuous data (yet)';
    	}
    	else {
    		my $raw   = $self->get_raw;
    		my $ntax  = $self->get_ntax;
    		my $nchar = $self->get_nchar;
    		my $gap   = $self->get_gap;
    		my $miss  = $self->get_missing;
    		my @seq;
    		for my $i ( 1 .. $nchar ) {
    			my %seen;
    			for my $row ( @$raw ) {
    				my $c = uc $row->[$i];
    				$seen{$c}++;
    			}
    			my @sorted = sort { $seen{$b} <=> $seen{$a} } keys %seen;
    			my $max = $seen{$sorted[0]};
    			my @top;
    			my $j = 0;
    			while ( $sorted[$j] and $seen{$sorted[$j]} == $max ) {
                                push @top, $sorted[$j];
                                $j++;
   			}
    			if ( @top == 1 ) {
    				push @seq, @top;
    			}
    			else {
    				if ( $args{'-ambig'} ) {
    					push @seq, $to->get_symbol_for_states(@top);
    				}
    				else {
    					push @seq, shift @top;
    				}
    			}
    		}
    		if ( not $args{'-gaps'} ) {
    			@seq = grep { $_ ne $gap } @seq;
    		}
    		if ( not $args{'-missing'} ) {
    			@seq = grep { $_ ne $miss } @seq;
    		}
    		return wantarray ? @seq : join '', @seq;
    	}
    }

=back

=head2 METHODS

=over

=item keep_chars()

Creates a cloned matrix that only keeps the characters at
the supplied (zero-based) indices.

 Type    : Utility method
 Title   : keep_chars
 Usage   : my $clone = $object->keep_chars([6,3,4,1]);
 Function: Creates spliced clone.
 Returns : A spliced clone of the invocant.
 Args    : Required, an array ref of integers
 Comments: The columns are retained in the order in
           which they were supplied.

=cut

    sub keep_chars {
        my ( $self, $indices_array_ref ) = @_;
        my @indices = @{$indices_array_ref};
        my $clone   = $self->clone;
        for my $seq ( @{ $clone->get_entities } ) {
            $seq->keep_entities( \@indices );
            my @anno = @{ $seq->get_annotations };
            if (@anno) {
                my @re_anno = @anno[@indices];
                $seq->set_annotations(@re_anno);
            }
        }
		$clone->get_characters->keep_entities(\@indices);
        return $clone;
    }

=item prune_chars()

Creates a cloned matrix that omits the characters at
the supplied (zero-based) indices.

 Type    : Utility method
 Title   : prune_chars
 Usage   : my $clone = $object->prune_chars([6,3,4,1]);
 Function: Creates spliced clone.
 Returns : A spliced clone of the invocant.
 Args    : Required, an array ref of integers
 Comments: The columns are retained in the order in
           which they were supplied.

=cut

    sub prune_chars {
        my ( $self, $indices ) = @_;
        my $nchar = $self->get_nchar;
        my %indices = map { $_ => 1 } @{$indices};
        my @keep;
        for my $i ( 0 .. ( $nchar - 1 ) ) {
            push @keep, $i if not exists $indices{$i};
        }
        return $self->keep_chars( \@keep );
    }

=item prune_invariant()

Creates a cloned matrix that omits the characters for which all taxa
have the same state (or missing);

 Type    : Utility method
 Title   : prune_invariant
 Usage   : my $clone = $object->prune_invariant;
 Function: Creates spliced clone.
 Returns : A spliced clone of the invocant.
 Args    : None
 Comments: The columns are retained in the order in
           which they were supplied.

=cut

	sub prune_invariant {
		my $self = shift;
		my $nchar = $self->get_nchar;
		my $missing = $self->get_missing;
		my @indices;
		for my $i ( 0 .. ( $nchar - 1 ) ) {
			my %seen;
			ROW: for my $row ( @{ $self->get_entities } ) {
				my $state = $row->get_by_index($i);
				next ROW if $state eq $missing;
				$seen{ $state } = 1;
			}
			my @states = keys %seen;
			push @indices, $i if scalar(@states) <= 1;
		}
		return $self->prune_chars(\@indices);
	}

=item prune_uninformative()

Creates a cloned matrix that omits all uninformative characters. Uninformative
are considered characters where all non-missing values are either invariant
or autapomorphies.

 Type    : Utility method
 Title   : prune_uninformative
 Usage   : my $clone = $object->prune_uninformative;
 Function: Creates spliced clone.
 Returns : A spliced clone of the invocant.
 Args    : None
 Comments: The columns are retained in the order in
           which they were supplied.

=cut

	sub prune_uninformative {
		my $self = shift;
		my $nchar = $self->get_nchar;
		my $ntax = $self->get_ntax;
		my $missing = $self->get_missing;
		my @indices;
		for my $i ( 0 .. ( $nchar - 1 ) ) {
			my %seen;
			for my $row ( @{ $self->get_entities } ) {
				my $state = $row->get_by_index($i);
				$seen{ $state }++;
			}
			my @states = keys %seen;
			push @indices, $i if scalar(@states) <= 1;
			if ( scalar(@states) > 1 ) {
				my $seen_informative;
				my $non_missing = $ntax - ( $seen{$missing} || 0 );
				my @informative_maybe;
				for my $state ( @states ) {
					if ( $seen{$state} == 1 ) {
						$non_missing--;
					}
					else {
						push @informative_maybe, $state;
					}
				}
				for my $state ( @informative_maybe ) {
					$seen_informative++ if $seen{$state} < $non_missing;
				}
				push @indices, $i unless $seen_informative;
			}
		}
		return $self->prune_chars(\@indices);
	}

=item prune_missing_and_gaps()

Creates a cloned matrix that omits all characters for which the invocant only
has missing and/or gap states.

 Type    : Utility method
 Title   : prune_missing_and_gaps
 Usage   : my $clone = $object->prune_missing_and_gaps;
 Function: Creates spliced clone.
 Returns : A spliced clone of the invocant.
 Args    : None
 Comments: The columns are retained in the order in
           which they were supplied.

=cut

	sub prune_missing_and_gaps {
		my $self = shift;
		my $nchar = $self->get_nchar;
		my $to = $self->get_type_object;
		my $m = $to->get_missing;
		my $g = $to->get_gap;
		my @indices;
		for my $i ( 0 .. ( $nchar - 1 ) ) {
			my %col;
			$self->visit(sub{
				my $row = shift;
				my $state = $row->get_by_index($i);
				if ( $state ne $m and $state ne $g ) {
					$col{$state} = 1;
				}
				push @indices, $i if not keys %col;
			});
		}
		return $self->prune_chars(\@indices);
	}

=item bootstrap()

Creates bootstrapped clone.

 Type    : Utility method
 Title   : bootstrap
 Usage   : my $bootstrap = $object->bootstrap;
 Function: Creates bootstrapped clone.
 Returns : A bootstrapped clone of the invocant.
 Args    : Optional, a subroutine reference that returns a random
           integer between 0 (inclusive) and the argument provided
           to it (exclusive). The default implementation is to use
           sub { int( rand( shift ) ) }, a user might override this
           by providing an implementation with a better random number
           generator.
 Comments: The bootstrapping algorithm uses perl's random number
           generator to create a new series of indices (without
           replacement) of the same length as the original matrix.
           These indices are first sorted, then applied to the
           cloned sequences. Annotations (if present) stay connected
           to the resampled cells.

=cut

    sub bootstrap {
        my $self  = shift;
        my $gen   = shift || sub { int( rand(shift) ) };
        my $nchar = $self->get_nchar;
        my @indices;
        push @indices, $gen->($nchar) for ( 1 .. $nchar );
        @indices = sort { $a <=> $b } @indices;
        return $self->keep_chars( \@indices );
    }

=item jackknife()

Creates jackknifed clone.

 Type    : Utility method
 Title   : jackknife
 Usage   : my $bootstrap = $object->jackknife(0.5);
 Function: Creates jackknifed clone.
 Returns : A jackknifed clone of the invocant.
 Args    : * Required, a number between 0 and 1, representing the
             fraction of characters to jackknife.
           * Optional, a subroutine reference that returns a random
             integer between 0 (inclusive) and the argument provided
             to it (exclusive). The default implementation is to use
             sub { int( rand( shift ) ) }, a user might override this
             by providing an implementation with a better random number
             generator.
 Comments: The jackknife algorithm uses perl's random number
           generator to create a new series of indices of cells to keep.
           These indices are first sorted, then applied to the
           cloned sequences. Annotations (if present) stay connected
           to the resampled cells.

=cut

    sub jackknife {
        my ( $self, $prop ) = @_;
        if ( not looks_like_number $prop or $prop >= 1 or $prop < 0 ) {
            throw 'BadNumber' =>
              "Jackknifing proportion must be a number between 0 and 1";
        }
        my $gen = $_[2] || sub { int( rand(shift) ) };
        my $nchar = $self->get_nchar;
        my ( %indices, @indices );
        while ( scalar keys %indices < ( $nchar - int( $nchar * $prop ) ) ) {
            $indices{ $gen->($nchar) } = 1;
        }
        @indices = sort { $a <=> $b } keys %indices;
        return $self->keep_chars( \@indices );
    }

=item replicate()

Creates simulated replicate.

 Type    : Utility method
 Title   : replicate
 Usage   : my $replicate = $matrix->replicate($tree);
 Function: Creates simulated replicate.
 Returns : A simulated replicate of the invocant.
 Args    : Tree to simulate the characters on.
           Optional:
           -seed           => a random integer seed
           -model          => an object of class Bio::Phylo::Models::Substitution::Dna or 
	                      Bio::Phylo::Models::Substitution::Binary
           -random_rootseq => start DNA sequence simulation from random ancestral sequence 
		              instead of the  median sequence in the alignment. 

 Comments: Requires Statistics::R, with 'ape', 'phylosim', 'phangorn' and 'phytools'.
           If model is not given as argument, it will be estimated.

=cut

    sub replicate {
    	my ($self,%args) = @_;

	my $tree = $args{'-tree'};
    	if ( not looks_like_object $tree, _TREE_ ) {
    		throw 'BadArgs' => "Need tree as argument";
    	}

    	my $type = $self->get_type;
    	if ( $type =~ /dna/i ) {
    		return $self->_replicate_dna(
		        '-tree'  => $tree, 
			'-model' => $args{'-model'}, 
			'-seed'  => $args{'-seed'}, 
			'-random_rootseq' => $args{'-random_rootseq'}
		);
    	}
    	elsif ( $type =~ /standard/i ) {
    		return $self->_replicate_binary(
			'-tree'  => $tree, 
			'-model' => $args{'-model'}, 
			'-seed'  => $args{'-seed'}
		);
    	}
    	else {
    		throw 'BadArgs' => "Can't replicate $type matrices (yet?)";
    	}
    }
	
    sub _replicate_dna {
    	my ($self,%args) = @_;
		
		my $seed = $args{'-seed'};
		my $tree = $args{'-tree'};
		my $model = $args{'-model'};
		my $random_rootseq = $args{'-random_rootseq'};
		
		if ( scalar @{ $self->get_entities } < 3 ) {
			$logger->warn("Cannot replicate a matrix with less than three elements.");
			return;
		}

    	# we will need 'ape', 'phylosim' (and 'phangorn' for model testing)
    	if ( looks_like_class 'Statistics::R' ) {

			# instantiate R
			my $R = Statistics::R->new;
			$R->run(qq[set.seed($seed)]) if $seed;
			$R->run(q[options(device=NULL)]);
			$R->run(q[require('ape')]);
			$R->run(q[phylosim <- require('phylosim')]);
			$R->run(q[PSIM_FAST<-TRUE]);
			# check if phylosim (and therefore ape) is installed
			if ( $R->get(q[phylosim]) eq 'FALSE' ) {
				$logger->warn('R package phylosim must be installed to replicate alignment.');
				return;
			}

			# pass in the tree, scale it so that its length sums to 1.
			# in combination with a gamma function and/or invariant sites
			# this should give us sequences that are reasonably realistic:
			# not overly divergent.
			my $newick = $tree->to_newick;
			$R->run(qq[phylo <- read.tree(text="$newick")]);
			$R->run(q[tree <- PhyloSim(phylo)]);
			$R->run(q[scaleTree(tree,1/tree$treeLength)]);
			$R->run(q[t <- tree$phylo]);

			# run the model test if model not given as argument
			if ( ! $model ) {
				$logger->info("no model given as argument, determining model with phangorn's modelTest");
				$model = 'Bio::Phylo::Models::Substitution::Dna'->modeltest( 
					'-matrix' => $self, 
					'-tree'   => $tree 
				);
			}
			# prepare data for processes
			my @ungapped   = @{ $self->get_ungapped_columns };
			my @invariant  = @{ $self->get_invariant_columns };
			my %deletions  = %{ $self->calc_indel_sizes( '-trim' => 0 ) };
			my %insertions = %{ $self->calc_indel_sizes( '-trim' => 0, '-insertions' => 1 ) };

			# set ancestral (root) sequence 
			my $ancestral;
			my @alphabet = ('A', 'T', 'C', 'G');				
			if ( $random_rootseq ) {
				# start from random sequence if requested
				$logger->debug('simulating from random ancestral sequence');
				$ancestral .= $alphabet[ rand @alphabet ] for 1..$self->length;
			}
			else {
				$logger->debug('simulating from median ancestral sequence');
				# start from median sequence
				$ancestral = $self->calc_median_sequence;
				# phylosim does not accept ambiguity characters, replace with random nucleotides
				$ancestral =~ s/[^ATCG]/$alphabet[rand @alphabet]/g;
			}
			$logger->debug("ancestral sequence for simulation : $ancestral");
			$R->run(qq[root.seq=NucleotideSequence(string='$ancestral')]);

			my $m = ref($model);
			if ( $m=~/([^\:\:]+$)/ ) {
					$m = $1;
			}
			# mapping between model names (can differ between Bio::Phylo and phylosim)
			my %models = ('JC69'=>'JC69', 'GTR'=>'GTR', 'F81'=>'F81', 'HKY85'=>'HKY', 'K80'=>'K80');
			my $type = $models{$m} || 'GTR';

			# collect model specific parameters in string passed to R
			my $model_params = '';
			if ( $type =~ /(?:F81|GTR|K80|HKY)/ ) {
					$logger->debug("setting base frequencies for substitution model");
					$model_params .= 'base.freqs=c(' . join(',', @{$model->get_pi}) .  ')';
			}
			if ( $type =~ /GTR/ ) {
					$logger->debug("setting rate params for GTR model");
					# (watch out for different column order in phangorn's and phylosim's Q matrix!)
					my $a = $model->get_rate('C', 'T');
					my $b = $model->get_rate('A', 'T');
					my $c = $model->get_rate('G', 'T');
					my $d = $model->get_rate('A', 'C');
					my $e = $model->get_rate('G', 'C');
					my $f = $model->get_rate('G', 'A');
					$model_params .= ", rate.params=list(a=$a, b=$b, c=$c, d=$d, e=$e, f=$f)";
			}
			if ( $type =~ /(?:K80|HKY)/) {
					$logger->debug("setting kappa parameter for substitution model");
					my $kappa = $model->get_kappa;
					# get transition and transversion rates from kappa,
					#  scale with number of nucleotides to obtain similar Q matrices
					my $alpha = $kappa * 4;
					my $beta = 4;
					$model_params .= ", rate.params=list(Alpha=$alpha, Beta=$beta)";
			}
			# create model for phylosim
			$R->run(qq[model <- $type($model_params)]);
				    # set parameters for indels
			if ( keys %deletions ) {
					# deletions
					my @del_sizes = keys %deletions;
					my $del_total = 0;
					$del_total += $_ for values (%deletions);
					my @del_probs = map {$_/$del_total} values %deletions;
					my $size_str = 'c(' . join(',', @del_sizes) . ')';
					my $prob_str = 'c(' . join(',', @del_probs) . ')';
					my $rate = scalar(keys %deletions) / $self->get_nchar;
					$logger->debug("Setting deletion rate to $rate");
					$R->run(qq[attachProcess(root.seq, DiscreteDeletor(rate=$rate, sizes=$size_str, probs=$prob_str))]);
			}
			if ( keys %insertions ) {
					# insertions
					my @ins_sizes = keys %insertions;
					my $ins_total = 0;
					$ins_total += $_ for values (%insertions);
					my @ins_probs = map {$_/$ins_total} values %insertions;
					my $size_str = 'c(' . join(',', @ins_sizes) . ')';
					my $prob_str = 'c(' . join(',', @ins_probs) . ')';
					my $maxsize = (sort { $b <=> $a } keys(%insertions))[0];
					my $rate = scalar(keys %insertions) / $self->get_nchar;
					$logger->debug("Setting insertion rate to $rate");
					$R->run(qq[i <- DiscreteInsertor(rate=$rate, sizes=$size_str, probs=$prob_str)]);
					$R->run(qq[template <- NucleotideSequence(length=$maxsize,processes=list(list(model)))]);
					$R->run(q[i$templateSeq <- template]);
					$R->run(qq[attachProcess(root.seq, i)]);
			}

			# specify model that evolves root sequence
			$R->run(q[attachProcess(root.seq, model)]);

			# set invariant sites
			if ( scalar @invariant ) {
					my $pinvar = $model->get_pinvar || scalar(@invariant)/$self->get_nchar;
					if ( $pinvar == 1){
						my $epsilon = 0.01;
						$pinvar -= $epsilon;
					}
					# set a high value for gamma, then we approximate the empirical number of invariant sites
					$R->run(qq[plusInvGamma(root.seq,model,pinv=$pinvar,shape=1e10)]);
			}

			# run the simulation
			$R->run(q[sim <- PhyloSim(root.seq=root.seq, phylo=t)]);
			$R->run(q[Simulate(sim)]);

			# get alignment as Fasta string
			my $aln = $R->get(q[paste('>', t$tip.label, '\n', apply(sim$alignment, 1, paste, collapse='')[t$tip.label], '\n', collapse='', sep='')]);
			$aln =~ s/\\n/\n/g;

			# create matrix
			my $project = parse(
					'-string'     => $aln,
					'-format'     => 'fasta',
					'-type'       => 'dna',
					'-as_project' => 1,
			    );

			my ($matrix) = @{ $project->get_items(_MATRIX_) };

			return $matrix;
    	}
    }

    sub _replicate_binary {
	my ($self,%args) = @_;

	my $seed = $args{'-seed'};
	my $tree = $args{'-tree'};

    	# we will need both 'ape' and 'phytools'
    	if ( looks_like_class 'Statistics::R' ) {
    		my $pattern = $self->calc_distinct_site_patterns('with_indices_of_patterns');
    		my $characters = $self->get_characters;
    		$logger->info("matrix has ".scalar(@$pattern)." distinct patterns");

			# instantiate R
			my $newick = $tree->to_newick;
			my $R = Statistics::R->new;
			$R->run(qq[set.seed($seed)]) if $seed;
			$R->run(q[library("ape")]);
			$R->run(q[library("phytools")]);
			$R->run(qq[phylo <- read.tree(text="$newick")]);

			# start the matrix
			my @matrix;
			push @matrix, [] for 1 .. $self->get_ntax;

    		# iterate over distinct site patterns
    		for my $p ( @$pattern ) {
    			my $i = $p->[2]->[0];
    			my %pat = map { $_ => 1 } @{ $p->[1] };
    			my $states;

    			# if this column is completely invariant, the model
    			# estimator is unhappy, so we just copy it over
    			if ( keys(%pat) == 1 ) {
    				$logger->info("column is invariant, copying verbatim");
    				$states = $p->[1];
    			}
    			else {
				       my $model = $args{'-model'};
				       if ( ! $model ) {
						$logger->info("going to test model for column(s) $i..*");
						$model = Bio::Phylo::Models::Substitution::Binary->modeltest(
							'-tree'   => $tree,
							'-matrix' => $self,
							'-char'   => $characters->get_by_index($i),
						    );
					}
					# pass model to R
					my ( $fw, $rev ) = ( $model->get_forward, $model->get_reverse );

					# add epsilon if one of the rates is zero, otherwise sim.history chokes
					$fw = 1e-6 if $fw == 0;
					$rev = 1e-6 if $rev == 0;
					$logger->info("0 -> 1 ($fw), 1 -> 0 ($rev)");

					my $rates = [ 0, $fw, $rev, 0 ];
					$R->set( 'rates' => $rates );
					$R->run(qq[Q<-matrix(rates,2,2,byrow=TRUE)]);
					$R->run(qq[rownames(Q)<-colnames(Q)<-c("0","1")]);
					$R->run(qq[diag(Q)<--rowSums(Q)]);

					# simulate character on tree, get states
					$R->run(qq[tt<-sim.history(phylo,Q,message=FALSE)]);
					$R->run(qq[states<-as.double(getStates(tt,"tips"))]);
					$states = $R->get(q[states]);
				}

				# add states to matrix
				my @indices = @{ $p->[2] };
				for my $row ( 0 .. $#{ $states } ) {
					my $value = $states->[$row];
					for my $col ( @indices ) {
						$matrix[$row]->[$col] = $value;
					}
				}
    		}

    		# create matrix
    		my $i = 0;
    		$tree->visit_depth_first(
    			'-pre' => sub {
    				my $node = shift;
    				if ( $node->is_terminal ) {
    					unshift @{ $matrix[$i++] }, $node->get_name
    				}
    			}
    		);
    		$logger->info(Dumper(\@matrix));
    		return $factory->create_matrix(
    			'-type' => $self->get_type,
    			'-raw'  => \@matrix,
    		);
    	}
    }

=item insert()

Insert argument in invocant.

 Type    : Listable method
 Title   : insert
 Usage   : $matrix->insert($datum);
 Function: Inserts $datum in $matrix.
 Returns : Modified object
 Args    : A datum object
 Comments: This method re-implements the method by the same
           name in Bio::Phylo::Listable

=cut

    sub insert {
        my ( $self, $obj ) = @_;
        my $obj_container;
        eval { $obj_container = $obj->_container };
        if ( $@ || $obj_container != $self->_type ) {
            throw 'ObjectMismatch' => 'object not a datum object!';
        }
        $logger->info("inserting '$obj' in '$self'");
        if ( !$self->get_type_object->is_same( $obj->get_type_object ) ) {
            throw 'ObjectMismatch' => 'object is of wrong data type';
        }
        my $taxon1 = $obj->get_taxon;
        my $mname = $self->get_name || $self->get_internal_name;
        for my $ents ( @{ $self->get_entities } ) {
            if ( $obj->get_id == $ents->get_id ) {
                throw 'ObjectMismatch' => 'row already inserted';
            }
            if ($taxon1) {
		my $tname = $taxon1->get_name;
                my $taxon2 = $ents->get_taxon;
                if ( $taxon2 && $taxon1->get_id == $taxon2->get_id ) {
                	my $tmpl = 'Note: a row linking to %s already exists in matrix %s';
                	$logger->info(sprintf $tmpl,$tname,$mname);
                }
            }
        }
        $self->SUPER::insert($obj);
        return $self;
    }

=begin comment

Validates the object's contents.

 Type    : Method
 Title   : validate
 Usage   : $obj->validate
 Function: Validates the object's contents
 Returns : True or throws Bio::Phylo::Util::Exceptions::InvalidData
 Args    : None
 Comments: This method implements the interface method by the same
           name in Bio::Phylo::Matrices::TypeSafeData

=end comment

=cut

    sub _validate {
        shift->visit( sub { shift->validate } );
    }

=item compress_lookup()

Removes unused states from lookup table

 Type    : Method
 Title   : validate
 Usage   : $obj->compress_lookup
 Function: Removes unused states from lookup table
 Returns : $self
 Args    : None

=cut

    sub compress_lookup {
        my $self   = shift;
        my $to     = $self->get_type_object;
        my $lookup = $to->get_lookup;
        my %seen;
        for my $row ( @{ $self->get_entities } ) {
            my @char = $row->get_char;
            $seen{$_}++ for (@char);
        }
        for my $state ( keys %{$lookup} ) {
            if ( not exists $seen{$state} ) {
                delete $lookup->{$state};
            }
        }
        $to->set_lookup($lookup);
        return $self;
    }

=item check_taxa()

Validates taxa associations.

 Type    : Method
 Title   : check_taxa
 Usage   : $obj->check_taxa
 Function: Validates relation between matrix and taxa block
 Returns : Modified object
 Args    : None
 Comments: This method implements the interface method by the same
           name in Bio::Phylo::Taxa::TaxaLinker

=cut

    sub check_taxa {
        my $self = shift;

        # is linked to taxa
        if ( my $taxa = $self->get_taxa ) {
            my %taxa =
              map { $_->get_internal_name => $_ } @{ $taxa->get_entities };
          ROW_CHECK: for my $row ( @{ $self->get_entities } ) {
                if ( my $taxon = $row->get_taxon ) {
                    next ROW_CHECK if exists $taxa{ $taxon->get_name };
                }
                my $name = $row->get_name;
                if ( exists $taxa{$name} ) {
                    $row->set_taxon( $taxa{$name} );
                }
                else {
                    my $taxon = $factory->create_taxon( -name => $name );
                    $taxa{$name} = $taxon;
                    $taxa->insert($taxon);
                    $row->set_taxon($taxon);
                }
            }
        }

        # not linked
        else {
            for my $row ( @{ $self->get_entities } ) {
                $row->set_taxon();
            }
        }
        return $self;
    }

=item make_taxa()

Creates a taxa block from the objects contents if none exists yet.

 Type    : Method
 Title   : make_taxa
 Usage   : my $taxa = $obj->make_taxa
 Function: Creates a taxa block from the objects contents if none exists yet.
 Returns : $taxa
 Args    : NONE

=cut

    sub make_taxa {
        my $self = shift;
        if ( my $taxa = $self->get_taxa ) {
            return $taxa;
        }
        else {
            my %taxa;
            my $taxa = $factory->create_taxa;
            for my $row ( @{ $self->get_entities } ) {
                my $name = $row->get_internal_name;
                if ( not $taxa{$name} ) {
                    $taxa{$name} = $factory->create_taxon( '-name' => $name );
                }
            }
			if ( keys %taxa ) {
				$taxa->insert( map { $taxa{$_} } sort { $a cmp $b } keys %taxa );
			}
            $self->set_taxa($taxa);
            return $taxa;
        }
    }

=back

=head2 SERIALIZERS

=over

=item to_xml()

Serializes matrix to nexml format.

 Type    : Format convertor
 Title   : to_xml
 Usage   : my $data_block = $matrix->to_xml;
 Function: Converts matrix object into a nexml element structure.
 Returns : Nexml block (SCALAR).
 Args    : Optional:
 		   -compact => 1 (for compact representation of matrix)

=cut

    sub to_xml {
        my $self = shift;
        $logger->debug("writing $self to xml");
        my ( %args, $ids_for_states );
        if (@_) {
            %args = @_;
        }

        # creating opening tag
        my $type      = $self->get_type;
        my $verbosity = $args{'-compact'} ? 'Seqs' : 'Cells';
        my $xsi_type  = 'nex:' . ucfirst($type) . $verbosity;
        $self->set_attributes( 'xsi:type' => $xsi_type );
        my $xml = $self->get_xml_tag;
        $logger->debug("created opening tag $xml");

        # normalizing symbol table
        my $normalized = $self->_normalize_symbols;
        $logger->debug("normalized symbols");

        # the format block
        $xml .= '<format>';
        $logger->debug($xml);
        my $to = $self->get_type_object;
        $ids_for_states = $to->get_ids_for_states(1);

        # write state definitions
        # this calls Datatype::to_xml method
        #
        $xml .= $to->to_xml( $normalized, $self->get_polymorphism );
        $logger->debug($xml);
        $xml .= $self->get_characters->to_xml;
        $xml .= "\n</format>";

        # the matrix block
        $xml .= "\n<matrix>";
        my @char_ids =
          map { $_->get_xml_id } @{ $self->get_characters->get_entities };

        # write rows
        my $special = $self->get_type_object->get_ids_for_special_symbols(1);
        for my $row ( @{ $self->get_entities } ) {
            $xml .= "\n"
              . $row->to_xml(
                '-states'  => $ids_for_states,
                '-chars'   => \@char_ids,
                '-symbols' => $normalized,
                '-special' => $special,
                %args,
              );
        }
        $xml .= "\n</matrix>";
        $xml .= "\n" . sprintf( '</%s>', $self->get_tag );
        return $xml;
    }

    # what this does:
    # numerical states are their own keys; also want numerical
    # representations for non-numerical states..."normalization"
    # provides this mapping....
    sub _normalize_symbols {
        my $self = shift;
        $logger->debug("normalizing symbols");
        if ( $self->get_type =~ /^standard$/i ) {
            my $to     = $self->get_type_object;
            my $lookup = $self->get_lookup;
            my @states = keys %{$lookup};
            if ( my @letters = sort { $a cmp $b } grep { /[a-z]/i } @states ) {
                my @numbers = sort { $a <=> $b } grep { /^\d+$/ } @states;
                my $i       = $numbers[-1];
                my %map     = map { $_ => ++$i } @letters;
                return \%map;
            }
            else {
                return {};
            }
        }
        else {
            return {};
        }
    }

=item to_nexus()

Serializes matrix to nexus format.

 Type    : Format convertor
 Title   : to_nexus
 Usage   : my $data_block = $matrix->to_nexus;
 Function: Converts matrix object into a nexus data block.
 Returns : Nexus data block (SCALAR).
 Args    : The following options are available:

            # if set, writes TITLE & LINK tokens
            '-links' => 1

            # if set, writes block as a "data" block (deprecated, but used by mrbayes),
            # otherwise writes "characters" block (default)
            -data_block => 1

            # if set, writes "RESPECTCASE" token
            -respectcase => 1

            # if set, writes "GAPMODE=(NEWSTATE or MISSING)" token
            -gapmode => 1

            # if set, writes "MSTAXA=(POLYMORPH or UNCERTAIN)" token
            -polymorphism => 1

            # if set, writes character labels
            -charlabels => 1

            # if set, writes state labels
            -statelabels => 1

            # if set, writes mesquite-style charstatelabels
            -charstatelabels => 1

            # by default, names for sequences are derived from $datum->get_name, if
            # 'internal' is specified, uses $datum->get_internal_name, if 'taxon'
            # uses $datum->get_taxon->get_name, if 'taxon_internal' uses
            # $datum->get_taxon->get_internal_name, if $key, uses $datum->get_generic($key)
            -seqnames => one of (internal|taxon|taxon_internal|$key)

=cut

    sub to_nexus {
        my $self = shift;
        $logger->info("writing to nexus: $self");
        my %args   = @_;
        my $nchar  = $self->get_nchar;
        my $string = sprintf "BEGIN %s;\n",
          $args{'-data_block'} ? 'DATA' : 'CHARACTERS';
        $string .=
            "[! Characters block written by "
          . ref($self) . " "
          . $self->VERSION . " on "
          . localtime() . " ]\n";

        # write links
        if ( $args{'-links'} ) {
            $string .= sprintf "\tTITLE %s;\n", $self->get_internal_name;
            $string .= sprintf "\tLINK TAXA=%s;\n",
              $self->get_taxa->get_internal_name
              if $self->get_taxa;
        }

     # dimensions token line - data block defines NTAX, characters block doesn't
        if ( $args{'-data_block'} ) {
            $string .= "\tDIMENSIONS NTAX=" . $self->get_ntax() . ' ';
            $string .= 'NCHAR=' . $nchar . ";\n";
        }
        else {
            $string .= "\tDIMENSIONS NCHAR=" . $nchar . ";\n";
        }

        # format token line
        $string .= "\tFORMAT DATATYPE=" . $self->get_type();
        $string .= ( $self->get_respectcase ? " RESPECTCASE" : "" )
          if $args{'-respectcase'};    # mrbayes no like
        $string .= " MATCHCHAR=" . $self->get_matchchar if $self->get_matchchar;
        $string .= " MISSING=" . $self->get_missing();
        $string .= " GAP=" . $self->get_gap()           if $self->get_gap();
        $string .= ";\n";

        # options token line (mrbayes no like)
        if ( $args{'-gapmode'} or $args{'-polymorphism'} ) {
            $string .= "\tOPTIONS ";
            $string .=
              "GAPMODE=" . ( $self->get_gapmode ? "NEWSTATE " : "MISSING " )
              if $args{'-gapmode'};
            $string .=
              "MSTAXA="
              . ( $self->get_polymorphism ? "POLYMORPH " : "UNCERTAIN " )
              if $args{'-polymorphism'};
            $string .= ";\n";
        }

        # charlabels token line
        if ( $args{'-charlabels'} ) {
            my $charlabels;
            if ( my @labels = @{ $self->get_charlabels } ) {
                my $i = 1;
                for my $label (@labels) {
                    $charlabels .=
                      $label =~ /\s/
                      ? "\n\t\t [$i] '$label'"
                      : "\n\t\t [$i] $label";
                    $i++;
                }
                $string .= "\tCHARLABELS$charlabels\n\t;\n";
            }
        }

        # statelabels token line
        if ( $args{'-statelabels'} ) {
            my $statelabels;
            if ( my @labels = @{ $self->get_statelabels } ) {
                my $i = 1;
                for my $labelset (@labels) {
                    $statelabels .= "\n\t\t $i";
                    for my $label ( @{$labelset} ) {
                        $statelabels .=
                          $label =~ /\s/
                          ? "\n\t\t\t'$label'"
                          : "\n\t\t\t$label";
                        $i++;
                    }
                    $statelabels .= ',';
                }
                $string .= "\tSTATELABELS$statelabels\n\t;\n";
            }
        }

        # charstatelabels token line
        if ( $args{'-charstatelabels'} ) {
            my @charlabels  = @{ $self->get_charlabels };
            my @statelabels = @{ $self->get_statelabels };
            if ( @charlabels and @statelabels ) {
                my $charstatelabels;
                my $nlabels = $self->get_nchar - 1;
                for my $i ( 0 .. $nlabels ) {
                    $charstatelabels .= "\n\t\t" . ( $i + 1 );
                    if ( my $label = $charlabels[$i] ) {
                        $charstatelabels .=
                          $label =~ /\s/ ? " '$label' /" : " $label /";
                    }
                    else {
                        $charstatelabels .= " ' ' /";
                    }
                    if ( my $labelset = $statelabels[$i] ) {
                        for my $label ( @{$labelset} ) {
                            $charstatelabels .=
                              $label =~ /\s/ ? " '$label'" : " $label";
                        }
                    }
                    else {
                        $charstatelabels .= " ' '";
                    }
                    $charstatelabels .=  ',' if $i < $nlabels;
                }
                $string .= "\tCHARSTATELABELS$charstatelabels\n\t;\n";
            }
        }

        # ...and write matrix!
        $string .= "\tMATRIX\n";
        my $length = 0;
        foreach my $datum ( @{ $self->get_entities } ) {
            $length = length( $datum->get_nexus_name )
              if length( $datum->get_nexus_name ) > $length;
        }
        $length += 4;
        my $sp = ' ';
        foreach my $datum ( @{ $self->get_entities } ) {
            $string .= "\t\t";

            # construct name
            my $name;
            if ( not $args{'-seqnames'} ) {
                $name = $datum->get_nexus_name;
            }
            elsif ( $args{'-seqnames'} =~ /^internal$/i ) {
                $name = $datum->get_nexus_name;
            }
            elsif ( $args{'-seqnames'} =~ /^taxon/i and $datum->get_taxon ) {
                if ( $args{'-seqnames'} =~ /^taxon_internal$/i ) {
                    $name = $datum->get_taxon->get_nexus_name;
                }
                elsif ( $args{'-seqnames'} =~ /^taxon$/i ) {
                    $name = $datum->get_taxon->get_nexus_name;
                }
            }
            else {
                $name = $datum->get_generic( $args{'-seqnames'} );
            }
            $name = $datum->get_nexus_name if not $name;
            $string .= $name . ( $sp x ( $length - length($name) ) );
            my $char = $datum->get_char;
            $string .= $char . "\n";
        }
        $string .= "\t;\nEND;\n";
        return $string;
    }

=item to_dom()

Analog to to_xml.

 Type    : Serializer
 Title   : to_dom
 Usage   : $matrix->to_dom
 Function: Generates a DOM subtree from the invocant
           and its contained objects
 Returns : an Element object
 Args    : Optional:
           -compact => 1 : renders characters as sequences,
                           not individual cells

=cut

    sub to_dom {
        my $self = shift;
        my $dom  = $_[0];
        my @args = @_;

        # handle dom factory object...
        if ( looks_like_instance( $dom, 'SCALAR' )
            && $dom->_type == _DOMCREATOR_ )
        {
            splice( @args, 0, 1 );
        }
        else {
            $dom = $Bio::Phylo::NeXML::DOM::DOM;
            unless ($dom) {
                throw 'BadArgs' => 'DOM factory object not provided';
            }
        }
        #### make sure argument handling works here...
        my ( %args, $ids_for_states );
        %args = @args if @args;
        my $type      = $self->get_type;
        my $verbosity = $args{'-compact'} ? 'Seqs' : 'Cells';
        my $xsi_type  = 'nex:' . ucfirst($type) . $verbosity;
        $self->set_attributes( 'xsi:type' => $xsi_type );
        my $elt        = $self->get_dom_elt($dom);
        my $normalized = $self->_normalize_symbols;

        # the format block
        my $format_elt = $dom->create_element( '-tag' => 'format' );
        my $to = $self->get_type_object;
        $ids_for_states = $to->get_ids_for_states(1);

        # write state definitions
        $format_elt->set_child(
            $to->to_dom( $dom, $normalized, $self->get_polymorphism ) );

        # write column definitions
        $format_elt->set_child($_)
          for $self->_package_char_labels( $dom,
            %{$ids_for_states} ? $to->get_xml_id : undef );
        $elt->set_child($format_elt);

        # the matrix block
        my $mx_elt = $dom->create_element( '-tag' => 'matrix' );
        my @char_ids;
        for ( 0 .. $self->get_nchar ) {
            push @char_ids, 'c' . ( $_ + 1 );
        }

        # write rows
        my $special = $self->get_type_object->get_ids_for_special_symbols(1);
        for my $row ( @{ $self->get_entities } ) {

            # $row->to_dom is calling ...::Datum::to_dom...
            $mx_elt->set_child(
                $row->to_dom(
                    $dom,
                    '-states'  => $ids_for_states,
                    '-chars'   => \@char_ids,
                    '-symbols' => $normalized,
                    '-special' => $special,
                    %args,
                )
            );
        }
        $elt->set_child($mx_elt);
        return $elt;
    }

    # returns an array of elements
    sub _package_char_labels {
        my ( $self, $dom, $states_id ) = @_;
        my @elts;
        my $labels = $self->get_charlabels;
        for my $i ( 1 .. $self->get_nchar ) {
            my $char_id = 'c' . $i;
            my $label   = $labels->[ $i - 1 ];
            my $elt     = $dom->create_element( '-tag' => 'char' );
            $elt->set_attributes( 'id'     => $char_id );
            $elt->set_attributes( 'label'  => $label ) if $label;
            $elt->set_attributes( 'states' => $states_id ) if $states_id;
            push @elts, $elt;
        }
        return @elts;
    }
    sub _tag       { 'characters' }
    sub _type      { $CONSTANT_TYPE }
    sub _container { $CONSTANT_CONTAINER }

    sub _update_characters {
	my $self        = shift;
	my $nchar       = $self->get_nchar;
	my $characters  = $self->get_characters;
	my $type_object = $self->get_type_object;
	my @chars       = @{ $characters->get_entities };
	my @defined     = grep { defined $_ } @chars;
	if ( scalar @defined != $nchar ) {
	    for my $i ( 0 .. ( $nchar - 1 ) ) {
		if ( not $chars[$i] ) {
		    $characters->insert_at_index(
			$factory->create_character(
			    '-type_object' => $type_object
			),
			$i,
		    );
		}
	    }
	}
	@chars = @{ $characters->get_entities };
	if ( scalar(@chars) > $nchar ) {
	    $characters->prune_entities( [ $nchar .. $#chars ] );
	}
    }

=back

=cut

    # podinherit_insert_token

=head1 SEE ALSO

There is a mailing list at L<https://groups.google.com/forum/#!forum/bio-phylo>
for any user or developer questions and discussions.

=over

=item L<Bio::Phylo::Taxa::TaxaLinker>

This object inherits from L<Bio::Phylo::Taxa::TaxaLinker>, so the
methods defined therein are also applicable to L<Bio::Phylo::Matrices::Matrix>
objects.

=item L<Bio::Phylo::Matrices::TypeSafeData>

This object inherits from L<Bio::Phylo::Matrices::TypeSafeData>, so the
methods defined therein are also applicable to L<Bio::Phylo::Matrices::Matrix>
objects.

=item L<Bio::Phylo::Manual>

Also see the manual: L<Bio::Phylo::Manual> and L<http://rutgervos.blogspot.com>.

=back

=head1 CITATION

If you use Bio::Phylo in published research, please cite it:

B<Rutger A Vos>, B<Jason Caravas>, B<Klaas Hartmann>, B<Mark A Jensen>
and B<Chase Miller>, 2011. Bio::Phylo - phyloinformatic analysis using Perl.
I<BMC Bioinformatics> B<12>:63.
L<http://dx.doi.org/10.1186/1471-2105-12-63>

=cut

}
1;
__DATA__

my %CONSERVATION_GROUPS = (
        'strong' => [ qw(
		STA
		NEQK
		NHQK
		NDEQ
		QHRK
		MILV
		MILF
		HY
		FYW
	)],
	'weak' => [ qw(
                CSA
		ATV
		SAG
		STNK
		STPA
		SGND
		SNDEQK
		NDEQHK
		NEQHRK
		FVLIM
		HFY
	)],
);

sub description {
	my ( $self, $desc ) = @_;
	if ( defined $desc ) {
		$self->set_desc( $desc );
	}
	return $self->get_desc;
}

sub num_sequences {
	my ( $self, $num ) = @_;
	# setter?
	return scalar @{ $self->get_entities };
}

sub datatype {
	my ( $self, $type ) = @_;
	# setter?
	return uc $self->get_type;
}

sub score {
	my ( $self, $score ) = @_;
	if ( defined $score ) {
		$self->set_score( $score );
	}
	return $self->get_score;
}

sub add_seq {
    my ( $self, $seq, $order ) = @_;
    $self->insert( $seq );
}

sub remove_seq {
    my ( $self, $seq ) = @_;
    $self->delete( $seq );
}

sub purge {
 $logger->warn
}

sub sort_alphabetically {
    my $self = shift;
    my @sorted = map  { $_->[0] }
                 sort { $a->[1] cmp $b->[1] }
                 map  { [ $_, $_->get_name ] }
                 @{ $self->get_entities };
    $self->clear;
    $self->insert(@sorted);
    return @sorted;
}

sub each_seq {
    my $self = shift;
    return @{ $self->get_entities };
}

sub each_alphabetically {
    my $self = shift;
    return map  { $_->[0] }
           sort { $a->[1] cmp $b->[1] }
           map  { [ $_, $_->get_name ] } @{ $self->get_entities };
}

sub each_seq_with_id {
    my ( $self, $name ) = @_;
    return @{
        $self->get_by_regular_expression(
            '-value' => 'get_name',
            '-match' => qr/^\Q$name\E$/
        )
    }
}

sub get_seq_by_pos {
    my ( $self, $pos ) = @_;
    return $self->get_by_index( $pos - 1 );
}

sub select {
    my ( $self, $start, $end ) = @_;
    my $clone = $self->clone;
    my @contents = @{ $clone->get_entities };
    my @deleteme;
    for my $i ( 0 .. $#contents ) {
        if ( $i < $start - 1 or $i > $end - 1 ) {
            push @deleteme, $contents[$i];
        }
    }
    $clone->delete( $_ ) for @deleteme;
    return $clone;
}

sub select_noncont {
    my ( $self, @indices ) = @_;
    my $clone = $self->clone;
    my @contents = @{ $clone->get_entities };
    my ( @deleteme, %keep );
    %keep = map { ( $_ - 1 ) => 1 } @indices;
    for my $i ( 0 .. $#contents ) {
        if ( not exists $keep{$i} ) {
            push @deleteme, $contents[$i];
        }
    }
    $clone->delete( $_ ) for @deleteme;
    return $clone;
}

sub slice {
    my ( $self, $start, $end, $include_gapped ) = @_;
    my $clone = $self->clone;
    my $gap = $self->get_gap;
    SEQ: for my $seq ( @{ $clone->get_entities } ) {
        my @char = $self->get_char;
        my @slice = splice @char, ( $start - 1 ), ( $end - $start - 1 );
        if ( not $include_gapped ) {
            if ( not grep { $_ !~ /^\Q$gap\E$/ } @slice ) {
                next SEQ;
            }
        }
        $seq->set_char(@slice);
    }
}

sub map_chars {
    my ( $self, $from, $to ) = @_;
    for my $seq ( @{ $self->get_entities } ) {
        my @char = $seq->get_char;
        for my $c ( @char ) {
            $c =~ s/$from/$to/;
        }
        $seq->set_char( @char );
    }
}

sub uppercase {
    my $self = shift;
    for my $seq ( @{ $self->get_entities } ) {
        my @char = $seq->get_char;
        my @uc = map { uc $_ } @char;
        $seq->set_char(@uc);
    }
}

# from simplealign
sub match_line {
	my ($self,$matchlinechar, $strong, $weak) = @_;
	my %matchchars = ('match'    => $matchlinechar || '*',
							  'weak'     => $weak          || '.',
							  'strong'   => $strong        || ':',
							  'mismatch' => ' ',
						  );

	my @seqchars;
	my $alphabet;
	foreach my $seq ( $self->each_seq ) {
		push @seqchars, [ split(//, uc ($seq->seq)) ];
		$alphabet = $seq->alphabet unless defined $alphabet;
	}
	my $refseq = shift @seqchars;
	# let's just march down the columns
	my $matchline;
 POS:
	foreach my $pos ( 0..$self->length ) {
		my $refchar = $refseq->[$pos];
		my $char = $matchchars{'mismatch'};
		unless( defined $refchar ) {
			last if $pos == $self->length; # short circuit on last residue
			# this in place to handle jason's soon-to-be-committed
			# intron mapping code
			goto bottom;
		}
		my %col = ($refchar => 1);
		my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
		foreach my $seq ( @seqchars ) {
			next if $pos >= scalar @$seq;
			$dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
							  $seq->[$pos] eq ' ' );
			$col{$seq->[$pos]}++ if defined $seq->[$pos];
		}
		my @colresidues = sort keys %col;

		# if all the values are the same
		if( $dash ) { $char =  $matchchars{'mismatch'} }
		elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
		elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
			# matches for protein seqs
	    TYPE:
			foreach my $type ( qw(strong weak) ) {
				# iterate through categories
				my %groups;
				# iterate through each of the aa in the col
				# look to see which groups it is in
				foreach my $c ( @colresidues ) {
					foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
						push @{$groups{$f}},$c;
					}
				}
			 GRP:
				foreach my $cols ( values %groups ) {
					@$cols = sort @$cols;
					# now we are just testing to see if two arrays
					# are identical w/o changing either one
					# have to be same len
					next if( scalar @$cols != scalar @colresidues );
					# walk down the length and check each slot
					for($_=0;$_ < (scalar @$cols);$_++ ) {
						next GRP if( $cols->[$_] ne $colresidues[$_] );
					}
					$char = $matchchars{$type};
					last TYPE;
				}
			}
		}
	 bottom:
		$matchline .= $char;
	}
	return $matchline;
}

sub match {
    my ( $self, $match ) = @_;
    if ( defined $match ) {
        $self->set_matchchar($match);
    }
    else {
        $self->set_matchchar('.');
    }
    $match = $self->get_matchchar;
    my $lookup = $self->get_type_object->get_lookup->{$match} = [ $match ];
    my @seqs = @{ $self->get_entities };
    my @firstseq = $seqs[0]->get_char;
    for my $i ( 1 .. $#seqs ) {
        my @char = $seqs[$i]->get_char;
        for my $j ( 0 .. $#char ) {
            if ( $char[$j] eq $firstseq[$j] ) {
                $char[$j] = $match;
            }
        }
        $seqs[$i]->set_char(@char);
    }
    1;
}

sub unmatch {
    my ( $self, $match ) = @_;
    if ( defined $match ) {
        $self->set_matchchar($match);
    }
    else {
        $self->set_matchchar('.');
    }
    $match = $self->get_matchchar;
    my @seqs = @{ $self->get_entities };
    my @firstseq = $seqs[0]->get_char;
    for my $i ( 1 .. $#seqs ) {
        my @char = $seqs[$i]->get_char;
        for my $j ( 0 .. $#char ) {
            if ( $char[$j] eq $match ) {
                $char[$j] = $firstseq[$j];
            }
        }
        $seqs[$i]->set_char(@char);
    }
    1;
}

sub id {
    my ( $self, $name ) = @_;
    if ( defined $name ) {
        $self->set_name( $name );
    }
    return $self->get_name;
}

sub missing_char {
    my ( $self, $missing ) = @_;
    if ( defined $missing ) {
        $self->set_missing( $missing );
    }
    return $self->get_missing;
}

sub match_char {
    my ( $self, $match ) = @_;
    if ( defined $match ) {
        $self->set_matchchar( $match );
    }
    return $self->get_matchchar;
}

sub gap_char {
    my ( $self, $gap ) = @_;
    if ( defined $gap ) {
        $self->set_gap( $gap );
    }
    return $self->get_gap;
}

sub symbol_chars {
    my ( $self, $includeextra ) = @_;
	my %seen;
	for my $row ( @{ $self->get_entities } ) {
		my @char = $row->get_char;
		$seen{$_} = 1 for @char;
	}
    return keys %seen if $includeextra;
    my $special_values = $self->get_special_symbols;
    my %special_keys   = map { $_ => 1 } values %{ $special_values };
    return grep { ! $special_keys{$_} } keys %seen;
}

sub consensus_string {
	my $self = shift;
	my $to = $self->get_type_object;
	my $ntax = $self->get_ntax;
	my $nchar = $self->get_nchar;
	my @consensus;
	for my $i ( 0 .. $ntax - 1 ) {
		my ( @column, %column );
		for my $j ( 0 .. $nchar - 1 ) {
			$column{ $self->get_by_index($i)->get_by_index($j) } = 1;
		}
		@column = keys %column;
		push @consensus, $to->get_symbol_for_states(@column);
	}
	return join '', @consensus;
}

sub consensus_iupac {
 $logger->warn
}

sub is_flush { 1 }

sub length { shift->get_nchar }

sub maxname_length { $logger->warn }

sub no_residues { $logger->warn }

sub no_sequences {
    my $self = shift;
    return scalar @{ $self->get_entities };
}

sub percentage_identity { $logger->warn }

# from simplealign
sub average_percentage_identity{
   my ($self,@args) = @_;

   my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
                   'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');

   my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);

   if (! $self->is_flush()) {
       throw 'Generic' => "All sequences in the alignment must be the same length";
   }

   @seqs = $self->each_seq();
   $len = $self->length();

   # load the each hash with correct keys for existence checks

   for( my $index=0; $index < $len; $index++) {
       foreach my $letter (@alphabet) {
       		$countHashes[$index] = {} if not $countHashes[$index];
	   $countHashes[$index]->{$letter} = 0;
       }
   }
   foreach my $seq (@seqs)  {
       my @seqChars = split //, $seq->seq();
       for( my $column=0; $column < @seqChars; $column++ ) {
	   my $char = uc($seqChars[$column]);
	   if (exists $countHashes[$column]->{$char}) {
	       $countHashes[$column]->{$char}++;
	   }
       }
   }

   $total = 0;
   $divisor = 0;
   for(my $column =0; $column < $len; $column++) {
       my %hash = %{$countHashes[$column]};
       $subdivisor = 0;
       foreach my $res (keys %hash) {
	   $total += $hash{$res}*($hash{$res} - 1);
	   $subdivisor += $hash{$res};
       }
       $divisor += $subdivisor * ($subdivisor - 1);
   }
   return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
}

# from simplealign
sub overall_percentage_identity{
   my ($self, $length_measure) = @_;

   my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
                   'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');

   my ($len, $total, @seqs, @countHashes);

   my %enum = map {$_ => 1} qw (align short long);

   throw 'Generic' => "Unknown argument [$length_measure]"
       if $length_measure and not $enum{$length_measure};
   $length_measure ||= 'align';

   if (! $self->is_flush()) {
       throw 'Generic' => "All sequences in the alignment must be the same length";
   }

   @seqs = $self->each_seq();
   $len = $self->length();

   # load the each hash with correct keys for existence checks
   for( my $index=0; $index < $len; $index++) {
       foreach my $letter (@alphabet) {
       		$countHashes[$index] = {} if not $countHashes[$index];
	   $countHashes[$index]->{$letter} = 0;
       }
   }
   foreach my $seq (@seqs)  {
       my @seqChars = split //, $seq->seq();
       for( my $column=0; $column < @seqChars; $column++ ) {
	   my $char = uc($seqChars[$column]);
	   if (exists $countHashes[$column]->{$char}) {
	       $countHashes[$column]->{$char}++;
	   }
       }
   }

   $total = 0;
   for(my $column =0; $column < $len; $column++) {
       my %hash = %{$countHashes[$column]};
       foreach ( values %hash ) {
	   next if( $_ == 0 );
	   $total++ if( $_ == scalar @seqs );
	   last;
       }
   }

   if ($length_measure eq 'short') {
       ## find the shortest length
       $len = 0;
       foreach my $seq ($self->each_seq) {
           my $count = $seq->seq =~ tr/[A-Za-z]//;
           if ($len) {
               $len = $count if $count < $len;
           } else {
               $len = $count;
           }
       }
   }
   elsif ($length_measure eq 'long') {
       ## find the longest length
       $len = 0;
       foreach my $seq ($self->each_seq) {
           my $count = $seq->seq =~ tr/[A-Za-z]//;
           if ($len) {
               $len = $count if $count > $len;
           } else {
               $len = $count;
           }
       }
   }

   return ($total / $len ) * 100.0;
}

sub column_from_residue_number {
    my ( $self, $seqname, $resnumber ) = @_;
    my $col;
    if ( my $seq = $self->get_by_name($seqname) ) {
        my $gap  = $seq->get_gap;
        my @char = $seq->get_char;
        for my $i ( 0 .. $#char ) {
            $col++ if $char[$i] ne $gap;
            if ( $col + 1 == $resnumber ) {
                return $i + 1;
            }
        }
    }
}

sub displayname {
    my ( $self, $name, $disname ) = @_;
	my $seq;
	$self->visit( sub{ $seq = $_[0] if $_[0]->get_nse eq $name } );
    $self->throw("No sequence with name [$name]") unless $seq;
	my $disnames = $self->get_generic( 'displaynames' ) || {};
    if ( $disname and  $name ) {
    	$disnames->{$name} = $disname;
		return $disname;
    }
    elsif( defined $disnames->{$name} ) {
		return  $disnames->{$name};
    }
    else {
		return $name;
    }
}

# from SimpleAlign
sub maxdisplayname_length {
    my $self = shift;
    my $maxname = (-1);
    my ($seq,$len);
    foreach $seq ( $self->each_seq() ) {
		$len = CORE::length $self->displayname($seq->get_nse());
		if( $len > $maxname ) {
		    $maxname = $len;
		}
    }
    return $maxname;
}

# from SimpleAlign
sub set_displayname_flat {
    my $self = shift;
    my ($nse,$seq);

    foreach $seq ( $self->each_seq() ) {
	$nse = $seq->get_nse();
	$self->displayname($nse,$seq->id());
    }
    return 1;
}

sub set_displayname_count { $logger->warn }

sub set_displayname_normal { $logger->warn }

sub accession {
	my ( $self, $acc ) = @_;
	if ( defined $acc ) {
		$self->set_generic( 'accession' => $acc );
	}
	return $self->get_generic( 'accession' );
}

sub source {
	my ( $self, $source ) = @_;
	if ( defined $source ) {
		$self->set_generic( 'source' => $source );
	}
	return $self->get_generic( 'source' );
}

sub annotation {
	my ( $self, $anno ) = @_;
	if ( defined $anno ) {
		$self->set_generic( 'annotation' => $anno );
	}
	return $self->get_generic( 'annotation' );
}

sub consensus_meta {
	my ( $self, $meta ) = @_;
	if ( defined $meta ) {
		$self->set_generic( 'consensus_meta' => $meta );
	}
	return $self->get_generic( 'consensus_meta' );
}

# XXX this might be removed, and instead inherit from SimpleAlign
sub max_metaname_length {
    my $self = shift;
    my $maxname = (-1);
    my ($seq,$len);

    # check seq meta first
    for $seq ( $self->each_seq() ) {
        next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
        for my $mtag ($seq->meta_names) {
            $len = CORE::length $mtag;
            if( $len > $maxname ) {
                $maxname = $len;
            }
        }
    }

    # alignment meta
    for my $meta ($self->consensus_meta) {
        next unless $meta;
        for my $name ($meta->meta_names) {
            $len = CORE::length $name;
            if( $len > $maxname ) {
                $maxname = $len;
            }
        }
    }

    return $maxname;
}


sub get_SeqFeatures { return }




Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.