Group
Extension

Bio-EnsEMBL/lib/Bio/EnsEMBL/DBSQL/FastaSequenceAdaptor.pm

=head1 LICENSE

Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2024] EMBL-European Bioinformatics Institute

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut

=head1 NAME

Bio::EnsEMBL::DBSQL::FastaSequenceAdaptor

=head1 DESCRIPTION

This sequence adaptor extends the BaseSequenceAdaptor code to provide an 
implementation of the .fai index lookup as defined by samtools. The code uses
this indexing system to access portions of sequence and translates Slice requests
into sensible locations for our FASTA query layer.

The adaptor must be initalised with access to a Faidx compatible object and the FASTA
file backing must use the same seq_region_name as the querying slices otherwise
we cannot return the required data.

=cut

package Bio::EnsEMBL::DBSQL::FastaSequenceAdaptor;
$Bio::EnsEMBL::DBSQL::FastaSequenceAdaptor::VERSION = '113.0.0';
use strict;
use warnings;

use base qw/Bio::EnsEMBL::DBSQL::BaseSequenceAdaptor/;

use Bio::EnsEMBL::Utils::Exception qw/throw/;
use Bio::EnsEMBL::Utils::Scalar qw/assert_ref/;
use Bio::EnsEMBL::Utils::Sequence qw/reverse_comp/;
use English qw/-no_match_vars/;
require bytes;

=head2 new
  
  Arg [1]     : FileFaidx; $faindex. A FileFaidx object or a compatible version
  Arg [2]     : Integer; $chunk_power. Size of the region to cache
  Arg [3]     : Integer; $cache_size. Number of regions to cache
  Description : Builds an instance of the FastaSequenceAdaptor

=cut

sub new {
  my ($class, $faindex, $chunk_power, $cache_size) = @_;
  my $self = $class->SUPER::new($chunk_power, $cache_size);
  $self->faindex($faindex);
  return $self;
}

=head2 fetch_by_Slice_start_end_strand
  
  Arg [1]     : Bio::EnsEMBL::Slice; $slice. Slice to fetch sequence for
  Arg [2]     : Integer; $start. Start of region to retrieve relative to the Slice (defaults to 1)
  Arg [3]     : Integer; $end. End of region to retreive relative to the Slice (defaults to length)
  Arg [4]     : Integer; $strand. Strand to fetch (defaults to 1)
  Description : Fetches sequence for the given slice. Unlike the normal SequenceAdaptor we assume
                Sequence is held in a FASTA file under the Slice's seq_region_name.
  Exception   : Thrown if we are given a circular slice
=cut

sub fetch_by_Slice_start_end_strand {
  my ( $self, $slice, $start, $end, $strand ) = @_;
  
  # Input checks
  assert_ref($slice, 'Bio::EnsEMBL::Slice', 'slice');
  if(defined $end && $start > $end && $slice->is_circular()) {
    throw "Currently we do not support circular requests";
  }
  
  #Get a new slice that spans the exact region to retrieve dna from.
  #Then constrain to seq region if it's gone negative or over the end  
  $slice = $self->expand_Slice($slice, $start, $end, $strand);
  $slice = $slice->constrain_to_seq_region();
  
  # This call is likely to barf if we try to query using a chr name
  # we do not understand. Use can_access_Slice() to make sure
  my $seq_ref = $self->_fetch_seq($slice->seq_region_name(), $slice->start(), $slice->length());
  reverse_comp($seq_ref) if $strand == -1;
  return $seq_ref;
}

=head2 faindex

  Description : Holds a reference to the Faindex object to use for sequence access

=cut

sub faindex {
  my ($self, $faindex) = @_;
  if(defined $faindex) {
    assert_ref($faindex, 'Bio::EnsEMBL::Utils::IO::FileFaidx', 'faidx');
    $self->{faindex} = $faindex;
  }
  return $self->{faindex};
}

=head2 can_access_Slice

  Description : Checks the lookup to see if we have access to the Slice given (using 
                seq region name as the ID). We reject any Circular Slice

=cut

sub can_access_Slice {
  my ($self, $slice) = @_;
  return 0 if $slice->is_circular();
  return $self->faindex()->can_access_id($slice->seq_region_name());
}

=head2 store
  
  Description : Unsupported operation. Please use a FASTA serialiser

=cut

sub store {
  throw "Unsupported operation. Cannot store sequence in a fasta file";
}

=head _fetch_raw_seq

  Description : Provides access to the underlying faindex object and returns a sequence scalar ref

=cut

sub _fetch_raw_seq {
  my ($self, $id, $start, $length) = @_;
  my $seq_ref = $self->faindex()->fetch_seq($id, $start, $length);
  return $seq_ref;
}

1;


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.