Group
Extension

Bio-SeqAlignment-Applications-SequencingSimulators-RNASeq-Polyester/bin/polyester_polyA.pl

#!/home/chrisarg/perl5/perlbrew/perls/current/bin/perl
use v5.36;
use strict;
use warnings;

use Bio::SeqAlignment::Components::Sundry::DocumentSequenceModifications
  qw(store_modifications retrieve_modifications);
use Bio::SeqAlignment::Components::Sundry::IOHelpers
  qw(read_fastx_sequences write_fastx_sequences split_fastx_sequences);
use Bio::SeqAlignment::Components::Sundry::Tailing qw(add_polyA);
use Data::MessagePack;
use File::Basename;
use File::Spec;
use FindBin qw($Bin);
use Getopt::Long qw(:config no_ignore_case);
use JSON;
use YAML::Tiny;

###############################################################################
## system constants
use constant DEFAULT_READS      => 300;
use constant DEFAULT_READS_FILE => 'reads.txt';
use constant POLYESTER_SEED     => 45;

###############################################################################

# Define the variables for the options with default NULL values for optional arguments
my $bias;
my @distparams;
my $errormodel;
my $errorrate;
my $fastafile;
my $fcfile;
my $gcbias;
my $max_sequences_per_file = 0;         ## default is to not split the files
my $modformat              = 'YAML';    ## default format is 'YAML
my @numreps;
my $outdir;
my $paired;
my $readsfile;
my $readlen;
my $fraglen;
my $fragsd;
my $seed;
my $strandspec;
my $taildist;
my $writeinfo;

# Configure Getopt::Long
GetOptions(
    'bias|b:s'           => \$bias, # fragment selection bias (optional, string)
    'distparams|P=f{1,}' => \@distparams
    ,    # distribution parameters (mandatory, list of numeric values)
    'errormodel|e:s' => \$errormodel,   # error model (optional, string)
    'errorrate|E:f'  => \$errorrate,    # error probability (optional, float)
    'fastafile|f=s'  => \$fastafile,    # fasta file (path) (mandatory, strings)
    'fcfile|c:s'     => \$fcfile,       # fold change (path) (optional, string)
    'fraglen|F:i'    => \$fraglen,   # fragment length (avg) (optional, integer)
    'fragsd|S:i'     => \$fragsd,    # fragment length (sd) (optional, integer)
    'gcbias|g:i'     => \$gcbias,    # gc bias (optional, integer)
    'maxseqs|m:i'    => \$max_sequences_per_file,    # max sequences per file
    'modformat|M:s'  => \$modformat
    , # case insensitive format for storing modifications (one of JSON, YAML, or MessagePack)
    'numreps|n:i{,}' =>
      \@numreps,    # num of replicates in each group (optional, list)
    'outdir|o:s' => \$outdir,  # path to output directory (optional, string)
    'paired|p:s' => \$paired,  # paired reads (TRUE or FALSE) (optional, string)
    'readlen|R:i'   => \$readlen,    # read length (optional, integer)
    'readsfile|r:s' =>
      \$readsfile,    # reads_per_transcript (path) (optional, string)
    'seed|d:i'       => \$seed,    # random seed (optional, integer)
    'strandspec|s:s' =>
      \$strandspec,    # strand specificity (TRUE or FALSE) (optional, string)
    'taildist|t=s'  => \$taildist,    # tail distribution (mandatory, string)
    'writeinfo|w:i' =>
      \$writeinfo    # save simulation info? (optional, integer/boolean)
) or die "Error in command line arguments\n";

###############################################################################
# Boring list of things to do

# Check for mandatory arguments
die "Error: --fastafile is required.\n" unless $fastafile;

## Check that the fastafile exists
die "Error: $fastafile does not exist.\n" unless -e $fastafile;

$seed = POLYESTER_SEED unless defined $seed;

## create outdir if provided, but set it to dirname of fastafile if not
unless ( defined $outdir ) {
    $outdir = dirname($fastafile);
}
else {
    mkdir $outdir unless -d $outdir;
}

## set readsfile to constant expression if not provided
my $has_readfile = 1;
$has_readfile = 0 unless ( defined $readsfile );

###############################################################################
## Pipeline logic : add polyA tails to source files, then simulate reads with R
## Idea is to flesh out the pipeline logic in the script, then refactor through
## a pipeline module

## Pipeline Segment 1 : read sequences from the fasta file
my $bioseq_objects = read_fastx_sequences($fastafile);

## Pipeline Segment 2 : set up various defauls if not provided
default_read_counts( $outdir, DEFAULT_READS_FILE, $#$bioseq_objects + 1 )
  unless $has_readfile;

## Pipeline Segment 3 : adds polyA tails to the sequences & record what was done
my $modifications_HoH =
  add_polyA( $bioseq_objects, $taildist, $seed, @distparams );

## Pipeline Segment 4 : write the modified sequences to a new fasta file
$fastafile = basename($fastafile);
$fastafile =~ s/(.+)\.fasta$/$1_tail.fasta/;
$fastafile = File::Spec->catfile( $outdir, $fastafile );
write_fastx_sequences( $fastafile, $bioseq_objects );

## Pipeline Segment 5 : simulate reads with R
polyester_run(
    bias       => $bias,
    errormodel => $errormodel,
    errorrate  => $errorrate,
    fastafile  => $fastafile,
    fcfile     => $fcfile,
    gcbias     => $gcbias,
    numreps    => \@numreps,
    outdir     => $outdir,
    paired     => $paired,
    readsfile  => $readsfile,
    readlen    => $readlen,
    fraglen    => $fraglen,
    fragsd     => $fragsd,
    seed       => $seed,
    strandspec => $strandspec,
    writeinfo  => $writeinfo
);

## Pipeline Segment 6 : store the modifications into a file for future use
my $mod_fname = store_modifications(
    mods        => $modifications_HoH,
    bioseq_file => $fastafile,
    format      => $modformat
);

## Pipeline Segment 7 : split files into smaller files
if ($max_sequences_per_file) {
    my @simulated_read_files =
      glob( File::Spec->catfile( $outdir, 'sample_*.fasta' ) );
    split_fastx_sequences( \@simulated_read_files, $max_sequences_per_file );
    say "done splitting files";
}

## Pipeline Segment 8 : cleanup
unlink $readsfile unless $has_readfile;
unlink $fastafile;

###############################################################################
# Subroutines

sub default_read_counts {
    my ( $outdir, $reads_fname, $num_of_seqs, $seq_count ) = @_;
    $seq_count = $seq_count // DEFAULT_READS;
    $readsfile = File::Spec->catfile( $outdir, $reads_fname );
    open my $reads_fh, '>', $readsfile
      or die "Error: unable to open $readsfile for writing\n";
    for ( 1 .. $num_of_seqs ) {
        say {$reads_fh} $seq_count;
    }
    close $reads_fh;
}

sub polyester_run {    # simulate reads with R
    my (%polyester_params) = @_;
    my $bias               = $polyester_params{bias};
    my $errormodel         = $polyester_params{errormodel};
    my $errorrate          = $polyester_params{errorrate};
    my $fastafile          = $polyester_params{fastafile};
    my $fcfile             = $polyester_params{fcfile};
    my $gcbias             = $polyester_params{gcbias};
    my $numreps            = $polyester_params{numreps};
    my $outdir             = $polyester_params{outdir};
    my $paired             = $polyester_params{paired};
    my $readsfile          = $polyester_params{readsfile};
    my $readlen            = $polyester_params{readlen};
    my $fraglen            = $polyester_params{fraglen};
    my $fragsd             = $polyester_params{fragsd};
    my $seed               = $polyester_params{seed};
    my $strandspec         = $polyester_params{strandspec};
    my $writeinfo          = $polyester_params{writeinfo};

    my $r_command =
      "Rscript --vanilla --slave --default-packages=getopt,polyester,utils "
      . File::Spec->catfile( $Bin, "polyester.R" );

## Add the options to the R command
    $r_command .= " --bias \"$bias\""             if defined $bias;
    $r_command .= " --errormodel \"$errormodel\"" if defined $errormodel;
    $r_command .= " --errorrate $errorrate"       if defined $errorrate;
    $r_command .= " --fastafile \"$fastafile\"";
    $r_command .= " --fcfile \"$fcfile\""      if defined $fcfile;
    $r_command .= " --gcbias $gcbias"          if defined $gcbias;
    $r_command .= " --numreps \"@{$numreps}\"" if $numreps;
    $r_command .= " --outdir \"$outdir\""      if defined $outdir;
    $r_command .= " --paired $paired"          if defined $paired;
    $r_command .= " --readsfile \"$readsfile\"";
    $r_command .= " --readlen $readlen"       if defined $readlen;
    $r_command .= " --fraglen $fraglen"       if defined $fraglen;
    $r_command .= " --fragsd $fragsd"         if defined $fragsd;
    $r_command .= " --seed $seed"             if defined $seed;
    $r_command .= " --strandspec $strandspec" if defined $strandspec;
    $r_command .= " --writeinfo $writeinfo"   if defined $writeinfo;

## Execute the R command
    system($r_command);
}

__END__


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.