Group
Extension

App-Dazz/lib/App/Dazz/Command/cover.pm

package App::Dazz::Command::cover;
use strict;
use warnings;
use autodie;

use App::Dazz -command;
use App::Dazz::Common;

use constant abstract => "trusted regions in the first file covered by the second";

sub opt_spec {
    return (
        [ "outfile|o=s", "output filename, [stdout] for screen", ],
        [ "range|r=s",    "ranges of first sequences",    { required => 1 }, ],
        [ 'coverage|c=i', 'minimal coverage',             { default  => 2 }, ],
        [ 'max|m=i',      'maximal coverage',             { default  => 200 }, ],
        [ "len|l=i",      "minimal length of overlaps",   { default  => 1000 }, ],
        [ "idt|i=f",      "minimal identity of overlaps", { default  => 0.85 }, ],
        [ "tmp=s",        "user defined tempdir", ],
        [ "verbose|v",    "verbose mode", ],
        { show_defaults => 1, }
    );
}

sub usage_desc {
    return "dazz cover [options] <.ovlp.tsv>";
}

sub description {
    my $desc;
    $desc .= ucfirst(abstract) . ".\n";
    $desc .= "\tAll operations are running in a tempdir and no intermediate files are kept.\n";
    return $desc;
}

sub validate_args {
    my ( $self, $opt, $args ) = @_;

    if ( @{$args} != 1 ) {
        my $message = "This command need one input file.\n\tIt found";
        $message .= sprintf " [%s]", $_ for @{$args};
        $message .= ".\n";
        $self->usage_error($message);
    }
    for ( @{$args} ) {
        if ( !Path::Tiny::path($_)->is_file ) {
            $self->usage_error("The input file [$_] doesn't exist.");
        }
    }

    if ( !exists $opt->{outfile} ) {
        $opt->{outfile} = Path::Tiny::path( $args->[0] )->absolute . ".cover.json";
    }
}

sub execute {
    my ( $self, $opt, $args ) = @_;

    # make paths absolute before we chdir
    my $infile = Path::Tiny::path( $args->[0] )->absolute->stringify;

    if ( lc $opt->{outfile} ne "stdout" ) {
        $opt->{outfile} = Path::Tiny::path( $opt->{outfile} )->absolute->stringify;
    }

    # record cwd, we'll return there
    my $cwd = Path::Tiny->cwd;
    my $tempdir;
    if ( $opt->{tmp} ) {
        $tempdir = Path::Tiny->tempdir(
            TEMPLATE => "dazz_cover_XXXXXXXX",
            DIR      => $opt->{tmp},
        );
    }
    else {
        $tempdir = Path::Tiny->tempdir("dazz_cover_XXXXXXXX");
    }
    chdir $tempdir;

    my $basename = $tempdir->basename();
    $basename =~ s/\W+/_/g;

    #@type AlignDB::IntSpan
    my $first_range = AlignDB::IntSpan->new->add_runlist( $opt->{range} );

    {
        # paf to meancov
        my $cmd;
        $cmd .= "ovlpr covered";
        $cmd .= " $infile";
        $cmd .= " --coverage $opt->{max}";
        $cmd .= " --mean";
        $cmd .= " --len $opt->{len} --idt $opt->{idt}";
        $cmd .= " -o $basename.meancov.txt";
        App::Dazz::Common::exec_cmd( $cmd, { verbose => $opt->{verbose}, } );

        if ( !$tempdir->child("$basename.meancov.txt")->is_file ) {
            Carp::croak "Failed: create $basename.meancov.txt\n";
        }
    }

    # anchor_id => mean coverage
    my $coverage_of = {};
    my $len_of      = {};
    my $stat        = Statistics::Descriptive::Full->new();
    for my $line ( App::RL::Common::read_lines("$basename.meancov.txt") ) {
        my @parts = split "\t", $line;
        next unless @parts == 3;

        my $seq_id = $parts[0];
        next unless $first_range->contains($seq_id);

        $len_of->{$seq_id} = $parts[1];

        $coverage_of->{$seq_id} = $parts[2];
        $stat->add_data( $parts[2] );
    }

    my $meta_of = { TRUSTED => $first_range->copy, };
    {
        my $median       = $stat->median();
        my @abs_res      = map { abs( $median - $_ ) } $stat->get_data();
        my $abs_res_stat = Statistics::Descriptive::Full->new();
        $abs_res_stat->add_data(@abs_res);
        my $MAD = $abs_res_stat->median();

        # the scale factor
        my $k = 3;

        my $lower_limit = ( $median - $k * $MAD ) / 2;
        my $upper_limit = ( $median + $k * $MAD ) * 1.5;

        $meta_of->{NON_OVERLAPPED} = AlignDB::IntSpan->new;
        $meta_of->{REPEAT_LIKE}    = AlignDB::IntSpan->new;
        for my $key ( keys %{$coverage_of} ) {
            if ( $coverage_of->{$key} < $lower_limit ) {
                $meta_of->{NON_OVERLAPPED}->add($key);
            }
            if ( $coverage_of->{$key} > $upper_limit ) {
                $meta_of->{REPEAT_LIKE}->add($key);
            }
        }

        $meta_of->{COV_MEDIAN}      = $median;
        $meta_of->{COV_MAD}         = $MAD;
        $meta_of->{COV_LOWER_LIMIT} = $lower_limit;
        $meta_of->{COV_UPPER_LIMIT} = $upper_limit;

        $meta_of->{TRUSTED}->subtract( $meta_of->{NON_OVERLAPPED} );
        $meta_of->{TRUSTED}->subtract( $meta_of->{REPEAT_LIKE} );
    }

    {
        # paf to covered
        my $cmd;
        $cmd .= "ovlpr covered";
        $cmd .= " $infile";
        $cmd .= " --coverage $opt->{coverage}";
        $cmd .= " --len $opt->{len} --idt $opt->{idt}";
        $cmd .= " -o $basename.covered.txt";
        App::Dazz::Common::exec_cmd( $cmd, { verbose => $opt->{verbose}, } );

        if ( !$tempdir->child("$basename.covered.txt")->is_file ) {
            Carp::croak "Failed: create $basename.covered.txt\n";
        }
    }

    # anchor_id => covered region
    my $covered_of = {};
    $meta_of->{PARTIAL_COVERED} = AlignDB::IntSpan->new;
    for my $line ( App::RL::Common::read_lines("$basename.covered.txt") ) {
        my @parts = split ":", $line;
        next unless @parts == 2;

        my $seq_id = $parts[0];
        next unless $first_range->contains($seq_id);

        my $covered = AlignDB::IntSpan->new()->add_runlist( $parts[1] );

        if ( $covered->size < $len_of->{$seq_id} ) {
            $meta_of->{TRUSTED}->remove($seq_id);
            $meta_of->{PARTIAL_COVERED}->add($seq_id);
            $covered_of->{$seq_id} = $covered->runlist;
        }
    }

    #    {
    #        # Create covered.fasta
    #        $tempdir->child("covered.fasta")->remove;
    #        for my $serial ( sort { $a <=> $b } keys %{$covered_of} ) {
    #            if ( $trusted->contains($serial) ) {
    #                my $cmd;
    #                $cmd .= "DBshow -U $basename $serial";
    #                $cmd .= " | faops replace -l 0 stdin first.replace.tsv stdout";
    #                $cmd .= " >> covered.fasta";
    #                App::Dazz::Common::exec_cmd( $cmd, { verbose => $opt->{verbose}, } );
    #            }
    #            else {
    #
    #                #@type AlignDB::IntSpan
    #                my $region = $covered_of->{$serial}{ $opt->{coverage} };
    #
    #                for my $set ( $region->sets ) {
    #                    next if $set->size < $opt->{len};
    #
    #                    my $cmd;
    #                    $cmd .= "DBshow -U $basename $serial";
    #                    $cmd .= " | faops replace -l 0 stdin first.replace.tsv stdout";
    #                    $cmd .= " | faops frag -l 0 stdin @{[$set->min]} @{[$set->max]} stdout";
    #                    $cmd .= " >> covered.fasta";
    #                    App::Dazz::Common::exec_cmd( $cmd, { verbose => $opt->{verbose}, } );
    #                }
    #            }
    #        }
    #
    #        if ( !$tempdir->child("covered.fasta")->is_file ) {
    #            Carp::croak "Failed: create covered.fasta\n";
    #        }
    #    }

    {
        $meta_of->{TRUSTED}         = $meta_of->{TRUSTED}->runlist;
        $meta_of->{NON_OVERLAPPED}  = $meta_of->{NON_OVERLAPPED}->runlist;
        $meta_of->{REPEAT_LIKE}     = $meta_of->{REPEAT_LIKE}->runlist;
        $meta_of->{PARTIAL_COVERED} = $meta_of->{PARTIAL_COVERED}->runlist;
        $meta_of->{TOTAL_RANGE}     = $first_range->runlist;

        $tempdir->child("meta.cover.json")
            ->spew( JSON::to_json( $meta_of, { pretty => 1, canonical => 1, } ) );
        $tempdir->child("meta.cover.json")->copy( $opt->{outfile} );

        $tempdir->child("partial.txt")
            ->spew( map { sprintf "%s:%s\n", $_, $covered_of->{$_} } keys %{$covered_of} );
        $tempdir->child("partial.txt")->copy("$opt->{outfile}.partial.txt");

        YAML::Syck::DumpFile( "coverage.yml", $coverage_of );
        $tempdir->child("coverage.yml")->copy("$opt->{outfile}.coverage.yml");
    }

    chdir $cwd;
}

1;


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.