Group
Extension

App-Dazz/lib/App/Dazz/Common.pm

package App::Dazz::Common;
use strict;
use warnings;
use autodie;

use 5.010001;

use Carp qw();
use Graph;
use IO::Zlib;
use IPC::Cmd qw();
use JSON qw();
use List::Util;
use Path::Tiny qw();
use Statistics::Descriptive qw();
use YAML::Syck qw();

use AlignDB::IntSpan;
use AlignDB::Stopwatch;
use App::RL::Common;
use App::Fasops::Common;

sub get_len_from_header {
    my $fa_fn = shift;

    my %len_of;

    my $fa_fh;
    if ( lc $fa_fn eq 'stdin' ) {
        $fa_fh = *STDIN{IO};
    }
    else {
        open $fa_fh, "<", $fa_fn;
    }

    while ( my $line = <$fa_fh> ) {
        if ( substr( $line, 0, 1 ) eq ">" ) {
            if ( $line =~ /\/(\d+)\/\d+_(\d+)/ ) {
                $len_of{$1} = $2;
            }
        }
    }

    close $fa_fh;

    return \%len_of;
}

sub get_replaces {
    my $fn = shift;

    my $replace_of = {};
    my @lines = Path::Tiny::path($fn)->lines( { chomp => 1 } );

    for my $line (@lines) {
        my @fields = split /\t/, $line;
        if ( @fields == 2 ) {
            if ( $fields[0] =~ /\/(\d+)\/\d+_\d+/ ) {
                $replace_of->{$1} = $fields[1];
            }
        }
    }

    return $replace_of;
}

sub exec_cmd {
    my $cmd = shift;
    my $opt = shift;

    if ( defined $opt and ref $opt eq "HASH" and $opt->{verbose} ) {
        print STDERR "CMD: ", $cmd, "\n";
    }

    system $cmd;
}

sub parse_ovlp_line {
    my $line = shift;

    chomp $line;
    my @fields = split "\t", $line;

    my $info = {
        f_id      => $fields[0],
        g_id      => $fields[1],
        ovlp_len  => $fields[2],
        ovlp_idt  => $fields[3],
        f_strand  => $fields[4],
        f_B       => $fields[5],
        f_E       => $fields[6],
        f_len     => $fields[7],
        g_strand  => $fields[8],
        g_B       => $fields[9],
        g_E       => $fields[10],
        g_len     => $fields[11],
        contained => $fields[12],
    };

    return $info;
}

sub beg_end {
    my $beg = shift;
    my $end = shift;

    if ( $beg > $end ) {
        ( $beg, $end ) = ( $end, $beg );
    }

    if ( $beg == 0 ) {
        $beg = 1;
    }

    return ( $beg, $end );
}


sub serial2name {
    my $dazz_db = shift;

    my $serials = shift;

    my $cmd = sprintf "DBshow -n %s %s ", $dazz_db, join( " ", @{$serials} );
    my @lines = map { $_ =~ s/^>//; $_; } grep {defined} split /\n/, `$cmd`;

    my %name_of;
    for my $i ( 0 .. $#lines ) {
        $name_of{ $serials->[$i] } = $lines[$i];
    }

    return \%name_of;
}

sub judge_distance {
    my $d_ref = shift;
    my $max_dis = shift || 5000;

    return 0 unless defined $d_ref;

    my $sum = 0;
    my $min = $d_ref->[0];
    my $max = $min;
    for my $d ( @{$d_ref} ) {
        $sum += $d;
        if ( $d < $min ) { $min = $d; }
        if ( $d > $max ) { $max = $d; }
    }
    my $avg = $sum / scalar( @{$d_ref} );
    return 0 if $avg > $max_dis;
    return 0 if $avg == 0;

    # max k-mer is 127.
    # For k-unitigs, overlaps are less than k-mer
    my $v = $max - $min;
    if ( $v < 20 or abs( $v / $avg ) < 0.2 ) {
        return 1;
    }
    else {
        return 0;
    }
}

sub g2gv {
    require GraphViz;

    #@type Graph
    my $g  = shift;
    my $fn = shift;

    my $gv = GraphViz->new( directed => 1 );

    for my $v ( $g->vertices ) {
        $gv->add_node($v);
    }

    for my $e ( $g->edges ) {
        if ( $g->has_edge_weight( @{$e} ) ) {
            $gv->add_edge( @{$e}, label => $g->get_edge_weight( @{$e} ) );
        }
        else {
            $gv->add_edge( @{$e} );
        }
    }

    Path::Tiny::path($fn)->spew_raw( $gv->as_png );
}

sub g2gv0 {
    require GraphViz;

    #@type Graph
    my $g  = shift;
    my $fn = shift;

    my $gv = GraphViz->new( directed => 0 );

    for my $v ( $g->vertices ) {
        $gv->add_node($v);
    }

    for my $e ( $g->edges ) {
        $gv->add_edge( @{$e} );
    }

    Path::Tiny::path($fn)->spew_raw( $gv->as_png );
}

sub transitive_reduction {

    #@type Graph
    my $g = shift;

    my $count = 0;
    my $prev_count;
    while (1) {
        last if defined $prev_count and $prev_count == $count;
        $prev_count = $count;

        for my $v ( $g->vertices ) {
            next if $g->out_degree($v) < 2;

            my @s = sort { $a cmp $b } $g->successors($v);
            for my $i ( 0 .. $#s ) {
                for my $j ( 0 .. $#s ) {
                    next if $i == $j;
                    if ( $g->is_reachable( $s[$i], $s[$j] ) ) {
                        $g->delete_edge( $v, $s[$j] );

                        $count++;
                    }
                }
            }
        }
    }

    return $count;
}

# https://metacpan.org/source/GSULLIVAN/String-LCSS-1.00/lib/String/LCSS.pm
# `undef` is returned if the susbstring length is one char or less.
# In scalar context, returns the substring.
# In array context, returns the index of the match root in the two args.
sub lcss {
    my $solns0 = ( _lcss( $_[0], $_[1] ) )[0];
    return unless $solns0;
    my @match = @{$solns0};
    return if length $match[0] == 1;
    return wantarray ? @match : $match[0];
}

sub _lcss {

    # Return array-of-arrays of longest substrings and indices
    my ( $r1, $r2 ) = @_;
    my ( $l1, $l2, ) = ( length $r1, length $r2, );
    ( $r1, $r2, $l1, $l2, ) = ( $r2, $r1, $l2, $l1, ) if $l1 > $l2;

    my ( $best, @solns ) = 0;
    for my $start ( 0 .. $l2 - 1 ) {
        for my $l ( reverse 1 .. $l1 - $start ) {
            my $substr = substr( $r1, $start, $l );
            my $o = index( $r2, $substr );
            next if $o < 0;
            if ( $l > $best ) {
                $best = length $substr;
                @solns = [ $substr, $start, $o ];
            }
            elsif ( $l == $best ) {
                push @solns, [ $substr, $start, $o ];
            }
        }
    }
    return @solns;
}

sub histogram_percentile {
    my $hist_of  = shift;
    my $fraction = shift;

    my @keys = sort { $a <=> $b } keys %{$hist_of};
    if ( scalar @keys == 0 ) {
        return 0;
    }

    my $target     = List::Util::sum( values %{$hist_of} ) * $fraction;
    my $cumulative = 0;

    for my $i (@keys) {
        $cumulative += $hist_of->{$i};

        if ( $cumulative >= $target ) {
            return $i;
        }
    }

    return $keys[-1];
}

1;


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.