Group
Extension

Geanfammer/Bin/geanfammer.pl

#!/usr/bin/perl
# Last Update by /usr/users/jpark/Perl/update_subroutines.pl: Sat Jan 24 20:29:50 EST 1998
#________________________________________________________________________
# Title     : geanfammer.pl
#
# Usage     : geanfammer.pl DATABASE(or GENOME) [a= ] [e= ] [f= ]
#             geanfammer.pl DB=XXX.mpfa File=your_query.mpfa e=0.001
#             geanfammer.pl Big_fasta Small_fasta <- Big=>target DB, small=>query db
#            * look at the Example : in the header section of this program!
#
# Function  : Creates a domain level clustering file from a given
#              FASTA format sequence DB. It has been used for complete
#              genome sequence analysis.
#
#              ------------ USAGE INFORMATION -------------------
#             The parameters you put are important for the meaningful
#               protein family maker.
#             The most important one is the E and e options (Mostly,
#               they will have same value).
#             Large E is for setting the threshold for the single
#               linkage clustering.
#             This means, any sequence hit BELOW the threshold
#               (which is good ) will be linked.
#             For example, if Seq1 matched with Seq2 with E value
#              of FASTA search:
#              0.001, and you set the threshold 0.1, then YOU
#              ordered the geanfammer to regard them a family.
#
#             The second small e option is for the dividing a complex
#              and wrong cluster into correct more correct
#              duplication modules. This is necessary as a
#              lot of multidomain proteins can be clustered together
#              WRONGLY by single linkage.
#             At this stage, the e value is irrelevant to E value
#              and you can set a higher or lower one. Or you can set
#              the same as E(just set the 2 the same!)
#
#             Rough guide from our experience for E and e values:
#              We know that with 1000 sequence database, 0.01
#              produces around 1% error in grouping sequences
#              according to the evalue.
#              With 180,000, 0.081 gave us less than 1% error.
#             Evalue of FASTA and SSEARCH is DEPENDENT on DB size,
#              so you need to play a little bit to know the best
#              E value for your OWN database or genome.
#             The best approach is :
#               1) You run geanfammer.pl with any of your target DB
#                  with certain E value you like
#               2) Check sequence families which are clustered
#                  in the final resultant file xxxx.gclu and decide
#                  if the E value is low or high. Lower evalues will
#                  make sure you do not make wrong clusters while
#                  high evalue will include more probable sequence
#                  family members.
#               3) Put all the xxxx.mspa files in subdirectory(s)
#                  created by geanfammer and run divclus.pl (which
#                  is accompanied in the package) with different
#                  Evalues. Divclus will not run any search algorithm
#                  etc, so it can be done fairly quickly.
#
#          * Most of the subroutines are found in Bio.pl or Bio.pm
#            Bioperl library for perl.
#
#
# Example   :geanfammer.pl E_gnme.fa             # simplest form (FASTA ssearch, I guess)
#            geanfammer.pl E_gnme.fa a=ssearch   # use SSEARCH
#            geanfammer.pl E_gnme.fa a=blast     # use PSI-blast ssearch
#            geanfammer.pl E_gnme.fa o           # for overwriting
#                                                   when you want a
#                                                   fresh run ovr old
#            geanfammer.pl E_gnme.fa c         # For keeping
#                                                 SSO files
#                                                 (fasta output)
#            geanfammer.pl E_gnme.fa k=2       # changing default
#                                                 k tuple for
#                                                 FASTA to 2
#       -->  geanfammer.pl E_gnme.fa E=0.01
#
# Keywords  : genome_analysis_and_protein_family_maker,
#             genome_ana_protein_fam_maker
# Options   :
#             o  for overwrite existing xxxx.fa files for search
#             c  for create SSO file (sequence search out file)
#             d  for very simple run and saving the result in
#                    xxxx.gz format in sub dir starting with one char
#             N
#             s
#             m
#             v  for debugging purpose. It says more to you while running
#             z
#             D  for making subdir like ./MG or /FA in PWD. For clean PWD
#             L  for Lean output(removes all the intermediate
#                                     outputs to save space)
#             u  for making separate summary file (redundant now)
#
#             DB=
#             File=
#             k= for k-tuple value. default is 1 (ori. FASTA prog.
#                                                   default is 2)
#             a= for choosing either fasta or ssearch algorithm
#                    You can set absolute path like (/usr/bin/fasta)
#             E= for Evalue cutoff for single linkage clustering
#                    $E_cut_main AND divide_clusters subroutine.
#             u=
#             l=
#             d=
#             T= for minimal domain size (default is 30 aa residue)
#
#   !! Do not remove the following lines down to # Author line.
#                This program parses them!!
#
#  $factor=                 by f=     ## overlapping factor
#  $Lean_output=L           by L -L
#  $dynamic_factor=y        by y  Y -y -Y
#  $over_write=o            by o -o
#  $create_sso_file=c       by c -c
#  $k_tuple=                by k=
#  $upper_expect_limit=     by u=
#  $lower_expect_limit=     by l=
#  $algorithm=              by a=
#  $No_processing=N         by N -N
#  $single_msp=s            by s -s
#  $sequence_DB=            by DB= D=
#  $query_File=             by File= F=
#  $machine_readable=M      by M -M
#  $make_subdir_out=D       by D
#  $make_subdir_gzipped=d   by d -d
#  $direct_MSP_conversion=m by m -m
#  $verbose=v               by v -v
#  $sub_dir_size=           by d=
#  $Evalue_thresh=          by E=
#  $optimize=z              by z -z
#  $make_separate_summary=u by u -u
#  $length_thresh=          by T=      # minimal sequence domain length threshold
#  $take_last_iter_PSI_BLA=l  by l -l
#  $PSI_BLA_iter_limit=     by j=
#  $PSI_BLA_step_eval=      by e=
#
# Author    : Sarah A Teichmann, Jong Park, sat@mrc-lmb.cam.ac.uk,
#                                      jong@salt2.med.harvard.edu
#             For commercial use, please contact one of the authors.
# Version   : 3.4
#------------------------------------------------------------------

print "\n\n\n#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
print "# (1) $0: The Geanfammer started ... BEFORE parsing ARGUments\n";
print "#   -Remember that default evalue used is 0.01. For PSI-Blast step E=0.001 \n";
print "#_____________________________________________________________________\n";

@final_clu_files=@{&geanfammer_main};


#________________________________________________________________________________
# Title     : geanfammer_main
# Usage     : &geanfammer_main;
# Function  : The main sub of geanfammer
# Example   :
# Keywords  : main_geanfammer, geanfammer
# Options   :
# Version   : 2.4
#--------------------------------------------------------------------------------
sub geanfammer_main{
    ### !!! As parse_arguments is in this, DO not 'my' any variables here
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # All the defaults, Evalues are determined in geanfammer sub
    #__________________________________________________________________
    $|=1;

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # If $algorithm is not set by the user, set it to FASTA
    #_________________________________________________________
    unless($algorithm){
        $algorithm='fasta'; # default search algorithm setting(will be overridden
    }                         # by 'a=xxx' prompt argument

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Setting the default values
    #_______________________________________________________________
    $sub_dir_size=2; # this is the subdirectory name char size
    $machine_readable='M';  # this is to invoke FASTA's m=10 option. It makes machine readable output which is used by JONG's programs
    $make_msp_in_sub_dir_opt='m';
    $make_subdir_gzipped='d';
    $make_subdir_out=$make_msp_in_sub_dir_opt='D';
    $Evalue_thresh=0.01;
    $length_thresh=30;  # default
    $take_last_iter_PSI_BLA='l';  ## this is to take the last iter only(default)
    $PSI_BLA_iter_limit=5;            # let's set it low
    $PSI_BLA_step_eval=0.001;  # default
    $make_MSP_files='q';

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # preprocessing the inputs,  parse_arguments reads in the options in the headerbox
    # JONG's parse_arguments parses the shell prompt input files and options
    #__________________________________________________________________________________
    @your_genome_or_db_to_analyse_file=@{&parse_arguments(1)};

    print "\n\n# (2) geanfammer_main (with $algorithm): \@your_genome_or_db_to_analyse_file are(is) @your_genome_or_db_to_analyse_file\n";
    print "\n# (2.1) geanfammer_main (with $algorithm): \$sequence_DB: $sequence_DB \$query_File: $query_File\n";
    print "\n => @your_genome_or_db_to_analyse_file with $algorithm. Min domain size is \"$length_thresh\"
    \$make_subdir_out is $make_subdir_out\n\n";

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Just checking error!. So far nothing important happened.
    #_____________________________________________________________
    if(@your_genome_or_db_to_analyse_file < 1
         and (!$sequence_DB and !$query_File) ){
        print "\n# (E) geanfammer_main: ERROR!\n";
        print "\n# Dear $ENV{'USER'}, $0: failed to find input file!\n
                    Did you put FASTA format DB file as input?\n
                    Or I guess your INPUT file DOES NOT exist in PWD.\n\n";
        print " As like:  $0 MG.fa \n\n\n";
        print chr(7);
        die;
    }else{ print "# (i) $0 is running geanfammer sub\n"; }


    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # This is to handle 'geanfammer.pl temp.fa DB=temp_DB.fa' situation
    # where, the query file(temp) and Target DB file (DB) are not the same
    #____________________________________________________________________
    if(@your_genome_or_db_to_analyse_file==1 and -s $sequence_DB){
         $query_File=$your_genome_or_db_to_analyse_file[0];
    }elsif(@your_genome_or_db_to_analyse_file==1 and -s $query_File){
         $sequence_DB=$your_genome_or_db_to_analyse_file[0];
    }

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
    # Checking if they have 'fasta' or 'ssearch' in the path
    #__________________________________________________________________________________
    if($algorithm=~/\/([^\/]+) *$/){
        print "\n# Checking \$algorithm ($algorithm) name\n";
        $algorithm_name=$1;
        if(-s $algorithm){
            print "\n# (i) $0 will use \"$algorithm\"\n";
        }else{
            $result_of_which_run=`which $algorithm_name`;
            if($result_of_which_run=~/^ *(\/\S+\/)[fastassearch]+\d* *$/){ ## after Lily Fu's suggestion
                print "\n# $0: Your $algorithm_name is in $1, good, I am running it\n";
            }else{
                print "\n# (E) \$algorithm value $algorithm is not found\n";
                print "\n# (E) $0 ran \'which\' Linux command and the result is:\n";
                print "  $result_of_which_run, You may have fasta3 instead of fasta\n";
                print "\n# Please check your path for $algorithm\n\n"; print chr(7);
                die;
            }
        }
    }

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # NOW the REAL thing is here !!!!!!!!!
    # Running the actual big sub 'geanfammer'
    #_____________________________________________
    @final_clu_files=@{&geanfammer(\@your_genome_or_db_to_analyse_file,
                "DB=$sequence_DB",  ## when user specified the target DB
                "File=$query_File", ## when user specified the query file
                "f=$factor",        ## f is the sequence overlap factor option (very important!!)
                "a=$algorithm",     ## what type of search algo shall I use?
                "T=$length_thresh", ## MINIMUM seq. length thresh
                 $verbose,          ## Nothing, just ignore
                 "d=$sub_dir_size", ## When geanfammer makes temporary sub dir such as SU, how many chars can it use?
                 $over_write,
                 $dynamic_factor,
                 $create_sso_file,
                 $reverse_sequence,
                 $machine_readable,
                 $make_msp_in_sub_dir_opt,
                 $make_subdir_out,
                 $make_MSP_files,
                 "E=$Evalue_thresh",
                 $make_subdir_gzipped,
                 $Lean_output,
                 $take_last_iter_PSI_BLA,
                 "j=$PSI_BLA_iter_limit",
                 "e=$PSI_BLA_step_eval"
             )};
    print "\n# $0 : the final output \'@final_clu_files\' is created " if $make_separate_summary;
    print "\n#__________________________________________________________________\n\n" if $verbose;
    return(\@final_clu_files);
}



#------------------------- SUB ROUTINES -----------------------------------


#______________________________________________________________________
# Title     : geanfammer
# Usage     : &geanfammer(\@your_genome_or_db_to_analyse_file,
#                          $verbose);
#
# Function  : Creates a domain level clustering file from a given
#              FASTA format sequence DB. It has been used for complete
#              genome sequence analysis. Can use PSI-blat, fasta, ssearch
#
#              ------------ USAGE INFORMATION -------------------
#             The parameters you put are important for the meaningful
#               protein family maker.
#             The most important one is the E and e options (Mostly,
#               they can have same value).
#             Large E is for setting the threshold for the single
#               linkage clustering.
#             This means, any sequence hit BELOW the threshold
#               (which is good ) will be linked.
#             For example, if Seq1 matched with Seq2 with E value
#              of FASTA search:
#              0.001, and you set the threshold 0.1, then YOU
#              ordered the geanfammer to regard them a family.
#
#             The second small e option is for the dividing a complex
#              and wrong cluster into correct more correct
#              duplication modules. This is necessary as a
#              lot of multidomain proteins can be clustered together
#              WRONGLY by single linkage.
#             At this stage, the e value is irrelevant to E value
#              and you can set a higher or lower one. Or you can set
#              the same as E.
#
#             Rough guide from our experience for E and e values:
#              We know that with 1000 sequence database, 0.01
#              produces around 1% error in grouping sequences
#              according to the evalue.
#              With 180,000, 0.081 gave us less than 1% error.
#             Evalue of FASTA and SSEARCH is DEPENDENT on DB size,
#              so you need to play a little bit to know the best
#              E value for your database or genome.
#             The best approach is :
#               1) You run geanfammer.pl with any of your target DB
#                  with certain E value you like
#               2) Check sequence families which are clustered
#                  in the final resultant file xxxx.gclu and decide
#                  if the E value is low or high. Lower evalues will
#                  make sure you do not make wrong clusters while
#                  high evalue will include more probable sequence
#                  family members.
#               3) Put all the xxxx.mspa files in subdirectory(s)
#                  created by geanfammer and run divclus.pl (which
#                  is accompanied in the package) with different
#                  Evalues. Divclus will not run any search algorithm
#                  etc, so it can be done fairly quickly.
#
# Example   :geanfammer.pl E_gnme.fa             # simplest form
#            geanfammer.pl E_gnme.fa a=ssearch   # use SSEARCH
#            geanfammer.pl E_gnme.fa o           # for overwriting
#                                                   when you want a
#                                                   fresh run ovr old
#            geanfammer.pl E_gnme.fa c         # For keeping
#                                                 SSO files
#                                                 (fasta output)
#            geanfammer.pl E_gnme.fa k=2       # changing default
#                                                 k tuple for
#                                                 FASTA to 2
#            geanfammer.pl E_gnme.fa E=0.01     # set the E value
#                                                 for initial single
#                                                 linkage clustering
#            geanfammer.pl E_gnme.fa e=0.01    # set the E value
#                                                for domain level linkage
#       -->  geanfammer.pl E_gnme.fa e=0.01 E=0.01 # set the 2 E values
#                                                    separately (no need
#                                                    to do this)
#
# Keywords  : genome_analysis_and_protein_family_maker,
#             genome_ana_protein_fam_maker
# Options   :
#             o  for overwrite existing xxxx.fa files for search
#             c  for create SSO file (sequence search out file)
#             d  for very simple run and saving the result in
#                    xxxx.gz format in sub dir starting with one char
#             N
#             s
#             m
#             v
#             z
#             D
#             y  for dynamic factor
#             L  for Lean output(removes all the intermediate
#                                     outputs to save space)
#             u  for making separate summary file (redundant now)
#
#             DB=
#             File=
#             k= for k-tuple value. default is 1 (ori. FASTA prog.
#                                                   default is 2)
#             a= for choosing either fasta or ssearch algorithm
#             E= for Evalue cutoff for single linkage clustering
#                    $E_cut_main
#             e= for Evalue cutoff for divide_clusters subroutine.
#             u=
#             l=
#             d=
#             T= for $length_thresh (minimal domain size thresh)
#
#   !! Do not remove the following lines down to # Author line.
#                This program parses them
#
#  $factor=                 by f=       ## overlapping factor
#  $Lean_output=L           by L -L
#  $dynamic_factor=y        by y  Y -y -Y
#  $over_write=o            by o -o
#  $create_sso_file=c       by c -c
#  $k_tuple=                by k=
#  $upper_expect_limit=     by u=
#  $lower_expect_limit=     by l=
#  $algorithm=              by a=
#  $No_processing=N         by N -N
#  $single_msp=s            by s -s
#  $sequence_DB=            by DB=
#  $query_File=             by File=
#  $machine_readable=M      by M -M
#  $make_subdir_out=D       by D
#  $make_subdir_gzipped=d   by d -d
#  $make_MSP_files=q        by q -q
#  $direct_MSP_conversion=m by m -m
#  $verbose=v               by v -v
#  $sub_dir_size=           by d=
#  $Evalue_thresh=          by E=
#  $optimize=z              by z -z
#  $make_separate_summary=u by u -u
#  $length_thresh=          by T=
#  $take_last_iter_PSI_BLA  by l
#  $PSI_BLA_iter_limit=     by j=
#  $PSI_BLA_step_eval=      by e=
#
# Author    : Sarah A Teichmann, Jong Park, sat@mrc-lmb.cam.ac.uk,
#                                      jong@salt2.med.harvard.edu
# Version   : 3.3
#--------------------------------------------------------------------------------
sub geanfammer{
    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my ($algorithm, $upper_expect_limit, $k_tuple, $sub_dir_size, $factor,
            @MSPA_files_main, $msp_directly_opt, %hash_main, $single_linkage_file,
            @written_mspa_files, $num_of_seq_in_FASTA_file, $take_last_iter_PSI_BLA,
            $Evalue_thresh, %fasta_seqs, $final_summary_file,
            @sub_clustering_clu_files, $base, $final_gclu_output, $Lean_output,
            $dynamic_factor, $length_thresh, $make_msp_in_sub_dir_opt );

    $algorithm         ='fasta'; # default algorithm set(you can change of course by using a= opt
    $upper_expect_limit= 10;     # default
    $k_tuple=1;                  # FASTA default (not applicable for other algorithms)
    $sub_dir_size=2;             # default
    $msp_directly_opt='m';
    $Evalue_thresh=0.01; # default E value
    $length_thresh=30;            # default
    $PSI_BLA_iter_limit=5;
    $PSI_BLA_step_eval=0.001;
    $take_last_iter_PSI_BLA='l';

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Dealing with options
    #_________________________________________
    if($char_opt=~/m/){    $msp_directly_opt='m';   }
    if($char_opt=~/b/){    $do_in_batch        = 'b' } # for reading in all the
    if($char_opt=~/v/){    $verbose='v';             } # for showing debugging information
    if($char_opt=~/i/){    $indup='i';               }
    if($char_opt=~/R/){    $add_range          = 'r' }
    if($char_opt=~/y/){    $dynamic_factor     = 'y' }
    if($char_opt=~/z/){    $optimize           = 'z' }
    if($char_opt=~/o/){    $over_write         = 'o' }
    if($char_opt=~/c/){    $create_sso         = 'c' }
    if($char_opt=~/s/){    $single_big_msp     = 's'; print "\n# Single file opt is set\n"; }
    if($char_opt=~/M/){    $machine_readable   = 'M' }
    if($char_opt=~/d/){$make_gz_in_sub_dir_opt = 'd' } # for simple search and storing in gz file (sso file will be zipped
    if($char_opt=~/D/){$make_msp_in_sub_dir_opt= 'D' } # for simple search and storing msp file
    if($char_opt=~/n/){    $new_format         = 'n' }
    if($char_opt=~/x/){    $answer_for_search  = 'x' }
    if($char_opt=~/r/){    $reverse_sequence   = 'r'  }
    if($char_opt=~/L/){    $Lean_output        = 'L'  }
    if($char_opt=~/l/){    $take_last_iter_PSI_BLA = 'l'; print "\n     \"l\" option was used for last Blast iteration parsing\n";  }
    if($char_opt=~/q/){    $make_MSP_files     ='q'  }
    if($vars{'f'}=~/\S+/){       $factor= $vars{'f'}; }
    if($vars{'a'}=~/(\S+)/){   $algorithm          = $1            }
    if($vars{'l'}=~/(\S+)/){   $lower_expect_limit = $1            }
    if($vars{'u'}=~/(\S+)/){   $upper_expect_limit = $1            }
    if($vars{'k'}=~/(\S+)/){   $k_tuple            = $1            }
    if($vars{'t'}=~/(\S+)/){   $Score_thresh       = $1            }
    if($vars{'d'}=~/(\S+)/){   $sub_dir_size       = $1            }
    if($vars{'s'}=~/(\S+)/){   $single_big_msp     = $1            }
    if($vars{'DB'}=~/(\S+)/){  $sequence_DB        = $1; push(@file,$sequence_DB);   }
    if($vars{'File'}=~/(\S+)/){$query_File         = $1; push(@file,$query_File);  }
    if($vars{'Query_seqs'}=~/(\S+)/){            %seq_input = %{$1}            }
    if($vars{'E'}=~/\S+/){   $Evalue_thresh      = $vars{'E'};     }
    if($vars{'T'}=~/\S+/){   $length_thresh      = $vars{'T'};     }
    if($vars{'j'}=~/\S+/){   $PSI_BLA_iter_limit = $vars{'j'};   }
    if($vars{'e'}=~/\S+/){   $PSI_BLA_step_eval    = $vars{'e'};   }

    if(@file==2 and (!$query_File and !$sequence_DB) ){
        ($query_File, $sequence_DB)=@{&sort_files_by_size(\@file)};
    }elsif(@file==1){ $query_File=$file[0]; $sequence_DB=$file[0]; }
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Only the Query file is taken to @file variable
    #________________________________________________
    @file=($query_File);

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Now, lets break the input query FASTA file to do something useful for humanity!
    # creating xxx.msso.gz or pbla files in subdirs
    #_______________________________________________________________________________________
    for($i=0; $i< @file; $i++){
        if(-s $file[$i]){   $input_db_or_genome=$file[$i] }
        if(-s $query_File){ $input_db_or_genome=$query_File }

        print "\n# (i) geanfammer : \$query_File: $query_File, \$sequence_DB: $sequence_DB, Evalue for single link:$Evalue_thresh\n";
        print "# (i) geanfammer : Min. Domain size is $length_thresh\n";
        print "# (i) geanfammer : Factor is $factor\n";
        print "# (i) geanfammer : \$Evalue_cut_divclus is $Evalue_cut_divclus\n";
        $base=${&get_base_names($query_File)};
        $final_gclu_output="$base\.gclu";
        %fasta_seqs=%{&open_fasta_files(\$query_File)};
        $num_of_seq_in_FASTA_file=keys %fasta_seqs;

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # When you want to reverse your protein sequence for whatever crazy reason, do it here
        #________________________________________________________________________________________
        if($reverse_sequence){ ## reverse the query seqs.
            %fasta_seqs=%{&reverse_sequences(\%fasta_seqs)};
        }

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Ask if she wants to search the db first, as it is the first time in her life??
        #________________________________________________________________________________
        unless($answer_for_search and !$over_write){
            print "\n#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~";
            print "\n# (3) You should have working FASTA (or SSEARCH) in the path";
            print "\n#     If you are doing the very first search now, Say, \"y\"es.     ";
            print "\n#_______________________________________________________________";
            print "\n# Shall I do seq. search using $file[$i]?", "\n  (y/n)\n\n \t-->";
            $answer_for_search=getc;
        }

        if($answer_for_search=~/y/i){
            $over_write='o';

            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            #  By default, the search level E value is always higher by 2 ! Why?, well, that is how I decided.
            #____________________________________________________________________________________________________
            $Evalue_thresh_AT_SEARCH=$Evalue_thresh+2; ## adding 2 to the given E value

            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            #  Now, let's do the critical SEARCHes.....
            #  Searching the db self to self using default algorithm of FASTA ktup=1
            #__________________________________________________________________________
            $num_of_seq_in_FASTA_file=@{&do_sequence_search(
                                $Lean_output,
                                "a=$algorithm",
                                \$file[$i],
                                $over_write,
                                $msp_directly_opt,
                                "u=$upper_expect_limit",
                                $do_in_batch,
                                $create_sso,
                                $reverse_query,
                                $single_big_msp,
                                $machine_readable,
                                "DB=$sequence_DB",
                                "File=$query_File",
                                $make_msp_in_sub_dir_opt,
                                $make_gz_in_sub_dir_opt,
                                "d=$sub_dir_size",
                                $new_format,
                                $add_range,
                                "E=$Evalue_thresh_AT_SEARCH",
                                $take_last_iter_PSI_BLA,
                                "j=$PSI_BLA_iter_limit",
                                "e=$PSI_BLA_step_eval",
                            ) };
        }else{
            print "\n# (i) $0; You chose NO search. I will cluster and divide now\n";
            print "# (i) The min domain size var \$length_thresh is $length_thresh\n";
            $over_write='o';
            sleep(1);
        }

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # MSPA files are the most important outputs. Formerly, it was called MSP files
        #______________________________________________________________________________________
        @MSPA_files_main=@{&get_all_mspa_files};
        unless(@MSPA_files_main > 0){ die "\n (E) Something is wrong. @MSPA_files_main is empty. \n\n\n"; }

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # When users didn't define E value and no STATIC Eval is set,
        #    estimate according to the db size
        #______________________________________________________________________
        if( !$Evalue_thresh  or $dynamic_factor){
                #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # following is a very rough guide for a reasonable E value thresh for different DB size
                #______________________________________________________________________________________________
                if(@MSPA_files_main > 280,000){     $Evalue_thresh=0.2;
                }elsif(@MSPA_files_main > 180000){  $Evalue_thresh=0.1;
                }elsif(@MSPA_files_main > 50000){   $Evalue_thresh=0.08;
                }elsif(@MSPA_files_main > 10000){   $Evalue_thresh=0.07;
                }elsif(@MSPA_files_main > 1000){    $Evalue_thresh=0.05;
                }elsif(@MSPA_files_main > 500){     $Evalue_thresh=0.03;
                }elsif(@MSPA_files_main > 100){     $Evalue_thresh=0.02;
                }elsif(@MSPA_files_main > 50){      $Evalue_thresh=0.01;
                }elsif(@MSPA_files_main > 20 ){     $Evalue_thresh=0.001;
                }
        }


        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Making single linkage cluster file xxxxxxxx.sclu
        #___________________________________________________________________________________________
        print "\n#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`#\n";
        print "# BEfore running Sinlge linkage sub                    #\n";
        print "# Evalue single for clus & Divclus  : $Evalue_thresh   #\n";
        print "#______________________________________________________#\n";
        %hash_main=%{&mspa_single_link_hash(\@MSPA_files_main, $Evalue_thresh)};
        sleep(2);


        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Single LINKAGE file,
        #___________________________________________________________________________________________
        $single_linkage_file=${&print_clusfile_from_hash(\%hash_main)};

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Make MSP files from the SCLU file (like: 2-2_cluster.mspa 3-4_cluster.mspa)
        #__________________________________________________________________________________
        if(-s $single_linkage_file){
            print "\n# (i) Good! \$single_linkage_file $single_linkage_file is made\n";
            @written_mspa_files=@{&convert_clu_to_mspa(\$single_linkage_file, $Lean_output)};
        }else{
            print "\n# (E) BAD! geanfammer: Before divide_clusters sub \n";
            print "\n# (E) BAD! \$single_linkage_file $single_linkage_file is NOT here\n";
            die;
        }

        $over_write='o';
        $average_region='A';
        $range='r';
        $merge='m';
        $sat_file=0;

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # >>>>>> !! The very core of geanfammer. DIVCLUS
        #___________________________________________________________________________
        @sub_clustering_clu_files=@{&divide_clusters(
                        $Lean_output,
                        \@written_mspa_files,
                        "s=$score",
                        "f=$factor", ## this is a very impo. parameter in the behaviour of divclus, Sarah!
                        "T=$length_thresh",
                        "E=$Evalue_thresh", ## this is a very impo. parameter in the behaviour of divclus, Sarah!
                        $dynamic_factor,
                        $verbose,
                        $range,
                        $merge,
                        $sat_file,
                        $dindom,
                        $indup,
                        $over_write,
                        $optimize,
                        $short_region,
                        $large_region,
                        $average_region
                     )};

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        #  This is the final result now.
        #_______________________________________________________________

        #print "\n\n# create_sorted_cluster: Final gclu file name is $final_gclu_output\n  with @sub_clustering_clu_files\n" if $verbose;
        $good_cluster_file=${&create_sorted_cluster(\@sub_clustering_clu_files, \$final_gclu_output)};
        print "\n\n\n#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~";
        print "\n# ! Finished create_sorted_cluster(geanfammer), result is \"$good_cluster_file\"";
        print "\n#_________________________________________________________________\n\n\n";

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        #  make_clustering_summary  became redundant after peer review
        #_______________________________________________________________
        $final_summary_file=${&make_clustering_summary(\$final_gclu_output, $make_separate_summary)};

        push(@files_created, $final_summary_file);

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Make the whole output Lean
        #_______________________________________
        if($Lean_output){
                for($k=0; $k < @sub_clustering_clu_files; $k++){
                        $base_name=${&get_base_names($sub_clustering_clu_files[$k])};
                        $sub_clustering_msp_file="$base_name\.mspa";
                        unlink($sub_clustering_clu_files[$k], $sub_clustering_msp_file);
                }
        }
    }# end of for loop for @file
    return(\@files_created);
}



#_______________________________________________________________________
# Title     : divide_clusters
# Usage     : &divide_clusters(\@file);
# Function  : This is the main funciton for divclus.pl
#               divides complex single linkage cluster into smaller duplication
#               module level sub clusters.
# Example   : &divide_clusters(\@file, $verbose, $range, $merge, $sat_file,
# 	                $dindom, $indup, "T=$length_thresh", "E=$Evalue_thresh", $over_write,
#                   $optimize, "s=$score", "f=$factor");
#
# Keywords  : divicl, divclus, div_clus, divide clusters
# Options   : _  for debugging.
#   f=<digit>   for determing the factor in filtering out non-homologous
#                  regions, 7 = 70% now!!
#   l=<digit>   for seqlet(duplication module) length threshold
#   t=<digit>   for seqlet(duplication module) length threshold
#                  (same as l opt, confusing, huh? )
#   s=<digit>   for score threshold
#   E=<digit>   for evalue threshold
#   z           for activating remove_similar_sequences, rather than remove_dup....
#   v           for verbose printout (infor)
#   D           for dynamic factor
#   S  $short_region=  S by S -S  # taking shorter region overlap in removing similar reg
#   L  $large_region=  L by L -L  # taking larger  region overlap in removing similar reg
#   A  $average_region=A by A -A  # taking average region overlap in removing similar reg
#   o  for $over_write
#
# Version   : 3.4
#------------------------------------------------------------------------
sub divide_clusters{
    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my($merge, $verbose, $sat_file, $length_thresh, $factor, $indup, $indup_percent,
         $score, @temp_show_sub, $optimize, $file, $Evalue_thresh, $over_write, $din_dom,
         $sum_seq_num, $base_1, $output_clu_file, $short_region, $large_region,
         $average_region, $dynamic_factor, @sub_clustering_clu_files,
         @splited1, $link_or_not,  %duplicate, @No_file_found_divclus_fail);

    $optimize=1;
    $Evalue_thresh=0.001; # the default
    $factor=7; # default factor is 7 for 70%
    $length_thresh=30;

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Dealing with options
    #_________________________________________
    if($char_opt=~/m/){        $merge='m';
    }if($char_opt=~/v/){       $verbose='v'; # for showing debugging information
    }if($char_opt=~/i/){       $indup='i';
    }if($char_opt=~/z/){       $optimize='z';
    }if($char_opt=~/o/){       $over_write='o';
    }if($char_opt=~/d/){       $din_dom='d';
    }if($char_opt=~/s/){       $sat_file='s';
    }if($char_opt=~/y/){       $dynamic_factor='y';
    }if($char_opt=~/S/){       $short_region  ='S';
    }if($char_opt=~/L/){       $large_region  ='L';
    }if($char_opt=~/A/){       $average_region='A';
    }if($vars{'T'}=~/\d+/){    $length_thresh= $vars{'T'};
    }if($vars{'l'}=~/\d+/){    $length_thresh= $vars{'l'}; ## synonym of 't'
    }if($vars{'f'}=~/\S+/){    $factor= $vars{'f'};
    }if($vars{'s'}=~/\d+/){    $score = $vars{'s'};
    }if($vars{'e'}=~/\d+/){    $Evalue_thresh= $vars{'e'}; # synonym of e
    }if($vars{'E'}=~/\d+/){    $Evalue_thresh= $vars{'E'}; # synonym of e
    }
    $percent_fac=$factor*10; # <-- this is just to show the factor in %
    print "\n(i) Input to divide_clusters sub are: \"@file\" sleeping for 3 seconds"; sleep 3;

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # (0) When one file input was given (yes, divclus can handle multiple files, Sarah!)
    #________________________________________________________________________________
    if(@file == 1){  #<=== @file has xxxx.mspa, yyyy.mspa  zzzz.mspa ....,
		$file=$file[0];
		$base_1=${&get_base_names($file)};
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# (2) Define the output cluster file name:  eg, 3-232_cluster_F7.clu , F7 means factor used is 7
		#______________________________________________________________________________________________
		$output_clu_file="$base_1\_F${factor}\.clu";

		if( !$over_write and -s $output_clu_file){
			print "\n# $output_clu_file Already EXISTS, skipping. Use \'o\' opt to overwrite\n"; exit;
		}

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# (3) merge_sequence_in_mspa_file does not do much. Just filtering and producing
		#     sequences in ISPA_PBS_21-215 VPR_PBS_160-354 format from mspa format
		#________________________________________________________________________________
        print "\n(i) Running merge_sequence_in_mspa_file";
        @grouped_seq_names=@{&merge_sequence_in_mspa_file(\@file, "s=$score", $optimize, $din_dom, $sat_file,
							$optimize, "T=$length_thresh", "E=$Evalue_thresh", "f=$factor", "$range", "$merge", $verbose,
							$short_region, $large_region, $average_region, $over_write, $dynamic_factor)};

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# (4) This is critical seqlet merging step. Up to now, things are fine usually.!!!
		#________________________________________________________________________________
		unless(@grouped_seq_names == 1){  ##  if @grouped_seq_names has one string like 'FAM_8_7 FAM_8_4 FAM_8_3' skip
			F1: for($i=0; $i< @grouped_seq_names; $i++){
                @splited1=split(/\s+/, $grouped_seq_names[$i]);
				for($j=0; $j< @grouped_seq_names; $j++){
    				 if($grouped_seq_names[$i] eq $grouped_seq_names[$j]){ next  }
					 @splited2=split(/\s+/, $grouped_seq_names[$j]);
                     $link_or_not=${&check_linkage_of_2_similar_seqlet_sets(\@splited1,
                                                                           \@splited2,
                                                                           "f=$factor")};
					if($link_or_not){
                        $optimize=1; ## This should be nearly always 1 !!!!!!!
                        if($optimize){ ##---- This will also remove similar seqlets, not only identical ones
                            $grouped_seq_names[$i]=join(' ', sort @{&remove_similar_seqlets( [@splited1, @splited2],
																		$short_region, $large_region, $average_region)} );
     				    }else{
							$grouped_seq_names[$i]=join(' ', grep { ! $duplicate{$_}++ } (@splited1, @splited2) );
					    }
                        splice(@grouped_seq_names, $j,1);
						$j--; $i--; next F1;
					}
				}

             }
		}
		#~~~~~~~~~~~~~~ I used to use a sub, but to save time above is inserted ~~~~~~~~~~~~~
        #@grouped_seq_names=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, $dynamic_factor,
	    #				 "f=$factor", $short_region, $large_region, $average_region, $optimize)};

				#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				# (5) This is showing the result in clu file format
				#________________________________________________________________________________
                @temp_show_sub=&show_subclusterings(\@grouped_seq_names, $file, $sat_file, $dindom, $indup,
						   "E=$Evalue_thresh", "p=$percent_fac", "f=$factor" );
				$good_bad       = $temp_show_sub[0];
				$indup_c        = $temp_show_sub[1];
				$sum_seq_num   += $temp_show_sub[2];
				push(@sub_clustering_out_files, @{$temp_show_sub[3]});

				if($good_bad==1){      push(@good, $file);
				}else{                 push(@bad, $file);       }

				#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				# (6) Final write up stage (unecessary)
				#_______________________________________________________________
          &write_good_bad_list_in_divide_clusters(\@good, \@bad);

	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 # when more than one single file input is given (Default usually)
	 #_________________________________________________________________
	 }elsif(@file >1 ){
		 my (@good, @bad);
		 if($indup =~/i/i){   open (INDUP, ">indup_stat\.txt");  } # this is not essential.

         for($i=0; $i< @file; $i++){
			my (@grouped_seq_names, @temp_show_sub, $indup_c, $big_mspa_file);
			$indup_c=0;
			$big_mspa_file=$file[$i];
            unless(-s $big_mspa_file){
               print "\n# (E) \$big_mspa_file ($big_mspa_file) does not exist.\n\@file: @file\n";
               push(@No_file_found_divclus_fail, $big_mspa_file);
            }

			$base_1=${&get_base_names($big_mspa_file)};
			#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			# (1) Define the output cluster file name:  eg, 3-232_cluster_F7.clu , F7 means factor used is 7
			#______________________________________________________________________________________________
			$output_clu_file="$base_1\_F${factor}\.clu";

			if( !$over_write and -s $output_clu_file){	print "\n# $output_clu_file Already EXISTS, skipping. Use \'w\' opt to overwrite\n";
					next;  }

						#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
						#  (2) If clu file(eg 2-1618_ss.clu ) is in pwd, tries to skip
						#____________________________________________________________
						if((-s $output_clu_file) > 10 and $over_write !~/o/){
							print "# $output_clu_file exists, skipping, use \"o\" option to overwrite\n";  next;
						}

						#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
						# (3) merge_sequence_in_mspa_file does not do much. Just filtering and producing
						#     sequences in ISPA_PBS_21-215 VPR_PBS_160-354 format of STRING from mspa format
						#     $big_mspa_file is an MSPA file
						#________________________________________________________________________________
                        @grouped_seq_names=@{&merge_sequence_in_mspa_file(\$big_mspa_file, "s=$score", $din_dom, $sat_file, $optimize,
																"T=$length_thresh", "E=$Evalue_thresh", "f=$factor", "$range", "$merge", $verbose, $over_write,
																 $short_region, $large_region, $average_region, $dynamic_factor )};
						#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
						#  (4) Clustering the sets of merged seqlets => CORE algorithm
						#_____________________________________________________________________
						unless(@grouped_seq_names == 1){  ##  if @grouped_seq_names has one string like 'FAM_8_7 FAM_8_4 FAM_8_3' skip
								F2: for($g=0; $g< @grouped_seq_names; $g++){
										@splited1=split(/ +/, $grouped_seq_names[$g]);
										for($h=0; $h< @grouped_seq_names; $h++){
												if($grouped_seq_names[$g] eq $grouped_seq_names[$h]){ next  }
												@splited2=split(/ +/, $grouped_seq_names[$h]);
												$link_or_not=${&check_linkage_of_2_similar_seqlet_sets(\@splited1, \@splited2, "f=$factor")};
												if($link_or_not){
														if($optimize){ ##---- This will also remove similar seqlets, not only identical ones
															 $grouped_seq_names[$g]=join(' ', sort @{&remove_similar_seqlets( [@splited1, @splited2],
																													 $short_region, $large_region, $average_region)} );
														}else{
															 $grouped_seq_names[$g]=join(' ', grep { ! $duplicate{$_}++ } (@splited1, @splited2) );
														}
														splice(@grouped_seq_names, $h, 1); $h--; $g--; %duplicate=(); next F2;
												}
										}
								}
						}
						#~~~~~~~~~~~~~~ I used to use a sub, but to save time above is inserted ~~~~~~~~~~~~~
						#@grouped_seq_names=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, "f=$factor", $optimize, $dynamic_factor,
						#			 $short_region, $large_region, $average_region)};
						@temp_show_sub=&show_subclusterings(\@grouped_seq_names, $big_mspa_file, $sat_file, $dindom, $indup,
																										"E=$Evalue_thresh", "p=$percent_fac", "f=$factor");
												$good_bad       = $temp_show_sub[0];
												$indup_c        = $temp_show_sub[1];
												$sum_seq_num   += $temp_show_sub[2];
						push(@sub_clustering_out_files, @{$temp_show_sub[3]});

						if($good_bad==1){          push(@good, $big_mspa_file);
						}else{         push(@bad, $big_mspa_file);       }

					}
					#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
					&write_good_bad_list_in_divide_clusters(\@good, \@bad);
					sub write_good_bad_list_in_divide_clusters{
							 my  (@good, $i, @bad); @good=@{$_[0]}; @bad=@{$_[1]};
                             open(GOODBAD, ">good_bad.list") || warn "\n Can not open good_bad.list \n\n";
							 print GOODBAD "GOOD: all link    : 000\n";
							 for($i=0; $i< @good; $i++){  print GOODBAD "$good[$i]\n";  }
							 print GOODBAD "BAD : Not all link: 000\n";
							 for($i=0; $i< @bad; $i++){   print GOODBAD "$bad[$i]\n";   }
							 close(GOODBAD);
					}
					#_______________________________________________________________

	 }
	 print "\n (E) \"@No_file_found_divclus_fail\" has been failed to be parsed by divclus\n"; sleep 3;
     return(\@sub_clustering_out_files); # contains (xxxx.clu, yyy.clu,, )
}





#_______________________________________________________________________________
# Title     : fasta_kt1_search
# Usage     : &fasta_kt1_search($query_database, $target_database, $fasta_version_to_use
# Function  : to search one database against the other using fasta
#                ktup=1 (default is simply "fasta"). The results are stored in sub dirs
#                which are from the 2 first chars of the query sequence.
# Example   : &fasta_kt1_search ($qdb_main, $tdb_main, $fastaver_main);
# Keywords  : fasta_search, fasta_database_search
# Options   :
# Author    : Sarah A. Teichmann
# Date      : 19th September 1997
# Version   : 1.1
#-------------------------------------------------------------------------------
sub fasta_kt1_search{
		my ($qdb, $tdb, @qdbcont, $fastaver, $gene, $seq,
				@genes, %genes, $genes, $out, $tmp, $sw_score,
				$e_val, @tmpcontent, $i, $dir, @dir);
		$qdb=$_[0];
		open (QDB, "$qdb");
		@qdbcont=<QDB>;
		close QDB;
		$tdb=$_[1];
		open(TDB, "$tdb");
		close TDB;
		if ($_[2]){ $fastaver=$_[2]; }
		else{$fastaver="fasta";}
		for ($i=0; $i<@qdbcont; $i++) {

	my $qdbcont=$qdbcont[$i];
	if ($qdbcont=~/^\>(\S+)/) {
	   $gene=$1;
	   push (@genes, $gene);
	}
	if ($qdbcont=~/^(\w+)/) {
	   $seq=$1;
	   $genes{"$gene"}.="$seq";
			 }
	else {next;}
		}
		for ($i=0; $i<@genes; $i++) {
				$genes=$genes[$i];
				@dir=split(//, $genes);
				@dir=splice(@dir, 0, 2);
				$dir=join('', @dir);
				mkdir ($dir,  0777) unless -d $dir;  ## Jong changed
				$out="$dir"."/"."$genes".".sso";
				if (-s $out){next;}  ## -s is better than -e
				$tmp=&tempname;
				open (TMP, ">$tmp");
				print TMP ">$genes\n", $genes{"$genes"}, "\n";
				close TMP;
				$sw_score=0;
				$e_val=10;
				@tmpcontent=`$fastaver -E 0.1 -H -m 10 $tmp $tdb 1`;
				open (OUT, ">$out");
				print OUT "@tmpcontent\n";
				close OUT;
				unlink ("$tmp");
				next;
		}
}

#___________________________________________________________________________________________
# Title     : sw_search
# Usage     : &sw_search($query_database, $target_database, $ssearch_version_to_use
# Function  : to search one database against the other using ssearch (default is simply "ssearch")
# Example   :
# Keywords  : ssearch, database_ssearch
# Options   :
# Author    : Sarah A. Teichmann
# Date      : 19th September 1997
# Version   : 1.0
#--------------------------------------------------------------------------------------------
sub sw_search{
my ($qdb, $tdb, @qdbcont, $fastaver, $gene, $seq, @genes, %genes, $genes,
		$out, $tmp, $sw_score, $e_val, @tmpcontent, $i, $dir, @dir);
		$qdb=$_[0];
		open (QDB, "$qdb");
		@qdbcont=<QDB>;
		close QDB;
		$tdb=$_[1];
		open(TDB, "$tdb");
		close TDB;
		if ($_[2]){$fastaver=$_[2];}
		else{$fastaver="ssearch";}

	 for ($i=0; $i<@qdbcont; $i++) {

	my $qdbcont=$qdbcont[$i];
	if ($qdbcont=~/^\>(\S+)/) {
	   $gene=$1;
	   push (@genes, $gene);
	}
	if ($qdbcont=~/^(\w+)/) {
	   $seq=$1;
	   $genes{"$gene"}.="$seq";
			 }
	else {next;}
	 }

	 for ($i=0; $i<@genes; $i++) {
				$genes=$genes[$i];
				@dir=split(//, $genes);
				@dir=splice(@dir,0,2);
				$dir=join('',@dir);
				mkdir ("$dir", 0777) unless -d $dir;
				$out="$dir"."/"."$genes".".sso";
				if (-e $out){next;}
				$tmp=&tempname;
				open (TMP, ">$tmp");
				print TMP ">$genes\n", $genes{"$genes"}, "\n";
				close TMP;
				$sw_score=0;
				$e_val=10;
				@tmpcontent=`$fastaver -E 0.1 -H -m 10 $tmp $tdb`;
				open (OUT, ">$out");
				print OUT "@tmpcontent\n";
				close OUT;
				unlink ("$tmp");
				next;
		}

}


#________________________________________________________________________________
# Title     : get_all_mspa_files
# Usage     : @mspa_files=@{&get_all_mspa_files};
# Function  : puts the names of all msp or msp.gz files in the directory and its subdirectories into an array
# Example   :
# Keywords  : read_mspa_files, make_msp_file_array, get_all_mspa_files_in_dir
# Date      : 19th September 1997
# Options   :
# Author    : Sarah A. Teichmann, jong@salt2.med.harvard.edu
# Version   : 1.6
#--------------------------------------------------------------------------------
sub get_all_mspa_files {
	 my (@MSPA_files_main_final);
	 my @MSPA_files_main=@{&read_file_names_only('.msp', '.msp.gz', '.mspa','.mspa.gz')};
	 for(@MSPA_files_main){
			if($_=~/cluster\.msp[a]?/){ next ## xxxx_cluster.mspa is the processed file
			}else{ push(@MSPA_files_main_final, $_) }
	 }
	 my @dirs=@{&read_dir_names_only('n', '.')};
	 my ($i, $j, @mspa_files, $file_in_dir, $dir_file);
	 for ($i=0; $i<@dirs; $i++){
			 my $dir=$dirs[$i];
			 unless( -d $dir){
					next;
			 }
			 if( -d $dir){  chdir($dir);  }

             @mspa_files=@{&read_file_names_only('.','.mspa', '.msp', '.mspa.gz', '.msp.gz')};
			 for ($j=0; $j<@mspa_files; $j++){
					if($mspa_files[$j]=~/cluster\.msp/){ next } ## skipping already existing  20-1_cluster.mspa like files
					$file_in_dir=$mspa_files[$j];
					$dir_file="$dir"."/"."$file_in_dir";
					push(@mspa_files_dirs, $dir_file);
					next;
			 }
			 chdir('..');
			 next;
	 }

	 @mspa_files=sort (@MSPA_files_main_final, @mspa_files_dirs);
     return (\@mspa_files);
}



#______________________________________________________________
# Title     : get_largest_file
# Usage     : $largest_file_name=${&get_largest_file(@ARGV)};
# Function  : checks the size of files and returns the largest
#             one's name. If a file is not present in pwd or
#             specified absolute path, it ignores it.
# Example   :
# Keywords  : choose_largest_file, largest_file, find_largest_file
#             get_the_largest_file, choose_the_largest_file, get_biggest_file
#             fetch_largest_file, take_largest_file, get_bigger_file, get_larger_file
# Options   : _  for debugging.
#             #  for debugging.
#             e  for extract the largest from the input array
#                       leaving it one element less, in this case
#                       there will be two returning refs.
# Category  :
# Version   : 1.4
#--------------------------------------------------------------
sub get_largest_file{
		my @in;
		if(ref $_[0] eq 'ARRAY'){
	 @in = @{$_[0]};
		}else{
	 @in = @_;
		}

		my ($largest_file, $largest, $i, $extract_opt);
		for($i=0; $i< @in; $i++){
	if(($in[$i]=~/^\-?e$/i)&&(!(-f $in[$i])) ){
	   $extract_opt=1;
	   splice(@in, $i, 1);
	   $i--;
	}
		}
		for($i=0; $i< @in; $i++){
	my $size=(-s $in[$i]);
	if($size > $largest){
						 $largest=$size;
						 if($extract_opt ==1){
										print "\$extract_opt is $extract_opt \n";
										push(@in, $largest_file) if defined($largest_file);
										$largest_file = splice(@in, $i, 1);
										print "\n $largest_file \n";
										$i--;
						 }else{
										$largest_file=$in[$i];
						 }
				}
		}
		if($extract_opt==1){
	 return(\$largest_file, \@in);
		}else{ return(\$largest_file); }
}


#______________________________________________________________
# Title     : get_smallest_file
# Usage     : $smallest_file_name=${&get_largest_file(@ARGV)};
# Function  : checks the size of files and returns the smallest
#             one's name. If a file is not present in pwd or
#             specified absolute path, it ignores it.
# Example   :
# Keywords  : choose_smallest_file, smallest_file, find_smallest_file
#             get_the_smallest_file, choose_the_smallest_file,
#             fetch_smallest_file, take_smallest_file, get_smaller_file,
# Options   : _  for debugging.
#             #  for debugging.
#             e  for extract the smallest from the input array
#                       leaving it one element less, in this case
#                       there will be two returning refs.
# Category  :
# Version   : 1.3
#--------------------------------------------------------------
sub get_smallest_file{
		my @in;
		if(ref $_[0] eq 'ARRAY'){
	 @in = @{$_[0]};
		}else{
	 @in = @_;
		}
		my $smallest=10000000000;
		my ($smallest_file, $i, $extract_opt);

		for($i=0; $i< @in; $i++){
	if(($in[$i]=~/^\-?e$/i)&&(!(-f $in[$i])) ){
	   $extract_opt=1;
	   splice(@in, $i, 1);
	   $i--;
	}
		}
		for($i=0; $i< @in; $i++){
	my $size=(-s $in[$i]);
	if($size < $smallest){
	   $smallest=$size;
	   if($extract_opt ==1){
		  print "\$extract_opt is $extract_opt \n";
		  push(@in, $smallest_file) if defined($smallest_file);
		  $smallest_file = splice(@in, $i, 1);
		  print "\n $smallest_file \n";
		  $i--;
	   }else{
		  $smallest_file=$in[$i];
	   }
	}
		}
		if($extract_opt==1){
				 return(\$smallest_file, \@in);
		}else{ return(\$smallest_file); }
}
#______________________________________________________________________________
# Title     : find_program_in_path
# Usage     :
# Function  :
# Example   :
# Keywords  : which, whence
# Options   :
# Author    : Jong, jong@salt2.med.harvard.edu
# Category  :
# Version   : 1.3
#------------------------------------------------------------------------------
sub find_program_in_path{
		my(@found_prog_and_path, $file, $verbose);

		if(@_==0){
			 #print "\n# (ERROR) $0 requires at least one program in path to detect\n\n";
			 print chr(7);
			 die;
		}
		foreach $file (@_) {
			 if($file=~/^v/i and !-s $file){
					$verbose='v'; next;
			 }
			 for $dir (split(/:/,$ENV{'PATH'})) {
					if(-f ($path="$dir/$file") && -f _){
						 $size= -s "$path";
						 $path=~s/\/\//\//g;  ## removing /bin//gzip like entry
						 push(@found_prog_and_path, sprintf("%-30s %12s byte", $path, $size) );
					}
			 }
		}
		@found_prog_and_path=@{&remove_dup_in_array(\@found_prog_and_path)};

		return(\@found_prog_and_path);
}

#____________________________________________________________________________________
# Title     : do_sequence_search
# Usage     : &do_sequence_search("Query_seqs=\%pdb_seq", "DB=$sequence_DB",
#  		         "File=$ARGV[0]", $single_msp, $over_write,
# 	        	 "u=$upper_expect_limit", "l=$lower_expect_limit",
#       		 "k=$k_tuple", $No_processing );
# Function  : do FASTA, SSEARCH or BLASTPGP(psi-blast) search
# Example   : &do_sequence_search(\%pdb_seq, $owl_db_fasta, $ARGV[0], $single_msp, $over_write,
#                    "u=$upper_expect_limit", "l=$lower_expect_limit", "k=$k_tuple" );
#
# Keywords  : sequence_search
# Options   :
#             Query_seqs=  for enquiry sequences eg)  "Query_seqs=$ref_of_hash"
#             DB=   for target DB  "DB=$DB_used"
#             File= to get file base(root) name.  "File=$file[0]"
#             m  for MSP format directly from FASTA or Ssearch result than through sso_to_msp to save mem
#             s  for the big single output (msp file output I mean)
#             s= for the single big msp file name
#             o  for overwrite existing xxxx.fa files for search
#             c  for create SSO file (sequence search out file)
#             d  for very simple run and saving the result in xxxx.gz format in sub dir starting with one char
#             r  for reverse the query sequence
#             R  for attaching ranges of sequences
#             k= for k-tuple value. default is 1 (ori. FASTA prog. default is 2)
#             u= for $upper_expect_limit
#             l= for $lower_expect_limit
#             E= for $Evalue_thresh
#             a= for choosing either fasta or ssearch algorithm
#             d= for defining the size of subdir made. 2 means it creates
#                    eg, DE while 1 makes D
#             d  for $make_gz_in_sub_dir_opt, putting resultant sso files in gz format and in single char subdir
#             D  for $make_msp_in_sub_dir_opt, convert sso to msp and put in sub dir like /D/, /S/
#             n  for new format to create new msp file format with sso_to_msp routine
#          PVM=  for PVM run of FASTA (FASTA only)
#             M  for machine readable format -m 10 option
#             M= for machine readable format -m 10 option
#             N  for 'NO' do not do any processing but, do the searches only.
#       FILE_AGE for defining the age of file in days to be overwritten.
#             L  for Lean output(removes xxxx.fa query seq file)
#   $take_last_iter_PSI_BLA=l by l
#   $PSI_BLA_step_eval=       by e=
#
# Returns   : the names of files created (xxxxx.mspa, yyy.mspa,,)
# Version   : 5.9
#----------------------------------------------------------------------------------------
sub do_sequence_search{
    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my (@final_out, $add_range, $single_big_msp, $base_name, $create_sso, @nondup,
         $Single_msp_out_file, %duplicate, $Evalue_thresh, $Score_thresh, @SSO, $sequence_DB,
         @sso, @temp, $algorithm, $margin, $out_msp_file, @MSP, @final_msp_file_names_out,
         $upper_expect_limit, $lower_expect_limit, $k_tuple, %seq_input, %MSP, $No_processing,
         $new_format, $PVM_FASTA_run, $over_write, $sub_dir_size, $age_in_days_of_out_file,
         $over_write_by_age, $Lean_output, $gzipped_msp_file, $gzipped_sso_file,
         $defined_all_ok, $make_msp_in_sub_dir_opt, $upper_expect_limit, $Evalue_thresh,
         $take_last_iter_PSI_BLA );

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # DEFAULTS
    #________________________________________
    $k_tuple           =1;  # 1 or 2, 1 is more sensitive
    $algorithm         ='fasta';
    $sub_dir_size      =2;  # the default char number taken from seq name to make sub dirs
    $upper_expect_limit=5;
    $lower_expect_limit=0;
    $Score_thresh      =75; # FASTA or SSSEARCH score
    $margin            =0;  # sequence region margin. If it is 2, 2 more edged residues will be added
    $add_range         ='';
    $pwd               =`pwd`; chomp($pwd);
    $age_in_days_of_out_file=1000; ## If the files being tested is older than this, let's make anew

    if($vars{'a'}=~/\S+/){ $algorithm           = $vars{'a'}            };
    if($vars{'u'}=~/\d+/){ $upper_expect_limit  = $vars{'u'}            };
    if($vars{'l'}=~/\d+/){ $lower_expect_limit  = $vars{'l'}            };
    if($vars{'k'}=~/\d+/){ $k_tuple             = $vars{'k'}            };
    if($vars{'t'}=~/\d+/){ $Score_thresh        = $vars{'t'}            };
    if($vars{'m'}=~/\d+/){ $margin              = $vars{'m'}            };
    if($vars{'d'}=~/\d+/){ $sub_dir_size        = $vars{'d'}            };
    if($vars{'r'}=~/\S+/){ $add_range           = 'r'                   };
    if($vars{'s'}=~/\S+/){ $single_big_msp      = 's'                   };
    if($vars{'j'}=~/\S+/){ $PSI_BLA_iter_limit= $vars{'j'}            };
    if($vars{'e'}=~/\S+/){ $PSI_BLA_step_eval   = $vars{'e'}            };
    if($vars{'DB'}=~/\S+/){            $sequence_DB=$vars{'DB'} ;
        if(-s $sequence_DB){
        }elsif(-s "../$sequence_DB"){  $sequence_DB= "../$sequence_DB"
        }elsif(-s "../../$sequence_DB"){  $sequence_DB= "../../$sequence_DB";
        }else{
                print "\n# (E) do_sequence_search: You set DB param, but I can\'t find $sequence_DB\n";
                die;
        }
    }else{  print "\n# (E) do_sequence_search: I need DB param defined, sorry, aborting\n"; }

    if($vars{'FILE'}=~/\S+/){ $query_File = $vars{'FILE'}; push(@file,$query_File) };
    if($vars{'File'}=~/\S+/){ $query_File = $vars{'File'}; push(@file,$query_File) };
    if($vars{'FILE_AGE'}=~/\S+/){ $age_in_days_of_out_file= $vars{'FILE_AGE'};  };
    if($vars{'Query_seqs'}=~/\S+/){ %seq_input = %{$vars{'Query_seqs'}}};
    if($vars{'Query'}=~/\S+/){      %seq_input = %{$vars{'Query'}}};
    if($vars{'u'}    =~/\S+/){ $upper_expect_limit          = $vars{'u'}            };
    if($vars{'PVM'}  =~/\S+/){ $PVM_FASTA_run  = $vars{'PVM'}; print "\n# PVM opt is set\n";     };
    if($vars{'M'}  =~/\S+/){ $machine_readable = $vars{'M'};           };
    if($vars{'E'}  =~/\S+/){ $Evalue_thresh = $vars{'E'};           };

    if($char_opt=~/l/){    $take_last_iter_PSI_BLA= 'l' }
    if($char_opt=~/r/){    $add_range             = 'r' }
    if($char_opt=~/L/){    $Lean_output           = 'L' }
    if($char_opt=~/o/){    $over_write            = 'o' }
    if($char_opt=~/c/){    $create_sso            = 'c' }
    if($char_opt=~/s/){    $single_big_msp        = 's'; print "\n# Single file opt is set\n"; }
    if($char_opt=~/m/){    $msp_directly_opt      = 'm' }
    if($char_opt=~/M/){    $machine_readable      = 'M' }
    if($char_opt=~/d/){    $save_in_gz_in_sub_dir = 'd' }
    if($char_opt=~/D/){$make_msp_in_sub_dir_opt   = 'D' } # for simple search and storing msp file
    if($char_opt=~/N/){    $No_processing         = 'N'; $create_sso='c'; }
    if($char_opt=~/q/){    $make_MSP_files        = 'q' }

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
    # When no %seq is given, but files
    #___________________________________________
    if(@hash==0 and @file > 0){
         print "\n# (i) do_sequence_search: You did not put sequences as in \%seq, but raw sequence file @file!\n";
         print "        I will run \'open_fasta_files\' sub to fetch sequences to store in \%seq_input\n";
         %seq_input=%{&open_fasta_files(\@file)};
         unless(%seq_input > 1){
            print "\n !!!! \%seq_input hash is too small, Error opening \@file : @file, in do_sequence_search sub \n\n";
            die;
         }
    }else{
         #print "\n# (i) do_sequence_search: I will use given seqs in \%seq_input from \%\{\$hash\[0\]\}\n";
         %seq_input=%{$hash[0]};
    }
    my (@seq_names) = keys %seq_input;

    $base_name = ${&get_base_names($query_File)};
    print "\n# (i) line:",__LINE__, ", do_sequence_search, \$algorithm => $algorithm, \$base_name:$base_name
                         $query_File <--> $sequence_DB\n";

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~```
    # If one of the files is missing, ask if it is for self self search and
    #  make query=target, else die
    #_______________________________________________________________________
    $defined_all_ok=&check_if_defined($query_File, $sequence_DB);
    unless($defined_all_ok){
         print "\n  Did you want to do self self search? ->(y/n) ";
         $answer_for_self_self=getc;
         if($answer_for_self_self =~/y/i){
              if($query_File){  $sequence_DB=$query_File }
              else{ $query_File=$sequence_DB }
         }else{
              print "\n# You seemed made a mistake, O.K., I will kill myself!\n\n";
              print chr(7);  die;
         }
    }

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # [0] If blast is chosen run Blast for the query FASTA file (for all the seqs)
    #_______________________________________________________________________________
    if($algorithm=~/[psi\-]*[pb][last]*/i){
         print "\n# (i) Doing PSI search with @file with j=$PSI_BLA_iter_limit\n";
         @final_out=@{&do_psi_blast_search(\@file, "d=$source_DB_file",
                                            "i=$input_seq_file",
                                            $over_write,
                                            $make_msp_in_sub_dir_opt,
                                            $Lean_output,
                                            $make_MSP_files,
                                            $take_last_iter_PSI_BLA,
                                            "j=$PSI_BLA_iter_limit",
                                            "e=$PSI_BLA_step_eval"
                                            )};
         return(\@final_out); #  @final_out has a bunch of MSPA files
    }

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # [1] Controlling which kind of search it should do. Do save_in_gz_in_sub_dir first if d is set
    #______________________________________________________________________________________________
    if( $make_msp_in_sub_dir_opt ){  ## convert sso to msp and put in sub dir like /D/, /S/
        print "[1] \$make_msp_in_sub_dir_opt";
        for($x=0; $x < @seq_names; $x++){
             my ($over_write_sso_by_age, $over_write_msp_by_age,  %single_seq, $out_file_sso_gz_name, $out_file_msp_name, $out_file_gz_name, $existing_sso);
             my ($seq_name, $seq)= ($seq_names[$x], $seq_input{$seq_names[$x]});
             my $first_char= substr("\U$seq_name", 0, $sub_dir_size);
             mkdir ("$first_char", 0777) unless -d $first_char;
             chdir("$first_char");
             #print "\n# (i) do_sequence_search: You set \'d\' or \'D\' opt\n";
             #print "# (i) making subDIRs ($first_char) with $seq_name $sequence_DB to store MSP files\n";

             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
             # Let's make each fasta file for each seq to be used in searching
             #_____________________________________________________________________
             my $temp_file_name="$seq_name.spfa";
             %single_seq=($seq_name, $seq_input{$seq_name});
             &write_fasta(\%single_seq, $temp_file_name ); ## e for writing each file

             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
             # Making output file name according to the option given
             #_______________________________________________________________________
             if($machine_readable and $algorithm=~/[fastassearch]+/){  $out_file_sso_name="$seq_name\.msso";
             }else{ $out_file_sso_name="$seq_name\.sso";      }
             $out_file_sso_gz_name    ="$out_file_sso_name\.gz";
             $out_file_msp_name       ="$seq_name\.mspa";
             $out_file_gz_name        ="$seq_name\.mspa\.gz";

             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
             # Check if SSO file already there
             #_______________________________________________________________________
             if(-s $out_sso_file){ $existing_sso=$out_file_sso_name }
             elsif(-s $out_sso_gz_name){ $existing_sso=$out_file_sso_gz_name }
             if(-s $out_msp_name){ $existing_msp=$out_file_msp_name }
             elsif(-s $out_gz_name){ $existing_msp=$out_file_gz_name }

             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
             # If the dates of files created are long ago, overwrite to refresh
             #____________________________________________________________________
             if(  (localtime(time- (stat($existing_sso))[9]))[3] > $age_in_days_of_out_file ){  $over_write_sso_by_age='o';  }
             if(  (localtime(time- (stat($existing_msp))[9]))[3] > $age_in_days_of_out_file ){  $over_write_msp_by_age='o';  }

             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
             #  To check if the target seq DB is in ../
             #________________________________________________
             if(-s $sequence_DB){     #print "\n# (i) Good, target \$sequence_DB $sequence_DB is in this working dir\n";
             }elsif( -s "../$sequence_DB"){ $sequence_DB="../$sequence_DB"; }

             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
             # [1.1] Searching: Making MSP files directly,  MSP file format is the major format used in prescop!, Default
             #_____________________________________________________________________________________________________________
             if($char_opt =~/D/){ #### To make MSP file
                 print "\n [1.1] \$char_opt = D";
                 if( !(-s $out_file_gz_name or -s $out_file_msp_name) or $over_write or $over_write_msp_by_age){
                      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                      # (2.1) Running  run_fasta_sequence_search !!
                      #_______________________________________________________
                      print "\n  [1.1.1] Running  run_fasta_sequence_search, \$create_sso = $create_sso!!\n";
                      $gzipped_msp_file=${&run_fasta_sequence_search(
                                                  "a=$algorithm",
                                                  "O=$out_file_msp_name",
                                                  "File=$temp_file_name", "u=$upper_expect_limit", "E=$Evalue_thresh",
                                                  "DB=$sequence_DB", "k=$k_tuple", "$machine_readable", $create_sso,
                                                  $verbose)};

                      $gzipped_sso_file=${&compress_files_by_gzip($out_file_sso_name)};
                   }else{
                      print "\n#  [1.1.2] Line No. ", __LINE__,", $out_file_gz_name already exists or
                                     \$over_write is set or NOT older than $age_in_days_of_out_file\n";
                   }
             }
             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
             # [1.2] To make gzipped SSO files and MSP files
             #_______________________________________________
             elsif($create_sso or $char_opt=~/m/){ ### To make gzipped SSO files
                  print "\n [1.2] \$char_opt = D";
                      if( !(-s $out_file_sso_name or -s $out_file_sso_gz_name ) or $over_write or $over_write_sso_by_age){
                          print "\n#  [1.2.1] Running  run_fasta_sequence_search with \"\$create_sso option\"!!\n\n";
                          $gzipped_msp_file=${&run_fasta_sequence_search(
                                               "a=$algorithm",
                                               "O=$out_file_msp_name", "$create_sso",
                                               "File=$temp_file_name", "u=$upper_expect_limit", "E=$Evalue_thresh",
                                               "DB=$sequence_DB", "k=$k_tuple", "$machine_readable", $create_sso,
                                               $verbose)};

                          $gzipped_sso_file=${&compress_files_by_gzip($out_file_sso_name)};
                      }else{
                          print "\n#  [1.2.2]Line No. ", __LINE__,", $out_file_gz_name already exists or
                                      \$over_write is set or NOT older than $age_in_days_of_out_file\n";
                      }
             }else{
                  if( !(-s $out_file_sso_name or -s $out_file_sso_gz_name ) or $over_write or $over_write_sso_by_age){
                          system(" $algorithm -m 10 -H  -E $upper_expect_limit $temp_file_name $sequence_DB $k_tuple > $out_file_sso_name");
                          system("gzip $out_file_sso_name");
                  }else{
                          print "\n#  Line No. ", __LINE__,", $out_file_gz_name already exists or
                                      \$over_write is set or NOT older than $age_in_days_of_out_file\n";
                  }
             }
             if(-s "$seq_name.spfa"){  unlink("$seq_name.spfa")
             }elsif(-s "$first_char/$seq_name.spfa"){ unlink("$first_char/$seq_name\.spfa") ; }
             #print "\n# Sub dir $first_char and $seq_name\.mspa has been made, finishing do_sequence_search\n";
             chdir ('..');
          }
          #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          # F I N I S H
          #________________________________________
          goto EXIT;
	 } # if ($char_opt =~/[dD]/){  is finished


	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 # (2) Writing on PWD. This is the big single MSP output, such as 6-16_cluster.mspa
	 #__________________________________________________________________________________
	 $Single_msp_out_file="$base_name\.mspa" if($single_big_msp eq 's');
	 if(-s $Single_msp_out_file and !$over_write ){
			 print "\n# (i) $Single_msp_out_file exists, and no \$over_write is set, skipping \n";
			 push(@final_out, $Single_msp_out_file);
	 }else{  $over_write  ='o';  }

	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 # Check if it is necessary to write each sequences.fa files
	 #______________________________________________________
     if( $over_write ){  &write_fasta_seq_by_seq(\%seq_input, 'e'); } ## e for writing each seq file

	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
	 #  When, you didn't use "DB=$XXX" and "File=$FXXX" format, first file input is DB etc
	 #_______________________________________________________________________________________
	 $defined_all_ok=&check_if_defined($query_File, $sequence_DB);
	 if(!$defined_all_ok){ print "\n# (E) FATAL: do_sequence_search: You did not use \"DB=\$XXX\" format\n"; die   };

	 print "\n# Finished writing the enquiry fasta files from \%seq_input by write_fasta";
	 print "\n# I am in do_sequence_search sub, Target database used :  $sequence_DB with seqs of \'@seq_names\'\n";


	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 #  Main search with given @seq_names
	 #______________________________________________________________
	 for($j=0; $j< @seq_names; $j++){  # @seq_names has sequence names coming from  (@seq_names) = keys %seq_input;
			 my ($over_write_sso_by_age, @temp, $existing_sso, $out_gz_name,
					 $over_write_msp_by_age, $existing_msp, $out_msp_file, $seq_name);
			 $seq_name=$seq_names[$j];
			 $each_seq_fasta="$seq_name\.spfa";
			 $out_msp_file="$seq_name\.mspa";
			 $out_gz_name="$seq_name\.mspa\.gz";
			 $out_msso_file="$seq_name\.msso";

			 &die_if_file_not_present($each_seq_fasta);

			 print "\n# (i) :-) Found $each_seq_fasta is searched against $sequence_DB\n";
			 if($algorithm=~/fasta/){       $out_sso_file="$seq_name\.fsso";
			 }elsif($algorithm=~/ssearch/){ $out_sso_file="$seq_name\.ssso"; }
			 $out_sso_gz_name="$out_sso_name\.gz";

			 if(-s $out_sso_file){ $existing_sso=$out_sso_file }
			 elsif(-s $out_sso_gz_name){ $existing_sso=$out_sso_gz_name }
			 if(-s $out_msp_file){ $existing_msp=$out_msp_file }
			 elsif(-s $out_gz_name){ $existing_msp=$out_gz_name }
			 if(  (localtime(time- (stat($existing_sso))[9]))[3] > $age_in_days_of_out_file ){
						$over_write_sso_by_age='o';
			 }
			 if(  (localtime(time- (stat($existing_msp))[9]))[3] > $age_in_days_of_out_file ){
						$over_write_msp_by_age='o';
			 }

			 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			 #  To check if the target seq DB is in ../
			 #________________________________________________
			 if(-s $sequence_DB){ print "\n# (i) \$sequence_DB $sequence_DB exists, Good\n";
			 }elsif( -s "../$sequence_DB"){ $sequence_DB="../$sequence_DB";
			 }elsif( -s "../../$sequence_DB"){ $sequence_DB="../../$sequence_DB"; }

			 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			 # If MSP file already exist
			 #_____________________________________________________________
			 if( -s $out_msp_file and !$over_write_msp_by_age and !$over_write ){
						print "\n# (i) File: $out_msp_file exists, skipping, to overwrite use \'o\' opt or set days";
						push(@final_out, $out_msp_file);
			 }else{  ## -E is for e value cutoff. -b is for num of seq fetched
					 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~~~~~~~~~~~~~~
					 #  K-tuple is 1 by default. If xxxx.sso exsts, skip running fasta or ssearch
					 #________________________________________________________________________________
					 if(-s $out_sso_file and !$over_write ){ ## If SSO is already present, JUST READ IT!
								print "\n# (i) Just opening existing $out_sso_file $out_sso_file $out_msp_file $over_write_msp_by_age $over_write\n";
                                open(SSO_ALREADY, "$out_sso_file") || warn "\nWarnning. Can not open $out_sso_file\n";
								@temp=<SSO_ALREADY>;
								print "\n# (i) \@temp has ", scalar(@temp), " lines\n";
								close(SSO_ALREADY);
								&compress_files_by_gzip($out_sso_file);
					 }else{ ## Run FASTA HERE
							print "\n# (i) Running \"run_fasta_sequence_search\" ";
							$gzipped_msp_file=${&run_fasta_sequence_search( "a=$algorithm",
																 "O=$out_msp_file", "$create_sso",
																 "File=$each_seq_fasta", "E=$upper_expect_limit",
																 "DB=$sequence_DB", "k=$k_tuple", "$machine_readable")};
							push(@final_out, $gzipped_msp_file) if -s $gzipped_msp_file ;
							unlink($each_seq_fasta) if $Lean_output;
					 }
			 }
			 if($machine_readable and $create_sso and -s $out_sso_file){ &cp($out_sso_file, $out_msso_file); }
	 } # end of for($j=0; $j< @seq_names; $j++){
	 return(\@final_out);
	 EXIT:

} # do_sequence_search



#______________________________________________________________________________
# Title     : compress_files_by_gzip
# Usage     : &compress_files_by_gzip('file_name_to_be_compressed');
# Function  :
# Example   :
# Keywords  :
# Options   :
# Author    : jong@salt2.med.harvard.edu
# Category  :
# Version   : 1.3
#------------------------------------------------------------------------------
sub compress_files_by_gzip{
		my($file, $input_file_name, @final_output_file_names, $out_file_gz_name);
		for $file(@_){
			 if(ref($file) eq 'SCALAR'){
					 $file=${$file};
			 }
			 $out_file_gz_name="$file\.gz";
			 if(-s $out_file_gz_name){
					 unlink ($out_file_gz_name); # remove existing $out_file_gz_name
			 }
			 if(@{&find_program_in_path('gzip')} > 0){
					 system("gzip $file"); ## gzipping it
			 }else{
					 #print "\n# (ERROR) gzip is not found here. Please put gzip compressing program in exe path\n";
					 #print "\n# (INFO) No compression will be made by gzip \n\n";
			 }
			 push( @final_output_file_names, $out_file_gz_name) if -s $out_file_gz_name;
			 #if(@final_output_file_names < 1){ print "\n# (ERROR) No gz files were written, Error??\n"; }
		}
		if(@final_output_file_names> 1){ return(\@final_output_file_names);
		}else{  return(\$final_output_file_names[0]);   }
}




#________________________________________________________________________
# Title     : parse_arguments
# Usage     : &parse_arguments; or  (file1, file2)=@{&parse_arguments};
# Function  : Parse and assign any types of arguments on prompt in UNIX to
#             the various variables inside of the running program.
#             This is more visual than getopt and easier.
#             just change the option table_example below for your own variable
#             setttings. This program reads itself and parse the arguments
#             according to the setting you made in this subroutine or
#             option table in anywhere in the program.
#             It also imports the ENV variables to your program.
#
# Example   : &parse_arguments(1);
#             @files=@{&parse_arguments(1)};
# Warning   : HASH and ARRAY mustn't be like = (1, 2,3) or (1,2 ,3)
# Keywords  : pass_arguments
# Options   : '0'  to specify that there is no argument to sub, use
#              &parse_arguments(0);
#             parse_arguments itself does not have any specific option.
#             '#' at prompt will make a var  $debug set to 1. This is to
#              print out all the print lines to make debugging easier.
#
#             'e=xxxx' for filtering input files by extension xxxx
#
# Returns   : Filenames in a reference of array
#             and input files in an array (file1, file2)=@{&parse_arguments};
# Argument  : uses @ARGV
# Category  :
# Version   : 2.2
#--------------------------------------------------------------------
sub parse_arguments{
		my( $c, $d, $f, $arg_num, $option_table_seen, $n, $option_filtered,
				$option_table_example, $input_line, @input_files,
				$extension);

		&import_ENV_vars; # this enables  $PDB40D automatically assigns the shell var value of $ENV{'PDB40D'}

		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""
		#   Checks if there were arguments
		#_______________________________________________________
		if( @ARGV < 1 ){ #<-- If Argument is not given at prompt
				for(@_){  ## this means, parsearguments do not need any argus. So no need to parse anything
						if($_ eq '0'){  last;
						}elsif($_ =~/(\d+)/){ print "\n# (Error) $0 requires $1 number of prompt arguments\n";
								 print "  To get help type \"$0  h\"\n\n\n ";  print chr(7); die;
						}else{
								 print "\n (i) parse_arguments: $_ was given as argument input\n";
								 print chr(7); #<-- This is beeping
								 print "  To get help type \"$0  h\"\n\n\n ";
						}
				}
		}else{  print "\n# (i) parse_arguments:  The program arguments were @ARGV\n"; }

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		#  If there is only one prompt arg. and is [-]*[hH][elp]*, it calls
		#   &default_help and exits
		#___________________________________________________________________
		if( @ARGV == 1  and  $ARGV[0] =~ /^[\-]*[hH\?][elp ]*$/ ){ print "\n# (i) You put \"@ARGV\" as OPT. Showing help:\n";
					&show_default_help;
					print "\n# (i) Program Normally finished \n\n";
					die;
		}
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		#   Some DEFAULT $debug variables for debugging purposes
		#__________________________________________________________
		&set_debug_option;


		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		#  Checking some input options like 'e=txt' for extension filtering
		#_____________________________________________________________________
		for($i=0; $i< @_; $i++){
				if($_[$i]=~/e=(\S+)/){
										push(@extension, $1);
						}elsif($_[$i]=~/^[\-]?r$/){ ## reversing file order in return stage
						$reverse_out_file_name_order='r';
				}elsif($_[$i]=~/^[\-]*[hH\?][elp ]*$/){ print "\n# (i) You put $_[$i] as OPT. Showing help: ";
						&show_default_help; }
		}

		for($f=0; $f < @ARGV; $f++){
				 if( $ARGV[$f] =~ /\w+[\-\.\w]+$/ and -f $ARGV[$f] ){
							#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
							# When extension is defined, filter files by it
							#____________________________________________
							if(@extension > 0){
									for($e=0; $e < @extension; $e++){
															$extension=$extension[$e];
															if($ARGV[$f]=~/\S\.$extension/){
																			push(@input_files, $ARGV[$f] );
															}else{ next }
											}
							}else{
											push(@input_files, $ARGV[$f] );
											next;
							}
				 }
		}

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		#     Reading the running program script
		#_______________________________________________________
		&assign_options_to_variables;
		if($HELP == 1){ &default_help }
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# reverse order if 'r' opt is set
		#_____________________________________________
		if($reverse_out_file_name_order){
				@input_files=reverse(@input_files);
		}
		return(\@input_files);
}


#________________________________________________________________________
# Title     : assign_options_to_variables
# Usage     : &assign_options_to_variables(\$input_line);
# Function  : Assigns the values set in head box to the variables used in
#             the programs according to the values given at prompt.
#             This produces global values.
#             When numbers are given at prompt, they go to @num_opt
#              global variable. %vars global option will be made
#
# Example   : When you want to set 'a' char to a variable called '$dummy' in
#             the program, you put a head box commented line
#             '#  $dummy    becomes  a  by  -a '
#             Then, the parse_arguments and this sub routine will read the head
#             box and assigns 'a' to $dummy IF you put an argument of '-a' in
#             the prompt.
# Warning   : This is a global vars generator!!!
# Keywords  :
# Options   : '#' at prompt will make a var  $debug set to 1. This is to
#              print out all the print lines to make debugging easier.
# Returns   : Some globaly used variables according to prompt options.
#             @num_opt,
#
# Argument  : None.
# Version   : 2.8
#--------------------------------------------------------------------
sub assign_options_to_variables{
		my($i, %vars, $j, $op, $z, $n, $symb, $value, $var, %val, @val, $ARG_REG,
			 $option_table_example, @input_options, $first_border_and_title, $sym, @arg);

		#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		#      Defining small variables for option table reading
		#________________________________________________________________________
		my($g)='gets';                my($if)='if';
		my($is)='is';                 my(@input_files);
		my($o)='or';   my(@arguments) = sort @ARGV;

		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		#  Assigning global arguments(@num_opt, %vars) variables
		#_______________________________________________________________
		for($i=0; $i< @arguments; $i++){
				 print "\n# \"$arguments[$i]\"";
				 if(($arguments[$i]=~/^(\-?\d+[\.\d+]?)$/)&&   ### it mustn't be a file
						 ( !(-f $arguments[$i]) ) ){                ### getting NUM opt
						 push(@num_opt, $1);
				 }elsif( $arguments[$i]=~/^(\S+)=(\S+)$/){
						 $vars{$1}=$2;
						 print "\n# (1) assign_options_to_variables: Your variable option for \"$1\" is \"$2\"\n";
				 }elsif( $arguments[$i]=~/^(\S+)=[\'\"]([ \S]+)[\'\"]$/){ # just in case Shell fails
						 $vars{$1}="$2";
				 }elsif( $arguments[$i]=~/^(\S+)=([ \S]+)$/){
						 $vars{$1}="$2";
				 }
		}


		#""""""""""""""""""""""""""""""""""""""""""""""""""
		#   Some DEFAULT $debug variables for debugging purposes
		#__________________________________________________________
		&set_debug_option;

		#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		#   The main processing of self
		#_______________________________________________________________
		open(SELF, "$0");    ## opens the program you ran to get the options table.
		while(<SELF>){

			 if( $first_border_and_title > 5 ){  ## This is to make it read only the first headbox.
						last;                            #  $first_border_and_title is an incremental counter.
			 }elsif( /^ *#[_\*\-]{15,}$/ and /^ *# *[Tt][itle]*[ :]*/ ){
						$first_border_and_title++;
						print __LINE__, "# assign_options_to_variables : Title line found\n" if $debug eq 1;
			 }elsif(/^ {0,5}#[\t ]{1,50}([\$\%\@].+)$/){
						$op = $1;  ## $op is for the whole input option line which has $xxxx, @xxx, %xxxx format
						$op =~ s/^(\W\w+.+)(\s+\#.*)$/$1/;  ## This is removing any comments in the line.
						#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
						## matching the following line input format.
						## $av_sc_segment     becomes    a  by  a  # To smooth the SC rates. Gets the averages of
						## $ARG_REG is for arguments regular expression variable.
						##  This reg. exp. matches = 'a or A or E or e' part
						##  which represents alternative prompt arguments possibilities. \=$b$g$is$e$set
						#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
						#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
						# These are optional argument flag like (-a as well as a)
						#__________________________________________________________
						$ARG_REG ='(\S*) *[or=\,]* *(\S*) *[=or\,]* *(\S*) *[=or\,]* *(\S*) *[=or\,]* *(\S*)';

						if(#$op=~/^([\$\@\%])([\w\-]+)[\t ]{0,25}[=|$g|$is]([\t ]+)[bB]y +$ARG_REG/
							 #or
							 $op=~/^([\$\@\%])([\w\-]+)[\t ]{0,25}[=|$g|$is][\t ]*[\$\@\%]?(\S*) *[bB]y +$ARG_REG/){
																						## $sym     $var        becomes          a [$a...]       by       a -a -A
								 my $sym = $1;  #### The symbols like ($, @, %), '$' in the above.
								 my $var = $2;  #### Actual variable name 'var' from $var, 'av_sc_segment' in the above.
								 my $val = $3;  #### The becoming value  first 'a' in the above.
								 my @arg = ($4, $5, $6, $7, $8);  ## The alternative prompt arguments, second 'a' in the above..
								 print "\n $sym $var $val \n" if $debug==1;
								 print "\n \@arg are @arg \n" if $debug==1;

								 #""""""""""""""""""""""""""""""""""""""""""""""""""""
								 #  Going through the PROMPT args.
								 #""""""""""""""""""""""""""""""""""""""""""""""""""""
								 for($z=0; $z < @arguments; $z++){     ## $arguments[$z]  is from @ARGV
											print "\n# Prompt:  \"$arguments[$z]\"\n" if $debug;
											if($arguments[$z]=~/^\-\w+$/){
													 $arguments[$z] =~ s/\-//;
											}
											for ($i=0; $i < @arg; $i ++ ){
													 if( ("$arg[$i]" eq "$arguments[$z]" )&& ($arg[$i] !~ /\=/)
															 && ($sym eq '$') ){
															 ${"$var"}="$val";
															 if($debug == 1){
																			 print __LINE__," \$${var} is set to \"$1\"\n";
															 }

													 }#'''''''''''''''' $arg = by s=  syntax ~~~~~~~~~~~~~~~~~~~~~~~~~~~
													 elsif( ( $arg[$i] =~ /^(\w+) *\=/ ) &&
															 ( $arguments[$z] =~ /^${1}= *([ \S]+)$/) &&  ## NOTE  '[ \S]+' not \S+
															 ( $sym eq '$') ){
																 ${"$var"}="$1";
																 print "\n# I am assigning $1 to $var\n";
																 if($debug eq 1){ print __LINE__,"\$${var} is set to \"$1\"\n";  }
													 }
											}
								 }
						 }
					}
		 }
		 close(SELF);
}



#________________________________________________________________________
# Title     : get_base_names
# Usage     : $base =${&get_base_names(\$file_name)};
#             :   or @bases = &get_base_names(\@files);  # <-- uses `pwd` for abs directory
# Function  : produces the file base name(eg, "evalign"  out of "evalign.pl" ).
#              when xxxx.xx.gz form file is given, it removes gz as well
#
# Example   : $base => 'test'  with 'test.txt' or '/home/dir/of/mine/text.txt'
# Warning   :
# Keywords  : get_base_name{, base_name, file_base_name ,  get_file_base_name
#             get_basename, basename, get_root_name, base , root, get_file_root
# Options   :
# Returns   :
# Argument  : handles both ref and non-ref.
# Version   : 1.5
#--------------------------------------------------------------------
sub get_base_names{
	my($x, $pos, $pos1, @out_file, $file_only, $file, @file, $base, @base);
	@file=@{$_[0]} if (ref($_[0]) eq 'ARRAY');
	@file=@_ if !(ref($_[0]) eq 'ARRAY');
	for($x=0; $x < @file; $x ++){
		if( ref($file[$x]) ){
			$file = ${$file[$x]};
			$pos1=rindex($file, "/");
	        $file_only=substr($file, ($pos1+1));
						if($file_only=~/(\S+\.\S+)\.gz$/){
							 $pos = rindex($1, ".");
			   $base= substr($1, 0, $pos);
			}elsif($file_only=~/^[^\.]+$/){ ## when file does not have '.' in its name
							 $base= $file_only;
						}else{
							 $pos = rindex($file_only, ".");
							 $base= substr($file_only, 0, $pos);
						}
		}else{
			$file = $file[$x];
			$pos1=rindex($file, "/");
	        $file_only=substr($file, ($pos1+1));
			if($file_only=~/(\S+\.\S+)\.gz$/){
							 $pos = rindex($1, ".");
			   $base= substr($1, 0, $pos);
						}elsif($file_only=~/^[^\.]+$/){ ## when file does not have '.' in its name
							 $base= $file_only;
			}else{
							 $pos = rindex($file_only, ".");
							 $base= substr($file_only, 0, $pos);
						}
		}
		push(@base, $base);
	}
	if(@base == 1 ){ \$base[0] }else{ \@base }
}

#________________________________________________________________________________
# Title     : write_mspa_files
# Usage     : &write_mspa_files(\%in1, \%in2, ['s'], [$filename, o=$output_MSP_file_name],)
# Function  : Writes input which is already in mspa file format to
#              files either the name is given or generated
#              If more than one ref of hash is given, this will
#              concatenate all the hashes to one big one to
#              make one file.
#             When NO output xxx.mspaa file name is given, it creates
#              with the query sequence name.
# Example   :  &write_mspa_files(@sso, 's', $out_file);
# Warning   : When NO output xxx.mspa file name is given, it creates
#              with the query sequence name.
# Keywords  : write_mspa, write_MSP, write_MSP_file, write_mspa_file
# Options   : _  for debugging.
#             #  for debugging.
#             s  for each single file output for each hash input
#  filename  for putting output to the specified filename, should be xxx.mspa
#  $output_MSP_file_name= by o=  # same as filename, but you cleary pass it.
#
# Returns   : if 's' option is set, it will make say,
#               HI001.mspa HI002.mspa HI003.mspa  rather than
#
#               HI001HI002HI003.mspa
#  eg of one output(single file case)
#
#   1027     0.0     1     154   HI0004     1     154   HI0004
#   40       0.0     84    132   HI0004     63    108   HI0001
#   31       0.0     79    84    HI0004     98    103   HI0003
#
# Category  :
# Version   : 3.3
#----------------------------------------------------------------------------------
sub write_mspa_files{
    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my ($output_MSP_file_name, $add_range, @final_out, $mspa_file_out,$output_MSP_file_name,
        @keys, $N, $temp_1, %hash, $query_seq_name, $single_out_opt);

    if($char_opt=~/r/){ $add_range      ='r' };
	if($char_opt=~/s/){ $single_out_opt ='s' };
    if(@file == 1){ $output_MSP_file_name=$file[0]; $single_out_opt='' } # s is for single file output
    if($vars{'o'}){ $output_MSP_file_name=$vars{'o'} }
    if(@hash < 1){ warn "\n WARNING: write_mspa_files, \@hash is empty. This can be fatal \n\n"; }

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # (0) When outfilename is not given
    #___________________________________________________________
    if($single_out_opt eq 's' and !$output_MSP_file_name){
		 $mspa_file_out='default_single_out.mspa';
		 for($i=0; $i< @hash; $i++){
			 my %hash=%{$hash[$i]};
			 my @keys =sort keys %hash;
			 #------------------ Writing the first line ---------------------------
			 for($j=0; $j< @keys; $j++){
                if($keys[$j]=~/(\S+)_\d+\-\d+/){ $N = $1 }else{ $N = $keys[$j] }
                if($hash{$keys[$j]}=~/\s+$N[\_\d+\-\d+]*\s+\d+\s+\d+\s+$N[\_\d+\-\d+]*/){
                    open(MSP_FILE, ">$mspa_file_out") ||
                            die "# write_mspa_files: I can not create $mspa_file_out, check permission\n";
                    chomp( $hash{$keys[$j]} ); ## precaution
                    print MSP_FILE "# (H) $0 write_mspa_files: $keys[$j]\n";
                    print MSP_FILE $hash{$keys[$j]}, "\n";
                    splice(@keys, $j, 1);
                    $j--; last;
                }
             }
             #------------- Writing the rest of the lines ____________________
             for($j=0; $j< @keys; $j++){
                 chomp( $hash{$keys[$j]} );
                 print MSP_FILE $hash{$keys[$j]}, "\n";
             }
             print MSP_FILE "\n";
          }
          if(-s $mspa_file_out){
                 print "\n# write_mspa_files: $mspa_file_out is written \n";
          }else{
                 print "\n# Error, write_mspa_files\n"; die
          }
          push(@final_out, $mspa_file_out);
          close(MSP_FILE);
          return(\@final_out);
    #~~~~~~~~~~~~~ DEfault ~~~~~~~~~~~~~~~~~~
    #  (1) When output file name was given!
    #________________________________________
    }elsif($output_MSP_file_name){
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # When FILE input is given (NOT hash )
        #___________________________________________________________
        if(@file==1){
            my($temp_1);
            open(MSP_FILE, ">$output_MSP_file_name") ||  die "# write_mspa_files: I can not create $output_MSP_file_name, check permission\n";
            print MSP_FILE "# (H) $0 write_mspa_files: @file\n";
            for($i=0; $i< @hash; $i++){
                 %hash=%{$hash[$i]};
                 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                 # Sorting %hash values by the second column(Evalue)
                 #_______________________________________________________
                 @keys= map {$_->[0]} sort { $a->[1] <=> $b->[1] } map { $hash{$_}=~/^\s*\S+\s+(\S+)\s+/ and [$_, $1] } keys %hash;

                 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                 # for Final output
                 #_____________________________
                 push(@final_out, $output_MSP_file_name);

                 #--------- Writing the first line only --------------
                 for($j=0; $j< @keys; $j++){
                     if($keys[$j]=~/(\S+)_\d+\-\d+$/){ $N = $1 }else{ $N = $keys[$j] }

                     #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                     # Following is to put the self match on top of the list
                     #________________________________________________________
                     if($hash{$keys[$j]}=~/\s+$N[\_\d+\-\d+]*\s+\d+\s+\d+\s+$N[\_\d+\-\d+]*/){
                          $temp_1=$keys[0]; $keys[0]=$keys[$j]; $keys[$j]=$temp_1;
                     }
                 }
                 for($j=0; $j< @keys; $j++){
                     chomp($hash{$keys[$j]});
                     print MSP_FILE $hash{$keys[$j]}, "\n";
                 }
                 print MSP_FILE "\n";
            }
            print MSP_FILE "\n";
            close(MSP_FILE);
            if(-s $output_MSP_file_name and $output_MSP_file_name !~/^\s*\.mspa$/){
                    print "\n# write_mspa_files: $output_MSP_file_name is written\n" if(-s $output_MSP_file_name);
            }else{
                    print "\n# write_mspa_files: ERROR. Either $output_MSP_file_name is empty or \".mspa\" is written\n";
            }
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # When HASH INPUT IS given
        #__________________________________________
        }else{
            for($i=0; $i< @hash; $i++){
                my %hash=%{$hash[$i]};
                my @keys =sort keys %hash;
                #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # If s option with $output_MSP_file_name is given
                #____________________________________________________
                if($single_out_opt eq 's' ){      $mspa_file_out=$output_MSP_file_name;
                }elsif($hash{$keys[0]}){
                    ($query_seq_name)=$hash{$keys[0]}=~/\S+\s+\d+\s+\d+\s+(\S+)\s+\d*\s+\d*\s+\S+\s*\d*$/;
                    $mspa_file_out="$query_seq_name\.mspa";
                }else{ die "\n :-<  \$query_seq_name is not defined. Error\n\n\n"; }

                open(MSP_FILE, ">$mspa_file_out") or die "\n# write_mspa_files: Failed to open $mspa_file_out\n";
                print MSP_FILE "# (H) $0 write_mspa_files: $query_seq_name\n";
                #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # for Final output
                #_____________________________
                push(@final_out, $mspa_file_out);
                #~~~~~~~ Writing the first line only ~~~~~~~~~~~~~~~~~~
                for($j=0; $j< @keys; $j++){
                    if($keys[$j]=~/^(\S+)_\d+\-\d+/){
                       $N = $1
                    }elsif($keys[$j]=~/^(\S+)\s+(\S+)/){
                       $N = $1;
                    }else{ $N = $keys[$j] }
                    if($hash{$keys[$j]}=~/\s+$N[\_\d+\-\d+]*\s+\d+\s+\d+\s+$N[\_\d+\-\d+]*/){
                       $keys[0]=$temp_1; $keys[0]=$keys[$j]; $keys[$j]=$temp_1;
                    }
                }
                for($j=0; $j< @keys; $j++){
                    chomp($hash{$keys[$j]});
                    print MSP_FILE $hash{$keys[$j]}, "\n";

                }
                print MSP_FILE "\n";
            }
            print MSP_FILE "\n";
            if(-s $mspa_file_out and $mspa_file_out=~/\S+\.mspa/){
                 print "\n :-) write_mspa_files: $mspa_file_out is written\n";
            }else{
                 print "\n\n :-(  write_mspa_files: Either $output_MSP_file_name empty or only \".mspa\" is written\n\n\n";
            }
            close MSP_FILE;
         }
    }
    if(@final_out ==1){ return( \$final_out[0] );
    }else{    return(\@final_out);    }
}





#________________________________________________________________________
# Title     : set_debug_option
# Usage     : &set_debug_option;
# Function  : If you put '#' or  '##' at the prompt of any program which uses
#             this sub you will get verbose printouts for the program if the program
#             has a lot of comments.
# Example   : set_debug_option #    <-- at prompt.
# Warning   :
# Keywords  :
# Options   : #   for 1st level of verbose printouts
#             ##  for even more verbose printouts
# $debug  becomes 1 by '#'  or '_'
# $debug2 becomes 1 by '##'  or '__'
#
# Returns   :  $debug
# Argument  :
# Version   : 1.8
#--------------------------------------------------------------------
sub set_debug_option{
	my($j, $i, $level);
	unless( defined($debug) ){
	 for($j=0; $j < @ARGV; $j ++){
		 if( $ARGV[$j] =~/^(_+)$|^(#+)$/){ # in bash, '#' is a special var, so use '_'
			 print __LINE__," >>>>>>> Debug option is set by $1 <<<<<<<<<\n";
			 $debug=1;
				  print chr(7);
			 print __LINE__," \$debug  is set to ", $debug, "\n";
			 splice(@ARGV,$j,1); $j-- ;
			 $level = length($1)+1;
			 for($i=0; $i < $level; $i++){
				 ${"debug$i"}=1;
				 print __LINE__," \$debug${i} is set to ", ${"debug$i"}, "\n";
			 }
		 }
	 }
	}
}


#________________________________________________________________________
# Title     : show_default_help
# Usage     : &show_default_help2;  usually with 'parse_arguments' sub.
# Function  : Prints usage information and others when invoked. You need to have
#             sections like this explanation box in your perl code. When invoked,
#             show_default_help routine reads the running perl code (SELF READING) and
#             displays what you have typed in this box.
#             After one entry names like # Function :, the following lines without
#             entry name (like this very line) are attached to the previous entry.
#             In this example, to # Function : entry.
# Example   : &show_default_help2; &show_default_help2(\$arg_num_limit);   &show_default_help2( '3' );
#             1 scalar digit for the minimum number of arg (optional),
#             or its ref. If this defined, it will produce exit the program
#             telling the minimum arguments.
# Warning   : this uses format and references
# Keywords  : default_help, show_headbox_help, show_help
# Options   :
# Returns   : formated information
# Argument  :
# Category  :
# Version   : 3.5
#--------------------------------------------------------------------
sub show_default_help{
		 my($i, $perl_dir, $arg_num_limit, $head ,$arg_num_limit, $key_press, $e, $help_item,
			 @entries, @entries_I_want_write, $option_tb_found, $extension, $logname, $tmp );
		 $logname=getlogin();
		 my($pwd)=`pwd`;
		 my($date)=`date`;
		 chomp($date,$pwd);
		 my($not_provided)="--- not provided ---\n";
		 my($file_to_read) = $0;

		 for($i=0; $i < @_; $i ++){
			 if((ref($_[$i]) eq 'SCALAR')&&(${$_[$i]} =~ /^\d$/)){
							 $arg_num_limit = ${$_[$i]};
			 }elsif( (!(ref($_[$i]))) && ($_[$i] =~ /^\d$/)){
							 $arg_num_limit = $_[$i];     }
		 }
		 my %entries = %{&read_head_box(\$file_to_read )};
		 if($option_tb_found ==1){
				@option_tb=@{&read_option_table(\$file_to_read)};
		 }

		 @entries = keys %entries;
		 for $help_item (@entries){
				if( $entries{$help_item}=~/^[\W]*$/  and  !defined($entries{$help_item}) ){
						$entries{$help_item}= $not_provided;
				}
		 }

		 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
		 #  Writing the format <<<<<<<<<<<
		 #________________________________________
		 $~ =HEADER_HELP;     write;   ## <<--  $~ is the selection operator
		 $~ =DEFAULT_HELP_FORM;

		 @entries_I_want_write=sort keys %entries;

		 for ( @entries_I_want_write ){  write  }

		 print chr(7);  print "_"x72,"\n\n";

		 if(@ARGV < $arg_num_limit){ print "\* $0 fataly needs $arg_num_limit arguments\n\n" }

		 if(  $option_tb_found == 1){
				#########  Printing the OPTION table contents <<<<<<<<<<<<
				print "  Press \"Return\" key to see what options $logname\'s \n\n    \"$0\" take... \n";
															$key_press=getc();
				print @option_tb, "\n"x2 if(@option_tb > 0);
		 }
format HEADER_HELP  =
_____________________________________________________________________
		  __  __      ______     __          _____
		 /\ \/\ \    /\  ___\   /\ \        /\  _ `\
		 \ \ \_\ \   \ \ \__/   \ \ \       \ \ \L\ \
		  \ \  _  \   \ \  _\    \ \ \       \ \ ,__/
		   \ \ \ \ \   \ \ \/___  \ \ \_____  \ \ \/
		    \ \_\ \_\   \ \_____\  \ \______\  \ \_\
		     \/_/\/_/    \/_____/   \/______/   \/_/ V 3.5`
_____________________________________________________________________
.

format DEFAULT_HELP_FORM =
 @<<<<<<<<<: @*
 $_,        $entries{$_}
.

}

#________________________________________________________________________
# Title     : open_fasta_files
# Usage     : %fasta_seq=%{&open_fasta_files($fasta_file, ['MJ0084'])};
#             if you put additional seq name as MJ0084 it will
#             fetch that sequence only in the database file.
#
#             %out=%{&open_fasta_files(@ARGV, \%index)};
#               while  %index has (seq indexpos seq2 indexpos2,,,)
#               In this case, the fasta file should have xxxx.fa format
#
# Function  : open fasta files and put sequences in a hash
#              If hash(es) is put which has sequence names and seek position
#              of the index file, it searches the input FASTA file to
#              fetch at that seek position. This is useful for Big fasta DBs
#             If the seq name has ranges like  XXXXXX_1-30, it will only
#              return 1-30 of XXXXXX sequence.
#
#             FASTA sequence file format is like this;
#
#             > 1st-seq
#             ABCDEFGHIJKLMOPABCDEFGHIJKLMOPABCDEFGHIJKLMOPABCDEFG
#             > 2nd.sequ
#             ABCDEFGHIJKLMOYYUIUUIUIYIKLMOPABCDEFGHIJKLMOPABCDEFG
#             >owl|P04439|1A03_HUMAN HLA CLASS I HISTOCOMPATIBILITY ANTIGEN, A-3 ALPHA CHAIN PRECURSOR....
#             MARGDQAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDT
#
#             This can also return the sizes of sequences rather than seqs.
#
#             This ignores any dup entrynames coming later.
#
# Example   : %out = %{&open_fasta_files(@ARGV)};
#             %out2=%{&open_fasta_files('seq.fa', \%index)};
#             %out3=%{&open_fasta_files('seq.fa', \%range)};
#             %seq=%{&open_fasta_files($PDB40_FASTA, \@seq_to_fetch)};
#
#             while @ARGV at prompt was: 'GMJ.pep MJ0084'
#
# Keywords  : open_fasta, open_fa_files, open_FASTA_files,
# Options   : Seq name to fetch the specified seq only.
#             as open_fasta_files.pl MY_SEQ_NAME Swissprot.fasta
#            -d  for giving back desc as well as the name. so it
#                gives  'HI0002 This is the description part'
#                as the key
#             If you put hash which is like('seq_name', ['20-30', '30-44',..])
#              it will produce hash which has got:
#              ( seq_name_20-30 'asdfasdfasdfasdfasd',
#                seq_name_30-44 'kljkljkjkjljkjljkll',
#                ....           .... )
#            -s for returning sequence size only
#   $min_seq_leng_accepted=       by m=
#   $No_gap_char_allowed=N        by N  # removes all \W
#   $remove_strange_and_masked_residues=R by R  # removes X, B, Z etc
#   $make_Upper_case_seq=U        by U
#   $keep_special_char_in_names=k by k
#
# Version   : 4.7
#--------------------------------------------------------------------
sub open_fasta_files{
    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

    my (%sequence, %HASH, @Keys, $seq_found1, $S_start, $S_end, $seq_found,
        $present_seq, @seq_Names, %Sizes, $bare_seq_name, $fasta_seq_idx_file,
        %seq_fragments, $min_seq_leng_accepted, $No_gap_char_allowed,
        $remove_strange_and_masked_residues, $make_Upper_case_seq,
        $DC_fold_type, $DDD_seq_range, $DDD_seq_size, $keep_special_char_in_names);
    $min_seq_leng_accepted=0;

    if(@file<1){  print "\n \@file has less than 1 elem. There is no fileinput for open_fasta_files\n"; die }
    if($vars{'m'}=~/(\d+)/){ $min_seq_leng_accepted=$1; }
    if($char_opt=~/N/){  $No_gap_char_allowed='N'; }
    if($char_opt=~/R/){  $remove_strange_and_masked_residues='R'; }
    if($char_opt=~/U/){  $make_Upper_case_seq='U'; }
    if($char_opt=~/k/){  $keep_special_char_in_names='k'; }

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #  (1) When perl file INDEX pos info is given in hash, this speeds up
    #_________________________________________________________________________________
    for($d=0; $d < @hash; $d++){
	   my ($sequence, $NAME, $range_start, $range_leng);
	   %HASH=%{$hash[$d]};
	   my @Keys=keys %HASH;  ## <<< NOTE it is @Keys, not @keys
	   for($f=0; $f< @file; $f++){
		  #====== It must be xxxx.fa format =======
          unless($file[$f]=~/\S\.[mpsn]{0,2}fa[sta]?$/){
			  print "\n# open_fasta_files: \$file\[\$f\] does not have fasta extension, skipping"; next; }
		  open(FASTA, $file[$f]);
		  F0: for($e=0; $e< @Keys; $e++){
			 my $sequence;
			 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			 # When seq name has range attachment, it handles
			 #________________________________________________
			 if($Keys[$e]=~/^(\S+)_(\d+)\-(\d+)/){
			      $NAME=$1;
			      $range_start=$2-1;    ## to fit in substr function
			      $range_leng =$3-$2+1; ## to fit in substr
			 }else{
			      $NAME=$Keys[$e];
			 }
			 if($HASH{$Keys[$e]}=~/^(\d+)$/){
                  splice(@hash, $d, 1);
                  $d--;
                  splice(@file, $f, 1);
                  $f--;
                  seek(FASTA, $1-220, 0);  # -220 is necessary
                  while(<FASTA>){
                     if( /^\> *$NAME/  or
                             /^\> *\S+$NAME/){  # to handle ">owl|P04439|1A03_HUMAN HLA CLASS I HISTOCOMPATIBILITY
                            $seq_found1=1;
                     }elsif(/^(\w+)$/ and $seq_found1==1){   $sequence .=$1;
                     }elsif(/^\> *\S+/ and $seq_found1==1 and length($sequence) > $min_seq_leng_accepted){
     # Filtering by seq size
                         if($No_gap_char_allowed){
                                  $sequence=~s/\W+[0-9]//g;
                         }
                         if($remove_strange_and_masked_residues){
                                  $sequence=~s/[oOjJxXzZbB\W0-9]+//g;
                         }
                         if($make_Upper_case_seq){
                                  $sequence="\U$sequence";
                         }
                         if($range_start =~/\d+/){
                                         $sequence{$Keys[$e]}=substr($sequence, $range_start, $range_leng);
                         }else{         $sequence{$Keys[$e]}=$sequence; }
                         $range_start='';
                         $sequence='';
                         $seq_found1=0; next F0;
                     }
                  }
			  }
		  }
	  }
	}
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
	# !(2)! -- opening FASTA files (NORmal, when no perl index pos number is given)
	#_______________________________________________________________________
	for($i=0; $i< @file; $i++){
	   unless(-s $file[$i]){ next; } ## this is essential as handle_arguments has a problem
	   my($entry_found, $input_file, $matched_FASTA_seq_name, $matched_seq);
	   if(ref( $file[$i] ) eq 'SCALAR' ){  $input_file = ${$file[$i]};
	   }else{  $input_file=$file[$i];  }

	   unless (-e $input_file){  print chr(7);  print "\n\n\t This is sub open_fasta_files in $0  \n\n";
           print "\t Fatal: The input file $input_file is not in the directory \n"; die;
	   }
	   open(FILE_1,"$input_file");
	   if(@hash >=1){  ## if seq names are given in hash
           for($h=0; $h< @hash; $h++){
                 @string=(@string, keys %{$hash[$h]});
           }
	   }
	   @string=sort @string;
	   $num_of_seq_to_fetch=@string;
	   if(@string > 0){ print "\n# open_fasta_files(normal fasta fetch): \$num_of_seq_to_fetch is $num_of_seq_to_fetch\n";	   }

	   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
	   # (2.1) Not default. when seq to fetch is given by \@sequences  format
	   #_______________________________________________________________________
	   if( @_ > 1  and  @string > 0 ){
		   print "\n#  open_fasta_files is fetching sequences from \$input_file= $input_file\n";
		   %sequence=%{&fetch_sequence_from_db($input_file, \@string)};
		   print "\n# $fasta_seq_idx_file file is made by open_fasta_files(fetch_sequence_from_db), you may remove it\n";
	   }
	   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
	   # (2.2) !!! When seq names NOT given, fetches all (THE DEFAULT)
	   #____________________________________________________________
	   else{
           while(<FILE_1>){                # file1 needs to be xxxx.fasta for the moment, automatic later
               if(/^\> *gi\|\d+\|\S+\|(\S+)\|.*/){  ## for >gi|1669546|dbj|D84107|D84107 Human mRNA for Werner syndrome-1/type 1, complete cds
                    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                    # Checks if the previous sequence reading was good
                    #____________________________________________________________
                    if(!$matched_seq){ warn " $matched_FASTA_seq_name didn\'t seem to have matching sequence. Empty sequence ????\n";  }
                    if($char_opt=~/[\-]?d/i){  # To add the description
                         $matched_FASTA_seq_name=$_;  # entire line becomes the name of the seque.
                    }else{
                         if( $sequence{$1} ){
                             #------- To avoid identical entry reading repeatedly -----
                             print "\n# I am open_fasta_files: $1 seems to be the same as previous entry, ERROR??\n";
                             $entry_found=0;
                         }else{      $matched_FASTA_seq_name=$1;   $entry_found=1;     }
                    }
               }elsif(/^\> *owl\|\S*\|(\S+)/ or /^> *pir\|\S*\|(\S+)/){  ## for ">owl|P04439|1A03_HUMAN HLA CLASS I HISTOCOMPATIB
                    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                    # Checks if the previous sequence reading was good
                    #____________________________________________________________
                    if(!$matched_seq){ warn " $matched_FASTA_seq_name did seem to have matching sequence. Empty sequence ????\n";  }
                    if($char_opt=~/[\-]?d/i){  # To add the description
                         $matched_FASTA_seq_name=$_;  # entire line becomes the name of the seque.
                    }else{
                         if( $sequence{$1} ){
                                         #------- To avoid identical entry reading repeatedly -----
                                         print "\n# I am open_fasta_files: $1 seems to be the same as previous entry, ERROR??\n";
                                         $entry_found=0;
                         }else{      $matched_FASTA_seq_name=$1;   $entry_found=1;     }
                    }

               #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
               # Reading DDD.fasta file: >1jlyA:2        fold 37; range 1 159; size 159; MOL_ID: 1;
               #__________________________________________________________
               }elsif(/^> *([\w\:]+)\s+fold\s+(\d+)\;\s+range([\d\s]+)\;\s*size\s+(\d+)\;/){
                    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                    # Checks if the previous sequence reading was good
                    #____________________________________________________________
                    if(!$matched_seq){ warn " $matched_FASTA_seq_name didnt seem to have matching sequence. Empty sequence ????\n";  }

                    $DC_fold_type=$2;
                    $DDD_seq_range=$3;
                    $DDD_seq_size=$4;
                    $matched_seq=''; # resetting the previous SEQUENCE
                    if($char_opt=~/[\-]?d/i){  # To add the description
                         $matched_FASTA_seq_name=$_;  # entire line becomes the name of the seque.
                    }else{
                         if( $sequence{$1} ){
                             #------- To avoid identical entry reading repeatedly -----
                             print "\n# I am open_fasta_files: $1 seems to be the same as previous entry, ERROR??\n";
                             $entry_found=0;
                         }else{
                             $matched_FASTA_seq_name=$1;
                             $matched_FASTA_seq_name=~s/\:/_/; ## changing : to _ in the name.
                             $entry_found=1;
                         }
                    }
               #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
               # (DEfault) The most common FASTA format entry
               #_______________________________________________________
               }elsif(/^> {0,5}(\S+)/){
                    $matched_FASTA_seq_name=$1;
                    unless($keep_special_char_in_names){ $matched_FASTA_seq_name=~s/[\,]//g; }
                    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                    # Checks if the previous sequence reading was good
                    #____________________________________________________________
                    if(!$matched_seq){ warn " Reading common FASTA format: $matched_FASTA_seq_name didnt seem to have matching sequence. Empty sequence ????\n";  }

                    if($char_opt=~/[\-]?d/i){   $matched_FASTA_seq_name=$_;  # To add the description
                    }else{
                         if( $sequence{$matched_FASTA_seq_name} ){ # check if the entry already exists
                             print "\n# (i) open_fasta_files: $1 seems to be the same as previous entry, ERROR??\n";
                             $entry_found=0;
                         }else{   $entry_found=1;     }
                    }
               #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
               # (Default) I write at this point <----------<<<
               #_________________________________________________
               }elsif(/^([\w\.\- ]+)\s*$/ and $entry_found == 1){
                    $matched_seq=$1;
                    if($No_gap_char_allowed){          $matched_seq=~s/\W+//g;                    }
                    if($remove_strange_and_masked_residues){ $matched_seq=~s/[\,oOjJxXzZbB\W0-9]+//g; }
                    if($make_Upper_case_seq){    $matched_seq="\U$matched_seq";                  }
                    if($char_opt=~/s/){ # when SIZE(length of seq) return only option is set
                        $Sizes{$matched_FASTA_seq_name}+=length($matched_seq);
                    }else{
                        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                        # Filtering by seq size and Actual ASSIGNMENT
                        #____________________________________________________
                        if(length($matched_seq) > $min_seq_leng_accepted){
                             $sequence{$matched_FASTA_seq_name}.= $matched_seq if defined($matched_FASTA_seq_name);
                        }
                    }
               }elsif(/^$/){  next;
               }else{  $entry_found=0;  } ## this is when rubbish is matched
           }# end of while
	   }
	   close FILE_1;
	 }


	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~~~~~~~~~~~~~~~~~~~~~`
	 # (3) When ranges information is given(via \@range), seq in those ranges are returned
	 #______________________________________________________________________________________
	 if(defined(@range)){
         %seq_fragments=%{&get_seq_fragments(\%sequence, \@range)};
	     return(\%seq_fragments);
	 }
	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
	 # (4) When only size is asked with -s option
	 #_____________________________________________________________________________
	 elsif($char_opt=~/s/){ # when SIZE(length of seq) return only option is set
	    #@seq_Names=keys %sequence;
	    #for($i=0; $i<@seq_Names; $i++){
	    #	  $Sizes{$seq_Names[$i]}=length($sequence{$seq_Names[$i]});
	    # }
	    return(\%Sizes);
	 }
	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
	 # (5) when hash which has range info is given(@range should not be defined)
	 #_____________________________________________________________________________
	 elsif(@hash >=1){
         for($h=0; $h< @hash; $h++){
              my %hash=%{$hash[$h]};
              my @Keys=keys %hash;
              for($k=0; $k<@Keys; $k++){
                  if(defined($hash{$Keys[$k]})){
                      ($S_start, $S_end)=$hash{$Keys[$k]}=~/(\d+)\-(\d+)/;
                      $sequence{$Keys[$k]}=substr($sequence{$Keys[$k]}, ($S_start-1), ($S_end-$S_start));
                  }
              }
         }
         return(\%sequence);
	 }else{
         return(\%sequence);
	 }
}






#________________________________________________________________________________
# Title     : tempname
# Usage     : $tmp=&tempname;
# Function  : Returns a unique temporary filename.
#             Reasonably robust but not completely immune to race conditions
#             with other processes simultaneously requesting a tempname.
# Example   :
# Keywords  :
# Options   :
# Version   : 1.0
#--------------------------------------------------------------------------------
sub tempname{
	 foreach $suffix (0..99) {
 	if (! (-e "tmpxx$suffix")) {
		open(TMP,">tmpxx$suffix"); # Stamp it to reserve it.
		close(TMP);
		return "tmpxx$suffix";
	}
	 }
}
#__________________________________________________________________________
# Title     : if_file_older_than_x_days
# Usage     : if( ${&if_file_older_than_x_days($ARGV[0], $days)} > 0){
# Function  : checks the date of last modi of file given and compares with
#             present time. Substracts diff and returns the actual diff days.
# Example   :
# Keywords  : how_old_file, how_old, is_file_older_than_x_days, file_age,
#             file_age_in_days, if_older_than_x_days,
# Options   :
# Returns   : the actual days older, so NON-ZERO, otherwise, 0
# Version   : 1.3
#----------------------------------------------------------------------------
sub if_file_older_than_x_days{
	if(@_ < 2){ print "\n# FATAL: if_file_older_than_x_days needs 2 args\n"; die; }
	my $file=${$_[0]} || $_[0];
	my $days=${$_[1]} || $_[1];
	my ($new_idx_file, $how_old_days);
	unless(-s $file){
	    print "\n# FATAL, nearly!: if_file_older_than_x_days: $file does NOT exist !\n";
		$new_idx_file=${&make_seq_index_file($file)};
		print "        if_file_older_than_x_days called make_seq_index_file to make $new_idx_file\n";
				$how_old_days=(localtime(time- (stat($new_idx_file))[9]))[3];
	}else{
				$how_old_days=(localtime(time- (stat($file))[9]))[3];
		}
	if($how_old_days > $days){
		print "\n# if_file_older_than_x_days: $file is older than $days\n";
		return(\$days);
	}else{
		print "\n# if_file_older_than_x_days: $file is NOT older than $days\n";
		return(0);
	}
}



#________________________________________________________________________
# Title     : read_file_names_only
# Usage     : @all_files=@{&read_file_names_only(<dir>, [extension])};
# Function  : read any file names and REMOVES the '.', '..' and dir entries.
#             And then put in array.  This checks if anything is a real file.
#             You can use 'txt' as well as '.txt' as extension
#             You can put multiple file extension (txt, doc, ....)
#               and multiple dir path (/usr/Perl, /usr/local/Perl....)
#               It will fetch all files wanted in all the direc specified
#
#             It can handle file glob eg)
#             @all_files=@{&read_file_names_only(\$abs_path_dir_name, 'G1_*.txt')};
#               for all txt files starting with 'G1_'
#
# Example   : @all_files=@{&read_file_names_only(\$abs_path_dir_name, ..)};
#             @all_files=@{&read_file_names_only(\$dir1, '.pl', '.txt')};
#             @all_files=@{&read_file_names_only(\$dir1, '.', \$dir2, \$dir3, 'e=pl')};
#             @all_files=@{&read_file_names_only(\$abs_path_dir_name, 'G1_*.txt')};
#             @all_files=@{&read_file_names_only(\$abs_path_dir_name, \@target_file_names)};
#
# Warning   : This does not report '.', '..'
#             Only file names are reported. Compare with &read_any_dir
#             extension size should be less than 15 char.
#             It sorts the results!
# Keywords  : filename only, filename_only, read_files_only, read files
#             get_file_names_only, get_files_only, read_files_only
# Options   : "extension name". If you put , 'pl' as an option, it will show
#             files only with '.pl' extension.
#  '-p'      for path also included resulting in '/path/path/file.ext'
#              rather than 'file.ext' in output @array
#  '-s'      for sorting the results
#  e='xxx'  for extention xxx
#  '.pl'    for files extended by '.pl'
#  'pl'     for files extended by 'pl', same as above
#  D=       for dir name input
#  d=       for dir name input
#
# Category  :
# Version   : 3.1
#--------------------------------------------------------------------
sub read_file_names_only{
		my($in_dir, $i, $j, $x, $k, $dir, @final_files, @possible_dirs, $sort_opt, $ext, @extensions,
			 $path_include, @in, $glob_given, @files_globed, @in_dir, $pwd, $extension_given,
			 %target_file_names, @target_file_names, @read_files);
		$pwd=`pwd`; chomp($pwd);
		$in_dir=$pwd;
		@in=@_;

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		#  Directory entry and opts detection
		#_________________________________________
		for($k=0; $k < @in; $k++){
			 if   ( $in[$k] eq '.'){ push(@in_dir,$pwd); splice(@in, $k, 1);  $k--; next }
					if( !(ref($in[$k]))){
				if($in[$k]=~/D=(\S+)/i){
						push(@in_dir, $1); splice(@in, $k, 1);    $k--; next;  }
				if( -d "$in[$k]" ){
						if($in[$k]=~/\/\S+$/){
								$path_include=1;  ## If the input dir has '/', I assume path should be added to out file names
			}
			#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			# Removes the last slash '/' of input dir name
			#________________________________________________
			if($in[$k]=~/\/$/){   chop($in[$k]);  }
						push(@in_dir, $in[$k]);
						splice(@in, $k, 1);    $k--; next;
		}
		if(!(-f $in[$k]) and $in[$k] =~ /^\-p *$/ ){ ## somehow, ' *' is essential
			$path_include=1;
			splice(@in, $k, 1); $k--;
								}elsif(!(-f $in[$k]) and $in[$k] =~ /^\-s *$/   ){$sort_opt=1; splice(@in, $k, 1); $k--;
								}else{
										 #print "\n# (W) read_file_names_only: $in[$k] not a file, nor dir, a file extnsion?\n";
								}
	 }elsif(ref($in[$k])){
				if(ref($in[$k]) eq 'SCALAR'){
					 if( -d ${$in[$k]}){
							 if(${$in[$k]}=~/\/$/){ chop(${$in[$k]}) }
							 push(@in_dir,${$in[$k]});
							 splice(@in, $k, 1);
							 $k--;
					 }elsif(!(-f $in[$k]) and ${$in[$k]} =~ /^\-p$/ ){$path_include=1; splice(@in, $k, 1); $k--;
					 }elsif(!(-f $in[$k]) and ${$in[$k]} =~ /^\-s$/ ){$sort_opt=1; splice(@in, $k, 1); $k--;}
				}elsif(ref($in[$k]) eq 'ARRAY'){
					 @target_file_names=@{$in[$k]}; splice(@in, $k, 1); $k--;
					 for($x=0; $x < @target_file_names; $x++){  # making a hash out of @array
							 $target_file_names{$target_file_names[$x]}=$target_file_names[$x];
					 }
				}
	 }
	}
	if(@in_dir < 1){ push(@in_dir, $pwd) }

	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	#  File name and extension detection
	#_________________________________________
	for $dir (@in_dir){
						chdir($dir);

						for($k=0; $k < @in; $k++){
								 if( !(ref($in[$k]))){
													if($in[$k]=~/\*/){
																					$glob_given=1;
																					#~~~~~~~~~~~~~~~~~~~~~  Reads globbed files and attaches path if opt -p is set
																					if($path_include==1){  @final_files=map{ "$dir/$_" } <$in[$k]>;
																					}else{ @final_files=<$in[$k]>;  }
																					splice(@in, $k, 1); $k--;
													}elsif(!(-f $in[$k]) and $in[$k] =~/e=\.?(\S+)/){ $extension_given =1; push(@extensions, $1); splice(@in, $k, 1);$k--;
													}elsif(!(-f $in[$k]) and $in[$k] =~/\.*(\S+)/){
																					print "\n# read_file_names_only: pushing $1 as an extension" if $verbose;
																					$extension_given =1; push(@extensions, $1);
																					splice(@in, $k, 1); $k--;
													}elsif(!(-f $in[$k]) and $in[$k] =~/^([^\-]{0,8})$/){  ## extension name can not be larger than 8 chars
																					print "\n# read_file_names_only: pushing $1 as an extension" if $verbose;
																					$extension_given =1; push(@extensions, $1);
																					splice(@in, $k, 1); $k--;
													}
								 }elsif(ref($in[$k])){
																 if(ref($in[$k]) eq 'SCALAR'){

																					if(${$in[$k]}=~/\*/){
																													$glob_given=1;
																													if($path_include==1){  @final_files=map{ "$dir/$_" } <${$in[$k]}>;
																													}else{ @final_files=<${$in[$k]}> }
																													splice(@in, $k, 1); $k--;
																					}elsif(!(-f ${$in[$k]}) and ${$in[$k]} =~/e=(\S+)/ ){ $extension_given = 1;
																													push(@extensions, $1); splice(@in, $k, 1);  $k--;
																					}elsif(!(-f ${$in[$k]}) and ${$in[$k]} =~/^\.?(\S+)/ ){$extension_given =1;
																													push(@extensions, $1);  splice(@in, $k, 1);  $k--;
																					}
																 }
								 }
						}
						chdir($pwd);
	}
	if( $glob_given == 1 and  $extension_given !=1 ){  # when glob input is given only(without any extension input!
		 print "\n# read_file_names_only: You used glob for file name, but without extension name\n" if $verbose;
	 return(\@final_files);
	}
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
	#  Main READING PART
	#_________________________________________________
	#print "\n# read_file_names_only: \@in_dir is  @in_dir\n";
	for($k=0; $k< @in_dir; $k++){
	  chdir($in_dir[$k]) or die "\n# read_file_names_only: could not get into $in_dir[$k]\n";
	  opendir(DIR1, ".");
		@read_files = readdir(DIR1);
	  if(@read_files < 1){ print "\n# read_file_names_only: ERROR??, \@read_files is empty\n\n\n"; }
	  for($i=0; $i < @read_files; $i ++){
						#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
						# If the user has specified the target file names
						#____________________________________________________
	    if( @target_file_names > 0){
									if( -f "$read_files[$i]" and -s $target_file_names{$read_files[$i]} ){ ##
											 if($extension_given ==1 ){
													 for $ext (@extensions){
																if( $read_files[$i] =~ /\.$ext$/){
																		 if($path_include==1){
																										 push(@final_files, "$in_dir[$k]\/$read_files[$i]" );
																		 }else{
																										 push(@final_files, "$read_files[$i]" );
																		 }
																}
													 }
											 }else{ ## reading everything !!!
																			 push(@final_files, $read_files[$i]);
											 }
									}
							}else{
									if( -f "$read_files[$i]" ){ ##
											 if($extension_given ==1 ){
														for $ext (@extensions){
																 if( $read_files[$i] =~ /\.?$ext$/){
																			if($path_include==1){
																					 push(@final_files, "$in_dir[$k]\/$read_files[$i]" );
																			}else{
																					 push(@final_files, "$read_files[$i]" );
																			}
																 }
														}
											 }else{ ## reading everything !!!
														push(@final_files, $read_files[$i]);
											 }
									}
							}
	 }
	 chdir($pwd);
		 }
		 @final_files=sort @final_files if $sort_opt == 1;
		 return(\@final_files);
}


#______________________________________________________________________
# Title     : sort_string_by_length
# Usage     : @output = @{&sort_string_by_length(@any_input_strings, [-r], @more)};
# Function  : sorts strings in array according to their sizes
#             bigger comes first.
# Example   :
# Warning   :
# Keywords  : sort_array_by_length, sort_str_by_length, sort_array_string_by
#             sort_string_by_leng, sort_by_length, sort_by_leng,
#             sort_array_by_string_length, sort_array_elements_by_string_length
# Options   : -r  reverse the order
# Version   : 1.2
#-------------------------------------------------------------------
sub sort_string_by_length{
	my(@input, $i, $small_first, @output);
	for($i=0; $i<@_; $i++){
		if( $_[$i]=~/^\-?r$/i){
			$small_first =1;
			splice(@_, $i, 1);
		}elsif(ref($_[$i]) eq 'ARRAY'){
		    push(@input, @{$_[$i]});
		}elsif(ref($_[$i]) eq 'SCALAR'){
			if(${$_[$i]}=~/^\-?r$/i){
			   $small_first=1;
			   splice(@_, $i, 1);
			}else{
			   push(@input, ${$_[$i]});
			}
		}elsif( !ref($_[$i]) ){
		    push(@input, $_[$i]);
		}
	}
	if($small_first ==1){
	    @output = sort {length($a) <=> length($a) || ($b cmp $a)} @input;
	}else{
	    @output = sort {length($b) <=> length($a) || ($a cmp $b)} @input;
	}
	return (\@output);
}
#________________________________________________________________________
# Title     : read_option_table
# Usage     :
# Function  : Reads the option table made by Jong in any perl script. The
#             option table is a box with separators.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Version   : 1.0
#--------------------------------------------------------------------
sub read_option_table{
	my($table_found, @option_tb, $head, );
	 open(SELF, "${$_[0]}");
	 while(<SELF>){
		if( (/^ *#+/) && ( $table_found== 1) ){
		  push (@option_tb, "$_");
		}elsif( ($table_found != 1)&&(/^ *\#+ *[Oo]ption *[Tt]able */) ){
			$table_found=1; $head="############## Option Table  \"$0\"\n"; ##
			push(@option_tb, $head);
		}
		if( ($table_found==1)&&(/^ *###################+ *$/)){  ### to find the end point of reading
			$table_found =0; last; }
	 }
	 return(\@option_tb);
}


#_____________________________________________________________________________
# Title     : fetch_sequence_from_db
# Usage     : %sequence=%{&fetch_sequence_from_db($input_file, \@string)};
# Function  : accept seq names (with or without ranges like _10-111 )
#              and produces hash ref.
#             As an option, you can write(xxxx.fa) the sequences in pwd
#              with the file names with sequence names.
#             The default database used is FASTA format OWL database.
#              You can change this by S (for Swissprot either fasta
#              or full format), P for PDB fasta format data.
#             If you give the path name of DB, it will look for the
#              DB given.
#
#             This automatically checks sequence family number as
#               in >d1bpi___7.6.1
#               and attaches the number in final %sequence output
#
# Example   : %seq=%{&fetch_sequence_from_db(\@input, seq.fa, seq.fa.idx)};
#              while @input=qw( 11S3_HELAN_11-31 A1AB_CANFA A1AT_PIG )
# Keywords  : fetch_seq_from_db, fetch_sequence_from_database
# Options   : _  or #  for debugging.
#     w       for write fasta file
#     s=      for putting source DB file name manually
#     d=p100  for PDB100 fasta database from ENV
#     d=p40   for PDB40  fasta database from ENV
#     d=p     for PDB database (usually p100) from ENV
#     d=s     for Swissprot database from ENV
#     d=o     for OWL database from ENV
#     i=      for index filename. If not specified, this looks for it in the same dir as fast     �
#     t=      for msp_threshold
#  msp_threshold=0.0005  # when MSP file is given as input for getting seq names
#
# Returns   : ref of hash
# Argument  : gets names of sequences
#             eg) \@array, \%hash, \$seq, while @array=(seq1, seq2), $seq='seq1 seq1'
#                                               %hash=(seq1, xxxx, seq2, yyyy);
#
# Version   : 3.5
#------------------------------------------------------------------------------
sub fetch_sequence_from_db{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

				my(@DATABASE, @INDEX_FILE, %sequence, %seq_with_index, @input_seq_names,
	   %long_index, @Keys, $R_start, $NAME, $R_leng, $found_seq_count,
	   $eval_blastpgp_thresh, $seq_found1, $sequence, @keys, $index_file,
	   $source_DB_file, $matching_seq, $match_start, $match_stop);

                $eval_blastpgp_thresh=0.001; # default

				if($vars{'msp_threshold'}=~/(\S+)/ or $vars{'mt'}=~/(\S+)/ or $vars{'t'}=~/(\S+)/ ){
							$eval_blastpgp_thresh=$1;
							print "\n# (i) YOU have set the \$eval_blastpgp_thresh $eval_blastpgp_thresh\n\n";
				}
				if($vars{'s'}=~/(\S+)/ or $vars{'DB'}=~/(\S+)/ ){
							$source_DB_file=$1; push(@DATABASE, $source_DB_file);
				}

				#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# getting input seq names from all sources
	#________________________________________________________
	for(0..@hash){ # from the given hash (keys names)
	     push(@input_seq_names, keys %{$hash[$_]} );
	}
	for(0..@raw_string){  ## from given sequence names
	     push(@input_seq_names, split(/ +/, $raw_string[$_]) );
	}
				for($i=0; $i<@file; $i++){  ## From MSP file input (get only MATCHED sequences)
						 if($file[$i]=~/\.mspa/){
									print "\n# (i) MSP file input is detected !\n";
									my ($seq_with_range);
									open(MSP, $file[$i]);
									while(<MSP>){
											 if(/^ *\S+ +(\S+) +\S+ +\S+ +\S+ +\S+ +\S+ +\S+ +(nr_\S+_DROME_\S+) +/){
														push(@input_seq_names, $2) if $1 < $eval_blastpgp_thresh;
											 }elsif(/^[\t ]*\S+[\t ]+(\S+)[\t ]+\S*[\t ]*(\d+)[\t ]+(\d+)[\t ]+\S+[\t ]+(\d+)[\t ]+(\d+)[\t ]+(\S+) */){
														$matching_seq=$6; $match_start=$4; $match_stop=$5;
														$Evalue_thresh=$1;
														if($matching_seq=~/^(\S+)_\d+\-\d+/){
																 $seq_with_range=$matching_seq;
														}else{
																 $seq_with_range="$matching_seq\_$match_start\-$match_stop";
														}
														push(@input_seq_names, $seq_with_range) if $Evalue_thresh < $eval_blastpgp_thresh;
														print "\n# (i) pushing $seq_with_range" if $verbose;
											 }
									}
									close(MSP);
									splice(@file, $i, 1);
									$i--;
						 }
				}

	print "\n# (1) fetch_sequence_from_db: \@raw_string has: ", scalar(@raw_string), " elements";
	print "\n# (2) fetch_sequence_from_db: No. of seq to fetch is:",scalar(@input_seq_names);
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Choose the DBs and INDEX for fetching sequences. All files input must be DATABAE or INDEXfile
	#___________________________________________
	if(@file > 0){
						 for($i=0; $i< @file; $i++){
								if(-T $file[$i] and $file[$i]=~/\.[sp]?fa[sta]?$/){      push(@DATABASE, $file[$i]);   next}
								elsif((-T $file[$i]) and ($file[$i]=~/\.seq$/)){    push(@DATABASE, $file[$i]);   next}
								elsif((-T $file[$i]) and ($file[$i]=~/\.dat$/)){    push(@DATABASE, $file[$i]);   next}
								elsif(-T $file[$i] and $file[$i]=~/\.idx$/){        push(@INDEX_FILE, $file[$i]); next }
								if($file[$i] !~/\.idx/ and -T "$file[$i]\.idx"){    push(@INDEX_FILE, "$file[$i]\.idx"); }
								else{
										print "\n#  WARN:  fetch_sequence_from_db:
										You put a file-name-like which is not a fasta DB. Error. I am removing $file[$i]";
										splice(@file, $i, 1);
										$i--;
								}
						 }
	}

	if($vars{'d'}=~/^p[100]*$/){
	   if( -T  $ENV{'PDB_FASTA'} ){             push(@DATABASE,   $ENV{'PDB_FASTA'} );     }
	   elsif(  -T $ENV{'PDB_SEQ_FASTA'} ){      push(@DATABASE,   $ENV{'PDB_SEQ_FASTA'}  ); }
	   elsif(  -T $ENV{'PDB100_FASTA'} ){       push(@DATABASE,   $ENV{'PDB100_FASTA'} ); }
	   if(  -T $ENV{'PDB_FASTA_INDEX'} ){       push(@INDEX_FILE, $ENV{'PDB_FASTA_INDEX'} ); }
	}elsif( $vars{'d'}=~/^p\d+d$/ ){
	   if(  -T $ENV{'PDB100D_FASTA'} ){         push(@DATABASE,   $ENV{'PDB100D_FASTA'});     }
	   elsif(  -T $ENV{'PDBD100_FASTA'}  ){     push(@DATABASE,   $ENV{'PDBD100_FASTA'}); }
	   elsif(  -T $ENV{'PDB100D_SEQ_FASTA'}  ){ push(@DATABASE,   $ENV{'PDB100D_SEQ_FASTA'}); }
	   elsif(  -T $ENV{'PDBD100_SEQ_FASTA'}  ){ push(@DATABASE,   $ENV{'PDBD100_SEQ_FASTA'}); }
	   if(  -T $ENV{'PDB100D_SEQ_FASTA_INDEX'} ){    push(@INDEX_FILE, $ENV{'PDB100D_SEQ_FASTA_INDEX'}); }
	   elsif(  -T $ENV{'PDBD100_SEQ_FASTA_INDEX'} ){    push(@INDEX_FILE, $ENV{'PDBD100_SEQ_FASTA_INDEX'}); }
	}elsif( $vars{'d'}=~/^p40/ ){
	   if(  -T $ENV{'PDB40_FASTA'} ){          push(@DATABASE,   $ENV{'PDB40_FASTA'});     }
	   elsif(  -T $ENV{'PDB40_SEQ_FASTA'}  ){  push(@DATABASE,   $ENV{'PDB40_SEQ_FASTA'}); }
	   if(  -T $ENV{'PDB40_FASTA_INDEX'} ){    push(@INDEX_FILE, $ENV{'PDB40_FASTA_INDEX'}); }
	}elsif( $vars{'d'}=~/^p90/ ){
	   if(  -T $ENV{'PDB90_FASTA'}  ){         push(@DATABASE,   $ENV{'PDB90_FASTA'}    ); }
	   elsif(  -T $ENV{'PDB90_SEQ_FASTA'} ){   push(@DATABASE,   $ENV{'PDB90_SEQ_FASTA'}); }
	   if(  -T $ENV{'PDB90_FASTA_INDEX'} ){    push(@INDEX_FILE, $ENV{'PDB90_FASTA_INDEX'}); }
	}
	if( $vars{'d'}=~/^s *$/){
	   if(  -T $ENV{'SWISS_FASTA'} ){          push(@DATABASE,   $ENV{'SWISS_FASTA'});     }
	   elsif(  -T $ENV{'SWISS_SEQ_FASTA'} ){   push(@DATABASE,   $ENV{'SWISS_SEQ_FASTA'}); }
	   elsif(  -T $ENV{"SWISS_DIR\/seq.spfa"} ){ push(@DATABASE,   $ENV{"SWISS_DIR\/seq.fa"}); }
	   if(  -T $ENV{'SWISS_FASTA_INDEX'} ){    push(@INDEX_FILE, $ENV{'SWISS_FASTA_INDEX'}); }
	   elsif(  -T $ENV{'SWINDEX'} ){           push(@INDEX_FILE, $ENV{'SWINDEX'}); }
	}
	elsif( $vars{'d'}=~/^o *$/){
		if(  -T $ENV{'OWL_FASTA'} ){            push(@DATABASE,   $ENV{'OWL_FASTA'});     }
		elsif(  -T $ENV{'OWL_SEQ_FASTA'} ){     push(@DATABASE,   $ENV{'OWL_SEQ_FASTA'}); }
		elsif(  -T $ENV{"OWL_DIR\/owl.fa"} ){   push(@DATABASE,   $ENV{"OWL_DIR\/owl.fa"}); }
		if(  -T $ENV{'OWL_FASTA_INDEX'} ){      push(@INDEX_FILE, $ENV{'OWL_FASTA_INDEX'}); }
		print "\n# Fetching sequences from OWL\n";
	}
	elsif( $vars{'d'}=~/^n *$/){
	   if(  -T $ENV{'NRDB_FASTA'} ){            push(@DATABASE,   $ENV{'NRDB_FASTA'});     }
 	   elsif(  -T $ENV{'NRDB_SEQ_FASTA'} ){     push(@DATABASE,   $ENV{'NRDB_SEQ_FASTA'}); }
			 if(  -T $ENV{'NRDB_FASTA_INDEX'} ){      push(@INDEX_FILE, $ENV{'NRDB_FASTA_INDEX'}); }
			 elsif(  -T $ENV{'NRDB_FASTA_IDX'} ){     push(@INDEX_FILE, $ENV{'NRDB_FASTA_IDX'}); }
	   print "\n# Fetching sequences from OWL\n";
	}
	elsif( $vars{'d'}=~/^\S+\.\S+$/ and -T $vars{'d'} ){ push(@DATABASE, $vars{'d'} );     }
	if( $vars{'i'}=~/\S+\.\S+$/ and -T $vars{'i'} ){ push(@INDEX_FILE, $vars{'i'} );   }
	if(@INDEX_FILE > 0 and @DATABASE > 0){
						 if( ${&if_file_older_than_x_days("$DATABASE[0]\.idx", 5)} > 0 ){
									$index_file=${&make_seq_index_file(\@DATABASE)};
									push(@INDEX_FILE, $index_file);
						 }elsif((-s "$DATABASE[0]\.idx") > 50){
										 push(@INDEX_FILE, "$DATABASE[0]\.idx");
						 }else{
										 print "\n# (E) fetch_sequence_from_db: Some weird error in pushing \$index_file to \@INDEX_FILE\n"; die;
						 }
	}

	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	#  Final check for ALL the inputs
	#___________________________________________________
	if( @DATABASE  < 1){ print "\n# fetch_sequence_from_db: DATABASE file no found. Error\n"; die     }
	if( @INDEX_FILE < 1){
						 print "\n# fetch_sequence_from_db: \@INDEX_FILE has less than 1 elem. Error\n";
						 push(@INDEX_FILE, ${&make_seq_index_file(@DATABASE)});
						 print "     fetch_sequence_from_db called make_seq_index_file to make @INDEX_FILE\n";
	}
 	if($debug==1){
						 print "\n# DATABASE used     : @DATABASE";
						 print "\n# INDEX_FILE used   : @INDEX_FILE";
						 print "\n# input_seq_names   : @input_seq_names";
	}


				##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				##  Now I have @DATABASE, @INDEX_FILE, @input_seq_names
				##_______________________________________________________________

	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
	#  Reading in index file to get 'seq' 'seek pos' to make %seq_with_index
	#__________________________________________________________________________
	print "\n#  fetch_sequence_from_db: \@INDEX_FILE @INDEX_FILE, \@DATABASE :@DATABASE\n";
	for($i=0; $i< @INDEX_FILE; $i++){
	   open(INDEX, "$INDEX_FILE[$i]");
	   while(<INDEX>){ if(/(\S+) +(\S+)/){  $long_index{$1}=$2;  }  }

	   for($j =0; $j < @input_seq_names; $j++){

								 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~``
								 #  If DATABASE has sequence names with ranges already index the seq with ranges
								 #____________________________________________________________________________________
								 if($input_seq_names[$j]=~/(\S+\_\d+\-\d+)$/ and $long_index{$1}){
										 $seq_with_index{$1}=$long_index{$1};

								 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~``
								 #  If DATABASE has sequence names without ranges index the seq without ranges
								 #____________________________________________________________________________________
								 }elsif($input_seq_names[$j]=~/(\S+)\_\d+\-\d+$/ and $long_index{$1}){

												 $seq_with_index{$input_seq_names[$j]}=$long_index{$1}; # !!!! <--- This line is critical

								 }elsif($input_seq_names[$j]=~/(\S+)\_\d+\-\d+/ and $long_index{"$1\_"}){ # to handle Tim's new pdb100.fa files

										 $seq_with_index{$input_seq_names[$j]}=$long_index{"$1\_"};
										 print "\n# Warning: $1 (from $input_seq_names[$j]) matched with $1\_ in $INDEX_FILE[$i],
																	 I hope this is correct!!\n";
								 }
								 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~``
								 #  If input_seq_name has SCOP superfamily numbers
								 #____________________________________________________________________________________
								 elsif($input_seq_names[$j]=~/^(\S+)\_(\d+\.\d+\.\d+)[\.\d+\.\d+]*/ and $long_index{$1}){

												 $seq_with_index{"$1\_$2"}=$long_index{$1}; # !!!! <--- This line is critical

								 }elsif($input_seq_names[$j]=~/\S/ and $long_index{$input_seq_names[$j]}){
												 $seq_with_index{$input_seq_names[$j]}=$long_index{$input_seq_names[$j]}
								 }else{
												 print "\n#  $input_seq_names[$j](with, without range) have NO corresponding index in $INDEX_FILE[$i], ERR";
								 }
	   }
	   close INDEX;
	   if ( scalar(keys %seq_with_index) < 1){
									print "\n# fetch_sequence_from_db: \%seq_with_index is too small, ERROR?\n";
	   }
	}

	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
	#  Fetching sequences from DATABASE
	#_______________________________________________________________
	print "\n# fetch_sequence_from_db: Fetching seqs from @DATABASE with  @INDEX_FILE ";
	@Keys= sort {$seq_with_index{$a} <=> $seq_with_index{$b} } keys %seq_with_index;        ## <<< NOTE it is @Keys, not @keys
	print "\n# (3) fetch_sequence_from_db: No. of seq indexed is:", scalar(@Keys);

	for($f=0; $f< @DATABASE; $f++){

					 open(DB_FASTA, $DATABASE[$f]);

	   F0: for($e=0; $e< @Keys; $e++){
		  my ($seq_found1, $super_fam_class, $NAME, $R_leng, $R_start, $sequence);
		  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		  # When seq name has range attachment, it handles
		  #________________________________________________
		  if($Keys[$e]=~/(\S+)_(\d+)\-(\d+)$/){
		      $NAME=$1;
		      $R_start=$2-1;      ## to fit in substr function
		      $R_leng =$3-$2+1; ## to fit in substr
		      print "\n# (4) fetch_sequence_from_db: Sequences have ranges ($R_start-$R_leng) only (not superfamily numb.) \n";
		  }
		  elsif($Keys[$e]=~/(\S+)_(\d+)\-(\d+)\_(\d+\.\d+\.\d+)[\.\d+\.\d+]*/){
		      $NAME=$1;
		      $R_start=$2-1;      ## to fit in substr function
		      $R_leng =$3-$2+1; ## to fit in substr
		      $super_fam_class=$4;
		      print "\n# (4) fetch_sequence_from_db: Sequences have ranges and superfamily numb.\n";
		  }
		  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		  # When superfamily (scop) number is attached
		  #___________________________________________________
		  elsif($Keys[$e]=~/(\S+)\_(\d+\.\d+\.\d+)[\.\d+\.\d+]*/){
											$NAME=$1;
		      $super_fam_class=$2;
		      print "\n# (4) fetch_sequence_from_db: Sequences have SCOP superfamily numbers only \n";
		  }elsif($Keys[$e]=~/^ *(\S+)[\,]*$/){
		      print "\n# (4) fetch_sequence_from_db: Sequences DON't have ranges or SCOP superfam numb.\n";
							$NAME=$1;
		  }

					print "\n# BEFORE reading in DB file. \$NAME is  $NAME";

		  if($seq_with_index{$NAME}=~/(\d+)/        # It is importnt having $seq_with_index{$Keys[$e]}
			   or $seq_with_index{$Keys[$e]}=~/(\d+)/
			   or $seq_with_index{"$NAME\,"}=~/(\d+)/    # this is for overcoming '>xxxx,'  entry(the comma)
													 or $seq_with_index{"$NAME\_"}=~/(\d+)/    # to handle Tim's  >c1eru_ 3.30.1.1.4
													 or $seq_with_index{"pdb\_$NAME"}=~/(\d+)/ # to handle Sarah's pdb_xxxxx
													 or $seq_with_index{"nr\_$NAME"}=~/(\d+)/ # to handle Sarah's nr_xxxxx
			   ){
			   my $finding_position= $1-300;
			   if( $finding_position >= 0 ){   seek(DB_FASTA, $1-300, 0);  # -300 is necessary
			   }elsif($finding_position < 0){  seek(DB_FASTA, 0, 0); }      ## This is essential !!!

			   while(<DB_FASTA>){
				  if(!$seq_found1){
					if(/\> *$NAME[\,_]? *\d*/){
						  $seq_found1=1;
						  print "\n# $NAME  is found in DB, Good ";
					  }
				  }else{
					  if(/^ *(\w+) *$/ ){
					      $sequence .=$1;  ## you should use $1 to avoid including NEW line
						  unless(eof DB_FASTA){ next   ## This is critically important to prevent error.
						  }else{ goto PUT_SEQ }     ## If the last seq has only one single line seq string, it could be a problem
					  }elsif( (/^ *\> *\S+/)  or (eof DB_FASTA) ){
																							 #======= When range is defined ==================
																							 PUT_SEQ:
																							 if($R_start =~/\d+/){
																																																										 $sequence{$Keys[$e]}=substr($sequence, $R_start, $R_leng); next; #
																																																										 print "\n# $sequence{$Keys[$e]}  is put to \%sequence";
																							 }
																							 #======= To handle superfamily information given ==========
																							 if($super_fam_class){
																											 $sequence{$Keys[$e]}=$sequence;
																											 $acquired_seq_count++;
																							 }
																							 #======= When range is NOT defined ==================
																							 else{
																											 $sequence{$Keys[$e]}=$sequence;
																																																										 print "\n# $sequence  is put to \%sequence";
																							 }
																							 $R_start='';
																							 $sequence='';
																							 $seq_found1='';  ## reset $R_start, $seq_found1,,
																							 next F0;
					  }
				  }
			   }

		  }else{
			   print "\n# Error, the sequence pos for $NAME (from $Keys[$e]) in DB doesnt exist in xxxx.idx file?\n";
		  }
	   }
	   close DB_FASTA;
	}
	#print "\n# (6) fetch_sequence_from_db: counted fetched seqs: $found_seq_count, $acquired_seq_count";
	#print "\n# (7) fetch_sequence_from_db: Fetching seq has finished \n";

	return(\%sequence);
}

#_________________________________________________________________________________
# Title     : open_sso_files
# Usage     :  @sso=@{&open_sso_files(@file, $add_range, $add_range2, "u=$upper_expect_limit",
#			                            "l=$lower_expect_limit", "m=$margin", $new_format)};
# Function  : This reads the parseable( -m 10 option)
#             and non-parseable form of ssearch program output
#             If you give 5 files, it produces 5 hashes as a ref of array.
#             This understands xxxx.gz files.
#             This reads FASTA -m 10 output, too.
#             SSO means sequence search output (SSSO = ssearch seq search output)
#             FSSO=fasta seq. search output, MSSO=machine readable seq search output
# Example   :
#  717    0         0.343  16    373    EC1260_16-373              74    434    YBL6_YEAST_74-434
#  348    9e-16     0.500  113   233    EC1260_113-233             27    146    YDBG_ECOLI_27-146
#  472    2.9e-08   0.271  13    407    EC1260_13-407              148   567    YHJ9_YEAST_148-567
#  459    1.9e-22   0.260  1     407    EC1260_1-407               65    477    YLQ6_CAEEL_65-477
#  452    4.5e-14   0.275  1     407    EC1260_1-407               103   537    YSCPUT2_103-537
#  1131   0         0.433  1     407    EC1260_1-407               112   519    ZMU43082_112-519
#
#  Input SSO file example)-> below
#
#   >>MG032 ATP-dependent nuclease (addA) {Bacillus subtilis  (666 aa)
#    Z-score: 88.3 expect()  1.9
#   Smith-Waterman score: 77;  27.143% identity in 70 aa overlap
#
#           30        40        50        60        70        80
#   MJ0497 RSAGSKGVDLIAGRKGEVLIFECKTSSKTKFYINKEDIEKLISFSEIFGGKPYLAIKFNG
#                                        : .. ...  . .:.:::. :: : ..:
#   MG032  HDKVRYAFEVKFNIALVLSINKSNVDFDFDFILKTDNFSDIENFNEIFNRKPALQFRFYT
#        200       210       220       230       240       250
#
#           90       100             110       120       130
#   MJ0497 EMLFINPFLLSTNGK------NYVIDERIKAIAIDFYEVIGRGKQLKIDDLI
#          .   ::   :: ::.      : ....... . ::. . :
#   MG032  K---INVHKLSFNGSDSTYIANILLQDQFNLLEIDLNKSIYALDLENAKERFDKEFVQPL
#        260          270       280       290       300       310
#
# Parseable form -m 10 option =========================================
#   >>>MJ0497.fa, 133 aa vs GMG.fa library
#   ; pg_name: Smith-Waterman (PGopt)
#   ; pg_ver: 3.0 June, 1996
#   ; pg_matrix: BL50
#   ; pg_gap-pen: -12 -2
#   >>MG032 ATP-dependent nuclease (addA) {Bacillus subtilis
#   ; sw_score:  77
#   ; sw_z-score: 88.3
#   ; sw_expect    1.9
#   ; sw_ident: 0.271
#   ; sw_overlap: 70
#   >MJ0497 ..
#   ; sq_len: 133
#   ; sq_type: p
#   ; al_start: 58
#   ; al_stop: 121
#   ; al_display_start: 28
#
# Warning   : By default, the SW score comes to the first
#             If expect value is not found, it becomes '0'
#             By default, the offset of seq match with a seq name like seq_30-40
#               will be 30 not 1.
#             It ignores special chars like , : .prot in the name (eg, AADF_FASDF: will be AADF_FASDF)
# Keywords  : open_ssearch_output_files, ssearch_output, ssearch, FASTA,
# Options   : _  for debugging.
#             #  for debugging.
#             u= for upper E value limit
#             l= for lower E value limit
#             r  for attaching ranges to out seq names (eg> HI0001_1-20 as a key)
#             U  for making the matched seqname to upppercase
#             L  for making the matched seqname to lowercase
#             R  for attaching ranges to out seq names for both TARGET and MATCH
#             n  for new format (msp2)
#             a  for getting alignments of the pair
#
# Category  :
# Version   : 4.6
#----------------------------------------------------------------------------
sub open_sso_files{
		#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
		my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
		my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
		my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
		my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
		my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
		if($debug==1){print "\n\t\@hash=\"@hash\"
		\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
		\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
		#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		my (@out_refs, @SSO, $create_sso, $parseable, @OUT, @temp_sso_lines,
						%match, $attach_range_in_names, $margin, $uppercase_seq_name,
						$lowercase_seq_name, $target_seq, $new_format, $get_alignment,
						$pvm_version_fasta_out, $original_target_seq, $big_msp_out_file);

		my ($upper_expect_limit, $lower_expect_limit)=(50,0);

		if($char_opt=~/R/){  $attach_range_in_names2=1; };
		if($char_opt=~/r2/){ $attach_range_in_names =1; $attach_range_in_names2=1 };
		if($char_opt=~/r/){  $attach_range_in_names =1; };
		if($char_opt=~/c/){  $create_sso   ='c' };
		if($char_opt=~/n/){  $new_format   ='n' };
		if($char_opt=~/a/){  $get_alignment='a' };
		if($char_opt=~/U[pperPPER]*/){ $uppercase_seq_name='U' };
		if($char_opt=~/L[owerOWER]*/){ $lowercase_seq_name='L' };
		if($vars{'u'}=~/(\.?\d+)/){ $upper_expect_limit = $vars{'u'} };
		if($vars{'l'}=~/(\.?\d+)/){ $lower_expect_limit = $vars{'l'} };
		if($vars{'m'}=~/\d+/){ $margin = $vars{'m'} };
		$attach_range_in_names2=$attach_range_in_names=1;

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# opening file input (can handle .gz  files)
		#_______________________________________________
		if(@file < 1 and @array > 0){
				 for($i=0; $i< @array; $i++){
							@sso=@{$array[$i]};
				 }
				 print "\n# (I) \@sso has ", scalar(@sso), " lines. \n"  if $verbose;
				 if(@sso > 3000){ # if @sso is very big, I remove the useless contents
						 #print "\n# (INFO) open_sso_files: size of \@sso for $file[$i] exceeds 3000 lines, ", scalar(@sso), " !!! \n";
				 }
				 push(@OUT, &read_sso_lines(\@sso, $create_sso,  ## read_sso_lines has read_machine_readable_sso_lines and machine unreadl...
										"u=$upper_expect_limit",
										"l=$lower_expect_limit",
										$attach_range_in_names,
										$attach_range_in_names2,
										$new_format, $get_alignment) );
		}
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# Opening input FILE!
		#_______________________________________________
		else{
				 #print "\n# open_sso_files : processing @file \n\n";
				 for($i=0; $i< @file; $i++){
							if($file[$i]=~/\S+\.mspa *$/){ $big_msp_out_file=$file[$i]; splice (@file, $i, 1); $i--;
							#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
							# Opening zipped file
							#_______________________________________________
							}elsif($file[$i]=~/\S+\.\gz$/ or -B $file[$i]){  ## if file has xxxx.gz extension
									my (@sso);
									@sso=`gunzip -c $file[$i]`;
									if(@sso < 30){  @sso=`zcat $file[$i]`; }      # if zcat fails to produce output use gunzip -c
									if(@sso > 3000){ # if @sso is very big, I remove the useless contents
											print "\n# open_sso_files: size of \@sso for $file[$i] exceeds 3000 lines, ", scalar(@sso), " !!! \n";
									}
									push(@OUT, &read_sso_lines(\@sso, $create_sso,
											"u=$upper_expect_limit",
											"l=$lower_expect_limit",
											$attach_range_in_names,
											$attach_range_in_names2,
											$new_format, $get_alignment) );
							}
							#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
							# Opening plain file(not zipped)
							#_______________________________________________
							elsif($file[$i]=~/\S+\.[fsm]?sso/ or $file[$i]=~/\S+\.out/ or $file[$i]=~/\S+\.fso/){
									open(SSO, "$file[$i]") or die "\n# (ERROR) open_sso_files: Failed to open $file[$i]\n";
									my @sso=<SSO>;
									if(@sso < 30){  @sso=`zcat $file[$i]`; }      # if zcat fails to produce output use gunzip -c
									if(@sso > 3000){ # if @sso is very big, I remove the useless contents
											#print "\n# (INFO) open_sso_files: size of \@sso is for $file[$i] exceeds 3000 lines, ",
											scalar(@sso), " !!! \n";
									}
									#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
									# !!! DEFAULT !!! Calling read_sso_lines sub
									#_____________________________________________
									push(@OUT, &read_sso_lines([@sso], $create_sso,
											 "u=$upper_expect_limit",
											 "l=$lower_expect_limit",
											 $attach_range_in_names,
											 $attach_range_in_names2,
											 $new_format, $get_alignment) );
									close(SSO);
							}
				 }
		}
		#print "\n# \@OUT, in open_sso_files, has ", scalar(@OUT), " elements \n";
		return(\@OUT); # @OUT has refs of hashes  (\%xxx, \%YYY, \%XXX,,,,)
}



#________________________________________________________________________
# Title     : write_fasta
# Usage     : many argments:  $seq_hash_reference  and $output_file_name
#             takes a hash which has got names keys and sequences values.
# Function  : writes multiple seqs. in fasta format (takes one or more seq.!!)
#             This needs hash which have 'name' 'actual sequence as value'
#
#             To print out each fasta seq into each single file, use write_fasta_seq_by_seq
#             This can rename seq names
#
# Example   : &write_fasta(\%in1, \$out_file_name, \%in2, \%in3,..., );
#             << The order of the hash and scalar ref. doesn't matter. >>
# Warning   : The default output file name is 'default_out.fa' if you do not
#             specify output file name.
#             OUTput file should have xxxxx.fa or xxxx.any_ext NOT just 'xxxxx'
# Keywords  : write_fasta_file, print_fasta_file, write fasta file, fasta_write
#             show_fasta, write_sequence_fasta, write_fasta_files,
# Options   : v for STD out.
#             r for rename the sequences so that Clustalw would not complain with 10 char limit
#               so result wuld be:  0 ->ASDFASDF, 1->ASDFASFASF, 2->ADSFASDFA
# Returns   :
# Argument  :
#   $sort_seq_names=s by s  ## in writing sorted sequences are written
#
# Version   : 2.6
#--------------------------------------------------------------------
sub write_fasta{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	#$| = 1;

	my($sort_seq_names, $string, $string_leng, $na,$out_file_name_provided);
	my($output_file) ='default_out.spfa'; ### when no output file name is given, this is used
	if(@file>0){
	$output_file = $file[0];
	$out_file_name_provided=1;
	}else{ $output_file='default_out.spfa'; }

	if($char_opt=~/s/){
			$sort_seq_names='s';
	}

	for ($n=0 ; $n < @hash; $n ++){
		 my(@keys);
	 my %hash=%{$hash[$n]};
		 if($sort_seq_names){
				 @keys=sort keys %hash;
		 }else{
				 @keys= keys %hash;
		 }
	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 # When only one seq is given, use the seq name as output file
	 #________________________________________________________________
	 if(@hash==1 and @keys==1 and @file < 1){
	    $output_file="$keys[0]\.spfa";
	 }elsif(@file < 1){
	    $output_file="default_fa_$n\.spfa";
	 }

	 open (FASTAS_WRITE,">$output_file");      # $string is the seq string.

	 for ($i=0; $i < @keys; $i++){
		$na= $keys[$i];
		$string = "\U$hash{$na}";
		$string=~s/[\n \.-]//g;	    # replaces all non-chars to null. '_' is used for stop codon
		if($char_opt=~/r/){  # rename the seqeunces with '0, 1, 2, 3," etc for  clustalw
		   $na=$i;
		}

		if($debug == 1){
			print ">$na\n";
			print FASTAS_WRITE ">$na\n";
	    }elsif($char_opt=~/v/){
		    print ">$na\n";
		    print FASTAS_WRITE ">$na\n";
		}else{
		    print  FASTAS_WRITE ">$na\n";
		}

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		#  Main algorithm of writing in 60 char leng line
		#_____________________________________________________
		$string_leng=length($string);
		for($j=0; $j< $string_leng; $j+=60){
			if($debug == 1){
				printf "%.60s\n", substr($string,$j,60);
				printf FASTAS_WRITE "%.60s\n", substr($string,$j,60);
			}elsif($char_opt=~/v/i){
				printf "%.60s\n", substr($string,$j,60);
				printf FASTAS_WRITE "%.60s\n", substr($string,$j,60);
			}else{
			   printf FASTAS_WRITE "%.60s\n", substr($string,$j,60);
			}
		}
	 }
	 close FASTAS_WRITE;
	}
	if( $out_file_name_provided != 1){
	  print "\n\n# You didnt give out file name, $output_file  used\n";
	}
	if( -s $output_file ){
	 if($verbose=~/\S/){ ## if v option is given, mesg is omitted to prevent comments to a redirected output
	    print "\n# Sequences were written in  $output_file ";
	 }
	}else{
	 print "\n# The size of written outfile \"$output_file\" is 0, error \n\n"
	}
}


#_________________________________________________________________________
# Title     : merge_similar_seqlets
# Usage     : @all_seqlets = @{&merge_similar_seqlets(@all_seqlets)};
# Function  : merges seqlet sets which have identical
#             sequences and share similar regions by connection factor of 30%
#             This means, if any two seqlets from the same sequences which
#             share more than 70% seqlet regions overlapping are merged
#             This only sees the very first sequence in the seqlets line!!!
#             (so, PARTIAL MERGE !!)
# Example   : INPUT:
#
#   @input=( 'seq1_1-30 seq2_1-40 seq3_1-50',
#            'seq1_2-49 seq3_4-40 seq4_2-99'....)
#
#   @output=('seq1_1-30 seq2_1-45 seq3_2-45 seq4_2-99');
#
# Keywords  : merge_similar_sequences, merge_sequence_names, merge_sequences,
#              merge_sequence_ranges, merge_similar_sequences_with_ranges,
#              merge_seqlets, merge_duplication_modules
# Options   :
#
#   f=<digit>   for determing the factor in filtering out non-homologous
#                  regions, 7 = 70% now!!
#   l=<digit>   for seqlet(duplication module) length threshold
#   z           for activating remove_similar_sequences, rather than remove_dup....
#   S  $short_region=  S by S -S  # taking shorter region overlap in removing similar reg
#   L  $large_region=  L by L -L  # taking larger  region overlap in removing similar reg
#   A  $average_region=A by A -A  # taking average region overlap in removing similar reg
#
# Version   : 2.1
#-------------------------------------------------------------------------------
sub merge_similar_seqlets{
	 my (@all_seqlets, @result_all_seqlets, $i, $j, $k, $seq1, $start1, $end1, $seq2,
	   $smaller_leng, $start2, $end2, @split, @split1, @split2, $factor, $leng_thresh, $optimize,
			 $short_region, $large_region, $average_region, $overlapping_seq_match_size);
	 $factor=7;     #  30% sequence mismatch region is allowed(3)
	 $leng_thresh=30;
	 $optimize=1;
	 $average_region='A'; # default

	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
	 # Sorting (parsing) input to get options and input array
	 #_________________________________________________________
	 for($i=0; $i< @_; $i++){
	     if(ref($_[$i]) eq 'ARRAY'){
					 @all_seqlets=@{$_[$i]};
			 }elsif($_[$i]=~/f=(\S+)/){  $factor=$1;
			 }elsif($_[$i]=~/z/i){       $optimize=1;
			 }elsif($_[$i]=~/l=(\d+)/i){ $leng_thresh=$1;
			 }elsif($_[$i]=~/^S/){       $short_region='S';   $large_region=$average_region='';
			 }elsif($_[$i]=~/^L/){       $large_region='L';   $short_region=$average_region='';
			 }elsif($_[$i]=~/^A/){       $average_region='A'; $short_region=$large_region  =''; }
	 }
	 if(@all_seqlets==1){

			 #print "\n# (1) merge_similar_seqlets: \@all_seqlets == 1, returning\n" if $verbose;
			 #print "\n#  \@all_seqlets = @all_seqlets\n" if $verbose;
			 return(\@all_seqlets);
	 }else{
			 #print "\n# (1) merge_similar_seqlets: \@all_seqlets > 1, Processing\n" if $verbose;
			 #if( $verbose){
			 #	 for($i=0; $i< @all_seqlets; $i++){
			 #				 print "\n# $all_seqlets[$i]" ;
			 #	 }
			 #}
	 }

	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 # This is to remove which are identical in @all_seqlets;
	 #_________________________________________________________
	 F1: for($i=0; $i< @all_seqlets; $i++){
			my $merged_two_seqlet_lines;

			if($all_seqlets[$i] eq $all_seqlets[$i+1]){
	        #print "\n# (2) merge_similar_seqlets: \$all_seqlets\[$i\] equals \$all_seqlets\[$i+1\]\n" if $verbose;
					splice(@all_seqlets, $i+1, 1);
					$i-- if $i >0;
		      next F1;
	    }else{
					#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
					# @split1 and 2 are arrays from different string entry in @all_seqlets
					#______________________________________________________________________
					@split1=sort split(/ +/, $all_seqlets[$i]);
					@split2=sort split(/ +/, $all_seqlets[$i+1]);
			}

	    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
	    #   (3) If the first elements of @split1 and 2 are identical, lets merge the two arrays
	    #________________________________________________________________________________
			if($split1[0] eq $split2[0] or $split1[0] eq $split2[1] or $split1[0] eq $split2[2]){
	        #print "\n# (3) \$split1[0] and \$split2[0] are identical, good!\n" if $verbose;
		      @split=(@split1, @split2);
		      if(1){ #~~~~~ optimize option removes similar seqlets
						 $all_seqlets[$i]= join(' ', sort @{&remove_similar_seqlets(\@split,
			                              $short_region, $large_region, $average_region)} );
		      }else{
						 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
						 # Only removes exactly identical ones
						 #__________________________________________________________
						 $all_seqlets[$i]=  join(' ', @{&remove_dup_in_array(\@split, 's')} );
		      }
					#print "\n# (3) New \$all_seqlets\[\$i\] is \n$all_seqlets[$i]\n" if $verbose;
		      splice(@all_seqlets, $i+1, 1);
		      $i-- if $i >0;
		      next F1;
	    }

	    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			# (4) If the first elements of @split1 and 2 are NOT identical, lets check the sequence ranges
	    #_____________________________________________________________________________________________
			#if($verbose){
			#		print "\n# (4) merge_similar_seqlets: the first elements of \@split1 and \@split2 are NOT identical\n";
			#		print "# (4) >>>>>>>>> Checking the regions of them, \$factor= $factor\n";
			#}

			F2: for($j=0; $j < @split1; $j++){
					if($split1[$j] =~/^ *(\S+)_(\d+)\-(\d+)/){
							 my ($seq1, $start1, $end1)=($1, $2, $3);

							 F3: for($k=0; $k<@split2; $k++){
									 if($split2[$k] =~/(\S+)_(\d+)\-(\d+)/){
											 my($seq2, $start2, $end2)=($1, $2, $3);

											 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~````
											 # Check if the seqs are identicl (from the two arrays), no point to merge which are not identical from the first
											 #__________________________________________________________________________________________
											 if($seq1 eq $seq2){
														$diff_start=abs($start1-$start2);
														$diff_end  =abs($end1  -$end2  );
														$leng1=$end1-$start1;
														$leng2=$end2-$start2;
														if($leng1 >= $leng2){
																$smaller_leng=$leng2;
																$larger_leng =$leng1
														}else{
																$smaller_leng=$leng1;
																$larger_leng =$leng2
														}

														#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
														# Checking the minimal seq region leng here
														#______________________________________________________
														if($smaller_leng < $leng_thresh){ next }

														$overlapping_seq_match_size=${&get_overlapping_seq_match_size($start1, $end1, $start2, $end2)};
														$averge_seq_leng_of_2_seqs=($leng1+$leng2)/2;

														#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
														# This is the critically important part
														#_______________________________________________________________
														if($average_region){
																$finally_adjusted_seq_leng=$averge_seq_leng_of_2_seqs*($factor/10);
														}elsif($short_region){
																$finally_adjusted_seq_leng=$smaller_leng*($factor/10);
														}elsif($large_region){
																$finally_adjusted_seq_leng=$larger_leng*($factor/10);
														}

														#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
														# Now let's check if we regard them homologous or not\
														#_______________________________________________________
														if( $overlapping_seq_match_size >=  $finally_adjusted_seq_leng){

																@split= (@split1, @split2);
																#if ($verbose){
																#		print "# (5) merge_similar_seqlets: \$averge_seq_leng_of_2_seqs=$averge_seq_leng_of_2_seqs";
																#		print " $$$$ Merging occurs with \n@split  \n";
																#}
																if($optimize){ #~~~~~ $optimize option removes similar seqlets
																		$all_seqlets[$i]= join(' ', sort @{&remove_similar_seqlets(\@split,
			                                  $short_region, $large_region, $average_region)} );
																}else{
																		$all_seqlets[$i]= join(' ', @{&remove_dup_in_array(\@split, 's')} );
																}
																$merged_two_seqlet_lines=1;
																splice(@all_seqlets, $i+1, 1);
																$i-- if $i >0;
																next F1;

														}
														#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
														# We believe they are not homologous
														#____________________________________________
														else{
																#if ($verbose){
																#		print "\n# (5) merge_similar_seqlets !! MERGING DID NOT occur->
																#		$split1[$j] $split2[$k] \n";
																#		print "\n#   \$averge_seq_leng_of_2_seqs= $averge_seq_leng_of_2_seqs, \$overlapping_seq_match_size= $overlapping_seq_match_size \$finally_adjusted_seq_leng= $finally_adjusted_seq_leng\n";
																#		print "\n# (5) merge_similar_seqlets, \$all_seqlets[$i]\$all_seqlets[$i+1]\n$all_seqlets[$i]\n$all_seqlets[$i+1]\n";
																#}
																next F3;
														}
											 }
									 }
									 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
									 # If there is no range (region) in seq naem, let's skip, as there is no way to check
									 #__________________________________________________________________________________
									 else{ # when split2 does not match xxx_10-20 format
											 next;
									 }
							 }
					}else{ # when split1 does not match xxx_10-20 format
							 next;
					}
			}
			unless($merged_two_seqlet_lines){

			}
	 }
	 #print "\n# \@all_seqlets is  @all_seqlets =======\n" if $verbose;
	 return(\@all_seqlets);
}


#__________________________________________________________________________
# Title     : show_subclusterings
# Usage     : &show_subclusterings(\@out);
# Function  : This is the very final sub of divclus.pl
# Example   : @temp_show_sub=&show_subclusterings(\@out, $file, $sat_file, $dindom, $indup);
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  : print_subclusterings, sum_subclusterings, write_subclustering
#             show_clusterings, display_subclusterings
# Options   :
#             f  for file output, eg: xxxxxxx.sat
# Category  :
# Version   : 2.9
#-------------------------------------------------------------------------
sub show_subclusterings{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	my ($max_size, $sat_file_name, $clu_file_name,
	$ori_cluster_size, $ori_cluster_num, $good_bad, @keys, $percentage_fac,
	$indup, @sizes, $sum_seq_num, $indup_percent, $indup_count, %tem4,
	@sub_clustering_out_files);  # clusall_1e-5_clu_14-324_ss.sat
	my @out=@{$array[0]};
	$indup_count=0;

	if($char_opt=~/d/){	    $dindom=1;	}
	if($char_opt=~/i/){		$indup=1;	}
	if($vars{'f'}=~/\S+/){     $factor= $vars{'f'}; }
	if($vars{'p'}=~/\d+/){ $percentage_fac= int($vars{'p'}); }
	if($vars{'s'}=~/\d+/){	   $score = $vars{'s'};	}
	if($vars{'e'}=~/\d+/){	   $evalue= $vars{'e'};	}

	print "\n# (1) show_subclusterings : \@file has : @file";
    if( $file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.mspa/  or
		$file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.sat/   ){
         $ori_cluster_size=$2;
         $ori_cluster_num =$3;
         $base=$1;
		 $sat_file_name="$base\.sat";
         $clu_file_name="$base\.clu";
	}else{
         $ori_cluster_size="0000";
	     $ori_cluster_num ="0000";
	     $base=${&get_base_names($file[0])};
	     $clu_file_name="$base\.clu";
		 warn "\n# (2) LINE:",__LINE__," WARN: the \@file input to show_subclusterings is not the right format, dying\n";
		 warn "     Sarah!, right format looks like: 13-234.mspa or 8-420_cluster.mspa \n";
	}

	open(CLU, ">$clu_file_name") || die "\n# (ERROR) show_subclusterings failed miserably to CREATE \"$clu_file_name\" \n";
	push(@sub_clustering_out_files, $clu_file_name);


	@out=@{&sort_string_by_length(\@out)};

	for($i=0; $i< @out; $i++){ # @out has ( 'YAL054C_98-695 YBR041W_90-617', 'YBR115C_230-842 YBR222C_16-537 YER015W_121-686', etc)
	   my $count+=$i+1;
	   my ( $int_dup_number, $sub_clu_size, $seq_with_range, @sp, $new_clus_NAME,
	        %tem, %tem2, %tem3, $j, @keys, $num_seq);
	   if($out[$i]=~/^ *$/){ next }
	   @sp=sort split(/ +/, $out[$i]);

	   for($j=0; $j < @sp; $j++){
		  $seq_with_range=$sp[$j];
		  if($seq_with_range=~/^((\S+)_((\d+)\-(\d+)))/){
			 $tem{$2}++;
			 $tem2{$2}.=sprintf("%-15s ", $1);
			 $tem3{$2} =$3;
			 $tem4{$2} =$5-$4;
		  }
	   }

	   @keys=sort keys %tem;
	   $num_seq=$sub_clu_size=@keys;

	   if($max_size < $sub_clu_size){
		  $max_size=$sub_clu_size; ## This is to collect the sizes of clusters to see if it is good.
	   }
	   $indup_count= &print_summary_for_divclus(
		         $count, \%tem2, \%tem,
		         $ori_cluster_num,
		         $ori_cluster_size,
		         $dindom,
		         $clu_file_name,
								 \%tem3, \%tem4,
								 $indup, );

					 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
					 # Local subroutine
					 #_______________________________________________________________
	   sub print_summary_for_divclus{ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
							 my(@keys, $indup_count, $x, $m, $percentage_fac);
							 my $count=$_[0]; # count of cluster
	       my %tem2=%{$_[1]};	my $num_seq=@keys=sort keys %tem2;
	       my %tem=%{$_[2]};	my $ori_cluster_num=$_[3];
	       my $new_clus_NAME=$ori_cluster_num.'0'.$count.'0'.$num_seq;
	       my $ori_cluster_size=$_[4];
	       my $dindom=$_[5];	my %tem3=%{$_[7]};
	       my $indup=$_[9];	my (%internal_dup);
	       my %tem4=%{$_[8]};
							 #~~~~~~~~~~ Domain Inside Domain ~~~~~~~~~~~~~~~~~
	       if($dindom){
	          for($x=0; $x <@keys; $x++){
											 @domain_inside_domain=@{&get_domain_inside_domain($tem2{$keys[$x]})};
											 @domain_inside_domain=@{&remove_dup_in_array(\@domain_inside_domain)};
											 for($m=0; $m< @domain_inside_domain; $m++){ print "  # Dindom: $m : $domain_inside_domain[$m]\n";   }
											 print "\n";
		  }
							 }
							 #==========================================================================================

	       #~~~~~~~~~~ Internal duplication  ~~~~~~~~~~~~~~
	       if($indup==1){
		   # @keys is the same as sub cluster size,
		   for($x=0; $x < @keys; $x++){
														 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
														 # Checks each sequence for duplication
														 #___________________________________________________
														 my %internal_dup=%{&get_internal_dup_in_a_cluster( $tem2{$keys[$x]} )};
														 my @dup_keys=keys %internal_dup;
														 if(@dup_keys > 0){
																		 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
																		 #  This calculates the actual duplicated number rather than jus tthe sequences
																		 #______________________________________________________________________________
																		 $indup_count++;
																		 printf ("%-14s %-12s %-4s", $keys[$x], $new_clus_NAME, $num_seq);
																		 for($m=0; $m< @dup_keys; $m++){
																						 printf ("%-19s=> %s\n", $dup_keys[$m], $internal_dup{ $dup_keys[$m] } );
																		 }
														 }
										}
								 }

								#~~~~~~~~~~ Summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~
								print  CLU  "Cluster size $num_seq\n";
																				printf CLU ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
																					$new_clus_NAME, $evalue, $factor, $percentage_fac,
																					$ori_cluster_size, $num_seq, $ori_cluster_num);
								print       "Cluster size $num_seq\n";
								printf     ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
															$new_clus_NAME, $evalue, $factor, $percentage_fac,
															$ori_cluster_size, $num_seq, $ori_cluster_num);
								for($x=0; $x <@keys; $x++){
									 printf CLU ("   %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
															 $num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
									 printf ("   %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
													$num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
								}
								return($indup_count);
	   }
	}
		close(CLU); ## this is a bug fix

	if($max_size == $ori_cluster_size){   $good_bad=1;
	}else{	                              $good_bad=0;	}

    print "\n# Sarah, Do you think the subclusterings are O.K.?" if $verbose;
    print "\n#   Tell me, if you feel suspicious, jong\@salts.med.harvard.edu\n\n" if $verbose;
    return($good_bad, $indup_count, $ori_cluster_size, \@sub_clustering_out_files);
}



#______________________________________________________________
# Title     : cluster_merged_seqlet_sets
# Usage     : @out=@{&cluster_merged_seqlet_sets(\@lines)};
# Function  :
# Example   :
# Warning   :
# Keywords  :
# Options   :
#  $short_region=  S by S -S  # taking shorter region overlapped in removing similar regions
#  $large_region=  L by L -L  # taking larger  region overlapped in removing similar regions
#  $average_region=A by A -A # taking average region overlapped in removing similar regions
#
# Version   : 1.8
#--------------------------------------------------------------
sub cluster_merged_seqlet_sets{
	 #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	 my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	 my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	 my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	 my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	 my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	 if($debug==1){print "\n\t\@hash=\"@hash\"
	 \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	 \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	 #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 my ($optimize, @splited1, @splited2, $verbose, $link_or_not);
	 my @seq_names_in_clu=@{$array[0]};
	 $link_or_not=0;
	 my $factor=7.5; # 7 means 70% now

	 if($vars{'f'}=~/(\S+)$/){ $factor=$1 }
	 if($char_opt=~/o/){ $optimize=1 }
	 if($char_opt=~/S/){ $short_region='S'; }
	 if($char_opt=~/L/){ $large_region='L';   }
	 if($char_opt=~/A/){ $average_region='A'; }
	 if($char_opt=~/v/){ $verbose=1 }

	 #if($verbose){ print "\n# (1) cluster_merged_seqlet_sets: Checking linkage and merging <<<<<>>>>>\n@seq_names_in_clu\n";   }

	 F1: for($i=0; $i< @seq_names_in_clu; $i++){
			@splited1=split(/ +/, $seq_names_in_clu[$i]);
			for($j=0; $j< @seq_names_in_clu; $j++){
				if($seq_names_in_clu[$i] eq $seq_names_in_clu[$j]){ next  }
				@splited2=split(/ +/, $seq_names_in_clu[$j]);

				$link_or_not=${&check_linkage_of_2_similar_seqlet_sets(\@splited1, \@splited2, "f=$factor")};
				print "\n +++++ \$link_or_not is  $link_or_not +++" if $verbose;
				if($link_or_not==1){
						 if($verbose){
								 print "\n# (2) cluster_merged_seqlet_sets: \n $seq_names_in_clu[$i]  \n and $seq_names_in_clu[$j] \n are linked \n";
						 }

						 if($optimize){ ##---- This will also remove similar seqlets, not only identical ones
								$seq_names_in_clu[$i]=join(' ', sort @{&remove_similar_seqlets( [@splited1, @splited2],
																						$short_region, $large_region, $average_region)} );
						 }else{
								$seq_names_in_clu[$i]=join(' ', sort @{&remove_dup_in_array( [@splited1, @splited2])} );
						 }
						 splice(@seq_names_in_clu, $j,1);
						 $j--; $i--;
						 next F1;
		 }
	  }
	 }
	 return(\@seq_names_in_clu);
}

#__________________________________________________________________________
# Title     : merge_sequence_in_mspa_chunk
# Usage     :
# Function  : merges sequences which are linked by common regions
#             This filters the sequences by evalue and ssearch score
#             This is the main algorithm of merging similar sequences.
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  : connect_sequence_in_msp, link_sequence_in_msp_chunk
#             connect_sequence_in_msp_chunk, link_sequence_in_msp
#             merge_sequence, link_sequence, connect_sequence
# Options   : _  for debugging.
#             #  for debugging.
#             m  for merge file output format (.mrg)
#             t= for threshold of seqlet length eg)  "t=30"
#             f= for overlap factor (usually between 2 to 7 )
#                 2 means, if the two regions are not overlapped
#                  by more than HALF of of the smaller region
#                  it will not regard as common seqlet block
#             s= for ssearch score minimum
#             E= for ssearch e value maximum
#             S  for S -S  # taking shorter region overlapped in removing similar regions
#             L  for L -L  # taking larger  region overlapped in removing similar regions
#             A  for A -A # taking average region overlapped in removing similar regions
#
# Returns   :
# Argument  :
# Thanks    : Alexey Eroshkin <alexey@axyspharm.com>
# Version   : 2.7
#--------------------------------------------------------------
sub merge_sequence_in_mspa_chunk{
	 #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	 my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	 my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	 my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	 my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	 my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	 if($debug==1){print "\n\t\@hash=\"@hash\"
	 \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	 \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	 #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 my ($ssearch_score2, $evalue_found2, $evalue_found1, $ssearch_score1, $optimize );
	 my ($L, %out_hash, @out, $LL, @Final_out, $verbose, $final_factor, $R_diff, @seqlets,
			 $short_region, $large_region, $average_region, $factor, $score, $Evalue_thresh, $length_thresh);
	 $factor =4; # default factor for around 30% sequence mis-overlap is the threshold for common block
	 #~~~~~~~~~~~~~~ The lower the factor the larger clustering will occur ~~~~~~~~~~~~
	 $score  =75; # default ssearch score. seq below this will be chucked out
	 $Evalue_thresh =40; # default maximum e value used. Seq higher than this will be thrown out
	 $length_thresh =30; # sequence length threshold. overlap less than this will be ignored

	 if($char_opt=~/v/){     $verbose = 'v'
	 }if($char_opt=~/z/){    $optimize = 'z'
	 }if($char_opt=~/S/){    $short_region='S';
	 }if($char_opt=~/L/){	   $large_region='L';
	 }if($char_opt=~/A/){	   $average_region='A'; }

	 if($vars{'T'}=~/\d+/){
			$length_thresh=$vars{'T'};# print "\n# merge_sequence_in_mspa_chunk: Thresh is $length_thresh\n" if (defined $verbose);
	 }if($vars{'f'}=~/\S+/){
	    $factor=$vars{'f'}; # print "\n# merge_sequence_in_mspa_chunk: Factor is $factor\n" if (defined $verbose);
	 }if($vars{'s'}=~/\d+/){
		  $score = $vars{'s'}; # print "\n# merge_sequence_in_mspa_chunk: Score is $score\n" if (defined $verbose);
	 }if($vars{'E'}=~/\S+/){
	    $Evalue_thresh= $vars{'E'}; # print "\n# merge_sequence_in_mspa_chunk: Evalue is $Evalue_thresh\n" if (defined $verbose);
	 }

	 @seqlets=split(/\n+/, (${$_[0]} || $_[0]) );

	 F1: for($i=0; $i < @seqlets; $i ++){
			if($seqlets[$i]=~/^ *((\S+) +(\d+\.?[e\-\d]*) +(\d+) +(\d+) +(\S+) +(\d+) +(\d+)) +(\S+) *(.*)/){
		     if($6 eq $9){ splice(@seqlets, $i, 1); $i--; next };
				 ($long_match1, $enq_seq1, $mat_seq1, $R_start1, $R_end1 )=($1, $6, $9, $4, $5);
				 $R_leng1=$R_end1-$R_start1;  $ssearch_score1= $2;  $evalue_found1 = $3;
	    }
	    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	    # Following lines are disabled as I believe seqlets have been checked in previous sub
			#________________________________________________________________________________________________
	    if( ($R_leng1 < $length_thresh) || ($ssearch_score1 < $score) ){ splice(@seqlets, $i, 1); $i--; next; }
	    if( $evalue_found1 > $Evalue_thresh){ splice(@seqlets, $i, 1); $i--; next; }

			F2: for($j=0; $j < @seqlets; $j ++){
		     if($seqlets[$i] eq $seqlets[$j]){ next };
		     if($seqlets[$j]=~/^ *((\S+) +(\d+\.?[e\-\d]*) +(\d+) +(\d+) +(\S+) +(\d+) +(\d+)) +(\S+) *(.*)/){
			      ($long_match2, $enq_seq2, $mat_seq2, $R_start2, $R_end2)=($1, $6, $9, $4, $5);
			      $R_leng2=$R_end2-$R_start2;			$ssearch_score2=$2;			$evalue_found2= $3;
	       }

				 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				 # Following lines are disabled as I believe seqlets have been checked in previous sub
				 #________________________________________________________________________________________________
		     if( ($R_leng2 < $length_thresh)||($ssearch_score2 < $score) ){ splice(@seqlets, $j, 1); $j--; next; }
		     if( $evalue_found2 > $Evalue_thresh){ splice(@seqlets, $j, 1); $j--; next; }

		     $R_diff=abs($R_leng1-$R_leng2)/2;   ## <<<---- Note it is div by 2

		     if($R_leng2 < $R_leng1){ $smaller_leng=$R_leng2; }else{ $smaller_leng=$R_leng1; }

		     $Start_diff=abs($R_start1-$R_start2)/2; ## <<<---- Note it is div by 2
		     $final_factor=$smaller_leng/$factor;


				 #~~~~~~~~~~ If average R_diff and average Start_diff are less then 1/7 of the smaller seqlet
				 #~~~~~~~~~~ we regard they are same selqets
				 if(( $R_diff < $final_factor ) &&       ### $Start_diff is essential!
					  ($Start_diff < $final_factor ) ){  ### if diff is less than around 30% of the smaller length
					  if($R_leng2 >= $R_leng1){
							 #~~~~~ $mat_seq1 or $mat_seq2 can increase to 'slr1453,sll0238', so you need ',' in the middle only
							 $extended_name="$mat_seq2,$mat_seq1";
							 $L=length($extended_name);
							 $LL=length($long_match2)+2;
							 $seqlets[$i]= sprintf("%-${LL}s %-${L}s", $long_match2, $extended_name);
							 splice(@seqlets, $j, 1);
							 $i-- unless($i==0);
							 $j--;
							 next F1;
					  }elsif( $R_leng1 >= $R_leng2){  ## chooses the bigger range seq
							 $extended_name="$mat_seq1,$mat_seq2"; # must be ',' not ' '
							 $L=length($extended_name);
							 $LL=length($long_match1)+2;
							 $seqlets[$i]=sprintf("%-${LL}s %-${L}s", $long_match1, $extended_name);
							 splice(@seqlets, $j, 1);
							 $i-- unless($i <= 0);
							 $j--;
							 next F1;
					  }
	       }else{
			      next F2;
		     }
	    }
	 }
	 if($char_opt=~/m/){
			for($i=0; $i< @seqlets; $i++){
				 if($seqlets[$i]=~/^ *\S+ +\d+\.?[e\-\d]* +\d+ +\d+ +(\S+) +\d+ +\d+ +(\S+) *$/){
						if($1 eq $2){ next }
						$leading_seq=$1; $long=$2; $long=~s/\,/ /g;
						push(@Final_out, "$leading_seq $long" );
				 }
			}
	 }
	 @Final_out=sort @Final_out;
	 #print "\n\n\n# \@Final_out\n@Final_out \n=================\n " if $verbose;
	 return(\@Final_out);
}


#______________________________________________________________________________
# Title     : create_sorted_cluster
# Usage     : &create_sorted_cluster
# Function  : to make a "sorted_cluster_file" from the .clu files in a directory
# Example   :
# Keywords  : make_cluster_file, sort_clu_files
# Options   :
# Author    : Sarah A. Teichmann, modified by Jong
# Date      : 19th September 1997
# Version   : 1.8
#--------------------------------------------------------------------------------
sub create_sorted_cluster{
		my ($i, $q, $p, $n, $j, $clufile, @filecontent, $new_gene, $cluster_size,
				@clufiles, @cluster_sizes_new_unsorted, @cluster_sizes_new,
				$newclus_number, %hash);

		@clufiles=@{$_[0]};
		if(@clufiles < 1 ){
				@clufiles=@{&read_file_names_only('.','.clu')};
				print "\n# $0, create_sorted_cluster: \@_ is empty, reading PWD to get xxx.clu files\n";
				if(@clufiles < 1){
					 print "\n# $0, create_sorted_cluster: I couldn\'t find any clu files, dying\n";
					 die;
				}
		}
		if($_[1]){  $good_cluster_file=${$_[1]}; }

		print "\n# (1) create_sorted_cluster: found \@clufiles are
								@clufiles\n";

		for ($i=0; $i < @clufiles; $i++) {
				 open (CLU_FILE, "<$clufiles[$i]") or
						die "\n# $0: create_sorted_cluster: error opening $clufiles[$i]";

				 my $cluster_size;
				 while(<CLU_FILE>){
						 if( /^ *Cluster +size +(\d+)/i){
								 $cluster_size=$1;
								 $hash{$cluster_size} .=$_;
						 }elsif (/^ *Cluster +[number]* *\d+/i) {
								 $hash{$cluster_size} .=$_;
						 }elsif (/^ *\d+ +\d+ +\S+/) {
								 $hash{$cluster_size} .=$_;
						 }
				}
	 }
	 close(CLU_FILE);
	 @sorted_by_size=sort { $a<=>$b } keys %hash;
	 $good_cluster_file2="sorted_cluster_file\.gclu";

	 open(GOODCLUS, ">$good_cluster_file") || die "\n# $0 create_sorted_cluster: I can not open $good_cluster_file\n";
	 open(GOODCLUS2, ">$good_cluster_file2") || die "\n# $0 create_sorted_cluster: I can not open $good_cluster_file\n";
	 for($i=0; $i< @sorted_by_size; $i++){
			 print GOODCLUS $hash{$sorted_by_size[$i]};
			 print GOODCLUS2 $hash{$sorted_by_size[$i]};
	 }
	 close(GOODCLUS);
	 close(GOODCLUS2);
	 return(\$good_cluster_file);
}

#________________________________________________________________________
# Title     : show_hash
# Usage     : &show_hash(\@input_array);
# Function  : for debugging purpose. Shows any array elem line by line.
#             the line is 60 elements long (uses recursion)
# Example   : Output:      item1
#             Output:      item2
#             Output:      item3
# Warning   : There is a global variable:  $show_hash_option
#             It tries to detect any given sting which is joined by ','
# Keywords  :
# Options   : -s or -S or s or S for spaced output. Eg)
#             seq1       1 1 1 1 1 1 1 1 1 1 1 1
#
#             instead of
#             seq1       111111111111
#
#             -h or -H or h or H for horizontal line of '---------...'
#
# Returns   :
# Argument  :
# Version   : 1.7
#--------------------------------------------------------------------
sub show_hash{
	my($k, $i, $t, @in2, $in, $LEN, %TEM ); ## You should not put $show_hash_option
	my(@in)=@_;                     ## and $horizontal_line  in my !!!
	my($KL)=2; # default keys string length;
	my($VL)=80; # default values string length;
	my($GAP)=2;  # default space between keys and values
	my($horizontal_line, $show_hash_optionXX, $Hash_counter, @line);

	## This is to get the option of 'space' to make spaced output.
	for($t=0; $t < @in; $t++){
	 if($in[$t] =~/^[-]+[sS][pace]*$/){
		 $show_hash_optionXX = 1;
		 splice(@in, $t, 1);
	 }elsif(${in[$t]} =~/^[-]+[sS][pace]*$/){
		 $show_hash_optionXX = 1;
		 splice(@in, $t, 1);
	 }elsif($in[$t] =~/^[-]+[hH][rR]*$/){
		 $horizontal_line = 1;
		 splice(@in, $t, 1);
	 }elsif(${in[$t]} =~/^[-]+[hH][rR]*$/){
		 $horizontal_line = 1;
		 splice(@in, $t, 1);
	 }
	}

	######## Main loop #################
	if($horizontal_line ==1){  ## This puts the delimiter '--------------(  )'
	  $Hash_counter ++;
	  print "\n","-"x78,"(${Hash_counter}th hash)", "\n";
	}

	for($k=0; $k < @in; $k++){
	 if(ref($in[$k]) eq 'ARRAY'){  ### When the hashes were given in array ref.
		 &show_hash(@{$in[$k]}, $show_hash_optionXX, $horizontal_line);
		 print "\n";
	 }
	 elsif(ref($in[$k]) eq 'HASH'){  ### recursion
		 &show_hash(%{$in[$k]});
		 print "\n";
	 }
	 elsif(ref($in[$k+1]) eq 'HASH'){  ### recursion
		 &show_hash(%{$in[$k+1]}); print "\n";
	 }
	 elsif(ref($in[$k]) eq 'SCALAR'){  print ${$_[$k]}, "\n";  }
	 elsif( !ref($in[$k]) ){
		 if( !ref($in[$k+1]) && defined($in[$k+1])  ){
			 if($show_hash_optionXX == 1){  #### space option checking.
				#if($in[$k+1] =~ /\,.+\,/){  #### if the string is joined with ','
				#	 @line = split(/\,/, $_[$k+1]);
				# }else{
				#	 @line = split(//, $_[$k+1]);
				# }
				%TEM = @in;
				$LEN = ${&max_elem_string_array_show_hash(keys %TEM)};
				 if($LEN > $KL){ $KL = $LEN + $GAP +2};
				 printf ("%-${KL}s ", $in[$k]);  $k++;
				 printf ("%-${VL}s\n","@line");
			 }else{                        ### If not option is set, just write
				%TEM = @in;
				$LEN = ${&max_elem_string_array_show_hash( keys %TEM)};
				 if($LEN > $KL){ $KL = $LEN + $GAP +2};
				 printf ("%-${KL}s ", $in[$k]);  $k++; # print $in[$k], "\t";  $k++;
				 printf ("%-${VL}s\n",$in[$k]);        # print $in[$k], "\n";
			 }
		 }
		  #________________________________________________________
		  # Title    : max_elem_string_array_show_hash
		  # Keywords : largest string length of array
		  # Function : gets the largest string length of element of any array of numbers.
		  # Usage    : ($out1, $out2)=@{&max_elem_array(\@array1, \@array2)};
		  #            ($out1)       =${&max_elem_array(\@array1)          };
		  # Argument : numerical arrays
		  # returns  : one or more ref. for scalar numbers.
		  # Version  : 1.1
		  #-------------------------------------------------------
		  sub max_elem_string_array_show_hash{
			 my(@input, $i, $max_elem);
			 @input = @{$_[0]} || @_;
			 for($i=0; $i< @input ; $i++){
					$max_elem = length($input[0]);
					if (length($input[$i]) > $max_elem){
						$max_elem = length($input[$i]);
					}
			 }
			 \$max_elem;
		  }
		  #####################################insert_gaps_in_seq_hash
	 }
	}
}

#_________________________________________________________________________________
# Title     : read_sso_lines
# Usage     : &read_sso_lines([@sso], $create_sso, $attach_range_in_names,
#                 $attach_range_in_names2, $new_format, $get_alignment) );
# Function  : Main subroutine for open_sso_files. This calls either machine
#              readable or unreadable form parsing subroutine
# Example   :
# Keywords  : read_sso_lines_in_array
# Options   : a c r r2 n
#             u= for upper E value limit
#             l= for lower E value limit
# Category  :
# Version   : 1.5
#----------------------------------------------------------------------------
sub read_sso_lines{
		my (@out_refs, $parseable, @SSO, $create_sso, $i, $j, $k, $attach_range_in_names);
		my ($upper_expect_limit, $lower_expect_limit)=(50,0); ##<<--- DEFAULT

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~```
		# Processing input and options, don't worry!
		#__________________________________________
		for($i=0; $i< @_; $i++){
			 if($_[$i]=~/u=(\S+)/){    $upper_expect_limit=$1 }
			 elsif(ref($_[$i]) eq 'ARRAY'){ @SSO=@{$_[$i]};   }
			 elsif($_[$i]=~/l=(\S+)/){ $lower_expect_limit=$1 }
			 elsif($_[$i]=~/^c$/){     $create_sso = 'c' }
			 elsif($_[$i]=~/^a$/){     $get_alignment='a'; }
			 elsif($_[$i]=~/^r$/){   $attach_range_in_names='r' }
			 elsif($_[$i]=~/^r2$/){   $attach_range_in_names2='r2' }
			 elsif($_[$i]=~/^n$/){   $new_format='n' }
		}
		#print "\n# (INFO) \$attach_range_in_names2 is $attach_range_in_names2" if $attach_range_in_names2;

		#~~~~~~ Checking if sso is a parseable form or not~~~~~~~~~~~~~
		TEMP:for($k=0; $k < @SSO; $k++){
					if($SSO[$k] =~ /\>\>\>/  or $SSO[$k] =~ /^ *\; \S+\:/ ){
							$parseable++;  if($parseable >= 8){  last TEMP;     }
					}elsif($SSO[$k]=~/^  +\:+/){ $parseable--;
					}elsif($SSO[$k] =~ /^ +1\>\>\>(\S+)/){ $pvm_version_fasta_out=1; $parseable +=10; $original_target_seq=$1; last TEMP;
					}
		}
		if($parseable >= 8){ ## 8 is critically important as some do not have matche seq in FASTA, SSEARCH
		    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		    # Machine READABLE  MSSO style file input(SSEARCH, FASTA result)
				#_________________________________________________________________
				@out_refs=@{&read_machine_readable_sso_lines(\@SSO,
								"u=$upper_expect_limit",
								"l=$lower_expect_limit",
								$get_alignment, $create_sso,
								$new_format, $attach_range_in_names,
								$attach_range_in_names2)};
		}else{
		    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		    # Machine UNREADABLE  MSSO style file input(SSEARCH, FASTA result)
				#_________________________________________________________________
				@out_refs=@{&read_machine_unreadable_sso_lines(\@SSO,
								$get_alignment, $create_sso,
								"u=$upper_expect_limit",
								"l=$lower_expect_limit",
								$new_format, $attach_range_in_names,
								$attach_range_in_names2)};
		}
		return(@out_refs);
}



#________________________________________________________________________________
# Title     : read_machine_unreadable_sso_lines
# Usage     : @out_refs=@{&read_machine_unreadable_sso_lines(\@SSO, $get_alignment,
#                           $create_sso, $upper_expect_limit,$new_format, $lower_expect_limit,
#                           $attach_range_in_names, $attach_range_in_names2)};
# Function  :
# Example   : output will look=>
#  ZFH1_DROME 60 d1ad3a_ 446 d1ad3a_ 0.9 71 34 3-37 253-287
#  ZFH1_DROME 60 d1ahdp_ 68 d1ahdp_ 0.00018 100 56 2-58 3-59
#  ZFH1_DROME 60 d1crka2 282 d1crka2 8.4 58 50 5-55 73-123
#  ZFH1_DROME 60 d1dkza_ 215 d1dkza_ 4.9 59 40 1-41 112-152
#  ZFH1_DROME 60 d1ecra_ 305 d1ecra_ 3.2 63 47 9-56 201-248
#
# Keywords  : read_normal_sso_lines
# Options   : a c r r2 n
#             u= for upper E value limit
#             l= for lower E value limit
# Version   : 1.3
#--------------------------------------------------------------------------------
sub read_machine_unreadable_sso_lines{
	 my ($upper_expect_limit, $lower_expect_limit)=(50,0);
	 my (@SSO, @out_refs, $match_seq, $match_evalue, $alignment_found,
			 $match_zscore, $query_seq_len, $space, %match, $entry_found,
			 $match_gap_len, $query_seq_name, $i, $j, $sw_score, $match_evalue,
			 $sw_ident, $query_seg_start, $query_seg_end, $match_seg_start,
			 $match_seg_end, $query_gap_len, $match_gap_len, $overlap,
			 $gap_start, $start, $match_range);
	 for($i=0; $i< @_; $i++){
			 if($_[$i]=~/u=(\S+)/){    $upper_expect_limit=$1 }
			 elsif(ref($_[$i]) eq 'ARRAY'){ @SSO=@{$_[$i]};   }
			 elsif($_[$i]=~/l=(\S+)/){ $lower_expect_limit=$1 }
			 elsif($_[$i]=~/^c$/){     $create_sso = 'c' }
			 elsif($_[$i]=~/^a$/){     $get_alignment='a'; }
			 elsif($_[$i]=~/^r$/){   $attach_range_in_names='r' }
			 elsif($_[$i]=~/^r2$/){   $attach_range_in_names2='r2' }
			 elsif($_[$i]=~/^n$/){   $new_format='n' }
	 }

	 print "\n# (i)read_machine_unreadable_sso_lines : You have put non-parseable format of xxxx.sso\n";
	 print "#      Did you set \'M\' option in do_sequence_search? \n";

	 for($j=4; $j< @SSO; $j++){
			 if($SSO[$j]=~/^ *\S+\: +(\d+) +\w+ *$/){ ## matching 'hbox.fa: 60 aa'
						$query_seq_len=$1;
						print "\n (INFO) target seq len is  $query_seq_len \n";

			 }elsif($query_seq_len and $SSO[$j]=~/^ \>(\w[\w\-\.\/\\]+)/){ ##  matching  >MJ0497
						$query_seq_name=$1;
						$j+=4; ## jumping to skip the stat bars
						print "\n# (i)read_machine_unreadable_sso_lines : Found Query seq=> $query_seq_name ";
						next;
			 }

			 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			 # Matching the Match sequences
			 #_________________________________________________
			 if($query_seq_name and $SSO[$j]=~/^ {0,4}\>\> *(\S+) +.+\((\d+) aa\) *$/){ # # matching >>MG032 ATP-d (addA) Bacillus subtilis  (666 aa)
						#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
						# This is the assignment to OUT hash
						#________________________________________________
						if($alignment_found > 0){ ## If I have already read >>xxxx
								$match{$match_seq}=sprintf("%-5s %-8s %-6s %-4s %-5s %-30s %-4s %-5s %s\n",
									 $sw_score, $match_evalue, $sw_ident, $query_seg_start, $query_seg_end, $query_seq_name,
									 $match_seg_start, $match_seg_end, $match_seq);
						}
						$match_seq=$1;
						$entry_found=1;    $alignment_found=0;
						$query_gap_len=$match_gap_len=$match_seg_start=$match_seg_end=0;
						$query_seg_start=$query_seg_end=$overlap=0;
						undef($space);
						#print "\n# (INFO) read_machine_unreadable_sso_lines : Found MATCHed seq $match_seq\n" if $verbose;
			 }elsif($entry_found and $SSO[$j]=~/ *Z\-score: *(\S+) *expect\( *\) +(\S+)/){ ## getting Evalue
						$match_zscore=$1;   # just in case needed
						$match_evalue=$2;
						if($upper_expect_limit < $match_evalue or $lower_expect_limit > $match_evalue){
								$match_seq=$alignment_found=$entry_found=$query_gap_len='';
						}
			 }elsif($entry_found and $SSO[$j]=~/Smith\-Waterman +score\: +(\d+)\; *(\S+) *\% *identity *in *(\d+) *aa overlap/i){
						$sw_score=$1;
						$sw_ident=$2;
						$overlap=$3;
			 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			 # Measuring the gap before position info "^         50        60$"
			 #_______________________________________________________________________
			 }elsif( $entry_found and !$alignment_found  and $SSO[$j]=~/^( +)(\d+) +\d+/  ){
						$start=$2;                                ## $start is the digit
						$gap_start=length($1)+length($start)-1;
						#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
						# $alignment_found is set HERE!
						#_____________________________________
						$alignment_found=1;
						print "\n# $query_seq_name $match_seq \$gap_start is  $gap_start, \$start: $start" if $verbose;
			 }elsif($alignment_found==1 and $SSO[$j]=~/^( +)[\.\:]/ ){ ### matching    .: .: : ::     :.:..: :.. .. ..
						$space=length($1);
						$query_seg_start=$space-$gap_start+$start;
						$query_seg_end=$query_seg_start+$overlap;
						$query_range="$query_seg_start-$query_seg_end";
			 }elsif($space and $alignment_found==1 and  $SSO[$j]=~/^( +)(\d+)/ ){ ##  !! $alignment_found == 1 !!
						$alignment_found++; ## <<--- $alignment_found++;  should increment!
						$match_gap_start=length($1)+length($2)-1;
						$match_start=$2;
						$match_seg_start=$space-$match_gap_start+$match_start;
						$match_seg_end=$match_seg_start+$overlap;
						$match_range ="$match_seg_start-$match_seg_end";
						#$match{$match_seq}.="$query_range $match_range ";
						#print "\n $query_seq_name $match_seq $match_evalue $overlap $query_range $match_range";
			 }
		}# end of for $j
		if( ($create_sso=~/c/) && (@file < 1) ){
			 open (SSO2, ">$query_seq_name\.sso");
			 print SSO2 @SSO, "\n";
			 #print "\n# (INFO) $query_seq_name\.sso  is created";
			 close SSO2;
		}
		push(@out_refs, \%match);
		return(\@out_refs);
}# end of for $i




#______________________________________________________________________________
# Title     : sso_to_msp
# Usage     : &sso_to_msp(@ARGV, $single_out_opt);
# Function  : This takes sso file(s) and produces MSP file. It
#             concatenate sso file contents when more than one
#             sso file is given.
# Example   : &sso_to_msp(@ARGV, 'OUT.mspa', $single_out_opt);
# Warning   : This capitalize all the input file names when
#              producing xxxxx.mspa. xxxxx.sso -> XXXX.sso
# Keywords  : sso_file_to_msp_file, convert_sso_to_msp,
# Options   : _  for debugging.
#             #  for debugging.
#             v  for showing the MSP result to screen
#             s  for making single MSP file for each sso file
#                    as well as big MSP file which has all sso
#             u= for upper expectation value limit
#             l= for lower expect val limit
#             s= for single file name input eg. "s=xxxxx.mspa"
#             n  for new format (msp2 format)
#             r  for adding range
#             r2 for adding ranges in all sequence names
#
# Returns   : the file names created (xxxx.mspa, yyyy.mspa,,,,)
# Argument  :
# Version   : 2.6
#-----------------------------------------------------------------------------
sub sso_to_msp{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 my ($upper_expect_limit, $lower_expect_limit)=(50, 0);
	 my (%sso, @sso, @SSO, $big_out_msp1,  @final_out, $big_out_msp2,
	   $create_sso, $single_out_opt, $add_range, $add_range2, $big_out_msp,
	   $Evalue_thresh, $new_format, $Score_thresh, $margin, $single_file_name);
	if($vars{'u'}=~/([\.\d]+)/){ $upper_expect_limit = $vars{'u'} };
	if($vars{'l'}=~/([\.\d]+)/){ $lower_expect_limit = $vars{'l'} };
	if($vars{'t'}=~/(\d+)/){ $Score_thresh  = $vars{'t'} };
	if($vars{'m'}=~/(\d+)/){ $margin  = $vars{'m'} };
	if($vars{'s'}=~/\S/){ $single_file_name  = $vars{'s'} };
	if($char_opt=~/r2/){  $add_range='r'; $add_range2='r2' }
	if($char_opt=~/r/){   $add_range = 'r' }
	if($char_opt=~/c/){   $create_sso = 'c' }
	if($char_opt=~/s/){   $single_out_opt='s' }
	if($char_opt=~/n/){   $new_format='n' }
	 print "\n# File given to sso_to_msp is \"@file\", Normally xxx.sso file names\n";

	 if($single_file_name=~/\S/){
	   $big_out_msp=$single_file_name;
	 }else{
	   for($i=0; $i < @file; $i++){
		   if($file[$i]=~/\.mspa$/){ ## when output file name is given
			   $big_out_msp=$file[$i];
			   splice(@file, $i, 1);
			   $i--;
		   }elsif($file[$i]=~/^(\d+\-\d+)([_\d]*)\.[mfs]?sso/){  ## creates xxxx.mspa file name from xxxx.sso
			   $big_out_msp1="\U$1"."$2"."\.mspa";
			   $big_out_msp2="\U$1".".mspa";
		   }elsif($file[$i]=~/^(\S+)\.[mfs]?sso$/){
			   $big_out_msp1="\U$1"."\.mspa";
			   $big_out_msp2="\U$1"."_all".".mspa";
			   print "\n# sso_to_msp: File matched  xxxx.sso  format \n";
		   }elsif($file[$i]=~/^(\S+)\.out$/){
			   $big_out_msp1="\U$1"."\.mspa";
			   $big_out_msp2="\U$1"."_all".".mspa";
			   print "\n# sso_to_msp: File matched  xxxx.out  format \n";
		   }elsif($file[$i]=~/^(\S+)\.p[rot\,]*\.ts\.gz/){
			   $big_out_msp1="\U$1".".mspa";
			   $big_out_msp2="\U$1"."_all".".mspa";
		   }elsif($file[$i]=~/^(\S+)\.ts\.gz/){
			   $big_out_msp1="\U$1".".mspa";
			   $big_out_msp2="\U$1"."_all".".mspa";
		   }elsif($file[$i]=~/^(\S+)\.out\.gz/ or $file[$i]=~/^(\S+)\.[mfs]?sso\.gz/){
			   $big_out_msp1="\U$1".".mspa";
			   $big_out_msp2="\U$1"."_all".".mspa";
		   }
	   }
	 }
	 if(defined($big_out_msp)){
	   $big_out_msp1=$big_out_msp2=$big_out_msp;
	   print "\n# \$big_out_msp is defined as \'$big_out_msp\'\n";
	 }else{
	   print "\n# sso_to_msp: You did not define the big MSP file out format, so $big_out_msp1 \n";
	 }

	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 #  (1) When File was given to this sub routine
	 #__________________________________________
	 if(@file == 1){   ## ONE single file input??
	  print "# one file @file is given, OUT will be: $big_out_msp1 \n";
	  @sso=@{&open_sso_files(@file, $add_range, $add_range2,
	          "u=$upper_expect_limit",
			  "l=$lower_expect_limit",
			  "m=$margin",
			  $new_format,
			  "s=$big_out_msp")};
	  push(@final_out, &write_mspa_files(@sso, $big_out_msp1,
	        $single_out_opt, $add_range) );

	 }elsif(@file > 1){ ## MOre than 1 file input??
	  @sso=@{&open_sso_files(@file, $add_range, $add_range2,
	        "l=$lower_expect_limit",
	        "u=$upper_expect_limit",
	        "m=$margin",
	        $new_format)};
	  push(@final_out, @{&write_mspa_files(@sso, $big_out_msp2,
			$single_out_opt, $add_range)} ); ## concatenates all the hash ref to one
	 }

	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 #  (2) When NO File but ARRAY is given
	 #      Here, you can have SSO files created
	 #__________________________________________
	 elsif(@array >=1){
	  print "\n# In sso_to_msp, \@array is given rather than \@file";
	  @sso=@{&open_sso_files(@array, "u=$upper_expect_limit", $add_range2,
			  "l=$lower_expect_limit", $add_range, $create_sso,
			  "m=$margin", $new_format)};
	  push(@final_out, @{&write_mspa_files(@sso, $big_out_msp,
						  $single_out_opt, $add_range)} );
	 }
	 return(\@final_out);
}
#________________________________________________________________________________
# Title     : make_clustering_summary
# Usage     : &make_summ($sorted_cluster_file)
# Function  : to make a summary file of a sorted cluster file
# Example   :
# Keywords  : summary, make_cluster_summary, subclustering summary
# Options   :
# Author    : Sarah A. Teichmann
# Date      : 19th September 1997
# Version   : 1.5
#--------------------------------------------------------------------------------
sub make_clustering_summary{
		my ($good_cluster_file, $summary_file, @filecontent, $i, $filecontent,
				$cluster_size, @cluster_sizes, $cluster_number, $number_of_clusters,
				 $summary_file, @filecontent, %hash, @keys, @temp_clu, $make_separate_summary);
		$good_cluster_file=${$_[0]} || $_[0];

		$make_separate_summary=0;
		for($i=0; $i< @_; $i++){
				if($_[$i]=~/u *$/){  $make_separate_summary='u';
														 splice(@_, $i, 1)
				}
		}

		$summary_file="$good_cluster_file".".summary";

		open(CLU_FILE, "$good_cluster_file");
		while(<CLU_FILE>){
					push(@temp_clu, $_);  ## copying the content to ;
					if( /^ *Cluster +size +(\d+)/i){
							$cluster_size=$1;
					}elsif (/^ *Cluster +[number]* *(\d+)/) {
							$hash{$cluster_size} ++;
					}
		}
		close(CLU_FILE);

		open(CLU_FILE, ">$good_cluster_file"); # now overwrting it.

		if($make_separate_summary){
				open (SUMM, ">$summary_file");
				print SUMM "Cluster size    No. of clusters\n";
		}

		print CLU_FILE "Cluster size    No. of clusters\n";
		@keys=sort {$a<=>$b} keys %hash;
		for ($i=0; $i<@keys; $i++){
				print SUMM "     $keys[$i]               $hash{$keys[$i]}\n" if $make_separate_summary;
				print CLU_FILE "     $keys[$i]               $hash{$keys[$i]}\n";
		}
		close (SUMM) if $make_separate_summary;
		print CLU_FILE "\n# This file is created by $0 with make_clustering_summary sub, Details below\n\n";
		for(@temp_clu){  print CLU_FILE $_ }
		close (CLU_FILE);
		return(\$summary_file);
}




#________________________________________________________________________________
# Title     : convert_clu_to_sso_to_msp  (do not use this)
# Usage     : &clu_to_sso_to_msp(\$clu);
# Function  : reads in a big single linkage cluster file(or normal cluster file)
#              and creates a big msp file which contains all the entries in the
#              cluster file (usually with the extension of sclu or clu)
#             This normally reads in xxxx.mso, xxxx.sso like files, but if the
#              corresponding  xxx.mspa file already exists, it concatenates them to
#              make a bigger one.
# Example   :
# Keywords  : clu_2_sso_2_msp, cluster_to_msp, cluster_to_sso_to_msp
#              clu_to_sso_to_msp
# Options   :
# Category  :
# Version   : 1.8
#--------------------------------------------------------------------------------
sub convert_clu_to_sso_to_msp{
		 my($i, $j, $k, $s, $u, $p, $m, $n, $y, @possible_extensions, @list,
					@final_files, @U_L_case, $file, @file, @written_mspa_files);

		 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		 # Opening cluster file (xx.clu)
		 # %clus looks like this:  2-507     YGR041W YLR353W
		 #                         3-308     YDR222W YDR346C YLR225C
		 #                         2-184     YCL066W YCR040W
		 #______________________________________________________________
		 my $clu=${$_[0]} || $_[0];
		 if($verbose){
					 print "\n# clu_to_sso_to_msp : \"$clu\" is given
								and I am processing it with clu_to_sso_to_msp\n" if defined $clu;
		 }
		 my %clus=%{&open_clu_files(\$clu)};
		 my @keys= keys %clus;
		 my $num_of_cluster=@keys=@{&sort_by_cluster_size(\@keys)};

		 print "# $0: clu_to_sso_to_msp: No. of cluster=$num_of_cluster after open_clu_files \n" if $verbose;

		 &show_array(\@keys) if $verbose;
		 &show_hash(\%clus) if $verbose;
		 @possible_extensions=('msp', 'sso', 'msso', 'msso.gz','fsso', 'ssso', 'fso', 'out', 'prot.sso', 'prot.ts');
		 @U_L_case=('\U', '\L');

		 for($i=0; $i< @keys; $i++){
				 my (@list, @final_files, $clus_name, $big_out_msp, @msp_hashes);
				 $clus_name=$keys[$i];
				 unless($single_file_name=~/\S/){
							$big_out_msp="$clus_name\_cluster\.mspa"; #<<<----- final output name
				 }else{
							$big_out_msp=$single_file_name;
				 }
				 push(@written_mspa_files, $big_out_msp); ## This is the output of this sub

				 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				 #  If $clus_name.mspa is already there, skip
				 #_____________________________________________
				 if( (-s $big_out_msp) > 100  and !$over_write ){
						 print "\n# clu_to_sso_to_msp : $big_out_msp MSP file already exists, skipping\n";
						 print "#    Use  \$over_write option \'o\' to start all over again or \n";
						 print "#    delete clustering files like XX-XX_cluster.clu to go on\n";
						 next ;
				 }
				 $num_of_seq_member=@list=split(/ +/, $clus{$keys[$i]}); # @list has (HIU001, HI002, HI333, MJ111, etc)
				 print "# $0: clu_to_sso_to_msp: No. of seq member=$num_of_seq_member after split \n" if $verbose;

				 FOR0: for($j=0; $j < @list; $j++){
									my($sub_dir_head, $file_name_low, $file_name_up, $file_name_prot_low, @sub_dir_heads,
										 $file_name_prot_up, $file_name_low_gz, $file_name_up_gz,
										 $file_name_prot_low_gz, $file_name_prot_up_gz);

									$each_seq_name=$list[$j];
									#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
									#  Here I take chars from the sequ names, as dirs have fragments of chars
									#_______________________________________________________________________________
									for($s=1; $s <=2 ; $s++){  ## here, number 2 indicates, I check single or 2 char sub dir names
											 $sub_dir_head= substr($list[$j], 0, $s);
											 push(@sub_dir_heads, "\L$sub_dir_head") if (-d "\L$sub_dir_head" );
											 push(@sub_dir_heads, "\U$sub_dir_head") if (-d "\U$sub_dir_head" );
									}
									#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
									#  Checking all the possible subdirectories to crop all the sso files
									#_______________________________________________________________________________
									FOR1: for($p=0; $p < @sub_dir_heads; $p++){
											 $subd=$sub_dir_heads[$p];
											 FOR2 : for($e=0; $e < @possible_extensions; $e++){
														$ext=$possible_extensions[$e];
														#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
														#  This makes all the possible lower upper case names
														#______________________________________________________
														for( $u=0; $u < @U_L_case; $u++){
																	if($U_L_case[$u]=~/U/){  $each_seq_name="\U$each_seq_name";
																	}else{                   $each_seq_name="\L$each_seq_name"; }

																	if(-s "$each_seq_name\.$ext"){   push(@final_files, "$each_seq_name\.$ext" ) ; next FOR0 }
																	elsif(-s "$each_seq_name\.$ext\.gz"){ push(@final_files, "$each_seq_name\.$ext\.gz" ) ; next FOR0 }
																	else{
																			 $file_wanted="\.\/$subd\/$each_seq_name\.$ext";
																			 if(-s $file_wanted){
																											 push( @final_files, $file_wanted); next FOR0 }
																			 elsif(-s "$file_wanted\.gz"){
																											 push( @final_files, "$file_wanted\.gz");
																											 next FOR0
																			 }
																	}
														}
											 } # FOR2
									} # FOR1

				 } # FOR0

				 if(@final_files < 1){
							print "\n# clu_to_sso_to_msp :LINE no.: ", __LINE__, " ERROR: \@final_files is empty. Serious error\n";
							print "\n If you have sub dir which have more than 2 chars as names, you may increase the default 2 to 3 in the above\n";
							next;
				 }
				 # $write_each_msp_to_disk='w';

				 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				 #  Check if small msp files have already made in previous steps
				 #________________________________________________________________
				 if($final_files[0]=~/\.mspa *$/){ ##  concatenate msp into big_msp
						 open(BIG_MSP_FILE, ">$big_out_msp");
						 for($y=0; $y< @final_files; $y++){
								 open(SINGLE_MSP, "$final_files[$y]");
								 print "\n # Written by  convert_clu_to_sso_to_msp in $0\n";
								 while(<SINGLE_MSP>){
										 print BIG_MSP_FILE $_;
								 }
						 }
						 close(BIG_MSP_FILE);
						 close(SINGLE_MSP);
						 push(@written_mspa_files, $big_out_msp);

				 }else{
						 if($write_each_msp_to_disk){
									print "\# $0 : going to run open_sso_files with $write_each_msp_to_disk opt\n";
									$big_out_msp=${&open_sso_files(\@final_files, $uppercase_seq_name, $write_each_msp_to_disk,
																																					"u=$upper_expect_limit", $new_format, $add_range, $add_range2, $big_out_msp, $over_write)};
									if(-s $big_out_msp > 200){  print "\n# $0: SUCCESS to create $big_out_msp :) :) :-) :-) ?\n"; }
						 }else{
									print "\n# clu_to_sso_to_msp: I am running open_sso_files. \n";
									@msp_hashes=@{&open_sso_files(\@final_files, $uppercase_seq_name, $write_each_msp_to_disk,
																																					"u=$upper_expect_limit", $new_format, $add_range, $add_range2, $big_out_msp, $over_write)};

									&write_mspa_files(@msp_hashes, $big_out_msp); ## concatenates all the hash ref to one
						 }
				 }
		 }
		 return(\@written_mspa_files);
}# end of


#________________________________________________________________________________
# Title     : convert_clu_to_mspa
# Usage     : @written_mspa_files=@{&convert_clu_to_mspa(\$single_linkage_file)};
# Function  : reads in a big single linkage cluster file(or normal cluster file)
#              and creates a big mspa file which contains all the entries in the
#              cluster file (usually with the extension of sclu or clu)
#             This normally reads in xxxx.mso, xxxx.sso like files, but if the
#              corresponding  xxx.mspa file already exists, it concatenates them to
#              make a bigger one.
# Example   :
# Keywords  : clu_2_sso_2_mspa, cluster_to_mspa, cluster_to_sso_to_mspa
#              clu_to_sso_to_mspa
# Options   :
# Category  :
# Version   : 2.8
#--------------------------------------------------------------------------------
sub convert_clu_to_mspa{
    my($i, $j, $k, $s, $u, $v, $p, $m, $n, $y, @possible_extensions, $single_file_name,
           @seq_names, @final_files, @U_L_case, $file, @file, @name_types,
           @poss_sub_dir_heads, @written_mspa_files, $Lean_output, $subdir_char_size,
           $no_of_ext_to_be_checked, $extension_type_found);

    $subdir_char_size=2; # default
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Opening cluster file (xx.clu)
    # %clus looks like this:  2-507     YGR041W YLR353W
    #                         3-308     YDR222W YDR346C YLR225C
    #                         2-184     YCL066W YCR040W
    #______________________________________________________________
    my $clu=${$_[0]} || $_[0];
    $Lean_output=${$_[1]} || $_[1];

    if($verbose){   print "\n# convert_clu_to_mspa : \"$clu\" is given and I am processing it with clu_to_sso_to_mspa\n" if defined $clu;     }
    my %clus=%{&open_clu_files(\$clu)};
    my @clusters= keys %clus;
    my $num_of_cluster=@clusters=@{&sort_by_cluster_size(\@clusters)};

    print "# (i) $0: convert_clu_to_mspa: No. of cluster=$num_of_cluster after open_clu_files \n" if $verbose;

    #&show_array(\@clusters) if $verbose;
    #&show_hash(\%clus) if $verbose;
    @possible_extensions=('mspa', 'mspa.gz', 'msso', 'msso.gz','fsso', 'pbla', 'pbla.gz',
                                                               'ssso', 'fso', 'out', 'prot.sso', 'prot.ts');
    @U_L_case=('\U', '\L', ' ');  ## !! the ' ' is necessary

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
    # Making each SINGLE linkage clu to MSP file format to be ready for divclus
    #______________________________________________________________________________
    for($i=0; $i< @clusters; $i++){
	   my (@seq_names, @final_files, $clus_name, $big_out_mspa, @mspa_hashes);
	   $clus_name=$clusters[$i];
	   unless($single_file_name=~/\S/){
						 $big_out_mspa="$clus_name\_cluster\.mspa"; #<<<----- final output name
	   }else{
						 $big_out_mspa=$single_file_name;
	   }
	   push(@written_mspa_files, $big_out_mspa); ## This is the output of this sub

	   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	   #  If $clus_name.mspa is already there, skip
	   #_____________________________________________
	   if( (-s $big_out_mspa) > 100  and !$over_write ){
               print "\n# (i) convert_clu_to_mspa : $big_out_mspa MSP file already exists, skipping\n";
               print "#    Use  \$over_write option \'o\' to start all over again or \n";
               print "#    delete clustering files like XX-XX_cluster.clu to go on\n";
               next ;
       }
       $num_of_seq_member=@seq_names=split(/\s+/, $clus{$clusters[$i]}); # @seq_names has (HIU001, HI002, HI333, MJ111, etc)
       print "# $0: convert_clu_to_mspa: No. of seq member=$num_of_seq_member after split \n" if $verbose;

       $no_of_ext_to_be_checked=@possible_extensions;
       $extension_type_found=0;

       FOR0: for($j=0; $j < @seq_names; $j++){
							 my($sub_dir_head, $file_name_low, $file_name_up, $file_name_prot_low,
									$file_name_prot_up, $file_name_low_gz, $file_name_up_gz,
									$file_name_prot_low_gz, $file_name_prot_up_gz);
									$each_seq_name=$seq_names[$j];
							 my @poss_sub_dir_heads=('.'); ## <<<<------- This is critically important, when 'D' opt is not used!

							 if($each_seq_name=~/(\S+)_\d+\-\d+$/){
									 $each_seq_name_range=$each_seq_name;
									 $each_seq_name=$1;
									 @name_types=($each_seq_name, $each_seq_name_range);
							 }else{
									 @name_types=($each_seq_name);
							 }
							 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
							 #  Here I take chars from the sequ names, as dirs have fragments of chars
							 #_______________________________________________________________________________
							 for($s=1; $s <= $subdir_char_size ; $s++){  ## here, number 2 indicates, I check single or 2 char sub dir names
									 $sub_dir_head= substr($seq_names[$j], 0, $s);
									 unshift(@poss_sub_dir_heads, "\L$sub_dir_head") if (-d "\L$sub_dir_head" );
									 unshift(@poss_sub_dir_heads, "\U$sub_dir_head") if (-d "\U$sub_dir_head" );
							 }
							 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
							 #  Checking all the possible subdirectories to crop all the sso files
							 #_______________________________________________________________________________

							 FOR1: for($p=0; $p <= @poss_sub_dir_heads; $p++){ ## Default has '.' will make things like '././fam_8_8.pbla.gz'

										$subd=$poss_sub_dir_heads[$p];               ## Also, the '<=' not '<' cures the same problem.
										#print "# (i) Checking sub dir $subd\n";
										FOR2 : for($e=$extension_type_found; $e < $no_of_ext_to_be_checked; $e++){
												 $ext=$possible_extensions[$e];
												 #print "      (i) \$ext at $subd is  $ext\n";
												 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
												 #  This makes all the possible lower upper case names
												 #______________________________________________________
												 for( $u=0; $u < @U_L_case; $u++){
														for($v=0; $v <@name_types; $v++){
															 $each_seq_name=$name_types[$v];
															 if($U_L_case[$u]=~/U/){  $each_seq_name="\U$each_seq_name";
															 }elsif($U_L_case[$u]=~/L/){ $each_seq_name="\L$each_seq_name";
															 }else{ $each_seq_name=$each_seq_name }

															 if(-s "$each_seq_name\.$ext"){
																		push(@final_files, "$each_seq_name\.$ext" ) ;
																		$extension_type_found=$e; $no_of_ext_to_be_checked=$e+1;
																		$found_search_prog_exention_used=$ext;
																		$found_real_subdir_name=$subd; ## This is to report the name of the actual subd found
																		$found_search_prog_exention_used=$ext;
																		next FOR0
															 }elsif(-s "$each_seq_name\.$ext\.gz"){
																		push(@final_files, "$each_seq_name\.$ext\.gz" ) ;
																		$extension_type_found=$e; $no_of_ext_to_be_checked=$e+1;
																		$found_search_prog_exention_used=$ext;
																		$found_real_subdir_name=$subd; ## This is to report the name of the actual subd found
																		$found_search_prog_exention_used=$ext;
																		next FOR0
															 }else{
                                                                  $file_wanted="\.\/$subd\/$each_seq_name\.$ext";
                                                                  if(-s $file_wanted){
                                                                          push( @final_files, $file_wanted);
                                                                          $extension_type_found=$e; $no_of_ext_to_be_checked=$e+1;
                                                                          $found_real_subdir_name=$subd; ## This is to report the name of the actual subd found
                                                                          $found_search_prog_exention_used=$ext;
                                                                          last FOR1;
                                                                  }elsif(-s "$file_wanted\.gz"){
                                                                          push( @final_files, "$file_wanted\.gz");
                                                                          $extension_type_found=$e; $no_of_ext_to_be_checked=$e+1;
                                                                          $found_search_prog_exention_used=$ext;
                                                                          $found_real_subdir_name=$subd; ## This is to report the name of the actual subd found
                                                                          next FOR0;
                                                                  }
															 }
														}
												 }
										} # FOR2
							 } # FOR1
				 } # FOR0

                 if(@final_files < 1){
					print "\n# (E) convert_clu_to_mspa :LINE no.: ", __LINE__, " ERROR: \@final_files is empty. Serious error\n";
					print "\n If you have sub dir which have more than 2 chars as names, you may increase the default 2 to 3 in the above\n";
					next;
				 }
				 $write_each_mspa_to_disk='';
                 #@final_files=sort(@final_files); ## this is not really necessary

				 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				 #  Check if small mspa files have already made in previous steps
				 #________________________________________________________________
				 if($final_files[0]=~/(\S+)\.mspa/){ ##  concatenate mspa into big_mspa
						 $search_file_base=$1;
                         open(BIG_MSP_FILE_C, ">$big_out_mspa");
                         print BIG_MSP_FILE_C "# Latest write by convert_clu_to_mspa in $0\n";
                         for($y=0; $y< @final_files; $y++){

                             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
                             # Opens single MSP file. Check if they were gzipped or not before open
                             #________________________________________________________________________
                             if($final_files[$y]=~/(\S+)\.gz$/){
                                 system("gzip -d $final_files[$y]");
                                 open(SINGLE_MSP, "$1") || warn "\n\n Can not open SINGLE_MSP $final_files[$y]";
                             }else{                          open(SINGLE_MSP, "<$final_files[$y]") || warn "\n\n Can not open SINGLE_MSP $final_files[$y]";
                             }
                             while(<SINGLE_MSP>){
							      print BIG_MSP_FILE_C $_;
							 }
						 }
						 close(BIG_MSP_FILE_C);
						 close(SINGLE_MSP);
						 push(@written_mspa_files, $big_out_mspa);

						 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
						 # Doing something for L option ($Lean_output)
						 #___________________________________________________
						 if($Lean_output and -d $found_real_subdir_name and $found_search_prog_exention_used){
                             for($y=0; $y< @seq_names; $y++){
                                     unlink("$found_real_subdir_name\/$seq_names[$y]\.$found_search_prog_exention_used");
                                     unlink("$found_real_subdir_name\/$seq_names[$y]\.$found_search_prog_exention_used\.gz");
                             }
						 }elsif($Lean_output){
                             for($y=0; $y< @seq_names; $y++){
                                     unlink("$seq_names[$y]\.$found_search_prog_exention_used");
                                     unlink("$seq_names[$y]\.$found_search_prog_exention_used\.gz");
                             }
						 }
				 }else{
                      if($write_each_mspa_to_disk){
                           print "\# $0 : going to run open_sso_files with $write_each_mspa_to_disk opt\n";
                           $big_out_mspa=${&open_sso_files(\@final_files, $uppercase_seq_name, $write_each_mspa_to_disk,
                                                                                                                                           "u=$upper_expect_limit", $new_format, $add_range, $add_range2, $big_out_mspa, $over_write)};
                           if(-s $big_out_mspa > 200){  print "\n# $0: SUCCESS to create $big_out_mspa :) :) :-) :-) ?\n"; }
                      }else{
                           print "\n# convert_clu_to_mspa: I am running open_sso_files. \n";
                           @mspa_hashes=@{&open_sso_files(\@final_files, $uppercase_seq_name, $write_each_mspa_to_disk,
                                                         "u=$upper_expect_limit", $new_format, $add_range,
                                                         $add_range2, $big_out_mspa, $over_write)};
                           &write_mspa_files(@mspa_hashes, $big_out_mspa); ## concatenates all the hash ref to one
                      }
				 }
		 }## end of  for($i=0; $i< @clusters; $i++){
         return(\@written_mspa_files);
}# end of





#________________________________________________________________________________
# Title     : clu_to_sso_to_msp
# Usage     : &clu_to_sso_to_msp(\$clu);
# Function  : reads in a big single linkage cluster file(or normal cluster file)
#              and creates a big msp file which contains all the entries in the
#              cluster file (usually with the extension of sclu or clu)
#             This normally reads in xxxx.mso, xxxx.sso like files, but if the
#              corresponding  xxx.mspa file already exists, it concatenates them to
#              make a bigger one.
# Example   :
# Keywords  : convert_clu_to_sso_to_msp, clu_2_sso_2_msp, cluster_to_msp, cluster_to_sso_to_msp
# Version   : 1.8
#--------------------------------------------------------------------------------
sub clu_to_sso_to_msp{
		 my($i, $j, $k, $s, $u, $p, $m, $n, $y, @possible_extensions, @seq_names,
				@final_files, @U_L_case, $file, @file, @written_mspa_files);

		 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		 # Opening cluster file (xx.clu)
		 # %clus looks like this:  2-507     YGR041W YLR353W
		 #                         3-308     YDR222W YDR346C YLR225C
		 #                         2-184     YCL066W YCR040W
		 #______________________________________________________________
		 my $clu=${$_[0]} || $_[0];
		 if($verbose){
				 print "\n# clu_to_sso_to_msp : \"$clu\" is given
						 and I am processing it with clu_to_sso_to_msp\n" if defined $clu;
		 }
		 my %clus=%{&open_clu_files(\$clu)};
		 my @keys= keys %clus;
		 my $num_of_cluster=@keys=@{&sort_by_cluster_size(\@keys)};

		 print "# $0: clu_to_sso_to_msp: No. of cluster=$num_of_cluster after open_clu_files \n";

		 &show_array(\@keys) if $verbose;
		 &show_hash(\%clus) if $verbose;
		 @possible_extensions=('msp', 'sso', 'msso', 'msso.gz',
							'pbla.gz', 'pbla', 'fsso', 'ssso', 'fso', 'out', 'prot.sso', 'prot.ts');
		 @U_L_case=('\U', '\L');

		 for($i=0; $i< @keys; $i++){
				my (@seq_names, @final_files, $clus_name, $big_out_msp, @msp_hashes);
				$clus_name=$keys[$i];
				unless($single_file_name=~/\S/){
						$big_out_msp="$clus_name\_cluster\.mspa"; #<<<----- final output name
				}else{
						$big_out_msp=$single_file_name;
				}
				push(@written_mspa_files, $big_out_msp); ## This is the output of this sub

				#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				#  If $clus_name.mspa is already there, skip
				#_____________________________________________
				if( (-s $big_out_msp) > 100  and !$over_write ){
						print "\n# clu_to_sso_to_msp : $big_out_msp MSP file already exists, skipping\n";
						print "#    Use  \$over_write option \'o\' to start all over again or \n";
						print "#    delete clustering files like XX-XX_cluster.clu to go on\n";
						next ;
				}
				$num_of_seq_member=@seq_names=split(/ +/, $clus{$keys[$i]}); # @seq_names has (HIU001, HI002, HI333, MJ111, etc)
				print "# $0: clu_to_sso_to_msp: No. of seq member=$num_of_seq_member after split \n" if $verbose;

				FOR0: for($j=0; $j < @seq_names; $j++){
					 my($sub_dir_head, $file_name_low, $file_name_up, $file_name_prot_low, @sub_dir_heads,
							 $file_name_prot_up, $file_name_low_gz, $file_name_up_gz,
							 $file_name_prot_low_gz, $file_name_prot_up_gz);

					 $each_seq_name=$seq_names[$j];
					 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
					 #  Here I take chars from the sequ names, as dirs have fragments of chars
					 #_______________________________________________________________________________
					 for($s=1; $s <=2 ; $s++){  ## here, number 2 indicates, I check single or 2 char sub dir names
							 $sub_dir_head= substr($seq_names[$j], 0, $s);
							 push(@sub_dir_heads, "\L$sub_dir_head") if (-d "\L$sub_dir_head" );
							 push(@sub_dir_heads, "\U$sub_dir_head") if (-d "\U$sub_dir_head" );
					 }
					 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
					 #  Checking all the possible subdirectories to crop all the sso files
					 #_______________________________________________________________________________
					 FOR1: for($p=0; $p < @sub_dir_heads; $p++){
							 $subd=$sub_dir_heads[$p];
							 FOR2 : for($e=0; $e < @possible_extensions; $e++){
										$ext=$possible_extensions[$e];
										#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
										#  This makes all the possible lower upper case names
										#______________________________________________________
										for( $u=0; $u < @U_L_case; $u++){
												if($U_L_case[$u]=~/U/){  $each_seq_name="\U$each_seq_name";
												}else{                   $each_seq_name="\L$each_seq_name"; }

												if(-s "$each_seq_name\.$ext"){   push(@final_files, "$each_seq_name\.$ext" ) ; next FOR0 }
												elsif(-s "$each_seq_name\.$ext\.gz"){ push(@final_files, "$each_seq_name\.$ext\.gz" ) ; next FOR0 }
												else{
														$file_wanted="\.\/$subd\/$each_seq_name\.$ext";
														if(-s $file_wanted){
																push( @final_files, $file_wanted); next FOR0 }
														elsif(-s "$file_wanted\.gz"){
																push( @final_files, "$file_wanted\.gz");
																next FOR0
														}
												}
										}
							 } # FOR2
					 } # FOR1

				} # FOR0

				print "\n# @final_files \n=============> $big_out_msp  \n\n" if $verbose;

				if(@final_files < 1){
					 print "\n# clu_to_sso_to_msp :LINE no.: ", __LINE__, " ERROR: \@final_files is empty. Serious error\n";
					 print "\n If you have sub dir which have more than 2 chars as names, you may increase the default 2 to 3 in the above\n";
					 next;
				}
				$write_each_msp_to_disk='w';

				#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				#  Check if small msp files have already made in previous steps
				#________________________________________________________________
				if($final_files[0]=~/\.mspa *$/){ ##  concatenate msp into big_msp
						 open(BIG_MSP_FILE, ">$big_out_msp");
						 for($y=0; $y< @final_files; $y++){
                                 if($final_files[$y]=~/\.gz$/){  open(SINGLE_MSP, "|gunzip -d $final_files[$y]") || warn "\n Can not open SINGLE_MSP $final_files[$y]";
                                 }else{                          open(SINGLE_MSP, "<$final_files[$y]") || warn "\n Can not open SINGLE_MSP $final_files[$y]";
                                 }
								 while(<SINGLE_MSP>){
										 print BIG_MSP_FILE $_;
								 }
						 }
						 close(BIG_MSP_FILE);
						 close(SINGLE_MSP);
						 push(@written_mspa_files, $big_out_msp);

				}else{
						if($write_each_msp_to_disk){
								 print "\# $0 : going to run open_sso_files with $write_each_msp_to_disk opt\n";
								 $big_out_msp=${&open_sso_files(\@final_files, $uppercase_seq_name, $write_each_msp_to_disk,
															 "u=$upper_expect_limit", $new_format, $add_range, $add_range2, $big_out_msp, $over_write)};
								 if(-s $big_out_msp > 200){  print "\n# $0: SUCCESS to create $big_out_msp :) :) :-) :-) ?\n"; }
						}else{
								 print "\n# clu_to_sso_to_msp: I am running open_sso_files. \n";
								 @msp_hashes=@{&open_sso_files(\@final_files, $uppercase_seq_name, $write_each_msp_to_disk,
															 "u=$upper_expect_limit", $new_format, $add_range, $add_range2, $big_out_msp, $over_write)};

								 &write_mspa_files(@msp_hashes, $big_out_msp); ## concatenates all the hash ref to one
						}
				}
		 }
		 return(\@written_mspa_files);
}# end of

#________________________________________________________________________
# Title     : read_dir_names_only
# Usage     : @all_dirs_list = @{&read_dir_names_only(\$absolute_path_dir_name, ....)};
# Function  : read any dir names and and then put in array. If no argument
#             for the target directory, it opens PWD automatically
#             You can specify the length of dir names to choose.
# Example   : @files=@{&read_dir_names_only('n', "s=1", '.')};
# Warning   : This does not report '.', '..'
#             Only file names are reported. Compare with &read_any_dir
# Keywords  : read_dir_only, get_dir_names, get_dir_names_only, get_subdir_names,
# Options   : n   for names only reading(not the full path) , default is full path
#             s=  for the size of dirs name. If you want all the dir names
#                   with a size of 1 char, s=1
# Returns   : one ref. of array.
# Argument  : takes one or more scaler references. ('.', \$path, $path, ... )
# Category  :
# Version   : 3.4
#--------------------------------------------------------------------
sub read_dir_names_only{
	my($in_dir, $i,$k, @possible_dirs, @chopped_pwd_path, @args,
	  @final_files, $full_dir, $pwd, $path,@read_files,
	  $size_of_dir_name);
	$pwd=`pwd`;
	chomp($pwd);
	$full_dir=1;
	@args=@_;

	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Checking option
	#__________________________________________________
	for($k=0; $k < @args; $k++){
	 if(    $args[$k] eq 'n' or ${$args[$k]} eq 'n'){
	     $full_dir=0;
		 splice(@args, $k, 1); $k--;
	 }elsif( $args[$k] =~/s=(\d+)/ or ${$args[$k]} =~/s=(\d+)/){
		 $size_of_dir_name=$1;
		 #print "\n# read_dir_names_only : You have put the size of dir names : $size_of_dir_name\n";
		 splice(@args, $k, 1); $k--;
	 }
	}
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# When no arg, this opens PWD automatically
	#_________________________________________________
	if(@args == 0){
	 if($full_dir==1){
		 $in_dir=$pwd;
	 }else{
		 $in_dir='.';
	 }
	 @final_files=@{&open_and_read_dir_names_only(\$in_dir)};
	}elsif(@args > 0){
	 for($k=0; $k < @args; $k++){
		if(!(ref($args[$k]))){    $in_dir=$args[$k];
		}elsif(ref($args[$k])){   $in_dir=${$args[$k]};    }

		if($in_dir ne '..' and $in_dir !~ /\// ){
			push(@final_files, @{&open_and_read_dir_names_only(\$in_dir)} );
		}elsif($in_dir eq '..' and $full_dir==1){
			#print "\n# read_dir_names_only: \"..\" is given to open\n";
			@chopped_pwd_path=split(/\//, $pwd);
			pop(@chopped_pwd_path);
			$in_dir=join('/', @chopped_pwd_path);
			push(@final_files, @{&open_and_read_dir_names_only(\$in_dir)} );
		}elsif($in_dir eq '..'){
		    $in_dir = '..';
			push(@final_files, @{&open_and_read_dir_names_only(\$in_dir)} );
			for(@final_files){ $_=~s/\.//; }
		}
		##########  Main READING PART ##########

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# Embedded subroutine
		#_________________________________________________
		sub open_and_read_dir_names_only{
			my ($i, @final_files);
			my $in_dir=${$_[0]};
			opendir(DIR1,"$in_dir");
			my @read_files = readdir(DIR1);
			if($size_of_dir_name){
				for($i=0; $i < @read_files; $i ++){
					unless(length($read_files[$i]) == $size_of_dir_name){
						next;
					}
					if($full_dir ==1){
						$read_files[$i]="$in_dir\/$read_files[$i]";
					}
					if( ($read_files[$i] !~ /\/\.\.?$/) && ( -d $read_files[$i]) ){
						 $read_files[$i]=~s/[\.\/]*//; ## removing ./ in front of dirs (in bash)
						 push(@final_files, "$read_files[$i]");
					}
				}
				return([@final_files]);
			}else{
				for($i=0; $i < @read_files; $i ++){
					if($full_dir ==1){
						$read_files[$i]="$in_dir\/$read_files[$i]";
					}
					if( ($read_files[$i] !~ /\/\.\.?$/) && ( -d $read_files[$i]) ){
						 $read_files[$i]=~s/[\.\/]*//; ## removing ./ in front of dirs (in bash)
						 push(@final_files, "$read_files[$i]");
					}
				}
				return([@final_files]);
			}
			}
	 }
	}
	return([sort @final_files]);
}




#________________________________________________________________________________
# Title     : get_overlapping_seq_match_size
# Usage     : $ovlapsize=${&get_overlapping_seq_match_size($st1, $en1, $st2, $en2)
# Function  :
# Example   :
# Keywords  : CF: get_overlapping_range, get_overlapping_seq_match
# Options   :
# Version   : 1.1
#--------------------------------------------------------------------------------
sub get_overlapping_seq_match_size{
		my($start1, $end1, $start2, $end2, $overlapping_region_matched);
		if(@_ == 4){
			 $start1=$_[0]; $end1 =$_[1];  $start2=$_[2]; $end2  =$_[3];
		}elsif(@_==2){
			 if( $_[0]=~/(\d+)\-(\d+)/ ){
					 $start1=$1;      $end1  =$2;
			 }elsif($_[1]=~/(\d+)\-(\d+)/ ){
					 $start2=$1;      $end2  =$2;
			 }else{
					 print "\n# (E) get_overlapping_seq_match_size: I need 2 or 4 arguments for regions\n";
					 print "   They look like ($start1, $end1, $start2, $end2) or ('10-100', '20-211')\n";
					 print "   You got it, Sarah?? Try again my dear!\n";
			 }
		}else{
					 print "\n# (E) get_overlapping_seq_match_size: I need 2 or 4 arguments for regions\n";
					 print "   They look like ($start1, $end1, $start2, $end2) or ('10-100', '20-211')\n";
					 print "   You got it, Sarah?? Try again my dear!\n";
		}

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		#     ---------
		#  ------
		#___________________________________
		if($start1 >= $start2 and $end1 >= $end2){
				$overlapping_region_matched=$end2-$start1;
		}
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# ---------
		#     ----------
		#___________________________________
		elsif($start1 <= $start2 and $end1 <= $end2){
				$overlapping_region_matched=$end1-$start2;
		}
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		#      -----
		#    ----------
		#___________________________________
		elsif($start1 >= $start2 and $end1 <= $end2){
				$overlapping_region_matched=$end1-$start1;
		}
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		#  ---------
		#    ----
		#___________________________________
		elsif($start1 <= $start2 and $end1 >= $end2){
				$overlapping_region_matched=$end2-$start2;
		}
		return(\$overlapping_region_matched);
}
#___________________________________________________________
# Title     : get_seq_fragments
# Usage     : @seq_frag=&get_seq_fragments(\%msf, @RANGE);
# Function  : gets sequence(string) segments with defined
#             ranges.
# Example   :
#  %test=('seq1', '1234AAAAAAAAAAAaaaaa', 'seq2', '1234BBBBBBB');
#  @range = ('1-4', '5-8');
#
#  %out = %{&get_seq_fragments(\%test, \@range)};
#  %out => (seq1_5-8   AAAAA
#           seq2_5-8   BBBBB
#           seq1_1-4    1234
#           seq2_1-4    1234 )
#
# Warning   :
# Keywords  : get_sequence_fragments,
# Options   : _  for debugging.
#             #  for debugging.
#             l=  for min seqlet length
#             r  for adding ranges in the seq names
#
# Returns   :
# Argument  :
# Version   : 1.8
#-------------------------------------------------------
sub get_seq_fragments{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 my $min_seqlet_size=10;
	 my @vars=keys %vars;
	 my $no_range_in_name=1;
	 for($i=0; $i< @vars; $i++){
	   if($vars[$i] eq 'l'){
		  $min_seqlet_size=$vars{$vars[$i]};
	   }
	 }
	 if($char_opt=~/v/){ print "\n \$char_opt is $char_opt  @char_opt\n"; }
	 if($char_opt=~/n/){ $no_range_in_name = 1 }
	 if($char_opt=~/r/){ $no_range_in_name = 0 }

	 print "\nget_seq_fragments \$no_range_in_name is $no_range_in_name \n";
	 for($i=0; $i< @hash; $i++){
	 my (%out_frag, $frag_name, $range_start, $range_end, @out_hash);
	 my %seqs = %{$hash[$i]};
	 my @names = keys %seqs;
	 if(@names==1){
	    for($j=0; $j < @names; $j++){
		   my $seq_name = $names[$j];
		   my $seq = $seqs{$seq_name};
		   for($k=0; $k< @range; $k++){
			  my $range = $range[$k];
			  if($no_range_in_name==1){
				 $frag_name = "$seq_name";
			  }else{
			     $frag_name = "$seq_name\_$range";
			  }
			  #if(length($frag_name)>14 ){
			  #	 $frag_name ='x'."${j}_${range}";
		      #}
			  ($range_start, $range_end)=$range=~/(\d+\.?\d*)\-(\d+\.?\d*)/;
			  my $frag_len = $range_end-$range_start+1;
			  if($frag_len < $min_seqlet_size){
			     next;
			  }
			  my $fragment = substr($seq, $range_start-1, $frag_len);
			  $out_frag{$frag_name}=$fragment;
		   }
		}
		push(@out_hash,  \%out_frag);
	 }elsif(@names > 1){
	    for($k=0; $k< @range; $k++){
		  my %out_frag=();
	      my $range=$range[$k];
		  ($range_start, $range_end)=$range=~/(\d+\.?\d*)\-(\d+\.?\d*)/;
	      my $frag_len = $range_end-$range_start+1;
		  if($frag_len < $min_seqlet_size){
		     next;
		  }
	      for($j=0; $j < @names; $j++){
	         my $seq_name=$names[$j];
			 my $seq = $seqs{$seq_name};
		     if($no_range_in_name==1){
				 $frag_name = "$seq_name";
			 }else{
			     $frag_name = "$seq_name\_$range";
			 }
			 #if(length($frag_name)>15 ){
			 #	$frag_name ='x'."${j}_${range}";
		     #}
			 if($range_start==0){ $range_start++; } ## This is a bugfix
			 my $fragment = substr($seq, $range_start-1, $frag_len);
			 $out_frag{$frag_name}=$fragment;
		  }
		  push(@out_hash, \%out_frag);
		}
	 }
	 }
	 if(@out_hash > 1){ return(@out_hash)
	 }elsif(@out_hash==1){ return($out_hash[0]) }
}

#________________________________________________________________________________
# Title     : print_clusfile_from_hash
# Usage     : &print_clusfile_from_hash(\%hash)
# Function  : To print out a file in cluster file format from an input hash containing the genes as keys and the cluster number as values.
# Example   :
# Keywords  : print_single_linkage_cluster, print_cluster_file
# Options   :
# Author    : Sarah A. Teichmann, jong@ebi.ac.uk
# Version   : 1.2
#--------------------------------------------------------------------------------
sub print_clusfile_from_hash {
    my ($i, $j, $k, $gene, @subclus, %hash, $single_linkage_cluster);
    $single_linkage_cluster="single_linkage.sclu";
    open(SING, ">$single_linkage_cluster") or die "\n# $0: print_clusfile_from_hash: failed to open $single_linkage_cluster\n";
    if( @_==1 and ref($_[0]) eq 'HASH'){%hash=%{$_[0]};}

    my @clusters=values(%hash);

    @clusters=@{&remove_dup_in_array(\@clusters)};
    my @genes=keys(%hash);
    for ($i=0; $i<@genes; $i++){
            my $clus=$hash{"$genes[$i]"};
            push(@{"$clus"},$genes[$i]);
            next;
    }

    my (%sizes);
    for ($i=0; $i<@clusters; $i++){
            my $cluster=$clusters[$i];
            @{"$cluster"}=@{&remove_dup_in_array(\@{"$cluster"})};
            my $size=@{"$cluster"};
            $sizes{"$size"}.="$cluster\n";
            next;
    }
    my @clus_sizes=keys(%sizes);
    @clus_sizes=sort {$a<=>$b} (@clus_sizes);

    for ($i=0; $i<@clus_sizes; $i++){
            my $clus_size=$clus_sizes[$i];
            unless ($clus_size>1){next;}
            print SING "Cluster size $clus_size\n";
            my @subclus=split(/\n/,$sizes{"$clus_size"});
            for ($j=0; $j<@subclus; $j++){
                    my $clus=$subclus[$j];
                    print SING "Cluster $clus\n";
                    for ($k=0; $k<@{"$clus"}; $k++){
                            $gene=${"$clus"}[$k];
                            print SING " 1 1 $gene\n";
                            next
                            }
                    next;
         }
         next;
    }
    close(SING);
    return(\$single_linkage_cluster);
}

#______________________________________________________________
# Title     : get_internal_dup_in_a_cluster
# Usage     :
# Function  :
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  :
# Options   : _  for debugging.
#             #  for debugging.
# Returns   :
# Argument  :
# Version   : 1.1
#--------------------------------------------------------------
sub get_internal_dup_in_a_cluster{
	$cluster_line=$_[0] || ${$_[0]};
	my($i, $j, @seq, %out, $seq_name);
	my $overlap_factor=40;
	my $min_inside_dom_size=40;
	@seq=split(/ +/, $cluster_line);  ## These sequence are single seq with different regions
	@seq= map{$_->[0]} sort{$a->[1] cmp $b->[1] or $a->[2] <=> $b->[2] }
			             map {/^((\S+)_(\d+)\-(\d+) *.*)$/ && [$1, $2, $3, $4]} @seq;

	F1:for($i=0; $i< @seq; $i++){
	   $seq1=$seq[$i];
	   if($seq1=~/^(\S+)_(\d+)\-(\d+)/){
		  $seq_name=$1;
		  $start1=$2;
		  $end1=$3;
	   }
	   F:for($j=1; $j< @seq; $j++){
		  $seq2=$seq[$j];
		  if($seq1 eq $seq2){ next } ### Skip IDENTICAL ones (xxxx_1-10, xxxx_1-10)
		  if($seq2=~/^(\S+)_(\d+)\-(\d+)/){
			 $start2=$2;
			 $end2=$3;
		  }
		  $leng2=$end2-$start2;
		  $margin=$leng2/12;   ## 8% overlap is regarded as not overlapping

		  if(( ($start1+$margin) > $end2)||
		    ( ($start2+$margin) > $end1)){ # skips non overlapping seqlets

			$out{"$start1\-$end1"}.="$start2\-$end2 ";

			splice(@seq, $j, 1);
			$j--;
		  }
	   }
	}
	#@out=sort (@out);
	#@out=@{&remove_dup_in_array(\@out)};
	#@out=@{&remove_similar_seqlets(\@temp, "f=2")};
	return(\%out);
}
#________________________________________________________________________
# Title     : read_head_box
# Usage     : %entries = %{&read_head_box([\$file_to_read, \@BOXED ] )};
# Function  : Reads the introductory header box(the one you see on top of sub routines of
#             Jong's programs.). Make a hash(associative array) to put entries
#             and descriptions of the items. The hash values have new lines '\n' are
#             attached, so that later write_head_box just sorts Title to the top
#             and prints without much calculation.
#             This is similar to read_head_box, but
#             This has one long straight string as value(no \n inside)
#             There are two types of ending line one is Jong's #---------- ...
#             the other is Astrid's  #*************** ...
# Example   : Output is something like
#             ('Title', 'read_head_box', 'Tips', 'Use to parse doc', ...)
# Warning   :
# Keywords  : open_head_box, open_headbox, read_headbox
# Options   : 'b' for remove blank lines. This will remove all the entries
#             with no descriptions
# Returns   : A hash ref.
# Argument  : One or None. If you give an argu. it should be a ref. of an ARRAY
#              or a filename, or ref. of a filename.
#             If no arg is given, it reads SELF, ie. the program itself.
# Version   : 2.7
#--------------------------------------------------------------------
sub read_head_box{
	my($i, $c, $d, $j, $s, $z, @whole_file, $title_found, %Final_out,
	  $variable_string, $TITLE, $title, @keys, $end_found, $line, $entry,
	  $entry_match, $End_line_num, $remove_blank,  $title_entry_null,
	  $end_found, $Enclosed_entry, $Enclosed_var, $blank_counter,
	  $title_entry_exist, $entry_value, $temp_W, $Warning_part, $tmp,
	  $option_tb_found
	);

	if(ref($_[0]) eq 'ARRAY'){ ## When array is given
	  @whole_file = @{$_[0]};
	}elsif(-e ${$_[0]}){       ## When filename is given in a ref
	  open(FILE, "${$_[0]}");
	  @whole_file=(<FILE>);
	}elsif(-e $_[0]){          ## When filename is given
	  open(FILE, "$_[0]");
	  @whole_file=(<FILE>);
	}elsif( $_[0] eq 'b'){          ## When filename is given
	  $remove_blank = 1;
	}elsif( ${$_[0]} eq 'b'){          ## When filename is given
	  $remove_blank = 1;
	}else{
	  open(SELF, "$0");
	  @whole_file=(<SELF>);
	}
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	for($i=0; $i<@whole_file; $i++){
	 $whole_file[$i] =~ tr/\t/ {7}/;  ## This is quite important to some parsing!!!
	 #########################################
	 ##  The first and second line of box 1 ##
	 #########################################
	 if( ($whole_file[$i]=~/^#[_\*\~\-\=]{20,}$/)&&    ##  '#______' is discarded
		 ($whole_file[$i+1]=~/ *\# {0,3}([TitlNam]+e) {0,8}: {1,10}([\w\.:]*) *(Copyright.*)/i) ){
		 $TITLE = $1;      $title = "$2\n";   $Final_out{'Warning'}.="$3\n";
		 $entry_match=$TITLE; ## The very first $entry_match is set to 'Title' to prevent null entry
		 if($TITLE =~ /^Title|Name$/i){   #
			  if( ($title=~/^\s+$/)||( $title eq "\n") ){
				  $title_entry_null =1;  $title = '';  }    }
		 $Final_out{$TITLE}=$title;
		 $title_found ++ ;   $i++;  ## << this is essential to prevent reading the same line again.
		 last if $title_found > 1;    }

	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 ## The first and second line of box 2, #__________ or #**************
	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 elsif( ($end_found != 1)&&($whole_file[$i]=~/^#[_\*]{20,}$/)&&
		 ($whole_file[$i+1]=~/^# {1,3}(\w{1,6}\s{0,2}\w+) {0,7}: {1,5}(.*) */i) ){
		 $title_found ++ ;        $i++;
		 $entry_match=$1;       $entry_value=$2;
		 $entry_match =~ s#^\S#(($tmp = $&) =~ tr/[a-z]/[A-Z]/,$tmp)#e;  ## Capitalize words
		 $Final_out{$entry_match}.= "$entry_value\n";
		 last if $title_found > 1;  next;   }

	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 ##  'Enclosed' : section. After this, everything is read without discrimination ##
	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 elsif( ($Enclosed_entry == 1)&&($whole_file[$i] =~/^#{1,} {1,}(.*)$/) ){
		 $Final_out{$Enclosed_var}.= "$1\n";    }

	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 ##  With proper entry 1 : for  'eg)'
	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 elsif( ($end_found != 1)&&($title_found==1)&&
		 ($whole_file[$i]=~ /^# {1,12}(eg ?\)) {0,8}(.*)/i)){
		 $entry_match='Example';
		 $Final_out{$entry_match}.= "$2\n";
	 }

	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 ##  With PROPER entry 2 : descriptins like. 'Ussage : ssssssxxjkk  kj'
	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 elsif( ($end_found != 1)&&($title_found==1)&&
		 ($whole_file[$i]=~ /^# {1,2}(\w{1,4}\s{0,2}\w{1,7}) {0,8}[:\)] {0,6}(.*) */i)){
		 $entry_match=$1;       $entry_value=$2;
		 $entry_match =~ s#^\S#(($tmp = $&) =~ tr/[a-z]/[A-Z]/,$tmp)#e;
		 $Final_out{$entry_match}.= "$entry_value\n";
		 if($entry_match=~/^(Enclosed?)$/i){
			  $Enclosed_entry = 1;  $Enclosed_var=$1;        }    }

	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 #  With proper entry 3 : descriptins like. 'Ussage :', But blank description ##
	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 elsif( ($end_found != 1)&&($title_found==1)&&
		 ($whole_file[$i]=~ /^# {1,2}(\w{1,4}\s{0,2}\w{1,7}) {0,8}[:\)]( {0,})$/i)){
		 $entry_match=$1;       $entry_value=$2;
		 $entry_match =~ s#^\S#(($tmp = $&) =~ tr/[a-z]/[A-Z]/,$tmp)#e;
		 $Final_out{$entry_match}.= " $entry_value\n";
		 if($entry_match=~/^(Enclosed?)$/i){
			  $Enclosed_entry = 1;  $Enclosed_var=$1;      }    }

	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 #  $option variable matching                ##
	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 elsif( ($end_found != 1) && ($title_found==1) &&
		 ($whole_file[$i]=~ /^# {1,15}([\$\@]\w+ +[\w=\>]+ +\S+ \w+ \S+ *.*)/ )){
		 $Final_out{$entry_match} .= "$1\n";  }

	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 ###  all space line matching                 ##
	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 elsif( ($end_found != 1)&&  ##<----- If blank line is matched. Take the line
		 ($title_found==1)&&($whole_file[$i]=~/^# {0,}$/) ){
		 $blank_counter++;
		 if($blank_counter > 2){ $blank_counter--; }
		 else{ $Final_out{$entry_match}.= " \n";  }     }

	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 ###  Anything after 3 space to 12 positions  ##
	 ###  To match 'examples' etc. INC. ':'       ##
	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 elsif( ($end_found != 1)&&
		 ($title_found==1)&&($whole_file[$i]=~/^#( {2,12})(.+)/) ){
		 $Final_out{$entry_match}.= "$1$2\n"; $blank_counter=0; }

	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 ###  Anything after 1 space to 11 positions  ##
	 ###  To match 'examples' etc. EXC. ':'       ##
	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 elsif( ($end_found != 1)&&
		 ($title_found==1)&&($whole_file[$i]=~/^# {1,12}([^:.]+)/) ){
		 $Final_out{$entry_match}.= "$1\n"; $blank_counter=0;}

	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 ###-------End of the read_box reading--------##
	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 elsif( ($title_found==1)&&
		 ($whole_file[$i]=~ /^#[\~\=\*\-]{15,}/)){  ## to match '#-----..' or '#******..'(Astrid's)
		 $End_line_num = $i;       $end_found++;
		 last;      }

	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 #  <<<<  Check if there is option table >>>>  #
	 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 elsif( (/^#{10,} option table of this program   #{10,}/)&&($end_found >=1) &&($title_found==1)){
				 $option_tb_found++; ### This is a global var.
	 }
	} ## < End of for loop


	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	### If title is not there at all     ####
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	@keys=keys %Final_out;
	for(@keys){
	  if(/^Title$/i){    ## No Entry of Title at all??
		  $TITLE =$&;
		  $title_entry_exist = 1;
		  if($Final_out{$_}=~/^ *$/){   ## if Title => Null or just space
			  $title_entry_null = 1;    }  }  }

	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	### When title entry is not there    ####
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	if( $title_entry_exist != 1){
		for($s=$End_line_num+1; $s < $End_line_num+20; $s++){
			if( $whole_file[$s] =~ /^sub {1,5}([\w\.]+) {0,6}\{/){
				$Final_out{'Title'} = "$1\n";   last;       }
			elsif( $whole_file[$s] =~/^#________________________+/){
				#######################################
				## Uses running file name as titile  ##
				#######################################
				$Final_out{'Title'} = "$0";     last;
			}else{
				#######################################
				## Uses running file name as titile  ##
				#######################################
				$Final_out{'Title'} = "$0";
			}
		}
	}
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	### When title is blank              ####
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	elsif($title_entry_null ==1){  ## It looks for 'sub xxxxx{ ' line to get title
		### $End_line_num is the last line read.
		for($s = $End_line_num+1; $s < $End_line_num+20; $s++){
			if( $whole_file[$s] =~ /^sub {1,5}(\w+\.*\w*) {0,7}{/){
				$Final_out{$TITLE} = "$1\n";    last;     }
			elsif( $whole_file[$s] =~/^#________________________+/){
				#######################################
				## Uses running file name as titile  ##
				#######################################
				$Final_out{$TITLE} = "$0";     last;
			}else{
				#######################################
				## Uses running file name as titile  ##
				#######################################
				$Final_out{$TITLE} = "$0";
			}
		}
	}
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	## Error handling, if no head box is found   ####
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	if($title_found < 1){ print "\nFatal: No headbox found by read_head_box2 sub.\n";  }
	\%Final_out;
}               ##<<--- ENd of the sub read_head_box
#________________________________________________________________________________
# Title     : import_ENV_vars
# Usage     :
# Function  : You can use any ENV set variables directly in your
#             program. So, you can say $USER instead of $ENV{'USER'}
# Example   :
# Keywords  : import_Env_vars, import_ENV_variables
# Options   :
# Version   : 1.1
#--------------------------------------------------------------------------------
sub import_ENV_vars{
		my($caller_package, $env_var_name);
		$caller_package=caller;
		foreach  $env_var_name (keys %ENV){
			 ${"${'caller_package'}::${'env_var_name'}"}=$ENV{$env_var_name}; ## ' ' are necessary
		}
		#print "\n# import_ENV_vars: ALL the ENV settings are imported to $0 program\n" if $verbose;
}

#__________________________________________________________________________
# Title     : sort_by_cluster_size
# Usage     : @out=@{&sort_by_cluster_size(\@input_line_array)};
# Function  : it sorts by the 1st digit before '-'  as in 2-183_cluster, 2-140_cluster,
#               etc.
# Example   :
# Keywords  : sort_by_columns, sort_by_text_columns, sort_by_column_numerically
#             sort_by_pattern
# Options   :
# Version   : 1.2
#----------------------------------------------------------------------------
sub sort_by_cluster_size{
	 my (@in, @M, $col);
	 if(@_ < 1  ){ print "\n# FATAL: sort_by_cluster_size needs 1 argument\n"; die }
	 if(ref $_[0] eq 'ARRAY'){ 	  @in = @{$_[0]};      }else{ 	  @in = @_;    }
	 $col=0;
	 @in= map {$_->[0]} sort { $a->[1] <=> $b->[1] } map { [$_, ($_=~/^(\S+)\-/)[$col] ] } @in;
	 return(\@in);
}


#______________________________________________________________
# Title     : check_linkage_of_2_similar_seqlet_sets
# Usage     :
# Function  : connects two clusters of seqlets if they share
#              identical or near identical seqlets
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  :
# Options   : _  for debugging.
#  $factor = by f=  # eg)  "f=$factor" in the higher level sub
#
# Returns   :
# Argument  :
# Version   : 1.9
#--------------------------------------------------------------
sub check_linkage_of_2_similar_seqlet_sets{
	 my ($seq1, $name1, $start1, $end1, $seq2, $leng1, $leng2,
	    $name2, $start2, $end2, $diff_start,  $diff_end, @splited1,
	    @splited2, $link_or_not, $factor, $s, $t);
	 @splited1=@{$_[0]};
	 @splited2=@{$_[1]};

	 $link_or_not=0;
	 $factor=7;  # this means 70% sequence region overlap of the intermediate is chosen

	 if($_[2]=~/f=(\S+)/i){	  $factor=$1;	 }

	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 # Breaks the splited1 and splited2 strings to words to compare
	 #_________________________________________________________________
	 F1: for($s=0; $s<@splited1; $s++){
			#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			# Checks if the input has :  XXXXX_10-400 format or simple name like XXXXXX
			#______________________________________________________________________________
			if($splited1[$s]=~/^((\S+)_(\d+)\-(\d+))/){
					$seq1=$1;
					$name1=$2;
					$start1=$3;
					$end1=$4;
			}else{   $seq1=$splited1[$s]; $name1=$start1=$end1='';    }
			#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			# Breaks the splited2
			#_____________________________________________________________________
			F2: for($t=0; $t< @splited2; $t++){
				 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				 # If splited1 has XXXX_10-100 format(def($name1)), then compare regions
				 #_________________________________________________________________________
				 if($name1 and $splited2[$t]=~/^((\S+)_(\d+)\-(\d+))/){
						 $seq2=$1;
						 $name2=$2;
						 $start2=$3;
						 $end2=$4;
						 if($seq1 eq $seq2){ $link_or_not=1; return(\$link_or_not) }
						 if($name1 ne $name2){
								 next F2;
						 }elsif($name1 eq $name2){ ## ~~~~~~~~~~~~~ THIS is the MOST IMP CORE PART ~~~~~~~~~~~~~
								 $leng1=$end1-$start1;
								 $leng2=$end2-$start2;
								 if($leng1 >= $leng2){ $smaller_leng=$leng2; }else{ $smaller_leng=$leng1; }
								 $diff_start=abs($start1-$start2);
								 $diff_end  =abs($end1  -$end2  );
								 if((($diff_start+$diff_end)/2) <= ($smaller_leng/$factor) ){
										$link_or_not=1;
										return(\$link_or_not);
								 }
						 }## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				 }else{
				     $seq2=$splited2[$t];
						 if($seq1 eq $seq2){ $link_or_not=1; }
				 }
			}
	 }
	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 # If $link_or_not has become 1 in any of the above part, 1 is returned
	 #________________________________________________________________________
	 return(\$link_or_not);
}

#_____________________________________________________________________________
# Title     : remove_similar_seqlets
# Usage     : @seqlets=@{&remove_similar_seqlets(\@split)};
# Function  : merges(gets average starts and ends ) of similar
#             seqlets to reduce them into smaller numbers. This can also handle
#              names like XLBGLO2R_8-119_d1hlm__.
#
# Example   : @seqlets=@{&remove_similar_seqlets(\@mrg1, $mrg2, \@mrg3)};
#               while @mrg1=qw(M_2-100 M_2-110 M_8-105 M_4-108 N_10-110 N_12-115);
#                     $mrg2='Z_3-400 Z_2-420';
#                     @mrg3=('X_2-300 X_3-300', 'X_2-300', 'X_5-300 X_2-301' );
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  : merge_sequence_names, merge_seq_names, merge_sequence_ranges
#             merge_seq_ranges
# Options   : _  for debugging.
#             #  for debugging.
#             f= for factor
#             S  for shorter region matched is used
#             A  for average region matched is used
#             L  for larger region matched is used
#
# Version   : 2.1
#-------------------------------------------------------------------------------
sub remove_similar_seqlets{
	 my ($i, $j, $seq1, $smaller_leng, $leng1, $leng2, $start1, $end1, $seq2, $start2,
	   $av_diff, $num_of_seq, $av_end, $av_start, $end2, @seqlets,
	   @array_input, @seqlet, $tail1, $tail2, $shorter_region, $larger_region,
	   $average_region, $factor);
	 $factor=7;  ## !!! This var makes big difference in the final clustering
	 $average_region = 'A'; ## default is to get the average of comparing regions

	 for($i=0; $i< @_; $i++){
	    if(ref($_[$i]) eq 'ARRAY'){
		     @array_input=@{$_[$i]};
		     for($j=0; $j<@array_input; $j++){
			      @seqlet=split(/ +/, $array_input[$j]);
					  push(@seqlets, @seqlet);
		     }
		     #if($verbose){
				 #   print "\n# remove_similar_seqlets: ARRAY ref is given as input\n";
				 #   print "#  They are: @seqlets\n";
				 #}
	    }elsif($_[$i]=~/f=(\S+)/){   $factor=$1
	    }elsif($_[$i]=~/^(S) *$/){   $shorter_region=$1 ; $average_region=0;
	    }elsif($_[$i]=~/^(L) *$/){   $larger_region =$1 ; $average_region=0;
	    }elsif($_[$i]=~/^(A) *$/){   $average_region=$1 ; $shorter_region=$larger_region=0;
	    }elsif($_[$i]=~/\S+\_\d+\-\d+/){
		     push(@seqlets, split(/ +/, $_[$i]) );
	    }elsif(ref($_[$i]) eq 'SCALAR' and ${$_[$i]}=~/\S+\_\d+\-\d+/){
	       push(@seqlets, split(/ +/, ${$_[$i]}) );
	    }
	 }
	 #print "\n# remove_similar_seqlets : I am using \$factor : $factor\n" if $verbose;

	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 # Sorting is necessary as I am not doing the real thorough comparison
	 #______________________________________________________________________
	 $num_of_seq=@seqlets=sort @seqlets;
	 #if($verbose){
	 #	 print "\n# (1) remove_similar_seqlets: Num of seq to merge: $num_of_seq (from \@seqlets)";
	 #	 print "\n# (2) remove_similar_seqlets: \@seqlets are  @seqlets\n";
	 #}
	 my ($short_start, $large_start, $short_end, $large_end);

	 for($i=0; $i< @seqlets; $i++){
	    if($seqlets[$i]=~/^ *(\S+)_(\d+)\-(\d+)(\S*)/){  ## last (\S*) is necessary for XLBGLO2R_8-119_d1hlm__
		     my($seq1, $start1, $end1, $tail1)=($1, $2, $3, $4);
	       if($seqlets[$i+1]=~/^(\S+)_(\d+)\-(\d+)(\S*)/){
						 ($seq2, $start2, $end2, $tail2)=($1, $2, $3, $4);
						 if($seq1 eq $seq2){
								#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
								$diff_start=abs($start1 - $start2);
								$diff_end  =abs($end1   - $end2  );
								$leng1=$end1-$start1;
								$leng2=$end2-$start2;

								if($leng1 >= $leng2){ $smaller_leng=$leng2; }else{ $smaller_leng=$leng1; }
								if( ($diff_start+$diff_end)/2 <= $smaller_leng*($factor/10) ){

										if($average_region){
											 $av_start=int(($start1+$start2) / 2);
											 $av_end  =int(($end1 + $end2) / 2);
															 $seqlets[$i]="$seq1\_$av_start\-${av_end}$tail1";  # $tail1 is for names like XLBGLO2R_8-119_d1hlm__
															 # print "\n# new seqlet : $seqlets[$i]\n" if $verbose;
											 splice(@seqlets, $i+1, 1);
											 $i--;
										}else{
											 if($start1 < $start2){
											 $short_start=$start2; $large_start=$start1;  ## note that short start should be $start2 if $start2 is bigger
											 }else{
													$short_start=$start1; $large_start=$start2;
											 }
											 if($end1 < $end2){
													$short_end=$end1;  $large_end=$end2;
											 }else{
													$short_end=$end2;  $large_end=$end1;
											 }
											 if($shorter_region){
													 $seqlets[$i]="$seq1\_$short_start\-${short_end}$tail1";
											 }elsif($larger_region){
													 $seqlets[$i]="$seq1\_$large_start\-${large_end}$tail1";
											 }

											 splice(@seqlets, $i+1, 1);
											 $i--;
										}
								}
						 }
		     }
	    }
	 }
	 #print "\n# (3) remove_similar_seqlets: The final out are: @seqlets\n" if $verbose;
	 return(\@seqlets);
}

#______________________________________________________________________________
# Title     : make_seq_index_file
# Usage     : @idx_files_made=@{&make_seq_index_file(\@file)};
# Function  : creates xxxx.fa.idx file and makes a link to pwd. If @file contains
#              names with .idx extension already, it will not put another idx
#              index to it.
# Example   :
# Keywords  : make_fasta_seq_index_file, create_seq_index_file, make_idx_file,
#             create_idx_file, create_seq_idx_file, make_index_file, create_index_file
#             make_sequence_index_file, create_sequene_index_file
# Options   :
# Version   : 1.4
#----------------------------------------------------------------------------
sub make_seq_index_file{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	my(@index_files_made, $fasta_db_input, $fasta_db_idx, %index);
	print "\n# make_seq_index_file : input \@file was @file\n";

	for($i=0; $i< @file; $i++){
		$fasta_db_input=$file[$i];
		if($fasta_db_input !~/\S+\.idx$/){
			$fasta_db_idx="$fasta_db_input.idx";
		}

		open(FASTA_DB, "$fasta_db_input");
		open(FASTAIDX, ">$fasta_db_idx");

 	    print FASTAIDX "# fasta_index for $fasta_db_input\n";
		while(<FASTA_DB>){
			if(/^\> {0,4}(\S+) */){
				$index{$1}=tell(FASTA_DB);
				print FASTAIDX "\n$1 $index{$1}";
			}
		}
		close(FASTA_DB);
		close(FASTAIDX);
		if(-s $fasta_db_idx){
			print "\n# The size of $fasta_db_idx is more than 0, looks O.K. \n";
			push(@index_files_made, $fasta_db_idx);
			system("ln -s $fasta_db_idx .");
		}else{
		    print "\n# The size of $fasta_db_idx is less than 0, ERROR??\n";
		}
	}
	if(@file < 2){
	   return( \$fasta_db_idx );
	}else{
	   return(\@index_files_made);
	}
}
#______________________________________________________________
# Title     : sort_words_in_string
# Usage     :
# Function  : sort words in strings sperated by ' ' or "\n"
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  : sort_words_in_sequences, sort_sequences_in_string,
#             sort_strings_in_string, sort_string_by_words, sort_elements_in_string
# Options   : _  for debugging.
#             #  for debugging.
# Returns   :
# Argument  :
# Version   : 1.1
#--------------------------------------------------------------
sub sort_words_in_string{
	 my @in=@{$_[0]} || @_;
	 my @OUT;
	 for (@_){
	  push(@OUT, join(' ', sort split(/ +|\n/) ));
	 }
	 return(\@OUT);
}
#________________________________________________________________________
# Title     : show_array
# Usage     : &show_array(\@input_array);
# Function  : for debugging purpose. Shows any array elem line by line.
# Example   : Output:      item1
#             Output:      item2
#             Output:      item3
# Warning   : can handle scalar ref, too.
# Keywords  :
# Options   : -h  for horizontal display of elements
#             c   for compact (do not put new line between array chunk)
#             s   for putting new line between arrays
# Returns   :
# Argument  :
# Version   : 2.4
#--------------------------------------------------------------------
sub show_array{
	my($k, $i, $t,  @in2, $in, $space, $show_horizontally, $compact);
	my(@in)=@_;

	## This is to get the option of 'horizontal' to make horizontal output.
	for($t=0; $t < @in ; $t++){
	 if($in[$t] =~/\-?[hH][orizontal]*$/){   ### No ref.
		 $show_horizontally = "h";
		 splice(@in, $t, 1);  $t--;
	 }elsif(${in[$t]} =~/-?[hH][orizontal]*$/){  ### ref.
		 $show_horizontally = "h";
		 splice(@in, $t, 1);  $t--;
	 }elsif(${in[$t]} =~/^s$/i){  ### ref.
		 $space = "s";
		 $compact='';
		 splice(@in, $t, 1);  $t--;
	 }elsif(${in[$t]} =~/^c$/i){  ### ref.
		 $compact = "c";
		 $space='';
		 splice(@in, $t, 1);  $t--;
	 }
	}

	for($k=0; $k < @in; $k++){
	 if(ref($in[$k]) eq 'ARRAY'){
		 &show_array(@{$in[$k]}, "$show_horizontally", "$compact", "$space" );
	 }elsif(ref($in[$k]) eq 'SCALAR'){
		 if($show_horizontally eq "h"){
			 print ${$in[$k]}, ",  ";
		 }elsif(  $show_horizontally ne "h"){
			 print ${$in[$k]}, "\n";
		 }
	 }elsif( !ref($in[$k]) ){
		 if($show_horizontally eq 'h'){
			 print  $in[$k] , ",  ";
		 }elsif(  $show_horizontally ne "h"){
			 print  $in[$k] , "\n";
		 }
	 }
	}
	if($compact !~/^c$/i){
	print "\n"; #### This is necessary to distinguish different arrays.
	}
}


#________________________________________________________________________________
# Title     : mspa_single_link_hash
# Usage     : %hash=%{&mspa_single_link_hash(\@mspa_files, E-value);
# Function  : To make a hash with all the genes in the mspa files as the keys,
#             which are linked at or below the E-value threshhold,
#             with the values denoting the cluster number
# Example   :
# Keywords  : single_linkage, mspa_single_linkage, mspa_single_linkage_hash
#              make_mspa_single_link_hash
# Options   :
# Author    : Sarah A. Teichmann with thanks to Alex Bateman,
#               sat@mrc-lmb.cam.ac.uk, jong@salt2.med.harvard.edu
# Version   : 1.7
#--------------------------------------------------------------------------------
sub mspa_single_link_hash{
		my (@mspa_files, $i, $j, $k, $e_val, $gene_1, $gene_2,
				@mspcont, $gene_1, $gene_2, $E_cut, %hash, $array);

		$E_cut=0.001; ## Default Eval cut

		if( @_==2 and ref($_[0]) eq 'ARRAY'){
				@mspa_files=@{$_[0]};
				$E_cut=${$_[1]} || $_[1];
		}else{
				print "Subroutine mspa_single_link_hash takes one input array and the E-value as its arguments!!" && die;
		}
		#print "\n# (i) mspa_single_link_hash : \$E_cut is $E_cut \nwith @mspa_files\n";
		print "\n# (i) mspa_single_link_hash : \$E_cut is $E_cut \nwith @mspa_files\n" if $verbose;

		for ($i=0; $i<@mspa_files; $i++){
				if ($mspa_files[$i]=~/\S+\.mspa$/){
						open(MSP, "$mspa_files[$i]");
						@mspcont=<MSP>;
						close(MSP);
				}elsif ($mspa_files[$i]=~/\S+\.mspa\.gz$/){
						@mspcont=`gunzip -c $mspa_files[$i]`;
						#print "\n @mspcont \n"; die;
				}

				$array++;
				for ($j=0; $j<@mspcont; $j++){
						if ($mspcont[$j]=~/^\s*\S+\s+(\S+)\s+\S+\s+\d+\s+\d+\s+(\S+)\s+\d+\s+\d+\s+(\S+)/){
								$e_val=$1;
								unless($e_val<=$E_cut){next;}
								$gene_1=$2;
								$gene_2=$3;
								#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
								# Removing REGION information as Sarah's can not handle regions
								#_______________________________________________________________
								if($gene_1=~/^(\S+)_\d+\-\d+/){ $gene_1=$1 }
								if($gene_2=~/^(\S+)_\d+\-\d+/){ $gene_2=$1 }

								if ($gene_1 eq $gene_2){next;}
								if ( ! $hash{"$gene_1"} and $gene_1){
										$hash{"$gene_1"}="$array";
										push (@{"$array"}, $gene_1);
								}
								if (! $hash{"$gene_2"} and $gene_2){
										$hash{"$gene_2"}="$array";
										push (@{"$array"}, $gene_2);
								}
								if ($hash{"$gene_1"}==$hash{"$gene_2"}){next;}
								if ( $hash{"$gene_1"} gt  $hash{"$gene_2"}){
										push (@{"$hash{$gene_2}"}, @{"$hash{$gene_1}"});
										for ($k=0; $k<@{"$hash{$gene_2}"}; $k++){
												 $hash{${"$hash{$gene_2}"}[$k]}=$hash{"$gene_2"};
												 next;
										}
										next;
								}
								if ( $hash{"$gene_2"} gt  $hash{"$gene_1"}){
										push (@{"$hash{$gene_1}"}, @{"$hash{$gene_2}"});
										for ($k=0; $k<@{"$hash{$gene_1}"}; $k++){
												 $hash{${"$hash{$gene_1}"}[$k]}=$hash{"$gene_1"};
												 next;
										}
										next;
								}
								next;
						}else{ next; }
				}
				next;
		}
		return (\%hash);
} ## mspa_single_link_hash






#________________________________________________________________________
# Title     : write_fasta_seq_by_seq
# Usage     : &write_fasta_seq_by_seq(\%hash, [$extension], [\$output_filename]);
# Function  : accepts one hash of multiple sequences and writes many files
#             of single sequences by using the names as file names.
#             If $extension is provided, it writes an output as in
#             the below example (seq1_sc.fasta). If not, it just attach
#             'fasta' to files.
#             This needs, hash of 'name', 'actual sequence as value'
# Example   : with >xxxx
#                  ASDFASDFASDFASDFASDFASDFASDF
#                  >yyyy
#                  ASDFASDFASDFASDFASDFASDFSDAFSD
#
#             You will get two files (xxxx.fasta, yyyy.fasta)
# Keywords  : write_each_fasta, write_single_fasta, write_fasta_single
#             single_fasta_write, write_fasta_files_seq_by_seq,
#             write_single_fasta_files, write_seq_by_seq
# Options   : can specify extension name.
#             e  for checking fasta file exists or not and skipps if so
#             r for rename the sequences so that Clustalw would not complain with 10 char limit
#               so result wuld be:  0 ->ASDFASDF, 1->ASDFASFASF, 2->ADSFASDFA
#   $write_rv_seq_as_well=R by R  # write reverse seq as well as forward seq
#   $write_pure_seq_only=o by o
#   $extension= by E=
#
# Returns   : nothing. default OUTPUT file name is '$key.fasta' !!
# Category  :
# Version   : 2.4
#--------------------------------------------------------------------
sub write_fasta_seq_by_seq{
	my ($i, $exists_opt, $rename_seq_opt, $out_file_name_given,
  		$write_rv_seq_as_well, $output_file_rv, $extension,
		$output_file, $write_pure_seq_only, $default_extention);
    $default_extention='spfa';  ## spfa ==> Single Protein FAsta

		for($i=0; $i< @_; $i++){
            if($_[$i]=~/e$/){   $exists_opt=1; splice(@_, $i, 1);  $i--;
            }elsif($_[$i]=~/r$/){ $rename_seq_opt='r';  splice(@_, $i, 1);  $i--;
            }elsif($_[$i]=~/o$/){ $write_pure_seq_only='o';  splice(@_, $i, 1);  $i--;
            }elsif($_[$i]=~/R$/){ $write_rv_seq_as_well='R';  splice(@_, $i, 1);   $i--;
            }elsif($_[$i]=~/E=(\S+)/){ $extension=$1;  splice(@_, $i, 1);   $i--;
            }elsif($_[$i]=~/\S\.fasta/ or -e $_[$i] ){  $out_file_name_given=1;  $output_file = $_[$i];
                 splice(@_, $i, 1);     $i--;
            }elsif( ref ($_[$i]) eq 'SCALAR'){
                 if( ${$_[$i]} =~/\.[msp]*fa/ or -e ${$_[$i]} ){
                        $out_file_name_given=1;   $output_file=${$_[$i]};  splice(@_, $i, 1);  $i--;
                 }
            }
		}
		my(%temp_hash, %temp_hash_rv, $key );
		my(%input)     =%{$_[0]};


		for $key (keys %input){
				my(%temp_hash, %temp_hash_rv, $output_file_rv, $output_file);
				%temp_hash=%temp_hash_rv=();
				$temp_hash{$key}=$input{$key};
				if($write_rv_seq_as_well){
						 $key_rv="$key\_rv";
						 $temp_hash_rv{$key_rv}=reverse($input{$key});
				}
				unless($output_file){
						 #print "\n# (i) \$outputfile is NOT given as an argument to write_fasta_seq_by_seq\n";
						 if ($extension){
									if($write_rv_seq_as_well){
											$output_file_rv = "$key\_rv\.$extension";
											$output_file = "$key\.$extension";
									}else{
											$output_file = "$key\.$extension";
									}
						 }else{
                                    #print "\n    (i) You didnt specify \$extension in write_fasta_seq_by_seq, \"$default_extention\" used\n";
									if($write_rv_seq_as_well){
											$output_file_rv = "$key_rv\.$default_extention";
                                            $output_file = "$key\.$default_extention";
									}else{
                                            $output_file = "$key\.$default_extention";
									}
						 }
				}else{
						 #print "\n# (i) \$output_file, $output_file is given as an argument to write_fasta_seq_by_seq\n";
						 $out_file_name; # $out_file_name is already given as an argument
						 ($base, $extension)=split(/\./, $output_file);
						 if($write_rv_seq_as_well){
									$output_file_rv = "$base\_rv\.$extension";
						 }
						 $output_file = "$base\.$extension";
				}

				if( $exists_opt==1 and -e $output_file){
						#print "# write_fasta_seq_by_seq: File $output_file exists, I wouldn\'t overwrite $output_file\n\n\n";
						next;
				}
				if( $out_file_name_given == 1){
						&write_fasta(\%temp_hash, \$output_file, $rename_seq_opt);
						if($write_rv_seq_as_well){
								$output_file_rv=$output_file;  ## because output file name is already given
								&write_fasta(\%temp_hash_rv, \$output_file_rv, $rename_seq_opt, $write_pure_seq_only);
						}
				}else{
						if($write_rv_seq_as_well){
								&write_fasta(\%temp_hash_rv, \$output_file_rv, $rename_seq_opt, $write_pure_seq_only);
						}
						#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
						# The very DEFAULT writing
						#__________________________________________________
						&write_fasta(\%temp_hash, $rename_seq_opt, $write_pure_seq_only);
				}
		}
}




#______________________________________________________________
# Title     : get_domain_inside_domain
# Usage     :
# Function  :
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  : find_dindoms, domain_inside_domain, domain_in_domain
# Options   : _  for debugging.
#             #  for debugging.
# Returns   :
# Argument  :
# Version   : 1.0
#--------------------------------------------------------------
sub get_domain_inside_domain{
	$cluster_line=$_[0] || ${$_[0]};
	my($i, $j, @seq, @out);
	my $overlap_factor=40;
	my $min_inside_dom_size=40;
	@seq=split(/ +/, $cluster_line);
	F1:for($i=0; $i< @seq; $i++){
	   $seq1=$seq[$i];
	   if($seq1=~/^(\S+)_(\d+)\-(\d+)/){
		  $seq_name=$1;
		  $start1=$2;
		  $end1=$3;
	   }
	   F:for($j=0; $j< @seq; $j++){
		  $seq2=$seq[$j];
		  if($seq1 eq $seq2){ next } ### Skip IDENTICAL ones (xxxx_1-10, xxxx_1-10)
		  if($seq2=~/^(\S+)_(\d+)\-(\d+)/){
			 $start2=$2;
			 $end2=$3;
		  }
		  if(($start1 > $end2)||($start2 > $end1)){ # skips non overlapping seqlets
			 next;
		  }
		  if(($start1 > $start2)&&($end1 < $end2)){  #   -----
			 $leng_seq1=$end1-$start1;               # ----------
			 $leng_seq2=$end2-$start2;
			 if(( ($leng_seq2/2) >= $leng_seq1 )&&
			    ($leng_seq1 > $min_inside_dom_size) ){   # if seq1 is less than 60% of seq2, it is a hidden domain
				push(@out, "$seq2\($seq1\)");
			 }
		  }elsif(($start1 < $start2)&&($end1 > $end2)){  # -----------
			 $leng_seq1=$end1-$start1;                   #   ------
			 $leng_seq2=$end2-$start2;
			 if(( ($leng_seq1/2) >= $leng_seq2)&&
			    ($leng_seq2 > $min_inside_dom_size) ){   # if seq1 is less than 60% of seq2, it is a hidden domain
				push(@out, "$seq1\($seq2\)");
			 }
		  }
	   }
	}
	return(\@out);
}
#________________________________________________________________________
# Title     : reverse_sequences
# Usage     : %out = %{&rev_sequence_one_hash(\%input_seq_hash, \%hash2,...)};
# Function  : gets ref. of strings, reverses the elems.
# Example   :
# Warning   :
# Keywords  : reverse_sequence, reverse_sequence_hash, rev_sequence_hash
# Options   :
# Returns   : one or more hash references.
# Argument  : hash, eg(1, 'skdfj', 2, 'kdfjkdj', 3, 'kdfjk');
#             Input example:
#             ..
#             >HI0256
#             FLSANVLPIAPIINGGRTAVDNITQSVSDKPFVKDIGTKIKEAIALSKYSTQPQYISTTN
#             >HI0094
#             DILRTFVKMETGLKFPKKFKLKANLALFMNRRNKRPDTIMTAVADAGQKISEAKLNTTAK
#             ..
#
#             Output example: (Reversed :-)
#             ..
#             >HI0256_rv   <<-- note the added extension
#             ALDJFLKAJFJALSDJFLAJSLFJAKLSDFJLASJDFLAJSLDFJASJDFLJSDFJSDLJ
#             >HI0094_rv
#             LASJDFLKAJFJALSDJFLKSDJLFAJLKDJFLASJDFLKDFJKDJFKDJFKDJFKJDLJ
#             ..
#
# Version   : 1.2
#--------------------------------------------------------------------
sub reverse_sequences{
	my(%rev_hash, @rev_hash_refs, $name, $name_with_ext, $i);
	for($i=0; $i < @_; $i++){
	    my %in_hash = %{$_[$i]};
		my @keys    = keys %in_hash;
		for $name (@keys ){
		    $name_with_ext = "$name\_rv";
			$rev_hash{$name_with_ext} = reverse($in_hash{$name});
		}
		push(@rev_hash_refs, \%rev_hash);
	}
	if(@rev_hash_refs ==1){ return($rev_hash_refs[0]);}
	else{ return(@rev_hash_refs);}
}

#________________________________________________________________________
# Title     : handle_arguments
# Usage     : Just put the whole box delimited by the two '###..' lines below
#             to inside of your subroutines. It will call 'handle_arguments'
#             subroutine and parse all the given input arguments.
#             To use, claim the arguments, just use the variable in the box.
#             For example, if you had passed 2 file names for files existing
#             in your PWD(or if the string looks like this: xxxx.ext),
#             you can claim them by $file[0], $file[1] in
#             your subroutine.
# Function  : Sorts input arguments going into subroutines and returns default
#             arrays of references for various types (file, dir, hash, array,,,,)
#             If you give (\@out, @file), it will put @out into @array as a ref
#             and also the contents of @out will be dereferenced and put to
#             raw_string regardless what is in it).
#
# Example   : 'handle_arguments(\@array, $string, \%hash, 8, 'any_string')
# Warning   :
# Keywords  : handling arguments, parsing arguments,
# Options   :
# Returns   : Following GLOBAL variables
#
#             $num_opt,    @num_opt     @file          @dir
#             $char_opt,   @char_opt    %vars          @array,
#             @hash        @string,     @raw_string    @range,
#
#             $num_opt has 10,20
#             @num_opt has (10, 20)
#             @file has  xxxx.ext
#             @dir has  dir  or /my/dir
#             $char_opt has 'A,B'
#             @char_opt has (A, B)
#             @array has  (\@ar1, \@ar2)
#             @hash has (\%hash1, \%hash2)
#             @string  ('sdfasf', 'dfsf')
#             @raw_string (file.ext, dir_name, 'strings',,)
#             @range has values like  10-20
#             %vars deals with x=2, y=3 stuff.
#
# Argument  : any type, any amount
# Version   : 5.0
#--------------------------------------------------------------------
sub handle_arguments{
    my($c, $d, $e, $f, $i, $j, $k, $l, $s, $t, $x, $y, $z, $char_opt, $dir, @hash,
        $file, $in_dir, $num_opt, @char_opt, @dir, @file, @string, @file_dir, @k,
        @num_opt, @raw_string,@string, @array, %vars, @range, @temp, $temp,
        @char_options, %duplicate);

    &set_debug_option;
    if(@_<1){ print chr(7),"\n This is handle_arguments. No args Passed, Error?\n"}
    elsif( (@_ ==1)&& (ref($_[0]) eq 'ARRAY') ){ # when there is only 1 argument
      push(@array, $_[0]);
      push(@k, $_[0]);
    }elsif( (@_==1)&&( !ref($_[0]) ) ){
      if(-f $_[0]){ push(@file, $_[0]);   push(@string, $_[0]) }
      elsif(-d $_[0]){ push(@dir, $_[0]); push(@string, $_[0]) }
      elsif($_[0]=~/^\d+$/){ push(@num_opt, $_[0]); $num_opt.=$_[0] }
      elsif($_[0]=~/^\w+$/){ push(@string, $_[0]); }
    }elsif(@_ >=1){ @k = @_ }

    #####______Start of  general argument handling______######
    for($k=0; $k < @k ;$k++){
      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      # Check if the input was REFERENCE
      #________________________________________________
      if( !ref($k[$k]) ){
          if($k[$k]=~ /^[\-]?([a-zA-Z]\d*) {0,5}$/){  push(@char_opt, $1); $char_opt .= "$1\,";
          }elsif($k[$k]=~ /^\-([a-zA-Z]+)$/){          ## When multiple option is given,
                          @char_options = split(/\,|/, $1);  push(@char_opt, @char_options);
                          $char_opt .= join("\,", @char_options); ## '-' should be used. eg. '-HEGI'
          }elsif($k[$k]=~ /^(\w+)\=(\S* *)$/){  $vars{$1}=$2;  $vars .= "$1\,";
          }elsif($k[$k]=~ /^(\-?\d+)$/){ push(@num_opt, $1);  $num_opt .= "$1\,";
          }elsif($k[$k]=~ /^\d+\.?\d*\-\d+\.?\d*$/){  push(@range,  $k[$k] );
          }elsif(-f $k[$k]){                          push(@file,   $k[$k] );
          }elsif(-d $k[$k]){                          push(@dir,    $k[$k] ); }
          if(    $k[$k]=~ /\/[\w\d\.\-]+\/.+?\/$/){   push(@dir,    $k[$k] );
          }elsif($k[$k]=~ /\/.+?\/\w+\.\w+$/){        push(@file,    $k[$k] );
          }elsif($k[$k]=~ /^\/[\w\d\.\-]+[\/]*$/){    push(@dir,    $k[$k] );
          }elsif($k[$k]=~ /^[\/\w\d\-\.]+\.\w+$/){    push(@file,   $k[$k] );
          }elsif($k[$k]=~ /\S\/[\/\w\d\-\.]+\.\w+$/){ push(@file,   $k[$k] );
          }elsif($k[$k]=~/^\w+[\/\\\w\d\.\-]+$/){     push(@string, $k[$k] );
                      # string does not have space, but includes '\', '/', '.'
          }else{                                      push(@raw_string, $k[$k] ); }
      }elsif( ref($k[$k]) ){
          if( ref($k[$k]) eq "SCALAR"){
              if(${$k[$k]} =~ /^[\-]?([a-zA-Z]\d*) {0,5}$/){ push(@char_opt, $1); $char_opt  .= "$1\,";
              }elsif(${$k[$k]}=~ /^\-([a-zA-Z]+)$/){ push(@char_opt, @char_options);
                              $char_opt  .= join("\,", @char_options);  ## as an option string.
              }elsif(${$k[$k]}=~ /^(\w+)\=(\S* *)$/){  $vars{$1}=$2;  $vars .= "$1\,";
              }elsif(${$k[$k]}=~ /^(\-?\d+)$/){ $num_opt .= "$1\,";  push(@num_opt, $1);
              }elsif(${$k[$k]}=~ /^\d+\.?\d*\-\d+\.?\d*$/){    push(@range,  $k[$k] );
              }elsif(-f ${$k[$k]}){                            push(@file,   ${$k[$k]} );
              }elsif(-d ${$k[$k]}){                            push(@dir,    ${$k[$k]} );
              }elsif(${$k[$k]}=~ /\/[\/\w\d\.\-]+[\/].+[\/]$/){ push(@dir,    ${$k[$k]} );
              }elsif(${$k[$k]}=~ /\/\S*\.\w+ *$/){   push(@file,    ${$k[$k]} );
              }elsif(${$k[$k]}=~/^\/[\/\w\d\.\-]+[\/]*$/){     push(@dir,    ${$k[$k]} );
              }elsif($k[$k]=~ /\/[\w\d\.\-]+\/.+?\/\w+\.\w+$/){   push(@file,    $k[$k] );
              }elsif(${$k[$k]}=~ /^[\/\w\d\-\.]+\.\w+$/){      push(@file,   ${$k[$k]} );
              }elsif(${$k[$k]}=~/^\w+[\w\d\.\-]+$/){           push(@string, ${$k[$k]} );
              }else{                                           push(@raw_string, ${$k[$k]}) }
          }elsif(ref($k[$k]) eq "ARRAY"){
              my @temp_arr = @{$k[$k]}; push(@array, $k[$k]);
              for ($i=0; $i<@temp_arr; $i++){
                 if(-f $temp_arr[$i]){                            push(@file, $temp_arr[$i]); next;
                 }elsif($temp_arr[$i]=~/^\d+\.?\d*\-\d+\.?\d*$/){ push(@range,$temp_arr[$i] ); next;
                 }elsif(-d $temp_arr[$i]){                        push(@dir , $temp_arr[$i]);  next;
                 }
                 if($temp_arr[$i]=~ /\/[\/\w\d\.\-]+[\/].+\/$/){ push(@dir,    ${$k[$k]} );
                 }elsif($temp_arr[$i]=~ /\/[\w\d\.\-]+\/.+?\/\w+\.\w+$/){   push(@file,    $k[$k] );
                 }elsif($temp_arr[$i]=~/^\/[\/\w\d\.\-]+[\/]*$/){ push(@dir, $temp_arr[$i] );
                 }elsif($temp_arr[$i]=~/^[\/\w\d\-\.]+\.\w+$/){   push(@file,$temp_arr[$i] );
                                                                  push(@string,$temp_arr[$i] );
                 }elsif($temp_arr[$i]=~/^\w+[\w\d\.\-]+$/){       push(@string,$temp_arr[$i]);
                 }else{                                           push(@raw_string, $temp_arr[$i]);
              }
          }
      }elsif(ref($k[$k]) eq "HASH"){
          push(@hash,   $k[$k] ); }
      }
    }
    @raw_string=(@raw_string, @string);
    @file = grep { ! $duplicate{$_}++ } @file;
    return(\@hash, \@array, \@string, \@dir, \@file, \@num_opt,
            \@char_opt, \$num_opt, \$char_opt, \@raw_string, \%vars, \@range );
}


#______________________________________________________________
# Title     : open_clu_files
# Usage     : %clus=%{&open_clu_files(\$input)};
# Function  :
# Example   : Clu file eg)
#
#  Cluster 7360103
#    1  1 SLL1058         7-255       2   Origin: 3   736   Sub:3
#    1  1 MJ0422          17-283      2   Origin: 3   736   Sub:3
#    1  1 HI1308          3-245       2   Origin: 3   736   Sub:3
#
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
#              This automatically converts lower to upper letters
# Keywords  : open_cluster_files,
# Options   : _  for debugging.
#             #  for debugging.
#             b  for to get just names ($simple_clu_reading)
#             r  for adding ranges in the names
#             U  for makeing sequence names upppercase
#
# Returns   : a ref of hash of $clus{"$clus_size\-$id"}.=$m."\n";
#             Actual content:
#             3-133 => 'HI00111 HI00222 MG1233 '
# Argument  :
# Category  :
# Version   : 1.9
#--------------------------------------------------------------
sub open_clu_files{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 my($simple_clu_reading, $possible_range, $add_ranges,
	  $id, $name_range, %clus, $found, $upper_case_seq_name);
	 my $file=$file[0];
	 if($char_opt=~/b/){ $simple_clu_reading= 'b' };
	 if($char_opt=~/U/){ $upper_case_seq_name='U'; };

	 my $clus_size=1;
	 open(CLU, "$file");
	 while(<CLU>){
	  if($simple_clu_reading=~/\S/){ ## to get just names
		  if(/^ *\d+ +\d+ +\d+ +\d+ +\d+/ or /^ *#/){  ## To skip the very first summary columns
			 next;
		  }elsif(/^ *\d+ +\d+ +(\S+) +(\S+)/){
			 $seq_name=$1;
			 $possible_range=$2;
			 if($2=~/\d+\-\d+/ and $char_opt=~/r/){
				$name_range="$seq_name\_$possible_range";
				$clus{$name_range} = $name_range;
			 }else{
			    $clus{$seq_name}=$seq_name;
			 }
		  }
	  }else{
		  if(/^ *\d+ +\d+ +\d+ +\d+ +\d+/ or /^ *#/ ){  ## To skip the very first summary columns
			 next;
		  }elsif(/^ *Cluster +size +(\d+)/i ){
			 $clus_size=$1;
			 $found=1;
		  }elsif(/^ *Cluster +([_\d]+) *size:? *(\d+)/i){  # to match 'Cluster 14313'  or  'Cluster 234_1234_1'
			 $id  =$1;
			 $found=1;
			 $clus_size=$2; # if defined($2);
		  }elsif(/^ *Cluster +[number]* *([\w]+)/i){  # to match 'Cluster 14313'  or  'Cluster 234_1234_1'
			 $id  = $1;
			 $found=1;
		  }elsif(($found==1)&&(/^ *\S* *\S* *(\S+)\.prot\,? *.*/)){ ## this is to correct MP genome names
			 $m=$1;
						 if($upper_case_seq_name){
								 $clus{"$clus_size\-$id"}.="\U$m ";
						 }else{
								 $clus{"$clus_size\-$id"}.="\U$m ";
						 }
		  }elsif(($found==1)&&(/^ *(\d+) *\d* *(\S{2,32}) *(\S*)/)){          # general clu match
			 $clus_size=$1 unless ($clus_size);
			 $m=$2;
			 $possible_range=$3;
			 if($2=~/\d+\-\d+/ and $char_opt=~/r/){
				$name_range="$m\_$possible_range";
								if($upper_case_seq_name){
										$clus{"$clus_size\-$id"}.="\U$name_range ";
								}else{  $clus{"$clus_size\-$id"}.="$name_range "; }
			 }else{
								if($upper_case_seq_name){
										$clus{"$clus_size\-$id"}.="\U$m ";
								}else{  $clus{"$clus_size\-$id"}.="$m ";  }
			 }
		  }
	  }
	 }
	 close(CLU);
	 return(\%clus);
}



#________________________________________________________________________
# Title     : remove_dup_in_array
# Usage     : @out2 = @{&remove_dup_in_array(\@input1, \@input2,,,,)};
#             @out1 = &remove_dup_in_array(\@input1 );
# Function  : removes duplicate entries in an array. You can sort the
#             result if you wish by 's' opt. Otherwise, result will keep
#             the original order
# Example   : (1,1,1,1,3,3,3,3,4,4,4,3,3,4,4);  --> (1,3,4);
# Warning   :
# Keywords  : merge array elements, remove_repeting_elements,
#             remove_same_array_elements, remove_redundancy, remove_redundant_elements
#             remove_duplication_in_array
# Options   :
#   s  for sorting the array output
# Returns   : one or more references.
# Argument  : one or more refs for arrays or one array.
# Author    : jong@salt2.med.harvard.edu
# Version   : 1.7
#--------------------------------------------------------------------
sub remove_dup_in_array{
		my($i, @in, $sort_opt, @out_ref, @nondup,%duplicate, @orig, @out_ref);
		@in=@_;
		for($i=0; $i<@in; $i++){
			 if($in[$i] eq 's'){
					$sort_opt=1;  splice(@in, $i, 1); $i--;
			 }elsif( ref($in[$i]) eq 'SCALAR'  and  ${$in[$i]} eq 's' ){
					$sort_opt=1;  splice(@in, $i, 1); $i--;
			 }
		}
		for($i=0; $i<@in; $i++){
				undef(%duplicate);
				if(ref($in[$i]) eq 'ARRAY'){    @orig = @{$in[$i]};    }
				else{ @orig=@in }
				@nondup = grep { ! $duplicate{$_}++ } @orig;    ## NOTE -> $_
				if($sort_opt==1){ @nondup= sort @nondup }
				push(@out_ref, \@nondup);
		}
		if(@out_ref ==1){ return($out_ref[0]);}
		elsif(@out_ref >1){  return(@out_ref);}
}


#________________________________________________________________________________
# Title     : read_machine_readable_sso_lines
# Usage     : @out_refs=@{&read_machine_readable_sso_lines(\@SSO, $get_alignment,
#                           $create_sso, $upper_expect_limit,$new_format, $lower_expect_limit,
#                           $attach_range_in_names, $attach_range_in_names2)};
# Function  :
# Example   :
# Keywords  : read_m10_sso_lines read_msso_lines
# Options   : a c r r2 n
#   $get_alignment=a by a  <-- for merge_sequence_alignments.pl etc
#             u= for upper E value limit
#             l= for lower E value limit
# Category  :
# Version   : 1.9
#--------------------------------------------------------------------------------
sub read_machine_readable_sso_lines{
     my ($upper_expect_limit, $lower_expect_limit)=(50,0);
     my (%match, $name_range, @out_refs, $query_found, $query_sq_stop, $query_sq_start, $match_found, $get_alignment,
            $match_seq, $match_found2, $i, $j,$match_found3, $overlap, $sw_score, $match_sq_start, $match_sq_stop, $match_seq2, $sw_ident, $name_range, $query_seq,
            $al_display_start, $original_target_seq, $match_seq_count, @SSO, $attach_range_in_names, $attach_range_in_names2, $new_format, %temp,
            $original_Q_sq_start, $original_Q_sq_stop, $gap_char_num,  $original_M_sq_start, $original_M_sq_stop, $match_align_leng, $query_align_leng);

     for($i=0; $i< @_; $i++){
             if($_[$i]=~/u=(\S+)/){    $upper_expect_limit=$1 }
             elsif(ref($_[$i]) eq 'ARRAY'){ @SSO=@{$_[$i]};   }
             elsif($_[$i]=~/l=(\S+)/){ $lower_expect_limit=$1 }
             elsif($_[$i]=~/^c *$/){     $create_sso = 'c'; print "\n# read_machine_readable_sso_lines: \$create_sso is set"; }
             elsif($_[$i]=~/^a *$/){     $get_alignment='a'; }
             elsif($_[$i]=~/^ *r *$/){   $attach_range_in_names='r' }
             elsif($_[$i]=~/^r2 *$/){    $attach_range_in_names2='r2' }
             elsif($_[$i]=~/^n *$/){     $new_format='n' }
     }
     $attach_range_in_names='r';
     #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     # Matching each machine readable line
     #_______________________________________________
     for($j=0; $j< @SSO; $j++){
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        #  >S33700_annexin
        #_________________________________________________________
        if($SSO[$j]=~/^ {1,3}\>(\S+)/){
             $query_seq=$query_seq_original=$1; $j+=4;
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        #  >>>S33700_annexin.fa, 339 aa vs ../test_set.mpfa library
        #_________________________________________________________________________________
        }elsif($SSO[$j]=~/\>\>\> *(\S+)\,? +(\d+) +/){  ## >>>  line
             $query_found=1;  $query_seq_leng=$2;  ## Ignoring the $1, as file name can be different from rea seq names
             $j+=5;
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
        #  >>S33700_annexin I; annexin repeat homology. [ Rattus norvegicus (Norway rat) ]
        #______________________________________________________________________________________
        }elsif( $query_found==1 and $SSO[$j]=~/^ {0,2}\>\>(\w[\S\-\.]+)([\.prot\,\:]*) */ ){ ##
             $match_found=1;
             $match_seq_count++;

             $al_display_start=0; ##<--- This is necessary in reading the sequence lines to return alignment with 'a' option

             if(length($2)>0){  print "\n# read_machine_readable_sso_lines: Seq name has this special char \"$2\". I ignore it"; }
             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
             #  Changing the CASE according to the option
             #_____________________________________________
             if($uppercase_seq_name eq 'U'){
                  $match_seq="$1"; $match_seq="\U$match_seq";  ## make it uppercase
             }elsif($lowercase_seq_name eq 'L'){
                  $match_seq="$1"; $match_seq="\L$match_seq"; ## make it lowercase
             }else{ $match_seq="$1";  } ## make it uppercase

             $query_seq=$query_seq_original; ## >>> BUG FIX <<< this resets ASSS_30-123 to ASSS, so that NO incremental ranges in query

        }elsif($match_found and $SSO[$j]=~/^ *\; +sw_score *\:[\t ]+(\S+)/i){  $sw_score =$1;
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        #  ; sw_expect:  line NOT exists in pair comparison
        #___________________________________________________________
        }elsif($match_found and $SSO[$j]=~/^\; +\w+_expect\:? +(\S+)/){
            #~~~~~~~~~~~~~~~~~~~~~~~
            # Filtering by E val
            #_______________________
            if( $1 > $upper_expect_limit or $1 < $lower_expect_limit ){
                    $match_found=0; next;
            }else{ $expect =$1; }
        }elsif($match_found and $SSO[$j]=~/^\; +sw_ident\: +(\S+)/){  $sw_ident =$1;
        }elsif($match_found and $SSO[$j]=~/^ *\; +sw_overlap\: +(\S+)/){  $overlap=$1;

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        #  Matching ">d2admb__17-3 368 aa .." which is QUERY
        #_________________________________________________________
        }elsif($match_found and $SSO[$j]=~/^ {0,2}\>(\w[\S\-\.]+)([\.prot\,\:]*)/){
            $match_found2=1;        $match_found=0;
            if( length($2)>0 ){  print "\n# read_machine_readable_sso_lines: Seq name has this special char \"$2\". I ignore it"; }
        }elsif($match_found2==1 and $SSO[$j]=~/\; +sq_len\: +(\S+)/){
            $query_sq_len=$1;
        }elsif($match_found2==1 and $SSO[$j]=~/\; +al_start\: +(\S+)/){
            $query_sq_start=$abs_query_seq_start=$1;
        }elsif($match_found2==1 and $SSO[$j]=~/\; +al_stop\: +(\S+)/){
            $query_sq_stop=$1;
            $query_align_leng=$query_sq_stop-$query_sq_start+1;
        }elsif($match_found2 and $SSO[$j]=~/\;[\t ]+al_display_start: +(\d+)/){
            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Following is important to keep track of the abs seq start point ($abs_query_seq_start=al_start-al_display_start)
            #______________________________________________________________________________________________________________________________
            $abs_query_seq_start-=$1;

            $al_display_start++;
            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
            # When REGIONS are noted, record the original range info and adjust according to the alignment
            #___________________________________________________________________________________________________
            if($query_seq=~/^(\S+)_(\d+)\-(\d+)$/){
                 $query_seq=$1;
                 $original_Q_sq_start=$2;
                 $original_Q_sq_stop =$3;
                 $query_sq_start +=$2-1 ;
                 $query_sq_stop  +=$2-1;
                 $query_seq="$query_seq\_$query_sq_start\-$query_sq_stop";
            }elsif($query_seq){
                $query_seq="$query_seq\_$query_sq_start\-$query_sq_stop" if $attach_range_in_names;
            }else{
               print "\n \$query_seq is empty for \n\n"; die;
            }
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        #  Matching ">RPOP_NEUCR_50-433" which is MATCH
        #_________________________________________________________
        }elsif($match_found2 and $SSO[$j]=~/^ {0,2}\>(\w[\S\-\.]+)([\.prot\,\:]*) *[\d+]*/){
            $match_found3=1;
            $match_found2=0;
            $match_seq   =$1;
            if(length($2)>0){  print "\n# open_sso_files: Seq name has this special char \"$2\". I ignore it"; }
        }elsif($match_found3 and $SSO[$j]=~/\; +sq_len\: +(\S+)/){
                 $match_sq_len=$1;
        }elsif($match_found3 and $SSO[$j]=~/\; +al_start\: +(\d+)/){
                 $match_sq_start=$abs_match_seq_start=$1;
        }elsif($match_found3 and $SSO[$j]=~/\; +al_stop\: +(\d+)/){
                 $match_sq_stop=$1;
                 $match_align_leng=$match_sq_stop-$match_sq_start+1;
        }elsif($match_found3 and $SSO[$j]=~/\; +al_display_start: +(\d+)/){
                $match_found3=0;

                #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Following is important to keep track of the abs seq start point
                #________________________________________________________________________
                $abs_match_seq_start-=$1;

                $al_display_start++;
                #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # To prevent Null SWscore and Eval
                #___________________________________________________
                if(!$expect){   $expect='0.0';
                        if((!$sw_score and $expect==0) or $sw_ident > 0.8){ $sw_score=1000; }
                }
                #~~~~~~~~~~~~~~~~~~~~~~~~~
                # Attaching the ranges  (under NO e option)
                #_________________________
                if($attach_range_in_names){
                      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                      # Checks margin opt and adds it
                      #__________________________________
                      if($margin=~/\d+/){
                           if($match_sq_start < $margin){  $match_sq_start=1;
                           }else{          $match_sq_start-=$margin;   }
                           $match_sq_stop += $margin;
                      }
                      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                      # adding the offset for names with ranges
                      #__________________________________________________
                      if($match_seq=~/^(\S+)_(\d+)\-(\d+)$/){
                           $match_seq=$1;
                           $original_M_sq_start=$2;
                           $original_M_sq_stop =$3;
                           $match_sq_start +=$2-1;
                           $match_sq_stop  +=$2-1;
                           $match_seq="$match_seq\_$match_sq_start\-$match_sq_stop";
                      }
                      $name_range=$match_seq;

                      if($original_target_seq=~/\S+/){ $query_seq=$original_target_seq } # for PVM version out
                      if($new_format=~/n/){  # under NO e option
                          $match{$name_range}=sprintf("%s %s %s %s %s %s %s %s %s\n",
                          $query_seq, $query_sq_start, $query_sq_stop, $sw_score, $expect, $sw_ident,
                          $match_sq_start, $match_sq_stop, $name_range);
                      }else{
                          $match{$name_range}=sprintf("%-5s %-9s %-6s %-4s %-5s %-30s %-4s %-5s %s\n",
                          $sw_score, $expect, $sw_ident, $query_sq_start, $query_sq_stop, $query_seq,
                          $match_sq_start, $match_sq_stop, $name_range);
                      }
              }else{
                   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                   # Checks margin opt and adds it
                   #__________________________________
                   if($margin=~/\d+/){
                           if($match_sq_start < $margin){  $match_sq_start=1;
                           }else{                          $match_sq_start-=$margin; }
                           $match_sq_stop += $margin;
                   }
                   if($original_target_seq=~/\S+/){ $query_seq=$original_target_seq } # for PVM version out

                   if($new_format=~/n/){
                           $match{$match_seq}=
                                sprintf("%s %s %s %s %s %s %s %s %s\n",
                                $query_seq, $query_sq_start, $query_sq_stop, $sw_score, $expect, $sw_ident,
                                $match_sq_start, $match_sq_stop, $match_seq);
                   }else{
                           $match{$match_seq}=sprintf("%-5s %-9s %-6s %-4s %-5s %-30s %-4s %-5s %s\n",
                               $sw_score, $expect, $sw_ident, $query_sq_start, $query_sq_stop, $query_seq,
                               $match_sq_start, $match_sq_stop, $match_seq);
                   }
              }
        }
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # When 'a' option set, for getting sequences to show alignment
        #________________________________________________________________
        if($get_alignment and $al_display_start==1 and $SSO[$j]=~/^([\w\-]+) *$/){
                 ${"match_alignment\_$match_seq_count"}{"$query_seq"} .= $1;
        }elsif($get_alignment and $al_display_start==2 and $SSO[$j]=~/^([\w\-]+) *$/){
                 ${"match_alignment\_$match_seq_count"}{"$match_seq"} .= $1;
        }elsif($get_alignment and ($SSO[$j]=~/^ *\;al_cons\:/ or $SSO[$j]=~/\<\<\</)){
                 $al_display_start=0;
                 #print ${"match_alignment\_$match_seq_count"}{"$query_seq"} , "\n";
                 #print ${"match_alignment\_$match_seq_count"}{"$match_seq"} , "\n";
                 my %temp=%{"match_alignment\_$match_seq_count"};
                 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                 # Getting seq segments Matched !!
                 #_______________________________________________________________
                 $Q_fragment_start=$query_sq_start-$original_Q_sq_start;
                 $M_fragment_start=$match_sq_start-$original_M_sq_start;
                 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                 # (1) When Q alignment starts with gap> '-----XXXXX' + 'ADFAYXXXXX', remove gaps
                 #__________________________________________________________________________________
                 if($temp{$query_seq}=~/^(\-+)\w/){
                         $leading_gap_leng=length($1);
                         #print "\n\$leading_gap_leng : $leading_gap_leng $abs_query_seq_start, $abs_match_seq_start, $query_align_leng, $match_align_leng\n";

                         #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                         # Number of gap char is important as the al_start and al_stop of FASTA does not include gap leng
                         #_______________________________________________________________________________________________________________
                         $temp{$query_seq}=substr($temp{$query_seq}, $leading_gap_leng+$abs_query_seq_start, ); ## <-- step 1
                         $gap_char_num=$temp{$query_seq}=~tr/\-/\-/;                       ## <-- step 2
                         $temp{$query_seq}=substr($temp{$query_seq}, 0, $query_align_leng+$gap_char_num); ## <-- step 3
                         $temp{$match_seq}=substr($temp{$match_seq}, $leading_gap_leng+$abs_query_seq_start, $query_align_leng+$gap_char_num);
                         #print "\n$temp{$query_seq}\n$temp{$match_seq} $query_align_leng + $gap_char_num \n"; #die;
                 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                 # (2) When M alignment starts with gap> '-----XXXXX' + 'ADFAYXXXXX', remove gaps
                 #____________________________________________________________________________________
                 }elsif($temp{$match_seq}=~/^(\-+)\w/){
                      $leading_gap_leng=length($1);
                      #print "\n\$leading_gap_leng : $leading_gap_leng $abs_query_seq_start, $abs_match_seq_start, $query_align_leng, $match_align_leng\n";

                      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                      # Number of gap char is important as the al_start and al_stop of FASTA does not include gap leng
                      #_______________________________________________________________________________________________________________
                      $temp{$query_seq}=substr($temp{$query_seq}, $leading_gap_leng+$abs_match_seq_start, ); ## <-- step 1
                      $gap_char_num=$temp{$query_seq}=~tr/\-/\-/;                       ## <-- step 2
                      $temp{$query_seq}=substr($temp{$query_seq}, 0, $query_align_leng+$gap_char_num); ## <-- step 3
                      $temp{$match_seq}=substr($temp{$match_seq}, $leading_gap_leng+$abs_match_seq_start, $query_align_leng+$gap_char_num);
                      #print "\n$temp{$query_seq}\n$temp{$match_seq} $query_align_leng + $gap_char_num \n"; #die;
                 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                 # (3) When Alignment is like> 'YYY----XXzz' + 'YYYDFAYXXzz'
                 #____________________________________________________________________________
                 }else{
                      #print "\n $abs_query_seq_start, $abs_match_seq_start, \$query_align_leng $query_align_leng, $match_align_leng\n";
                      $temp{$query_seq}=substr($temp{$query_seq}, $abs_query_seq_start, );
                      $gap_char_num=$temp{$query_seq}=~tr/\-/\-/;
                      $temp{$query_seq}=substr($temp{$query_seq}, 0, $query_align_leng+$gap_char_num);
                      $temp{$match_seq}=substr($temp{$match_seq}, $abs_match_seq_start, $query_align_leng + $gap_char_num);
                      #print "\n$temp{$query_seq}\n$temp{$match_seq} $query_align_leng + $gap_char_num $match_align_leng $gap_char_num\n"; die;
                 }
                 push(@out_refs, \%temp );
                 %{"match_alignment\_$match_seq_count"}=();
            }
     } ## <-- for($j=0; $j< @SSO; $j++)
     #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     # If create sso option is set, it creates SSO files(array input case)
     #________________________________________________________________________
     if( $create_sso and !$get_alignment){
         open (SSO2, ">$query_seq\.msso");
         print SSO2 @SSO, "\n";
         print "\n# read_machine_readable_sso_lines : $query_seq\.msso file  is created by \"c\" opt ";
         close SSO2
     }
     unless($get_alignment){
         push(@out_refs, \%match);
     }
     return(\@out_refs);
}






#________________________________________________________________________________________
# Title     : merge_sequence_in_mspa_file
# Usage     :
# Function  :
# Example   : INPUT: (MSP file) ===>
#  59     2.6        47    64     d2pia_3        10    30     d1erd___10-30
#  161    1.1e-07    24    91     d2pia_3        16    85     d1frd___16-85
#
#  722    0          1     106    d1put__        1     106    d1put___1-106
#  66     4.9        2     68     d1put__        43    106    d2lbp___43-106
#  69     1.3        12    49     d1put__        81    120    d1cgo___81-120
#
#  60     3.3        13    38     d1frd__        32    57     d1orda1_32-57
#  65     1.7        21    58     d1frd__        40    69     d2mtac__40-69
#
#   ==== OUTPUT ===>
#    d1frd___1-98 d1frd___1-98_1-98 d1frd___16-85 d2pia_3_24-91_24-91
#    d1frd___16-85_16-85 d2pia_3_24-91
#    d1put___1-106 d1put___1-106_1-106
#    d2pia_3_1-98 d2pia_3_1-98_1-98
#
# Keywords  : mergr_seq_in_msp_file, merge_sequence_in_msp, merge_sequences_in_msp_file
#             merge_sequence_in_MSPA_file
# Options   :
#  $dynamic_factor =  y by y -y   # adjusting factor value dynamically(more seq higher factor)
#  $short_region   =  S by S -S  # taking shorter region overlapped in removing similar regions
#  $large_region   =  L by L -L  # taking larger  region overlapped in removing similar regions
#  $average_region =  A by A -A # taking average region overlapped in removing similar regions
#
# Thanks    : Alexey Eroshkin <alexey@axyspharm.com>
# Version   : 3.4
#----------------------------------------------------------------------------------------
sub merge_sequence_in_mspa_file{
		#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
		my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
		my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
		my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
		my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
		my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
		if($debug==1){print "\n\t\@hash=\"@hash\"
		\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
		\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
		#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		my ($msp_value, @all_seqlets, %temp_hash, @msp_chunks, $clu_out, $size_of_all_seqlets,
		    $ragne, $base, $optimize, $mrg_out, @arr, $sat_out, %final_hash_out, @final_pre_hash,
				$length_thresh, $merge, $factor, $Evalue_thresh, $score, $dynamic_factor, $score_match,
				$eval_match, $query_seq, $query_start, $query_stop, $match_seq, $match_start,
				$short_region, $large_region, $average_region, $original_clu_size, $match_stop);
		$factor=$default_factor=7; #~~~~ default connection factor U, 7 means 70% now!
		$length_thresh=30;
		$Evalue_thresh=10;
		$score =75;
		$range='r';
		if(@file < 1){ print "\n# (E) merge_sequence_in_mspa_file needs at least 1 MSP file\n"; die }

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# Following changes the defaults with given parameters
		#_____________________________________________________________
		if($char_opt=~/z/i){       $optimize='z';    ## This will cause using remove_similar_seqlets than remove_dup_in_array !
		}if($char_opt=~/m/){       $merge='m';
		}if($char_opt=~/y/){       $dynamic_factor='y';
        }if($char_opt=~/r/){       $verbose='r';
        }if($char_opt=~/v/){       $verbose='v';
		}if($char_opt=~/S/){       $short_region='S';
		}if($char_opt=~/L/){       $large_region='L';
		}if($char_opt=~/A/){       $average_region='A';
		}if($vars{'T'}=~/\d+/){    $length_thresh=$vars{'T'};
		}if($vars{'f'}=~/\S+/){    $factor=$vars{'f'};  ## Here I give a generous $factor !
		}if($vars{'s'}=~/\d+/){    $score = $vars{'s'};
        }if($vars{'e'}=~/\S+/){    $Evalue_thresh= $vars{'e'};
		}if($vars{'E'}=~/\S+/){    $Evalue_thresh= $vars{'E'}; }

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		#  Just to inform what parameters have been chosen
		#_____________________________________________________________
        print "\n# (1) merge_sequence_in_mspa_file : default \$score      : $score";
        print "\n#                                : default \$Evalue_thresh     : $Evalue_thresh";
        print "\n#                                : used \$length_thresh : $length_thresh";
        print "\n#                                : default \$factor     : $default_factor";
        print "\n#                                : used    \$factor     : $factor";
        print "\n#                                : \$dynamic_factor     : $dynamic_factor\n";

		for($c=0; $c< @file; $c++){
			 open(MSP, "$file[$c]");
			 $base=${&get_base_names($file[$c])};
			 $clu_out="$base\_F${factor}.clu"; # <-- This is the most important output. Sarah's program will process this
			 $sat_out="$base\_F${factor}.sat";
			 #print "# (2) merge_sequence_in_mspa_file : processing $file[$c] for $clu_out\n" if $verbose;
			 my @msp1=<MSP>;

			 for($i=0; $i< @msp1; $i++){
					#~~~~~~~~~~ Include range or NOT in the seq name ~~~~~~~~~~~~~~~~~~~~~~~~~~`
					# %temp_hash is just to get the chunk of MSP block. As msp file uses empty line as a delimiter
					#____________________________________________________________________________
					if($char_opt=~/r/){
						 if($msp1[$i]=~/^\s*(\S+)\s+(\S+)\s*\S*\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)/){
                              $score_match=$1;
                              $eval_match=$2;
                              $query_seq=$5;
                              $query_start=$3;
                              $query_stop=$4;
                              $match_seq=$8;
                              $match_start=$6;
                              $match_stop=$7;
                              if($query_seq=~/\S+_\d+\-\d+$/){ $new_seq1=$query_seq }else{ $new_seq1="$query_seq\_$query_start\-$query_stop"; }
                              if($match_seq=~/\S+_\d+\-\d+$/){ $new_seq2=$match_seq }else{ $new_seq2="$match_seq\_$match_start\-$match_stop"; }

                              if($new_seq1 eq $new_seq2){
                                   #print "# (3.0) merge_sequence_in_mspa_file: Skipped = $msp1[$i]"  if $verbose;
                                   next
                              };

                              if($score_match < $score or $eval_match > $Evalue_thresh){
                                      #print "# (3.1) merge_sequence_in_mspa_file: Skipped = $msp1[$i]"  if $verbose;
                                      next
                              };
                              #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                              # Modifying $msp1[$i] line !!!
                              #______________________________
                              $msp1[$i]=sprintf("%s %-3s %s %s %s %s %s %s",
                                                          $score_match, $eval_match, $query_start, $query_stop, $new_seq1, $match_start,
                                                          $match_stop, $new_seq2);
                              $temp_hash{$query_seq}.="$msp1[$i]\n";
						 }
					}else{
						 if($msp1[$i]=~/^\s*(\S+)\s+(\S+)\s*\S*\s+\d+\s+\d+\s+(\S+)[_\d+\-\d+]?\s+\d+\s+\d+\s+\S+/){
                              if($1 < $score or $2 > $Evalue_thresh){
                              #print "# (3.0) merge_sequence_in_mspa_file: Skipped = $msp1[$i]"  if $verbose;
                              next };
                              $temp_hash{$3}.="$msp1[$i]\n";
						 }
					}#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			}
			close(MSP);
		}
		$original_clu_size=@msp_chunks= values(%temp_hash); ## Using temp hash is more than 2 times faster than push

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# Determining the dynamic factor here (when 'd' opt is set)
		#____________________________________________________________
		if($dynamic_factor){
				#--> 100 => 10, 1000 => 15, 10000 => 20
				print "\n# ### \$factor: $factor\n";
				$factor += (log($original_clu_size)*5)/10 - 1; ## This is a simplistic.
				if($factor > 9.5){ $factor=9.5 } # this is the very upper limit for any factor.
				print "\n# ### \$factor: $factor\n";
		}

		if($verbose){
           print "# (4) merge_sequence_in_mspa_file: The msp chunks used are:\n";
		   for($i=0; $i< @msp_chunks; $i++){
				print "$msp_chunks[$i]\n";
		   }
		}

		for($i=0; $i< @msp_chunks; $i++){
            @arr=@{&merge_sequence_in_mspa_chunk($msp_chunks[$i], $verbose, $optimize,
								"$merge", "E=$Evalue_thresh", "s=$score", "f=$factor", "T=$length_thresh",
								$short_region, $large_region, $average_region)};
			push(@all_seqlets,  @arr);
		}

		#~~~~~~~~~ sorting inner sequences in strings ~~~~~~~~~
		#______________________________________________________
		@all_seqlets=@{&sort_words_in_string(@all_seqlets)}; ## This speeds up about 2 times !!!

		#~~~~~~~ Sort by the _digit-  in seqlet names ~~~~~~~~~
		@all_seqlets= map{$_->[0]} sort{$a->[1] cmp $b->[1] or $a->[2] <=> $b->[2]  }
									map {/^ *((\S+)_(\d+)\-(\d+).*)/ && [$1, $2, $3, $4]} @all_seqlets;
		if(  $verbose){
			print "\n# (6) merge_sequence_in_mspa_chunk: Showing the very final result before merging\n";
			for($i=0; $i< @all_seqlets; $i++){
				print "\n$all_seqlets[$i]";
			}
		}

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# merge sequences in a simple way until there is no change in the array
		#  This is an incomplete merge(merges first seqlets of string ...
		#______________________________________________________________________
		for($i=0; $i< @msp_chunks; $i ++){
             ITERATION_RETURN_POINT:
             $size_of_all_seqlets=@all_seqlets;
             @all_seqlets = @{&merge_similar_seqlets(\@all_seqlets, $optimize,
                                                                $short_region, $large_region, $average_region, "f=$factor")};
             if($size_of_all_seqlets > @all_seqlets){
                 @all_seqlets = @{&merge_similar_seqlets(\@all_seqlets, $optimize,
                                                $short_region, $large_region, $average_region, "f=$factor")};
                 goto ITERATION_RETURN_POINT;
             }else{
                 last;
             }
		}

		if($optimize){
             @all_seqlets=@{&remove_similar_seqlets(\@all_seqlets,
                                             $short_region, $large_region, $average_region)};
             @all_seqlets=@{&remove_dup_in_array(\@all_seqlets)};

		}else{
             @all_seqlets=@{&remove_dup_in_array(\@all_seqlets)};
		}
		return(\@all_seqlets);
}


#______________________________________________________________________________
# Title     : run_fasta_sequence_search
# Usage     : $gzipped_msp_file=${&run_fasta_sequence_search("a=$algorithm",
#                        "O=$out_file_msp_name", "File=$temp_file_name", "u=$upper_expect_limit",
#                        "DB=$sequence_DB", "k=$k_tuple", "$machine_readable")};
#
# Function  : runs FASTA or SSEARCH algorithm (given by a= option)
# Example   :
# Keywords  : run_ssearch_sequence_search, do_fasta_sequence_search,
#             do_fasta_search, run_fasta_ssearch_sequence_search
# Options   :
#             Query_seqs=  for enquiry sequences eg)  "Query_seqs=$ref_of_hash"
#             DB=   for target DB  "DB=$DB_used"
#             File= to get file base(root) name.  "File=$file[0]"
#             i= to get file base(root) name. same as File=
#             m  for MSP format directly from FASTA or Ssearch result than through sso_to_msp to save mem
#             s  for the big single output (msp file output I mean)
#             s= for the single big msp file name
#             O= for Out file name, same as s=
#             o  for overwrite existing xxxx.fa files for search
#             c  for create SSO file (sequence search out file)
#             d  for very simple run and saving the result in xxxx.gz format in sub dir starting with one char
#             r  for reverse the query sequence
#             R  for attaching ranges of sequences
#             k= for k-tuple value. default is 1 (ori. FASTA prog. default is 2)
#             u= for $upper_expect_limit
#             E= for $Evalue_thresh
#             l= for $lower_expect_limit
#             a= for choosing either fasta or ssearch algorithm
#             d= for defining the size of subdir made. 2 means it creates
#                    eg, DE while 1 makes D
#             d  for $make_gz_in_sub_dir_opt, putting resultant sso files in gz format and in single char subdir
#             D  for $make_msp_in_sub_dir_opt, convert sso to msp and put in sub dir like /D/, /S/
#             n  for new format to create new msp file format with sso_to_msp routine
#          PVM=  for PVM run of FASTA (FASTA only)
#             M  for machine readable format -m 10 option
#             M= for machine readable format -m 10 option
#             N  for 'NO' do not do any processing but, do the searches only.
#       FILE_AGE for defining the age of file in days to be overwritten.
# Author    : Jong Park, jong@salt2.med.harvard.edu, for commercial use, ask me.
# Category  :
# Version   : 1.7
#------------------------------------------------------------------------------
sub run_fasta_sequence_search{
    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my($algorithm, $upper_expect_limit, $sequence_DB, $k_tuple, $machine_readable,$create_sso,
         @msp_from_temp, $gzipped_msp_file, @temp, $temp_file_name, $add_range,
         $out_file_msp_name, $input_file_name, $out_file_sso_name,
         $gzipped_sso_file, $machine_readable, $Evalue_thresh);
    $algorithm='fasta';    $out_file_msp_name='default_out_file.mspa';
    $upper_expect_limit=1;              $sequence_DB=$ENV{'NRDB_FASTA'};
    $k_tuple=1;            $machine_readable='M';
    $upper_expect_limit=1; $lower_expect_limit=0;
    $Score_thresh=100;

    if($vars{'O'}=~/\S+/){ $out_file_msp_name  = $vars{'O'}            };
    if($vars{'E'}=~/\S+/){ $Evalue_thresh      = $vars{'E'}            };
    if($vars{'u'}=~/\d+/){ $upper_expect_limit = $vars{'u'}            };
    if($vars{'l'}=~/\d+/){ $lower_expect_limit = $vars{'l'}            };
    if($vars{'k'}=~/\d+/){ $k_tuple            = $vars{'k'} unless($algorithm=~/ssearch/);  };             ## Bug fix <-- ssearch does not like $ktuple set
    if($vars{'a'}=~/\S+/){ $algorithm          = $vars{'a'}; if($algorithm=~/ssearch/i){ $k_tuple='';}  }; ## Bug fix <-- ssearch does not like ktuple
    if($vars{'t'}=~/\d+/){ $Score_thresh       = $vars{'t'}            };
    if($vars{'r'}=~/\S+/){ $add_range          = 'r'                   };
    if($vars{'s'}=~/\S+/){ $single_big_msp     = 's'                   };
    if($vars{'DB'}=~/\S+/){            $sequence_DB=$vars{'DB'} ;
            if(-s $sequence_DB){
            }elsif(-s "../$sequence_DB"){  $sequence_DB= "../$sequence_DB"
            }elsif(-s "../../$sequence_DB"){  $sequence_DB= "../../$sequence_DB";
            }else{
                    print "\n# (ERROR) do_sequence_search: You set DB param, but I can\'t find $sequence_DB\n";
                    die;
            }
    }else{  print "\n# (ERROR) do_sequence_search: I need DB param defined, sorry, aborting\n"; }

    if(    $vars{'FILE'}=~/(\S+)/i){ $input_file_name = $1;
    }elsif($vars{'File'}=~/(\S+)/i){ $input_file_name = $1; }
    if($vars{'FILE_AGE'}=~/\S+/){ $age_in_days_of_out_file= $vars{'FILE_AGE'};  };
    if($vars{'Query_seqs'}=~/\S+/){ %seq_input = %{$vars{'Query_seqs'}}};
    if($vars{'Query'}=~/\S+/){      %seq_input = %{$vars{'Query'}}};
    if($vars{'u'}    =~/\S+/){ $upper_expect_limit          = $vars{'u'}            };
    if($vars{'PVM'}  =~/\S+/){ $PVM_FASTA_run  = $vars{'PVM'}; print "\n# PVM opt is set\n";     };
    if($vars{'M'}  =~/\S+/){ $machine_readable = $vars{'M'};           };

    if($char_opt=~/r/){    $add_range          = 'r' }
    if($char_opt=~/o/){    $over_write         = 'o' }
    if($char_opt=~/c/){    $create_sso         = 'c' }
    if($char_opt=~/s/){    $single_big_msp     = 's'; print "\n# Single file opt is set\n"; }
    if($char_opt=~/M/){    $machine_readable   = 'M' }
    if($char_opt=~/N/){    $No_processing      = 'N'; $create_sso='c'; }
    #print "\n# (INFO) run_fasta_sequence_search: $algorithm,$out_file_msp_name,$input_file_name,$upper_expect_limit,DB=$sequence_DB,$k_tuple,$machine_readable\n";

    if(!$algorithm or !$input_file_name or !$sequence_DB){
         print "\n# (ERROR) One of \$sequence_DB, \$input_file_name, \$algorithm is missing\n";
         die;
    }

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # (1) Machine readable opt for fasta and ssearch set ??
    #____________________________________________________
    if($machine_readable=~/M/){
       #print "\n \$machine_readable for FASTA/SSEARCH is SET\n";
       if(-x "./Bin/$algorithm"){
              #print "\n# (INFO) Running $algorithm -m 10 -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple\n";
              @temp=`./Bin/$algorithm -m 10 -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple`;
         }else{
              #print "\n# (INFO) Running $algorithm -m 10 -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple\n";
              @temp=`$algorithm -m 10 -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple`;
         }
    }else{
       print "\n \$machine_readable for FASTA/SSEARCH is NOT set\n";
       if(-s "./Bin/$algorithm"){
                 #print "\n# (INFO) Running $algorithm -m 10 -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple\n";
                 @temp=`./Bin/$algorithm -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple`;
         }else{
                 #print "\n# (INFO) Running $algorithm -m 10 -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple\n";
                 @temp=`$algorithm -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple`;
         }
    }

    #print "\n# (i) \@temp has ", scalar(@temp), " lines @temp!\n";

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # (2) @temp is parsed directly by open_sso_files to make MSP hashes
    #_______________________________________________________________________
    @msp_hashes_from_temp = @{&open_sso_files(\@temp, $add_range, "u=$upper_expect_limit", "l=$lower_expect_limit")};

    if(@msp_hashes_from_temp < 1){
        print "\n# (ERROR) run_fasta_sequence_search : Error, something is wrong with open_sso_files, LINE=", __LINE__, "\n";
        die;
    }else{
        print "\n# (i) run_fasta_sequence_search: Good, \@msp_from_temp has ",scalar(@msp_hashes_from_temp), " hashes !\n";
    }

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Following is JUST to prevent empty @msp_hashes_from_temp due to low upper expec or any error
    #___________________________________________________________________________________________
    @msp_from_temp= values %{$msp_hashes_from_temp[0]};
    if($msp_from_temp[0]=~/\S{3}/){
        #print "\n# (i) run_fasta_sequence_search: Good, \@msp_from_temp has ",scalar(@msp_hashes_from_temp), " lines !\n";
    }else{
        #print "\n# (E) the content of \@temp has @temp\n";
        open(SSO_FILE_TO_DEBUG, ">$input_file_name\.$algorithm\.sso");
        for(@temp){ print SSO_FILE_TO_DEBUG $_ }; close(SSO_FILE_TO_DEBUG);
        $upper_expect_limit=$upper_expect_limit*10;
        @msp_hashes_from_temp = @{&open_sso_files(\@temp, $add_range, "u=$upper_expect_limit", "l=$lower_expect_limit")};
        @msp_from_temp= values %{$msp_hashes_from_temp[0]};
        if($msp_from_temp[0]=~/\S{3}/){ print "\n# (I) I set \$upper_expect_limit to $upper_expect_limit, to run save open_sso_files sub \n";
        }else{
             print "\n I tried to run open_sso_files again with hihger \$upper_expect_limit, but failed, dying \n";
             die;
        }
    }

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # (3) Writing MSP file, and gzipping if possible
    #________________________________________________
    open(MSP_RUN_FASTA, ">$out_file_msp_name") or die "\n# (ERROR) $out_file_msp_name could not be open\n";
    @msp_from_temp= map {$_->[0]} sort { $a->[1] <=> $b->[1]} map { [$_,  ($_=~/^\S+ +(\S+)/) ] } @msp_from_temp; # sorting by the E-value
    print MSP_RUN_FASTA "# Written by $0 : run_fasta_sequence_search sub, from \@msp_from_temp $algorithm used\n";
    for(@msp_from_temp){   print MSP_RUN_FASTA $_;    }
    close MSP_RUN_FASTA;
    $gzipped_msp_file=${&compress_files_by_gzip($out_file_msp_name)};

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # (4) making SSO file if asked : 'c' option
    #__________________________________________________
    if($create_sso){
         $seq_name=${&get_base_names($input_file_name)};
         if($algorithm=~/ssearch/){
                $out_file_sso_name="$seq_name\.ssso";
                $out_file_sso_name_gz="$seq_name\.ssso\.gz";
         }elsif($algorithm=~/fasta/){
                $out_file_sso_name="$seq_name\.fsso";
                $out_file_sso_name_gz="$seq_name\.fsso\.gz";
         }
         open(SSO, ">$out_file_sso_name");
         for(@temp){  print SSO $_;  }; close (SSO);
         if($machine_readable){
                 $out_file_msso_name="$seq_name\.msso";
                 &cp( $out_file_sso_name, $out_file_msso_name);
         }
         @gzipped_sso_files=@{&compress_files_by_gzip($out_file_sso_name, $out_file_msso_name)};

    }
    if(-s $gzipped_msp_file and !(-s $out_file_msp_name)){
         #print "\n# (INFO) $gzipped_msp_file is found and will be returned";
         return(\$gzipped_msp_file);
    }elsif(-s $out_file_msp_name){
         #print "\n# (INFO) $out_file_msp_name is found and will be returned";
         return(\$out_file_msp_name);
    }
}## end of run_fasta_sequence_search


#______________________________________________________________________________
# Title     : check_if_defined
# Usage     : $defined=&check_if_defined($var, $file);
# Function  : checks if all the args are defined
# Example   :
# Keywords  :
# Options   :
# Author    : jong@salt2.med.harvard.edu
# Category  :
# Version   : 1.0
#------------------------------------------------------------------------------
sub check_if_defined{
		my($i, $input_arg_count);
		for($i=0; $i< @_; $i++){
			 $input_arg_count++;
			 if(-s $_[$i]){
					#print "\n# (INFO) check_if_defined: $input_arg_count does not exist as a file in PWD\n";
			 }elsif(! $_[$i]){
					print "\n# (INFO) check_if_defined: $input_arg_count is not defined\n";
					return(0);
			 }
		}
		return(1);
}

#______________________________________________________________________________
# Title     : die_if_file_not_present
# Usage     : &die_if_file_not_present($var, $file);
# Function  : checks if all the args are present
# Example   :
# Keywords  : die_unless_present, die_unless_file_present
# Options   :
# Author    : jong@salt2.med.harvard.edu
# Category  :
# Version   : 1.0
#------------------------------------------------------------------------------
sub die_if_file_not_present{
		my($i, $input_arg_count);
		for($i=0; $i< @_; $i++){
			 $input_arg_count++;
			 unless(-s $_[$i]){
					 print "\n# (i)die_if_file_not_present: ${input_arg_count}th exist as a file in PWD\n";
			 }
		}
}





#________________________________________________________________________
# Title     : cp
# Usage     : &cp( \$srcFile, \$dstFile); or  &cp( $srcFile, $dstFile);
#             or &cp(FILEHANDLE1, FILEHANDLE2),  or  &cp(FILEHANDLE1, $output)
# Function  : copies files fast, replacement of 'system("cp xxx xxxx"); '
# Example   : cp("cp.pl", *STDOUT);  # This will print cp.pl contents to your screen.
# Keywords  : copy files fast. cp_file, cp_files, copy_files, copy_file
# Options   :
# Argument  : 2 references of file name or  2 file names.
# Warning   : 27 times slower than 'cp' at prompt.  using system is 32 times slower
# Category  :
# Author    : Larry Wall, Jong
# Version   : 1.4
#--------------------------------------------------------------------
sub cp{
		croak(" Usage: cp ($file1, $file2)" ) unless(@_ >= 2);

		my($in)=$_[0];
		my($out)=$_[1];

		if( (ref($in) eq 'GLOB') || (ref($in) eq 'FileHandle')){
										*IN_CP = *$in; }  ## for  \*STDOUT like input
		elsif( ref(\$in) eq 'GLOB'){
										*IN_CP = $in;  }
		else{ open(IN_CP, "<$in") or die "Can't open output $in: $!\n";  }

		if( (ref($out) eq 'GLOB') || (ref($_[1]) eq 'FileHandle')){
										*OUT_CP = *$out;   }
		elsif( ref(\$out) eq 'GLOB'){  print "\n2\n";
										*OUT_CP = $out;  ## for  *STDOUT like input
		}else{  open(OUT_CP, ">$out") or die "Can't open output $out: $!\n";  }

		my ($access,$mod) = (stat IN_CP)[8,9];
		syswrite(OUT_CP, $buf, $len) while $len = sysread(IN_CP, $buf, 8192);
		close(IN_CP);
		close(OUT_CP);
		utime $access, $mod, $dstFile;
}

#__________________________________________________________________________
# Title     : sort_files_by_size
# Usage     : @sorted=@{&sort_files_by_size(\@files)};
# Function  : sort files by size and returns the ref of the array
# Example   :
# Keywords  : sort_file_by_size, sort_by_file_size, sort_by_size
# Options   :
# Returns   :
# Argument  :
# Category  :
# Version   : 1.0
#----------------------------------------------------------------------------
sub sort_files_by_size{
    my @in_files;
    if(ref $_[0] eq 'ARRAY'){
    @in_files = @{$_[0]};
    }else{
    @in_files = @_;
    }

    my @sorted = sort { (-s $a) <=> (-s $b) } @in_files;
    return(\@sorted);
}



#___________________________________________________________________
# Title     : scramble_array
# Usage     : @in=@{&scramble_array(\@in)};
# Function  : shuffles the elements of array
# Example   :
# Keywords  : randomise_array, randomize_array, shuffle_array
# Options   :
# Category  :
# Version   : 1.4
#---------------------------------------------------------------
sub scramble_array{
	srand(time()|$$);  # or use srand(time^$$);
	my ($i, @scrambled, @out, @each_array);

	for($i =0; $i< @_; $i++){
	   my @each_array = @{$_[$i]};
	   while (@each_array) {
		   push @scrambled, splice @each_array, int(rand(@each_array)), 1;
	   }
	   push(@out, \@scrambled);
	}
	if(@out > 1){
	   return(@out);
	}else{
	   return($out[0]);
	}
}



#______________________________________________________________________________
# Title     : put_msp_lines_to_hash_from_bla
# Usage     : @out_from_put_msp_lines=@{&put_msp_lines_to_hash_from_bla(\%hash_out,
#                                        $query,$matched,$evalue, $score, $seq_id,
#                                        $sorted_name, $query_range_start,
#                                        $query_range_stop,$match_string_start,
#                                        $match_string_stop, $read_point_found,
#                                        $PSI_BLA_ACCUMU_hits_eval_thresh,
#                                        $take_last_iter_PSI_BLA)};
# Function  :
# Example   :
# Keywords  :
# Options   :
# Author    : jong@salt2.med.harvard.edu,
# Category  :
# Version   : 1.5
#------------------------------------------------------------------------------
sub put_msp_lines_to_hash_from_bla{
    my (@finale_out, $sorted_name, $msp_line, $evalue, $score, $matched,
        $seq_id, $query_range_start,$PSI_BLA_ACCUMU_hits_eval_thresh,
        $query_range_stop, $query, $match_string_start, $match_string_stop,
        $read_point_found, %hash_out, %accumulative_hits, $Evalue_thresh);
    $Evalue_thresh=10;
    %hash_out=%{$_[0]};         %accumulative_hits=%{$_[1]};
    $query=$_[2];               $matched=$_[3];
    $evalue=$_[4];              $score=$_[5];
    $seq_id=$_[6];              $sorted_name=$_[7];
    $query_range_start=$_[8];   $query_range_stop =$_[9];
    $match_string_start=$_[10]; $match_string_stop=$_[11];
    $read_point_found=$_[12];   $PSI_BLA_ACCUMU_hits_eval_thresh=$_[13];
    $take_last_iter_PSI_BLA=$_[14];
    $PSI_BLA_ACCUMU_hits_eval_thresh=$_[15];
    $Evalue_thresh=$_[16];
    $query  ="$query\_$query_range_start\-$query_range_stop";

    if($matched !~/^\S+\_\d+\-\d+ *$/){         $matched="$matched\_$match_string_start\-$match_string_stop";
    }elsif($matched =~/^(\S+)\_\d+\-\d+ *$/){   $matched="$1\_$match_string_start\-$match_string_stop"; }

    if($score=~/\S/ and $evalue=~/\S/ and $match_string_start=~/\S/ and $Evalue_thresh > $evalue){
        $msp_line=sprintf("%-6s %-8s %-5s %-5s %-5s %-32s %-5s %-s\t%-s\t%-s\n",
                           $score, $evalue, $seq_id, $query_range_start, $query_range_stop,
                           $query, $match_string_start, $match_string_stop, $matched, $read_point_found);
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # This is where I really put the matches !!!
        #_____________________________________________________
        if($hash_out{$sorted_name}=~/^\S+\s+(\S+)\s+/){
            if($1 >= $evalue){
                $hash_out{$sorted_name}=$msp_line;
            }
        }else{
            $hash_out{$sorted_name}=$msp_line;
        }
            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # This part is to rescue the hits dropped by matrix migration
            #_________________________________________________________________
            if(!$take_last_iter_PSI_BLA and $evalue <= $PSI_BLA_ACCUMU_hits_eval_thresh ){
                if($accumulative_hits{$sorted_name}){
                    if($accumulative_hits{$sorted_name}=~/^[\t ]*\S+[\t ]+(\S+)[\t ]/){
                         if($evalue < $1){
                                 $accumulative_hits{$sorted_name}=$msp_line;   }   }
                }else{ $accumulative_hits{$sorted_name}=$msp_line;     }
            }
    }else{
    }
    @finale_out=(\%hash_out, \%accumulative_hits, $read_point_found, $query,
                 $matched, $evalue, $score, $seq_id, $sorted_name,
                 $query_range_start, $query_range_stop, $match_string_start, $match_string_stop  );
    return(\@finale_out);
}




#______________________________________________________________________________
# Title     : do_psi_blast_search
# Usage     : &do_psi_blast_search(\@files, "d=$source_DB_file",
#                     "i=$input_seq_file",
#                     $over_write,
#                     $make_msp_in_sub_dir_opt);
# Function  :
# Example   :
# Keywords  :
# Options   :
#    $source_DB_file=           by d= s=
#    $input_seq_file=           by i=
#    $Eval_limit=               by E=
#    $PSI_BLA_iter_limit=          by j=
#    $PSI_BLA_step_eval=              by h= e=
#    $over_write=o              by o
#    $make_msp_in_sub_dir_opt=D by D
#    $Lean_output=L             by L -L
#    $put_NO_gaps=G             by G
#    $alignment_opt=            by m=
#    $make_MSP_files=M          by M
#    $take_last_iter_PSI_BLA    by l
# Author    :
# Version   : 2.7  Tue Apr 25 18:01:59 BST 2000
#------------------------------------------------------------------------------
sub do_psi_blast_search{
    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    $|=1;
    my(@final_msp_out, $PSI_BLA_iter_limit, $alignment_opt,  $source_DB_file,
         $input_seq_file, $over_write,
         $PSI_BLA_step_eval, $Eval_limit, %seq_input, $base, @seq_names, $final_out_pbla_file,
         $make_msp_in_sub_dir_opt, $sub_dir_size, $answer_for_self_self,
         $defined_all_ok, $ENV_BLASTDB_var, $Lean_output, $put_NO_gaps,
         $open_gap_penalty, $extension_gap_penalty, $make_MSP_files,
         $take_last_iter_PSI_BLA, $Accumulate_matches);

    my $blast_env_var='BLASTDB';
    $USER_HOME_DIR=$ENV{'HOME'};
    if(! -s ".ncbirc"){ &cp("$USER_HOME_DIR\/.ncbirc", ".ncbirc"); } ## Copying the problematic .ncbirc file

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Default parameters for blastpgp
    #_____________________________________________
    $PSI_BLA_ACCUMU_hits_eval_thresh=0.00001; ## for PSI blast
    $Accumulate_matches='';
    $Eval_limit=10;
    $alignment_opt=0;    # default is 0, This is for -m option (0-6). 6 is used for Jason's output
    $PSI_BLA_iter_limit=5;  # 7 is the default! arbitrary
    $PSI_BLA_step_eval=0.001;
    $sub_dir_size  =2;  # the default char number taken from seq name to make sub dirs
    $extension_gap_penalty=1;
    $open_gap_penalty=10;
    $gap_trigger_val=22.0; # default
    $make_msp_in_sub_dir_opt='D';

    if($ENV{'BLASTDB'}=~/^ *$/){
         print "\n# (i) Your ENV variable setting  \"BLASTDB\" is not set\n";
         print "\n# (i) BLASTDB is for the directory for various blast DBs\n";
         print "\n# (i) You can set it by \"export BLASTDB=/your/dir/for/db\" in bash shell\n";
         print "\n# (i) or \"set BLASTDB /your/dir/for/db\" in cshell or in .bashrc or .cshrc file\n";
         print "\n# (i) Do you want to set the directory path right NOW?(y/n)\n";
         $answer_for_blastenv=getc;
         if($answer_for_blastenv=~/^y/i){
                 &ask_for_ENV_vars($blast_env_var);
                 print "\n# (Warn) You need to source your .bashrc or .cshrc file to activate the ENV var\n";
         }elsif($answer_for_blastenv=~/^n/i){
                 print "\n# (i) I hope it is O.K., to run Blast without BLASTDB env var\n";
         }else{  print "\n# (ERROR) You pressed none of y or n, I am dieing \n";
                 die;
         }
    }elsif($ENV{'BLASTDB'}=~/\S/){
         print "\n# (i) $blast_env_var is set to ", $ENV{'BLASTDB'}, "\n";
         $ENV_BLASTDB_var=$ENV{'BLASTDB'};
    }

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # getting options
    #________________________________________________________
    if($vars{'d'}){    $source_DB_file =$vars{'d'} };
    if(-s $vars{'i'}){ $input_seq_file =$vars{'i'} };
    if($vars{'E'}){    $Eval_limit     =$vars{'E'} };
    if($vars{'j'}){    $PSI_BLA_iter_limit=$vars{'j'} };
    if($vars{'e'}=~/(\S+)/ or $vars{'h'}=~/(\S+)/){ $PSI_BLA_step_eval=$1 };
    if($vars{'m'}=~/(\S+)/){ $alignment_opt=$1 };
    if($char_opt=~/o/){ $over_write='o' }
    if($char_opt=~/l/){ $take_last_iter_PSI_BLA= 'l'  } ## this is relevant to convert_bla_to_MSPA_file_format
    if($char_opt=~/D/){$make_msp_in_sub_dir_opt= 'D' } ## You'd better set this as default
    if($char_opt=~/L/){    $Lean_output        = 'L' }
    if($char_opt=~/q/){    $make_MSP_files     = 'q' }
    if($char_opt=~/G/){    $put_NO_gaps        = 'G';
        $open_gap_penalty=12; $extension_gap_penalty=1;
        $blast_g_option='F';
        $Eval_limit=3000;
        $PSI_BLA_iter_limit=3;
        $PSI_BLA_step_eval=30;
        $gap_trigger_val=22.0; # default is 22.0 , -N option
    }


    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Assign DB file and query FASTA file (if not given)
    #_________________________________________________________
    if(! $source_DB_file and ! $input_seq_file){
        print "\n# (Warn) You did not set d= and i= options for DB and Inquiry, I set automatically\n";
        $source_DB_file=${&get_largest_file(@file)};
        $input_seq_file=${&get_smallest_file(@file)};

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~```
        # If one of the files is missing, ask if it is for self self search and
        #  make query=target, else die
        #_______________________________________________________________________
        $defined_all_ok=&check_if_defined($source_DB_file, $input_seq_file);
        unless($defined_all_ok){
                print "\n# (Q) $source_DB_file $input_seq_file Did you want to do self self search? ->(y/n) ";
                $answer_for_self_self=<STDIN>;
                if($answer_for_self_self =~/y/i){
                        if($input_file_name){  $sequence_DB=$input_file_name }
                        else{ $input_file_name=$sequence_DB };
                        close(STDIN);
                }else{
                        print "\n# (E) You seemed made a mistake, O.K., I will kill myself!\n\n";
                        print chr(7);  die;
                }
        }
        print "\n# (i) \$source_DB_file is $source_DB_file, \$input_seq_file is $input_seq_file\n";
    }else{
        print "\n# (i) I have \$input_seq_file : $input_seq_file, \$source_DB_file : $source_DB_file\n";
        print "\n# (i) $0 uses \"gzip\" program, just warning :-) \n";
    }

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Check if formatdb has to be been run. If not, Let's run it!
    #_______________________________________________________________
    $blast_db_phr_file="$source_DB_file\.phr";
    $blast_db_pin_file="$source_DB_file\.pin";
    $blast_db_psq_file="$source_DB_file\.psq";
    unless(-s $blast_db_phr_file and -s $blast_db_pin_file and -s $blast_db_psq_file ){
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
        # Checking in the ENV BLASTDB for xxxx.phr, xxx.pin, xxxx.psq
        #_____________________________________________________
        unless( -s "$ENV_BLASTDB_var\/$blast_db_phr_file" and
                -s "$ENV_BLASTDB_var\/$blast_db_pin_file" and
                -s "$ENV_BLASTDB_var\/$blast_db_psq_file"){
           if(&find_program_in_path("formatdb")){
                   system("formatdb -t $source_DB_file -i $source_DB_file");
           }else{
                print "\n# (E) I am trying to run formatdb to make blastdb, but can not find it\n";
                print "\n# (E) Without running formatdb over seq DB, you can not run psi-blast\n";
                die;
           }
        }else{
               print "\n# (i) do_psi_blast_search: I found ENV BLASTDB ($ENV_BLASTDB_var) set";
        }
    }

     $base=${&get_base_names(\$source_DB_file)};

     %seq_input=%{&open_fasta_files(\$input_seq_file)};

     @seq_names= keys %seq_input;
     @seq_names= @{&scramble_array(\@seq_names)};

     print "\n# (i) Number of sequences to search is: ", scalar(@seq_names), " \n";

    for($i=0; $i< @seq_names; $i++){
         my(%msp_format_from_bla, $first_char, $seq_name, $temp_file_name, $out_pbla_file,
            $out_msp_file, $gzipped_pbla);
         $seq_name=$seq_names[$i];
         print "\n# (i) $seq_name is being processed, please wait";
         $temp_file_name="$seq_name\.spfa";
         $out_pbla_file="$seq_name\.pbla";
         $out_msp_file="$seq_name\.mspa";
         $gzipped_pbla="$out_pbla_file\.gz";
         #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
         # If D option is set
         #____________________________________________
         if($make_msp_in_sub_dir_opt){
             $first_char= substr("\U$seq_name", 0, $sub_dir_size);
             unless(-d $first_char){
                 mkdir ("$first_char", 0777);
                 print "\n# (i) do_psi_blast_search: You set \'d\' or \'D\' opt\n";
                 print "# (i) making subDIRs ($first_char) with $seq_name $sequence_DB to store MSP files\n";
             }
         }
         #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
         # Let's make each fasta file for each seq to be used in searching
         #_____________________________________________________________________
         %single_seq=($seq_name, $seq_input{$seq_name});
         &write_fasta(\%single_seq, $temp_file_name ); ## e for writing each file

         if( !(-s $out_pbla_file) or $over_write){
             if($make_msp_in_sub_dir_opt){  $final_out_pbla_file="$first_char/$out_pbla_file";
             }else{                         $final_out_pbla_file="$out_pbla_file";   }
             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
             # This is the actual running part for Blastpgp
             #________________________________________________________________
             print "\n#  (i) Running Blast with j=$PSI_BLA_iter_limit, h=$PSI_BLA_step_eval";
             if(-s "./Bin/blastpgp"){
                 system("/Bin/blastpgp -d $source_DB_file -i $temp_file_name -j $PSI_BLA_iter_limit -e $Eval_limit -h $PSI_BLA_step_eval -b 15000 -v 15000 -o $final_out_pbla_file");
             }elsif(&find_program_in_path("blastpgp")){
                 print "\n#  (i) Running blastpgp in PATH: -h $PSI_BLA_step_eval -j $PSI_BLA_iter_limit -g $blast_g_option -N $gap_trigger_val -G $open_gap_penalty -m $alignment_opt -E $extension_gap_penalty \n";
                 if($put_NO_gaps){
                     print "\n# (i) Putting NO gaps !!!!!";
                     system("blastpgp -d $source_DB_file -i $temp_file_name -j $PSI_BLA_iter_limit -e $Eval_limit -h $PSI_BLA_step_eval -b 15000 -v 15000 -G $open_gap_penalty -N $gap_trigger_val -m $alignment_opt -o $final_out_pbla_file ");
                 }else{
                     system("blastpgp -d $source_DB_file -i $temp_file_name -j $PSI_BLA_iter_limit -e $Eval_limit -h $PSI_BLA_step_eval -b 15000 -v 15000 -N $gap_trigger_val -G $open_gap_penalty -E $extension_gap_penalty -m $alignment_opt -o $final_out_pbla_file ");
                 }
             }elsif(-s "/usr/local/bin/blastpgp"){
                 system("/usr/local/bin/blastpgp -d $source_DB_file -i $temp_file_name -j $PSI_BLA_iter_limit -e $Eval_limit -h $PSI_BLA_step_eval -g $blast_g_option -b 15000 -v 15000 -G $open_gap_penalty -E $extension_gap_penalty -m $alignment_opt -o $final_out_pbla_file ");
             }elsif(-s "/local/bin/blastpgp"){
                 system("/local/bin/blastpgp  -d $source_DB_file -i $temp_file_name -j $PSI_BLA_iter_limit -e $Eval_limit -h $PSI_BLA_step_eval -g $blast_g_option -b 15000 -v 15000 -G $open_gap_penalty -E $extension_gap_penalty -m $alignment_opt -o $final_out_pbla_file");
             }else{
                 print "\n# Error, do_psi_blast_search in $0 can not find \"blastpgp\" (psi-blast) in PATH, dying\n\a\n"; die;
             }

             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
             # Going into SUB directory if 'D' is set and Writing MSP
             #________________________________________________________
             if($make_msp_in_sub_dir_opt){  chdir("$first_char");    }
             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
             # Converting PBLA to MSP file format, for each run (for geanfammer each conversion is O.K.
             #___________________________________________________________________________________________________
             if($make_MSP_files){
                 print "#  (i) Writing MSP file with $seq_name $out_pbla_file, $Lean_output, $take_last_iter_PSI_BLA\n";
                 %msp_format_from_bla=%{&convert_bla_to_MSPA_file_format(\$out_pbla_file,
                                                            $Lean_output,
                                                            $take_last_iter_PSI_BLA,
                                                            "i=$PSI_BLA_iter_limit",
                                                            "SEQ_NAME=$seq_name",
                                                            "e=$PSI_BLA_ACCUMU_hits_eval_thresh",
                                                            "E=$Eval_limit",
                                                            $report_only_the_best,
                                                            $Accumulate_matches,
                                                            $verbose
                                                            )};
                 &write_mspa_files(\%msp_format_from_bla, $out_msp_file);
                 push(@final_msp_out, $out_msp_file);
             }
             &compress_files_by_gzip($out_pbla_file) unless $Lean_output;
             if($make_msp_in_sub_dir_opt){  chdir('..');  }
         }else{
             print "\n# (Warn) It seems $out_pbla_file or $gzipped_pbla is already present, skipping psi run\n";
         }
         unlink($temp_file_name) if $cleanup_dir;
    }
    #print "\n do_psi_blast_search is returning @final_msp_out \n";
    return(\@final_msp_out);
} # do_psi_blast_search



#________________________________________________________________________________
# Title     : convert_bla_to_MSPA_file_format
# Usage     : %hash_out_final=%{&convert_bla_to_MSPA_file_format(\$file, [$Lean_output])};
# Function  : reads in PSI blast output and produces MSPA file format.
#             Takes all the good hits below certain threshold in multiple iteration
#             Reports the best evalue with a given sequence name
# Example   : %hash_out=%{&convert_bla_to_MSPA_file_format(\$file)};
# Keywords  : pbla_to_mspaa, blast_to_mspaa, bla_2_mspaa, blastp_to_mspaa_format,
#             blast_to_mspaa_format, convert_bla_to_MSPA_file_format, convert_bla_to_MSPA_file_format_files
#             bla_to_mspaa
# Options   :
#   $pdbd_seq_only  d   for getting dxxxx_ like seq names only(pdb40d names for examp)
#   $all_seq  a         for forcing all seq conversion
#   $which_iteration= by i=    # choose which iteration result you want to take
#   $which_iteration   as just a digit
#   $report_only_the_best=b by b -b
#   $take_last_iter_PSI_BLA=l by l
#   $PSI_BLA_ACCUMU_hits_eval_thresh= by e=
#   $genome_seq_only=g      by g
#   $nrdb_seq_only=n        by n
#   $Evalue_thresh=         by E=
#   $Accumulate_matches=A   by A -A
#   $Lean_output=L          by L -L  # to remove search output to unclutter
#
# Author    : Sarah Teichmann and Jong Park, jong@salt2.med.harvard.edu
# Version   : 5.5
#--------------------------------------------------------------------------------
sub convert_bla_to_MSPA_file_format{
    my($i, $j, $k, @lines, $match_string_count,  $line_count, $query_string_count,
       $match_length, $Lean_output, $SEQ_NAME, $original_query,
       $duplicated_match_count, $new_sorted_name, $sorted_name, $verbose,
       $pdbd_seq_only, $which_iteration, $report_only_the_best,
       $genome_seq_only, $all_seq, $header_found, $PSI_BLA_ACCUMU_hits_eval_thresh,
       $take_last_iter_PSI_BLA, $nrdb_seq_only, $system_mem_size,
       $get_the_final_iteration, $read_entry_lines, $verbose, $Accumulate_matches,
       $CONVERGED_sign_found, $Evalue_limit, $entry_and_alignment_found, $query,
       );
    my $match_leng_thresh=10;
    ### This localization is critial NOT my, as I use a sub which relies on this
    local(%hash_out, %accumulative_hits, $file, $score, $score_ori, $evalue,
          $evalue_ori, $seq_id, $query_range_start, $query_range_stop,
          $match_string_start, $match_string_stop, $matched, $matched_seq_name,
          $read_point_found, $summary_lines_found, $entry_found, %good_matches_list);
    $duplicated_match_count=0;
    $Evalue_thresh=40;
    $Evalue_limit=5;
    $PSI_BLA_ACCUMU_hits_eval_thresh=0.0001; ## default eval threshes
    $query=$original_query='query_seq'; ## default query seq name, to avoid blank name
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Processing the input arguments to get file and options etc
    #_____________________________________________________________
    for (@_){
        if(ref $_ eq 'ARRAY'){ @lines =@{$_};
        }elsif( ref $_ eq 'SCALAR' and -s ${$_} ){ $file=${$_};
        }elsif( -s $_ ){            $file=$_;
        }elsif(/^ *d *$/){          $pdbd_seq_only='d'; $all_seq=''; $genome_seq_only='';
            print "\n $0: convert_bla_to_MSPA_file_format,  You set \$pdbd_seq_only option, I will skip others.\n";
        }elsif(/^ *b *$/){          $report_only_the_best='b';
        }elsif(/^ *a *$/){          $all_seq='a'; $genome_seq_only=''; $pdbd_seq_only=''; $nrdb_seq_only='';
        }elsif(/^ *g *$/){          $genome_seq_only='g'; $all_seq=''; $pdbd_seq_only='';$nrdb_seq_only='';
        }elsif(/^ *n *$/){          $nrdb_seq_only='n'; $all_seq=''; $pdbd_seq_only=''; $genome_seq_only=''; }
        if(/^ *l *$/){          $take_last_iter_PSI_BLA='l'; $Accumulate_matches='' }
        if(/^ *v *$/){          $verbose='v'; }
        if(/^ *L *$/){          $Lean_output='L'; }
        if(/^\s*e=(\S+)/){          $evalue_thresh=$1; }
        if(/^\s*SEQ_NAME=(\S+)/i){  $query=$original_query=$SEQ_NAME=$1;  }
        if(/^\s*E=(\S+)\s*/){       $Evalue_limit=$1;          }
        if(/^\s*A$/){           $Accumulate_matches='A'; $take_last_iter_PSI_BLA=''; }
        if(/^ *i= *(\d+) *$/){ $which_iteration=$1; }
     }
     unless($which_iteration){  $get_the_final_iteration=1 }

     #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
     # If the input file is gzipped, uncompress it to text file and then open
     #__________________________________________________________________
     if($file=~/\.gz *$/){
         open(BLA_FILE, "gunzip -c $file|") || die "\n# $0: Failed to open $file\n\n\n";
         if($file=~/^([de]*\d\d*\w\w\w\w\w)\./){         $query=$1;          }
     }else{
         open(BLA_FILE, "$file") || die "\n# !! $0: convert_bla_to_MSPA_file_format : Failed to open \"$file\"\n\n\n";
         if($file=~/^([de]*\d\d*\w\w\w\w\w)\./){         $query=$1;          }
     }

     ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
     ###                                                                           ###
     ###  (1) Main reading in .pbla file (or any extension)                        ###
     ###   by putting pattern matches which occur most, I can save comparisons     ###
     ###___________________________________________________________________________###
     while(<BLA_FILE>){
         unless(/^(\S+) .{30,} +\d+ +([\d\.e\-]+)\s*$/ or /^\s*$/){ ## remove summary and NEW lines to reduce memory takeup
            push(@lines, $_);
         }
     }
     close(BLA_FILE);
     BLA2: for ($i=0; $i< @lines; $i++){
         if($lines[$i]=~/^Query=\s+(\S+)/){  $query=$original_query=$1; $i++;
         }elsif($lines[$i]=~/\<\S\>\s{,4}Query=\<\S\>/i){  $query=$original_query=$SEQ_NAME; $i++;  }
         if($lines[$i]=~/^Database:\s+(\S+)/){         $source_DB=$1;  $i++;       }
         if($lines[$i]=~/^Searching\.\..+[done]?/i){   $which_iteration++;
            splice(@lines, 0, $i); $i=0;
         }

         MATCH_FOUND:
         if($lines[$i]=~/^\>(\S+)/){
              $matched=$1;
              if($matched !~/\|/){
              }elsif($matched=~/pdb\|(\S+)\|(\S+)$/i){ $matched="$1$2"
              }elsif($matched=~/^gi\|\S*?\|?([^\|]+)$/i
                 or  $matched=~/^\S+\|\S*\|([^\|]+)$/){ $matched=$1 }

              $i+=2;
              until( $lines[$i]=~/^\s+Score/){  $i++;  }
              if( $lines[$i]=~/^\s+Score\s+=\s+(\S+)\s+bits\s+\(\S+\)\,\s+Expect\s+=\s+(\S+)/i
                  or /^\s+Score\s+=\s+(\S+)\s+bits.+\,\s+Expect\s+=\s+(\S+)/i ){
                  ($score, $evalue)=($1, $2);
              }

              if($evalue=~/^e\-\d\d\d/){ $evalue="1".$evalue; } ## bug fix for short eval in blast distribution
              $i++;
              if( $lines[$i]=~/^\s+Identities\s+=\s+\S+\/(\S+)\s+\(\s*(\S+)\s*\%\)/i){
                  $match_length=$1;
                  if($match_length < $match_leng_thresh){  $matched=''; $i+=2; goto MATCH_FOUND;
                  }else{   $seq_id=$2/100;  }
              }
              $i++;

              MATCH_CONTINUED:
              if($lines[$i]=~/^Query\:\s+(\d+)\s+(\D+)\s+(\d+)/){
                   $query_string_count++;
                   $aligned_seq_query .= $2;
                   if($query_string_count==1){  ($query_range_start, $query_range_stop)=($1, $3);
                   }elsif($query_string_count > 1){ $query_range_stop=$3;     }
                   $i+=2;
                   if($lines[$i]=~/^Sbjct\:\s+(\d+)\s+(\D+)\s+(\d+)/i){
                        $match_string_count++;
                        $aligned_seq_match .= $2;
                        if($match_string_count==1){ ($match_string_start, $match_string_stop)=($1, $3);
                        }elsif($match_string_count > 1){ $match_string_stop=$3;      }
                   }
                   $i++;
                   goto MATCH_CONTINUED;
              }
              $sorted_name=join(' ', sort("$query\_$query_range_start\-$query_range_stop",
                                          "$matched\_$match_string_start\-$match_string_stop"));
              if($Evalue_thresh >= $evalue){
                  $read_point_found=$which_iteration;
                  if($hash_out{$sorted_name}=~/^\S+\s+(\S+)\s+\S+/){
                      if($1 > $evalue){
                          $hash_out{$sorted_name}=sprintf("%-s\t%-5s\t%-4s\t%-4s %-s %-s %-3s %-s\t%-8s\t%s %s<=>%s\n",
                                     $score, $evalue, $seq_id, $query_range_start, $query_range_stop,
                                     $query, $match_string_start, $match_string_stop, $matched,
                                     $read_point_found, $aligned_seq_query, $aligned_seq_match);
                      }
                  }else{
                      $hash_out{$sorted_name}=sprintf("%-s\t%-5s\t%-4s\t%-4s %-s %-s %-3s %-s\t%-8s\t%s %s<=>%s\n",
                                 $score, $evalue, $seq_id, $query_range_start, $query_range_stop,
                                 $query, $match_string_start, $match_string_stop, $matched,
                                 $read_point_found, $aligned_seq_query, $aligned_seq_match);
                  }
                  #print ALIGNMENT ">$sorted_name\n$aligned_seq_query\n$aligned_seq_match\n";
              }
              $query_string_count=$score=$evalue=$seq_id=$query_range_stop=$query_range_start=
              $match_string_stop=$mspaa_line=$new_sorted_name=$match_string_count=$aligned_seq_query=$aligned_seq_match='';
              goto MATCH_FOUND;
         }
     }

     #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~```
     # CLeaning up the BLA file if $Lean_output is set
     #_____________________________________________________
     $gzipped_search_file="$file\.gz";
     if($Lean_output ){ ## If Lean_out opt is set and $file exists and %hash_out is not empty, remove $file
          if(-s $file){
                  unlink($file);  ## removes fam_8_8.pbla etc,
          }elsif(-s $gzipped_search_file){
                  unlink($gzipped_search_file); ## removes fam_8_8.pbla.gz etc,
          }else{
             print "\n# (E) convert_bla_to_MSPA_file_format: tried to remove search out file for \$Lean_output opt,
                   but failed. Something is wrong. Think! or report to jong\@salt2.med.harvard.edu,
                   jong\@mrc-lmb.cam.ac.uk, sat\@mrc-lmb.cam.ac.uk, jong_p\@hotmail.com\n";
                   exit;
          }
     }
     return(\%hash_out);
}






#______________________________________________________________________________
# Title     : do_fasta_and_ssearch
# Usage     : $gzipped_msp_file=${&do_fasta_and_ssearch("a=$algorithm",
#                        "O=$out_file_msp_name", "File=$temp_file_name", "u=$upper_expect_limit",
#                        "DB=$sequence_DB", "k=$k_tuple", "$machine_readable")};
#
# Function  : runs FASTA or SSEARCH algorithm (given by a= option)
# Example   :
# Keywords  : run_ssearch_sequence_search, do_fasta_sequence_search,
#             do_fasta_search
# Options   :
#             Query_seqs=  for enquiry sequences eg)  "Query_seqs=$ref_of_hash"
#             DB=   for target DB  "DB=$DB_used"
#             File= to get file base(root) name.  "File=$file[0]"
#             i= to get file base(root) name. same as File=
#             m  for MSP format directly from FASTA or Ssearch result than through sso_to_msp to save mem
#             s  for the big single output (msp file output I mean)
#             s= for the single big msp file name
#             O= for Out file name, same as s=
#             o  for overwrite existing xxxx.fa files for search
#             c  for create SSO file (sequence search out file)
#             d  for very simple run and saving the result in xxxx.gz format in sub dir starting with one char
#             r  for reverse the query sequence
#             R  for attaching ranges of sequences
#             k= for k-tuple value. default is 1 (ori. FASTA prog. default is 2)
#             u= for $upper_expect_limit
#             l= for $lower_expect_limit
#             a= for choosing either fasta or ssearch algorithm
#             d= for defining the size of subdir made. 2 means it creates
#                    eg, DE while 1 makes D
#             d  for $make_gz_in_sub_dir_opt, putting resultant sso files in gz format and in single char subdir
#             D  for $make_msp_in_sub_dir_opt, convert sso to msp and put in sub dir like /D/, /S/
#             n  for new format to create new msp file format with sso_to_msp routine
#          PVM=  for PVM run of FASTA (FASTA only)
#             M  for machine readable format -m 10 option
#             M= for machine readable format -m 10 option
#             N  for 'NO' do not do any processing but, do the searches only.
#       FILE_AGE for defining the age of file in days to be overwritten.
# Author    : Jong Park, jong@salt2.med.harvard.edu, for commercial use, ask me.
# Category  :
# Version   : 1.3
#------------------------------------------------------------------------------
sub do_fasta_and_ssearch{
		#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
		my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
		my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
		my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
		my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
		my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
		if($debug==1){print "\n\t\@hash=\"@hash\"
		\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
		\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
		#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		my($algorithm, $upper_expect_limit, $sequence_DB, $k_tuple, $machine_readable,$create_sso,
			 @msp_from_temp, $gzipped_msp_file, @temp, $temp_file_name, $add_range,
			 $out_file_msp_name, $input_file_name, $out_file_sso_name,
			 $gzipped_sso_file, $machine_readable);
		$algorithm='fasta';    $out_file_msp_name='default_out_file.mspa';
		$upper_expect_limit=1;              $sequence_DB=$ENV{'NRDB_FASTA'};
		$k_tuple=1;            $machine_readable='M';
		$upper_expect_limit=1; $lower_expect_limit=0;
		$Score_thresh=100;

		if($vars{'a'}=~/\S+/){ $algorithm          = $vars{'a'}            };
		if($vars{'O'}=~/\S+/){ $out_file_msp_name  = $vars{'O'}            };
		if($vars{'e'}=~/\S+/){ $upper_expect_limit              = $vars{'e'}            };
		if($vars{'u'}=~/\d+/){ $upper_expect_limit = $vars{'u'}            };
		if($vars{'l'}=~/\d+/){ $lower_expect_limit = $vars{'l'}            };
		if($vars{'k'}=~/\d+/){ $k_tuple            = $vars{'k'}            };
		if($vars{'t'}=~/\d+/){ $Score_thresh       = $vars{'t'}            };
		if($vars{'r'}=~/\S+/){ $add_range          = 'r'                   };
		if($vars{'s'}=~/\S+/){ $single_big_msp     = 's'                   };
		if($vars{'DB'}=~/\S+/){            $sequence_DB=$vars{'DB'} ;
				if(-s $sequence_DB){
				}elsif(-s "../$sequence_DB"){  $sequence_DB= "../$sequence_DB"
				}elsif(-s "../../$sequence_DB"){  $sequence_DB= "../../$sequence_DB";
				}else{
						print "\n# (ERROR) do_sequence_search: You set DB param, but I can\'t find $sequence_DB\n";
						die;
				}
		}else{  print "\n# (ERROR) do_sequence_search: I need DB param defined, sorry, aborting\n"; }

		if($vars{'FILE'}=~/\S+/){ $input_file_name = $vars{'FILE'}; };
		if($vars{'File'}=~/\S+/){ $input_file_name = $vars{'File'}; };
		if($vars{'FILE_AGE'}=~/\S+/){ $age_in_days_of_out_file= $vars{'FILE_AGE'};  };
		if($vars{'Query_seqs'}=~/\S+/){ %seq_input = %{$vars{'Query_seqs'}}};
		if($vars{'Query'}=~/\S+/){      %seq_input = %{$vars{'Query'}}};
		if($vars{'u'}    =~/\S+/){ $upper_expect_limit          = $vars{'u'}            };
		if($vars{'PVM'}  =~/\S+/){ $PVM_FASTA_run  = $vars{'PVM'}; print "\n# PVM opt is set\n";     };
		if($vars{'M'}  =~/\S+/){ $machine_readable = $vars{'M'};           };

		if($char_opt=~/r/){    $add_range          = 'r' }
		if($char_opt=~/o/){    $over_write         = 'o' }
		if($char_opt=~/c/){    $create_sso         = 'c' }
		if($char_opt=~/s/){    $single_big_msp     = 's'; print "\n# Single file opt is set\n"; }
		if($char_opt=~/M/){    $machine_readable   = 'M' }
		if($char_opt=~/N/){    $No_processing      = 'N'; $create_sso='c'; }
		#print "\n# (INFO) do_fasta_and_ssearch: $algorithm,$out_file_msp_name,$input_file_name,$upper_expect_limit,DB=$sequence_DB,$k_tuple,$machine_readable\n";

		if(!$algorithm or !$input_file_name or !$sequence_DB){
				print "\n# (ERROR) One of \$sequence_DB, \$input_file_name, \$algorithm is missing\n";
				die;
		}

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# (1) Machine readable opt for fasta and ssearch set ??
		#____________________________________________________
		if($machine_readable=~/M/){
		   if(-s "./Bin/$algorithm"){
					 #print "\n# (INFO) Running $algorithm -m 10 -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple\n";
					 @temp=`./Bin/$algorithm -m 10 -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple`;
			 }else{
					 #print "\n# (INFO) Running $algorithm -m 10 -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple\n";
					 @temp=`$algorithm -m 10 -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple`;
			 }
		}else{
		   if(-s "./Bin/$algorithm"){
					 #print "\n# (INFO) Running $algorithm -m 10 -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple\n";
					 @temp=`./Bin/$algorithm -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple`;
			 }else{
					 #print "\n# (INFO) Running $algorithm -m 10 -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple\n";
					 @temp=`$algorithm -H  -E $upper_expect_limit $input_file_name $sequence_DB $k_tuple`;
			 }
		}

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# (2) @temp is parsed directly by open_sso_files to make MSP hashes
		#_______________________________________________________________________
		@msp_hashes_from_temp = @{&open_sso_files(\@temp, $add_range, "u=$upper_expect_limit", "l=$lower_expect_limit")};
		if(@msp_hashes_from_temp < 1){
				print "\n# (ERROR) do_sequence_search : Error, something is wrong with open_sso_files, LINE=", __LINE__, "\n";
				die;
		}
		@msp_from_temp= values %{$msp_hashes_from_temp[0]};

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# (3) Writing MSP file, and gzipping if possible
		#________________________________________________
		open(MSP, ">$out_file_msp_name") or die "\n# (ERROR) $out_file_msp_name could not be open\n";
		for(@msp_from_temp){    print MSP $_;  }
		close MSP;
		$gzipped_msp_file=${&compress_files_by_gzip($out_file_msp_name)};

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# (4) making SSO file if asked
		#__________________________________________________
		if($create_sso){
			 $seq_name=${&get_base_names($input_file_name)};
			 if($algorithm=~/ssearch/){
					$out_file_sso_name="$seq_name\.ssso";
					$out_file_sso_name_gz="$seq_name\.ssso\.gz";
			 }elsif($algorithm=~/fasta/){
					$out_file_sso_name="$seq_name\.fsso";
					$out_file_sso_name_gz="$seq_name\.fsso\.gz";
			 }
			 open(SSO, ">$out_file_sso_name");
			 for(@temp){  print SSO $_;  }; close (SSO);
			 if($machine_readable){
					 $out_file_msso_name="$seq_name\.msso";
					 &cp( $out_file_sso_name, $out_file_msso_name);
			 }
			 @gzipped_sso_files=@{&compress_files_by_gzip($out_file_sso_name, $out_file_msso_name)};

		}
		if(-s $gzipped_msp_file and !(-s $out_file_msp_name)){
			 #print "\n# (INFO) $gzipped_msp_file is found and will be returned";
			 return(\$gzipped_msp_file);
		}elsif(-s $out_file_msp_name){
			 #print "\n# (INFO) $out_file_msp_name is found and will be returned";
			 return(\$out_file_msp_name);
		}
}## end of do_fasta_and_ssearch



__END__




Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.