Group
Extension

Geanfammer/Bin/divclus.pl

#!/usr/bin/perl
# Last Update by /gn0/jong/Perl/update_subroutines.pl: Mon Sep  1 18:37:22 BST 1997
#____________________________________________________________________________
# Title     : diviclus.pl
# Usage     : diviclus.pl xxxxx.mspa [s=100][t=30][e=30][f=2][v]
#             diviclus.pl *.mspa  <-- For multiple MSPA file processing
#                           -while xxx.mspa is a file with MSPA file format
# Function  : divides complex single linkage cluster into smaller duplication
#               module level sub clusters.
#             1) merges similar sequences in MSPA file fomrat making
#                hash output.
#             2) processes the merged mspa contents to sort small things
#             3) connects the sequences when there is any common
#                region between preliminary clusters.
#             4) makes various files(xx.clu, xx.sat, xx.mrg) and
#                also shows in STDOUT.
#
#             To controll the division of clusters, play with
#              the below parameters(if you do not specify, defaults:
#               t=30, s=100, e=0.01, f=7
#              if you give them mulitiple files, it will process them
#              all together.
# Example   : diviclus.pl xxxxxx.mspa s=90 t=40 e=0.001
#             Above is for score 90, seqlet leng 40, evalue 0.001.
#             However, usually you dont need options. Just put xxx.mspa
#
# Keywords  : divide_clusters, diviclust, find_linker, subcluster, divicl,
# Options   :
#               !!!!!!! < Do not remove the following optio lines > !!!!!!!!!!
#
#   f=<digit>   for determing the factor in filtering out non-homologous
#                  regions, 7 = 70% now!!
#   l=<digit>   for seqlet(duplication module) length threshold
#   t=<digit>   for seqlet(duplication module) length threshold
#                  (same as l opt, confusing, huh? )
#   s=<digit>   for score threshold
#   e=<digit>   for evalue threshold
#   z           for activating remove_similar_sequences, rather than remove_dup....
#   o           for overwriting
#   v           for verbose printout (for debugging information for Sarah
#                              and Jong, NOT for John, Jason, Tom, Jessy, Pat,,)
#   d           for dynamic factor determination(if the original cluster size
#                    is very big(say, over 100 seqs), it increases the factor by
#                    10% etc.(above default or given factor)
#
#   S  $short_region=  S by S -S  # taking shorter region overlap in removing similar reg
#   L  $large_region=  L by L -L  # taking larger  region overlap in removing similar reg
#   A  $average_region=A by A -A  # taking average region overlap in removing similar reg
#
#  $short_region=  S by S -S # taking shorter region overlapped
#  $large_region=  L by L -L # taking larger  region overlapped
#  $average_region=A by A -A # taking average region overlapped
#  $verbose = v by v -v
#  $range = r by r -r
#  $merge = m by m -m
#  $sat_file = s by s -s
#  $dindom = d by d -d
#  $indup  = i by i -i
#  $over_write = w by w -w
#  $optimize   = o by o -o
#  $dynamic_factor=D by D -D
#  $score         = by  s=   # Ssearch score cutoff, default 100
#  $factor        = by  f=   # factor is for the merge proces
#                           (misoverlap tolerance factor 3=33%, 2=50%)
#                           factor works within mspa chunk for one sequence
#                           to filter a good mergable seqlets
#  $thresh = by t=   # seqlet length cutoff, default 30
#  $evalue = by e=   # maximum evalue cutoff default 30
#
# Author    : J. Park, Sarah Teichmann, sat@mrc-lmb.cam.ac.uk, jong@salt2.med.harvard.edu
# Version   : 2.5
#------------------------------------------------------------------------------

#============= Default parameters used ==========================
   $optimize='o'; ## default is set to optimized (for merging-> removing similar seqs in merging )
   $factor=7;     ## default is 7 (70% overlap is thought to be good), now 7 means 70%
   $range='r';    ## default is to write ranges in the sequences
   $merge='m';    ## default is to speed up clustering by merging
   $score=75;     ## default ssearch(or any smith waterman) score cutoff is 110 (rather low)
   $evalue=0.001; ## default ssearch evalue cutoff is 0.001 (rather stringent)
   $thresh=30;    ## default minimum seqlet length as a possible domain(cf. 40aa for DALI)
   $dynamic_factor=''; # default dynamic factor is not set.
   #$indup ='i';   # now defunt, do not use please
   #$dindom='d';   # now defunt, do not use please
   #$sat_file='s'; # now defunt, do not use please
#============= Default parameters used ==========================

my $good_bad;

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) Getting the input MSPA (matching seq pair) format file
#____________________________________________________________
my @file=@{&parse_arguments(1)};


#============= BELOW: Actual Run of divide_clusters which is the main sub of divclus ====================
&divide_clusters(
      \@file,
      "s=$score",
      "f=$factor", ## this is a very impo. parameter in the behaviour of divclus, Sarah!
      "t=$thresh",
      "e=$evalue", ## this is a very impo. parameter in the behaviour of divclus, Sarah!
      $dynamic_factor,
      $verbose,
      $range,
      $merge,
      $sat_file,
      $dindom,
      $indup,
      $over_write,
      $optimize,
      $short_region,
      $large_region,
      $average_region
);
#============= ABOVE: Actual Run of divide_clusters ====================

# Sub routine listing starts

#_______________________________________________________________________
# Title     : divide_clusters
# Usage     : &divide_clusters(\@file);
# Function  : This is the main funciton for divclus.pl
#               divides complex single linkage cluster into smaller duplication
#               module level sub clusters.
# Example   : &divide_clusters(\@file, $verbose, $range, $merge, $sat_file,
# 	                $dindom, $indup, "T=$length_thresh", "E=$Evalue_thresh", $over_write,
#                   $optimize, "s=$score", "f=$factor");
#
# Keywords  : divicl, divclus, div_clus, divide clusters
# Options   : _  for debugging.
#   f=<digit>   for determing the factor in filtering out non-homologous
#                  regions, 7 = 70% now!!
#   l=<digit>   for seqlet(duplication module) length threshold
#   t=<digit>   for seqlet(duplication module) length threshold
#                  (same as l opt, confusing, huh? )
#   s=<digit>   for score threshold
#   E=<digit>   for evalue threshold
#   z           for activating remove_similar_sequences, rather than remove_dup....
#   o           for overwriting
#   v           for verbose printout (infor)
#   D           for dynamic factor
#   S  $short_region=  S by S -S  # taking shorter region overlap in removing similar reg
#   L  $large_region=  L by L -L  # taking larger  region overlap in removing similar reg
#   A  $average_region=A by A -A  # taking average region overlap in removing similar reg
#   o  for $over_write
#
# Version   : 3.3
#------------------------------------------------------------------------
sub divide_clusters{
    #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
    my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
    my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
    my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
    my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
    my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
    if($debug==1){print "\n\t\@hash=\"@hash\"
    \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
    \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
    #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    my($merge, $verbose, $sat_file, $length_thresh, $factor, $indup, $indup_percent,
         $score, @temp_show_sub, $optimize, $file, $Evalue_thresh, $over_write, $din_dom,
         $sum_seq_num, $base_1, $output_clu_file, $short_region, $large_region,
         $average_region, $dynamic_factor, @sub_clustering_clu_files,
         @splited1, $link_or_not,  %duplicate);

    $Evalue_thresh=0.001; # the default
    $factor=7; # default factor is 7 for 70%
    $length_thresh=30;

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Dealing with options
    #_________________________________________
    if($char_opt=~/m/){        $merge='m';
    }if($char_opt=~/v/){       $verbose='v'; # for showing debugging information
    }if($char_opt=~/i/){       $indup='i';
    }if($char_opt=~/z/){       $optimize='z';
    }if($char_opt=~/o/){       $over_write='o';
    }if($char_opt=~/d/){       $din_dom='d';
    }if($char_opt=~/s/){       $sat_file='s';
    }if($char_opt=~/y/){       $dynamic_factor='y';
    }if($char_opt=~/S/){       $short_region  ='S';
    }if($char_opt=~/L/){       $large_region  ='L';
    }if($char_opt=~/A/){       $average_region='A';
    }if($vars{'T'}=~/\d+/){    $length_thresh= $vars{'T'};
    }if($vars{'l'}=~/\d+/){    $length_thresh= $vars{'l'}; ## synonym of 't'
    }if($vars{'f'}=~/\S+/){    $factor= $vars{'f'};
    }if($vars{'s'}=~/\d+/){    $score = $vars{'s'};
    }if($vars{'e'}=~/\d+/){    $Evalue_thresh= $vars{'e'}; # synonym of e
    }if($vars{'E'}=~/\d+/){    $Evalue_thresh= $vars{'E'}; # synonym of e
    }
    $percent_fac=$factor*10; # <-- this is just to show the factor in %
    print "\n(i) Input to divide_clusters sub are: \"@file\"";
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # (0) When one file input was given (yes, divclus can handle multiple files, Sarah!)
    #________________________________________________________________________________
    if(@file == 1){  #<=== @file has xxxx.mspa, yyyy.mspa  zzzz.mspa ....,
		$file=$file[0];
		$base_1=${&get_base_names($file)};
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# (2) Define the output cluster file name:  eg, 3-232_cluster_F7.clu , F7 means factor used is 7
		#______________________________________________________________________________________________
		$output_clu_file="$base_1\_F${factor}\.clu";

		if( !$over_write and -s $output_clu_file){
			print "\n# $output_clu_file Already EXISTS, skipping. Use \'o\' opt to overwrite\n"; exit;
		}

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# (3) merge_sequence_in_mspa_file does not do much. Just filtering and producing
		#     sequences in ISPA_PBS_21-215 VPR_PBS_160-354 format from mspa format
		#________________________________________________________________________________
        print "\n(i) Running merge_sequence_in_mspa_file";
        @grouped_seq_names=@{&merge_sequence_in_mspa_file(\@file, "s=$score", $optimize, $din_dom, $sat_file,
							$optimize, "T=$length_thresh", "E=$Evalue_thresh", "f=$factor", "$range", "$merge", $verbose,
							$short_region, $large_region, $average_region, $over_write, $dynamic_factor)};

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# (4) This is critical seqlet merging step. Up to now, things are fine usually.!!!
		#________________________________________________________________________________
		unless(@grouped_seq_names == 1){  ##  if @grouped_seq_names has one string like 'FAM_8_7 FAM_8_4 FAM_8_3' skip
			F1: for($i=0; $i< @grouped_seq_names; $i++){
                @splited1=split(/\s+/, $grouped_seq_names[$i]);
				for($j=0; $j< @grouped_seq_names; $j++){
    				 if($grouped_seq_names[$i] eq $grouped_seq_names[$j]){ next  }
					 @splited2=split(/\s+/, $grouped_seq_names[$j]);
                     $link_or_not=${&check_linkage_of_2_similar_seqlet_sets(\@splited1,
                                                                           \@splited2,
                                                                           "f=$factor")};
					if($link_or_not){
                        $optimize=1; ## This should be nearly always 1 !!!!!!!
                        if($optimize){ ##---- This will also remove similar seqlets, not only identical ones
                            $grouped_seq_names[$i]=join(' ', sort @{&remove_similar_seqlets( [@splited1, @splited2],
																		$short_region, $large_region, $average_region)} );
     				    }else{
							$grouped_seq_names[$i]=join(' ', grep { ! $duplicate{$_}++ } (@splited1, @splited2) );
					    }
                        splice(@grouped_seq_names, $j,1);
						$j--; $i--; next F1;
					}
				}

             }
		}
		#~~~~~~~~~~~~~~ I used to use a sub, but to save time above is inserted ~~~~~~~~~~~~~
        #@grouped_seq_names=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, $dynamic_factor,
	    #				 "f=$factor", $short_region, $large_region, $average_region, $optimize)};

				#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				# (5) This is showing the result in clu file format
				#________________________________________________________________________________
                @temp_show_sub=&show_subclusterings(\@grouped_seq_names, $file, $sat_file, $dindom, $indup,
						   "E=$Evalue_thresh", "p=$percent_fac", "f=$factor" );
				$good_bad       = $temp_show_sub[0];
				$indup_c        = $temp_show_sub[1];
				$sum_seq_num   += $temp_show_sub[2];
				push(@sub_clustering_out_files, @{$temp_show_sub[3]});

				if($good_bad==1){      push(@good, $file);
				}else{                 push(@bad, $file);       }

				#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				# (6) Final write up stage (unecessary)
				#_______________________________________________________________
          &write_good_bad_list_in_divide_clusters(\@good, \@bad);

	 }
	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 # when more than one single file input is given (Default usually)
	 #_________________________________________________________________
	 elsif(@file >1 ){
			 my (@good, @bad);
			 if($indup =~/i/i){   open (INDUP, ">indup_stat\.txt");  } # this is not essential.

			 for($i=0; $i< @file; $i++){
						my (@grouped_seq_names, @temp_show_sub, $indup_c, $big_mspa_file);
						$indup_c=0;
						$big_mspa_file=$file[$i];
						unless(-s $big_mspa_file){ print "\n# (E) \$big_mspa_file does not exist\n"; exit }

						$base_1=${&get_base_names($big_mspa_file)};
						#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
						# (1) Define the output cluster file name:  eg, 3-232_cluster_F7.clu , F7 means factor used is 7
						#______________________________________________________________________________________________
						$output_clu_file="$base_1\_F${factor}\.clu";

						if( !$over_write and -s $output_clu_file){
							print "\n# $output_clu_file Already EXISTS, skipping. Use \'w\' opt to overwrite\n";
							next;  }

						#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
						#  (2) If clu file(eg 2-1618_ss.clu ) is in pwd, tries to skip
						#____________________________________________________________
						if((-s $output_clu_file) > 10 and $over_write !~/o/){
							print "# $output_clu_file exists, skipping, use \"o\" option to overwrite\n";  next;
						}

						#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
						# (3) merge_sequence_in_mspa_file does not do much. Just filtering and producing
						#     sequences in ISPA_PBS_21-215 VPR_PBS_160-354 format of STRING from mspa format
						#     $big_mspa_file is an MSPA file
						#________________________________________________________________________________
                        @grouped_seq_names=@{&merge_sequence_in_mspa_file(\$big_mspa_file, "s=$score", $din_dom, $sat_file, $optimize,
																"T=$length_thresh", "E=$Evalue_thresh", "f=$factor", "$range", "$merge", $verbose, $over_write,
																 $short_region, $large_region, $average_region, $dynamic_factor )};
						#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
						#  (4) Clustering the sets of merged seqlets => CORE algorithm
						#_____________________________________________________________________
						unless(@grouped_seq_names == 1){  ##  if @grouped_seq_names has one string like 'FAM_8_7 FAM_8_4 FAM_8_3' skip
								F2: for($g=0; $g< @grouped_seq_names; $g++){
										@splited1=split(/ +/, $grouped_seq_names[$g]);
										for($h=0; $h< @grouped_seq_names; $h++){
												if($grouped_seq_names[$g] eq $grouped_seq_names[$h]){ next  }
												@splited2=split(/ +/, $grouped_seq_names[$h]);
												$link_or_not=${&check_linkage_of_2_similar_seqlet_sets(\@splited1, \@splited2, "f=$factor")};
												if($link_or_not){
														if($optimize){ ##---- This will also remove similar seqlets, not only identical ones
															 $grouped_seq_names[$g]=join(' ', sort @{&remove_similar_seqlets( [@splited1, @splited2],
																													 $short_region, $large_region, $average_region)} );
														}else{
															 $grouped_seq_names[$g]=join(' ', grep { ! $duplicate{$_}++ } (@splited1, @splited2) );
														}
														splice(@grouped_seq_names, $h, 1); $h--; $g--; %duplicate=(); next F2;
												}
										}
								}
						}
						#~~~~~~~~~~~~~~ I used to use a sub, but to save time above is inserted ~~~~~~~~~~~~~
						#@grouped_seq_names=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, "f=$factor", $optimize, $dynamic_factor,
						#			 $short_region, $large_region, $average_region)};
						@temp_show_sub=&show_subclusterings(\@grouped_seq_names, $big_mspa_file, $sat_file, $dindom, $indup,
																										"E=$Evalue_thresh", "p=$percent_fac", "f=$factor");
												$good_bad       = $temp_show_sub[0];
												$indup_c        = $temp_show_sub[1];
												$sum_seq_num   += $temp_show_sub[2];
						push(@sub_clustering_out_files, @{$temp_show_sub[3]});

						if($good_bad==1){          push(@good, $big_mspa_file);
						}else{         push(@bad, $big_mspa_file);       }

					}
					#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
					&write_good_bad_list_in_divide_clusters(\@good, \@bad);
					sub write_good_bad_list_in_divide_clusters{
							 my  (@good, $i, @bad); @good=@{$_[0]}; @bad=@{$_[1]};
                             open(GOODBAD, ">good_bad.list") || warn "\n Can not open good_bad.list \n\n";
							 print GOODBAD "GOOD: all link    : 000\n";
							 for($i=0; $i< @good; $i++){  print GOODBAD "$good[$i]\n";  }
							 print GOODBAD "BAD : Not all link: 000\n";
							 for($i=0; $i< @bad; $i++){   print GOODBAD "$bad[$i]\n";   }
							 close(GOODBAD);
					}
					#_______________________________________________________________

	 }
	 return(\@sub_clustering_out_files); # contains (xxxx.clu, yyy.clu,, )
}





#______________________________________________________________
# Title     : get_overlapping_range
# Usage     : @n1=@{&get_overlapping_range(\@ranges1, \@ranges2)};
# Function  :
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  : get_overlapping_range_in_mspa, get_overlapping_range_in_mspa_file,
#             get_overlapping_seq_match_range, get_overlap_seq_match_range
# Options   : _  for debugging.
#             #  for debugging.
# Returns   :
# Argument  :
# Version   : 1.1
#--------------------------------------------------------------
sub get_overlapping_range{
   my (@new_range, $R_start1, $R_start2);
   ($R_start1, $R_end1)=@{$_[0]}[0..1];
   ($R_start2, $R_end2)=@{$_[1]}[0..1];

   if(($R_start1 <= $R_start2)&&        # ------------
	 ( $R_end1 >= $R_end2) ){           #   -------
	   @new_range= ($R_start2, $R_end2);
   }elsif(($R_start1 <= $R_start2)&&    # -----------
	 ( $R_end1 <= $R_end2) &&           #    -----------
	 ( $R_end1 >  $R_start2) ){
	   @new_range= ($R_start2, $R_end1);
   }elsif(($R_start1 >= $R_start2)&&    #    -----------
	 ( $R_end1 >= $R_end2  ) &&         # -----------
	 ( $R_end2 >  $R_start1) ){
	   @new_range= ($R_start1, $R_end2);
   }elsif(($R_start1 >= $R_start2)&&    #   ------
	 ( $R_end1 <= $R_end2) ){           # -----------
	   @new_range= ($R_start1, $R_end1);
   }else{                                #  ----
	  @new_range=(0,0);                  #        --------
   }
   return(\@new_range);
}


#______________________________________________________________
# Title     : get_mspa_enquiry_sequence
# Usage     :
# Function  : gets the name of sequence used as enquiry(target)
# Example   :
# Warning   :
# Class     :
# Keywords  : get_mspa_target_sequence, get_mspa_enquiry_sequence_name
# Options   : _  for debugging.
#             #  for debugging.
# Package   : Bio
# Reference : http://sonja.acad.cai.cam.ac.uk/perl_for_bio.html
# Returns   :
# Tips      :
# Argument  :
# Todo      :
# Author    : A Scientist
# Version   : 1.0
#--------------------------------------------------------------
sub get_mspa_enquiry_sequence{
   my $lines1=${$_[0]} || $_[0];
   my ($SEQ, $matched_SEQ);
   if($lines1 =~/^ *\d+ +\d+\.?[e\-\d]* +(\d+) +(\d+) +(\S+) +(\d+) +(\d+) +(\S+)/){
	  $SEQ        =$3;
	  $matched_SEQ=$6;
   }
   return \$SEQ;
}



#______________________________________________________________
# Title     : get_mspa_matched_sequence
# Usage     :
# Function  : gets the name of sequence used as enquiry(target)
# Example   :
# Warning   :
# Class     :
# Keywords  : get_mspa_matched_sequence_name
# Options   : _  for debugging.
#             #  for debugging.
# Package   : Bio
# Reference : http://sonja.acad.cai.cam.ac.uk/perl_for_bio.html
# Returns   :
# Tips      :
# Argument  :
# Todo      :
# Author    : A Scientist
# Version   : 1.0
# Used in   :
# Enclosed  :
#--------------------------------------------------------------
sub get_mspa_matched_sequence{
   my $lines1=${$_[0]} || $_[0];
   my ($SEQ, $matched_SEQ);
   if($lines1 =~/^ *\d+ +\d+\.?[e\-\d]* +(\d+) +(\d+) +(\S+) +(\d+) +(\d+) +(\S+)/){
	  $SEQ        =$3;
	  $matched_SEQ=$6;
   }
   return \$matched_SEQ;
}
#______________________________________________________________
# Title     : get_mspa_range
# Usage     : @range=@{&get_mspa_range($seqlet)};
#             @temp=&get_mspa_range($seqlet);
#
# Function  :
# Example   :
# Warning   :
# Class     :
# Keywords  : get_mspa_file_ranges
# Options   : _  for debugging.
#             #  for debugging.
# Package   : Bio
# Reference : http://sonja.acad.cai.cam.ac.uk/perl_for_bio.html
# Returns   :
# Tips      :
# Argument  :
# Todo      :
# Author    : A Scientist
# Version   : 1.5
# Used in   :
# Enclosed  :
#--------------------------------------------------------------
sub get_mspa_range{
   my $lines1=${$_[0]} || $_[0];
   my ($SEQ, $num_seq, $matched_SEQ, @Ranges);
   if($lines1 =~/^ *\d+ +\d+\.?[e\-\d]* +(\d+) +(\d+) +(\S+) +(\d+) +(\d+) +(\S+)/){
	  $SEQ        =$3;
	  $matched_SEQ=$6;
	  if($SEQ eq $matched_SEQ){ ## skipping self match
		  $num_seq++;
	  }else{
		  @Ranges=($1, $2, $4, $5);  ## <-- example. (10-20, 30-45)
	  }
   }
   return wantarray ? (\@Ranges, \$SEQ, \$matched_SEQ): \@Ranges;
}
#________________________________________________________________________
# Title     : set_debug_option
# Usage     : &set_debug_option;
# Function  : If you put '#' or  '##' at the prompt of any program which uses
#             this sub you will get verbose printouts for the program if the program
#             has a lot of comments.
# Example   : set_debug_option #    <-- at prompt.
# Warning   :
# Keywords  :
# Options   : #   for 1st level of verbose printouts
#             ##  for even more verbose printouts
# $debug  becomes 1 by '#'  or '_'
# $debug2 becomes 1 by '##'  or '__'
#
# Returns   :  $debug
# Argument  :
# Version   : 1.8
#--------------------------------------------------------------------
sub set_debug_option{
  my($j, $i, $level);
  unless( defined($debug) ){
	 for($j=0; $j < @ARGV; $j ++){
		 if( $ARGV[$j] =~/^(_+)$|^(#+)$/){ # in bash, '#' is a special var, so use '_'
			 print __LINE__," >>>>>>> Debug option is set by $1 <<<<<<<<<\n";
			 $debug=1;
				  print chr(7);
			 print __LINE__," \$debug  is set to ", $debug, "\n";
			 splice(@ARGV,$j,1); $j-- ;
			 $level = length($1)+1;
			 for($i=0; $i < $level; $i++){
				 ${"debug$i"}=1;
				 print __LINE__," \$debug${i} is set to ", ${"debug$i"}, "\n";
			 }
		 }
	 }
  }
}
#________________________________________________________________________
# Title     : default_help
# Usage     : &default_help2;  usually with 'parse_arguments' sub.
# Function  : Prints usage information and others when invoked. You need to have
#             sections like this explanation box in your perl code. When invoked,
#             default_help routine reads the running perl code (SELF READING) and
#             displays what you have typed in this box.
#             After one entry names like # Function :, the following lines without
#             entry name (like this very line) are attached to the previous entry.
#             In this example, to # Function : entry.
# Example   : &default_help2; &default_help2(\$arg_num_limit);   &default_help2( '3' );
#             1 scalar digit for the minimum number of arg (optional),
#             or its ref. If this defined, it will produce exit the program
#             telling the minimum arguments.
# Warning   : this uses format and references
# Keywords  :
# Options   :
# Returns   : formated information
# Argument  :
# Version   : 3.3
#--------------------------------------------------------------------
sub default_help{
  my($i, $perl_dir, $arg_num_limit, $head ,$arg_num_limit,
	  @entries, @entries_I_want_write );
  my($logname)=getlogin();
  my($pwd)=`pwd`;
  my($date)=`date`;
  chomp($date,$pwd);
  my($not_provided)="--- not provided ---\n";
  my($file_to_read) = $0;

  for($i=0; $i < @_; $i ++){
	  if((ref($_[$i]) eq 'SCALAR')&&(${$_[$i]} =~ /^\d$/)){
		  $arg_num_limit = ${$_[$i]};  }
	  elsif( (!(ref($_[$i]))) && ($_[$i] =~ /^\d$/)){
		  $arg_num_limit = $_[$i];     }
  }
  my %entries = %{&read_head_box(\$file_to_read )};
  if($option_tb_found ==1){
	 @option_tb=@{&read_option_table(\$file_to_read)};
  }

  @entries = keys %entries;
  foreach $help_item (@entries){
	  ${$help_item}= $not_provided if( ${$help_item}=~/^[\W]*$/  and  !defined(${$help_item}) );
  }
  #""""""""""""""""""""""""""""""""""""""""
  #########  Writing the format <<<<<<<<<<<
  #""""""""""""""""""""""""""""""""""""""""
  $~ =HEADER_HELP;
  write;   ## <<--  $~ is the selection operator
  $~ =DEFAULT_HELP_FORM;

  @entries_I_want_write=sort keys %entries;

  for( @entries_I_want_write ){  write  }

  print chr(7);  print "_"x72,"\n\n";

  if(@ARGV < $arg_num_limit){ print "\* $0 fataly needs $arg_num_limit arguments\n\n" }

  if(  $option_tb_found == 1){
	 #########  Printing the OPTION table contents <<<<<<<<<<<<
	 print "  Press \"Return\" key to see what options $logname\'s \n\n    \"$0\" take... \n";
		 $key_press=getc();
	 print @option_tb, "\n"x2 if(@option_tb > 0);
  }
format HEADER_HELP  =
_____________________________________________________________________
		  __  __      ______     __          _____
		 /\ \/\ \    /\  ___\   /\ \        /\  _ `\
		 \ \ \_\ \   \ \ \__/   \ \ \       \ \ \L\ \
		  \ \  _  \   \ \  _\    \ \ \       \ \ ,__/
		   \ \ \ \ \   \ \ \/___  \ \ \_____  \ \ \/
		    \ \_\ \_\   \ \_____\  \ \______\  \ \_\
		     \/_/\/_/    \/_____/   \/______/   \/_/ V 3.1`
_____________________________________________________________________
.
format DEFAULT_HELP_FORM =
 @<<<<<<<<<: @*
 $_        $entries{$_}
.
}

#________________________________________________________________________________________
# Title     : merge_sequence_in_mspa_file
# Usage     :
# Function  :
# Example   : INPUT: (MSPA file) ===>
#  59     2.6        47    64     d2pia_3        10    30     d1erd___10-30
#  161    1.1e-07    24    91     d2pia_3        16    85     d1frd___16-85
#
#  722    0          1     106    d1put__        1     106    d1put___1-106
#  66     4.9        2     68     d1put__        43    106    d2lbp___43-106
#  69     1.3        12    49     d1put__        81    120    d1cgo___81-120
#
#  60     3.3        13    38     d1frd__        32    57     d1orda1_32-57
#  65     1.7        21    58     d1frd__        40    69     d2mtac__40-69
#
#   ==== OUTPUT ===>
#    d1frd___1-98 d1frd___1-98_1-98 d1frd___16-85 d2pia_3_24-91_24-91
#    d1frd___16-85_16-85 d2pia_3_24-91
#    d1put___1-106 d1put___1-106_1-106
#    d2pia_3_1-98 d2pia_3_1-98_1-98
#
# Keywords  : mergr_seq_in_mspa_file, merge_sequence_in_mspa, merge_sequences_in_mspa_file
# Options   :
#  $dynamic_factor =  y by y -y   # adjusting factor value dynamically(more seq higher factor)
#  $short_region   =  S by S -S  # taking shorter region overlapped in removing similar regions
#  $large_region   =  L by L -L  # taking larger  region overlapped in removing similar regions
#  $average_region =  A by A -A # taking average region overlapped in removing similar regions
#
# Thanks    : Alexey Eroshkin <alexey@axyspharm.com>
# Version   : 3.5
#----------------------------------------------------------------------------------------
sub merge_sequence_in_mspa_file{
		#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
		my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
		my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
		my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
		my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
		my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
		if($debug==1){print "\n\t\@hash=\"@hash\"
		\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
		\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
		#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
		my ($mspa_value, @all_seqlets, %temp_hash, @mspa_chunks, $clu_out, $size_of_all_seqlets,
		    $ragne, $base, $optimize, $mrg_out, @arr, $sat_out, %final_hash_out, @final_pre_hash,
				$length_thresh, $merge, $factor, $Evalue_thresh, $score, $dynamic_factor, $score_match,
				$eval_match, $query_seq, $query_start, $query_stop, $match_seq, $match_start,
				$short_region, $large_region, $average_region, $original_clu_size, $match_stop,
				$total_mspa_line_count);
		$factor=$default_factor=7; #~~~~ default connection factor U, 7 means 70% now!
		$length_thresh=30;
		$Evalue_thresh=1;
		$score =75;
		$range='r';
		if(@file < 1){ print "\n# (E) merge_sequence_in_mspa_file needs at least 1 MSPA file\n"; die }

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# Following changes the defaults with given parameters
		#_____________________________________________________________
		if($char_opt=~/z/i){       $optimize='z';    ## This will cause using remove_similar_seqlets than remove_dup_in_array !
		}if($char_opt=~/m/){       $merge='m';
		}if($char_opt=~/y/){       $dynamic_factor='y';
        }if($char_opt=~/r/){       $verbose='r';
        }if($char_opt=~/v/){       $verbose='v';
		}if($char_opt=~/S/){       $short_region='S';
		}if($char_opt=~/L/){       $large_region='L';
		}if($char_opt=~/A/){       $average_region='A';
		}if($vars{'T'}=~/\d+/){    $length_thresh=$vars{'T'};
		}if($vars{'f'}=~/\S+/){    $factor=$vars{'f'};  ## Here I give a generous $factor !
		}if($vars{'s'}=~/\d+/){    $score = $vars{'s'};
        }if($vars{'e'}=~/\S+/){    $Evalue_thresh= $vars{'e'};
        }if($vars{'E'}=~/\S+/){    $Evalue_thresh= $vars{'E'}; }

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		#  Just to inform what parameters have been chosen
		#_____________________________________________________________
        print "\n# (1) merge_sequence_in_mspa_file : default \$score      : $score";
        print "\n#                                 : default \$Evalue_thresh     : $Evalue_thresh";
        print "\n#                                 : used \$length_thresh : $length_thresh";
        print "\n#                                 : default \$factor     : $default_factor";
        print "\n#                                 : used    \$factor     : $factor";
        print "\n#                                 : \$dynamic_factor     : $dynamic_factor\n";

		for($c=0; $c< @file; $c++){
             open(MSPA, "$file[$c]") || die "Can not open $file[$c] \n";
			 $base=${&get_base_names($file[$c])};
			 $clu_out="$base\_F${factor}.clu"; # <-- This is the most important output. Sarah's program will process this
			 $sat_out="$base\_F${factor}.sat";
             my $total_mspa_lines=@mspa1=<MSPA>;
             print "\n $file[$c] is opened successfully \$total_mspa_lines : $total_mspa_lines\n";

			 for($i=0; $i< @mspa1; $i++){
					#~~~~~~~~~~ Include range or NOT in the seq name ~~~~~~~~~~~~~~~~~~~~~~~~~~`
					# %temp_hash is just to get the chunk of MSPA block. As mspa file uses empty line as a delimiter
					#____________________________________________________________________________
					if($char_opt=~/r/){
						 if($mspa1[$i]=~/^\s*(\S+)\s+(\S+)\s*\S*\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)/){
                              $total_mspa_line_count++;
									$score_match=$1;	$eval_match=$2;
                                    $query_seq=$5;      $query_start=$3;
									$query_stop=$4;		$match_seq=$8;
									$match_start=$6;	$match_stop=$7;
                                    if($score_match < $score or $eval_match > $Evalue_thresh){next};
									if($query_seq=~/\S+_\d+\-\d+$/){ $new_seq1=$query_seq }else{ $new_seq1="$query_seq\_$query_start\-$query_stop"; }
									if($match_seq=~/\S+_\d+\-\d+$/){ $new_seq2=$match_seq }else{ $new_seq2="$match_seq\_$match_start\-$match_stop"; }

									if($new_seq1 eq $new_seq2){ next};

									#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
									# Modifying $mspa1[$i] line !!!
									#______________________________
									$mspa1[$i]=sprintf("%s %-3s %s %s %s %s %s %s",
													$score_match, $eval_match, $query_start,
													$query_stop, $new_seq1, $match_start,
													$match_stop, $new_seq2);
									$temp_hash{$query_seq}.="$mspa1[$i]\n";
						 }
					}else{
						 if($mspa1[$i]=~/^\s*(\S+)\s+(\S+)\s*\S*\s+\d+\s+\d+\s+(\S+)[_\d+\-\d+]?\s+\d+\s+\d+\s+\S+/){
									if($1 < $score or $2 > $Evalue_thresh){	next };
									$temp_hash{$3}.="$mspa1[$i]\n";
						 }
					}#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			}
            close(MSPA);
		}
        $original_clu_size=@mspa_chunks= values(%temp_hash); ## Using temp hash is more than 2 times faster than push

        print "\n The total seq to divclus is : $original_clu_size \$total_mspa_line_count: $total_mspa_line_count\n";
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# Determining the dynamic factor here (when 'd' opt is set)
		#____________________________________________________________
		if($dynamic_factor){
				#--> 100 => 10, 1000 => 15, 10000 => 20
				print "\n# ### \$factor: $factor\n";
				$factor += (log($original_clu_size)*5)/10 - 1; ## This is a simplistic.
				if($factor > 9.5){ $factor=9.5 } # this is the very upper limit for any factor.
				print "\n# ### \$factor: $factor\n";
		}

		for($i=0; $i< @mspa_chunks; $i++){
            @arr=@{&merge_sequence_in_mspa_chunk($mspa_chunks[$i], $verbose, $optimize,
								"$merge", "E=$Evalue_thresh", "s=$score",
								"f=$factor", "T=$length_thresh",
								$short_region, $large_region, $average_region)};
			push(@all_seqlets,  @arr);
		}

		#~~~~~~~~~ sorting inner sequences in strings ~~~~~~~~~
		#______________________________________________________
		@all_seqlets=@{&sort_words_in_string(@all_seqlets)}; ## This speeds up about 2 times !!!

		#~~~~~~~ Sort by the _digit-  in seqlet names ~~~~~~~~~
		@all_seqlets= map{$_->[0]} sort{$a->[1] cmp $b->[1] or $a->[2] <=> $b->[2]  }
									map {/^\s*((\S+)_(\d+)\-(\d+).*)/ && [$1, $2, $3, $4]} @all_seqlets;

		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# merge sequences in a simple way until there is no change in the array
		#  This is an incomplete merge(merges first seqlets of string ...
		#______________________________________________________________________
		for($i=0; $i< @mspa_chunks; $i ++){
             ITERATION_RETURN_POINT:
             $size_of_all_seqlets=@all_seqlets;
             @all_seqlets = @{&merge_similar_seqlets(\@all_seqlets, $optimize,
                                                      $short_region, $large_region,
                                                      $average_region, "f=$factor")};
             if($size_of_all_seqlets > @all_seqlets){
                 @all_seqlets = @{&merge_similar_seqlets(\@all_seqlets, $optimize,
                                                $short_region, $large_region, $average_region, "f=$factor")};
                 print "\n $size_of_all_seqlets Iterating merge_similar_seqlets \n";
                 goto ITERATION_RETURN_POINT;
             }else{
                 last;
             }
		}

		if($optimize){
             @all_seqlets=@{&remove_similar_seqlets(\@all_seqlets,
                                             $short_region, $large_region, $average_region)};
             #@all_seqlets=@{&remove_dup_in_array(\@all_seqlets)};

		}else{
             @all_seqlets=@{&remove_dup_in_array(\@all_seqlets)};
		}
		return(\@all_seqlets);
}





#_______________________________________________________________________________
# Title     : add_ranges_in_mspa_line
# Usage     :
# Function  : this adds ranges to the seqnames of mspa files
#             mmp line is mspa line with additional sequences at the end
# Example   :
# Keywords  : convert_mspa_to_mmp, convert_mspa, convert_mspa_2_mmp
#             change_mspa_to_mmp, add_range_in_mspa, convert_mspa_line_to_mmp_line
# Options   : _  for debugging.
#             #  for debugging.
# Returns   :
# Argument  :
# Version   : 1.5
#-------------------------------------------------------------------------------
sub add_ranges_in_mspa_line{
   my $input_mspa=${$_[0]} || $_[0];
   my($score, $evalue, $long_1, $new_seq1, $new_seq2, $middle,
	  $start1, $end1, $start2, $end2, $seq1, $seq2, $new);

   if($input_mspa=~/^ *(\d+) +(\S+) *\S*[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\S+)[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\S+)/){
	  ($score, $evalue, $start1, $end1, $start2, $end2)=($1, $2, $3, $4, $6, $7);
	  ($seq1, $seq2)=($5, $8);
	  if($seq1=~/(\S+)\_\d+\-\d+/){
		 $new_seq1="$1\_$start1\-$end1";
	  }else{
		 $new_seq1="$seq1\_$start1\-$end1";
	  }
	  if($seq2=~/(\S+)\_\d+\-\d+/){
		 $new_seq2="$1\_$start2\-$end2";
	  }else{
		 $new_seq2="$seq2\_$start2\-$end2";
	  }
	  $new=sprintf("%-6s %-8s %-5s %-5s %-32s %-5s %-5s %-32s",
					$score, $evalue, $start1, $end1, $new_seq1, $start2, $end2, $new_seq2);
   }
   return(\$new);
}




#______________________________________________________________
# Title     : sort_by_digits_in_string
# Usage     :
# Function  : sorts arrays of strings like
#
#   MJ0228_314-573 MJ1197_348-601
#   MJ0228_451-576 sll0078_502-594 sll1425_489-611
#   MJ0228_479-572 sll0078_502-594
#
#   According to the digits after seq names _314-, _451-, _479-
#    in the above
#   This only looks at the very first sequence in the string
#
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  :
# Options   : _  for debugging.
#             #  for debugging.
# Returns   :
# Argument  :
# Version   : 1.4
#--------------------------------------------------------------
sub sort_by_digits_in_string{
   my (@out, $i,  @temp1, @temp2, $old, @T);
   my @array_of_string=sort @{$_[0]};

   for($i=0; $i<= @array_of_string; $i++){
	  if($array_of_string[$i]=~/^((\S+)_(\d+)\-(\d+) *.*)$/){
		 unless(defined($old)){
			$old=$2;
			push(@temp1, $1);
			push(@temp2, $3);
		    next;
		 }elsif($2 eq $old){
			push(@temp1, $1);
			push(@temp2, $3);
			next;
		 }elsif( ($2 ne $old)||($i==$#array_of_string) ){
			&sort_and_put_strings_to_out;
		    push(@temp1, $1);
		    push(@temp2, $3);
			$old  =$2;

			#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			sub sort_and_put_strings_to_out{
			   my ($j, $k, $num);
			   @temp2=sort { $a<=>$b } @temp2; ## sort numerically
			   F1: for($j=0; $j< @temp2; $j++){
				  $num=$temp2[$j];
				  for($k=0; $k< @temp1; $k++){
					 if($temp1[$k]=~/^(\S+)_$num\-/){
						push(@out, $temp1[$k]);
						splice(@temp1, $k, 1);
						$k--;
						splice(@temp2, $j, 1);
						$j--;
						next F1;
					 }
				  }
			   }
			   @temp1=@temp2=();

			 }#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	      }
	  }elsif($i > 0){ ## for the very last sort
		  &sort_and_put_strings_to_out;
	  }
   }
   return(\@out);
}

#________________________________________________________________________
# Title     : parse_arguments
# Usage     : &parse_arguments; or  (file1, file2)=@{&parse_arguments};
# Function  : Parse and assign any types of arguments on prompt in UNIX to
#             the various variables inside of the running program.
#             This is more visual than getopt and easier.
#             just change the option table_example below for your own variable
#             setttings. This program reads itself and parse the arguments
#             according to the setting you made in this subroutine or
#             option table in anywhere in the program.
# Example   : &parse_arguments(1);
#             @files=@{&parse_arguments(1)};
# Warning   : HASH and ARRAY mustn't be like = (1, 2,3) or (1,2 ,3)
# Keywords  :
# Options   : '0'  to specify that there is no argument to sub, use
#              &parse_arguments(0);
#             parse_arguments itself does not have any specific option.
#             '#' at prompt will make a var  $debug set to 1. This is to
#              print out all the print lines to make debugging easier.
#             'e=xxxx' for filtering input files by extension xxxx
#
# Returns   : Filenames in a reference of array
#             and input files in an array (file1, file2)=@{&parse_arguments};
# Argument  : uses @ARGV
# Version   : 1.8
#--------------------------------------------------------------------
sub parse_arguments{
  my( $c, $d, $f, $arg_num, $option_table_seen, $n, $option_filtered,
		$option_table_example, $input_line, @input_files,
		$extension);
  #"""""""""""""""""""""""""""""""""""""""""""""""""""""""
  #   Checks if there were arguments
  #""""""""""""""""""""""""""""""""""""""""""""""""""""""""
  if( @ARGV < 1 ){ #<-- If Argument is not given at prompt
	  for(@_){
		 if($_ eq '0'){
			 last;
		 }else{
			 print "\n \"$0\" requires at least one Argument, suiciding.\n\n";
			 print chr(7); #<-- This is beeping
			 print "  To get help type \"$0  h\"\n\n\n ";
			 exit;
		 }
	  }
  }
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  #  Checking some input options like 'e=txt' for extension filtering
  #_____________________________________________________________________
  for($i=0; $i< @_; $i++){
	  if($_[$i]=~/e=(\S+)/){
		  push(@extension, $1);
	  }
  }

  #""""""""""""""""""""""""""""""""""""""""""""""""""
  #   Some DEFAULT $debug variables for debugging purposes
  #""""""""""""""""""""""""""""""""""""""""""""""""""
  &set_debug_option;
  #""""""""""""""""""""""""""""""""""""""""""""""""""""""""
  #  If there is only one prompt arg. and is [-]*[hH][elp]*, it calls
  #   &default_help and exits
  #""""""""""""""""""""""""""""""""""""""""""""""""""""""""
  if( ( @ARGV == 1 ) && ($ARGV[0] =~ /^[\-]*[hH\?][elp ]*$/) ){
		&default_help;
		exit;
  }
  for($f=0; $f < @ARGV; $f++){
	 if( $ARGV[$f] =~ /\w+[\-\.\w]+$/ and -f $ARGV[$f] ){
		 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		 # When extension is defined, filter files by it
		 #____________________________________________
		 if(@extension > 0){
		     for($e=0; $e < @extension; $e++){
				 $extension=$extension[$e];
				 if($ARGV[$f]=~/\S\.$extension/){
					 push(@input_files, $ARGV[$f] );
				 }else{ next }
			 }
		 }else{
			 push(@input_files, $ARGV[$f] );
			 next;
		 }
	 }
  }

  #""""""""""""""""""""""""""""""""""""""""""""""""""""""""
  #     Reading the running program script
  #"""""""""""""""""""""""""""""""""""""""""""""""""""""""
  &assign_options_to_variables;
  if($HELP == 1){ &default_help }
  return(\@input_files);
}
#________________________________________________________________________
# Title     : read_option_table
# Usage     :
# Function  : Reads the option table made by Jong in any perl script. The
#             option table is a box with separators.
# Example   :
# Warning   :
# Keywords  :
# Options   :
# Returns   :
# Argument  :
# Version   : 1.0
#--------------------------------------------------------------------
sub read_option_table{
	my($table_found, @option_tb, $head);
	 open(SELF, "${$_[0]}");
	 while(<SELF>){
		if( (/^ *#+/) && ( $table_found== 1) ){
		  push (@option_tb, "$_");
		}elsif( ($table_found != 1)&&(/^ *\#+ *[Oo]ption *[Tt]able */) ){
			$table_found=1; $head="############## Option Table for $logname\'s \"$0\"\n"; ##
			push(@option_tb, $head);
		}
		if( ($table_found==1)&&(/^ *###################+ *$/)){  ### to find the end point of reading
			$table_found =0; last; }
	 }
	 return(\@option_tb);
}
#______________________________________________________________
# Title     : get_internal_dup_in_a_cluster
# Usage     :
# Function  :
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  :
# Options   : _  for debugging.
#             #  for debugging.
#  $short_region=  S by S -S  # taking shorter region overlapped in removing similar regions
#  $large_region=  L by L -L  # taking larger  region overlapped in removing similar regions
#  $average_region=A by A -A # taking average region overlapped in removing similar regions
#
# Version   : 1.2
#--------------------------------------------------------------
sub get_internal_dup_in_a_cluster{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

	$cluster_line=$_[0] || ${$_[0]};
	my(@seq, %out, $seq_name, $short_region, $large_region, $average_region);
	my $overlap_factor=30;
	my $min_inside_dom_size=30;
	@seq=split(/ +/, $cluster_line);  ## These sequence are single seq with different regions
	@seq= map{$_->[0]} sort{$a->[1] cmp $b->[1] or $a->[2] <=> $b->[2] }
			             map {/^((\S+)_(\d+)\-(\d+) *.*)$/ && [$1, $2, $3, $4]} @seq;
	if($char_opt=~/S/){       $short_region='S'; }
	if($char_opt=~/L/){    $large_region='L';   }
	if($char_opt=~/A/){    $average_region='A'; }

	F1:for($i=0; $i< @seq; $i++){
	   $seq1=$seq[$i];
	   if($seq1=~/^(\S+)_(\d+)\-(\d+)/){
		  $seq_name=$1;
		  $start1=$2;
		  $end1=$3;
	   }
	   F:for($j=1; $j< @seq; $j++){
		  $seq2=$seq[$j];
		  if($seq1 eq $seq2){ next } ### Skip IDENTICAL ones (xxxx_1-10, xxxx_1-10)
		  if($seq2=~/^(\S+)_(\d+)\-(\d+)/){
			 $start2=$2;
			 $end2=$3;
		  }
		  $leng2=$end2-$start2;
		  $margin=$leng2/12;   ## 8% overlap is regarded as not overlapping

		  if(( ($start1+$margin) > $end2)||
		    ( ($start2+$margin) > $end1)){ # skips non overlapping seqlets

			$out{"$start1\-$end1"}.="$start2\-$end2 ";

			splice(@seq, $j, 1);
			$j--;
		  }
	   }
	}
	#@out=sort (@out);
	#@out=@{&remove_dup_in_array(\@out)};
	#@out=@{&remove_similar_seqlets(\@temp, "f=2", $short_region, $large_region, $average_region)};
	return(\%out);
}
#________________________________________________________________________
# Title     : read_head_box
# Usage     : %entries = %{&read_head_box([\$file_to_read, \@BOXED ] )};
# Function  : Reads the introductory header box(the one you see on top of sub routines of
#             Jong's programs.). Make a hash(associative array) to put entries
#             and descriptions of the items. The hash values have new lines '\n' are
#             attached, so that later write_head_box just sorts Title to the top
#             and prints without much calculation.
#             This is similar to read_head_box, but
#             This has one long straight string as value(no \n inside)
#             There are two types of ending line one is Jong's #---------- ...
#             the other is Astrid's  #*************** ...
# Example   : Output is something like
#             ('Title', 'read_head_box', 'Tips', 'Use to parse doc', ...)
# Warning   :
# Keywords  : open_head_box, open_headbox, read_headbox
# Options   : 'b' for remove blank lines. This will remove all the entries
#             with no descriptions
# Returns   : A hash ref.
# Argument  : One or None. If you give an argu. it should be a ref. of an ARRAY
#              or a filename, or ref. of a filename.
#             If no arg is given, it reads SELF, ie. the program itself.
# Version   : 2.7
#--------------------------------------------------------------------
sub read_head_box{
  my($i, $c, $d, $j, $s, $z, @whole_file, $title_found, %Final_out,
	  $variable_string, $TITLE, $title, @keys, $end_found, $line, $entry,
	  $entry_match, $End_line_num, $remove_blank,  $title_entry_null,
	  $end_found, $Enclosed_entry, $Enclosed_var, $blank_counter,
	  $title_entry_exist, $entry_value, $temp_W, $Warning_part
	);

	if(ref($_[0]) eq 'ARRAY'){ ## When array is given
		@whole_file = @{$_[0]};
	}elsif(-e ${$_[0]}){       ## When filename is given in a ref
		open(FILE, "${$_[0]}");
		@whole_file=(<FILE>);
	}elsif(-e $_[0]){          ## When filename is given
		open(FILE, "$_[0]");
		@whole_file=(<FILE>);
	}elsif( $_[0] eq 'b'){          ## When filename is given
		$remove_blank = 1;
	}elsif( ${$_[0]} eq 'b'){          ## When filename is given
		$remove_blank = 1;
	}else{
		open(SELF, "$0");
		@whole_file=(<SELF>);
	}
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	for($i=0; $i<@whole_file; $i++){
	   $whole_file[$i] =~ tr/\t/ {7}/;  ## This is quite important to some parsing!!!
	   #########################################
	   ##  The first and second line of box 1 ##
	   #########################################
	   if( ($whole_file[$i]=~/^#[_\*\~\-\=]{20,}$/)&&    ##  '#______' is discarded
		   ($whole_file[$i+1]=~/ *\# {0,3}([TitlNam]+e) {0,8}: {1,10}([\w\.:]*) *(Copyright.*)/i) ){
		   $TITLE = $1;      $title = "$2\n";   $Final_out{'Warning'}.="$3\n";
		   $entry_match=$TITLE; ## The very first $entry_match is set to 'Title' to prevent null entry
		   if($TITLE =~ /^Title|Name$/i){   #
				if( ($title=~/^\s+$/)||( $title eq "\n") ){
					$title_entry_null =1;  $title = '';  }    }
		   $Final_out{$TITLE}=$title;
		   $title_found ++ ;   $i++;  ## << this is essential to prevent reading the same line again.
		   last if $title_found > 1;    }

	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   ## The first and second line of box 2, #__________ or #**************
	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   elsif( ($end_found != 1)&&($whole_file[$i]=~/^#[_\*]{20,}$/)&&
		   ($whole_file[$i+1]=~/^# {1,3}(\w{1,6}\s{0,2}\w+) {0,7}: {1,5}(.*) */i) ){
		   $title_found ++ ;        $i++;
		   $entry_match=$1;       $entry_value=$2;
		   $entry_match =~ s#^\S#(($tmp = $&) =~ tr/[a-z]/[A-Z]/,$tmp)#e;  ## Capitalize words
		   $Final_out{$entry_match}.= "$entry_value\n";
		   last if $title_found > 1;  next;   }

	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   ##  'Enclosed' : section. After this, everything is read without discrimination ##
	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   elsif( ($Enclosed_entry == 1)&&($whole_file[$i] =~/^#{1,} {1,}(.*)$/) ){
		   $Final_out{$Enclosed_var}.= "$1\n";    }

	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   ##  With proper entry 1 : for  'eg)'
	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   elsif( ($end_found != 1)&&($title_found==1)&&
		   ($whole_file[$i]=~ /^# {1,12}(eg ?\)) {0,8}(.*)/i)){
		   $entry_match='Example';
		   $Final_out{$entry_match}.= "$2\n";
	   }

	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   ##  With PROPER entry 2 : descriptins like. 'Ussage : ssssssxxjkk  kj'
	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   elsif( ($end_found != 1)&&($title_found==1)&&
		   ($whole_file[$i]=~ /^# {1,2}(\w{1,4}\s{0,2}\w{1,7}) {0,8}[:\)] {0,6}(.*) */i)){
		   $entry_match=$1;       $entry_value=$2;
		   $entry_match =~ s#^\S#(($tmp = $&) =~ tr/[a-z]/[A-Z]/,$tmp)#e;
		   $Final_out{$entry_match}.= "$entry_value\n";
		   if($entry_match=~/^(Enclosed?)$/i){
				$Enclosed_entry = 1;  $Enclosed_var=$1;        }    }

	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   #  With proper entry 3 : descriptins like. 'Ussage :', But blank description ##
	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   elsif( ($end_found != 1)&&($title_found==1)&&
		   ($whole_file[$i]=~ /^# {1,2}(\w{1,4}\s{0,2}\w{1,7}) {0,8}[:\)]( {0,})$/i)){
		   $entry_match=$1;       $entry_value=$2;
		   $entry_match =~ s#^\S#(($tmp = $&) =~ tr/[a-z]/[A-Z]/,$tmp)#e;
		   $Final_out{$entry_match}.= " $entry_value\n";
		   if($entry_match=~/^(Enclosed?)$/i){
				$Enclosed_entry = 1;  $Enclosed_var=$1;      }    }

	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   #  $option variable matching                ##
	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   elsif( ($end_found != 1) && ($title_found==1) &&
		   ($whole_file[$i]=~ /^# {1,15}([\$\@]\w+ +[\w=\>]+ +\S+ \w+ \S+ *.*)/ )){
		   $Final_out{$entry_match} .= "$1\n";  }

	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   ###  all space line matching                 ##
	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   elsif( ($end_found != 1)&&  ##<----- If blank line is matched. Take the line
		   ($title_found==1)&&($whole_file[$i]=~/^# {0,}$/) ){
		   $blank_counter++;
		   if($blank_counter > 2){ $blank_counter--; }
		   else{ $Final_out{$entry_match}.= " \n";  }     }

	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   ###  Anything after 3 space to 12 positions  ##
	   ###  To match 'examples' etc. INC. ':'       ##
	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   elsif( ($end_found != 1)&&
		   ($title_found==1)&&($whole_file[$i]=~/^#( {2,12})(.+)/) ){
		   $Final_out{$entry_match}.= "$1$2\n"; $blank_counter=0; }

	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   ###  Anything after 1 space to 11 positions  ##
	   ###  To match 'examples' etc. EXC. ':'       ##
	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   elsif( ($end_found != 1)&&
		   ($title_found==1)&&($whole_file[$i]=~/^# {1,12}([^:.]+)/) ){
		   $Final_out{$entry_match}.= "$1\n"; $blank_counter=0;}

	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   ###-------End of the read_box reading--------##
	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   elsif( ($title_found==1)&&
		   ($whole_file[$i]=~ /^#[\~\=\*\-]{15,}/)){  ## to match '#-----..' or '#******..'(Astrid's)
		   $End_line_num = $i;       $end_found++;
		   last;      }

	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   #  <<<<  Check if there is option table >>>>  #
	   #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	   elsif( (/^#{10,} option table of this program   #{10,}/)&&($end_found >=1) &&($title_found==1)){
		   $option_tb_found++; ### This is a global var.
	   }
	} ## < End of for loop


	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	### If title is not there at all     ####
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	@keys=keys %Final_out;
	for(@keys){
		if(/^Title$/i){    ## No Entry of Title at all??
			$TITLE =$&;
			$title_entry_exist = 1;
			if($Final_out{$_}=~/^ *$/){   ## if Title => Null or just space
				$title_entry_null = 1;    }  }  }

	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	### When title entry is not there    ####
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	if( $title_entry_exist != 1){
		  for($s=$End_line_num+1; $s < $End_line_num+20; $s++){
			  if( $whole_file[$s] =~ /^sub {1,5}([\w\.]+) {0,6}\{/){
				  $Final_out{'Title'} = "$1\n";   last;       }
			  elsif( $whole_file[$s] =~/^#________________________+/){
				  #######################################
				  ## Uses running file name as titile  ##
				  #######################################
				  $Final_out{'Title'} = "$0";     last;
			  }else{
				  #######################################
				  ## Uses running file name as titile  ##
				  #######################################
				  $Final_out{'Title'} = "$0";
			  }
		  }
	}
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	### When title is blank              ####
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	elsif($title_entry_null ==1){  ## It looks for 'sub xxxxx{ ' line to get title
		  ### $End_line_num is the last line read.
		  for($s = $End_line_num+1; $s < $End_line_num+20; $s++){
			  if( $whole_file[$s] =~ /^sub {1,5}(\w+\.*\w*) {0,7}{/){
				  $Final_out{$TITLE} = "$1\n";    last;     }
			  elsif( $whole_file[$s] =~/^#________________________+/){
				  #######################################
				  ## Uses running file name as titile  ##
				  #######################################
				  $Final_out{$TITLE} = "$0";     last;
			  }else{
				  #######################################
				  ## Uses running file name as titile  ##
				  #######################################
				  $Final_out{$TITLE} = "$0";
			  }
		  }
	}
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	## Error handling, if no head box is found   ####
	#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	if($title_found < 1){ print "\nFatal: No headbox found by read_head_box2 sub.\n";  }
	return(\%Final_out);
}               ##<<--- ENd of the sub read_head_box

#______________________________________________________________
# Title     : check_linkage_of_2_similar_seqlet_sets
# Usage     :
# Function  : connects two clusters of seqlets if they share
#              identical or near identical seqlets
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  :
# Options   : _  for debugging.
#  $factor = by f=  # eg)  "f=$factor" in the higher level sub
#
# Returns   :
# Argument  :
# Version   : 2.0
#--------------------------------------------------------------
sub check_linkage_of_2_similar_seqlet_sets{
	 my ($seq1, $name1, $start1, $end1, $seq2, $leng1, $leng2,
	    $name2, $start2, $end2, $diff_start,  $diff_end, @splited1,
	    @splited2, $link_or_not, $factor, $s, $t, $final_factor);
	 @splited1=@{$_[0]};
	 @splited2=@{$_[1]};

	 $link_or_not=0;
	 $factor=7;  # this means 70% sequence region overlap of the intermediate is chosen

	 if($_[2]=~/f=(\S+)/i){	  $factor=$1;	 }

	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 # Breaks the splited1 and splited2 strings to words to compare
	 #_________________________________________________________________
	 F1: for($s=0; $s<@splited1; $s++){
			#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			# Checks if the input has :  XXXXX_10-400 format or simple name like XXXXXX
			#______________________________________________________________________________
			if($splited1[$s]=~/^ *((\S+)_(\d+)\-(\d+))/){ $seq1=$1;	$name1=$2; $start1=$3; $end1=$4;
			}else{   $seq1=$splited1[$s]; $name1=$start1=$end1='';    }
			#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			# Breaks the splited2
			#_____________________________________________________________________
			F2: for($t=0; $t< @splited2; $t++){
				 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				 # If splited1 has XXXX_10-100 format(def($name1)), then compare regions
				 #_________________________________________________________________________
                 if($name1 and $splited2[$t]=~/^ *((\S+)_(\d+)\-(\d+))/){ $seq2=$1; $name2=$2; $start2=$3; $end2=$4;
					 if($seq1 eq $seq2){ $link_or_not=1; return(\$link_or_not) }
					 if($name1 ne $name2){
						 next F2;
					 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
                     # The most impoartant part is here. $final_factor=$smaller_leng - $smaller_leng*($factor/10);
					 #____________________________________________
                     }elsif($name1 eq $name2){
						 $leng1=$end1-$start1; $leng2=$end2-$start2;
						 if($leng1 >= $leng2){ $smaller_leng=$leng2; }else{ $smaller_leng=$leng1; }
						 $diff_start=abs($start1-$start2);
						 $diff_end  =abs($end1  -$end2  );
                         $final_factor=$smaller_leng - $smaller_leng*($factor/10);
                         $final_diff=($diff_start+$diff_end)/2;
                         if($final_diff <= $final_factor ){
                            $|=1;
                            print "\n$seq1 $seq2: $final_diff ($diff_start, $diff_end): $smaller_leng, $final_factor, ";
							$link_or_not=1; return(\$link_or_not);
    					 }else{  print "\n$seq1 $seq2: $final_diff ($diff_start, $diff_end): $smaller_leng, $final_factor 0\n"; }
					 }## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				 }else{
				     $seq2=$splited2[$t];
					 if($seq1 eq $seq2){ $link_or_not=1; }
				 }
			}
	 }
	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 # If $link_or_not has become 1 in any of the above part, 1 is returned
	 #________________________________________________________________________
	 return(\$link_or_not);
}



#__________________________________________________________________________
# Title     : show_subclusterings
# Usage     : &show_subclusterings(\@out);
# Function  : This is the very final sub of divclus.pl
# Example   : @temp_show_sub=&show_subclusterings(\@out, $file, $sat_file, $dindom, $indup);
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  : print_subclusterings, sum_subclusterings, write_subclustering
#             show_clusterings, display_subclusterings
# Options   :
#             f  for file output, eg: xxxxxxx.sat
# Category  :
# Version   : 2.9
#-------------------------------------------------------------------------
sub show_subclusterings{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	my ($max_size, $sat_file_name, $clu_file_name,
	$ori_cluster_size, $ori_cluster_num, $good_bad, @keys, $percentage_fac,
	$indup, @sizes, $sum_seq_num, $indup_percent, $indup_count, %tem4,
	@sub_clustering_out_files);  # clusall_1e-5_clu_14-324_ss.sat
	my @out=@{$array[0]};
	$indup_count=0;

	if($char_opt=~/d/){	    $dindom=1;	}
	if($char_opt=~/i/){		$indup=1;	}
	if($vars{'f'}=~/\S+/){     $factor= $vars{'f'}; }
	if($vars{'p'}=~/\d+/){ $percentage_fac= int($vars{'p'}); }
	if($vars{'s'}=~/\d+/){	   $score = $vars{'s'};	}
	if($vars{'e'}=~/\d+/){	   $evalue= $vars{'e'};	}

	print "\n# (1) show_subclusterings : \@file has : @file";
    if( $file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.mspa/  or
		$file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.sat/   ){
         $ori_cluster_size=$2;
         $ori_cluster_num =$3;
         $base=$1;
		 $sat_file_name="$base\.sat";
         $clu_file_name="$base\.clu";
	}else{
         $ori_cluster_size="0000";
	     $ori_cluster_num ="0000";
	     $base=${&get_base_names($file[0])};
	     $clu_file_name="$base\.clu";
		 warn "\n# (2) LINE:",__LINE__," WARN: the \@file input to show_subclusterings is not the right format, dying\n";
		 warn "     Sarah!, right format looks like: 13-234.mspa or 8-420_cluster.mspa \n";
	}

	open(CLU, ">$clu_file_name") || die "\n# (ERROR) show_subclusterings failed miserably to CREATE \"$clu_file_name\" \n";
	push(@sub_clustering_out_files, $clu_file_name);


	@out=@{&sort_string_by_length(\@out)};

	for($i=0; $i< @out; $i++){ # @out has ( 'YAL054C_98-695 YBR041W_90-617', 'YBR115C_230-842 YBR222C_16-537 YER015W_121-686', etc)
	   my $count+=$i+1;
	   my ( $int_dup_number, $sub_clu_size, $seq_with_range, @sp, $new_clus_NAME,
	        %tem, %tem2, %tem3, $j, @keys, $num_seq);
	   if($out[$i]=~/^ *$/){ next }
	   @sp=sort split(/ +/, $out[$i]);

	   for($j=0; $j < @sp; $j++){
		  $seq_with_range=$sp[$j];
		  if($seq_with_range=~/^((\S+)_((\d+)\-(\d+)))/){
			 $tem{$2}++;
			 $tem2{$2}.=sprintf("%-15s ", $1);
			 $tem3{$2} =$3;
			 $tem4{$2} =$5-$4;
		  }
	   }

	   @keys=sort keys %tem;
	   $num_seq=$sub_clu_size=@keys;

	   if($max_size < $sub_clu_size){
		  $max_size=$sub_clu_size; ## This is to collect the sizes of clusters to see if it is good.
	   }
	   $indup_count= &print_summary_for_divclus(
		         $count, \%tem2, \%tem,
		         $ori_cluster_num,
		         $ori_cluster_size,
		         $dindom,
		         $clu_file_name,
								 \%tem3, \%tem4,
								 $indup, );

					 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
					 # Local subroutine
					 #_______________________________________________________________
	   sub print_summary_for_divclus{ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
							 my(@keys, $indup_count, $x, $m, $percentage_fac);
							 my $count=$_[0]; # count of cluster
	       my %tem2=%{$_[1]};	my $num_seq=@keys=sort keys %tem2;
	       my %tem=%{$_[2]};	my $ori_cluster_num=$_[3];
	       my $new_clus_NAME=$ori_cluster_num.'0'.$count.'0'.$num_seq;
	       my $ori_cluster_size=$_[4];
	       my $dindom=$_[5];	my %tem3=%{$_[7]};
	       my $indup=$_[9];	my (%internal_dup);
	       my %tem4=%{$_[8]};
							 #~~~~~~~~~~ Domain Inside Domain ~~~~~~~~~~~~~~~~~
	       if($dindom){
	          for($x=0; $x <@keys; $x++){
											 @domain_inside_domain=@{&get_domain_inside_domain($tem2{$keys[$x]})};
											 @domain_inside_domain=@{&remove_dup_in_array(\@domain_inside_domain)};
											 for($m=0; $m< @domain_inside_domain; $m++){ print "  # Dindom: $m : $domain_inside_domain[$m]\n";   }
											 print "\n";
		  }
							 }
							 #==========================================================================================

	       #~~~~~~~~~~ Internal duplication  ~~~~~~~~~~~~~~
	       if($indup==1){
		   # @keys is the same as sub cluster size,
		   for($x=0; $x < @keys; $x++){
														 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
														 # Checks each sequence for duplication
														 #___________________________________________________
														 my %internal_dup=%{&get_internal_dup_in_a_cluster( $tem2{$keys[$x]} )};
														 my @dup_keys=keys %internal_dup;
														 if(@dup_keys > 0){
																		 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
																		 #  This calculates the actual duplicated number rather than jus tthe sequences
																		 #______________________________________________________________________________
																		 $indup_count++;
																		 printf ("%-14s %-12s %-4s", $keys[$x], $new_clus_NAME, $num_seq);
																		 for($m=0; $m< @dup_keys; $m++){
																						 printf ("%-19s=> %s\n", $dup_keys[$m], $internal_dup{ $dup_keys[$m] } );
																		 }
														 }
										}
								 }

								#~~~~~~~~~~ Summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~
								print  CLU  "Cluster size $num_seq\n";
																				printf CLU ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
																					$new_clus_NAME, $evalue, $factor, $percentage_fac,
																					$ori_cluster_size, $num_seq, $ori_cluster_num);
								print       "Cluster size $num_seq\n";
								printf     ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
															$new_clus_NAME, $evalue, $factor, $percentage_fac,
															$ori_cluster_size, $num_seq, $ori_cluster_num);
								for($x=0; $x <@keys; $x++){
									 printf CLU ("   %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
															 $num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
									 printf ("   %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
													$num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
								}
								return($indup_count);
	   }
	}
		close(CLU); ## this is a bug fix

	if($max_size == $ori_cluster_size){   $good_bad=1;
	}else{	                              $good_bad=0;	}

    print "\n# Sarah, Do you think the subclusterings are O.K.?" if $verbose;
    print "\n#   Tell me, if you feel suspicious, jong\@salts.med.harvard.edu\n\n" if $verbose;
    return($good_bad, $indup_count, $ori_cluster_size, \@sub_clustering_out_files);
}







#______________________________________________________________________
# Title     : sort_string_by_length (synonym of sort_str_by_length  )
# Usage     : @output = @{&sort_string_by_length(@any_input_strings, [-r], @more)};
# Function  : sorts strings in array according to their sizes
#             bigger comes first.
# Example   :
# Warning   :
# Keywords  : sort_array_by_length, sort_str_by_length, sort_array_string_by
#             sort_string_by_leng, sort_by_length, sort_by_leng,
# Options   : -r  reverse the order
# Version   : 1.2
#-------------------------------------------------------------------
sub sort_string_by_length{
	my(@input, $i, $small_first, @output);
	for($i=0; $i<@_; $i++){
		if( $_[$i]=~/^\-?r$/i){
			$small_first =1;
			splice(@_, $i, 1);
		}elsif(ref($_[$i]) eq 'ARRAY'){
		    push(@input, @{$_[$i]});
		}elsif(ref($_[$i]) eq 'SCALAR'){
			if(${$_[$i]}=~/^\-?r$/i){
			   $small_first=1;
			   splice(@_, $i, 1);
			}else{
			   push(@input, ${$_[$i]});
			}
		}elsif( !ref($_[$i]) ){
		    push(@input, $_[$i]);
		}
	}
	if($small_first ==1){
	    @output = sort {length($a) <=> length($a) || ($b cmp $a)} @input;
	}else{
	    @output = sort {length($b) <=> length($a) || ($a cmp $b)} @input;
	}
	return (\@output);
}


#______________________________________________________________
# Title     : cluster_merged_seqlet_sets
# Usage     : @out=@{&cluster_merged_seqlet_sets(\@lines)};
# Function  :
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  :
# Options   : _  for debugging.
#             #  for debugging.
#  $short_region=  S by S -S  # taking shorter region overlapped in removing similar regions
#  $large_region=  L by L -L  # taking larger  region overlapped in removing similar regions
#  $average_region=A by A -A # taking average region overlapped in removing similar regions
#
# Version   : 1.8
#--------------------------------------------------------------
sub cluster_merged_seqlet_sets{
	 #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	 my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	 my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	 my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	 my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	 my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	 if($debug==1){print "\n\t\@hash=\"@hash\"
	 \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	 \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	 #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 my ($optimize, @splited1, @splited2, $verbose, $link_or_not);
	 my @seq_names_in_clu=@{$array[0]};
	 $link_or_not=0;
	 my $factor=7; # 7 means 70% now

	 if($vars{'f'}=~/(\S+)$/){ $factor=$1 }
	 if($char_opt=~/o/){ $optimize=1 }
	 if($char_opt=~/S/){ $short_region='S'; }
	 if($char_opt=~/L/){ $large_region='L';   }
	 if($char_opt=~/A/){ $average_region='A'; }
	 if($char_opt=~/v/){ $verbose=1 }

	 if($verbose){ print "\n# (1) cluster_merged_seqlet_sets: Checking linkage and merging <<<<<>>>>>\n@seq_names_in_clu\n";   }

	 F1: for($i=0; $i< @seq_names_in_clu; $i++){
			@splited1=split(/ +/, $seq_names_in_clu[$i]);
			for($j=0; $j< @seq_names_in_clu; $j++){
				if($seq_names_in_clu[$i] eq $seq_names_in_clu[$j]){ next  }
				@splited2=split(/ +/, $seq_names_in_clu[$j]);

                $link_or_not=${&check_linkage_of_2_similar_seqlet_sets(\@splited1, \@splited2, "f=$factor")};
				print "\n +++++ \$link_or_not is  $link_or_not +++" if $verbose;
				if($link_or_not==1){
						 if($verbose){
								 print "\n# (2) cluster_merged_seqlet_sets: \n $seq_names_in_clu[$i]  \n and $seq_names_in_clu[$j] \n are linked \n";
						 }

						 if($optimize){ ##---- This will also remove similar seqlets, not only identical ones
								$seq_names_in_clu[$i]=join(' ', sort @{&remove_similar_seqlets( [@splited1, @splited2],
																						$short_region, $large_region, $average_region)} );
						 }else{
								$seq_names_in_clu[$i]=join(' ', sort @{&remove_dup_in_array( [@splited1, @splited2])} );
						 }
						 splice(@seq_names_in_clu, $j,1);
						 $j--; $i--;
						 next F1;
		 }
	  }
	 }
	 return(\@seq_names_in_clu);
}

#______________________________________________________________
# Title     : sort_words_in_string
# Usage     :
# Function  :
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Class     :
# Keywords  : sort_words_in_sequences, sort_sequences_in_string,
#             sort_strings_in_string,
# Options   : _  for debugging.
#             #  for debugging.
# Package   : Bio
# Reference : http://sonja.acad.cai.cam.ac.uk/perl_for_bio.html
# Returns   :
# Tips      :
# Argument  :
# Todo      :
# Author    : A Scientist
# Version   : 1.1
#--------------------------------------------------------------
sub sort_words_in_string{
   my @in=@{$_[0]} || @_;
   my (@OUT);
   for (@_){
	  push(@OUT, join(' ', sort split(/ +/) ));
   }
   return(\@OUT);
}

#________________________________________________________________________
# Title     : show_array
# Usage     : &show_array(\@input_array);
# Function  : for debugging purpose. Shows any array elem line by line.
# Example   : Output:      item1
#             Output:      item2
#             Output:      item3
# Warning   : can handle scalar ref, too.
# Keywords  :
# Options   : -h  for horizontal display of elements
#             c   for compact (do not put new line between array chunk)
#             s   for putting new line between arrays
# Returns   :
# Argument  :
# Version   : 2.4
#--------------------------------------------------------------------
sub show_array{
  my($k, $i, $t,  @in2, $in, $space, $show_horizontally, $compact);
  my(@in)=@_;

  ## This is to get the option of 'horizontal' to make horizontal output.
  for($t=0; $t < @in ; $t++){
	 if($in[$t] =~/\-?[hH][orizontal]*$/){   ### No ref.
		 $show_horizontally = "h";
		 splice(@in, $t, 1);  $t--;
	 }elsif(${in[$t]} =~/-?[hH][orizontal]*$/){  ### ref.
		 $show_horizontally = "h";
		 splice(@in, $t, 1);  $t--;
	 }elsif(${in[$t]} =~/^s$/i){  ### ref.
		 $space = "s";
		 $compact='';
		 splice(@in, $t, 1);  $t--;
	 }elsif(${in[$t]} =~/^c$/i){  ### ref.
		 $compact = "c";
		 $space='';
		 splice(@in, $t, 1);  $t--;
	 }
  }

  for($k=0; $k < @in; $k++){
	 if(ref($in[$k]) eq 'ARRAY'){
		 &show_array(@{$in[$k]}, "$show_horizontally", "$compact", "$space" );
	 }elsif(ref($in[$k]) eq 'SCALAR'){
		 if($show_horizontally eq "h"){
			 print ${$in[$k]}, ",  ";
		 }elsif(  $show_horizontally ne "h"){
			 print ${$in[$k]}, "\n";
		 }
	 }elsif( !ref($in[$k]) ){
		 if($show_horizontally eq 'h'){
			 print  $in[$k] , ",  ";
		 }elsif(  $show_horizontally ne "h"){
			 print  $in[$k] , "\n";
		 }
	 }
  }
  if($compact !~/^c$/i){
	print "\n"; #### This is necessary to distinguish different arrays.
  }
}


#_________________________________________________________________________
# Title     : merge_similar_seqlets
# Usage     : @all_seqlets = @{&merge_similar_seqlets(@all_seqlets)};
# Function  : merges seqlet sets which have identical
#             sequences and share similar regions by connection factor of 30%
#             This means, if any two seqlets from the same sequences which
#             share more than 70% seqlet regions overlapping are merged
#             This only sees the very first sequence in the seqlets line!!!
#             (so, PARTIAL MERGE !!)
# Example   : INPUT:
#
#   @input=( 'seq1_1-30 seq2_1-40 seq3_1-50',
#            'seq1_2-49 seq3_4-40 seq4_2-99'....)
#
#   @output=('seq1_1-30 seq2_1-45 seq3_2-45 seq4_2-99');
#
# Keywords  : merge_similar_sequences, merge_sequence_names, merge_sequences,
#              merge_sequence_ranges, merge_similar_sequences_with_ranges,
#              merge_seqlets, merge_duplication_modules
# Options   :
#
#   f=<digit>   for determing the factor in filtering out non-homologous
#                  regions, 7 = 70% now!!
#   l=<digit>   for seqlet(duplication module) length threshold
#   z           for activating remove_similar_sequences, rather than remove_dup....
#   S  $short_region=  S by S -S  # taking shorter region overlap in removing similar reg
#   L  $large_region=  L by L -L  # taking larger  region overlap in removing similar reg
#   A  $average_region=A by A -A  # taking average region overlap in removing similar reg
#
# Version   : 2.2
#-------------------------------------------------------------------------------
sub merge_similar_seqlets{
	 my (@all_seqlets, @result_all_seqlets, $i, $j, $k, $seq1, $start1, $end1, $seq2,
	   $smaller_leng, $start2, $end2, @split, @split1, @split2, $factor, $leng_thresh, $optimize,
			 $short_region, $large_region, $average_region, $overlapping_seq_match_size);
	 $factor=7;     #  30% sequence mismatch region is allowed(3)
	 $leng_thresh=30;
	 $optimize=1;
	 $average_region='A'; # default

	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
	 # Sorting (parsing) input to get options and input array
	 #_________________________________________________________
	 for($i=0; $i< @_; $i++){
	     if(ref($_[$i]) eq 'ARRAY'){
             @all_seqlets=@{$_[$i]}; #<------------ @all_seqlets is a very very big array with all the mspa chunks altogether
			 }elsif($_[$i]=~/f=(\S+)/){  $factor=$1;
			 }elsif($_[$i]=~/z/i){       $optimize=1;
			 }elsif($_[$i]=~/l=(\d+)/i){ $leng_thresh=$1;
			 }elsif($_[$i]=~/^S/){       $short_region='S';   $large_region=$average_region='';
			 }elsif($_[$i]=~/^L/){       $large_region='L';   $short_region=$average_region='';
			 }elsif($_[$i]=~/^A/){       $average_region='A'; $short_region=$large_region  =''; }
	 }
	 if(@all_seqlets==1){
         return(\@all_seqlets);
	 }

	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 # This is to remove which are identical in @all_seqlets;
	 #_________________________________________________________
	 F1: for($i=0; $i< @all_seqlets; $i++){
		my $merged_two_seqlet_lines;

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
        # The following is correct. Don't touch again
        #__________________________________________________
        if($all_seqlets[$i] eq $all_seqlets[$i+1]){
			splice(@all_seqlets, $i+1, 1);
			$i-- if $i >0;    next F1;
	    }else{
            #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # @split1 and 2 are arrays from different string entry in @all_seqlets
            #______________________________________________________________________
            @split1=sort split(/\s+/, $all_seqlets[$i]);
            @split2=sort split(/\s+/, $all_seqlets[$i+1]);
		}

	    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	    #   (3) If the first elements of @split1 and 2 are identical, lets merge the two arrays. For example,
	    #    aa_EC1427_1-390 aa_EC388_1-374 ap_EC143_23-399 dr_6457710_11-405 ec_1787201_9-360 mj_MJ1649_5-387 mj_MJ1653_4-383 pa_5459109_1-394 ph_PH1915_1-394 tm_4982274_20-385
        #    aa_EC1427_1-390 aa_EC388_1-372 ap_EC143_40-399 dr_6457710_11-407 dr_6459463_3-373 ec_1787201_4-367 mj_MJ1649_5-385 mj_MJ1653_39-382 pa_5459109_21-392 ph_PH1915_21-392 tm_4982274_12-382
	    #__________________________________________________________________________________________________
		if($split1[0] eq $split2[0] or $split1[0] eq $split2[1] or $split1[1] eq $split2[0]){
              @split=(@split1, @split2);
              #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
              # This step is proven to be fine. optimize option removes similar seqlets
              #___________________________________________________
              if(1){
                 $all_seqlets[$i]= join(' ', sort @{&remove_similar_seqlets(\@split,
		                              $short_region, $large_region, $average_region)} );
		      }else{
				 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
				 # Only removes exactly identical ones
				 #__________________________________________________________
				 $all_seqlets[$i]=  join(' ', @{&remove_dup_in_array(\@split, 's')} );
		      }
		      splice(@all_seqlets, $i+1, 1);     $i-- if $i >0;     next F1;
	    }

	    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# (4) If the first elements of @split1 and 2 are NOT identical, lets check the sequence ranges
	    #_____________________________________________________________________________________________
		F2: for($j=0; $j < @split1; $j++){
			if($split1[$j] =~/^\s*(\S+)_(\d+)\-(\d+)/){
				 my ($seq1, $start1, $end1)=($1, $2, $3);

				 F3: for($k=0; $k<@split2; $k++){
					 if($split2[$k] =~/(\S+)_(\d+)\-(\d+)/){
						 my($seq2, $start2, $end2)=($1, $2, $3);

						 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~````
						 # Check if the seqs are identicl (from the two arrays), no point to merge which are not identical from the first
						 #__________________________________________________________________________________________
						 if($seq1 eq $seq2){
                             $diff_start=abs($start1-$start2); $diff_end  =abs($end1  -$end2  );
                             $leng1=$end1-$start1; $leng2=$end2-$start2;
                             if($leng1 >= $leng2){  $smaller_leng=$leng2; $larger_leng =$leng1
                             }else{  $smaller_leng=$leng1;  $larger_leng =$leng2       }

                             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                             # Checking the minimal seq region leng here
                             #______________________________________________________
                             if($smaller_leng < $leng_thresh){ next }

                             $overlapping_seq_match_size=${&get_overlapping_seq_match_size($start1, $end1, $start2, $end2)};
                             $averge_seq_leng_of_2_seqs=($leng1+$leng2)/2;

                             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                             # This is the critically important part
                             #_______________________________________________________________
                             if($average_region){      $finally_adjusted_seq_leng=$averge_seq_leng_of_2_seqs*($factor/10);
                             }elsif($short_region){    $finally_adjusted_seq_leng=$smaller_leng*($factor/10);
                             }elsif($large_region){    $finally_adjusted_seq_leng=$larger_leng*($factor/10);     }

                             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
                             # Now let's check if we regard them homologous or not\
                             #_______________________________________________________
                             if( $overlapping_seq_match_size >=  $finally_adjusted_seq_leng){
                                 @split= (@split1, @split2);
                                 if($optimize){ #~~~~~ $optimize option removes similar seqlets
                                     $all_seqlets[$i]= join(' ', sort @{&remove_similar_seqlets(\@split,
                                                         $short_region, $large_region, $average_region)} );
                                 }else{
                                       $all_seqlets[$i]= join(' ', @{&remove_dup_in_array(\@split, 's')} );
                                 }
                                 $merged_two_seqlet_lines=1;
                                 splice(@all_seqlets, $i+1, 1);
                                 $i-- if $i >0;  next F1;
                             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
                             # We believe they are not homologous
                             #____________________________________________
                             }else{  next F3;  }
                          }
                       }
                       #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                       # If there is no range (region) in seq naem, let's skip, as there is no way to check
                       #__________________________________________________________________________________
                       else{ # when split2 does not match xxx_10-20 format
                               next;
                       }
                  }
            }else{  next; } # when split1 does not match xxx_10-20 format
        }
        unless($merged_two_seqlet_lines){   }
	 }
	 return(\@all_seqlets);
}



#________________________________________________________________________________
# Title     : get_overlapping_seq_match_size
# Usage     : $ovlapsize=${&get_overlapping_seq_match_size($st1, $en1, $st2, $en2)
# Function  :
# Example   :
# Keywords  :
# Options   :
# Version   : 1.1
#--------------------------------------------------------------------------------
sub get_overlapping_seq_match_size{
    my($start1, $end1, $start2, $end2, $overlapping_region_matched);
    if(@_ == 4){
       $start1=$_[0]; $end1 =$_[1];  $start2=$_[2]; $end2  =$_[3];
    }elsif(@_==2){
       if( $_[0]=~/(\d+)\-(\d+)/ ){
           $start1=$1;      $end1  =$2;
       }elsif($_[1]=~/(\d+)\-(\d+)/ ){
           $start2=$1;      $end2  =$2;
       }else{
           print "\n# (E) get_overlapping_seq_match_size: I need 2 or 4 arguments for regions\n";
           print "   They look like ($start1, $end1, $start2, $end2) or ('10-100', '20-211')\n";
           print "   You got it, Sarah?? Try again my dear!\n";
       }
    }else{
           print "\n# (E) get_overlapping_seq_match_size: I need 2 or 4 arguments for regions\n";
           print "   They look like ($start1, $end1, $start2, $end2) or ('10-100', '20-211')\n";
           print "   You got it, Sarah?? Try again my dear!\n";
    }

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #     ---------
    #  ------
    #___________________________________
    if($start1 >= $start2 and $end1 >= $end2){
        $overlapping_region_matched=$end2-$start1;
    }
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ---------
    #     ----------
    #___________________________________
    elsif($start1 <= $start2 and $end1 <= $end2){
        $overlapping_region_matched=$end1-$start2;
    }
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #      -----
    #    ----------
    #___________________________________
    elsif($start1 >= $start2 and $end1 <= $end2){
        $overlapping_region_matched=$end1-$start1;
    }
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #  ---------
    #    ----
    #___________________________________
    elsif($start1 <= $start2 and $end1 >= $end2){
        $overlapping_region_matched=$end2-$start2;
    }
    return(\$overlapping_region_matched);
}


#_____________________________________________________________________________
# Title     : remove_similar_seqlets
# Usage     : @seqlets=@{&remove_similar_seqlets(\@split)};
# Function  : merges(gets average starts and ends ) of similar
#             seqlets to reduce them into smaller numbers. This can also handle
#              names like XLBGLO2R_8-119_d1hlm__.
#
# Example   : @seqlets=@{&remove_similar_seqlets(\@mrg1, $mrg2, \@mrg3)};
#               while @mrg1=qw(M_2-100 M_2-110 M_8-105 M_4-108 N_10-110 N_12-115);
#                     $mrg2='Z_3-400 Z_2-420';
#                     @mrg3=('X_2-300 X_3-300', 'X_2-300', 'X_5-300 X_2-301' );
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  : merge_sequence_names, merge_seq_names, merge_sequence_ranges
#             merge_seq_ranges
# Options   : _  for debugging.
#             #  for debugging.
#             f= for factor
#             S  for shorter region matched is used
#             A  for average region matched is used
#             L  for larger region matched is used
#
# Version   : 2.1
#-------------------------------------------------------------------------------
sub remove_similar_seqlets{
	 my ($i, $j, $seq1, $smaller_leng, $leng1, $leng2, $start1, $end1, $seq2, $start2,
	   $av_diff, $num_of_seq, $av_end, $av_start, $end2, @seqlets,
	   @array_input, @seqlet, $tail1, $tail2, $shorter_region, $larger_region,
	   $average_region, $factor);
	 $factor=7;  ## !!! This var makes big difference in the final clustering
	 $average_region = 'A'; ## default is to get the average of comparing regions

	 for($i=0; $i< @_; $i++){
	    if(ref($_[$i]) eq 'ARRAY'){
		     @array_input=@{$_[$i]};
		     for($j=0; $j<@array_input; $j++){
			      @seqlet=split(/ +/, $array_input[$j]);
					  push(@seqlets, @seqlet);
		     }
		     #if($verbose){
				 #   print "\n# remove_similar_seqlets: ARRAY ref is given as input\n";
				 #   print "#  They are: @seqlets\n";
				 #}
	    }elsif($_[$i]=~/f=(\S+)/){   $factor=$1
	    }elsif($_[$i]=~/^(S) *$/){   $shorter_region=$1 ; $average_region=0;
	    }elsif($_[$i]=~/^(L) *$/){   $larger_region =$1 ; $average_region=0;
	    }elsif($_[$i]=~/^(A) *$/){   $average_region=$1 ; $shorter_region=$larger_region=0;
	    }elsif($_[$i]=~/\S+\_\d+\-\d+/){
		     push(@seqlets, split(/ +/, $_[$i]) );
	    }elsif(ref($_[$i]) eq 'SCALAR' and ${$_[$i]}=~/\S+\_\d+\-\d+/){
	       push(@seqlets, split(/ +/, ${$_[$i]}) );
	    }
	 }
	 #print "\n# remove_similar_seqlets : I am using \$factor : $factor\n" if $verbose;

	 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	 # Sorting is necessary as I am not doing the real thorough comparison
	 #______________________________________________________________________
	 $num_of_seq=@seqlets=sort @seqlets;

	 my ($short_start, $large_start, $short_end, $large_end);

	 for($i=0; $i< @seqlets; $i++){
           if($seqlets[$i]=~/^ *(\S+)_(\d+)\-(\d+)(\S*)/){  ## last (\S*) is necessary for XLBGLO2R_8-119_d1hlm__
               my($seq1, $start1, $end1, $tail1)=($1, $2, $3, $4);
               if($seqlets[$i+1]=~/^(\S+)_(\d+)\-(\d+)(\S*)/){
                   ($seq2, $start2, $end2, $tail2)=($1, $2, $3, $4);
                   if($seq1 eq $seq2){
                        $diff_start=abs($start1 - $start2);
                        $diff_end  =abs($end1   - $end2  );
                        $leng1=$end1-$start1;
                        $leng2=$end2-$start2;

                        if($leng1 >= $leng2){ $smaller_leng=$leng2; }else{ $smaller_leng=$leng1; }
                        if( ($diff_start+$diff_end)/2 <= $smaller_leng*($factor/10) ){

                            if($average_region){
                                 $av_start=int(($start1+$start2) / 2);
                                 $av_end  =int(($end1 + $end2) / 2);
                                                 $seqlets[$i]="$seq1\_$av_start\-${av_end}$tail1";  # $tail1 is for names like XLBGLO2R_8-119_d1hlm__
                                                 # print "\n# new seqlet : $seqlets[$i]\n" if $verbose;
                                 splice(@seqlets, $i+1, 1);
                                 $i--;
                            }else{
                                 if($start1 < $start2){
                                 $short_start=$start2; $large_start=$start1;  ## note that short start should be $start2 if $start2 is bigger
                                 }else{
                                        $short_start=$start1; $large_start=$start2;
                                 }
                                 if($end1 < $end2){
                                        $short_end=$end1;  $large_end=$end2;
                                 }else{
                                        $short_end=$end2;  $large_end=$end1;
                                 }
                                 if($shorter_region){
                                         $seqlets[$i]="$seq1\_$short_start\-${short_end}$tail1";
                                 }elsif($larger_region){
                                         $seqlets[$i]="$seq1\_$large_start\-${large_end}$tail1";
                                 }

                                 splice(@seqlets, $i+1, 1);
                                 $i--;
                            }
                        }
                   }
            }
	    }
	 }
	 #print "\n# (3) remove_similar_seqlets: The final out are: @seqlets\n" if $verbose;
	 return(\@seqlets);
}




#______________________________________________________________
# Title     : get_domain_inside_domain
# Usage     :
# Function  :
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  : find_dindoms, domain_inside_domain, domain_in_domain
# Options   : _  for debugging.
#             #  for debugging.
# Returns   :
# Argument  :
# Version   : 1.0
#--------------------------------------------------------------
sub get_domain_inside_domain{
	$cluster_line=$_[0] || ${$_[0]};
	my($i, $j, @seq, @out);
	my $overlap_factor=40;
	my $min_inside_dom_size=40;
	@seq=split(/ +/, $cluster_line);
	F1:for($i=0; $i< @seq; $i++){
	   $seq1=$seq[$i];
	   if($seq1=~/^(\S+)_(\d+)\-(\d+)/){
		  $seq_name=$1;
		  $start1=$2;
		  $end1=$3;
	   }
	   F:for($j=0; $j< @seq; $j++){
		  $seq2=$seq[$j];
		  if($seq1 eq $seq2){ next } ### Skip IDENTICAL ones (xxxx_1-10, xxxx_1-10)
		  if($seq2=~/^(\S+)_(\d+)\-(\d+)/){
			 $start2=$2;
			 $end2=$3;
		  }
		  if(($start1 > $end2)||($start2 > $end1)){ # skips non overlapping seqlets
			 next;
		  }
		  if(($start1 > $start2)&&($end1 < $end2)){  #   -----
			 $leng_seq1=$end1-$start1;               # ----------
			 $leng_seq2=$end2-$start2;
			 if(( ($leng_seq2/2) >= $leng_seq1 )&&
			    ($leng_seq1 > $min_inside_dom_size) ){   # if seq1 is less than 60% of seq2, it is a hidden domain
				push(@out, "$seq2\($seq1\)");
			 }
		  }elsif(($start1 < $start2)&&($end1 > $end2)){  # -----------
			 $leng_seq1=$end1-$start1;                   #   ------
			 $leng_seq2=$end2-$start2;
			 if(( ($leng_seq1/2) >= $leng_seq2)&&
			    ($leng_seq2 > $min_inside_dom_size) ){   # if seq1 is less than 60% of seq2, it is a hidden domain
				push(@out, "$seq1\($seq2\)");
			 }
		  }
	   }
	}
	return(\@out);
}
#________________________________________________________________________
# Title     : show_hash
# Usage     : &show_hash(\@input_array);
# Function  : for debugging purpose. Shows any array elem line by line.
#             the line is 60 elements long (uses recursion)
# Example   : Output:      item1
#             Output:      item2
#             Output:      item3
# Warning   : There is a global variable:  $show_hash_option
#             It tries to detect any given sting which is joined by ','
# Keywords  :
# Options   : -s or -S or s or S for spaced output. Eg)
#             seq1       1 1 1 1 1 1 1 1 1 1 1 1
#
#             instead of
#             seq1       111111111111
#
#             -h or -H or h or H for horizontal line of '---------...'
#
# Returns   :
# Argument  :
# Version   : 1.7
#--------------------------------------------------------------------
sub show_hash{
  my($k, $i, $t, @in2, $in, $LEN, %TEM ); ## You should not put $show_hash_option
  my(@in)=@_;                     ## and $horizontal_line  in my !!!
  my($KL)=2; # default keys string length;
  my($VL)=80; # default values string length;
  my($GAP)=2;  # default space between keys and values
  my($horizontal_line, $show_hash_optionXX, $Hash_counter, @line);

  ## This is to get the option of 'space' to make spaced output.
  for($t=0; $t < @in; $t++){
	 if($in[$t] =~/^[-]+[sS][pace]*$/){
		 $show_hash_optionXX = 1;
		 splice(@in, $t, 1);
	 }elsif(${in[$t]} =~/^[-]+[sS][pace]*$/){
		 $show_hash_optionXX = 1;
		 splice(@in, $t, 1);
	 }elsif($in[$t] =~/^[-]+[hH][rR]*$/){
		 $horizontal_line = 1;
		 splice(@in, $t, 1);
	 }elsif(${in[$t]} =~/^[-]+[hH][rR]*$/){
		 $horizontal_line = 1;
		 splice(@in, $t, 1);
	 }
  }

  ######## Main loop #################
  if($horizontal_line ==1){  ## This puts the delimiter '--------------(  )'
	  $Hash_counter ++;
	  print "\n","-"x78,"(${Hash_counter}th hash)", "\n";
  }

  for($k=0; $k < @in; $k++){
	 if(ref($in[$k]) eq 'ARRAY'){  ### When the hashes were given in array ref.
		 &show_hash(@{$in[$k]}, $show_hash_optionXX, $horizontal_line);
		 print "\n";
	 }
	 elsif(ref($in[$k]) eq 'HASH'){  ### recursion
		 &show_hash(%{$in[$k]});
		 print "\n";
	 }
	 elsif(ref($in[$k+1]) eq 'HASH'){  ### recursion
		 &show_hash(%{$in[$k+1]}); print "\n";
	 }
	 elsif(ref($in[$k]) eq 'SCALAR'){  print ${$_[$k]}, "\n";  }
	 elsif( !ref($in[$k]) ){
		 if( !ref($in[$k+1]) && defined($in[$k+1])  ){
			 if($show_hash_optionXX == 1){  #### space option checking.
				#if($in[$k+1] =~ /\,.+\,/){  #### if the string is joined with ','
				#	 @line = split(/\,/, $_[$k+1]);
				# }else{
				#	 @line = split(//, $_[$k+1]);
				# }
				%TEM = @in;
				$LEN = ${&max_elem_string_array_show_hash(keys %TEM)};
				 if($LEN > $KL){ $KL = $LEN + $GAP +2};
				 printf ("%-${KL}s ", $in[$k]);  $k++;
				 printf ("%-${VL}s\n","@line");
			 }else{                        ### If not option is set, just write
				%TEM = @in;
				$LEN = ${&max_elem_string_array_show_hash( keys %TEM)};
				 if($LEN > $KL){ $KL = $LEN + $GAP +2};
				 printf ("%-${KL}s ", $in[$k]);  $k++; # print $in[$k], "\t";  $k++;
				 printf ("%-${VL}s\n",$in[$k]);        # print $in[$k], "\n";
			 }
		 }
		  #________________________________________________________
		  # Title    : max_elem_string_array_show_hash
		  # Keywords : largest string length of array
		  # Function : gets the largest string length of element of any array of numbers.
		  # Usage    : ($out1, $out2)=@{&max_elem_array(\@array1, \@array2)};
		  #            ($out1)       =${&max_elem_array(\@array1)          };
		  # Argument : numerical arrays
		  # returns  : one or more ref. for scalar numbers.
		  # Version  : 1.1
		  #-------------------------------------------------------
		  sub max_elem_string_array_show_hash{
			 my(@input, $i, $max_elem);
			 @input = @{$_[0]} || @_;
			 for($i=0; $i< @input ; $i++){
					$max_elem = length($input[0]);
					if (length($input[$i]) > $max_elem){
						$max_elem = length($input[$i]);
					}
			 }
			 \$max_elem;
		  }
		  #####################################insert_gaps_in_seq_hash
	 }
  }
}
#________________________________________________________________________
# Title     : handle_arguments
# Usage     : Just put the whole box delimited by the two '###..' lines below
#             to inside of your subroutines. It will call 'handle_arguments'
#             subroutine and parse all the given input arguments.
#             To use, claim the arguments, just use the variable in the box.
#             For example, if you had passed 2 file names for files existing
#             in your PWD(or if the string looks like this: xxxx.ext),
#             you can claim them by $file[0], $file[1] in
#             your subroutine.
# Function  : Sorts input arguments going into subroutines and returns default
#             arrays of references for various types (file, dir, hash, array,,,,)
#             If you give (\@out, @file), it will put @out into @array as a ref
#             and also the contents of @out will be dereferenced and put to
#             raw_string regardless what is in it).
#
# Example   : 'handle_arguments(\@array, $string, \%hash, 8, 'any_string')
# Warning   :
# Keywords  : handling arguments, parsing arguments,
# Options   :
# Returns   : Following GLOBAL variables
#
#             $num_opt,    @num_opt     @file          @dir
#             $char_opt,   @char_opt    %vars          @array,
#             @hash        @string,     @raw_string    @range,
#
#             $num_opt has 10,20
#             @num_opt has (10, 20)
#             @file has  xxxx.ext
#             @dir has  dir  or /my/dir
#             $char_opt has 'A,B'
#             @char_opt has (A, B)
#             @array has  (\@ar1, \@ar2)
#             @hash has (\%hash1, \%hash2)
#             @string  ('sdfasf', 'dfsf')
#             @raw_string (file.ext, dir_name, 'strings',,)
#             @range has values like  10-20
#             %vars deals with x=2, y=3 stuff.
#
# Argument  : any type, any amount
# Version   : 4.8
#--------------------------------------------------------------------
sub handle_arguments{
	my($c, $d, $e, $f, $i, $j, $k, $l, $s, $t, $x, $y, $z, $char_opt, $dir, @hash,
		$file, $in_dir, $num_opt, @char_opt, @dir, @file, @string, @file_dir, @k,
		@num_opt, @raw_string,@string, @array, %vars, @range, @temp, $temp,
		@char_options);

  &set_debug_option;
  if(@_<1){ print chr(7),"\n This is handle_arguments. No args Passed, Error?\n"}
  elsif( (@_ ==1)&& (ref($_[0]) eq 'ARRAY') ){ # when there is only 1 argument
	  push(@array, $_[0]);
	  push(@k, $_[0]);
  }elsif( (@_==1)&&( !ref($_[0]) ) ){
	  if(-f $_[0]){ push(@file, $_[0]);   push(@string, $_[0]) }
	  elsif(-d $_[0]){ push(@dir, $_[0]); push(@string, $_[0]) }
	  elsif($_[0]=~/^\d+$/){ push(@num_opt, $_[0]); $num_opt.=$_[0] }
	  elsif($_[0]=~/^\w+$/){ push(@string, $_[0]); }
  }elsif(@_ >=1){ @k = @_ }

  #####______Start of  general argument handling______######
  for($k=0; $k < @k ;$k++){
	  if( !ref($k[$k]) ){
		  if($k[$k]=~ /^[\-]?([a-zA-Z]\d*) {0,5}$/){  push(@char_opt, $1); $char_opt .= "$1\,";
		  }elsif($k[$k]=~ /^\-([a-zA-Z]+)$/){          ## When multiple option is given,
			  @char_options = split(/\,|/, $1);  push(@char_opt, @char_options);
			  $char_opt .= join("\,", @char_options); ## '-' should be used. eg. '-HEGI'
		  }elsif($k[$k]=~ /^(\w+)\=(\S* *)$/){  $vars{$1}=$2;  $vars .= "$1\,";
		  }elsif($k[$k]=~ /^(\-?\d+)$/){ push(@num_opt, $1);  $num_opt .= "$1\,";
		  }elsif($k[$k]=~ /^\d+\.?\d*\-\d+\.?\d*$/){  push(@range,  $k[$k] );
		  }elsif(-f $k[$k]){                          push(@file,   $k[$k] );
		  }elsif(-d $k[$k]){                          push(@dir,    $k[$k] );
		  }elsif($k[$k]=~ /\/[\w\d\.\-]+[\/].+[\/]$/){push(@dir,    $k[$k] );
		  }elsif($k[$k]=~ /^\/[\w\d\.\-]+[\/]*$/){    push(@dir,    $k[$k] );
		  }elsif($k[$k]=~ /^[\/\w\d\-\.]+\.\w+$/){    push(@file,   $k[$k] );
		  }elsif($k[$k]=~ /\S\/[\/\w\d\-\.]+\.\w+$/){ push(@file,   $k[$k] );
		  }elsif($k[$k]=~/^\w+[\/\\\w\d\.\-]+$/){     push(@string, $k[$k] );
		        # string does not have space, but includes '\', '/', '.'
		  }else{                                      push(@raw_string, $k[$k] );  }

	  }elsif( ref($k[$k]) ){
		  if( ref($k[$k]) eq "SCALAR"){
			 if(${$k[$k]} =~ /^[\-]?([a-zA-Z]\d*) {0,5}$/){ push(@char_opt, $1); $char_opt  .= "$1\,";
				}elsif(${$k[$k]}=~ /^\-([a-zA-Z]+)$/){ push(@char_opt, @char_options);
					$char_opt  .= join("\,", @char_options);  ## as an option string.
				}elsif(${$k[$k]}=~ /^(\w+)\=(\S* *)$/){  $vars{$1}=$2;  $vars .= "$1\,";
				}elsif(${$k[$k]}=~ /^(\-?\d+)$/){ $num_opt .= "$1\,";  push(@num_opt, $1);
			    }elsif(${$k[$k]}=~ /^\d+\.?\d*\-\d+\.?\d*$/){    push(@range,  $k[$k] );
				}elsif(-f ${$k[$k]}){                            push(@file,   ${$k[$k]} );
				}elsif(-d ${$k[$k]}){                            push(@dir,    ${$k[$k]} );
				}elsif(${$k[$k]}=~ /\/[\/\w\d\.\-]+[\/].+[\/]/){ push(@dir,    ${$k[$k]} );
				}elsif(${$k[$k]}=~/^\/[\/\w\d\.\-]+[\/]*$/){     push(@dir,    ${$k[$k]} );
				}elsif(${$k[$k]}=~ /^[\/\w\d\-\.]+\.\w+$/){      push(@file,   ${$k[$k]} );
				}elsif(${$k[$k]}=~/^\w+[\w\d\.\-]+$/){           push(@string, ${$k[$k]} );
				}else{                                           push(@raw_string, ${$k[$k]}); }
		  }elsif(ref($k[$k]) eq "ARRAY"){ my @temp_arr = @{$k[$k]}; push(@array, $k[$k]);
			for ($i=0; $i<@temp_arr; $i++){
			   if(-f $temp_arr[$i]){                            push(@file, $temp_arr[$i]);
			   }elsif($temp_arr[$i]=~/^\d+\.?\d*\-\d+\.?\d*$/){ push(@range,$temp_arr[$i] );
			   }elsif(-d $temp_arr[$i]){                        push(@dir , $temp_arr[$i]);
			   }elsif($temp_arr[$i]=~/\/[\/\w\d\.\-]+[\/].+[\/]/){ push(@dir, $temp_arr[$i] );
			   }elsif($temp_arr[$i]=~/^\/[\/\w\d\.\-]+[\/]*$/){ push(@dir, $temp_arr[$i] );
			   }elsif($temp_arr[$i]=~/^[\/\w\d\-\.]+\.\w+$/){   push(@file,$temp_arr[$i] );
																push(@string,$temp_arr[$i] );
			   }elsif($temp_arr[$i]=~/^\w+[\w\d\.\-]+$/){       push(@string,$temp_arr[$i]);
			   }else{                                           push(@raw_string, $temp_arr[$i]); }
			 }
		  }elsif(ref($k[$k]) eq "HASH"){                             push(@hash,   $k[$k] ); }
	  }
  }
  @raw_string=(@raw_string, @string);
  @file = @{&remove_dup_in_arrayH(\@file)};
  #-----------------------------------------------------
	 sub remove_dup_in_arrayH{  my($i, @nondup, @out_ref, %duplicate, @orig, @out_ref);
		for($i=0; $i<@_; $i++){  undef(%duplicate);
	       if(ref($_[$i]) eq 'ARRAY'){    @orig = @{$_[$i]};    }
		   @nondup = grep { ! $duplicate{$_}++ } @orig; push(@out_ref, \@nondup);  }
		if(@out_ref ==1){ return($out_ref[0]);}
		elsif(@out_ref >1){  return(@out_ref);}
	 }
  #-----------------------------------------------------
  return(\@hash, \@array, \@string, \@dir, \@file, \@num_opt,
			\@char_opt, \$num_opt, \$char_opt, \@raw_string, \%vars, \@range );
}

#______________________________________________________________
# Title     : convert_mspa_line_to_mmp_line
# Usage     :
# Function  : this adds ranges to the seqnames of mspa files
#             mmp line is mspa line with additional sequences at the end
# Example   :
# Keywords  : convert_mspa_to_mmp, convert_mspa, convert_mspa_2_mmp
#             change_mspa_to_mmp, add_range_in_mspa
# Options   : _  for debugging.
#             #  for debugging.
# Returns   :
# Argument  :
# Version   : 1.5
#--------------------------------------------------------------
sub convert_mspa_line_to_mmp_line{
   my $input_mspa=${$_[0]} || $_[0];
   my($score, $evalue, $long_1, $new_seq1, $new_seq2, $middle,
	  $start1, $end1, $start2, $end2, $seq1, $seq2, $new);

   if($input_mspa=~/^ *(\d+) +(\S+) *\S*[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\S+)[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\S+)/){
	  ($score, $evalue, $start1, $end1, $start2, $end2)=($1, $2, $3, $4, $6, $7);
	  ($seq1, $seq2)=($5, $8);
	  if($seq1=~/(\S+)\_\d+\-\d+/){
		 $new_seq1="$1\_$start1\-$end1";
	  }else{
		 $new_seq1="$seq1\_$start1\-$end1";
	  }
	  if($seq2=~/(\S+)\_\d+\-\d+/){
		 $new_seq2="$1\_$start2\-$end2";
	  }else{
		 $new_seq2="$seq2\_$start2\-$end2";
	  }
	  $new=sprintf("%-6s %-8s %-5s %-5s %-32s %-5s %-5s %-32s",
					$score, $evalue, $start1, $end1, $new_seq1, $start2, $end2, $new_seq2);
   }
   return(\$new);
}
#________________________________________________________________________
# Title     : remove_dup_in_array
# Usage     : @out2 = @{&remove_dup_in_array(\@input1, \@input2,,,,)};
#             @out1 = &remove_dup_in_array(\@input1 );
# Function  : removes duplicate entries in an array. You can sort the
#             result if you wish by 's' opt. Otherwise, result will keep
#             the original order
# Example   : (1,1,1,1,3,3,3,3,4,4,4,3,3,4,4);  --> (1,3,4);
# Warning   :
# Keywords  : merge array elements, remove_repeting_elements,
#             remove_same_array_elements
# Options   :
#   s for sorting the array output
# Returns   : one or more references.
# Argument  : one or more refs for arrays or one array.
# Version   : 1.4
#--------------------------------------------------------------------
sub remove_dup_in_array{
  my($i, $sort_opt, @out_ref, @nondup,%duplicate, @orig, @out_ref);
  my @in=@_;
  for($i=0; $i<@in; $i++){
	 if($in[$i] eq 's'){
		$sort_opt=1;  splice(@in, $i, 1); $i--;
	 }elsif( ref($in[$i]) eq 'SCALAR'  and  ${$in[$i]} eq 's' ){
		$sort_opt=1;  splice(@in, $i, 1); $i--;
	 }
  }
  for($i=0; $i<@in; $i++){
	  undef(%duplicate);
	  if(ref($in[$i]) eq 'ARRAY'){    @orig = @{$in[$i]};    }
	  else{ @orig=@in }
	  @nondup = grep { ! $duplicate{$_}++ } @orig;    ## NOTE -> $_
	  if($sort_opt==1){ @nondup= sort @nondup }
	  push(@out_ref, \@nondup);
  }
  if(@out_ref ==1){ return($out_ref[0]);}
  elsif(@out_ref >1){  return(@out_ref);}
}
#________________________________________________________________________
# Title     : assign_options_to_variables
# Usage     : &assign_options_to_variables(\$input_line);
# Function  : Assigns the values set in head box to the variables used in
#             the programs according to the values given at prompt.
#             This produces global values.
#             When numbers are given at prompt, they go to @num_opt
#              global variable. %vars global option will be made
#
# Example   : When you want to set 'a' char to a variable called '$dummy' in
#             the program, you put a head box commented line
#             '#  $dummy    becomes  a  by  -a '
#             Then, the parse_arguments and this sub routine will read the head
#             box and assigns 'a' to $dummy IF you put an argument of '-a' in
#             the prompt.
# Warning   : This is a global vars generator!!!
# Keywords  :
# Options   : '#' at prompt will make a var  $debug set to 1. This is to
#              print out all the print lines to make debugging easier.
# Returns   : Some globaly used variables according to prompt options.
#             @num_opt,
#
# Argument  : None.
# Version   : 2.5
#--------------------------------------------------------------------
sub assign_options_to_variables{
  my($i, $j, $op, $z, $n, $symb, $value, $var, %val, @val, $option_table_example, @input_options);

  #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
  #      Defining small variables for option table reading
  #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
  my($g)='gets';                my($if)='if';
  my($is)='is';                 my(@input_files);
  my($o)='or';   my(@arguments) = sort @ARGV;

  #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
  #  Assigning global arguments(@num_opt, %vars) variables
  #_______________________________________________________________
  for($i=0; $i< @arguments; $i++){
	 if(($arguments[$i]=~/^(\-?\d+[\.\d+]?)$/)&&   ### it mustn't be a file
		( !(-f $arguments[$i]) ) ){                ### getting NUM opt
		push(@num_opt, $1);
	 }elsif( $arguments[$i]=~/^(\S+)=(\S+)$/){
		$vars{$1}=$2;
	 }
  }

  #""""""""""""""""""""""""""""""""""""""""""""""""""
  #   Some DEFAULT $debug variables for debugging purposes
  #""""""""""""""""""""""""""""""""""""""""""""""""""
  &set_debug_option;

  #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
  #   The main processing of self
  #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
  open(SELF, "$0");    ## opens the program you ran to get the options table.
  while(<SELF>){

	  if( $first_border_and_title > 6 ){  ## This is to make it read only the first headbox.
		  last;                            #  $first_border_and_title is an incremental counter.
	  }elsif( /^ *#[_\*\-]{15,}$/ and /^ *# *[Tt][itle]*[ :]*/ ){
		  $first_border_and_title++;
		  print __LINE__, "# assign_options_to_variables : Title line found\n" if $debug eq 1;
	  }elsif(/^ {0,5}# {1,50}[\$\%\@].+$/){
		  $op = $&;  ## $op is for the whole input option line which has $xxxx, @xxx, %xxxx format
		  $op =~ s/^( *\# *)(\W\w+.+)$/$2/;  ## This is removing '#  ' in the line.
		  $op =~ s/^(\W\w+.+)(\s+\#.*)$/$1/;  ## This is removing any comments in the line.
			 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
			 ## matching the following line input format.
			 ## $av_sc_segment     becomes    a  by  a  # To smooth the SC rates. Gets the averages of
			 ## $ARG_REG is for arguments regular expression variable.
			 ##  This reg. exp. matches = 'a or A or E or e' part
			 ##  which represents alternative prompt arguments possibilities. \=$b$g$is$e$set
			 #"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
			 $ARG_REG ='(\S*) *[or=\,]* *(\S*) *[=or\,]* *(\S*) *[=or\,]* *(\S*) *[=or\,]* *(\S*)';
			 if($op=~/^([\$\@\%])([\w\-]+) {0,20}[=|$g|$is] *[\$\@\%]*([\- \w\.\d]+) *[bB]y +$ARG_REG/){
							 ## $sym     $var        becomes          a [$a...]       by       a -a -A
				  my $sym = $1;  #### The symbols like ($, @, %), '$' in the above.
				  my $var = $2;  #### Actual variable name 'var' from $var, 'av_sc_segment' in the above.
				  my $val = $3;  #### The becoming value  first 'a' in the above.
				  my @arg = ($4, $5, $6, $7, $8);  ## The alternative prompt arguments, second 'a' in the above..
			      print "\n $sym $var $val \n" if $debug==1;
			      print "\n \@arg are @arg \n" if $debug==1;

				  #""""""""""""""""""""""""""""""""""""""""""""""""""""
				  #  Going through the PROMPT args.
				  #""""""""""""""""""""""""""""""""""""""""""""""""""""
				  for($z=0; $z < @arguments; $z++){     ## $arguments[$z]  is from @ARGV
					  if($arguments[$z]=~/^\-\w+$/){
						  $arguments[$z] =~ s/\-//;
					  }
					  for ($i=0; $i < @arg; $i ++ ){
						 if( ("$arg[$i]" eq "$arguments[$z]" )&& ($arg[$i] !~ /\=/)
							 && ($sym eq '$') ){
							 ${"$var"}="$val";
							 if($debug == 1){
								 print __LINE__," \$${var} is set to \"$1\"\n";
							 }

						 }#'''''''''''''''' $arg = by s=  syntax ~~~~~~~~~~~~~~~~~~~~~~~~~~~
						 elsif( ( $arg[$i] =~ /^(\w+) *\=/ ) &&
							( $arguments[$z] =~ /^${1}= *([\w\.*\-*]+)$/) &&
							( $sym eq '$') ){
							  ${"$var"}="$1";
							  if($debug eq 1){ print __LINE__,"\$${var} is set to \"$1\"\n";  }
						 }
					  }
				  }
			  }
		}
	}
}


#__________________________________________________________________________
# Title     : merge_sequence_in_mspa_chunk
# Usage     :
# Function  : merges sequences which are linked by common regions
#             This filters the sequences by evalue and ssearch score
#             This is the main algorithm of merging similar sequences.
#             MSPA lines become pairs of seq_regions
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  : connect_sequence_in_mspa, link_sequence_in_mspa_chunk
#             connect_sequence_in_mspa_chunk, link_sequence_in_mspa
#             merge_sequence, link_sequence, connect_sequence
# Options   : _  for debugging.
#             #  for debugging.
#             m  for merge file output format (.mrg)
#             t= for threshold of seqlet length eg)  "t=30"
#             f= for overlap factor (usually between 2 to 7 )
#                 2 means, if the two regions are not overlapped
#                  by more than HALF of of the smaller region
#                  it will not regard as common seqlet block
#             s= for ssearch score minimum
#             e= for ssearch e value maximum
#             S  for S -S  # taking shorter region overlapped in removing similar regions
#             L  for L -L  # taking larger  region overlapped in removing similar regions
#             A  for A -A # taking average region overlapped in removing similar regions
#
# Returns   :
# Argument  :
# Thanks    : Alexey Eroshkin <alexey@axyspharm.com>
# Version   : 2.9
#--------------------------------------------------------------
sub merge_sequence_in_mspa_chunk{
	 #"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	 my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	 my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	 my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	 my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	 my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	 if($debug==1){print "\n\t\@hash=\"@hash\"
	 \@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	 \@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	 #""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
	 my ($ssearch_score2, $evalue_found2, $evalue_found1, $ssearch_score1, $optimize );
	 my ($L, %out_hash, @out, $LL, @Final_out, $verbose, $final_factor, $R_diff, @seqlets,
			 $short_region, $large_region, $average_region, $factor, $score, $evalue, $length_thresh);
	 $factor =7; # default factor for around 30% sequence mis-overlap is the threshold for common block
	 #~~~~~~~~~~~~~~ The lower the factor the larger clustering will occur ~~~~~~~~~~~~
	 $score  =75; # default ssearch score. seq below this will be chucked out
	 $evalue =10; # default maximum e value used. Seq higher than this will be thrown out
	 $length_thresh =30; # sequence length threshold. overlap less than this will be ignored

	 if($char_opt=~/v/){     $verbose = 'v'
	 }if($char_opt=~/z/){    $optimize = 'z'
	 }if($char_opt=~/S/){    $short_region='S';
	 }if($char_opt=~/L/){	   $large_region='L';
	 }if($char_opt=~/A/){	   $average_region='A'; }

	 if($vars{'T'}=~/\d+/){   $length_thresh=$vars{'T'};
	 }if($vars{'f'}=~/\S+/){  $factor=$vars{'f'};
	 }if($vars{'s'}=~/\d+/){  $score = $vars{'s'};
     }if($vars{'e'}=~/\d+/){  $evalue= $vars{'e'};
     }if($vars{'E'}=~/\d+/){  $evalue= $vars{'E'};
	 }

     @seqlets=split(/\n+/, (${$_[0]} || $_[0]) );

	 F1: for($i=0; $i < @seqlets; $i ++){
			if($seqlets[$i]=~/^\s*((\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\d+))\s+(\S+)\s*(.*)/){
		     if($6 eq $9){ splice(@seqlets, $i, 1); $i--; next };
				 ($long_match1, $enq_seq1, $mat_seq1, $R_start1, $R_end1 )=($1, $6, $9, $4, $5);
                 $Region_leng1=$R_end1-$R_start1;  $ssearch_score1= $2;  $evalue_found1 = $3;
	       }
	       #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	       # Following lines are disabled as I believe seqlets have been checked in previous sub
		   #________________________________________________________________________________________________
	       if( ($Region_leng1 < $length_thresh) || ($ssearch_score1 < $score) ){ splice(@seqlets, $i, 1); $i--; next; }
	       if( $evalue_found1 > $evalue){ splice(@seqlets, $i, 1); $i--; next; }

		   F2: for($j=0; $j < @seqlets; $j ++){
		     if($seqlets[$i] eq $seqlets[$j]){ next };
		     if($seqlets[$j]=~/^\s*((\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\d+))\s+(\S+)\s*(.*)/){
			      ($long_match2, $enq_seq2, $mat_seq2, $R_start2, $R_end2)=($1, $6, $9, $4, $5);
			      $Region_leng2=$R_end2-$R_start2;	$ssearch_score2=$2;	$evalue_found2= $3;
	         }

			 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			 # Following lines are disabled as I believe seqlets have been checked in previous sub
			 #________________________________________________________________________________________________
		     #if( ($Region_leng2 < $length_thresh)||($ssearch_score2 < $score) ){ splice(@seqlets, $j, 1); $j--; next; }
		     #if( $evalue_found2 > $evalue){ splice(@seqlets, $j, 1); $j--; next; }

             $R_diff=abs($Region_leng1-$Region_leng2);   ## <<<---- Note it is div by 2

		     if($Region_leng2 < $Region_leng1){ $smaller_leng=$Region_leng2; }else{ $smaller_leng=$Region_leng1; }

             $Start_diff=abs($R_start1-$R_start2); ## <<<---- Note it is div by 2
             $final_factor=$smaller_leng - $smaller_leng*($factor/10);

			 #~~~~~~~~~~ If average R_diff and average Start_diff are less then 1/7 of the smaller seqlet
			 #~~~~~~~~~~ we regard they are same selqets
             if( $R_diff <= $final_factor ){  ### if diff is less than around 30% of the smaller length
					  if($Region_leng2 >= $Region_leng1){
							 #~~~~~ $mat_seq1 or $mat_seq2 can increase to 'slr1453,sll0238', so you need ',' in the middle only
                             $extended_name="$mat_seq2|-|$mat_seq1";
							 $L=length($extended_name);
							 $LL=length($long_match2)+2;
							 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
							 # This makes "368   2.3e-06  0.352  4    189   af_AF2051  20   208   hi_HI1334,hi_34343"
							 #_________________________________________________________________________________________
							 $seqlets[$i]= sprintf("%-${LL}s %-${L}s", $long_match2, $extended_name);
							 splice(@seqlets, $j, 1);
							 $i-- unless($i==0);
							 $j--;
							 next F1;
					  }elsif( $Region_leng1 >= $Region_leng2){  ## chooses the bigger range seq
							 $extended_name="$mat_seq1|-|$mat_seq2"; # must be ',' not ' '
							 $L=length($extended_name);
							 $LL=length($long_match1)+2;
							 $seqlets[$i]=sprintf("%-${LL}s %-${L}s", $long_match1, $extended_name);
							 splice(@seqlets, $j, 1);
							 $i-- unless($i <= 0);
							 $j--;
							 next F1;
					  }
	       }else{
			      next F2;
		   }
	    }
	 }
	 #print "\n @seqlets \n";
	 if($char_opt=~/m/){ # #             m  for merge file output format (.mrg)
            for($i=0; $i< @seqlets; $i++){
				 if($seqlets[$i]=~/^\s*\S+\s+\S+\s+\d+\s+\d+\s+(\S+)\s+\d+\s+\d+\s+(\S+)/){
						if($1 eq $2){ next }
						$leading_seq=$1; $long=$2; $long=~s/\|\-\|/ /g;
						push(@Final_out, "$leading_seq $long" );
				 }
			}
	 }
	 @Final_out=sort @Final_out;
     #print "\n========># \@Final_out: @Final_out ";
	 return(\@Final_out);
}


#______________________________________________________________
# Title     : sort_words_in_string
# Usage     :
# Function  : sort words in strings sperated by ' ' or "\n"
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  : sort_words_in_sequences, sort_sequences_in_string,
#             sort_strings_in_string,
# Options   : _  for debugging.
#             #  for debugging.
# Returns   :
# Argument  :
# Version   : 1.0
#--------------------------------------------------------------
sub sort_words_in_string{
   my @in=@{$_[0]} || @_;
   my @OUT;
   for (@_){
	  push(@OUT, join(' ', sort split(/ +|\n/) ));
   }
   return(\@OUT);
}



#________________________________________________________________________
# Title     : get_base_names
# Usage     : $base =${&get_base_names(\$file_name)};
#             :   or @bases = &get_base_names(\@files);  # <-- uses `pwd` for abs directory
# Function  : produces the file base name(eg, "evalign"  out of "evalign.pl" ).
# Example   : $base => 'test'  with 'test.txt' or '/home/dir/of/mine/text.txt'
# Warning   :
# Keywords  : get_base_name{, base_name, file_base_name ,  get_file_base_name
#             get_basename, basename, get_root_name
# Options   :
# Returns   :
# Argument  : handles both ref and non-ref.
# Version   : 1.3
#--------------------------------------------------------------------
sub get_base_names{
	my($x, $pos, $pos1, @out_file, $file_only, $file, @file, $base, @base);
	@file=@{$_[0]} if (ref($_[0]) eq 'ARRAY');
	@file=@_ if !(ref($_[0]) eq 'ARRAY');
	for($x=0; $x < @file; $x ++){
		if( ref($file[$x]) ){
			$file = ${$file[$x]};
			$pos1=rindex($file, "/");
	        $file_only=substr($file, ($pos1+1));
			$pos = rindex($file_only, ".");
	        $base= substr($file_only, 0, $pos);
		}else{
			$file = $file[$x];
			$pos1=rindex($file, "/");
	        $file_only=substr($file, ($pos1+1));
			$pos = rindex($file_only, ".");
	        $base= substr($file_only, 0, $pos);
		}
		push(@base, $base);
	}
	if(@base == 1 ){ \$base[0] }else{ \@base }
}

__END__

#______________________________________________________________
# Title     : merge_similar_seqlets
# Usage     : @all_seqlets = @{&merge_similar_seqlets(@all_seqlets)};
# Function  : merges seqlet sets which have identical
#             sequences and share similar regions by connection factor of 30%
#             This means, if any two seqlets from the same sequences which
#             share more than 70% seqlet regions overlapping are merged
#             This only sees the very first sequence in the seqlets line!!!
#             (so, PARTIAL MERGE !!)
# Example   : INPUT:
#
#   @input=( 'seq1_1-30 seq2_1-40 seq3_1-50',
#            'seq1_2-49 seq4_4-40 seq8_2-99'....)
#
# Keywords  : merge_similar_sequences, merge_sequence_names,
#              merge_sequence_ranges, merge_similar_sequences_with_ranges
# Options   : _  for debugging.
#             #  for debugging.
#  $short_region=  S by S -S  # taking shorter region overlapped in removing similar regions
#  $large_region=  L by L -L  # taking larger  region overlapped in removing similar regions
#  $average_region=A by A -A # taking average region overlapped in removing similar regions
#
# Version   : 1.7
#--------------------------------------------------------------
sub merge_similar_seqlets{
   my (@all_seqlets, @result_all_seqlets, $i, $seq1, $start1, $end1, $seq2,
	   $smaller_leng, $start2, $end2, @split, @split1, @split2,
	   $short_region, $large_region, $average_region);
   my $factor=6.5;     #  33% sequence mismatch region is allowed(3)
   my $leng_thresh=30;
   my $optimize=0;
   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
   # Sorting (parsing) input to get options and input array
   #_________________________________________________________
   for($i=0; $i< @_; $i++){
	   if(ref($_[$i]) eq 'ARRAY'){
		   @all_seqlets=@{$_[$i]};
	   }elsif($_[$i]=~/f=(\S+)/){ $factor=$1
	   }elsif($_[$i]=~/o/i){      $optimize=1
	   }elsif($_[$i]=~/^S/){      $short_region='S';
	   }elsif($_[$i]=~/^L/){      $large_region='L';
	   }elsif($_[$i]=~/^A/){      $average_region='A'; }
   }

   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   # This is to remove which are identical in @all_seqlets;
   #_________________________________________________________
   for($i=0; $i< @all_seqlets; $i++){
	  if($all_seqlets[$i] eq $all_seqlets[$i+1]){
		  push(@result_all_seqlets, $all_seqlets[$i]);
		  $i++;
		  next;
	  }
	  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	  # @split1 and 2 are arrays from different string entry in @all_seqlets
	  #_________________________________________________________
	  @split1=sort split(/ +/, $all_seqlets[$i]);
	  @split2=sort split(/ +/, $all_seqlets[$i+1]);

	  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
	  #  (1) If the first elements of @split1 and 2 are identical, lets merge the two arrays
	  #________________________________________________________________________________
	  if($split1[0] eq $split2[0]){
		  @split=(@split1, @split2);
		  if($optimize==1){ #~~~~~ optimize option removes similar seqlets
			 push(@result_all_seqlets, join(' ', sort @{&remove_similar_seqlets(\@split,
			                              $short_region, $large_region, $average_region)} ));
		  }else{
			 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			 # Only removes exactly identical ones
			 #__________________________________________________________
			 push(@result_all_seqlets, join(' ', @{&remove_dup_in_array(\@split, 's')} ));
		  }
		  $i++;
		  next;
	  }
	  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
	  # (2) If the first elements of @split1 and 2 are NOT identical, lets check the sequence ranges
	  #________________________________________________________________________________
	  if($split1[0] =~/^(\S+)_(\d+)\-(\d+)/){
		   ($seq1, $start1, $end1)=($1, $2, $3);
		   if($split2[0] =~/^(\S+)_(\d+)\-(\d+)/){
			   ($seq2, $start2, $end2)=($1, $2, $3);

			   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~````
			   # Check if the seqs are identicl (from the two arrays), no point to merge which are not identical from the first
			   #__________________________________________________________________________________________
			   if($seq1 eq $seq2){
					$diff_start=abs($start1-$start2);
					$diff_end  =abs($end1  -$end2  );
					$leng1=$end1-$start1;
					$leng2=$end2-$start2;
					if($leng1 >= $leng2){ $smaller_leng=$leng2; }else{ $smaller_leng=$leng1; }

					#~~~~~~ If the sum of overhangs are smaller than a third of average length
					if( ( ($diff_start+$diff_end)/2 <= $smaller_leng/$factor ) &&
						($smaller_leng > $leng_thresh ) ){
						@split=(@split1, @split2);
						if($optimize==1){ #~~~~~ optimize option removes similar seqlets
						   push(@result_all_seqlets, join(' ', sort @{&remove_similar_seqlets(\@split,
						                            $short_region, $large_region, $average_region )} ));
						}else{
						   push(@result_all_seqlets, join(' ', @{&remove_dup_in_array(\@split, 's')} ));
						}
						$i++;
						next;
					}else{
						push(@result_all_seqlets, join(' ', @split1));
						next;
					}
			   }
			   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			   # As they are not teh same, lets just check the next one in @split2
			   #_____________________________________________________________________
			   else{
					push(@result_all_seqlets, join(' ', @split1));
					next;
			   }
		   }
		   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		   # If there is no range (region) in seq naem, let's skip, as there is no way to check
		   #__________________________________________________________________________________
		   else{
			   push(@result_all_seqlets, join(' ', @split1));
			   next;
		   }
	  }
   }
   return(\@result_all_seqlets);
}

#______________________________________________________________
# Title     : merge_sequence_in_mspa_chunk
# Usage     :
# Function  : merges sequences which are linked by common regions
#             This filters the sequences by evalue and ssearch score
#             This is the main algorithm of merging similar sequences.
# Example   :
# Warning   : You MUST NOT delete '# options : ..' entry
#              as it is read  by various subroutines.
# Keywords  : connect_sequence_in_mspa, link_sequence_in_mspa_chunk
#             connect_sequence_in_mspa_chunk, link_sequence_in_mspa
#             merge_sequence, link_sequence, connect_sequence
# Options   : _  for debugging.
#             #  for debugging.
#             m  for merge file output format (.mrg)
#             t= for threshold of seqlet length eg)  "t=30"
#             f= for overlap factor (usually between 2 to 7 )
#                 2 means, if the two regions are not overlapped
#                  by more than HALF of of the smaller region
#                  it will not regard as common seqlet block
#             s= for ssearch score minimum
#             e= for ssearch e value maximum
#             S  for S -S  # taking shorter region overlapped in removing similar regions
#             L  for L -L  # taking larger  region overlapped in removing similar regions
#             A  for A -A # taking average region overlapped in removing similar regions
#
# Returns   :
# Argument  :
# Version   : 2.2
#--------------------------------------------------------------
sub merge_sequence_in_mspa_chunk{
	#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
	my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
	my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
	my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
	my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
	my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
	if($debug==1){print "\n\t\@hash=\"@hash\"
	\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
	\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
	#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
   my ($ssearch_score2, $evalue_found2, $evalue_found1, $ssearch_score1, $optimize );
   my ($L, %out_hash, @out, $LL, @Final_out, $verbose, $final_factor, $R_diff,
		$short_region, $large_region, $average_region);
   my $factor =6; # default factor for around 30% sequence mis-overlap is the threshold for common block
	  #~~~~~~~~~~~~~~ The lower the factor the larger clustering will occur ~~~~~~~~~~~~
   my $score  =105; # default ssearch score. seq below this will be chucked out
   my $evalue =40; # default maximum e value used. Seq higher than this will be thrown out
   my $thresh =40; # sequence length threshold. overlap less than this will be ignored

   if($char_opt=~/v/){     $verbose = 'v'
   }if($char_opt=~/o/){    $optimize = 'o'
   }if($char_opt=~/S/){    $short_region='S';
   }if($char_opt=~/L/){	   $large_region='L';
   }if($char_opt=~/A/){	   $average_region='A'; }

   if($vars{'t'}=~/\d+/){
	  $thresh=$vars{'t'}; print "\n# merge_sequence_in_mspa_chunk: Thresh is $thresh\n" if (defined $verbose);
   }if($vars{'f'}=~/\d+/){
	  $factor=$vars{'f'}; print "\n# merge_sequence_in_mspa_chunk: Factor is $factor\n" if (defined $verbose);
   }if($vars{'s'}=~/\d+/){
	  $score = $vars{'s'}; print "\n# merge_sequence_in_mspa_chunk: Score is $score\n" if (defined $verbose);
   }if($vars{'e'}=~/\d+/){
	  $evalue= $vars{'e'}; print "\n# merge_sequence_in_mspa_chunk: Evalue is $evalue\n" if (defined $verbose);
   }
   my @seqlets=split(/\n+/, (${$_[0]} || $_[0]) );

   F1: for($i=0; $i < @seqlets; $i ++){
	  if($seqlets[$i]=~/^ *((\d+) +(\d+\.?[e\-\d]*) +(\d+) +(\d+) +(\S+) +(\d+) +(\d+)) +(\S+) *(.*)/){
		  if($6 eq $9){ splice(@seqlets, $i, 1); $i--; next };
		  ($long_match1, $enq_seq1, $mat_seq1, $R_start1, $R_end1 )=($1, $6, $9, $4, $5);
		  $Region_leng1=$R_end1-$R_start1;
		  $ssearch_score1= $2;
		  $evalue_found1 = $3;
	  }
	  if( ($Region_leng1 < $thresh) || ($ssearch_score1 < $score) ){ splice(@seqlets, $i, 1); $i--; next; }
	  if( $evalue_found1 > $evalue){ splice(@seqlets, $i, 1); $i--; next; }

	  F2: for($j=0; $j < @seqlets; $j ++){
		 if($seqlets[$i] eq $seqlets[$j]){ next };
		 if($seqlets[$j]=~/^ *((\d+) +(\d+\.?[e\-\d]*) +(\d+) +(\d+) +(\S+) +(\d+) +(\d+)) +(\S+) *(.*)/){
			($long_match2, $enq_seq2, $mat_seq2, $R_start2, $R_end2)=($1, $6, $9, $4, $5);
			$Region_leng2=$R_end2-$R_start2;
			$ssearch_score2=$2;
			$evalue_found2= $3;
	     }
		 if( ($Region_leng2 < $thresh)||($ssearch_score2 < $score) ){ splice(@seqlets, $j, 1); $j--; next; }
		 if( $evalue_found2 > $evalue){ splice(@seqlets, $j, 1); $j--; next; }

		 $R_diff=abs($Region_leng1-$Region_leng2)/2;   ## <<<---- Note it is div by 2

		 if($Region_leng2 < $Region_leng1){ $smaller_leng=$Region_leng2; }else{ $smaller_leng=$Region_leng1; }

		 $Start_diff=abs($R_start1-$R_start2)/2; ## <<<---- Note it is div by 2
		 $final_factor=$smaller_leng/$factor;


		 #~~~~~~~~~~ If average R_diff and average Start_diff are less then 1/7 of the smaller seqlet
		 #~~~~~~~~~~ we regard they are same selqets
		 if(( $R_diff < $final_factor ) &&       ### $Start_diff is essential!
			($Start_diff < $final_factor ) ){  ### if diff is less than around 30% of the smaller length
			if($verbose=~/v/){
			   print "\n\$R_diff:$R_diff \$Start_diff:$Start_diff $smaller_leng $final_factor $factor";
			}
			if($Region_leng2 >= $Region_leng1){
			       #~~~~~ $mat_seq1 or $mat_seq2 can increase to 'slr1453,sll0238', so you need ',' in the middle only
				   $extended_name="$mat_seq2,$mat_seq1";
				   $L=length($extended_name);
				   $LL=length($long_match2)+2;
				   $seqlets[$i]= sprintf("%-${LL}s %-${L}s", $long_match2, $extended_name);
				   splice(@seqlets, $j, 1);
				   $i-- unless($i==0);
				   $j--;
				   next F1;
			}elsif( $Region_leng1 >= $Region_leng2){  ## chooses the bigger range seq
				   $extended_name="$mat_seq1,$mat_seq2"; # must be ',' not ' '
				   $L=length($extended_name);
				   $LL=length($long_match1)+2;
				   $seqlets[$i]=sprintf("%-${LL}s %-${L}s", $long_match1, $extended_name);
				   splice(@seqlets, $j, 1);
				   $i-- unless($i <= 0);
				   $j--;
				   next F1;
			}
	     }else{
			next F2;
		 }
	  }
   }
   if($char_opt=~/m/){
	  for($i=0; $i< @seqlets; $i++){
		if($seqlets[$i]=~/^ *\d+ +\d+\.?[e\-\d]* +\d+ +\d+ +(\S+) +\d+ +\d+ +(\S+) *$/){
		   if($1 eq $2){ next }
		   $leading_seq=$1; $long=$2; $long=~s/\,/ /g;
		   push(@Final_out, "$leading_seq $long" );
		}
	  }
   }
   sort @Final_out;
   return(\@Final_out);
}





Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.