Group
Extension

InSilicoSpectro/scripts/convertSpectra.pl

#!/usr/bin/env perl
use strict;
use Carp;
use Pod::Usage;

=head1 NAME

convertSpectra.pl


=head1 DESCRIPTION

Converts ms and msms peaklist from/to various formats. Formats can be specified or will hopefully be deduced from the files extensions

=head1 SYNOPSIS

convertSpectra.pl --in=[format:]file --out=[format:]file

=head1 ARGUMENTS

=over 4

=item -in=[format:]file

If the format is not idj, this parameter can be repeated

=item -out=[format:]file

=back

If no files are specified (thus the format must be) I/O are stdin/out.

=head1 OPTIONS

=over 4

=item --defaultcharge=charge

Defined a default charge for the precursor (msms) of the peak (ms) (it might be overwritten if the input file definesit explicitely. The charge argument can be something like '1+', '1', '2,3', '2+ AND 3+' etc.

=item --forcedefaultcharge

Will force the default charge defined into the file to be replaced

=item --title=string

Allows for setting a title (one line text)

=item --filter=file

Allows for using a filter. See 'InSilicoSpectro/t/Spectra/Filter/examples.xml' for more information.

=item --sampleinfo='name1=val1[;name2=val2[...]]'

Set sample related info example 'instrument=QTOF;instrumentId=xyz'

=item --trustprecursorcharge=medium

turn 2+ and 3+ precursor  into (2+ OR 3+)

=item --trustprecursorcharge=1:1/2:2,3/3:2,3,4

(or similar) will attribute 1+=>1+, 2+=>2+ or 3+, 3+=>2+,3+ or 4+.

=item --duplicateprecursormoz=i1:i2

If for example i1=-1 and i2=2, precursor moz will be replicate with -1, +1 and +2 Dalton

=item --skip=[msms|pmf]

Do not read msms or pmf information

=item --showinputformats

Prints all the possible format for input

=item --showoutputformats

Prints all the possible format for output

=item --showformats=(xml|json)

=item --showdef=[xml|json]

report all possible input/ouput format + relative info (optional/compulsory arguements...)

=item --propertiessave=file

=item --propertiesprefix=string

=item --usefilecaching

activates a caching system for large files

=item --version

=item --help

=item --man

=item --verbose

=back


=head1 EXAMPLE

./convertSpectra.pl --in=dta:somedir --out=somefile.idj.xml

=head1 COPYRIGHT

Copyright (C) 2004-2005  Geneva Bioinformatics www.genebio.com

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

=head1 AUTHORS

Alexandre Masselot, www.genebio.com

=cut

use Getopt::Long;
use File::Basename;
use File::Spec qw(tempfile tempdir);
use File::Temp;
use Archive::Tar;
use Archive::Zip qw( :ERROR_CODES :CONSTANTS);

my(@fileIn, $fileOut, $showInputFmt, $showOutputFmt, $showDef, $sampleInfo, $precursorTrustParentCharge, $defaultCharge, $forceDefaultCharge, $title, $fileFilter, $dpmStr, @skip, $doNotMergePrecursors,
   $excludeKeysFile,
   $propertiesFile,$propertiesPrefix,
   $useFileCache,
   $phenyxConfig, $help, $man, $verbose, $showVersion);

use InSilicoSpectro;
use InSilicoSpectro::Utils::io;

if (!GetOptions(
		"in=s@"=>\@fileIn,
		"out=s"=>\$fileOut,
		"sampleinfo=s"=>\$sampleInfo,
		"showinputformats"=>\$showInputFmt,
		"showoutputformats"=>\$showOutputFmt,
		"showdef=s"=>\$showDef,

		"duplicateprecursormoz=s"=>\$dpmStr,
		"donotmergeprecursors"=>\$doNotMergePrecursors,
		"trustprecursorcharge=s"=>\$precursorTrustParentCharge,
		"defaultcharge=s"=>\$defaultCharge,
		"forcedefaultcharge"=>\$forceDefaultCharge,
		"title=s"=>\$title,

		"excludekeysfile=s"=>\$excludeKeysFile,

		"filter=s"=>\$fileFilter,

		"propertiessave=s"=>\$propertiesFile,
		"propertiesprefix=s"=>\$propertiesPrefix,

		"usefilecaching"=>\$useFileCache,

		"skip=s@"=>\@skip,
		"phenyxconfig=s" => \$phenyxConfig,


                "version" => \$showVersion,
                "help" => \$help,
                "man" => \$man,
                "verbose" => \$verbose,
               )
    || $help || $man || $showVersion || (((not @fileIn) || (not $fileOut)) and  (not $showInputFmt) and (not $showOutputFmt) and (not $showDef))){

  if($showVersion){
    print basename($0)." InSilicoSpectro version $InSilicoSpectro::VERSION\n";
    exit(0);
  }

  pod2usage(-verbose=>2, -exitval=>2) if(defined $man);
  pod2usage(-verbose=>1, -exitval=>$help?0:2);
}

my %sampleInfo;
foreach(split /;/, $sampleInfo){
  my($n, $v)=split /=/, $_, 2;
  $sampleInfo{$n}=$v;
}

#CORE::die "invalid --trustprecursorcharge=(medium)" if $precursorTrustParentCharge and $precursorTrustParentCharge !~ /^(medium)$/;

use InSilicoSpectro::Spectra::MSRun;
use InSilicoSpectro::Spectra::MSSpectra;
use InSilicoSpectro::Spectra::Filter::MSFilterCollection;

#FIXME REMOOVE!
$InSilicoSpectro::Spectra::MSMSSpectra::MERGE_MULTIPLE_PREC_CHARGES=0 if $doNotMergePrecursors;

$InSilicoSpectro::Spectra::MSSpectra::USE_FILECACHED=1 if $useFileCache;
InSilicoSpectro::Utils::FileCached::verbose($verbose);

$InSilicoSpectro::Utils::io::VERBOSE=$verbose;

if ((defined $showInputFmt) or (defined $showOutputFmt)) {
  if (defined $showInputFmt) {
    print "input formats MS/MS: ".(join ',',(InSilicoSpectro::Spectra::MSRun::getReadFmtList(), InSilicoSpectro::Spectra::MSMSSpectra::getReadFmtList()))."\n";
  }
  if (defined $showOutputFmt) {
    print "output formats MS/MS: ".(InSilicoSpectro::Spectra::MSRun::getWriteFmtList(), InSilicoSpectro::Spectra::MSMSSpectra::getWriteFmtList())."\n";
  }
  exit(0);
}

if($showDef){
  getDef(out=>$showDef);
  exit(0);
}

my $run=InSilicoSpectro::Spectra::MSRun->new();
$run->set('defaultCharge', InSilicoSpectro::Spectra::MSSpectra::string2chargemask($defaultCharge));
$run->set('title', $title);

@skip = split(/,/,join(',',@skip));
foreach (@skip) {
  $_=lc $_;
  s/^ms$/pmf/;
  $run->{read}{skip}{$_}=1;
}
my $is=0;

#explodes @file into glob + archive stuff...
my @tmpFileIn;

foreach (@fileIn) {
  s/ /\\ /g;
  foreach my $fi (glob $_) {
    push @tmpFileIn, $fi;
  }
}
undef @fileIn;
while (my $fileIn=shift @tmpFileIn){
  my ($format, $source);
  if ($fileIn=~/(.*?):(.*)/) {
    ($format, $source)=(lc($1), $2);
  } else {
    ($format, $source)=(InSilicoSpectro::Spectra::MSRun::guessFormat($fileIn), $fileIn);
  }
  my $tmpdir=File::Spec->tmpdir;
  if($source=~/\.(tar|tar\.gz|tgz)$/i && $format ne 'dta'){
    my $tar=Archive::Tar->new;
    $tar->read($source, $source =~ /gz$/i);
    foreach ($tar->list_files()){
      my ($fdtmp, $tmp)=File::Temp::tempfile(SUFFIX=>$format, UNLINK=>1);
      $tar->extract_file($_, $tmp);
      push @fileIn, {format=>$format, file=>$tmp, origFile=>basename($_)};
      close $fdtmp;
    }
  }elsif($source=~/\.zip$/i && $format ne 'dta'){
    my $zip=Archive::Zip->new();
    unless($zip->read($source)==AZ_OK){
      CORE::die "zip/unzip: cannot read archive $source";
    }else{
      my @members=$zip->members();
      foreach (@members){
	my (undef, $tmp)=File::Temp::tempfile("$tmpdir/".(basename($_->fileName())."-XXXXX"), UNLINK=>1);
	$zip->extractMemberWithoutPaths($_, $tmp) && croak "cannot extract ".$_->fileName().": $!\n";
	push @fileIn, {format=>$format, file=>$tmp, origfile=>$_->fileName()};
      }
    }
    next;
  }elsif($source=~/(.*)\.gz$/i){
    my $base=$1;
    my (undef, $tmp)=File::Temp::tempfile(DIR=>$tmpdir, SUFFIX=>basename($base), UNLINK=>1);
    $source=InSilicoSpectro::Utils::io::uncompressFile($source, {remove=>0, dest=>$tmp});
    push @tmpFileIn, "$format:$source";
    next;
  }

  push @fileIn, {format=>$format, file=>$source, origFile=>$source};
}

foreach (@fileIn) {
  my ($inFormat, $src, $origFile)=($_->{format}, $_->{file}, $_->{origFile});
  $run->set('format', $inFormat);
  $run->set('source', $src);
  $run->set('origFile', $origFile);
  if((defined $InSilicoSpectro::Spectra::MSRun::autoDetectFormatHandlers{$inFormat})){
    my $tmp = $InSilicoSpectro::Spectra::MSRun::autoDetectFormatHandlers{$inFormat}{read}->($src);
    die "cannot auto detect [$inFormat] from $src" unless $tmp;
    $inFormat=$tmp;
    $run->set('format', $inFormat);
  }
  unless (defined $InSilicoSpectro::Spectra::MSRun::handlers{$inFormat}{read}) {
    my %h;
    foreach (keys %$run) {
      next if /^spectra$/;
      $h{$_}=$run->{$_};
    }
    my $sp=InSilicoSpectro::Spectra::MSSpectra->new(%h);
    $run->addSpectra($sp);
    $sp->set('sampleInfo', \%sampleInfo) if defined %sampleInfo;
    $sp->origFile($origFile);
    $sp->setSampleInfo('sampleNumber', $is++);
    $sp->open(forcedefaultcharge=>$forceDefaultCharge);
    $run->set('defaultCharge', $run->get('defaultCharge')||$sp->get('defaultCharge'));
  } else {
    croak "not possible to set multiple file in with format [$inFormat]" if $#fileIn>0 and ! $InSilicoSpectro::Spectra::MSRun::handlers{$inFormat}{readMultipleFile};
    $InSilicoSpectro::Spectra::MSRun::handlers{$inFormat}{read}->($run);
  }
}

if($excludeKeysFile){
  my %xKeys;
  open (FD, "<$excludeKeysFile") or CORE::die "cannot open for reading [$excludeKeysFile] :$!";
  while (<FD>){
    chomp;
    s/\#.*//;
    s/\s+$//;
    next unless /\S/;
    $xKeys{$_}=1;
  }
  close FD;
  my $imax=$run->getNbSpectra()-1;
  my $i=0;
  while ($i<=$imax){
    my $sp=$run->getSpectra($i);
    if($xKeys{$sp->get('key')}){
      $run->removeSpectra($i);
      $i--;
      $imax;
    }else{
      next unless ref($sp) eq 'InSilicoSpectro::Spectra::MSMSSpectra';
      my $jmax=$sp->size()-1;
      my $j=0;
      while ($j<=$jmax){
	my $cmpd=$sp->get('compounds')->[$j];
	if($xKeys{$cmpd->get('key')}){
	  print STDERR "remove $j\n";
	  splice @{$sp->get('compounds')}, $j, 1;
	  $j--;
	  $jmax--;
	}
	$j++;
      }
    }
    $i++;
  }
}


#to filter the spectra
if ($fileFilter) {
  my $fc = new InSilicoSpectro::Spectra::Filter::MSFilterCollection();
  $fc->readXml($fileFilter);
  $fc->filterSpectra($run);
  use Data::Dumper;
  print Dumper ($run);
}

if ($precursorTrustParentCharge){
  my %charge2trust;
  if ($precursorTrustParentCharge eq 'medium') {
    %charge2trust=(
		   2=>4+8,
		   3=>4+8,
		  );
  } elsif ($precursorTrustParentCharge=~/\d+:\d+/) {
    foreach(split/\//, $precursorTrustParentCharge){
      CORE::die "[$_] does not fit /\d+:\d+[</\d+[...]]/" unless /^(\d+):([\d,]+)$/;
      my $c=$1;
      my $l=$2;
      $charge2trust{$c}=0;
      foreach (split /,/, $l){
	$charge2trust{$c}|=(1<<$_);
      }
    }
  } else {
    CORE::die "unknow --trustprecursorcharge"
  }

  my $origpd_cmask;
  my $alterpd_charge;
  my $imax=$run->getNbSpectra()-1;
  foreach my $i (0..$imax) {
    my $sp=$run->getSpectra($i);
    next unless ref($sp) eq 'InSilicoSpectro::Spectra::MSMSSpectra';
    my $jmax=$sp->size()-1;
    for my $j (0..$jmax) {
      my $cmpd=$sp->get('compounds')->[$j];
      my $ipdcmask=$cmpd->get('parentPD')->getFieldIndex('chargemask');
      unless(defined $ipdcmask){
	my $ipdcharge=$cmpd->get('parentPD')->getFieldIndex('charge') or CORE::die "neither charge not chargemask is defined for cmpd parent \n$cmpd";
	if ($cmpd->get('parentPD')."" ne "$origpd_cmask") {
	  $origpd_cmask=$cmpd->get('parentPD');
	  $alterpd_charge=InSilicoSpectro::Spectra::PhenyxPeakDescriptor->new($origpd_cmask);
	  $alterpd_charge->getFields()->[$ipdcharge]='chargemask';
	  $ipdcmask=$ipdcharge;
	} else {
	  $alterpd_charge=$cmpd->get('parentPD');
	  $ipdcmask=$ipdcharge;
	}
      }
      my $cmask=$cmpd->getParentData()->[$ipdcmask];
      foreach (0..31){
	next unless $cmask & (1<<$_);
	$cmpd->getParentData()->[$ipdcmask]|=$charge2trust{$_};
      }
#      $cmpd->getParentData()->[$ipdcmask]|= (1<<3) if $cmpd->getParentData()->[$ipdcmask] & (1<<2);
#      $cmpd->getParentData()->[$ipdcmask]|= (1<<2) if $cmpd->getParentData()->[$ipdcmask] & (1<<3);
    }
  }
}
if($dpmStr){
  my $el_hplus;
  eval{
    use InSilicoSpectro;
    InSilicoSpectro::init();
    $el_hplus=InSilicoSpectro::InSilico::MassCalculator::getMass('el_H+');
  };
  if($@){
    $el_hplus=1.00728;
  }
  CORE::die "invalid value [$dpmStr] insteadd of --duplicateprecursormoz=i1:i2" unless $dpmStr=~/^([\-\d]+):([\-\d]+)$/;
  my ($min, $max)=($1, $2);
  my $imax=$run->getNbSpectra()-1;

  my $origpd_cmask;
  my $alterpd_charge;
  my $cmpdNumber=0;
  foreach my $i(0..$imax){
    my $sp=$run->getSpectra($i);
    next unless ref($sp) eq 'InSilicoSpectro::Spectra::MSMSSpectra';
    my $jmax=$sp->size()-1;
    for my $j (0..$jmax){
      my $cmpd=$sp->get('compounds')->[$j];

      my @charges=$cmpd->precursor_charges();
      CORE::die "cannot duplicate with undefined precursor charges" unless @charges;
      foreach my $c(@charges){
	my $ipdcharge=$cmpd->get('parentPD')->getFieldIndex('charge');
	unless(defined $ipdcharge){
	  my $ipdcmask=$cmpd->get('parentPD')->getFieldIndex('chargemask') or CORE::die "neither charge not chargemask is defined for cmpd parent \n$cmpd";
	  if($cmpd->get('parentPD')."" ne "$origpd_cmask"){
	    $origpd_cmask=$cmpd->get('parentPD');
	    $alterpd_charge=InSilicoSpectro::Spectra::PhenyxPeakDescriptor->new($origpd_cmask);
	    $alterpd_charge->getFields()->[$ipdcmask]='charge';
	    $ipdcharge=$ipdcmask;
	  }else{
	    #$alterpd_charge=$cmpd->get('parentPD');
	    $ipdcharge=$ipdcmask;
	  }
	}
	foreach my $shift($min..$max){
	  next if $shift==0;
	  my $newcmpd=InSilicoSpectro::Spectra::MSMSCmpd->new($cmpd);
	  $newcmpd->{compoundNumber}=$cmpdNumber++;
	  delete $newcmpd->{key};
	  my @prec=@{$cmpd->getParentData()};
	  $newcmpd->setParentData(\@prec);
	  $newcmpd->set('parentPD', $alterpd_charge);
	  $prec[$ipdcharge]=$c;
	  $prec[0]+=$el_hplus*$shift/$c;
	  $newcmpd->title(($cmpd->title() || "")." [charge=$c+, ".(($shift>0)?"+":"")."$shift isotope]");
	  $sp->addCompound($newcmpd);

	}
      }
    }
  }
}

my ($outformat, $outfile);
if ($fileOut=~/(.*?):(.*)/) {
  $outformat=$1;
  $outfile=($2 or \*STDOUT);
} else {
  $outfile=$fileOut;
  $outformat=InSilicoSpectro::Spectra::MSSpectra::guessFormat($outfile);
}

# use Data::Dumper;
# $Data::Dumper::Maxdepth=3;
# print Dumper($run);
# use Devel::Size qw(size total_size);

# InSilicoSpectro::Utils::FileCached::dump_all();

# print "mrun total_size=".total_size($run)."\n";


$run->write($outformat, ">$outfile");

if($propertiesFile){
  require Util::Properties;
  my $prop=Util::Properties->new();
  $propertiesPrefix.="." if $propertiesPrefix && $propertiesPrefix!~/\.$/;
  $prop->file_name($propertiesFile);

  #count nb frag spectra.
  my $nbFragSp=0;
  my $imax=$run->getNbSpectra()-1;
  foreach my $i(0..$imax){
    my $sp=$run->getSpectra($i);
    next unless ref($sp) eq 'InSilicoSpectro::Spectra::MSMSSpectra';
    $nbFragSp+=$sp->size();
  }
  $prop->prop_set($propertiesPrefix."msms.nbcompounds", $nbFragSp);
  

}
sub getDef{
  my %hparams=@_;
  my $out=$hparams{out} || 'xml';


  my %ret=(
	   input=>[],
	   output=>[],
	   version=>$InSilicoSpectro::VERSION,
	   #might add some compulsory attribute if needed
	   extraargs=>[
			  {
			   key=>'duplicateprecursormoz',
			   description=>['it is possible to duplicate precursor moz for example "-1,1,2,3", when tolearance is small but isotope error are suspected'],
			   shortdescription=>['dupplicate prec. m/z'],
			   type=>'String',
			   regexp=>['([\+\-]?\d+,)*[+\-]?\d+'],
			   level=>3,
			  },
			  {
			   key=>'defaultcharge',
			   description=>['precursor default charge, to be assigned when no precursor is defined for a given ion'],
			   shortdescription=>['prec. defaut charge'],
			   type=>'Select',
			   choices=>['1+', '2+', '2+ AND 3+', '3+'],
			   level=>1,
			   default=>['2+ AND 3+'],
			  },
			  {
			   key=>'title',
			   shortdescription=>['title'],
			   description=>['run general title'],
			   type=>'String',
			   level=>1,
			   compulsory=>1,
			   default=>['no title'],
			  },
			  {
			   key=>'filter',
			   shortdescription=>['peaklist filter'],
			   description=>['a configuration file to filter peaklist'],
			   type=>'File',
			   level=>3,
#			   compulsory=>0,
			   default=>[''],
			  },
			  {
			   key=>'usefilecaching',
			   shortdescription=>['use file caching'],
			   description=>['activates a memory usage control system - a CPU time cost'],
			   type=>'Flag',
			   level=>3,
#			   compulsory=>1,
			   default=>undef,
			  },
			 ],
	   );
  
  while (my ($key, $h)=each %handlers){
    if($h->{read}){
      my %h1;
      $h1{description}=[$h->{description}||""];
      $h1{type}=$key;
      $h1{aggregateruns}=[$h->{readMultipleFile}||0||'no'];
      $h1{containsmultipleruns}=['no'];
      push @{$ret{input}}, \%h1
    }
    if($h->{write}){
      my %h1;
      $h1{description}=[$h->{description}||""];
      $h1{mimetype}=[$h->{mimetype}||"text/plain"];
      $h1{type}=$key;
      push @{$ret{output}}, \%h1
    }
  }
  while (my ($key, $h)=each %InSilicoSpectro::Spectra::MSMSSpectra::handlers){
    if($h->{read}){
      my %h1;
      $h1{description}=[$h->{description}||""];
      $h1{type}=$key;
      $h1{aggregateruns}=['yes'];
      $h1{containsmultipleruns}=['no'];
      push @{$ret{input}}, \%h1
    }
  }
  if($out eq 'xml'){
    require XML::Simple;
    my $xs = XML::Simple->new;
    print $xs->XMLout(\%ret,
		      KeyAttr=>{input=>'+key'},
		      KeyAttr=>{output=>'+key'},
		      GroupTags=>{
				  extraargs=>"oneExtraArg",
				  input=>"oneinput",
				  output=>"oneoutput",
				  choices=>'oneChoice',
				 },
		      RootName=>'InSilicoSpectroFormats',
		     );
    return;
  }
  die "out [$out] is not defined";
}


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.