#!/usr/bin/perl # Gives FASTA seq as an argument or STDIN. # Calculate the pairwise differences # Copyright 2013, Naoki Takebayashi # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # Version: 20130612 my $usage="Usage: $0 [-t geneticCodeTbl] [-d distanceCorrection] [-f output_format] [alignedDNASeq]\n" . " -c: eliminate codons which include one or more gaps\n" . " This option can be combined with -e\n" . " -e: eliminate base sites where gaps are observed in at least one seq\n". " Gap sites are eliminated pair-wise without this option\n" . " -s: use a file which list names of sequences to be used in the analysis\n". " -a: average differece and average number of sites are printed\n" . " -d: \"kimuraProt\", \"poissonProt\", \"p\". (default is p)\n" . " -n: do not print seq names\n" . " -f: output format, currently only phylip, PAML style if not specfied"; use Getopt::Std; getopts('af:d:hces:n') || die "$usage\n"; if (defined($opt_h)) { die "$usage\n"; } my %synoSitesPerCodon = (); my (%synoDiff, %nonSynoDiff); my %aaSeq; die "$usage\n" if (@ARGV > 1); @ARGV = ('-') unless @ARGV; my $dnaFile = shift; # $mode is set "nucleotide" or "protein" # read in the data and store them in @seqName and @seqDat ReadInFASTA($dnaFile); #foreach $i (0..$#seqName) { # print ">$seqName[$i]\n$seqDat[$i]\n"; #} if (defined($opt_c) && $mode eq "protein") { die "ERROR: infile was Protein sequences, -c (codon) doesn't make sense\n"; } # select the sequences (and reorder if necessary) if (defined($opt_s)) { ReorderNameAndDat($opt_s); } if (defined($opt_e)) { @seqDat = CleanSeqs(@seqDat); } # for debugging # while (($k, $v) = each %aaSeq) { print "**$k** => ++$v++\n"; }; # foreach $k (keys %synoDiff) # {print "**$k** => $synoDiff{$k} $nonSynoDiff{$k}\n";}; PrintAllPairDiff(); exit (0); # takes an arg; name of a file from which data are read # Then read in the data and store them in @seqName and @seqDat sub ReadInFASTA { my $infile = shift; my @line; my $i = -1; my @result = (); open (INFILE, "<$infile") || die "Can't open $infile\n"; $mode = "nucleotide"; while () { chomp; if (/^>/) { # name line in fasta format $i++; s/^>\s*//; s/^\s+//; s/\s+$//; #@line = split (/\|/); # note it takes only the name before | #$line[0] =~ s/\s+$//; $seqName[$i] = $_; $seqDat[$i] = ""; } else { s/^\s+//; s/\s+$//; s/\s+//g; next if (/^$/); # skip empty line if (! /^[ATGCUatgcu\-\.\?]+$/) { $mode="protein"; } $seqDat[$i] = $seqDat[$i] . uc($_); } } close(INFILE); if ($mode eq "nucleotide") { foreach (@seqDat) { # CHECK THIS s/U/T/g; # change U to T } } } sub ReadSeqNameFile { my $file = shift; my @result = (); open(INFILE, "<$file") || die "Can't open $file\n"; while () { chomp; s/#.*$//; # remove comments (#) s/^\s+//; s/\s+$//; next if (/^$/); push @result, $_; } return @result; } # Reorder @seqName and @seqDat sub ReorderNameAndDat { my $filename = shift; my $i; my @selectedSeq = ReadSeqNameFile($filename); my %nameIdx=(); foreach $i (0..$#seqName) { if (exists ($nameIdx{$seqName[$i]})) { die "There are multiple seqs with a same name: $seqName[$i]\n"; } else { $nameIdx{$seqName[$i]} = $i; } } my @orderedName = (); my @orderedDat =(); foreach $i (@selectedSeq) { if (exists ($nameIdx{$i})) { push @orderedName, $seqName[$nameIdx{$i}]; push @orderedDat, $seqDat[$nameIdx{$i}]; } else { die "ERROR: $i in seqName files doesn't exist in the data file\n"; } } @seqName = @orderedName; @seqDat = @orderedDat; } sub PrintAllPairDiff { my ($i, $j, $seq1, $seq2, $seqName1, $seqName2); # calc pair diff my @nameVect = (); my @cntVect = (); my @pDist = (); for ($i = 2; $i <= @seqDat; $i++) { $seq1 = $seqDat[$i-1]; $seqName1 = $seqName[$i-1]; for ($j = 1; $j < $i; $j++) { $seq2 = $seqDat[$j-1]; $seqName2 = $seqName[$j-1]; my @diff = CntPairWiseDiff($seq1, $seq2); push @pDist, $diff[0]/$diff[1]; # proportion of different sites push (@cntVect, join("\t", @diff)); push @nameVect, "$i\t$seqName1\t$j\t$seqName2"; } } if (defined($opt_a)) { # print ave diff and ave number of sites my $sumDiff = 0; my $sumN = 0; foreach $i (@cntVect) { my @record = split(/\t/, $i); $sumDiff += $record[0]; $sumN += $record[1]; } my $aveDiff = $sumDiff / @cntVect; my $aveN = $sumN / @cntVect; print "$aveDiff\t$aveN\n"; } elsif (defined($opt_d)) { if ($opt_d eq "kimuraProt") { # kimura's amino acid @pDist=KimuraProteinDist(@pDist); } elsif ($opt_d eq "poissonProt") { # poisson correction for amino acid @pDist=PoissonProteinDist(@pDist); } if (defined ($opt_f)) { # print some other format if ($opt_f eq "phylip") { PrintPhylipDist(@pDist); } else { die "ERROR: $opt_f as the argument of -f is not implemented\n"; } } elsif (defined ($opt_n)) { map {print "$_\n"} @pDist; } else { print "seq1\tseqName1\tseq2\tseqName2\tdist\n"; foreach $i (0..$#nameVect) { print "$nameVect[$i]\t$pDist[$i]\n"; } } } else { # print pairwise diff's if (defined ($opt_n)) { map {print "$_\n"} @cntVect; } else { print "seq1\tseqName1\tseq2\tseqName2\tdiff\tn\n"; foreach $i (0..$#nameVect) { print "$nameVect[$i]\t$cntVect[$i]\n"; } } } } sub CntPairWiseDiff { my ($seq1, $seq2) = @_; my $i; # get rid of codons with gaps my @cleaned = CleanSeqs($seq1, $seq2); $cleaned[0] =~ s/\t//g; $cleaned[1] =~ s/\t//g; my @noGapSeq1 = split(//, $cleaned[0]); my @noGapSeq2 = split(//, $cleaned[1]); # print "$cleaned[0]\t$seq1\n$cleaned[1]\t$seq2\n"; #debug # counting number of differences my $numBases = @noGapSeq1; my $numDiff = 0; for ($i = 0; $i < $numBases; $i++) { $numDiff++ if ($noGapSeq1[$i] ne $noGapSeq2[$i]) } return ($numDiff, $numBases) } sub KimuraProteinDist { my $maxP = 0.854101966249684544; return (map { ($_ >= $maxP) ? "Inf" : (($_ == 0) ? 0 : - log(1 - $_ - $_ ** 2 / 5)) } @_); } sub PoissonProteinDist { return (map { ($_ >= 1) ? "Inf" : (($_ == 0) ? 0 :- log(1 - $_)) } @_); } sub PrintPhylipDist { my @pVect = @_; my $numSeq = @seqName; if ($numSeq * ($numSeq-1) != 2 * @pVect) { die ("ERROR: number of pVect is weird in PrintPhylip\n" . "$numSeq sequences and pVect has ", $#pVect +1, " elements\n"); } my @distTbl = (); my @initVal = Repeat(0, $numSeq); for my $i (0 .. ($numSeq-1)) { push @distTbl, [ @initVal ]; } my $i = 1; my $j=0; for my $d (@pVect) { $distTbl[$i][$j] = $distTbl[$j][$i] = $d; if ($i - 1 == $j) { $i++; $j = 0; } else { $j++; } } if ($i != $numSeq || $j != 0) { die "ERROR: conversion to Phylip format didn't work (\$i=$i, \$j=$j)\n"; } print "$numSeq\n"; for my $i (0 .. ($numSeq-1)) { print "$seqName[$i]"; for my $j (0..$numSeq-1) { print "\t$distTbl[$i][$j]"; } print "\n"; } } # returns an array with the 1st argument repeated n times (2nd arg) sub Repeat { my ($val, $n) = @_; my @result = (); foreach my $i (0..($n-1)) { push @result, $val; } return @result; } # takes multiple string; each string represent a sequence # Get rid of the sites where at least one seq has a gap (or ambiguious base) # It returns an array of sequences. Each seq is tab-delimited # e.g, returned array: ("AAA\tGGG\tATA", "AAA\tGGA\tATA", "AAG\tGGG\tATA") sub CleanSeqs { my $minLength = CharLen($_[0]); my ($i, $seqNum); my @noGapSites = (); my @result = (); my @codonMat = (); # get rid of codons with gaps foreach $i (@_) { my @tmpArray = (defined($opt_c)) ? MkTripletArray($i) : split(//, $i); push @codonMat, [ @tmpArray ]; $minLength = @tmpArray if (@tmpArray < $minLength); # find min length } # identify the sites without any gaps. for $i (0 .. $minLength - 1) { my $gap = 0; if (defined($opt_c)) { for $seqNum (0 .. $#_) { if ($codonMat[$seqNum][$i] !~ /^[ACGT]{3}$/) { $gap = 1; last; } } } elsif ($mode eq "nucleotide") { for $seqNum (0 .. $#_) { if ($codonMat[$seqNum][$i] !~ /^[ACGT]{1}$/) { $gap = 1; last; } } } else { # protein for $seqNum (0 .. $#_) { if ($codonMat[$seqNum][$i] =~ /^[\-\?]{1}$/) { $gap = 1; last; } } } push (@noGapSites, $i) if ($gap == 0); } # select the sites without any gaps for $seq (0 .. $#_) { my @oneSeq = (); foreach $i (@noGapSites) { push @oneSeq, $codonMat[$seq][$i]; } my $tabbedSeq = join "\t", @oneSeq; push @result, $tabbedSeq; } return @result; } # this function take two scalars and return the larger value sub Smaller { my ($a, $b) = @_; return (($a < $b) ? $a : $b); } sub Min { my $min = shift; foreach my $i (@_) { $min = ($i < $min) ? $i : $min; } return $min; } sub Sum { my $sum = 0; foreach my $i (@_) { $sum += $i; } return $sum; } # count the number of characters in a string sub CharLen { my $string = shift; my @charString = split (//, $string); return scalar(@charString); } # for a given string, it separates into triplets, and return # the resulting array. # if the last element is less than a triplet, it will be removed sub MkTripletArray { my $seq = shift; $seq =~ s/\s+//g; $seq =~ s/(.{3})/$1 /g; $seq =~ s/\s+$//; my @result = split(/ /, $seq); pop @result unless ($result[$#result] =~ /.{3}/); return @result; } # When one of the codons corresponds to termination, (-1, -1) is returned. # If two codons are identical, (0,0) is returned even if the codon # correspond to termination. # take a list as the argument and extract the unique elements. # The order of elements will not be preserved. sub ExtractUnique { my %seen=(); my @unique = (); foreach my $item (@_) { push (@unique, $item) unless $seen{$item}++; } return @unique; }