#!/usr/bin/perl -w # Calcualte GC contents of each sequence from FASTA # Copyright 2008 Naoki Takebayashi # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # Version: 20081007 my $usage="Usage: $0 [-h] [fastaFile]\n" . " -h: help (print this message)\n" . "\n". "Print out GC contents and base compositions for each sequence\n". "%GC is (G + C)/(A+T+G+C), so ambiguous characters (N etc) are ignored."; my $sep = "\t"; # if you use tab in the sequence name, change this to # other characters such as "," my $replaceChar = "-"; # for -g, this character is used to replace my $lineWidth = 70; # used to break the long sequences into lines for FASTA out my $debug = 0; our($opt_h, $opt_g); use Getopt::Std; getopts('h') || die "$usage\n"; if (defined($opt_h)) { die "$usage\n"; } die "$usage\n" if (@ARGV > 1); @ARGV = ('-') unless @ARGV; # take STDIN when no arg. my $dnaFile = shift @ARGV; # initialize the @seqArray, @seqNameArray, and $maxSeqLen my @dat = ReadInFASTA($dnaFile); my $numSeq = @dat; PrintBaseComposi(@dat); exit(0); #### functions sub Mean { my $sum = 0; foreach my $i (@_) { $sum += $i; } return $sum / scalar(@_); } sub MkSelIndex { my ($max, $siteList) = @_; $siteList =~ s/^\s+//; $siteList =~ s/\s+$//; my @sites = split(/\s*,\s*/, $siteList); my @result = (); foreach my $item (@sites) { if ($item =~ /^(\d+)-(\d+)$/) { die "ERROR: 1st number is larger than 2nd in $item\n" if ($1 > $2); $beginPos = $1 - 1; $endPos = $2 - 1; } elsif ($item =~ /^-(\d+)$/) { $beginPos = 0; $endPos = $1 - 1; } elsif ($item =~ /^(\d+)-$/) { $beginPos = $1 - 1; $endPos = $max-1; } elsif ($item =~ /^(\d+)$/) { $beginPos = $1 - 1; $endPos = $1 - 1; } else { die "$siteList given as the list of sites. " . "Make sure it is comma delimitted, and each element is " . " one of the forms: 23-26, 29, -10, 40-\n"; } push (@result, $beginPos..$endPos); } return(@result); } # takes an arg; name of a file from which data are read Then read in # the data and make an array. Each element of this array corresponds # to a sequence, name tab data. sub ReadInFASTA { my $infile = shift; my @line; my $i = -1; my @result = (); my @seqName = (); my @seqDat = (); open (INFILE, "<$infile") || die "Can't open $infile\n"; while () { chomp; if (/^>/) { # name line in fasta format $i++; s/^>\s*//; s/^\s+//; s/\s+$//; $seqName[$i] = $_; $seqDat[$i] = ""; } else { s/^\s+//; s/\s+$//; s/\s+//g; # get rid of any spaces next if (/^$/); # skip empty line s/[uU]/T/g; # change U to T $seqDat[$i] = $seqDat[$i] . uc($_); } # checking no occurence of internal separator $sep. die ("ERROR: \"$sep\" is an internal separator. Line $. of " . "the input FASTA file contains this charcter. Make sure this " . "separator character is not used in your data file or modify " . "variable \$sep in this script to some other character.\n") if (/$sep/); } close(INFILE); foreach my $i (0..$#seqName) { $result[$i] = $seqName[$i] . $sep . $seqDat[$i]; } return (@result); } sub GetSeqDat { my @data = @_; my @line; my @result = (); foreach my $i (@data) { @line = split (/$sep/, $i); push @result, $line[1]; } return (@result) } sub GetSeqName { my @data = @_; my @line; my @result = (); foreach my $i (@data) { @line = split (/$sep/, $i); push @result, $line[0]; } return (@result) } sub SelectSites { my ($arrayRef, $indexRef) = @_; unless (@_ == 2 && ref($arrayRef) eq 'ARRAY' && ref($indexRef) eq 'ARRAY'){ die "args to SelectSites() should be ARRAY REF, ARRAY REF\n"; } my $maxIndex = @$arrayRef -1; my @result = (); foreach my $posi (@$indexRef) { if ($maxIndex < $posi) { push @result, "?"; } else { push @result, $$arrayRef[$posi]; } } return @result; } sub ReplaceOtherSitesWGaps { my ($arrayRef, $indexRef) = @_; unless (@_ == 2 && ref($arrayRef) eq 'ARRAY' && ref($indexRef) eq 'ARRAY'){ die "args to SelectSites() should be ARRAY REF, ARRAY REF\n"; } my $maxIndex = @$arrayRef -1; my @allSites = 0..($maxIndex) ; my @index = InANotInB (\@allSites, $indexRef); # making the complement set warn "WARN: some selected sites don't exists\n" if (Max(@$indexRef) > $maxIndex); my @result = @$arrayRef; foreach my $posi (@index) { $result[$posi] = $replaceChar; } if ($debug) { print join "", "debug: ", @$arrayRef, "\n"; print join "", "debug: ", @result, "\n\n"; } return @result; } sub Sites { my ($datRef, $indexRef) = @_; my @seqDat = GetSeqDat(@$datRef); my @seqName = GetSeqName(@$datRef); my @result = (); # make 2 dimensional matrix foreach $seqNumber (0..$#seqDat) { my @tmpArray = split(//, $seqDat[$seqNumber]); my @thisSeq = (defined($opt_g)) ? ReplaceOtherSitesWGaps(\@tmpArray, $indexRef) : SelectSites(\@tmpArray, $indexRef); my $thisLine = $seqName[$seqNumber] . "\t" . (join("", @thisSeq)); push @result, $thisLine; } return (@result); } sub PrintFASTA { my @seqName = GetSeqName(@_); my @seqDat = GetSeqDat(@_); for my $i (0..$#seqDat) { # print ">$seqName[$i]\n$seqDat[$i]\n"; print ">$seqName[$i]\n"; my $seq = $seqDat[$i]; for (my $pos=0 ; $pos < length ($seq) ; $pos += $lineWidth) { print substr($seq, $pos, $lineWidth), "\n"; } } } sub MaxSeqLen { my @data = GetSeqDat(@_); my $maxLen = 0; foreach $i (@data) { my $len = CharLen($i); $maxLen = $len if ($len > $maxLen); } return ($maxLen); } # returns an array of seq lengths after removing - and ? for each seq. sub SeqLensRmGaps { my @data = GetSeqDat(@_); my @result = (); foreach my $i (@data) { $i =~ s/[-\?]//g; # remove gaps my $len = CharLen($i); push @result, $len; } return (@result); } # take std seq data (name\tseq), and attach "?" for the shorter sequences sub AdjustSeqLength { my @data = @_; my @seqDat = GetSeqDat(@_); my @seqName = GetSeqName(@_); my $maxLen = MaxSeqLen(@_); foreach my $i (0 .. $#seqDat) { my $thisLen = CharLen ($seqDat[$i]); if ($thisLen == $maxLen) { ; # do nothing } elsif ($thisLen < $maxLen) { my $diff = $maxLen - $thisLen; warn "WARN: $seqName[$i] shorter. " . "$diff '?' (missing character) were added at the end\n"; for ($j=0; $j < $diff; $j++) { $data[$i] = $data[$i] . "?"; } } else { die "ERROR: the length of sequence $seqName[$i] is $thisLen, " . "longer than \$maxLen = $maxLen. Weird!!"; } } return (@data); } # check if the length of all sequences are same sub CheckEqualSeqLength { my @data = @_; my @seqDat = GetSeqDat(@_); my @seqName = GetSeqName(@_); my $maxLen = MaxSeqLen(@_); my $equalLen = 1; foreach my $i (0 .. $#seqDat) { my $thisLen = CharLen ($seqDat[$i]); if ($thisLen != $maxLen) { $equalLen = 0; last; } } return $equalLen; } sub PrintBaseComposi { my @seqDat = GetSeqDat(@_); my @seqName = GetSeqName(@_); foreach my $i (0..$#seqDat) { my $seq = $seqDat[$i]; $seq = uc($seq); $seq =~ s/[-\?]//g; #get rid of - and ? my $len = CharLen($seq); my @bases = split(//, $seq); my %composi = (); foreach my $b (@bases) { if (defined($composi{$b})) { $composi{$b} ++; } else { $composi{$b} = 1; } } print "$seqName[$i]\t%gc=", ($composi{'G'} + $composi{'C'})/ ($composi{'G'}+$composi{'C'}+$composi{'A'}+$composi{'T'}), "\tBaseComposi=" ; foreach my $k (sort(keys %composi)) { print "$k=$composi{$k},"; } print "\n"; } } sub RemoveGapOnlySites { my ($seqDatARef, $multipleOf) = @_; my @seqDat = GetSeqDat(@$seqDatARef); my @seqName = GetSeqName(@$seqDatARef); my $maxLen = MaxSeqLen(@$seqDatARef); my @gapSites = (); my @notGapSites = (); my ($posi, $seqNumber); my @seqMat = (); # make 2 dimensional matrix foreach $seqNumber (0..$#seqDat) { my @tmpArray = split(//, $seqDat[$seqNumber]); # Check the length if (@tmpArray != $maxLen) { die "ERROR: the sequence $seqName[$i] is not same length " . "as \$maxLen = $maxLen. Weird!!"; } push @seqMat, [ @tmpArray ]; } # now identify the all gap sites for $posi (0 .. ($maxLen-1)) { my $gap = 1; for $seqNumber (0 .. $#seqMat){ if ($seqMat[$seqNumber][$posi] !~ /^[-\?]$/) { $gap = 0; last; } } if ($gap == 1) { # all sequences have a gap at these sites push (@gapSites, $posi+1); # now unit-offset } else { # there are some non-gap character at these sites push (@notGapSites, $posi+1); } } my @rmSites = (); # removing multiples of $multipleOf for(my $i = 0; $i < @gapSites - $multipleOf + 1; $i++) { my $rmFlag = 1; for(my $j = 1; $j < $multipleOf; $j++) { if ($gapSites[$i] + $j != $gapSites[$i+$j]) { $rmFlag = 0; # we don't want to remove this $i $j=$multipleOf; # get out of inner loop } } if ($rmFlag == 1) { push @rmSites, @gapSites[$i..($i+$multipleOf-1)]; $i += $multipleOf - 1; } } my @allSites = 1..($maxLen) ; my @selIndex = InANotInB (\@allSites, \@rmSites); @selIndex = To0Offset(@selIndex); # convert to 0-ffset # select sites and make results my @result = (); for $seqNumber (0 .. $#seqMat) { my @thisSeq = SelectSites($seqMat[$seqNumber], \@selIndex); my $line = $seqName[$seqNumber] . $sep . (join("", @thisSeq)); push (@result, $line); } # if (@rmSites > 0) { # warn ("Following sites consist of all gaps, removed from analysis\n"); # print STDERR join(" ", @rmSites); # print STDERR "\n"; # } return (@result); } # if the 2nd argument is "rmAmbiguous", ambiguous characters are removed (anything other than ATGC) sub RemoveSitesWithAnyGap { my ($seqDatARef, $ambiguousRm) = @_; my @seqDat = GetSeqDat(@$seqDatARef); my @seqName = GetSeqName(@$seqDatARef); my $maxLen = MaxSeqLen(@$seqDatARef); my @gapSites = (); my @notGapSites = (); my ($posi, $seqNumber); my @seqMat = (); # make 2 dimensional matrix foreach $seqNumber (0..$#seqDat) { if ($ambiguousRm eq "rmAmbiguous") { # replace anything other than ATGC with - $seqDat[$seqNumber] =~ s/[^ATGCatgc]/-/g; } my @tmpArray = split(//, $seqDat[$seqNumber]); # Check the length if (@tmpArray != $maxLen) { die "ERROR: the sequence $seqName[$i] is not same length " . "as \$maxLen = $maxLen. Weird!!"; } push @seqMat, [ @tmpArray ]; } # now identify the sites with any gap for $posi (0 .. ($maxLen-1)) { my $gap = 0; for $seqNumber (0 .. $#seqMat){ if ($seqMat[$seqNumber][$posi] =~ /^[-\?]$/) { $gap = 1; last; } } if ($gap == 1) { # there is a gap at this site push (@gapSites, $posi+1); # now unit-offset } else { # No seq has a gap at this site. push (@notGapSites, $posi+1); } } # print join(":", "gap=", @gapSites, "\n"); # debug # print join(":", "nogap=", @notGapSites, "\n"); #debug # my @rmSites = @gapSites; # my @allSites = 1..($maxLen) ; # my @selIndex = InANotInB (\@allSites, \@rmSites); my @selIndex = @notGapSites; @selIndex = To0Offset(@selIndex); # convert to 0-ffset # select sites and make results my @result = (); for $seqNumber (0 .. $#seqMat) { my @thisSeq = SelectSites($seqMat[$seqNumber], \@selIndex); my $line = $seqName[$seqNumber] . $sep . (join("", @thisSeq)); push (@result, $line); } # if (@rmSites > 0) { # warn ("Following sites contain gaps\n"); # print STDERR join(" ", @rmSites); # print STDERR "\n"; # } return (@result); } # convert 1-offset index array to 0-offset array sub To0Offset { my @result = map {$_ - 1} @_; return @result; } # count the number of characters in a string sub CharLen { my $string = shift; my @charString = split (//, $string); return scalar(@charString); } # this function take two scalars and return the larger value sub Larger { my ($a, $b) = @_; return (($a > $b) ? $a : $b); } sub InANotInB { my ($aRef, $bRef) =@_; my %seen = (); my @aonly =(); foreach my $item (@$bRef) { $seen{$item} = 1}; foreach my $item (@$aRef) { push (@aonly, $item) unless $seen{$item}; } return (@aonly); } sub ExtractUnique { my %seen=(); my @unique = (); foreach my $item (@_) { push (@unique, $item) unless $seen{$item}++; } return @unique; } sub Max { my $max = shift; foreach $item (@_) { if ($item > $max) { $max = $item; } } return $max; } sub MemberQ { my ($x, $arrRef) = @_; foreach my $item (@$arrRef) { if ($x eq $item) { return 1; } } return 0; }