#!/usr/bin/perl -w
# Calcualte GC contents of each sequence from FASTA

# Copyright 2008 Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20081007

my $usage="Usage: $0 [-h] [fastaFile]\n" .
    "  -h: help (print this message)\n" .
    "\n".
    "Print out GC contents and base compositions for each sequence\n".
    "%GC is (G + C)/(A+T+G+C), so ambiguous characters (N etc) are ignored.";

my $sep = "\t";  # if you use tab in the sequence name, change this to
                 # other characters such as ","
my $replaceChar = "-"; # for -g, this character is used to replace
my $lineWidth = 70; # used to break the long sequences into lines for FASTA out

my $debug = 0;

our($opt_h, $opt_g);
use Getopt::Std;
getopts('h') || die "$usage\n";

if (defined($opt_h)) {
    die "$usage\n";
}

die "$usage\n" if (@ARGV > 1);

@ARGV = ('-') unless @ARGV; # take STDIN when no arg.
my $dnaFile = shift @ARGV;

# initialize the @seqArray, @seqNameArray, and $maxSeqLen
my @dat = ReadInFASTA($dnaFile);
my $numSeq = @dat;

PrintBaseComposi(@dat);

exit(0);


#### functions

sub Mean {
    my $sum = 0;
    foreach my $i (@_) {
	$sum += $i;
    }
    return $sum / scalar(@_);
}

sub MkSelIndex {
    my ($max, $siteList) = @_;
    $siteList =~ s/^\s+//;
    $siteList =~ s/\s+$//;

    my @sites = split(/\s*,\s*/, $siteList);

    my @result = ();
    foreach my $item (@sites) {
	if ($item =~ /^(\d+)-(\d+)$/) {
	    die "ERROR: 1st number is larger than 2nd in $item\n" if ($1 > $2);
	    $beginPos = $1 - 1;
	    $endPos = $2 - 1;
	} elsif ($item =~ /^-(\d+)$/) {
	    $beginPos = 0;
	    $endPos = $1 - 1;
	} elsif ($item =~ /^(\d+)-$/) {
	    $beginPos = $1 - 1;
	    $endPos = $max-1;
	} elsif ($item =~ /^(\d+)$/) {
	    $beginPos = $1 - 1;
	    $endPos = $1 - 1;
	} else {
	    die "$siteList given as the list of sites.  " . 
		"Make sure it is comma delimitted, and each element is " .
		    " one of the forms: 23-26, 29, -10, 40-\n";  
	}
	push (@result, $beginPos..$endPos);
    }
    return(@result);
}


# takes an arg; name of a file from which data are read Then read in
# the data and make an array.  Each element of this array corresponds
# to a sequence, name tab data.
sub ReadInFASTA {
    my $infile = shift;
    my @line;
    my $i = -1;
    my @result = ();
    my @seqName = ();
    my @seqDat = ();

    open (INFILE, "<$infile") || die "Can't open $infile\n";

    while (<INFILE>) {
        chomp;
        if (/^>/) {  # name line in fasta format
            $i++;
            s/^>\s*//; s/^\s+//; s/\s+$//;
            $seqName[$i] = $_;
            $seqDat[$i] = "";
        } else {
            s/^\s+//; s/\s+$//;
	    s/\s+//g;                  # get rid of any spaces
            next if (/^$/);            # skip empty line
            s/[uU]/T/g;                  # change U to T
            $seqDat[$i] = $seqDat[$i] . uc($_);
        }

	# checking no occurence of internal separator $sep.
	die ("ERROR: \"$sep\" is an internal separator.  Line $. of " .
	     "the input FASTA file contains this charcter. Make sure this " . 
	     "separator character is not used in your data file or modify " .
	     "variable \$sep in this script to some other character.\n")
	    if (/$sep/);

    }
    close(INFILE);

    foreach my $i (0..$#seqName) {
	$result[$i] = $seqName[$i] . $sep . $seqDat[$i];
    }
    return (@result);
}

sub GetSeqDat {
    my @data = @_;
    my @line;
    my @result = ();

    foreach my $i (@data) {
	@line = split (/$sep/, $i);
	push @result, $line[1];
    }

    return (@result)
}

sub GetSeqName {
    my @data = @_;
    my @line;
    my @result = ();

    foreach my $i (@data) {
	@line = split (/$sep/, $i);
	push @result, $line[0];
    }
    return (@result)
}

sub SelectSites {
    my ($arrayRef, $indexRef) = @_;
    unless (@_ == 2 && ref($arrayRef) eq 'ARRAY' && ref($indexRef) eq 'ARRAY'){
	die "args to SelectSites() should be ARRAY REF, ARRAY REF\n";
    }

    my $maxIndex = @$arrayRef -1;
    my @result = ();
    foreach my $posi (@$indexRef) {
	if ($maxIndex < $posi) {
	    push @result, "?";
	} else {
	    push @result, $$arrayRef[$posi];
	}
    }
    return @result;
}

sub ReplaceOtherSitesWGaps {
    my ($arrayRef, $indexRef) = @_;
    unless (@_ == 2 && ref($arrayRef) eq 'ARRAY' && ref($indexRef) eq 'ARRAY'){
	die "args to SelectSites() should be ARRAY REF, ARRAY REF\n";
    }

    my $maxIndex = @$arrayRef -1;

    my @allSites = 0..($maxIndex) ;
    my @index = InANotInB (\@allSites, $indexRef);  # making the complement set

    warn "WARN: some selected sites don't exists\n" 
	if (Max(@$indexRef) > $maxIndex);

    my @result = @$arrayRef;
    foreach my $posi (@index) {
	$result[$posi] = $replaceChar;
    }
    if ($debug) {
	print join "", "debug: ", @$arrayRef, "\n";
	print join "", "debug: ", @result, "\n\n";
    }
    return @result;
}

sub Sites {
    my ($datRef, $indexRef) = @_;
    my @seqDat = GetSeqDat(@$datRef);
    my @seqName = GetSeqName(@$datRef);
    my @result = ();

    # make 2 dimensional matrix
    foreach $seqNumber (0..$#seqDat) {
	my @tmpArray = split(//, $seqDat[$seqNumber]);
	my @thisSeq = (defined($opt_g)) ? 
	    ReplaceOtherSitesWGaps(\@tmpArray, $indexRef) : 
		SelectSites(\@tmpArray, $indexRef);
	my $thisLine = $seqName[$seqNumber] . "\t" . (join("", @thisSeq));
	push @result, $thisLine;
    }
    return (@result);
}


sub PrintFASTA {
    my @seqName = GetSeqName(@_);
    my @seqDat = GetSeqDat(@_);
    for my $i (0..$#seqDat) {
	# print ">$seqName[$i]\n$seqDat[$i]\n";
        print ">$seqName[$i]\n";
        my $seq = $seqDat[$i];
        for (my $pos=0 ; $pos < length ($seq) ;  $pos += $lineWidth) {
            print substr($seq, $pos, $lineWidth), "\n";
        }
    }
}

sub MaxSeqLen {
    my @data = GetSeqDat(@_);
    my $maxLen = 0;
    foreach $i (@data) {
	my $len = CharLen($i);
	$maxLen = $len if ($len > $maxLen);
    }
    return ($maxLen);
}

# returns an array of seq lengths after removing - and ? for each seq.
sub SeqLensRmGaps {
    my @data = GetSeqDat(@_);
    my @result = ();
    foreach my $i (@data) {
	$i =~ s/[-\?]//g;  # remove gaps
	my $len = CharLen($i);
	push @result, $len;
    }
    return (@result);
}


# take std seq data (name\tseq), and attach "?" for the shorter sequences
sub AdjustSeqLength {
    my @data = @_;
    my @seqDat = GetSeqDat(@_);
    my @seqName = GetSeqName(@_);
    my $maxLen = MaxSeqLen(@_);
    
    foreach my $i (0 .. $#seqDat) {
	my $thisLen = CharLen ($seqDat[$i]);
	if ($thisLen == $maxLen)  {
	    ; # do nothing
	} elsif ($thisLen < $maxLen) {
	    my $diff = $maxLen - $thisLen;
	    warn "WARN: $seqName[$i] shorter.  " .
		"$diff '?' (missing character) were added at the end\n";
	    for ($j=0; $j < $diff; $j++) {
		$data[$i] = $data[$i] . "?";
	    }
	} else {
	    die "ERROR: the length of sequence $seqName[$i] is $thisLen, " .
		"longer than \$maxLen = $maxLen.  Weird!!";
	}
    }
    return (@data);
}

# check if the length of all sequences are same
sub CheckEqualSeqLength {
    my @data = @_;
    my @seqDat = GetSeqDat(@_);
    my @seqName = GetSeqName(@_);
    my $maxLen = MaxSeqLen(@_);

    my $equalLen = 1;
    foreach my $i (0 .. $#seqDat) {
	my $thisLen = CharLen ($seqDat[$i]);
	if ($thisLen != $maxLen)  {
	    $equalLen = 0;
	    last;
	}
    }
    return $equalLen;
}

sub PrintBaseComposi {
    my @seqDat = GetSeqDat(@_);
    my @seqName = GetSeqName(@_);

    foreach my $i (0..$#seqDat) {
	my $seq = $seqDat[$i];
	$seq = uc($seq);
	$seq =~ s/[-\?]//g;  #get rid of - and ?
	my $len = CharLen($seq);
	my @bases = split(//, $seq);
	my %composi = ();
	foreach my $b (@bases) {
	    if (defined($composi{$b})) {
		$composi{$b} ++;
	    } else {
		$composi{$b} = 1;
	    }
	}

	print "$seqName[$i]\t%gc=", ($composi{'G'} + $composi{'C'})/ ($composi{'G'}+$composi{'C'}+$composi{'A'}+$composi{'T'}), "\tBaseComposi=" ;

	foreach my $k (sort(keys %composi)) {
	    print "$k=$composi{$k},";
	}
	print "\n";

    }

}

sub RemoveGapOnlySites {
    my ($seqDatARef, $multipleOf) = @_;
    my @seqDat = GetSeqDat(@$seqDatARef);
    my @seqName = GetSeqName(@$seqDatARef);
    my $maxLen = MaxSeqLen(@$seqDatARef);
    my @gapSites = ();
    my @notGapSites = ();
    my ($posi, $seqNumber);
    my @seqMat = ();

    # make 2 dimensional matrix
    foreach $seqNumber (0..$#seqDat) {
	my @tmpArray = split(//, $seqDat[$seqNumber]);
	# Check the length
	if (@tmpArray != $maxLen)  {
	    die "ERROR: the sequence $seqName[$i] is not same length " .
		"as \$maxLen = $maxLen.  Weird!!";
	}
	push @seqMat, [ @tmpArray ];
    }

    # now identify the all gap sites
    for $posi (0 .. ($maxLen-1)) {
	my $gap = 1;
	for $seqNumber (0 .. $#seqMat){
	    if ($seqMat[$seqNumber][$posi] !~ /^[-\?]$/) {
		$gap = 0;
		last;
	    }
	}
	if ($gap == 1) {  # all sequences have a gap at these sites
	    push (@gapSites, $posi+1); # now unit-offset
	} else {          # there are some non-gap character at these sites
	    push (@notGapSites, $posi+1);
	}
    }

    my @rmSites = ();  # removing multiples of $multipleOf
    for(my $i = 0; $i < @gapSites - $multipleOf + 1; $i++) {
	my $rmFlag = 1;
	for(my $j = 1; $j < $multipleOf; $j++) {
	    if ($gapSites[$i] + $j != $gapSites[$i+$j]) {
		$rmFlag = 0;     # we don't want to remove this $i
		$j=$multipleOf;  # get out of inner loop
	    }
	}
	if ($rmFlag == 1) {
	    push @rmSites, @gapSites[$i..($i+$multipleOf-1)];
	    $i += $multipleOf - 1;
	}
    }
    
    my @allSites = 1..($maxLen) ;
    my @selIndex = InANotInB (\@allSites, \@rmSites);
    @selIndex = To0Offset(@selIndex);  # convert to 0-ffset

    # select sites and make results
    my @result = ();
    for $seqNumber (0 .. $#seqMat) {
	my @thisSeq = SelectSites($seqMat[$seqNumber], \@selIndex);
	my $line = $seqName[$seqNumber] . $sep . (join("", @thisSeq));
	push (@result, $line);
    }

#    if (@rmSites > 0) {
#	warn ("Following sites consist of all gaps, removed from analysis\n");
#	print STDERR join(" ", @rmSites);
#	print STDERR "\n";
#    }
    return (@result);
}

# if the 2nd argument is "rmAmbiguous", ambiguous characters are removed (anything other than ATGC)
sub RemoveSitesWithAnyGap {
    my ($seqDatARef, $ambiguousRm) = @_;
    my @seqDat = GetSeqDat(@$seqDatARef);
    my @seqName = GetSeqName(@$seqDatARef);
    my $maxLen = MaxSeqLen(@$seqDatARef);
    my @gapSites = ();
    my @notGapSites = ();
    my ($posi, $seqNumber);
    my @seqMat = ();

    # make 2 dimensional matrix
    foreach $seqNumber (0..$#seqDat) {
	if ($ambiguousRm eq "rmAmbiguous") { # replace anything other than ATGC with -
	    $seqDat[$seqNumber] =~ s/[^ATGCatgc]/-/g;
	}
	my @tmpArray = split(//, $seqDat[$seqNumber]);
	# Check the length
	if (@tmpArray != $maxLen)  {
	    die "ERROR: the sequence $seqName[$i] is not same length " .
		"as \$maxLen = $maxLen.  Weird!!";
	}
	push @seqMat, [ @tmpArray ];
    }

    # now identify the sites with any gap
    for $posi (0 .. ($maxLen-1)) {
	my $gap = 0;
	for $seqNumber (0 .. $#seqMat){
	    if ($seqMat[$seqNumber][$posi] =~ /^[-\?]$/) {
		$gap = 1;
		last;
	    }
	}
	if ($gap == 1) {  # there is a gap at this site
	    push (@gapSites, $posi+1); # now unit-offset
	} else {          # No seq has a gap at this site.
	    push (@notGapSites, $posi+1);
	}
    }

#    print join(":", "gap=", @gapSites, "\n");  # debug
#    print join(":", "nogap=", @notGapSites, "\n"); #debug
#    my @rmSites = @gapSites;
    
#    my @allSites = 1..($maxLen) ;
#    my @selIndex = InANotInB (\@allSites, \@rmSites);
    my @selIndex = @notGapSites;
    @selIndex = To0Offset(@selIndex);  # convert to 0-ffset

    # select sites and make results
    my @result = ();
    for $seqNumber (0 .. $#seqMat) {
	my @thisSeq = SelectSites($seqMat[$seqNumber], \@selIndex);
	my $line = $seqName[$seqNumber] . $sep . (join("", @thisSeq));
	push (@result, $line);
    }

#    if (@rmSites > 0) {
#	warn ("Following sites contain gaps\n");
#	print STDERR join(" ", @rmSites);
#	print STDERR "\n";
#    }
    return (@result);
}

# convert 1-offset index array to 0-offset array
sub To0Offset {
    my @result = map {$_ - 1} @_;
    return @result;
}

# count the number of characters in a string
sub CharLen {
    my $string = shift;
    my @charString = split (//, $string);
    return scalar(@charString);
}

# this function take two scalars and return the larger value
sub Larger {
    my ($a, $b) = @_;

    return (($a > $b) ? $a : $b);
}

sub InANotInB {
    my ($aRef, $bRef) =@_;
    my %seen = ();
    my @aonly =();

    foreach my $item (@$bRef) { $seen{$item} = 1};
    foreach my $item (@$aRef) {
	push (@aonly, $item) unless $seen{$item};
    }
    return (@aonly);
}

sub ExtractUnique {
    my %seen=();
    my @unique = ();

    foreach my $item (@_) {
        push (@unique, $item) unless $seen{$item}++;
    }
    return @unique;
}

sub Max {
    my $max = shift;
    foreach $item (@_) {
        if ($item > $max) {
            $max = $item;
        }
    }
    return $max;
}

sub MemberQ {
    my ($x, $arrRef) = @_;
    foreach my $item (@$arrRef) {
        if ($x eq $item) {
            return 1;
        }
    }
    return 0;
}


