#!/usr/bin/perl
# Gives FASTA seq as an argument or STDIN.
# Calculate the pairwise differences

# Copyright 2013, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20130612

my $usage="Usage: $0 [-ch] [alignedDNASeq]\n" .
 "  -c: eliminate codons which include one or more gaps\n" .
 "      This option can be combined with -e\n" .
 "  -e: eliminate base sites where gaps are observed in at least one seq\n".
 "      Gap sites are eliminated pair-wise without this option\n" .
 " following options: not implemented\n" .
 "  -s: use a file which list names of sequences to be used in the analysis\n".
 "  -n: do not print seq names\n" .
 "  -f: output format, currently only phylip, PAML style if not specfied";

my $sep = "\t";

use Getopt::Std;
getopts('f:hces:n') || die "$usage\n";

if (defined($opt_h)) {
    die "$usage\n";
}

die "$usage\n" if (@ARGV > 1);

@ARGV = ('-') unless @ARGV;
my $dnaFile = shift;

# $mode is set "nucleotide" or "protein"
# read in the data and store them in @seqName and @seqDat
my @dat = ReadInFASTA($dnaFile);

if (defined($opt_c)) {
    @dat = CleanSeqs(@dat);
}

my @seqDat = GetSeqDat(@dat);

#print join "\n", @seqDat, "\n";

my %composi1 =();
my %composi2 =();
my %composi3 =();

for my $s (@seqDat) {
    my @entry = split //, $s;
    for my $i (0..$#entry) {
	$posi = $i % 3;
	$residue = $entry[$i];
	if ($posi == 0) {
	    $composi1{$residue}++;
	} elsif ($posi == 1) {
	    $composi2{$residue}++;
	} else {
	    $composi3{$residue}++;
	}
    }
}

print "Position 1:\t";
my $total = TotalCnt(%composi1);
for $key (sort(keys(%composi1))) {
    print "$key: $composi1{$key}\t";
}
print "total: $total\nPosition 2:\t";
my $total = TotalCnt(%composi2);
for $key (sort(keys(%composi2))) {
    print "$key: $composi2{$key}\t";
}
print "total: $total\nPosition 3:\t";
my $total = TotalCnt(%composi3);
for $key (sort(keys(%composi3))) {
    print "$key: $composi3{$key}\t";
}
print "total: $total\n";


# for debugging
# while (($k, $v) = each %aaSeq) { print "**$k** => ++$v++\n"; };
# foreach $k (keys %synoDiff)
#   {print "**$k** => $synoDiff{$k} $nonSynoDiff{$k}\n";};

exit (0);

sub TotalCnt {
    my %hash = @_;
    $sum = 0;
    for my $key (keys(%hash)) {
	$sum += $hash{$key};
    }
    return $sum;
}

# takes an arg; name of a file from which data are read Then read in
# the data and make an array.  Each element of this array corresponds
# to a sequence, name tab data.
sub ReadInFASTA {
    my $infile = shift;
    my @line;
    my $i = -1;
    my @result = ();
    my @seqName = ();
    my @seqDat = ();

    open (INFILE, "<$infile") || die "Can't open $infile\n";

    while (<INFILE>) {
        chomp;
        if (/^>/) {  # name line in fasta format
            $i++;
            s/^>\s*//; s/^\s+//; s/\s+$//;
            $seqName[$i] = $_;
            $seqDat[$i] = "";
        } else {
            s/^\s+//; s/\s+$//;
	    s/\s+//g;                  # get rid of any spaces
            next if (/^$/);            # skip empty line
            s/[uU]/T/g;                  # change U to T
            $seqDat[$i] = $seqDat[$i] . uc($_);
        }

	# checking no occurence of internal separator $sep.
	die ("ERROR: \"$sep\" is an internal separator.  Line $. of " .
	     "the input FASTA file contains this charcter. Make sure this " . 
	     "separator character is not used in your data file or modify " .
	     "variable \$sep in this script to some other character.\n")
	    if (/$sep/);

    }
    close(INFILE);

    foreach my $i (0..$#seqName) {
	$result[$i] = $seqName[$i] . $sep . $seqDat[$i];
    }
    return (@result);
}

# takes multiple string; each string represent a sequence (name tab seq).
# Get rid of the sites where at least one seq has a gap (or ambiguious base)
# It returns an array of sequences.

sub CleanSeqs {
    my @dat = @_;
    my @seqDat = GetSeqDat(@dat);
    my @seqName = GetSeqName(@dat);

    my $minLength = CharLen($seqDat[0]);
    my ($i, $seqNum);
    my @noGapSites = ();
    my @result = ();
    my @codonMat = ();

    # get rid of codons with gaps    
    foreach $i (@seqDat) {
	my @tmpArray = (defined($opt_c)) ? MkTripletArray($i) : split(//, $i);
	push @codonMat, [ @tmpArray ];
	$minLength = @tmpArray if (@tmpArray < $minLength); # find min length
    }

    # identify the sites without any gaps.
    for $i (0 .. $minLength - 1) {
	my $gap = 0;
	if (defined($opt_c)) {
	    for $seqNum (0 .. $#seqDat) {
		if ($codonMat[$seqNum][$i] !~ /^[ACGT]{3}$/) {
		    $gap = 1;
		    last;
		}
	    }
	} elsif ($mode eq "nucleotide") {
	    for $seqNum (0 .. $#seqDat) {
		if ($codonMat[$seqNum][$i] !~ /^[ACGT]{1}$/) {
		    $gap = 1;
		    last;
		}
	    }
	} else { # protein
	    for $seqNum (0 .. $#seqDat) {
		if ($codonMat[$seqNum][$i] =~ /^[\-\?]{1}$/) {
		    $gap = 1;
		    last;
		}
	    }
	}
	push (@noGapSites, $i) if ($gap == 0);
    }

    # select the sites without any gaps
    for $seq (0 .. $#seqDat) {
	my @oneSeq = ();
	foreach $i (@noGapSites) {
	    push @oneSeq, $codonMat[$seq][$i];
	}
	my $newSeq = join "", @oneSeq;
	push @result, $seqName[$seq] . $sep . $newSeq;
    }

    return @result;
}


sub GetSeqDat {
    my @data = @_;
    my @line;
    my @result = ();

    foreach my $i (@data) {
	@line = split (/$sep/, $i);
	push @result, $line[1];
    }

    return (@result)
}

sub GetSeqName {
    my @data = @_;
    my @line;
    my @result = ();

    foreach my $i (@data) {
	@line = split (/$sep/, $i);
	push @result, $line[0];
    }
    return (@result)
}

# returns an array with the 1st argument repeated n times (2nd arg)
sub Repeat {
    my ($val, $n) = @_;
    my @result = ();
    foreach my $i (0..($n-1)) {
	push @result, $val;
    }
    return @result;
}

# this function take two scalars and return the larger value
sub Smaller {
    my ($a, $b) = @_;
    return (($a < $b) ? $a : $b);
}

sub Min {
    my $min = shift;
    foreach my $i (@_) {
	$min = ($i < $min) ? $i : $min;
    }
    return $min;
}

sub Sum {
    my $sum = 0;
    foreach my $i (@_) { 
	$sum += $i;
    }
    return $sum;
}

# count the number of characters in a string
sub CharLen {
    my $string = shift;
    my @charString = split (//, $string);
    return scalar(@charString);
}

# for a given string, it separates into triplets, and return
# the resulting array.
# if the last element is less than a triplet, it will be removed
sub MkTripletArray {
    my $seq = shift;
    $seq =~ s/\s+//g;
    $seq =~ s/(.{3})/$1 /g;
    $seq =~ s/\s+$//;
    my @result = split(/ /, $seq);
    pop @result unless ($result[$#result] =~ /.{3}/);
    return @result;
}

# When one of the codons corresponds to termination, (-1, -1) is returned.
# If two codons are identical, (0,0) is returned even if the codon 
# correspond to termination.

# take a list as the argument and extract the unique elements.
# The order of elements will not be preserved.
sub ExtractUnique {
    my %seen=();
    my @unique = ();

    foreach my $item (@_) {
        push (@unique, $item) unless $seen{$item}++;
    }
    return @unique;
}
