#!/usr/bin/perl

# Copyright 2013, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20130612

my $usage="Usage: $0 [-t geneticCodeTbl] [-s speciesList] dnaSeq";

# Amino acid seqs hasn't been implemented yet.

# This script takes an aligned DNA or amino acid seq file in fasta
# format (as an argument or STDIN), and calculate the number of
# singleton observed in the data.  When DNA seq is given, it assumes
# the first nucleotide of each sequence in the file corresponds to the
# 1st position of a codon.  Then number of singletons for each of the
# three codon positions are calculated.  Obviously, the three
# positions are meaningles when amino acid sequences are given (just
# use the total number for AA).

# At first, it will print out the locations of singletons:
# e.g.:

#   seq1	3:3:C
#   seq2	8:2:G	16:1:C*
#   seq3	5:2:C	16:1:G*

# The first column is the name of the sequence, and 2nd to the last
# columns indicate the location of singletons in this sequence.  The
# location is formatted as x:y:z, where x is the location of the site
# in the sequence, y is the position in the codon (1, 2, or 3), and z
# is the nucleotide which corrsponds to the singleton.  When "*" is
# seen after a location info, one should be careful with the info.
# For example, consider a site of 6 sequences.  If the nucleotide of 6
# seqs at a site is {-, A, G, C, -, -} (1st, 5th, 6th are gaps),
# number of singletons in this site is 2 (=3-1), NOT 3.  But we do not
# know which of two are the singletons.  So the script arbitrary drop
# one of them (say A), and G, and C are considered as the singletons.
# In this case, "*" are attached to C and G to indicate this kind of
# situation. A base insertion seen only on one sequence is considered
# as uninformative and ignored.  Also a site where all sequences have a gap,
# is ignored.

# If the names of sequences are delimitted by "|" in the name line (which 
# is indicated by ">" in the beginnning of the line), the name will be
# trancated to include only the first part.

# At the end, the script prints out the final counts.

use Getopt::Std;
getopts('ht:s:') || die "$usage\n";

if (defined($opt_h)) {
    die "$usage\n";
}

my %aminoAcid;
my @dnaSeq, @seqNameArray;
my $maxSeqLen;

die "$usage\n" if (@ARGV > 1);

@ARGV = ('-') unless @ARGV; # take STDIN when no arg.
my $dnaFile = shift @ARGV;

# initialize the hash %aminoAcid
if (defined($opt_t)) {  # -t genCodeTbl was given
    open (CODE_TBL, "<$opt_t") || die "ERROR: Can't open $opt_t\n";
    InitGenCode(*CODE_TBL);
    close (CODE_TBL);
} else {                # use the default standard table supplied at the end
    InitGenCode(*DATA);
}

# for debugging
#while (($k, $v) = each %aminoAcid) { print "**$k** => --$v--\n";};

# initialize the @dnaSeq, and @seqNameArray
my $numSeq = ReadInDNA($dnaFile);

# for debugging
# while (($k, $v) = each %aaSeq) { print "**$k** => ++$v++\n"; };

ProcessDna($numSeq, $maxSeqLen);

exit (0);

# Initialize the hashTbl, %aminoAcid{codon},
# by reading in the FH given as the argument
sub InitGenCode {  # take a typeglob of FILEHANDLE as an argument, e.g. *INFILE
    local *FH = shift;
    my $type;
    my @aa, @b1, @b2, @b3;
    my $i;
    my $codon;

    while (<FH>) {
	chomp;
	s/\s+$//;

	if (/^\s*(.*)\s*=/) {  # extract the type of the line
	    $type = $1;
	} else {
	    next;
	}

	s/^\s*(.*)=\s*//;      # get rid of characters before "="

	if ($type =~ /AAs/) {
	    @aa = split (//);
	} elsif ($type =~ /Base1/) {
	    @b1 = split (//);
	} elsif ($type =~ /Base2/) {
	    @b2 = split (//);
	} elsif ($type =~ /Base3/) {
	    @b3 = split (//);
	} else {
	    next;
	}
    }

    if (@aa + @b1 + @b2 + @b3 != 64 * 4) {  # checking the length of arrays
	die "ERROR, Please check the genetic code table is well formatted\n";
    }

    # making a hash table, %aminoAcid, Note all upper cases are used
    for ($i = 0; $i < 64; $i++) {
	$codon = uc ($b1[$i] . $b2[$i] . $b3[$i] );
	$aminoAcid{$codon} = uc $aa[$i];
    }
}

# this function read in dna data and store it in @dnaSeq and @seqNameArray
# It also initialize the $maxSeqLen.
# It returns number of sequences.
sub ReadInDNA {  # takes an arg; name of a file from which data are read
    my $infile = shift;
    my @line;
    my $seqCntr = 0;
    my $seqName;

    open (INFILE, $infile) || die "Can't open $infile\n";

    $maxSeqLen = 0;

    while (<INFILE>) {
	chomp;
	if (/^>/) {  # name line in fasta format
	    s/^>\s*//;
	    @line = split (/\|/);     # note it takes only the name before |
	    $seqNameArray[$seqCntr] = $line[0];
	    $seqNameArray[$seqCntr] =~ s/\s+$//;

	    $maxSeqLen = larger($maxSeqLen, charLen($dnaSeq[$seqCntr-1]))
		if ($seqCntr > 0);

	    $seqCntr++;
	} else {
	    s/^\s+//;
	    s/\s+$//;
	    next if (/^$/);  # skip empty line
	    $dnaSeq[$seqCntr-1] = $dnaSeq[$seqCntr-1] . uc( $_ );
	}
    }

    # need this for the last sequence
    $maxSeqLen = larger($maxSeqLen, charLen($dnaSeq[$seqCntr-1]));

    close(INFILE);
    return ($seqCntr);
}

# count the number of characters in a string
sub charLen {
    my $string = shift;
    my @charString = split (//, $string);
    return scalar(@charString);
}

# this function take two scalars and return the larger value
sub larger {
    my ($a, $b) = @_;

    return (($a > $b) ? $a : $b);
}

# count the number of singletons and print it out
sub ProcessDna {
    my ($numSeq, $seqLen) = @_;
    my $site;
    my $base;
    my @baseCntr;
    my @singletonSeqNum;

    @singletonCntr = (0,0,0,0);

    for ($site = 0; $site < $seqLen; $site++) {
	my $warnFlag = 0;

	@baseCntr = (0,0,0,0,0);
	@singletonSeqNum = (-1,-1,-1,-1,-1);
	for ($i = 0; $i < $numSeq; $i++) {
	    
	    $base = substr($dnaSeq[$i], $site, 1);
	    $base = '-' if ($base eq ''); # end of seq reached.
	    $base =~ tr/-AGCT/-/c;    # change unrecognized character to '-'
	    $base =~ tr/-AGCT/01234/; # code the base to number
	    $baseCntr[$base] ++;
	    $singletonSeqNum[$base] = $i;
	}
	# ignore when single insertion, or all seq has a gap.
	next if ($numSeq - $baseCntr[0] < 2);

	# When the site contains 0 or 1 counts for all of the nucleotide
	# (i.e., A, G, C, and T), one of them shouldn't be considered as
	# singleton.
	# E.g.)  When counts are A:G:C:T = 0:1:1:1,
	# number of singletons should be 3 - 1 = 2.  So I'm setting the
	# one of the count (in this case count for G) to be 0.
	if ($baseCntr[1] < 2 && $baseCntr[2] < 2 && $baseCntr[3] < 2 && 
	    $baseCntr[4] < 2) {
	    $warnFlag = 1;
	    for ($i = 1; $i < 5; $i++) {
		$baseCntr[$i] = 0 if ($baseCntr[$i] == 1);
		last;
	    }
	}

	$singletonCntr[3]++; # this element counts the num of informative sites
	for ($i = 1; $i < 5; $i++) {
	    next if ($baseCntr[$i] != 1);

	    # found a singleton when reached here
	    $singletonCntr[$site % 3]++;
	    my $tmpBase = $i;
	    $tmpBase =~ tr/1234/AGCT/;
	    $singletonLoc = $singletonLoc . ", " . 
		$singletonSeqNum[$i] . ":" . $site . ":" . $tmpBase; 
	    if ($warnFlag == 1) {
		$singletonLoc = $singletonLoc . "*";
	    }
	}
    }
    $singletonLoc =~ s/^, //;

    PrintSingletonLoc($singletonLoc);

    print "\n\nNumber of singletons in $numSeq sequences, $seqLen bases\n";
    print "Number of informative sites " .
	"(insertion in single seq or gap is excluded)" .
	":  $singletonCntr[3]\n\n";
    my $tot = $singletonCntr[0] + $singletonCntr[1] + $singletonCntr[2];
    print "1stCodon: $singletonCntr[0], 2ndCodon: $singletonCntr[1], " .
	"3rdCodon: $singletonCntr[2], total: $tot\n\n";
}

sub PrintSingletonLoc {
    my $locString = shift;

    my @locArray = split (/, /, $locString);
    my @eachRecord;
    my $oldSeqNum = -1;

    @locArray = sortByColumn (0, "a", @locArray);

    print "Location of all singletons:\n";
    print "NameOfSeq\tsiteNumber:codonPosition:nucleotide ...\n";

    foreach my $item (@locArray) {
	@eachRecord = split (/:/, $item);
	$eachRecord[1]++; # increment the site number because $eachRecord[1]
                          # is originally 0-offset.
	my $codonPos = ($eachRecord[1] - 1) % 3 + 1;
	if ($eachRecord[0] == $oldSeqNum) {
	    print "\t$eachRecord[1]:$codonPos:$eachRecord[2]";
	} else {
	    print "\n$seqNameArray[$eachRecord[0]]\t" .
		"$eachRecord[1]:$codonPos:$eachRecord[2]";
	    $oldSeqNum = $eachRecord[0];
	}
    }
    print "\n";
}

sub sortByColumn {
# numerical sort by a column, return an array
#    sortbyColumn ($col_num, $order, @record)
# @record is an array with each element representing a space delimited record
# example
#    ("473 p1 S     0:06 -bash", "541 p2 SW    0:00 ps-a", ....)
# $col_num -- the column by which the record is sorted by (left-most col is 0)
# $order can be "a" (ascending) or "d" (descending),
# sort column can be hyphnated numbers (e.g. 10-4-150)

    local $col_num = shift(@_);
    local $order = shift(@_);
    local @record = @_ ;
    local ($sortCol);
    
    ## test if the sort column is hyphnated or plain number
    local $sortMethod = "number";
    foreach $sortCol (@record) {
	if ( (split(/\s+/,$sortCol))[$col_num] =~ /\d+-\d+/ ) {
	    $sortMethod = "hyphnated";
	    last ;
	}
    }

    return sort $sortMethod @record;

## two sub-sub-routines
    sub number {
	# $col_num, $order are the given arguments
	# the left-most column is 0 
	local $first = (split(/\s+/, $a))[$col_num];
	local $second = (split(/\s+/, $b))[$col_num];
# argument errors not well trapped here
	($first,$second) = ($second, $first) if ($order eq "d");
	
	return ($first <=> $second);
    }

#probably I don't need the "sub number"
    sub hyphnated {
	# $col_num, $order are the given arguments
	local ($each_number, $cmp_result, @temp_swap);

	## separte the hyphnated numbers and put them in the following arrays
        local @first = split(/-/, ((split(/\s+/, $a))[$col_num]));
	local @second = split(/-/, ((split(/\s+/, $b))[$col_num]));

	## ascending (default) or descending order
	if ($order eq "d") {
	    @temp_swap = @first;
	    @first = @second;
	    @second = @temp_swap;
	}
	
	## comparison of array elements
	for ($each_number = 0; $each_number <=
	     (($#first < $#second) ? $#first : $#second) ; $each_number++) {
	    $cmp_result = ($first[$each_number] <=> $second[$each_number]);
	    last if ($cmp_result);
	}

	## if the size of two arrays differ
	if ( ($cmp_result == 0) && ($#first != $#second) ) {
	    return (($#first < $#second) ? -1 : 1);
	} else {
	    return $cmp_result;
	}
    }
}


# The Standard Code (transl_table=1) from NCBI
# http://www3.ncbi.nlm.nih.gov/htbin-post/Taxonomy/wprintgc?mode=c#SG1
__DATA__
  AAs  = FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
Starts = ---M---------------M---------------M----------------------------
Base1  = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
Base2  = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
Base3  = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
