#!/usr/bin/perl

# Copyright 2013, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20130612

my $usage="Usage: $0 [-t geneticCodeTbl] alignedAASeq dnaSeq";

use Getopt::Std;
getopts('ht:') || die "$usage\n";

if (defined($opt_h)) {
    die "$usage\n";
}

my %aminoAcid;
my %aaSeq;

die "$usage\n" if (@ARGV != 2);

my ($aaFile, $dnaFile) = @ARGV;

# initialize the hash %aminoAcid
if (defined($opt_t)) {  # -t genCodeTbl was given
    open (CODE_TBL, "<$opt_t") || die "ERROR: Can't open $opt_t\n";
    InitGenCode(*CODE_TBL);
    close (CODE_TBL);
} else {                # use the default standard table supplied at the end
    InitGenCode(*DATA);
}

# for debugging
#while (($k, $v) = each %aminoAcid) { print "**$k** => --$v--\n";};

# initialize the aaSeq
ReadInAlignedAA($aaFile);

# for debugging
# while (($k, $v) = each %aaSeq) { print "**$k** => ++$v++\n"; };

ProcessDna($dnaFile);

exit (0);

# Initialize the hashTbl, %aminoAcid{codon},
# by reading in the FH given as the argument
sub InitGenCode {  # take a typeglob of FILEHANDLE as an argument, e.g. *INFILE
    local *FH = shift;
    my $type;
    my @aa, @b1, @b2, @b3;
    my $i;
    my $codon;

    while (<FH>) {
	chomp;
	s/\s+$//;

	if (/^\s*(.*)\s*=/) {  # extract the type of the line
	    $type = $1;
	} else {
	    next;
	}

	s/^\s*(.*)=\s*//;      # get rid of characters before "="

	if ($type =~ /AAs/) {
	    @aa = split (//);
	} elsif ($type =~ /Base1/) {
	    @b1 = split (//);
	} elsif ($type =~ /Base2/) {
	    @b2 = split (//);
	} elsif ($type =~ /Base3/) {
	    @b3 = split (//);
	} else {
	    next;
	}
    }

    if (@aa + @b1 + @b2 + @b3 != 64 * 4) {  # checking the length of arrays
	die "ERROR, Please check the genetic code table is well formatted\n";
    }

    # making a hash table, %aminoAcid, Note all upper cases are used
    for ($i = 0; $i < 64; $i++) {
	$codon = uc ($b1[$i] . $b2[$i] . $b3[$i] );
	$aminoAcid{$codon} = uc $aa[$i];
    }
}

sub ReadInAlignedAA {  # takes an arg; name of a file from which data are read
    my $infile = shift;
    my @line;

    open (INFILE, $infile) || die "Can't open $infile\n";

    while (<INFILE>) {
	chomp;
	if (/^>/) {  # name line in fasta format
	    s/^>\s*//;
	    @line = split (/\|/);     # note it takes only the name before |
	    $line[0] =~ s/\s+$//;
	} else {
	    s/^\s+//;
	    s/\s+$//;
	    next if (/^$/);  # skip empty line
	    $aaSeq{$line[0]} = $aaSeq{$line[0]} . $_;
	}
    }

    close(INFILE);
}

sub ProcessDna {
    my $file = shift;
    my @line;
    my $dnaSeq = "";
    my $alignedDna;

    open (INFILE, "<$file") || die "ERROR: Can't open $file\n";

    while(<INFILE>) {
	chomp;
	if (/^>/) {
	    PrintSeq($aaSeq{$name}, $dnaSeq, $name); # align dna and print it
	    print "$_\n";
	    s/^\s+//;
	    @line = split (/\|/);
	    $line[0] =~ s/\s+$//; $line[0] =~ s/^>\s*//;
	    $name = $line[0];
	    $dnaSeq = "";
	} else {
	    s/^\s+//; s/\s+$//;
	    $dnaSeq = $dnaSeq . $_;
	}
    }
    PrintSeq($aaSeq{$name}, $dnaSeq, $name); # take care of the last line
    close (INFILE);
}

sub PrintSeq {
    my ($aaSeqString, $dnaSeqString, $name) = @_;
    
    if ($dnaSeqString ne "") {
	my $alignedDna = InsertGap($aaSeqString, $dnaSeqString, $name);
	if ($alignedDna =~ /\d+/) {
	    print STDERR "ERROR: $alignedDna-th amino acid of ",
	    "$name does not match the DNA seq.\n";
	    die "Please correct the problem in $aaFile or $dnaFile\n";
	}
	print "$alignedDna\n";
    }

}

sub InsertGap {
    my ($aaSeqString, $dnaSeqString, $name) = @_;

    my @aaSeqArray = split //, $aaSeqString;
    my @dnaSeqArray = split //, $dnaSeqString;

    my $i;
    my $alignedSeq = "";
    my $thisCodon, $thisCodonCopy;

    for ($i = 0; $i < @aaSeqArray; $i++) {
	if ($aaSeqArray[$i] eq "-") {
	    $alignedSeq = $alignedSeq . "---";
	    next;
	} else {
	    $thisCodonCopy = join('', splice @dnaSeqArray, 0, 3);
	    $thisCodon = uc ($thisCodonCopy);
	    $thisCodon =~ s/U/T/g;                 # converting RNA to DNA

	    # checking to see the dna triplet matchs the aa.
	    if (uc($aaSeqArray[$i]) eq 'X') {
		warn "WARN: aa sequence of \'$name\' contains X\n";
	    } elsif(uc($aaSeqArray[$i]) ne $aminoAcid{$thisCodon}) {
		# If the very last base of the DNA sequence is missing,
		# one may still be able to determine the last aa.
		# e.g. If $thisCodon == "GG", then we know it corresponds to G.
		
		# CheckLastBase returns 1 when 2 bases are enough to determine
		# the amino acid ambiguously.
		if (scalar(split(//,$thisCodon)) != 2 || # checking the length
		    CheckLastBase($thisCodon, $aaSeqArray[$i]) != 1) {
			return ($i+1);        # mismatch bet the codon and aa
		    }
	    }
	    $alignedSeq = $alignedSeq . $thisCodonCopy;
	}
    }

    if (@dnaSeqArray != 0) {
	warn "The length of dna seq ($name) is longer than aa seq by " . 
	    scalar(@dnaSeqArray) . " bases.\n";
    }

    return $alignedSeq;
}

sub CheckLastBase {
    my ($twoDNA, $aa) = @_;
    if ($aminoAcid{$twoDNA . "T"} eq $aa && $aminoAcid{$twoDNA . "C"} eq $aa &&
	$aminoAcid{$twoDNA . "A"} eq $aa && $aminoAcid{$twoDNA . "G"} eq $aa) {
	return 1;    # returns 1 if the third base doesn't matter
    }
    return 0;        # returns 0 if two bases are NOT enough to ensure the AA
}

# The Standard Code (transl_table=1) from NCBI
# http://www3.ncbi.nlm.nih.gov/htbin-post/Taxonomy/wprintgc?mode=c#SG1
__DATA__
  AAs  = FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
Starts = ---M---------------M---------------M----------------------------
Base1  = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
Base2  = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
Base3  = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
