#!/usr/bin/perl

# Copyright 2013, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20130612

my $usage="Usage: $0 [-hmt] [codeml_output]\n" .
    " -m: the codeml_output contains multiple genes\n" .
    " -t: eliminate the termination codons\n" .
    "Extract the codon usage counts table from codeml_output.\n" .
    "Order of bases: T C A G\n";

use Getopt::Std;
getopts('hmt') || die "$usage\n";
die "$usage\n" if (defined($opt_h));

$tableStart = (defined($opt_m)) ? 
    'Codon usage in genes' : 'Sums of codon usage counts';

@doc = <>;

my @matchingIndex = IndexOfMatchingPattern (\@doc, $tableStart);

if (@matchingIndex != 1) {
    die "ERROR: document contains " . scalar(@matchingIndex) . " lines " .
	"which match the pattern.  The pattern we are looking for is:\n".
	    "$tableStart\nSpecify another unique signal by modifying a " .
		"variable \$tableStart\n";
}

my $tbl1st = @matchingIndex[0];

die "ERROR: line " . $tbl1st+3 . "is supposed to contains the very 1st line " .
    "of data. But it doesn't follow the codon Tbl format\n"
    unless ($doc[$tbl1st+1] =~ /^-+\n$/ && $doc[$tbl1st+6] =~ /^-+\n$/ &&
	    $doc[$tbl1st+11] =~ /^-+\n$/ && $doc[$tbl1st+16] =~ /^-+\n$/ &&
	    $doc[$tbl1st+21] =~ /^-+\n$/);

$tbl1st += 2; # now pointing to the 1st data

my $numCol = -1;
my @codonTbl = ();
for my $i (0..3, 5..8, 10..13, 15..18) {
    my $dat = $doc[$tbl1st + $i];
    $dat =~ s/[a-zA-Z\*\|]//g;
    $dat =~ s/^\s+//; $dat =~ s/\s+$//;
    my @line = split /\s+/, $dat;
    if ($numCol == -1) {
	$numCol = @line;
    } else {
	die "ERROR: CodonTable doesn't have same number of columns\n" 
	    if ($numCol != @line);
    }
    push  @codonTbl, [ @line ];
}

my $numGenes = int($numCol / 4);

my @result =();
for my $gene (0..($numGenes-1)) {
    my @cnt = ();
    my $col;
    for ($col = $gene; $col < $numCol; $col += $numGenes) {
	for my $row (0..15) {
	    push @cnt, $codonTbl[$row][$col];
	}
    }
    my @cntReordered  = ();
    for my $i ( 0..3 , 16..19, 32..35, 48..51,
	        4..7 , 20..23, 36..39, 52..55,
		8..11, 24..27, 40..43, 56..59,
	       12..15, 28..31, 44..47, 60..63) {
	push @cntReordered, $cnt[$i];
    }

    if (defined($opt_t)) { # remove termination codons
	splice(@cntReordered, 14,1);
	splice(@cntReordered, 10,2);
    }

    my $tmp = join ",", @cntReordered;

    push @result, $tmp;
}


print join "\n", @result;
print "\n";

exit(0);

sub IndexOfMatchingPattern {
    my ($arrayRef, $pattern) = @_;
    
    my @indexArray = ();
    for my $i (0..(scalar(@$arrayRef)-1)) {
	my $line = $$arrayRef[$i];
	if ($line =~ /$pattern/) {
	    push @indexArray, $i;
	}
    }
    return @indexArray;
}
