#!/usr/bin/perl

# This script reads in codemlsites output, and extract the positively
# selected sites for each models.  You can give multiple inputfiles.

# Copyright 2013, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20130612

my $usage="Usage: $0 [-d alignedDNASeq [-f]] [-n] [codemlsites.out]\n" .
    "  -d: PAML aligned DNA Sequence file\n" .
    "  -f: read FASTA instead of PAML\n" .
    "  -n: include filenames in the output";

my $convTblBin = "~/marcylab/seq/pamlConvTbl.pl";

use Getopt::Std;
getopts('hd:fn') || die "$usage\n";
if (defined($opt_h)) {
    die "$usage\n";
}

my $printOutStage = 0;
my $model = -1;

if (defined ($opt_d)) {
    %convTbl=MkSiteConvTbl($opt_d);
    print "model\tsiteNum.paml\taa\tp\tsignificance\tsiteNum\n";
} else {
    print "model\tsiteNum.paml\taa\tp\tsignificance\n";
}

@ARGV = ('-') unless @ARGV;
while ($ARGV=shift) {
    open (ARGV, $ARGV) or warn "Can't open $ARGV: $!\n";
    while (<ARGV>) {
	if (/^Model\s+(\d+):/) {
	    $model = $1;
	    next;
	}
	
	if (/^Positively selected sites Prob/) {
	    $printOutStage=1;
	    next;
	}
	
	if ($printOutStage == 1) {
	    chomp; s/^\s+//; s/\s+$//;
	    next if (/^$/);
	    
	    if (/^Time used:/) {
		$printOutStage = 0;
		next;
	    }
	    
	    unless (/^\d+/) {
		warn "In model $model: $_\n";
		next;
	    }
	    
	    my $sigVal = 0;
	    if (/(\*+)$/) {
		my @stars = split (//, $1);
		$sigVal = @stars;
		s/(\*+)$//;
	    }
	    
	    s/\s+/\t/g;
	    if(defined($opt_n)) {
		print "$ARGV:$model\t";
	    } else {
		print "$model\t";
	    }
	    print;
	    print "\t$sigVal";
	    if (defined($opt_d)) { # print the actual position
		print "\t$convTbl{$1}\n" if (/^\s*(\d+)/);
	    } else {
		print "\n";
	    }
	}
    }
}

sub MkSiteConvTbl {
    my $file = shift;

    my $convTblOpt = "-f" if (defined($opt_f));
    open (INPIPE, "$convTblBin $convTblOpt -c $file|") ||
	die "Can't run $convTblBin\n";

    my $header = <INPIPE>;
    warn "The first line of $convTblBin output is \"$header\", " .
	"but expecting \"PAML\tOrig\"\n" if ($header !~ /^PAML\tOrig/);
    
    my %resHash = ();
    while (<INPIPE>) {
	my @line = split;

	if (exists ($resHash{$line[0]})) {
	    die "There are two lines with same site #\n";
	}
	$resHash{$line[0]} = $line[1];
    }
    close (INPIPE);

    return (%resHash);
}
