#!/usr/bin/perl -w

#  Takes sites output and create lines appropriate for mlhka

# Copyright 2013, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20130612

my $usage = "Help to create mlhka input file\n".
    "Usage:  $0 [-hc] [SITES_inputFile]\n\n".
    " -c: print the SITES command which this script use\n" .
    " -h: help\n\n".
    "Takes SITES input file with at least two groups defined.\n".
    "If no filename is given, it take STDIN as the input\n\n" .
    "Runs SITES and extract the information for the inputfile of mlhka\n".
    "It runs sites with -Szxis: remove position with >2 bases, remove\n".
    "sites with ANY gaps (data-set wide), and use silent (non-coding + \n".
    "synonymous) mutation.\n".
    "SITES print out the total length of sequences, not the number of silent\n".
    "sites.  This script correct the problem, and the seq length printed by\n".
    "this script is (total analyzed length) - (average replacement sites)\n".
    "Output:\n".
    "If there are two groups in the input file, it will print four mlhka lines.\n".
    "1. sp2 is considered as outgroup, use all sp2 and divergence is ave. distance\n".
    "2. 1 sequence of sp2 is considered as outgroup (single line)\n".
    "3. sp1 is considered as outgroup\n".
    "4. 1 sequence of sp.1 is considered as outgroup\n".
    "You need to pick 1 appropriate line and copy it to 'infile.txt' for mlhka.\n".
    "The name of mlhka input file has to be 'infile.txt'.\n".
    "The 1st column should be the gene name, edit it appropriately.\n".
    "\n6-th column of the output is initial theta (PER SITE) value.\n" .
    "The script use Watterson's theta estimate for this.\n" .
    "The last column may need to be modified if your gene is not autosomal.\n";

use IO::File;
use POSIX qw(tmpnam);

our ($opt_h, $opt_c, $opt_d);
use Getopt::Std;
getopts('hcd') || die "$usage\n";

die "$usage\n" if (defined($opt_h));
die "$usage\n" if (@ARGV > 1);

@ARGV = ('-') unless @ARGV; # take STDIN when no arg.

my $file = shift;


my $outFile="TEMP_OUTPUT";

if (defined($opt_c)) {
    $file = "INPUT_FILE";
    print "sites -I$file -R$outFile -M'' -Szxis -Ap -Ohgs > /dev/null\n";
    exit;
}

my $fh;
do {$outFile = tmpnam()}
until $fh = IO::File->new("$outFile.SIT", O_RDWR|O_CREAT|O_EXCL);
if (defined($opt_d)) { # for debugging
    warn "INFO: Temporary SITES output is $outFile.SIT, for debugging\n";
}

system("sites -I$file -R$outFile -M'' -Szxis -Apc -Ohgs > /dev/null");

open SITESOUT, "<$outFile.SIT" || die "cannot open $outFile.SIT\n";
my $state = 0;
my $cntr = 0;
my @extracted = ();
my %info = ();
while (<SITESOUT>) {

    if ($state == 0) {
	if (/^\s*AVERAGE NUMBER OF SYN. SITES\s*:\s*([\d\.]+)\s+REPLACEMENT SITES\s*:\s*([\d\.]+)\s*/) {
	    $info{'synSites'} = $1;
	    $info{'nonSynSites'} = $2;
	    $state++;
	}
	next;
    }
    
    if ($state == 1) {
	$state ++ if (/^\s*Formatted Values for Input to HKA Program\s*$/);
	next;
    }

    if ($state == 2){
	if (/^\s*Fu & Li D for group 2\s*$/) {
	    $state ++;
	}
	next;
    }

    if ($state == 3) {
	if (/^_+\s*$/) {
	    $state++;
	    last;
	}
	s/^\s+//;
	s/\s+$//;
	
	push @extracted, $_;
	next;
    }

    last if ($state ==4);
}

if ($state != 4) {
    die "ERROR: it didn't find complete info, last state is $state\n";
}

if (@extracted < 2) {
    die "ERROR: couldn't find at least two lines of formated text";
}
if (@extracted %2 != 0) {
    die "ERROR: HKA formated text isn't multpile of 2:\n" . 
	join("\n", @extracted), "\n";
}

my @names = ();
my @dat = ();
foreach my $i (0..$#extracted) {
    if ($i % 2 == 0)  { # name line
	$extracted[$i] =~ s/\s+/ /g;
	$extracted[$i] .= " (2nd one = outgroup)";
	push @names, $extracted[$i];
    } else {
	$extracted[$i] =~ s/low ss/ lowSS /g;
	my @line = split /\s+/, $extracted[$i];
	my $segSites = $line[7];
	my $seqLen = $line[2] - $info{'nonSynSites'} ;
	# note $line[2] is the analyzed seq length, not num silent sites
	# So by substracting nonSynSites, this become length of intron + syn.
	# This value is not exactly same as DNAsp, which probably uses
	# slightly different method of calculating number of syn sites
	my $sampleSize = $line[5];

	my $theta = ($sampleSize > 1) ? 
	    $segSites/WattersonCoef($sampleSize)/$seqLen : "NA";

	if ($sampleSize > 1 && $segSites < 1) {
	    warn "WARN: watterson's theta estimate is 0 (no segregating sites),".
		"using 0.000001 as the initial theta to avoid problems ".
		"with mlhka (mlhka doesn't like initial theta of 0)\n"; 
	    $theta = "0.000001"; # avoiding printing out in '1e-06' by ""
	}
	$seqLen = MyRound($seqLen); # rounding to integer;
	
	push @dat, "$line[0]\t$seqLen\t$segSites\t$sampleSize\t$line[9]\t$theta\t$line[1]";
	# id, seqLen SegSites sampleSize div starttheta scalar
    }
}

print "1geneID\t2seqLen\t3SegSites\t4sampleSize\t5divergence\t6starttheta\t7scalar\n";
for my $i (0..$#names) {
    print "$names[$i]\n$dat[$i]\n";
}

unlink ("$outFile.SIT") || die "Can't unlink $outFile.SIT\n";

exit;

sub WattersonCoef {
    my $sampleSize = shift;
    
    my $result = 0;
    for (my $i = $sampleSize - 1; $i > 0; $i--) {
	$result += 1/$i;
    }

    return $result;
}

sub MyRound {
    my $num = shift;

    return int($num + 0.5 * ($num <=> 0));
}

#  Values for Input to HKA Program 
#  --------- ------ --- ----- -- --- ------- 

#  Lines are generated for each ordered species pair, and also with each 2nd species as single sequence

#   Value Order : 
#            locus identifier (input data filename upto 10 characters)
#            inheritance scalar (1.0 for autosomal is default - this must be edited: 0.75 X -linked; 0.25 mitochondrial or Y-linked)
#            mean sequence length group1 
#            mean sequence length group2 
#            mean sequence length between group 
#            # of sequences group1 
#            # of sequences group2 
#            # of polymorphic sites group1 
#            # of polymorphic sites group2 
#            mean pairwise divergence between group 
#            Tajima D  for group 1 
#            Tajima D  for group 2 
#            Fu & Li D for group 1 
#            Fu & Li D for group 2 
# lyrpet       thal         
# data/genic 1.0     1886     1886     1886  28  19  46  10     94 -0.5753  -1.2559  -0.5611  -1.6868  
# lyrpet          thal         (single line)
# data/genic 1.0     1886     1886     1886  28   1  46   0     94 -0.5753  -1.2559  -0.5611  -1.6868  
# thal         lyrpet       
# data/genic 1.0     1886     1886     1886  19  28  10  46     94 -1.2559  -0.5753  -1.6868  -0.5611  
# thal            lyrpet       (single line)
# data/genic 1.0     1886     1886     1886  19   1  10   0     94 -1.2559  -0.5753  -1.6868  -0.5611  
# _____________________________________________________________________________
