#!/usr/bin/perl -w

# Copyright 2013, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20130612

my $usage = "\nUsage: $0 [-hg] [-w windowSize,incr] [-c V|Y|M|I|C|E|P|A|D|F] [sitesInputFile]\n\n".
    " -g: By default, sites with any gaps are ignored, but this option will\n".
    "     let SITES to include these sites with gaps in the analysis\n".
    " -c: specify non-canonical genetic code\n".
    "Runs SITES polymorphism analysis, and it prints outs the output in\n".
    "tab delimited format.\n\n".
    "- Sites with any gaps are removed from analysis, and sites with\n".
    "  multiple hits (more than 2 bases) are removed, i.e. -Sxz is specified.\n".
    "- Run sites with five different types of site choice:\n".
    "  All site types (a), silent base changes (is), noncoding base changes (i)\n".
    "  synonymous coding sites (s), non-synonymous/replacement coding sites (r).\n".
    "- Comparison within groups (-Og) is specified\n" . 
    "- Note that piPerBp and thetaPerBp is different from per site measure\n".
    "  pi and theta per gene is divided by the number of all kinds of sites\n".
    "  used in the analysis.  Hence, for site types other than all (a), you need to\n".
    "  divide pi or theta by appropriate number of sistes (e.g. synSites, \n".
    "  nonSynSites, nonCodingSites column)\n\n".
    "- Each analysis of windowing analysis contains total base of windowSize.\n".
    "  The sites with any gaps are removed and not counted in the windowSize.\n".
    "- Even if you specify windowSize of 200, actual number of bp analyzed\n".
    "  may be slightly smaller if there are sites with multiple hits.\n".
    "- With analysis other than site type (a), windowSize refer to the\n".
    "  total number of site, not the number of site of given type \n".
    "  (e.g. synonymous)\n".
    "- increments (incr) is after removing the gap sites. Say that there\n".
    "  is a gap between 200-300, and incr of 5 is specified.\n".
    "  If it calculate the polymorphism centered around 198, then the next\n".
    "  calculation is centered around 304 ".
    "  (because 198, 199, gap --- gap, 301, 302, 303, 304)\n";


use POSIX qw (tmpnam);
use IO::File;

my $debug = 0;

our ($opt_h, $opt_w, $opt_g, $opt_d, $opt_c);
use Getopt::Std;
getopts('hgdw:c:') || die "$usage\n";

if (defined($opt_h)) {
    die "$usage\n";
}

if (defined($opt_c) && $opt_c !~ /^[VYMICEPADF]$/) {
    warn "$usage\n";
    die "ERROR: For -c, please specify one of V, Y, M, I, C, E, P, A, D, or F.\n";
}
my $sitesCmd = (defined($opt_c)) ? "echo $opt_c | sites" : "sites";
my $altGenCode = (defined($opt_c)) ? "c" : "";

die "$usage\n" if (@ARGV > 1);

@ARGV = ('-') unless @ARGV; # take STDIN when no arg.
my $sitesInFile = shift @ARGV;

my $tmpOut = '/tmp/ooojjjj';
my $fh;

do {$tmpOut = tmpnam()}
until $fh = IO::File->new("$tmpOut.SIT", O_RDWR|O_CREAT|O_EXCL);
if (defined($opt_d)) { # for debugging
    warn "INFO: Temporary SITES output is $tmpOut.SIT, for debugging\n";
}
$fh -> close;
system("touch $tmpOut.SIT");

# run polymorphism analysis (-Ap)
# -Og for comparisons within groups


my %sitesDat = ReadInSites($sitesInFile);


if ($debug) {
    my @key = qw(comment numSeq alignedLen frame numNonCodingStretches numGroups);
    foreach my $k (@key) {
	print "$k => $sitesDat{$k}\n";
    }
    
    print "range = ";
    foreach my $k (0..($sitesDat{numNonCodingStretches}-1)) {
	#print join ("-", @{${$sitesDat{'ncRange'}}[$k]}), "\n";
	print join ("-", @{$sitesDat{'ncRange'}->[$k]}), "\n";
	print $sitesDat{'ncRange'}->[$k]->[0], "-", $sitesDat{'ncRange'}->[$k]->[1],"\n";

    }
    print "groups = ", join (":", @{$sitesDat{'groupNames'}}), "\n";
    print "grpSizes = ", join (":", @{$sitesDat{'groupSizes'}}), "\n";
    print "allGapSites = ", join (":", @{$sitesDat{'allGapSites'}}), "\n";
    print "anyGapSites = ", join (":", @{$sitesDat{'anyGapSites'}}), "\n";
    my @nnn = @{$sitesDat{'seqNames'}};
    my @seqDatMat = @{$sitesDat{'seqMat'}};
    foreach my $i (0..$#nnn) {
	my $len = $sitesDat{'alignedLen'};
	print "$nnn[$i] = ", 
	join ("", @{$seqDatMat[$i]}[0..19], "...", @{$seqDatMat[$i]}[($len-10)..($len-1)]), "\n";
    }
}

if (defined($opt_w)) {
    my ($winSize, $incr);
    if ($opt_w =~ /^\s*(\d+)\s*,\s*(\d+)\s*/) {
	$winSize = $1;
	$incr = $2;
    } else {
	die "$usage\n\nSpecify the window size and increments for -w\n";
    }
    my @windowRes = RunWindowAnalysis($sitesInFile, $winSize, $incr);

    print join("\n", @windowRes), "\n";
    exit;
}

PrintSummary($sitesInFile);
if (!defined($opt_d)) {
    unlink ("$tmpOut.SIT") || die "Can't unlink $tmpOut.SIT\n";
}
exit;

sub PrintSummary {
    my $sitesInFile = shift;

    my $sOpt = (defined($opt_g)) ? "" : "x";
    # all site -Sa
    system("$sitesCmd -I$sitesInFile -R$tmpOut -M \"Analysis\" -Sza$sOpt -Aa -Ogs$altGenCode > /dev/null");

    my %result = ReadSitesOut("$tmpOut.SIT");
    CheckSitesInfo(%result);
    #PrintSitesInfoVert("header",\%result);
    my $outputString = PrintSitesInfo("header",\%result);

    $outputString = "mutType\t" . $outputString;
    $outputString =~ s/\n/\nall\t/;

    # silent base changes -Sis
    system("$sitesCmd -I$sitesInFile -R$tmpOut -M \"Analysis\" -Szis$sOpt -Aa -Ogs$altGenCode > /dev/null");
    %result = ReadSitesOut("$tmpOut.SIT");
    CheckSitesInfo(%result);
    $outputString .= "\nsilent\t" . PrintSitesInfo("noHeader",\%result);

    # noncoding base changes -Si
    system("$sitesCmd -I$sitesInFile -R$tmpOut -M \"Analysis\" -Szi$sOpt -Aa -Ogs$altGenCode > /dev/null");
    %result = ReadSitesOut("$tmpOut.SIT");
    CheckSitesInfo(%result);
    $outputString .= "\nnoncoding\t" . PrintSitesInfo("noHeader",\%result);

    # synonymous base changes -Ss
    system("$sitesCmd -I$sitesInFile -R$tmpOut -M \"Analysis\" -Szs$sOpt -Aa -Ogs$altGenCode > /dev/null");
    %result = ReadSitesOut("$tmpOut.SIT");
    CheckSitesInfo(%result);
    $outputString .= "\nsyn\t" . PrintSitesInfo("noHeader",\%result);

    # non-synonymous base changes -Sr
    system("$sitesCmd -I$sitesInFile -R$tmpOut -M \"Analysis\" -Szr$sOpt -Aa -Ogs$altGenCode > /dev/null");
    %result = ReadSitesOut("$tmpOut.SIT");
    CheckSitesInfo(%result);
    $outputString .= "\nnonsyn\t" . PrintSitesInfo("noHeader",\%result);

    print "$outputString\n";
}

sub RunWindowAnalysis {
    my ($sitesInFile, $window, $incr) = @_;

    my %sitesIn = ReadInSites($sitesInFile);

    my $alnLen = $sitesIn{'alignedLen'};
    my @gapSites = @{$sitesIn{'anyGapSites'}};

    my @allSites = 1..$alnLen;
    my @sitesUsed = InANotInB(\@allSites, \@gapSites);
 
    my $frontOffset = MyRound(($window - 1) /2);
    my $backOffset = $window - 1 - $frontOffset;
    
    my @siteSelection = ('a', 'is', 'i', 's', 'r');
    # correspond to all, silent, non-coding, syn, non-syn
    my @statType = qw(pi thetaW tajimaD fuliDnoOG fuliD);
    my $index;
    my @allResults = ();
    my $sOpt = (defined($opt_g)) ? "" : "x";

    for ($index = $frontOffset; $index < @sitesUsed-1-$backOffset; $index += $incr) {
	my $begin = $sitesUsed[$index-$frontOffset];
	my $end = $sitesUsed[$index+$backOffset];
	my $middle = $sitesUsed[$index];

	$thisResult = "$middle\t$begin\t$end";
	foreach my $siteSel (@siteSelection) {
	    system("$sitesCmd -I$sitesInFile -R$tmpOut -M \"Analysis\" -Sz$sOpt$siteSel -Aa -Ogsm$altGenCode -Y${begin}-${end} > /dev/null");
	    my %result = ReadSitesOut("$tmpOut.SIT");
	    CheckSitesInfo(%result);

	    # get the number of sites in each category once
	    if ($siteSel eq $siteSelection[0]) {
		my $countSilentSites = 
		    $result{'synSites'} + $result{'nonCodingSites'};
		$thisResult .= "\t" . 
		    join("\t",($result{'analyzedSeqLen'}, $countSilentSites, 
			       $result{'nonCodingSites'}, $result{'synSites'}, 
			       $result{'nonSynSites'}));
	    }
	    foreach my $ss (@statType) {
		$thisResult .= "\t$result{$ss}";
	    }
	}
	push @allResults, $thisResult;
    }

    my $header = "siteMedian\tsiteBegin\tsiteEnd";
    $header .= "\tallSites\tsilentSites\tnonCodingSites\tsynSites\tnonSynSites";
    
    foreach my $ttt (@siteSelection) {
	foreach my $sss (@statType) {
	    $header .= "\t$sss.$ttt";
	}
    }
    unshift @allResults, $header;

    return @allResults;
}

sub CheckSitesInfo {
    my %info = @_;

    my @segSites = split /\t/, $info{'segSites'};
    my @totalMu = split /\t/, $info{'totalMu'};

    my $diff = 0;
    for my $i (0..$#totalMu) {

	if ($segSites[$i] ne $totalMu[$i] && $segSites[$i] ne 'NA') {
	    $diff = 1;
	    last;
	}	
    }

    if ($diff == 1) {
	warn "CHECK $info{'segSites'} END\n";
	warn "WARN: SegSites ($info{'segSites'}) and totalMu ($info{'totalMu'}) differ\n";
    }

    
}

# Returns a information hash with the following keys:
#   numExcludedIndels  number of sites excluded due to indels
#   numExcludedMultiBases number of sites excluded because there are > 2 bases
#   numSeq total number of sequences in the analysis (ignoring grouping).
#   alignedLen aligned sequence length in the entire data set

#   synSites  average number of synonymous sites used in the analysis
#   nonSynSites average number of non-synonymous sites used in the analysis
#   nonCodingSites analyzedSeqLen - synSites - nonSynSites


#   numGroups number of subgroups
#   names names of groups
#   divergence average pairwise differences between groups
#   netDivergence net average pairwise differences between groups
#   fixedDiff fixed difference between groups
#   Fst  
#   Nm population migration rates assuming diploidy
#   tajimaD
#   numSeqs number of sequences in each subgroup
#   fuliD
#   fuliDnoOG
#   totalMu  number of segregating sites
#   derivedSingleton number of derived singletons (only with outgroup)
#   singleton number of singletons
#   analyzedSeqLen  sequence lengths used for analysis of each group
#   segSites  number of segregating sites, should be same as totalMu
#   thetaW  watterson's theta estimate per gene
#   thetaWPerBp  watterson's theta estimate per base pair (not per site)
#   pi average pairwise differences per gene
#   piPerBp average pairwise differences per base pair (not per site)

sub ReadSitesOut {
    my $sitesFileName = shift;
    
    open SITESOUT, "<$sitesFileName" || die "Can't open $sitesFileName\n" ;

    my $state = 0;
    my %info=();
    my $cntr = 0;
    
    if (defined ($opt_g)) {  # INDELS section doesn't exist
	$state = 2;
	$info{'numExcludedIndels'} = 0;
    }

    # note theta is not estimated for the group with only 1 sequence.
    my @singleSeqPerGroupIndex = ();
    {
	my $index = 0;
	foreach my $grpSize (@{$sitesDat{'groupSizes'}}) {
	    if ($grpSize <2) {
		push @singleSeqPerGroupIndex, $index;
	    }
	    $index++;
	}
    }


    while (<SITESOUT>){
	chomp;
	if ($state == 0 && /^\*\*\*POSITIONS WITHIN INDELS WERE EXCLUDED/) {
	    $state++; next;
	}
	if ($state == 1) {
	    if (/NUMBER OF POSITIONS EXCLUDED\s*:\s*(\d+)\s*/) {
		$info{'numExcludedIndels'} = $1;
		$state++;
	    }
	    next;
	} 
	if ($state == 2 && 
	    /^\*\*\*POSITIONS WITH MORE THAN TWO BASE VALUES EXCLUDED/) {
	    $state++; next;
	}
	if ($state == 3) {
	    if (/NUMBER OF POSITIONS EXCLUDED\s*:\s*(\d+)\s*/) {
		$info{'numExcludedMultiBases'} = $1;
		$state++;
	    }
	    next;
	}
	if ($state == 4 && 
	    /^NUMBER OF SEQUENCES\s*:\s*(\d+)\s+LENGTH OF SEQUENCES\s*:\s*(\d+)/) {
	    $info{'numSeq'} = $1;
	    $info{'alignedLen'} = $2;
	    $state++;
	    next;
	}
	
	if ($state == 5) {
	    if (/^\s*AVERAGE NUMBER OF SYN. SITES\s*:\s*([\d\.]+)\s+REPLACEMENT SITES\s*:\s*([\d\.]+)\s*/) {
		$info{'synSites'} = $1;
		$info{'nonSynSites'} = $2;
		$state++;
	    }
	    next;
	}

	if ($state == 6 && /^GROUP DIFFERENCES PER BASE PAIR/) {
	    $state++; next;
	}
	if ($state == 7) {
	    next if (/^\s+- above and on the diagonal/);
	    next if (/^\s+- below the diagonal/);
	    if (/^(\s+\d+)+\s*$/) {
		s/^\s+//; s/\s+$//;
		my @groups = split /\s+/;
		if ($groups[$#groups] != @groups) {
		    warn "WARN: number of groups ($groups[$#groups]) may not be estimated correctly\n"; 
		}
		$info{'numGroups'} = $groups[$#groups];

		$state++;
		$cntr = 0;  # set the counter to 0
	    }
	    next;
	}

	# extracting the divergence and net-divergence
	if ($state == 8) {
	    if ($info{'numGroups'} < 2) {
		$info{'divergence'} = 'NA';
		$info{'netDivergence'} = 'NA';
		$info{'names'} = 'NA';
		$state++;
		next;
	    }

	    next if (/^\s*-+\s*$/);

	    s/\|//g;
	    s/^\s*\d+:\s*//;
	    s/\s+$//;
	    my @line = split /\s+/;
	    my $grName = shift @line;

	    if (defined ($info{'names'})) {
		$info{'names'} .= "\t$grName";
	    } else {
		$info{'names'} = $grName;
	    }
	    
	    # extract above diagonal: ave pw diff per base pair
	    foreach my $colIndex (($cntr+1)..($info{'numGroups'}-1)) {
		if (defined($info{'divergence'})) { # : separated list
		    $info{'divergence'} .= "\t$line[$colIndex]";
		} else {
		    $info{'divergence'} = "$line[$colIndex]";
		}
	    }
	    #extract below diagnonal: net pw diff per base pair
	    if ($cntr>0) {
		foreach my $colIndex (0..($cntr-1)) {
		    if (defined($info{'netDivergence'})) { # tab separated list
			$info{'netDivergence'} .= "\t$line[$colIndex]";
		    } else {
			$info{'netDivergence'} = "$line[$colIndex]";
		    }
		}
	    }

	    #print "$info{'names'} &&& $info{'divergence'} &&& $info{'netDivergence'}\n"; #debug
	    if ($cntr < $info{'numGroups'}-1) {
		$cntr++;
	    } else {
		$state++;  # get out of this extraction loop
	    }
	    next;
	}

	if ($state == 9) {
	    if ($info{'numGroups'} < 2) {
		$info{'fixedDiff'} = 'NA';
		$state += 2;  #FIXED DIFFERENCES section doesn't exist
		next;
	    }
	    
	    if (/^FIXED DIFFERENCES/) {
		$state++; 
		$cntr = 0;
	    }
	    next;
	}
	
	# Fixed differences
	if($state == 10) {
	    next if (/^[\s-]+$/);
	    next if (/^(\s*\d+)+\s*$/);
	    
	    s/^\s*\d+:\s*//;
	    s/\s+$//;
	    my @line = split /\s+/;
	    my $grName = shift @line;
	    
	    # extract above diagonal: ave pw diff per base pair
	    foreach my $colIndex (($cntr+1)..($info{'numGroups'}-1)) {
		if (defined($info{'fixedDiff'})) { # : separated list
		    $info{'fixedDiff'} .= "\t$line[$colIndex]";
		} else {
		    $info{'fixedDiff'} = "$line[$colIndex]";
		}
	    }
	    if ($cntr < $info{'numGroups'} -1) {
		$cntr++;
	    } else {
		$state++;  # get out of this extraction loop
	    }
	    next;
	}

	if ($state == 11)  {
	    if ($info{'numGroups'} < 2) {
		$info{'Fst'} = 'NA';
		$info{'Nm'} = 'NA';
		$state += 2;  # skip the next section
		next;
	    }

	    if(/Fst AND POPULATION MIGRATION RATES/) {
		$state++; $cntr = 0; next;
	    }
	}

	# Fst and nm
	if ($state == 12) {
	    next if (/^\s*- above the diagonal/);
	    next if (/^\s*- below the diagonal/);
	    next if (/^(\s*\d+)+\s*$/);
	    next if (/^[\s-]+$/);

	    s/\|//g;
	    s/^\s*\d+:\s*//;
	    s/\s+$//;
	    my @line = split /\s+/;
	    my $grName = shift @line;
	    
	    # extract above diagonal: ave pw diff per base pair
	    foreach my $colIndex (($cntr)..($info{'numGroups'}-2)) {
		if (defined($info{'Nm'})) { # : separated list
		    $info{'Nm'} .= "\t$line[$colIndex]";
		} else {
		    $info{'Nm'} = "$line[$colIndex]";
		}
	    }
	    #extract below diagnonal: net pw diff per base pair
	    if ($cntr > 0) {
		foreach my $colIndex (0..$cntr-1) {
		    if (defined($info{'Fst'})) { # tab separated list
			$info{'Fst'} .= "\t$line[$colIndex]";
		    } else {
			$info{'Fst'} = "$line[$colIndex]";
		    }
		}
	    }
	    if ($cntr < $info{'numGroups'}-1) {
		$cntr++;
	    } else {
		$state++;  # get out of this extraction loop
	    }
	    next;
	}

	### extract D statistics
	if ($state == 13 && /^D STATISTICS/) {
	    $state++; next;
	}
	if ($state == 14 && /^[-\s]+$/) {
	    $state++; $cntr=0; next;	    
	}
	if ($state == 15) {
	    if ($cntr < $info{'numGroups'}) {
		s/no OutGp/ NA /g;
		s/undef/ NA /g;
		s/low ss/ lowSS /g;
		my @line = split /\s+/;
		if (@line != 8) {
		    die "Extracting D statistics, and there are " . 
			scalar(@line) . " columns, but it should be 8\n";
		}
		
		if (defined($info{'tajimaD'})) {
		    $info{'numSeqs'} .= "\t$line[1]";
		    $info{'tajimaD'} .= "\t$line[2]";
		    $info{'fuliD'} .= "\t$line[3]";
		    $info{'fuliDnoOG'} .= "\t$line[4]";
		    $info{'totalMu'} .= "\t$line[5]";
		    $info{'derivedSingleton'} .= "\t$line[6]";
		    $info{'singleton'} .= "\t$line[7]";
		} else {
		    $info{'numSeqs'} = $line[1];
		    $info{'tajimaD'} = $line[2];
		    $info{'fuliD'} = $line[3];
		    $info{'fuliDnoOG'} = $line[4];
		    $info{'totalMu'} = $line[5];
		    $info{'derivedSingleton'} = $line[6];
		    $info{'singleton'} = $line[7];
		}
		$cntr++;
	    } else {
		$state++;
	    }

	}

	### extract theta 
	if ($state == 16 && /^THETA \(4Nu\) ESTIMATES/) {
	    $state++; next;
	}
	if ($state == 17 && /^[-\s]+$/) {
	    $state++; $cntr=0; next;	    
	}

	if ($state == 18) {
	    # If there is only 1 seq per group, sites do not print out the
	    # theta line corresponding to this group
	    if ($cntr < $info{'numGroups'} - @singleSeqPerGroupIndex) {
		my @line = split /\s+/;
		if (@line != 8) {
		    die "Extracting theta statistics, and there are " . 
			scalar(@line) . " columns, but it should be 8\n";
		}
		
		if (defined($info{'thetaW'})) {
		    $info{'analyzedSeqLen'} .= "\t$line[2]";
		    $info{'segSites'} .= "\t$line[3]";
		    $info{'thetaW'} .= "\t$line[4]";
		    $info{'thetaWPerBp'} .= "\t$line[5]";
		    $info{'pi'} .= "\t$line[6]";
		    $info{'piPerBp'} .= "\t$line[7]";
		} else {
		    $info{'analyzedSeqLen'} .= "$line[2]";
		    $info{'segSites'} .= "$line[3]";
		    $info{'thetaW'} .= "$line[4]";
		    $info{'thetaWPerBp'} .= "$line[5]";
		    $info{'pi'} .= "$line[6]";
		    $info{'piPerBp'} = "$line[7]";
		}
		$cntr++;
	    } else {
		$state++;
	    }

	}

	if ($state == 19) {
	    if (@singleSeqPerGroupIndex > 0) {
		# Fixing the missing theta by inserting NA
		foreach my $kwd (qw(analyzedSeqLen segSites thetaW thetaWPerBp pi piPerBp)) {
		    my @values = split /\t/, $info{$kwd};
		    foreach my $insIndex (@singleSeqPerGroupIndex) {
			splice @values, $insIndex, 0, 'NA';
		    }

		    $info{$kwd} = join "\t", @values;
		}
	    }
	    last;
	}
    }

    close(SITESOUT);

    if ($state != 19) {
	die "ERROR: Reading in the sites output, but couldn't find all info\n".
	    "The last state was $state. Check the output of sites: $sitesFileName\n";
    }
    
    # calc dN and dS
    my @lenVect = split /\t/, $info{'analyzedSeqLen'};
    my @ncSitesVect = ();
    foreach my $lll (@lenVect) {
	# note that when there is a single seq per group, theta
	# section doesn't get printed for the group.  So
	# analyzedSeqLen is not known.
	if ($lll eq 'NA') {
	    push @ncSitesVect, 'NA';
	} else {
	    push @ncSitesVect, $lll - $info{'synSites'} - $info{'nonSynSites'};
	}
    }
    $info{'nonCodingSites'} = join "\t", @ncSitesVect;

    # change the order of information extracted from below diagonal matrix.
    if ($info{'numGroups'} > 2) {
	foreach my $k (('Fst','netDivergence')) {
	    my @vals = split /\t/, $info{$k};
	    my @rr=TransposeLowerToUpperTriangleFlat($info{'numGroups'},\@vals);
	    $info{$k} = join "\t", @rr;
	}
    }
    
    return %info;
}

sub PrintSitesInfo {
    my ($header, $infoHashRef) =@_;
    my %info = %$infoHashRef;

    my @keyList = qw(names numSeqs piPerBp pi thetaWPerBp thetaW segSites singleton derivedSingleton tajimaD fuliDnoOG fuliD divergence netDivergence fixedDiff Fst Nm analyzedSeqLen synSites nonSynSites nonCodingSites alignedLen numExcludedIndels numExcludedMultiBases);

    #my @pairwiseKeyList = qw(divergence netDivergence fixedDiff Fst Nm);

    my $result = "";
    if ($header ne 'noHeader') {
	$result .= join("\t",@keyList) . "\n";
    }

    foreach my $kkk (0..$#keyList) {
	if (!defined($info{$keyList[$kkk]})) {
	    warn "WARN: $keyList[$kkk] not found in information hash\n";
	}
	my $out = $info{$keyList[$kkk]};
	$out =~ s/\t/,/g;
	
	if ($kkk == 0) {
	    $result .= "$out";
	} else {
	    $result .= "\t$out";
	}
    }

    return $result;
}

sub PrintSitesInfoVert {
    my ($header, $infoHashRef) =@_;
    my %info = %$infoHashRef;

    my @keyList = qw(names numSeqs piPerBp pi thetaWPerBp thetaW segSites singleton derivedSingleton tajimaD fuliDnoOG fuliD divergence netDivergence fixedDiff Fst Nm analyzedSeqLen synSites nonSynSites nonCodingSites alignedLen numExcludedIndels numExcludedMultiBases);

    foreach my $kkk (0..$#keyList) {
	my $out = $info{$keyList[$kkk]};
	$out =~ s/\t/,/g;
	
	if($header ne 'noHeader') {
	    $out = "$keyList[$kkk]\t$out\n";
	}
	print "$out";
    }
}

# @array is a flat array containing (2.1, 3.1, 3.2, 4.1, 4.2, ,...)
#  from a square matrix
#  1.1 1.2 1.3 1.4 1.5 1.6
#  2.1 2.2 2.3 2.4 2.5 2.6
#  ....
# This function reorder it to (2.1, 3.1, 4.1, 5.1, 3,2,...)
sub TransposeLowerToUpperTriangleFlat {
    my ($numRow, $arrRef) = @_;

    my @vals = @$arrRef;
    my @mat = ();
    foreach my $row (1..($numRow-1)) {
	my @temp = splice @vals,0,$row;
	push @mat, \@temp;
    }
    my @result = ();
    foreach my $col (0..($numRow-2)) {
	foreach my $row ($col..($numRow-2)) {
	    push @result, $mat[$row][$col];
	}
    }
    return @result;
}


# take a file name of SITES inputfile, read the file and return
# hash with all info
sub ReadInSites {
    my $file = shift;

    open IN, "<$file" || die "Can't open $file\n";
    
    my %dat = ();
    $dat{'comment'} = <IN>;  # 1st line
    chomp $dat{'comment'};
    # 2nd line
    my $line = <IN>;    
    ($dat{'numSeq'}, $dat{'alignedLen'}) = Get2Num ($line);
    
    # 3rd line
    $line = <IN>;
    # frame is the position in the coding frame of the first coding base
    ($dat{'frame'}, $dat{'numNonCodingStretches'}) = Get2Num ($line);
    
    my @range = ();
    foreach my $i (1..$dat{'numNonCodingStretches'}) {
	$line = <IN>;
	my @tmp = Get2Num($line);
	push @range, \@tmp;
    }
    $dat{'ncRange'} = \@range;
    
    $line = <IN>;
    if ($line =~ /^\s*(\d+)\s*/) {
	$dat{'numGroups'} = $1;
    } else {
	die "ERROR: line $. of file $file should contain a single integer: $line\n";
    }

    if ($dat{'numGroups'} > 1) {
	my @groupNames = ();
	my @groupSizes = ();
	foreach my $ggg (1..$dat{'numGroups'}) {
	    $line = <IN>;
	    $line =~ s/^\s+//;
	    $line =~ s/\s+$//;
	    my @grpLine = split /\s+/, $line;
	    if (@grpLine != 2) {
		die "ERROR: Reading in group sizes, this line doesn't have 2 columns: $line\n";
	    }
	    push @groupNames, $grpLine[0];
	    push @groupSizes, $grpLine[1];
	}
	$dat{'groupNames'} = \@groupNames;
	$dat{'groupSizes'} = \@groupSizes;
    }

    # readin sequence data
    my @rawdatArr = <IN>;
    chomp(@rawdatArr);
    
    my $rawDat = join "", @rawdatArr;
    @rawdatArr = split //, $rawDat;
    
    my @seqName = ();
    my @seqData = (); # matrix
    for my $seqNum (0..($dat{'numSeq'}-1)) {
	my @bufferArr = splice @rawdatArr, 0, 10;  # 10 char;
	
	my $thisName = join("", @bufferArr);
	$thisName =~ s/\s+//g;
	push @seqName, $thisName;
	my @seqArr = ();
        my $base;
	while (@seqArr < $dat{'alignedLen'} && ($base = shift @rawdatArr)) {
	    push (@seqArr, $base) if ($base !~ /^\s$/);
	}
	if (@seqArr != $dat{'alignedLen'}) {
	    die "ERROR: For $thisName, the number of bases (" .
		scalar(@seqArr) . ") is shorter than expected (" .
		$dat{'alignedLen'} . ")\n";
	}
	push @seqData, \@seqArr;
    }
    my $residue = join "", @rawdatArr;
    if ($residue !~ /^\s*$/) {
	die "ERROR: after reading $dat{'numSeq'} sequences ($dat{'alignedLen'} bp each), there are some left-over:\n$residue\n";
    }

    $dat{'seqNames'} = \@seqName;
    $dat{'seqMat'} = \@seqData;

    close (IN);
    
    my @allGapSites = GapSitesMat(\@seqData,"all");
    my @anyGapSites = GapSitesMat(\@seqData,"any");
    
    $dat{'allGapSites'} = \@allGapSites;
    $dat{'anyGapSites'} = \@anyGapSites;

    return %dat;
}

# take a string with two integers separated by space, and return an array of
# 2 integer values.
sub Get2Num {
    my $string = shift;
    my @result = ();
    if ($string =~ /^\s*(\d+)\s+(\d+)\s*$/) {
	@result = ($1, $2);
    } else {
	die "ERROR: this line should contain 2 integers: $string\n";
    }
    return @result;
}

# take a reference to matrix (2-dim array)
# 2nd arugment is "all" or "any"
sub GapSitesMat {
    my ($matRef, $option) =@_;

    if ($option !~ /(all|any)/i) {
	die "ERROR: 2nd argument to GapSitesMat should be all or any, ".
	    "not $option\n";
    }
    my $numRow = @$matRef;
    return () if ($numRow == 0);
    my $numCol = @{$matRef->[0]};
    return () if ($numCol == 0);

    my @result = ();
    foreach my $col (0..($numCol-1)) {
	my $anyGap = 0;
	my $allGap = 1;
	foreach my $row (0..($numRow - 1)) {
	    my $b = $matRef->[$row]->[$col];
	    if ($b =~ /^-$/) {
		if ($option eq 'any') {
		    $anyGap = 1;
		    last;
		}
	    } else {
		$allGap = 0;
	    }
	}
	
	if ($option eq 'any') {
	    if ($anyGap) {
		push @result, $col+1;
	    }
	} else {
	    if ($allGap) {
		push @result, $col+1;
	    }
	}
    }
    return @result;
}

sub InANotInB {
    my ($aRef, $bRef) =@_;
    my %seen = ();
    my @aonly =();

    foreach my $item (@$bRef) { $seen{$item} = 1};
    foreach my $item (@$aRef) {
        push (@aonly, $item) unless $seen{$item};
    }
    return (@aonly);
}


sub MyRound {
    my $num = shift;

    return int($num + 0.5 * ($num <=> 0));
}
