#!/usr/bin/perl -w

# Copyright 2013, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20130612

my $usage = "\nUsage: $0 [-phma] [-w windowSize] [-i interval] -l alignLen -n sampleSize [-r numberOfSimulatedSampling] [-f seqNameFile] fastaFiles\n" .
    " -h: help\n".
    " -m: memory conservation. By default, all sequence data get read into\n".
    "     memory.  But it is not possible with a lot of sequence data. With\n".
    "     -m, the program accesses  files when required without reading in.\n".
    "     This slows down the process, but enables analysis with large set\n".
    "     (Not implemented yet)\n".
    " -a: Nordborg Arabidopsis data contains multiple samples per population.\n".
    "     This option specify to sample at most 1 sample per population.\n".
    " -f filename: read in a file, listing names of sequences to be used. 1 name per line\n".
    " -w: Size (in bp) of each window\n" .
    " -p: Slidling Window Pi analysis instead of Tajima's D\n".
    " -i: Increment (in bp) of sliding windows\n".
    " -l: Length of aligned sequences\n".
    " -n: number of sequences (samples) in data\n".
    " -r: How many re-sampling to be done?\n"
    ;

use Bio::AlignIO;
use Bio::Align::DNAStatistics;

use File::Find;
use File::Basename;
use Getopt::Std;

my %IUPAC = ('A'=>'A','C'=>'C','G'=>'G','T'=>'T','U'=>'T','-'=>'-','?'=>'?',
	     'M'=>'A C','R'=>'A G','W'=>'A T','S'=>'C G','Y'=>'C T','K'=>'G T',
	     'V'=>'A C G','H'=>'A C T','D'=>'A G T','B'=>'C G T',
	     'N'=>'A C G T');

our ($opt_h, $opt_a, $opt_f, $opt_m, $opt_l, $opt_w, $opt_i, $opt_r, $opt_p);
getopts('ahpml:w:r:i:n:f:') || die $usage;
die $usage if (defined($opt_h));

die "$usage\nERROR: Please specify at least one source directory\n" 
    unless @ARGV;

# should take align len, window-size, window step-size
if (!defined ($opt_l)) {
    die "Please specify -l the alignment lengths after removing sites with gaps\n";
}
if (!defined($opt_n)) {
    die "Please specify number of samples with -n\n";
}
if (!defined ($opt_w)) {
    warn "WARN: -w was not used, using default size of 100bp window size";
    $opt_w = 100;
}
if (!defined ($opt_i)) {
    warn "WARN: -i (intervals between neighboring windows) not specified, using default of 25\n";
    $opt_i = 25;
}

if (!defined($opt_r)) {
    $opt_r = 5000;
    warn "WARN: number of simulated resampling (-r) not specified, using default of $opt_r)\n";
}

if ($opt_l < $opt_w) {
    die "Window size (-w) has to be greater than or equal to alignment length (-l)\n";
}

my $possibleStartSites = $opt_l - $opt_w + 1;
my $numWindows = int(($possibleStartSites - 1) / $opt_i) + 1;
    # This is number of windows per gene

#$numWindows = 5;  # debug
my $windowSize = $opt_w;
my $numSamples = $opt_n;
my $increment = $opt_i;
my $alignLen = $opt_l;

my $verbose = 0;

my $outAln = Bio::AlignIO->new(-format => 'fasta'); # used for debug

print STDERR "INFO: reading in files, ";
# recursively goes down to directory and find the fasta files
my @fileName = FilenamesRecursive(@ARGV);
# print join("\n", @fileName), "\n";

my @fileInfo = ProcessFastaFile(@fileName);
print STDERR "DONE, ", scalar(@fileInfo), " files read in.\n";

if ($verbose ) {
    foreach my $fff (@fileInfo) {
	print STDERR "file: ", $$fff{filename}, "\tn: ", $$fff{n}, "\tlen: ", $fff->{alignedLen}, "\n";
	# print join(":", @{ $$fff{names} }), "\n";
    }
}

if (defined($opt_p)) {
    # This prints out as it goes
    WindowPi(\@fileInfo,  $windowSize, $numWindows, $numSamples, $opt_r, $alignLen, $increment);
} else {
    my ($maxArrRef, $minArrRef) = WindowMaxMinTajimaD(\@fileInfo,  $windowSize, $numWindows, $numSamples, $opt_r, $alignLen, $increment);
    
    foreach my $i (0..(@$maxArrRef - 1)) {
	print join("\t", ($$minArrRef[$i], $$maxArrRef[$i])), "\n";
    }
}

exit ;

sub FilenamesRecursive {
    my @fileName =();
    my @sourceDirName;

    foreach my $dir (@_) {
	$dir =~ s@/$@@;              # Strip any trailing slash
	if (-d $dir) {
	    push @sourceDirName, $dir;
	} elsif (-f $dir) {
	    push @fileName, $dir;
	} else {
	    warn "Don't know how to handle argument '$dir'\n";
	    next;
	}
    }
    
    # extract the plain files and symlinks
    push @fileName, ListRegFilesRecursive(@sourceDirName);
    # print join "\n", @fileName, "\n";  # for debug    

    return @fileName;
}

# take a list of directories and returns the names of plain files and sym links
sub ListRegFilesRecursive {
    my @names = ();
    find sub {push @names, $File::Find::name if (-f $_ || -l $_) }, @_;
    return @names;
}

# Make an obj
#   filename => fileName
#   inMemory => 'Y' or 'N'
#   n => number of sequences
#   alignedLen => aligned length
#   names => reference to array of sequence names
#   align => alignmentObject if inMemory = 'y' otherwise ''.
sub ProcessFastaFile {
    my @result = ();
    my $href;
    
    my $inMem = (defined($opt_m)) ? 'N': 'Y';

    $inMem = 'Y'; # All sequence get read into memory, this can be changed in the future.

    my @namesFromFile = (defined($opt_f)) ? (ReadSeqNameFile($opt_f)) : ();
    
    foreach my $fileName (@_) {
	my $inAlnIOObj = Bio::AlignIO->new(-file => $fileName, -format => 'fasta');
	my ($alnLen, $numSeq) = (0,0);
	my @seqNames = ();
	my ($aln);

	# Is there a case with multiple align per file?
	# I'm assuming this loop goes through only once.
	while ($aln = $inAlnIOObj->next_aln()) {

	    # select names
	    if (defined($opt_f)) {
		$aln = SubsetOfAlignByID($aln, \@namesFromFile);
	    }
	    
	    SetAlphabetInAlnObj($aln, "dna");
	    
	    # sequence clean up
	    $aln->uppercase();
	    $aln->map_chars('U', 'T');

	    ## To speed up, converting anything other than ATGC to gap (ignored)
	    #$aln->map_chars('[^ATGC]', '-');
	    # causes warning about alphabet when a seq is all -, so manual clean
	    foreach my $sss ($aln->each_seq) {
		my $thisSeq = $sss->seq();
		$thisSeq =~ s/[^ATGC]/-/g;
		if ($thisSeq =~ /^-+$/) { # empty seq
		    $aln->remove_seq($sss);
		    next;
		}
		$sss->seq($thisSeq, 'dna');
	    }
	    
	    $aln = RmEmptySamplesFromAlignObj($aln);
	    $aln = RmInsertionFromAlignObj($aln);
	    $numSeq = $aln->no_sequences;
	    $alnLen = $aln->length;
	    @seqNames = SeqNamesInAlnObj($aln);
	    
	    # making sure sequence names are unique
	    if (scalar @seqNames != scalar(ExtractUnique(@seqNames))) {
		die "Error: Sequence names in fasta file $fileName contain duplicates: " .
		    join(",", sort(@seqNames)) . "\n";
	    }

	    if ($numSeq > 0 && $alnLen > 0) {
		my $alnObj = ($inMem eq 'Y') ? $aln : ''; 
		$href = { 'filename' => $fileName, 'inMemory' => $inMem,
			  'n' => $numSeq, 'alignedLen' => $alnLen,
			  'names' => \@seqNames,
			  'align' => $alnObj};
		
		push @result, $href;
		if ($verbose) {
		    print STDERR "fn: $fileName, n: $numSeq, len: $alnLen\n"; # debug
		}
		#print join(":", @seqNames), "\n"; # debug
	    }
	}
    }
    return @result;
}

sub ReadSeqNameFile {
    my $file = shift;
    my @result = ();
    open(INFILE, "<$file") || die "Can't open $file\n";

    while (<INFILE>) {
        chomp;
        s/#.*$//;    # remove comments (#)
        s/^\s+//; s/\s+$//;
        next if (/^$/);
        push @result, $_;
    }
    close(INFILE);
    return @result;
}

# Sequence names of Nordborg data contains population(ecotype) before the first '-'.
sub NordborgRandPop {
    my $numSamples = shift;
    my @names = @_;

    # Check there are enough pops
    my @pops = @names;
    @pops = map {my @tmp = split /-/, $_; $tmp[0]} @pops;
    @pops = ExtractUnique(@pops);
    if ($numSamples > @pops) {
	warn "WARN: in Nordborg data set, only ", scalar(@pops), " pops. Can't choose $numSamples samples\n";
	return ();
    }

    # start to sample;
    my @result = ();
    my %seen = ();
    while (@result < $numSamples) {
	my $i = int (rand(scalar(@names)));

	my @nameParts =  split /-/, $names[$i];

	my $pop = shift @nameParts;
	
	next if (defined($seen{$pop}));

	$seen{$pop} = 1;
	push @result, $names[$i];
    }

    # print STDERR "SEL ", join(" ", sort(@result))," NUM ", scalar(@result), "\n"; #debug

    return @result;
}

sub WindowMaxMinTajimaD {
    my ($fileInfoArrRef, $windowSize, $numWindows, $numSamples, $numRep, $totalLen, $increment) = @_;
    my @fileInfo = @$fileInfoArrRef;
    
    my @lenVect = map {$_->{alignedLen}} @fileInfo;
    # print "@lenVect\n";
    my $maxLen = Max(@lenVect);

    my $rep = 0;
    my @maxTajDarr = ();
    my @minTajDarr = ();

    print STDERR "INFO: WindowAnalysis of Tajima's D started: \n" .
	"windowSize=$windowSize, numWindows=$numWindows, $numRep repetitions\n".
	"Analysis Progress:\n";
    my $printIntvl = int($numRep / 200);
    $printIntvl = 10 if ($printIntvl < 10);
    # $printIntvl = 1;
    while ($rep < $numRep) {
	# select the names, give up after 100 tries
	my @names = ();
	my $nameTryCntr = 0;
	while (@names != $numSamples) {
	    my $fn = int (rand(scalar(@fileInfo)));
	    
	    my @allNames = @{ $fileInfo[$fn]->{names} };
	    
	    #print "$rep HERE1\n"; #profile
	    if (defined($opt_a)) {
		@names = NordborgRandPop($numSamples, @allNames);
		# print "rep: $rep, names ", join(" ", sort(@names)),"\n";  # debug
	    } else {
		my @index = RandIntArrayUniq($numSamples, scalar(@names) - 1);
		@index = sort {$a <=> $b} @index;
		@names = @names[@index];
		# print "rep $rep: ",$fileInfo[$fn]->{filename}," @index @names\n"; # debug
	    }

	    $nameTryCntr++;
	    if ($nameTryCntr > 100) {
		die "Error: Tried to get $numSamples samples, but there isn't ".
		    "enough samples in the data\n";
	    }
	}

	#print "$rep HERE2\n"; #profile
	
	#my @windows = SampleWindows(\@fileInfo, $maxLen, $windowSize, $numWindows,
	#			    \@names);
	#my @windows = SampleWindowsSequential(\@fileInfo, $maxLen, $windowSize, 
	#				      $numWindows, \@names, $increment);
	my @windows = SampleWindowsSlideWConcat(\@fileInfo, $maxLen, $windowSize, 
						$numWindows, \@names, $totalLen, $increment);

	#print "$rep HERE3\n"; #profile

	next if (@windows != $numWindows);  # giving up this name set

	# Do analysis
	my $minTajD = TajimaD($windows[0]);
	my $maxTajD = $minTajD;
	foreach my $thisAln (@windows[1..$#windows]) {
	    my $tajD = TajimaD($thisAln);
	    $maxTajD = $tajD if ($tajD > $maxTajD);
	    $minTajD = $tajD if ($tajD < $minTajD);

	    #$outAln->write_aln($thisAln);
	    #print "$tajD max $maxTajD min $minTajD\n"; # debug
	}
	push @maxTajDarr, $maxTajD;
	push @minTajDarr, $minTajD;

	#print "$rep HERE4\n"; #profile
	
	$rep++;
	print STDERR " $rep Repetitions done\n" if ($rep % $printIntvl == 0);
    }
    return (\@maxTajDarr, \@minTajDarr);
}

sub WindowPi {
    my ($fileInfoArrRef, $windowSize, $numWindows, $numSamples, $numRep, $totalLen, $increment) = @_;
    my @fileInfo = @$fileInfoArrRef;
    
    my @lenVect = map {$_->{alignedLen}} @fileInfo;
    # print "@lenVect\n";
    my $maxLen = Max(@lenVect);

    my $rep = 0;
    my @maxTajDarr = ();
    my @minTajDarr = ();

    print STDERR "INFO: WindowAnalysis of Pi started: \n" .
	"windowSize=$windowSize, numWindows=$numWindows, $numRep repetitions\n".
	"Analysis Progress:\n";
    my $printIntvl = int($numRep / 200);
    $printIntvl = 10 if ($printIntvl < 10);
    # $printIntvl = 1;
    while ($rep < $numRep) {
	# select the names, give up after 100 tries
	my @names = ();
	my $nameTryCntr = 0;
	while (@names != $numSamples) {
	    my $fn = int (rand(scalar(@fileInfo)));
	    
	    my @allNames = @{ $fileInfo[$fn]->{names} };
	    
	    if (defined($opt_a)) {
		@names = NordborgRandPop($numSamples, @allNames);
	    } else {
		my @index = RandIntArrayUniq($numSamples, scalar(@names) - 1);
		@index = sort {$a <=> $b} @index;
		@names = @names[@index];
	    }

	    $nameTryCntr++;
	    if ($nameTryCntr > 100) {
		die "Error: Tried to get $numSamples samples, but there isn't ".
		    "enough samples in the data\n";
	    }
	}

	my @windows = SampleWindowsSlideWConcat(\@fileInfo, $maxLen, $windowSize, 
						$numWindows, \@names, $totalLen, $increment);

	next if (@windows != $numWindows);  # giving up this name set

	# Do analysis
	my @piArr = ();
	foreach my $thisAln (@windows) {
	    my $piPerGene = AvePWDiff($thisAln);
	    push @piArr, $piPerGene/($thisAln->length);
	}

	# print all pi
	print "\t";
	print join("\t", @piArr), "\n";

	$rep++;
	print STDERR " $rep Repetitions done\n" if ($rep % $printIntvl == 0);
    }
    return (\@maxTajDarr, \@minTajDarr);
}


# It will return an array of Align Object.
# - Extract the sequences specified by @$seqNameArrRef.
# - Then remove any character other than ATGC and calculate the cleaned length.
# - Genes in @$fileInfoArrRef are sampled proportional to the cleaned length.
# - Genes are sampled WITH replacement
# - For each gene, one window of $windowSize is extracted.
# - This process get repaeated until total number of samples become
#   $numWindows.
sub SampleWindows {
    my ($fileInfoArrRef, $maxLen, $windowSize, $numWindows, $seqNameArrRef) = @_;
    
    my @fileInfo = @$fileInfoArrRef;
    
    my %avoidHash = ();
    my %lenHash = ();
    my %subsetHash = ();

    # my $trial = 0;
    my @resultAlignArr = ();
    # my @cntr = (0,0,0,0);
    while (@resultAlignArr < $numWindows) {
	if ($trial > $numWindows * 100) {
	    warn "INFO: too many trials to sample $numWindows windows from name set of " . 
		join(" ", @$seqNameArrRef) .
		" giving up this set, and creating another name set\n";
	    return ();
	}
	$trial++;

	# Pick one candidate gene
	my $fn = int (rand(scalar(@fileInfo)));

	next if (defined($avoidHash{$fn}));

	if (! defined($lenHash{$fn})) {
	    my $subAln = SubsetOfAlignByID($fileInfo[$fn]->{align}, $seqNameArrRef);
	    if ($subAln eq "" || $subAln->no_sequences != scalar(@$seqNameArrRef)) {
		$avoidHash{$fn} = 1;
		# $cntr[0]++;
		next;
	    }
	    
	    $subAln = RmAnyGapAmbigSites($subAln);
	    # This is SimpleAlign object
	    if ($subAln eq "") {  # means all sites contains gaps/ambig chars
		$avoidHash{$fn} = 1;
		# $cntr[1]++;
		next;
	    }
	
	    my $thisLen = $subAln->length;
	    # print "file; $fn, AFTER RmGap, LENGTH: $thisLen, max: $maxLen\nA"; # debug
	    # $outAln->write_aln($subAln); #debug
	    
	    if ($thisLen < $windowSize){
		$avoidHash{$fn} = 1;
		# $cntr[2]++;
		next;
	    }

	    $lenHash{$fn} = $thisLen;
	    $subsetHash{$fn} = $subAln;
	}

	# choose proportional to the length of each gene
	if (rand($maxLen) > $lenHash{$fn} ) {
	    # $cntr[3]++;
	    next;
	}

	# make a slice with rand
	# rand returns [0, max)
	my $begin = int(rand($lenHash{$fn} - $windowSize + 1)) + 1; 
	my $end = $begin + $windowSize - 1;

	my $subAln = $subsetHash{$fn};
	$subAln = $subAln->slice($begin,$end);
	# print "BEGIN: $begin - $end\n";  # debug
	# $outAln->write_aln($subAln); # debug
	
	push @resultAlignArr, $subAln; #debug
    }
    
    # print STDERR "FAIL = ", $trial - $numWindows, " = ", join(":",@cntr)," # excluded genes: ",scalar(keys(%avoidHash)), " MAXLEN = $maxLen\n";
    return @resultAlignArr;
}


# It will return an array of Align Object.
# - Extract the sequences specified by @$seqNameArrRef.
# - Then remove any character other than ATGC and calculate the cleaned length.
# - Genes in @$fileInfoArrRef are sampled proportional to the cleaned length.
# - Genes are sampled WITHOUT replacement
# - For each gene, sequnences are extracted with $windowSize and $increment
# - It will keep sampling genes until total number of samples become
#   $numWindows
sub SampleWindowsSequential {
    my ($fileInfoArrRef, $maxLen, $windowSize, $numWindows, $seqNameArrRef, $increment) = @_;
    
    my @fileInfo = @$fileInfoArrRef;
    
    my %avoidHash = ();
    my %lenHash = ();
    my %subsetHash = ();
    my %usedHash  = ();

    my $trial = 0;
    my @resultAlignArr = ();
    # my @cntr = (0,0,0,0);
    while (@resultAlignArr < $numWindows) {
	if ($trial > $numWindows * 100) {
	    warn "INFO: too many trials to sample $numWindows windows from name set of " . 
		join(" ", @$seqNameArrRef) .
		" giving up this set, and creating another name set\n";
	    return ();
	}
	$trial++;

	# Pick one candidate gene
	my $fn = int (rand(scalar(@fileInfo)));

	next if (defined($avoidHash{$fn}));
	next if (defined($usedHash{$fn}));

	if (! defined($lenHash{$fn})) {
	    my $subAln = SubsetOfAlignByID($fileInfo[$fn]->{align}, $seqNameArrRef);
	    if ($subAln eq "" || $subAln->no_sequences != scalar(@$seqNameArrRef)) {
		$avoidHash{$fn} = 1;
		# $cntr[0]++;
		next;
	    }
	    
	    $subAln = RmAnyGapAmbigSites($subAln);
	    # This is SimpleAlign object
	    if ($subAln eq "") {  # means all sites contains gaps/ambig chars
		$avoidHash{$fn} = 1;
		# $cntr[1]++;
		next;
	    }
	
	    my $thisLen = $subAln->length;
	    # print "file; $fn, AFTER RmGap, LENGTH: $thisLen, max: $maxLen\nA"; # debug
	    # $outAln->write_aln($subAln); #debug
	    
	    if ($thisLen < $windowSize){
		$avoidHash{$fn} = 1;
		# $cntr[2]++;
		next;
	    }

	    $lenHash{$fn} = $thisLen;
	    $subsetHash{$fn} = $subAln;
	}
	
	# choose proportional to the length of each gene
	if (rand($maxLen) > $lenHash{$fn} ) {
	    # $cntr[3]++;
	    next;
	}

	# Make sequential slice
	my $subAln = $subsetHash{$fn};
	for(my $begin = 1;  (@resultAlignArr < $numWindows) && 
	    ($begin <= $lenHash{$fn} - $windowSize + 1); $begin += $increment) {
	    my $end = $begin + $windowSize -1;

	    my $subAlnSlice = $subAln->slice($begin,$end);
	
	    push @resultAlignArr, $subAlnSlice; #debug
	}
	
	$usedHash{$fn} = 1;

    }
    
    # print STDERR "FAIL = ", $trial - $numWindows, " = ", join(":",@cntr)," # excluded genes: ",scalar(keys(%avoidHash)), " MAXLEN = $maxLen\n";

    return @resultAlignArr;
}

# It will return an array of Align Object.
# - Extract the sequences specified by @$seqNameArrRef.
# - Then remove any character other than ATGC and calculate the cleaned length.
# - Genes are concatenated until the length of concatenated sequences become
#   at least $totalLen bp
# - Genes are sampled WITHOUT replacement
# - From this concatenated sequence, the beginning of the sequence is randomly
#   chosen so the selected sequence has exactly $totalLen bp
# - Then sliding windows are extracted with $windowSize and $increment
sub SampleWindowsSlideWConcat {
    my ($fileInfoArrRef, $maxLen, $windowSize, $numWindows, $seqNameArrRef, $totalLen, $increment) = @_;
    
    my @fileInfo = @$fileInfoArrRef;
    
    my %avoidHash = ();
    my %lenHash = ();
    my %subsetHash = ();
    my %usedHash  = ();

    my $trial = 0;

    my @resultAlignArr = ();
    my @tempGeneArray = ();
    my $cumuLen = 0;
    # my @cntr = (0,0,0,0);
    while ($cumuLen < $totalLen) {
	if ($trial > $numWindows * 100) {
	    warn "INFO: too many trials to sample $numWindows windows from name set of " . 
		join(" ", @$seqNameArrRef) .
		" giving up this set, and creating another name set\n";
	    return ();
	}
	$trial++;

	# Pick one candidate gene
	my $fn = int (rand(scalar(@fileInfo)));

	next if (defined($avoidHash{$fn}));
	next if (defined($usedHash{$fn}));

	if (! defined($lenHash{$fn})) {
	    my $subAln = SubsetOfAlignByID($fileInfo[$fn]->{align}, 
					   $seqNameArrRef);
	    if ($subAln eq "" || 
		$subAln->no_sequences != scalar(@$seqNameArrRef)) {
		$avoidHash{$fn} = 1;
		# $cntr[0]++;
		next;
	    }
	    
	    $subAln = RmAnyGapAmbigSites($subAln);
	    # This is SimpleAlign object
	    if ($subAln eq "") {  # means all sites contains gaps/ambig chars
		$avoidHash{$fn} = 1;
		# $cntr[1]++;
		next;
	    }
	
	    $lenHash{$fn} = $subAln->length;
	    $subsetHash{$fn} = $subAln;
	}
	
	# choose proportional to the length of each gene
	if (rand($maxLen) > $lenHash{$fn} ) {
	    # $cntr[3]++;
	    next;
	}
	
	$cumuLen += $lenHash{$fn};
	push @tempGeneArray, $subsetHash{$fn};
	$usedHash{$fn} = 1;

	#print "File:", $fileInfo[$fn]->{filename}, ", len $lenHash{$fn}, cumu $cumuLen\n"; #debug
    }

    my $concatAln = ConcatenateAln(@tempGeneArray);

    # pick the random begin, uniform [1, ($concatAln->length) - $totalLen +1]
    my $beginGene = int (rand(($concatAln->length) - $totalLen +1)) + 1;
    my $endGene = $beginGene + $totalLen - 1;

    #print "CumuLen: ", $concatAln->length, " beg: $beginGene, end: $endGene\n"; #debug
    #$outAln->write_aln($concatAln);  # debug

    # Make sequential slice
    for(my $begin = $beginGene; $begin+$windowSize-1<= $endGene; 
	$begin += $increment) {
	my $end = $begin + $windowSize -1;
	
	my $subAlnSlice = $concatAln->slice($begin,$end);
	
	push @resultAlignArr, $subAlnSlice;
    }
    
    if (@resultAlignArr != $numWindows) {
	die "ERROR: Created ". scalar(@resultAlignArr). 
	    " windows, but there should be $numWindows windows\n";
    }
    # print STDERR "FAIL = ", $trial - $numWindows, " = ", join(":",@cntr)," # excluded genes: ",scalar(keys(%avoidHash)), " MAXLEN = $maxLen\n";

    ## printing pi per site for the concatenaed aln
    if(defined($opt_p)) {
	print AvePWDiff($concatAln) / ($concatAln->length);
    }

    return @resultAlignArr;
}


######### Align Object related functions
# From a Bio::Align object, extract all of the sequence names
# and return them in an array
sub SeqNamesInAlnObj {
    my $alnObj = shift;
    my @result = ();
    foreach my $seq ($alnObj->each_seq) {
	my $name = $seq->display_id;
	push @result, $name;
    }
    return @result;
}

# Takes an array of align object and make a new align object
# with all concatenated.
# The order of sequences is same as the first object.  If
# other align objects have sequences with names not included in the first object,
# Those sequences are ignored.  If sequences are missing in the other
# Align object, it will return "".  If multiple sequences have same name,
# "" get returned.
sub ConcatenateAln {
    my $thisAln = shift @_;
    my $resultAln = $thisAln->slice(1,$thisAln->length);
    foreach my $alnObj (@_) {
	foreach my $cumuSeq ($resultAln->each_seq) {
	    my $nnn = $cumuSeq->display_id;
	    my @newSeq = $alnObj->each_seq_with_id($nnn);
	    if (@newSeq != 0) {
		if (@newSeq == 0) {
		    warn "WARN: in ConcatenateAln(), $nnn not found\n";
		    return "";
		} elsif (@newSeq > 1) {
		    warn "WARN: in ConcatenateAln(), $nnn found " . scalar(@newSeq).
			" times. Not processing\n";;
		    return "";    
		}
	    }

	    my $connected = ($cumuSeq->seq()) . ($newSeq[0]->seq());
	    $cumuSeq->seq($connected);
	}
    }
    return ($resultAln);
}

## side effect: alphabet of all sequences in aln get set.
sub SetAlphabetInAlnObj {
    my ($alnObj, $alphabet) = @_;
    foreach my $sss ($alnObj->each_seq) {
	$sss->alphabet($alphabet);
    }
}

# consenusu_iupac returns lower case for sites which contains '-', '?', or '.'

sub TajimaD {
    my $alnObj = shift;

    my $n = $alnObj->no_sequences;
    my $an = WattersonCoef1($n);
    my $bn = WattersonCoef2($n);

    my $e1 = ($n+1) / (3 * $an * ($n-1)) - 1/($an ** 2);
    my $e2 = (2 * ($n ** 2 + $n + 3)/ (9 * $n * ($n - 1)) -
	      ($n + 2)/ ($n * $an) + $bn / ($an ** 2)) /
	      ($an ** 2 + $bn);
    my $pi = AvePWDiff($alnObj);
    my $s = NumSegSites($alnObj, 1); # "1" means "use Eta"
    my $wattTheta = $s / $an;  

    my $denom = sqrt ($e1 * $s + $e2 * $s * ($s - 1) );
    if ($denom  == 0) {
	$denom = 1;   # is this valid?
    }
    my $d = ($pi - $wattTheta) / $denom;
    
    return $d;
}


sub WattersonCoef1 {
    my $n = shift;

    my @arr = map {1/$_} (reverse 1..($n-1));
    return Sum(@arr);
}

sub WattersonCoef2 {
    my $n = shift;

    my @arr = map {1/($_ ** 2)} (reverse 1..($n-1));
    return Sum(@arr);
}


# Note this doesn't handle degenerate code correctly.
# This is slow
sub PiePerSite {
    my ($alnObj) = @_;
    my $statObj = Bio::Align::DNAStatistics->new();
    my $distMat = $statObj->distance(-align => $alnObj, -method =>'uncorrected');
    
    my @names = $distMat->column_names;
    # my @row_names = $distMat->row_names;
    # print "COL: @names\nROW: @row_names\n";
    # print $distMat->print_matrix;

    return 0 if (@names < 2);

    my $result = 0;
    foreach my $rrr (0..($#names-1)) {
	foreach my $ccc (($rrr+1)..$#names) {
	    $result += $distMat->get_entry($names[$rrr], $names[$ccc]);
	}
    }
    $result /= scalar(@names) * (scalar(@names) - 1) / 2;  # average

    return $result;
}

# This is pi per gene
sub AvePWDiff {
    my $alnObj = shift;
    my @seqMat = MatrixFromAlignObj($alnObj, [ 1..($alnObj->length) ] );

    my $numSites = @{ $seqMat[0] } ;
    my $numSamples = @seqMat;

    return 0 if ($numSites < 1 || $numSamples < 2);
    my $sum = 0;

    foreach my $ind1 (0..($numSamples - 2)) {
	foreach my $ind2 (($ind1+1)..($numSamples-1)) {
	    my $comparableBases = 0;
	    my $thisDiff = 0;
	    foreach my $site (0..($numSites - 1)) {
		my $b1 = uc($seqMat[$ind1]->[$site]);
		my $b2 = uc($seqMat[$ind2]->[$site]);
		next if (($b1 . $b2) !~ /^[ACGT]{2}$/);
		# Skip all ambiguous characters and gap

		$comparableBases ++;
		$thisDiff++ if ($b1 ne $b2);
	    }

	    $sum += $thisDiff;
	}
    }
    return $sum / ($numSamples * ($numSamples-1)/2);
}

# if $etaQ > 0, it returns Eta (total number of mutations) instead of
# Number of segregating sites.
sub NumSegSites {
    my ($alnObj, $etaQ) = @_;

    my @seqMat = MatrixFromAlignObj($alnObj, [ 1..($alnObj->length) ] );

    my $numCol = @{ $seqMat[0] } ;

    my @result = ();
    foreach my $col (0..($numCol-1)) {
	my @thisCol = ();

	foreach my $row (0..$#seqMat) {
	    push @thisCol, $seqMat[$row]->[$col];
	}
	@thisCol = ExtractUnique(@thisCol);
	
	my $type = PolymorphismTypeNoHet(@thisCol);
	push @result, $type;
    }
    
    if ($etaQ) { # similar to num seg sites: add +1 for tri-allelic, +2 for quad-allelic sites
	@result = map { ($_ > 1) ? ($_ - 1) % 4 : 0 } @result;
	return Sum(@result);
    } else { # num seg sites: type 2-4,6-8 are the sites with polymorphism
	@result = grep {/^[234678]$/} @result;
	return scalar(@result)
    }
}

# number of mutations


# Takes an array of bases, and categorize the 'polymorphism' in the array
# Ambiguity code are considered as ambiguous, not heterozygote.
# Returns:
#  0 : all gap (-)
#  1 : monomorphic (include all N or ?)
#  2 : di-allelic 
#  3 : tri-allelic
#  4 : quad-allelic
#  5 : monomorphic + gap (include cases: N N N N - N)
#  6 : di-allelic + gap
#  7 : tri-allelic + gap
#  8 : quad-allelic + gap
# <0 : non IUPAC code encountered <- not used
# WARN: For simplicity, 3 or 4 base ambiguity codes are considered as a
# missing char (? N V H D B), and they are ignored.
# This could cause problem.  For example, with (G, G, G, G, H, H), this site
# contains at least two types of bases (H=A|C|T).  However, this function
# consider this site to be monomorphic

sub PolymorphismTypeNoHet {
    my @bases = @_;

    # check gaps, and get rid of them
    my $withGap = 0;  # a flag to indicate existence of gap
    my @gaps = grep {/^-$/} @bases;
    if (@gaps > 0) {
	$withGap = 4; # raise the flag
	@bases = grep {! /^-$/} @bases; # get rid of gaps 
	return 0 if (@bases == 0);
    }
    
    @bases = map {uc $_} @bases; # convert to all upper case
    @bases = map {s/U/T/; $_} @bases; # U->T
    @bases = map {s/\?/N/; $_} @bases; # ? -> N

    my $degenCode = MinimumBaseSet(@bases);
    
    my $type;
    if ($degenCode =~ /^[ATGC]$/) {
	$type = 1;
    } elsif ($degenCode =~ /^[WSMKRY]$/) {
	$type = 2;
    } elsif ($degenCode =~ /^[BDHV]$/) {
	$type = 3;
    } elsif ($degenCode =~ /^N$/) {
	$type = 4;
    } elsif ($degenCode =~ /^-$/) {
	$type = 0;
    } else {
	die "ERROR: MinimumBaseSet returned $degenCode\n";
    }
    
    return ($type + $withGap) ;
}

# Takes an array: each element is single base.  Then find a set of
# fewest bases which will match all elements.
# (A, A, G, G) -> R (R=A/G)
# (A, A, A, R) -> A
# (A, A, A, Y) -> W (Y = C/T, so M=A/C or W=A/T is OK, but W get returned)
# (R, R, R, R) -> A
# (A, C, G, T) -> N
# (N, N, N, N) -> A
# If all elements in the array are something other than IUPAC code, returns gap '-'
sub MinimumBaseSet {
    my @baseArr = map {uc $_} @_;
    @baseArr = map {s/U/T/; $_} @baseArr; # U->T
    @baseArr = map {s/\?/N/; $_} @baseArr; # ? -> N
    @baseArr = grep { /[ACGTWSMKRYBDHVN\-]/ } @baseArr; # ignore non IUPAC

    return '-' if (@baseArr == 0);
    
    my @degen = ('A', 'C', 'G', 'T',  'W', 'S', 'M', 'K', 'R', 'Y', 
		 'B', 'D', 'H', 'V');
    foreach my $candidate (@degen) {
	my @candidateArr =  split /\s+/, $IUPAC{$candidate};
	my $match = 1;
	foreach my $bbb (@baseArr) {
	    my @this = split /\s+/, $IUPAC{$bbb};
	    # check sharing
	    my @isect = Intersection(\@candidateArr, \@this);
	    if (@isect == 0) {
		$match = 0;
		last;
	    }
	}
	if ($match) {  # all @baseArr can be explained by the candidate
	    return $candidate;
	}
    }
    my @base = grep {/^[ACGTWSMKRYBDHVN]$/} @baseArr;
    if (@base > 0) {
	return 'N';
    } else {
	return '-';  # in case, all elements are '-'
    }
}


# Takes a Bio::Align object, and a reference to array of names
# returns an Bio::Align object with the names specified.
# Note that if the sequence names are not unique, only the first sample
# will be selected.
# The order of sequences in returned alignment are not the order in the array 
# of names, but the order in the Align object
# If same name is included in the name array multiple times, only 1 sequence 
# will be returned.
# If the name array contains names which are not in Align object, it will be
# ignored.
# If seq names (display_id) are not uniq in Align Object, later one get used.
sub SubsetOfAlignByID {
    my ($alnObj, $nameArrRef) = @_;

    my @names = SeqNamesInAlnObj($alnObj);
    my @index = 1 .. scalar(@names);

    my %posiHash = map { $names[$_ - 1]  => $_ } @index;

    # my @matchedIndex =  map {$posiHash{$_}} @$nameArrRef;
    # @matchedIndex = grep { /^\d+$/ } @matchedIndex;
    
    my @matchedIndex = ();
    foreach my $nnn (@$nameArrRef) {
	if (defined($posiHash{$nnn})) {
	    push @matchedIndex, $posiHash{$nnn};
	} else {
	    return "";  # trying to speed up
	}
    }
    my $subset = $alnObj->select_noncont(@matchedIndex);
    return $subset;
}

sub SubsetOfAlignByIDold {
    my ($alnObj, $nameArrRef) = @_;

    my @names = SeqNamesInAlnObj($alnObj);
    my @matchedIndex = ();
    foreach my $target (@$nameArrRef) {
	foreach my $index (0..$#names) {
	    if ($names[$index] eq $target) {
		push @matchedIndex, $index + 1;  # unit-offset
		last;
	    }
	}
    }

    my $subset = $alnObj->select_noncont(@matchedIndex);
    return $subset;
}


### Get rid of samples whose sequences are empty, or all N, X, ?, or -
sub RmEmptySamplesFromAlignObj {
    my $origAlnObj = shift;
    # make duplicate
    my $alnObj = $origAlnObj->slice(1,$origAlnObj->length);
    foreach my $seq ($alnObj->each_seq) {
	my $dna = $seq->seq();
	
	if ($dna =~ /^[NnXx\?\-]*$/) {
	    my $name = $seq->display_id;
	    if ($verbose) {
		warn "INFO $name is removed because it is all gaps or missing chars\n";
	    }
	    $alnObj->remove_seq($seq);
	}
    }
    return $alnObj;
}

# Returns a 2-dim array, not used
sub MatrixFromAlignObj {
    my ($alnObj, $colNumArr) = @_;
    @result = ();
    foreach my $seq ($alnObj->each_seq) {
	my $dna = $seq->seq();
	my @dnaArr = split //, $dna;

	my @index = map {$_ - 1} @$colNumArr;  # convert to 0-offset index

	my @select = @dnaArr[@index];
	push @result, \@select;
    }
    return @result;
}

# From align object, site which contain, any degenerate char or any gap will be
# removed.  
sub RmAnyGapAmbigSites {
    my $alnObj = shift;

    # convert anything other than ATGCU get converted to gap
    # $alnObj->map_chars('[^ATGCUatgcu]', '-');
    # ## WARN ## : uncomment above to use this function in general,
    #      In this script, this is done in ProcessFastaFile to speed up


    my $gline = $alnObj->gap_line;
    my $gchar = $alnObj->gap_char;
    my $result;

    # Creating an align object: each seq has 0 length.
    # Without this, remove_gaps appears to return undef.
    if ($gline =~ /^$gchar+$/) {  # all sites with gap char
	# Making an align object with no sequence data
	# Make a duplicate, select() doesn't make duplicates
	# It will through a couple of harmless warnings:
	#   MSG: Got a sequence with no letters in it cannot guess alphabet []
	#   MSG: Slice [1-16] of sequence [seq5/1-6] contains no residues. Sequence excluded from the new alignment.
	return "";
	$result = $alnObj->slice(1,$alnObj->length);
	foreach my $sss ($result->each_seq) {
	    $sss->seq('');
	}
    } else {
	$result =$alnObj->remove_gaps(); # a site with at least one gap get removed
    }
    return $result;
}

# Return a new Align object after removing the sites where all except one seq
# has non-gap character. Also removes sites with all gaps.
sub RmInsertionFromAlignObj {
    my $alnObj = shift;
    my @seqMat = MatrixFromAlignObj($alnObj, [ 1..($alnObj->length) ] );

    my $numSites = @{ $seqMat[0] } ;
    my $numSamples = @seqMat;

    my @insertionSites = ();  # 0 offset
    foreach my $col (0..($numSites-1)) {
	my $numNonGap = 0;
	foreach my $row (0..($numSamples-1)) {
	    if ($seqMat[$row]->[$col] ne '-') {
		$numNonGap ++;
		last if ($numNonGap > 1);
	    }
	}

	push (@insertionSites, $col) if ($numNonGap < 2);
    }
    
    my @range = SiteIndexToRange(@insertionSites);
    
    my @rmMat = ();
    foreach my $rrr (@range) {
	my ($begin, $end) = split /-/, $rrr;
	push @rmMat, [ $begin, $end ];
    }

    return ($alnObj->remove_columns(@rmMat));
}

######### End of Align Object related functions

####### general functions
# rand integers between 0 and $max (both ends inclusive)
sub RandIntArrayUniq {
    my ($size, $max) = @_;
    my @result = ();
    my %uniq = ();
    while (@result < $size) {
	my $candidate = int(rand ($max + 1));  # rand returns [0, $max + 1)
	next if (defined($uniq{$candidate}));

        push @result, $candidate;
	$uniq{$candidate} = 1;
    }
    return (@result);
}

sub ExtractUnique {
    my %seen=();
    my @unique = ();

    foreach my $item (@_) {
        push (@unique, $item) unless $seen{$item}++;
    }
    return @unique;
}

sub Intersection {
    my ($arrRefA, $arrRefB) = @_;

    my %union = ();
    my %isect = ();
    
    foreach my $e (@$arrRefA) {
	$union{$e} = 1;
    }

    foreach my $e (@$arrRefB) {
	if ($union{$e}) {
	    $isect{$e} = 1;
	}
	#$union{$e} = 1;
    }

    return (keys %isect);
}

sub Max {
    my $max = shift;
    foreach $item (@_) {
        if ($item > $max) {
            $max = $item;
        }
    }
    return $max;
}

sub Sum {
    $result = 0;
    foreach my $i (@_) {
        $result += $i;
    }
    return $result;
}

sub SiteIndexToRange {
    my @index = @_;

    return () if (@index == 0);

    @index = sort {$a <=> $b} (@index);
    @index = ExtractUnique (@index);

    $index[$#index+1] = $index[$#index] + 2;  # this force the final index
                                              # to be printed out.
    my $begin = shift @index;
    my $prev = $begin;
    my @results = ();
    for (my $i = 0; $i < @index; $i++) {
	if ($index[$i] == $prev + 1) {
	    $prev = $index[$i];
	    next;
	}
	# now put the range into the results, and reset
	push @results, "$begin-$prev";
	$begin = $prev = $index[$i];
    }

    return (@results);
}