#!/usr/bin/perl -w

# Copyright 2013, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20130612

my $usage="Usage: $0 [fastaFile]\n" . 
    "Converts fastaFile (with IUPAC ambiguity code) into PHASE format.\n".
    "The output contains only polymorphic sites (inluding gap polymorphisms)\n".
    "3-base and 4 base ambiguity codes are considered as missing character.\n".
    "This may have undesirable effect: a site with G G G H should be polymorphic,\n".
    "since H = A, C, or T.  But this site is considered as monomorphic in this program\n".
    "There is no way to specify an individual who is het for indel, this has to".
    "be added manually\n";

use Bio::AlignIO;

my %IUPAC = ('A'=>'A','C'=>'C','G'=>'G','T'=>'T','U'=>'T','-'=>'-','?'=>'?',
	     'M'=>'A C','R'=>'A G','W'=>'A T','S'=>'C G','Y'=>'C T','K'=>'G T',
	     'V'=>'A C G','H'=>'A C T','D'=>'A G T','B'=>'C G T',
	     'N'=>'A C G T');
my %toPhase = ('A'=>'A A','C'=>'C C','G'=>'G G','T'=>'T T','U'=>'T T','-'=>'- -',
	       '?'=>'? ?',
	       'M'=>'A C','R'=>'A G','W'=>'A T','S'=>'C G','Y'=>'C T','K'=>'G T');

use Getopt::Std;
getopts('h') || die "$usage\n";
die "$usage\n" if (defined($opt_h));

@ARGV = ('-') unless @ARGV;

if (@ARGV > 1) {
    die "ERROR: it can take only 1 input file\n$usage\n";
}

my $infile = shift @ARGV;

my $inAln = Bio::AlignIO->new(-file => "$infile", -format => 'fasta');

my $alnObj = $inAln -> next_aln;

unless ($alnObj->is_flush) {
    die "Alignment is not flush at the ends\n";
}

my @polyCat = CategorizeSites($alnObj);

# Find the polymorphic sites: this include site with
# monomorphic + some gap
my @polyIndex = ();  # 1-offset index
my @siteTypeString = ();
foreach my $index (0..$#polyCat) {
    if ($polyCat[$index] > 1) {
	push @polyIndex, $index+1;
	# S = biallelic, M = microsat
	my $thisType = ($polyCat[$index] == 2 || $polyCat[$index] == 5) ? "S" : "M";
	push @siteTypeString, $thisType;
    }
}

if (@polyIndex == 0) {
    warn "WARN: no polymorphic sites were found\n";
    exit;
}

print $alnObj->no_sequences, "\n", scalar(@polyIndex), "\n";
print join(" ", 'P', @polyIndex), "\n";
print join("", @siteTypeString), "\n";

## Now extract the poly sites, and print out in phase format
foreach my $seq ($alnObj->each_seq()) {
    my $name = $seq->display_id();
    print "$name\n";
    my $seqString = $seq->seq();
    my $outputString = "";
    foreach my $pindex (0..$#polyIndex) {
	my $base = substr $seqString, $polyIndex[$pindex] - 1 , 1;
	$base = uc $base;
	$base =~ s/U/T/;
	my $genotype = '? ?';
	if (defined($toPhase{$base})) {
	    $genotype = $toPhase{$base};
	} # 3 or 4-base ambiguity and non-IUPAC codes get converted to '? ?'

	if ($siteTypeString[$pindex] eq 'M') { # convert to microsat spec
	    $genotype =~ tr/ATGC-/12345/;
	    $genotype =~ s/\?/-1/g;
	}
	$outputString .= "$genotype ";
    }
    $outputString =~ s/\s+$//;
    print "$outputString\n";
} 

exit;

# Returns an array of integers, with each representing the
# type of polymorphism at the site (see PolymorphismType() below)
sub CategorizeSites  {
    my $alnObj = shift;

    my $alignedLen = $alnObj->length;
    my $numSamples = $alnObj->no_sequences;

    my @category = ();
    
    foreach my $index (1..$alignedLen) {
	my @charArr = GetBasesAtSite($alnObj, $index);
	my $type = PolymorphismType(@charArr);
	if ($type < 0 ) {
	    die "ERROR: Site $index contains non-IUPAC code:\n" . 
		join(" ", @charArr);
	}
	push @category, $type;
    }
    return @category;
}

# Return a char array of bases at a particular site.
# 2nd arg $position is 1-offset index.  The 1st site is 1 not 0.
# when invalid position is specified, empty array is returned.
sub GetBasesAtSite {
    my ($alnObj, $position) = @_;
    my $numSamples = $alnObj->no_sequences;

    my @basesAtThisSite = ();
    
    if($alnObj->length < $position || $position < 1) {
	return @basesAtThisSite;
    }
    
    foreach my $seq ($alnObj->each_seq()) {
	push @basesAtThisSite, $seq->subseq($position,$position);
    }
    return @basesAtThisSite;
}


# Takes an array of bases, and categorize the 'polymorphism' in the array

# Returns:
#  0 : all gap (-)
#  1 : monomorphic (include all N or ?)
#  2 : di-allelic 
#  3 : tri-allelic
#  4 : quad-allelic
#  5 : monomorphic + gap (include cases: N N N N - N)
#  6 : di-allelic + gap
#  7 : tri-allelic + gap
#  8 : quad-allelic + gap
# <0 : non IUPAC code encountered
# WARN: For simplicity, 3 or 4 base ambiguity codes are considered as a
# missing char (? N V H D B), and they are ignored.
# This could cause problem.  For example, with (G, G, G, G, H, H), this site
# contains at least two types of bases (H=A|C|T).  However, this function
# consider this site to be monomorphic

sub PolymorphismType {
    my @bases = @_;

    # check gaps, and get rid of them
    my $withGap = 0;  # a flag to indicate existence of gap
    my @gaps = grep {/^-$/} @bases;
    if (@gaps > 0) {
	$withGap = 4; # raise the flag
	@bases = grep {! /^-$/} @bases; # get rid of gaps 
	return 0 if (@bases == 0);
    }
    
    @bases = map {uc $_} @bases; # convert to all upper case
    @bases = map {s/U/T/; $_} @bases; # U->T
    
    # convert 3 or 4 base ambiguity codes to N
    @bases = map {$_ =~ s/[VHDB\?]/N/; $_} @bases;
    
    # converting the 2 base-ambituity codes to single bases
    @bases = map {if(/^[MRWSYK]$/) {$IUPAC{$_}} else {$_} } @bases;
    @bases = split(/\s+/, join(" ", @bases));

    # Now @bases contain only A, T, G, C, N, so check this
    my $allIUPAC = 1;
    foreach my $bb (@bases) {
	if ($bb !~ /^[ATGCN]$/) {
	    warn "WARNING: non-IUPAC code encountered: $bb\n";
	    $allIUPAC = -1;
	    last;
	}
    }
    
    my $type = NumBaseTypes(@bases);
    if ($type == 0) { # This means all 'N', return 1
	$type = 1;
    }

    # if there is at least one non-IUPAC code exist, negative
    # value get returned
    return ($type + $withGap) * $allIUPAC;
}

# Count how many different types of bases (A T G C) exist in an array.
# N, -, ? are ignored
# return value: 0 - 4
sub NumBaseTypes {
    my @bases = @_;
    my %cnt = ('A'=>0, 'T'=>0, 'G'=>0, 'C'=>0);
    foreach my $bb (@bases) {
	$cnt{uc($bb)} = 1;
    }
    return $cnt{'A'} + $cnt{'T'} + $cnt{'G'} + $cnt{'C'};
}
