#!/usr/bin/perl

# Copyright 2013, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20130612

my $usage = 
"Usage: $0 [-h] consedDiscrepancyListFile\n\n" .
"This program summarizes the high quality descripancies of a contig in consed.\n".
" 1. From consed, open a contig by double clicking.\n" .
" 2. Choose \"Navigate\" -> \n" .
"        \"High Quality (>=40) Discrepancies, > 5bp from unaligned region\"\n".
" 3. Push \"Save\" at the bottom of the pop-up window.\n".
" 4. You can choose whatever file name, but I save it in a temporary place:\n".
"     \"/tmp/list.txt\"\n".
" 5. $0 /tmp/list.txt\n".
"     This will show two ways of summarizing:\n".
"      - 1st section contains a mutation shared by a multiple clones,\n".
"        followed by the position of the mutation.\n".
"          This could indicate that the SNP may be real.\n" .
"      - 2nd section contains the position of discrepancies for each read.\n".
"          A clone with a lot of SNPS could be a different allele.\n";

my $sep = "\t";
my $debug = 0;

use Getopt::Std;
getopts('h') || die "$usage\n";
die "$usage\n" if (defined($opt_h));

@ARGV = ('-') unless @ARGV; # take STDIN when no arg.

### read in the data
my @data = ();
while (<>) {
    my @line = split /\s+/;

    my $name = $line[1];
    my $posi = $line[2];

    my $indID;
    my $cloneInfo;
    # "+?" is stingy match
    if ($name =~ /^(.+?)\.(.*)\.ab1$/) {
	$indID = $1;
	$cloneInfo = $2;
    } else {
	warn "WARN: $name doesn't match expectation\n";
	next;
    }
    
    @line = split(/-/, $cloneInfo);
    $line[4] =~ s/Al//;  # remove Al from "Al24.1.2"

    $indID = $indID . "_" . $line[4];
    my $entry = $indID . $sep . $posi;
    push @data, $entry ; 
}

# attaching a dummy line at the end, so the last data is analyzed
my $dummyLine = "dummy" . $sep . "dummy";
push @data, $dummyLine;

my $curPosi = "";
my @cache = ();
my %polySites = ();
my %singleton = ();
foreach my $i (0..$#data) {
    my ($name, $posi) = split /$sep/, $data[$i];

    # first line
    if ($i == 0) {
	$curPosi = $posi;
	@cache = ($name);
	next;
    }

    print "$name $posi $curPosi\n" if ($debug);
    if ($posi eq $curPosi) { # same site
	print "  cached\n" if ($debug);
	push @cache, $name;
	next;
    }
    
    # When it is reached here, moving to a new site, so process data
    @cache = sort(ExtractUnique(@cache));

    # Summarize and attach the data in appropriate places
    if (@cache == 1) {  # singleton
	if (defined($singleton{$cache[0]})) {
	    $singleton{$cache[0]} = $singleton{$cache[0]} . $sep . $curPosi;
	} else {
	    $singleton{$cache[0]} = $curPosi;
	}
	print "  singleton: $cache[0] => $singleton{$cache[0]}\n" if ($debug);
    } else {   # polymorphic site
	my $discrepancyClones =  join($sep, @cache);
	if(defined($polySites{$discrepancyClones})) {
	    $polySites{$discrepancyClones} = 
		$polySites{$discrepancyClones} . $ sep . $curPosi;
	} else {
	    $polySites{$discrepancyClones} = $curPosi;
	}
	print "  poly: $discrepancyClones => $polySites{$discrepancyClones}\n" if ($debug);
    }

    # reset
    @cache =($name);
    $curPosi = $posi;   
}

#if (scalar(keys(%polySites)) > 0) {
    foreach my $key (sort(keys(%polySites))) {
	print $key . $sep . $polySites{$key} . "\n";
    }
    print "\n\n";
#}

#if (scalar(keys(%singleton)) > 0) {
    foreach my $key (sort(keys(%singleton))) {
	print $key . $sep . $singleton{$key} . "\n";
    }
#}

exit (0);

# take a list as the argument and extract the unique elements.
# The order of elements will not be preserved.
sub ExtractUnique {
    my %seen=();
    my @unique = ();

    foreach my $item (@_) {
        push (@unique, $item) unless $seen{$item}++;
    }
    return @unique;
}

