#!/usr/bin/perl -w

# Copyright 2013, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20130612

my $usage = "\nUsage: $0 [ace_filename]\n" .
  "  -h: help\n" .
  "If no argument is given, it will find the most recent ace file in the\n".
  "current directory." .
  "From ace file, it will summarize the correspondence between the read\n".
  "name and contig name.  For each individual, it prints the contigs, which\n".
  "each read was assembled into. If all reads are from a single extraction,\n".
  "extractionDate:tube# is printed once.  In the parenthes following cloneID\n".
  "(i.e., pcr#.clone#), sequence primers for the read(s) are printed\n";

use File::Basename;
use Getopt::Std;

my $sep = "\t";  # used for field delimitation

getopts('h') || die $usage;
die $usage if (defined($opt_h));

my $infile;
if (@ARGV == 1) {
    $infile = shift;
} elsif (@ARGV == 0) {
    $infile = FindAceFile(".");
    if ($infile eq "") {
	die "ERROR: ace file not found in the current directory\n$usage\n";
    }
    print "!!! INFO !!! $infile is used as the inputfile\n";
} else {
    die "\nERROR: provide 0 or 1 argument\n\n$usage\n";
}


my %contigName = ExtractReadsInContig($infile);

my %readsInContig = SummarizeForEachIndividual(%contigName);

my $curInd = "";
foreach my $k (sort(keys(%readsInContig))) {
#    print "KEY $k\n";
    my ($indID, $contigName)  = split /$sep/, $k;
    
    unless ($curInd eq $indID) {
	print "\n### $indID\n";
	$curInd = $indID;
    }

    my $clones = $readsInContig{$k};
    $clones =~ s/$sep/ /g;
    print "* $contigName=\t$clones\n";

}

exit(0);
sub FindAceFile {
    my $dir = shift;
    
    opendir (DIR, $dir) or die "Can't opendir $dir:$!";
    my $maxVersion = 1;
    my $name = "";
    while (defined($file = readdir(DIR))) {
	# ignore miniaseembly ace files
	next if ($file =~ /^mini\.\d+\.\d+\.fasta\.screen\.ace\.\d+$/);

	if ($file =~ /^(.+\.fasta\.screen\.ace)\.(\d+)$/) {
	    if ($name eq "") {
		$name = $1;
		$maxVersion = $2;
	    } else {
		if ($name ne $1) {
		    die "ERROR: more than 2 ace files with different name\n";
		}
		if ($maxVersion < $2) {
		    $maxVersion = $2;
		}
	    }
	}
    }

    if ($name eq "") {
	return ("");
    } else {
	return ("$name.$maxVersion");
    }
}

sub ExtractReadsInContig {
    my $infile = shift;

    my %result = ();

    open IN, "<$infile" || die "Can't open $infile\n";

    my $curContig = "";
    while(<IN>) {
	s/\s+$//;
	if (/^CO/) {  # contig name line
	    # CO Contig1 1447 1 1 U
	    # This line is like above, contig name is the 2nd element.
	    # I'm doing in more time consuming way in case contig name
	    # can have a space.  I don't know if spaces in the name is allowed.
	    my @line = split (/\s+/, $_);
	    my @name = splice(@line, 1, scalar(@line) - 5);
	    $curContig = join(" ", @name);

	} elsif (/^RD/) {  # read name line
	    # The line look like this
	    # RD lyra-WI-C.g-d-Mea6fMea26r-l-041006:9.1.4-M13F-050421.ab1 1615 0 28
	    my @line = split (/\s+/, $_);
	    my @name = splice(@line, 1, scalar(@line) - 3);
	    my $read = join(" ", @name);
	    
	    if ($curContig eq "") {
		die "ERROR: in $infile, Read name encountered before a contig".
		    " name.\n";
	    }

	    if (defined($result{$read})) {
		die "ERROR: $read is in several contigs: $result{$read} " .
		    "and $curContig\n";
	    }
	    $result{$read} = $curContig;
	}
	# other lines will be ignored
    }

    close (IN);
    return %result;
}


# returns a hash with key: templateName $sep contigName
# Value is the $sep delimited cloneIDs included in the contig.
#   cloneID(SeqPrimer1, SeqPrimer2, ...) $sep ....
sub SummarizeForEachIndividual {
    my %hash = @_;
    my %result = ();

    foreach my $read (keys(%hash)) {
	my %info = Filename2Hash($read);
	my $id = "$info{'species'}-$info{'population'}-$info{'individualID'}";

	$id = $id . $sep . $hash{$read}; # contig name is attached at the end
	my $value = $info{'cloneID'} . "-" . $info{'seqPrimer'};
	if (defined($result{$id})) {
	    $result{$id} = $result{$id} . $sep . $value;
	} else {
	    $result{$id} = $value;
	}
    }

    foreach my $i (keys(%result)) {
	$result{$i} = CleanContigEntry($result{$i});
	# remove duplicates.
    }
    
    return (%result);
}

sub CleanContigEntry {
    my $all = shift;
    my @clones = split /$sep/, $all;
    @clones = Unique(@clones);

    my %seqPrimer = ();

    # Check all clones have the same extractionDate:tubeID
    my @sampleID =();
    foreach my $i (@clones) {
	my @dotSep = split /\./, $i;
	push @sampleID, $dotSep[0];
    }

    @sampleID = Unique(@sampleID);
    my $abbrevFlag = (@sampleID == 1) ?  1 : 0;
    
    foreach my $i (@clones) {
	my @oneRead = split /-/, $i;
	if (@oneRead != 2) {
	    die "ERROR: in CleanContigEntry, $all\n";
	}
	my $cloneID = $oneRead[0];
	my $primer =  $oneRead[1];

	if ($abbrevFlag) {
	    $cloneID =~ s/^.+?\.(.+)/$1/; # stingy match
	}

	if (defined($seqPrimer{$cloneID})) {
	    $seqPrimer{$cloneID} = $seqPrimer{$cloneID} . ", $primer";
	} else {
	    $seqPrimer{$cloneID} = $primer;
	}
    }
    
    my @shortened = ();
    foreach my $k (sort(keys(%seqPrimer))) {
	push @shortened, "$k($seqPrimer{$k})";
    }
    my $formatted = join ($sep, @shortened);
    if ($abbrevFlag) {
	$formatted = "$sampleID[0], $formatted";
    }
    # each element is in the form: cloneID(seqPrimer1, seqPrimer2)
    return ($formatted);
}

# Take a filename as an argument.
# Make a hash with information extracted from the basename
sub Filename2Hash {
    my ($templateName, $info);
    my $file = shift;
    my %result = ();

    my $fn = $file;
    $fn =~ s/\.ab1$//;
    $fn = basename $fn;

    # +? is stingy match
    if ($fn =~ /(.+?)\.(.+)/) {
	$templateName = $1;
	$info = $2;
    } else {
	warn "$file doesn't have '.' in the name.  Skipping...\n";
    }

    my @idArray = split /-/, $templateName;
    my @infoArray = split /-/, $info;

    if (@idArray != 3 || @infoArray != 7) {
	warn "$file doesn't conform to the format.  Is there extra '-'?\n";
    }

    $result{'species'} = $idArray[0];
    $result{'population'} = $idArray[1];
    $result{'individualID'} = $idArray[2];

    $result{'chemistry'} = $infoArray[0];
    $result{'dnaRna'} = $infoArray[1];
    $result{'pcrPrimerSet'} = $infoArray[2];
    $result{'tissueOrigin'} = $infoArray[3];
    $result{'cloneID'} = $infoArray[4];
    $result{'sampleID'} = $infoArray[4];
    $result{'sampleID'} =~ s/^(.+?)\..+$/$1/; # taking previous to 1st dot.
    $result{'seqPrimer'} = $infoArray[5];
    $result{'seqDate'} = $infoArray[6];

    $result{'fileName'} = $file;
    return %result;
}

sub Unique {
    my %seen = ();
    my @uniq = grep { ! $seen{$_} ++ } @_;
    return @uniq;
}
