#!/usr/bin/perl -w

# Copyright 2013, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20130612

my $usage = "\nUsage: $0 [-f columnNumber] [-hds] sourceDir1 [sourceDir2 ...]\n" .
  "  -h: help\n" .
  "  -s: Survey the directories.  Print out the summary of used Primer names\n".
  "      and sample IDs and names\n".
  "  -d: Print out the internal database of known values\n".

  "sourceDir contains *.ab1 files (output from ABI sequencer).  This " .
  "program assumes that the *.ab1 files follows this naming convention: " .
  "sampleID.templateType.ab1.  The portion before the 1st dot corresponds " .
  "the name of sample.  You can put whatever info in the templateType " .
  "section.  St. Louis naming convention follows this pattern.\n\n" .
  "Specifically in our lab, we use sampleID and templateType in following " .
  "format:\n " .
  "  sp-pop-indID.[gb]-[dc]-pcrPrimerSet-tissue-cloneID-seqPrimer-seqDate.ab1\n" .
  "cloneID has 3 dot-delimited fields: e.g. 041006:3.1.7, which means \n" .
  "sample (tube) number 3 of 2004/10/6 DNA extraction, \n".
  "clone # 7 from 1st PCR amplification.  \n\n".

  "This program has a database of possible values for primer Names, and \n" .
  "correspondence between sampleID (tube # and DNA extraction date).\n".
  "The program recursively go down the sourceDirs to find *.ab1 files, and\n".
  "check if the filename is legitimate.  The script also check if there is ".
  "any files with duplicated names.";

my $debug = 0;

use File::Find;
use File::Basename;
use Getopt::Std;

getopts('hf:ds') || die $usage;

die $usage if (defined($opt_h));

#die "$usage\nERROR: Please specify at least one source directory\n" 
#    unless @ARGV;


%nameDB = ();        # key is sample id, value is sp-pop-indID
%revNameDB = ();
%pcrPrimerSetDB = ();  # key is the posiible values in pcrPrimers section
%seqPrimerDB = ();

#if (defined($opt_t)) {  # -t genCodeTbl was given
#    open (ALLOWED_TBL, "<$opt_t") || die "ERROR: Can't open $opt_t\n";
#    %aminoAcid = InitDB(*ALLOWED_TBL);
#    close (ALLOWED_TBL);
#} else {                # use the default standard table supplied at the end
#    %aminoAcid = InitDB(*DATA);
#}
InitDB();


# print database and exit
if (defined($opt_d)) {  # print the database of known names, primers etc.
    PrintDB();
    exit(0);
}


# start processing arguments
my @fileName =();
my @sourceDirName;

if (defined($opt_f)) {  # arguments are the file
    my $colNum;
    
    if ($opt_f > 1) {
	$colNum = $opt_f - 1;
    } else {
	die "ERROR: -f requires an integer, corresponding to the column\n";
    }

    foreach my $file (@ARGV) {
	open STDIN, "<$file" || die "ERROR: Can't open $file\n";
	while (<STDIN>) {
	    s/^\s+//; s/\s+$//;
	    next if (/^$/);

	    my @line = split /\t/;
	    push @fileName, $line[$colNum];

	}
	close STDIN;
    }

} else { # arguments are directories
    foreach my $dir (@ARGV) {
	$dir =~ s@/$@@;              # Strip any trailing slash
	if (-d $dir) {
	    push @sourceDirName, $dir;
	} elsif (-f $dir) {
	    push @fileName, $dir;
	} else {
	    warn "Don't know how to handle argument '$dir'\n";
	    next; 
	} 
    }

    # extract the plain files and symlinks
    push @fileName, ListRegFilesRecursive(@sourceDirName);
    # print join "\n", @fileName, "\n";  # for debug

}
    

@fileName = SelectAB1Files (@fileName);

my @duped = CheckDuplicatedBasename (@fileName);
if (@duped > 0) {
    warn "ERROR: the filenames should be unique\n";
    warn (join "\n", @duped); 
    warn ("\n");
    die;
}

##
if (defined($opt_s)) {
    SurveyUsedNames(@fileName);
    exit(0);
}

## processing each file name
foreach my $f (@fileName) {
    my %infoH = Filename2Hash($f);
    CheckFilename(%infoH);
}

# if some files are already in the destination, they are removed from the list
#@fileName = CheckDestination(@fileName);

# Now it should be safe to make the symlinks.
#MakeSymlinks (@fileName);

#if (defined ($opt_r)) {
#    RunPhredPhrap (@fileName);
#}

exit (0);

# take a list of directories and returns the names of plain files and sym links
sub ListRegFilesRecursive {
    my @names =();
    find sub {push @names, $File::Find::name if (-f $_ || -l $_) }, @_;
    return @names;
}

# only select ab1 files
sub SelectAB1Files {
    my @result = ();
    foreach $file (@_) {
	my $id = GetIDFromFilename($file);
	if ($id eq "") {
	    warn "INFO: $file does not follow the naming convention " .
		"(i.e., templateID.type.ab1).  Ignoring ..." if ($debug);
	    next;
	}
	push @result, $file;
    }
    return @result;
}

sub GetIDFromFilename {
    my $name = shift;
    my $bn = basename $name;
    # "+?" is stingy match
    if ($bn =~ /^(.+?)\..*\.ab1$/) {
        return $1;
    } else {
        return "";
    }
}


sub CheckDuplicatedBasename {
    my %seen = ();
    my @dupPairs = ();
    foreach my $name (@_) {
	$base=basename($name);
	if ($seen{$base}) {
	    push @dupPairs, "$seen{$base} = $name";
	} else {
	    $seen{$base} = $name;
	}
    }
    return (@dupPairs);
}

sub Unique {
    my %seen = ();
    my @uniq = grep { ! $seen{$_} ++ } @_;
    return @uniq;
}

# Take a filename as an argument.
# Make a hash with information extracted from the basename
sub Filename2Hash {
    my ($templateName, $info);
    my $file = shift;
    my %result = ();

    my $fn = $file;
    $fn =~ s/\.ab1$//;
    $fn = basename $fn;

    # +? is stingy match
    if ($fn =~ /(.+?)\.(.+)/) {
	$templateName = $1;
	$info = $2;
    } else {
	warn "$file doesn't have '.' in the name.  Skipping...\n";
    }

    my @idArray = split /-/, $templateName;
    my @infoArray = split /-/, $info;

    if (@idArray != 3 || @infoArray != 7) {
	warn "$file doesn't conform to the format.  Is there extra '-'?\n";
    }

    $result{'species'} = $idArray[0];
    $result{'population'} = $idArray[1];
    $result{'individualID'} = $idArray[2];

    $result{'chemistry'} = $infoArray[0];
    $result{'dnaRna'} = $infoArray[1];
    $result{'pcrPrimerSet'} = $infoArray[2];
    $result{'tissueOrigin'} = $infoArray[3];
    $result{'cloneID'} = $infoArray[4];
    $result{'sampleID'} = $infoArray[4];
    $result{'sampleID'} =~ s/^(.+?)\..+$/$1/; # taking previous to 1st dot.
    $result{'seqPrimer'} = $infoArray[5];
    $result{'seqDate'} = $infoArray[6];

    $result{'fileName'} = $file;
    return %result;
}


sub CheckFilename {
    my %hash = @_;
    my @problem = ();

    my $sampleID = $hash{'sampleID'};

    my $id = "$hash{'species'}-$hash{'population'}-$hash{'individualID'}";

    if (defined($nameDB{$sampleID})) {
	if (($nameDB{$sampleID} ne $id)) {
	    if (defined($revNameDB{$id})) {
		push @problem,
		"- Mislabeling?\n" .
		    "  $nameDB{$sampleID} <- $sampleID and \n".
		    "  $id -> $revNameDB{$id} in database";
	    } else {
		push @problem, 
		"- Misspell in $id? $sampleID should be $nameDB{$sampleID}.";
	    }
	}
    } else {  # sample ID is not in DB
	if (defined($revNameDB{$id})) {  # other sample ID correspond to name
	    push @problem, 
	    "- $sampleID not in the database. But $id " .
		"-> $revNameDB{$id} in database.";
	} else {
	    push @problem, 
	    "- $sampleID & $id are not in the database.  Is ".
		"database updated in the script $0 ??";
	}
    }
    
    if ($hash{'chemistry'} !~ /^[gb]$/ ) {
	push @problem, 
	"- chemistry code is weird: $hash{'chemistry'}.";
    }
    
    if ($hash{'dnaRna'} !~ /^[dc]$/ ) {
	push @problem, 
	"- DNA (d) or cDNA (c)?:  $hash{'dnaRna'} is given.";
	
    }
    
    # not checking tissue Origin
    
    unless(defined($seqPrimerDB{$hash{'seqPrimer'}})) {
	push @problem, 
	"- Sequencing Primer: $hash{'seqPrimer'} not in the Database.";
    }
    
    unless(defined($pcrPrimerSetDB{$hash{'pcrPrimerSet'}})) {
	push @problem, 
	"- PCR Primers: $hash{'pcrPrimerSet'} not in the Database.";
    }

    my @clID = split /\./, $hash{'cloneID'};
    if (@clID != 3) {
	push @problem,
	"- clonedID ($hash{'cloneID'}) doesn't have 3 dot-delimited fields.";
    }
    # seqDate not checked

    if (@problem >=1) {
	print "$hash{'fileName'}\n", join("\n", @problem), "\n";
    }
}

# each entry in array contains 1 value, or 2 values delimited by spaces (\s+)
sub DBArray2Hash {
    my @arr = @_;
    my %result = ();

    foreach my $i (@arr) {
	$i =~ s/#.*$//;  # remove comments
	$i =~ s/^\s+//;	$i =~ s/\s+$//;
	next if ($i =~ /^$/);  # empty line

	my @entry = split /\s+/, $i;
	if (@entry == 1) {
	    push @entry, 1;  # add a dummy value 1
	} elsif (@entry > 2) {
	    die "ERROR: \"Possible values\" database should be space(s) ".
		"delimited, with 1-2 columns.\n" .
		"$i contains more than 2 columns.\n";
	}
	
	if (defined ($result{$entry[0]})) { # check if hash key exists
	    if ($result{$entry[0]} eq $entry[1]) {
		warn "WARN: ignoring duplicated entries for possible-value ".
		    "database: $i\n";
	    } else {
		die "ERROR: inconsistency in possible-values database. " .
		    "For the key $entry[0], 2 values defined: " .
		    "$result{$entry[0]} and $entry[1]\n";
	    }
	} else {
	    $result{$entry[0]} = $entry[1];
	}
    }
    return (%result);
}

# initialize 3 global hash tables:
# %nameDB = ();          # key is sample id, value is sp-pop-indID
# %revNameDB;            # key is sp.pop-indID, value is sample id('s)
# %pcrPrimerSetDB = ();  # key is the posiible values in pcrPrimers section
# %seqPrimerDB = ();
sub InitDB {
    my @db = ReadDefaultNameDB();
    %nameDB = DBArray2Hash(@db);
    @db = ReadDefaultPcrPrimerSetDB();
    %pcrPrimerSetDB = DBArray2Hash(@db);
    @db = ReadDefaultSeqPrimerDB();
    %seqPrimerDB = DBArray2Hash(@db);

    foreach my $key (keys(%nameDB)) {
	if(defined($revNameDB{$nameDB{$key}})) {
	    $revNameDB{$nameDB{$key}} = $revNameDB{$nameDB{$key}} . " $key";
	} else {
	    $revNameDB{$nameDB{$key}} = $key;
	}
    }
}

sub PrintDB{
    my $key;
    print "Known PCR primer sets:\n";
    foreach $key (sort(keys(%pcrPrimerSetDB))) {
	print "$key\n";
    }
    print "\nKnown Sequencing primers:\n";
    foreach $key (sort(keys(%seqPrimerDB))) {
	print "$key\n";
    }
    print "\nKnown sampleID and individual names:\n";
    foreach $key (sort(keys(%revNameDB))) {
	print "$key\t$revNameDB{$key}\n";
    }
}

sub PrintSortedKeys {
    my %hash = @_;

    foreach my $key (sort(keys(%hash))) {
	print "$key\n";
    }
}

# go through the directories, and summarize the ab1 files by printing
# all observed samples, primer sets, seq primers, etc.
# take a list of ab1 filenames as the argument
sub SurveyUsedNames {
#    my %seenName =();
    my %seenSampleID = ();
    my %seenPcrPrimerSet = ();
    my %seenSeqPrimer =();
    my %seenSeqDate = ();

    foreach my $f (@_) {
	my %hash = Filename2Hash($f);

	my $id = "$hash{'species'}-$hash{'population'}-$hash{'individualID'}";
	
#	unless(defined($seenName{$id})) {
#	    $seenName{$id} = 1;
#	}

	if(defined($seenSampleID{$hash{'sampleID'}})) {
	    if ($seenSampleID{$hash{'sampleID'}} ne $id) {
		my $i = 1;
		do {
		    $tmpID = $hash{'sampleID'} . "dup" . $i;
		    $i++;
		} while (defined($seenSampleID{$tmpID}) &&
			 ($seenSampleID{$tmpID} ne $id));
		$seenSampleID{$tmpID} = $id;
	    }
	} else {
	    $seenSampleID{$hash{'sampleID'}} = $id;
	}

	unless(defined($seenPcrPrimerSet{$hash{'pcrPrimerSet'}})) {
	    $seenPcrPrimerSet{$hash{'pcrPrimerSet'}} = 1;
	}
	unless(defined($seenSeqPrimer{$hash{'seqPrimer'}})) {
	    $seenSeqPrimer{$hash{'seqPrimer'}} = 1;
	}
	unless(defined($seenSeqDate{$hash{'seqDate'}})) {
	    $seenSeqDate{$hash{'seqDate'}} = 1;
	}
    }

    print "## Observed samples\n";
    foreach my $key  (sort(keys(%seenSampleID))) {
	print "$key\t$seenSampleID{$key}\n";
    }

    print "\n## Sequencing date, observed in all ab1 files\n";
    PrintSortedKeys(%seenSeqDate);
    print "\n## PCR primer sets, observed in all ab1 files\n";
    PrintSortedKeys(%seenPcrPrimerSet);
    print "\n## Sequencing primer, observed in all ab1 files\n";
    PrintSortedKeys(%seenSeqPrimer);
}


#############################################################################
### Allowed values
sub ReadDefaultNameDB {
    my $a = <<'NAMEDB';

040617:25	kamc-portage-3B
040617:26	kamc-darling-5A
040617:27	kamc-savage-6
040617:28	kamc-cParker-A
040617:29	kamc-cParker-B

041006:1	petr-plech-C
041006:2	petr-durn-A
041006:3	lyra-NY-A
041006:4	lyra-WI-A
041006:5	petr-plech-A
041006:6	petr-exeter-Z
041006:7	petr-plech-B
041006:8	petr-exeter-A
041006:9	lyra-WI-C
041006:10	kamc-portage-5AA
041006:11	kamc-savage-5
041006:12	kamc-bearCr-5C
041006:13	kamc-portage-2B
041006:14	kamc-bearCr-3B
041006:15	kamc-portage-4B
041006:16	kamc-portage-2A
041006:19	kamc-portage-1Z
041006:20	kamc-portage-4X
041006:21	kamc-darling-1AA
041006:22	kamc-savage-4A
041006:24	kamc-bearCr-3AA
041006:25	kamc-portage-3AA
041006:26	kamc-chena-13AA
041006:27	kamc-chena-20AA
041006:29	kamc-chena-9A
041006:64	kamc-chena-20B
041006:65	petr-reykjavik-NoID

041130:3	thal-kashmirIndia-CS6751
041130:5	thal-burrenIreland-CS1028
041130:6	thal-micklesFellUK-CS1362
041130:7	thal-miramareItaly-CS1379
041130:8	thal-sanElenoSpain-CS1502
041130:9	thal-marthasVineyard-CS1387
041130:10	thal-turinItaly-CS6875
041130:11	thal-yosemite-CS1622
041130:12	thal-toledo-CS8026
041130:14	kamc-goodnews-A
041130:15	kamc-goodnews-B
041130:16	kamc-goodnews-C
041130:19	thal-pamiroTadjikistan-CS57924
041130:21	thal-IbelMorocco-CS1244
041130:22	thal-oysteseNorway-CS1436
041130:23	thal-denmark-CS1220
041130:25	thal-ukraine-CS22623
041130:26	thal-yosemite-CS1622
041130:27	thal-japan-CS1564
041130:28	thal-czech-CS22589
041130:29	thal-lithuania-CS925
041130:30	thal-sampoMtRussia-CS22492
041130:31	thal-chisdraRussia-CS1076
041130:32	thal-rschewRussia-CS1491

# These are using 4 digits for the year
20041130:3	thal-kashmirIndia-CS6751
20041130:5	thal-burrenIreland-CS1028
20041130:6	thal-micklesFellUK-CS1362
20041130:7	thal-miramareItaly-CS1379
20041130:8	thal-sanElenoSpain-CS1502
20041130:9	thal-marthasVineyard-CS1387
20041130:21	thal-IbelMorocco-CS1244
20041130:25	thal-ukraine-CS22623
20041130:26	thal-yosemite-CS1622

050310:1	petr-esjaMtIceland-A
050310:4	petr-esjaMtIceland-B
050310:5	petr-reykjavik-B
050310:7	petr-austria-NoID
050310:9	drum-galenaAk-A
050310:10	drum-galenaAk-B
050310:14	kamc-rainbow:2004-3I
050310:15	kamc-rainbow:2004-1C
050310:16	kamc-rainbow:2004-11G
050310:18	kamc-exitGl:2004-2A
050310:19	kamc-exitGl:2004-25A
050310:20	kamc-ptarmiganCr-3C
050310:21	kamc-ptarmiganCr-2C
050310:22	kamc-ptarmiganCr-5F
050310:23	kamc-darling:2004-2E
050310:24	kamc-portage1:2004-11C

050428:1	petr-kurhu5Russ-NoID
050428:2	petr-braemarScotland-B
050428:3	petr-austria-NoID
050428:4	petr-braemarScotland-A
050428:5	petr-esjaMtIceland-B
050428:6	kawa-biwaJapan-NoID
050428:7	petr-esjaMtIceland-A
050428:8	petr-reykjavik-B
050428:9	kamc-goodnews-A
050428:10	kamc-goodnews-D
050428:11	kamc-goodnews-E
050428:12	kamc-goodnews-F
050428:18	lyra-WI-C

050524:3	kamc-ptarmiganCr-3D

050525:6	petr-reykjavik-B
050525:7	petr-esjaMtIceland-A
050525:15	kawa-biwaJapan-NoID
050525:18	drum-galena-C
050525:33	kamc-goodnews-H
050525:34	kamc-cParker-A
050525:35	kamc-portage-3AA
050525:36	kamc-portage-4:1
050525:37	kamc-chena-21E
050525:38	kamc-darling-1AA
050525:39	kamc-savage-4
050525:40	kamc-bearCr-3AA
050525:41	unkn-ALAV136266-NoID
050525:42	unkn-ALAV136266-NoID


20050310:10	drum-galenaAk-B
20050310:21	kamc-ptarmiganCr-2C
20050310:9	drum-galenaAk-A

#Following need to be renamed
At20041130:21	thal-IbelMorocco-CS1244
At20041130:22	thal-oysteseNorway-CS1436
At20041130:25	thal-ukraine-CS22623
At20041130:27	thal-japan-CS1564
At20041130:28	thal-czech-CS22589
At20041130:29	thal-lithuania-CS925

At20	thal-catalinaItaly-CS1094
At21	thal-berghaunBushschlag-CS1008
At23	thal-coiumbriaPortugal-CS1086
At24	thal-baselSwitz-CS1000
At50	thal-ukraine-CS22623
At51	thal-durhamOldOxford-1
At52	thal-durhamWaterford-noID
At53	thal-yosemite-CS1622
At54	thal-playaDeAroSpain-CS6916
At55	thal-marthasVineyard-CS1387
At56	thal-turinItaly-CS6875
At57	thal-toledo-CS8026
At58	thal-sanElenoSpain-CS1502
At59	thal-osthammarSweden-CS1430
7	thal-miramareItaly-CS1379
21	thal-berghaunBushschlag-CS1008
21dup1	thal-IbelMorocco-CS1244
55	thal-marthasVineyard-CS1387
56	thal-turinItaly-CS6875
59	thal-osthammarSweden-CS1430


NAMEDB

return (split /\n/, $a);
}

sub ReadDefaultPcrPrimerSetDB {
    my $a = <<'PCRDB';
Mea6fMea26r    # thal media promoter
Mea36fMea25r   # thal media promoter
Mea36fMea26r   # thal media promoter
Mea36fMea27r   # thal media promoter
Mea17fMea16r   # thal media coding
# lyrata media
#Mea6fMea26r
Mea6f36f26r25r
#Mea17fMea16r
#Mea36fMea25r
Mea36f25r9r
Mea36fMea44r
Mea48fMea40r
Mea51fMea41r
Mea52fMea40r
Mea54fMea42r

# CHS
CHSFOR1REV5
Chs1fChs1r

# HAT4
Hat1fHat2r
Hat3fHat3r

PCRDB

return (split /\n/, $a);
}

sub ReadDefaultSeqPrimerDB {
    my $a = <<'SEQDB';
M13F
M13R
# thaliana media promoter
Mea6f
Mea37F
Mea33R
Mea26r
# thaliana media coding
Mea17f
MeaTMOf
MeaTMOr
Mea36R
Mea44F
Mea11R
Mea12R
Mea45F
Mea14R
Mea42F
Mea16R
# lyrata
Mea12r
Mea14r
Mea16f
Mea20r
Mea37r
Mea41f
Mea43f
Mea45f
Mea36f

Mea45r
Mea21r

## need to be revised below this
11R
14R
33R
36R
37F
42F
44F
45F
M13F
M13R
Mea11R
Mea12R
Mea12r
Mea14R
Mea14r
Mea16F
Mea16R
Mea16f
Mea16r
Mea20R
Mea20r
Mea25r
Mea26R
Mea33R
Mea34r
Mea35r
Mea36R
Mea36f
Mea36r
Mea37F
Mea37R
Mea37r
Mea38r
Mea39f
Mea40f
Mea40r
Mea41F
Mea41f
Mea41r
Mea42F
Mea42f
Mea42r
Mea43F
Mea43f
Mea44F
Mea44F.2
Mea44r
Mea45F
Mea45F.2
Mea45R
Mea45f
Mea46f
Mea48f
Mea4f
Mea51f
Mea52f
Mea54f
MeaTMOf
MeaTMOr
TMOF
TMOR
TMOf
TMOr
mea11R
mea12r
mea14R
mea14r
mea16f
mea20r
mea36R
mea37r
mea41f
mea42F
mea43f
mea44F
mea45F
mea45f
meaTMOF
meaTMOR


# For chs
CHSFOR1
CHSREV3
CHSFOR3
CHSREV5
Chs1f
Chs1r

# For HAT4
Hat1f
Hat1r
Hat2f
Hat2r
Hat3f
Hat3r
Hat4f



SEQDB

return (split /\n/, $a);

}
