#!/usr/bin/perl -w
#
# This program is used to batch-process (base call and assemble) ABI trace.
# It extract the information about the source of DNA from the filename,
# and assemble contigs for each source.
#

# Copyright 2006, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20130612

my $usage = "\nUsage: $0 [-hr] sourceDir1 [sourceDir2 ...]\n" .
    "  -h: help\n" .
    "  -r: run phredPhrap in the updated directory\n\n" .

    "sourceDir contains *.ab1 files (output from ABI sequencer).  This " .
    "program assumes that the *.ab1 files follows this naming convention: " .
    "sampleID.templateType.ab1.  The portion before the 1st dot corresponds " .
    "the name of sample.  You can put whatever info in the templateType " .
    "section.  St. Louis naming convention follows this pattern.  The program recursively go down the sourceDirs to find *.ab1 files, and creates symlinks in ./sampleID/chromat_dir/, pointing to the *.ab1 files inside of the sourceDir.  The script make sure that there is no file with duplicated names.  Additionally, if correct symlinks already exists, no new symlink will be created.";

use File::Find;
use File::Basename;

use Getopt::Std;

getopts('hr') || die $usage;

die $usage if (defined($opt_h));

die "$usage\nERROR: Please specify at least one source directory\n" 
    unless @ARGV;

my @fileName =();
my @sourceDirName;

foreach my $dir (@ARGV) {
    $dir =~ s@/$@@;              # Strip any trailing slash
    if (-d $dir) {
	push @sourceDirName, $dir;
    } elsif (-f $dir) {
	push @fileName, $dir;
    } else {
	warn "Don't know how to handle argument '$dir'\n";
	next;
   }
}

# extract the plain files and symlinks
push @fileName, ListRegFilesRecursive(@sourceDirName);
# print join "\n", @fileName, "\n";  # for debug

@fileName = SelectAB1Files (@fileName);
my @duped = CheckDuplicatedBasename (@fileName);

if (@duped > 0) {
    warn "ERROR: the filenames should be unique\n";
    warn (join "\n", @duped); 
    warn ("\n");
    die;
}

# if some files are already in the destination, they are removed from the list
@fileName = CheckDestination(@fileName);

# Now it should be safe to make the symlinks.
MakeSymlinks (@fileName);

if (defined ($opt_r)) {
    RunPhredPhrap (@fileName);
}

exit (0);

# take a list of directories and returns the names of plain files and sym links
sub ListRegFilesRecursive {
    my @names =();
    find sub {push @names, $File::Find::name if (-f $_ || -l $_) }, @_;
    return @names;
}

# only select ab1 files
sub SelectAB1Files {
    my @result = ();
    foreach $file (@_) {
	my $id = GetIDFromFilename($file);
	if ($id eq "") {
	    warn "INFO: $file does not follow the naming convention: " .
		"templateID.type.ab1.  Ignoring ...";
	    next;
	}
	push @result, $file;
    }
    return @result;
}

sub CheckDuplicatedBasename {
    my %seen = ();
    my @dupPairs = ();
    foreach my $name (@_) {
	$base=basename($name);
	if ($seen{$base}) {
	    push @dupPairs, "$seen{$base} = $name";
	} else {
	    $seen{$base} = $name;
	}
    }
    return (@dupPairs);
}

sub GetIDFromFilename {
    my $name = shift;
    my $bn = basename $name;
    # "+?" is stingy match
    if ($bn =~ /^(.+?)\..*\.ab1$/) {
	return $1;
    } else {
	return "";
    }
}

sub CheckDestination {
    my @result = ();
    foreach my $file (@_) {
	my $id = GetIDFromFilename($file);
	if ($id eq "") {
	    warn "INFO: $file does not follow the naming convention: " .
		"templateID.type.ab1.  Ignoring ...";
	    next;
	}
	if ( -e $id) {
	    if (-d $id) {
		unless ( -d "$id/chromat_dir" && -d "$id/edit_dir" &&
			-d "$id/phd_dir" ) {
		    die "ERROR: destination directory, $id, doesn't contain " .
			"consed dir structure: chromat_dir, edit_dir, and " .
			"phd_dir\n";
		}
	    } else {
		die "PROBLEM with destination: non-directory with name $id " .
		    "exists\n";
	    }
	} else {
	    push @result, $file;
	    next;
	}
	
	# the destination directory exists, so check duplication
	my $base = basename($file);
	unless (-e "$id/chromat_dir/$base") {
	    push @result, $file;
	    next;	    
	}
	
	# there is already a file with same name
	my ($destDev, $destIno) = stat ("$id/chromat_dir/$base");
	my ($newDev, $newIno) = stat $file;
	
	# the file exists.
	if ($destDev == $newDev && $destIno == $newIno) {
	    warn "INFO: ignoring $file (= $id/chromat_dir/$base)\n";
	    # Ignore this file
	    next;
	}
	
	### hmm,
	warn "ERROR: $id/chromat_dir/$base exists, symlink to $file " .
	    "can't be made\n";
	die;
    }

    return @result;
}

sub MakeSymlinks {
    foreach my $file (@_) {
	my $id = GetIDFromFilename($file);
	unless (-e $id) {
	    MakeConsedDirStructure($id);
	}
	my $bn = basename $file;

	print "$id/chromat_dir/$bn -> $file\n";
	if (! symlink "../../$file", "$id/chromat_dir/$bn") {
	    warn "WARN: could not make symlink: $id/chromat_dir/$bn -> $file\n";
	}
    }
}

sub MakeConsedDirStructure {
    my $name = shift;
    system ("mkdir -p $name/chromat_dir");
    system ("mkdir -p $name/edit_dir");
    system ("mkdir -p $name/phd_dir");
}

sub RunPhredPhrap {
    my @updatedSamples = ();
    foreach $file (@_) {
	push @updatedSamples, GetIDFromFilename($file);
    }
    @updatedSamples = Unique(@updatedSamples);

    foreach $dir (@updatedSamples) {
	system("cd $dir/edit_dir/;phredPhrap");
    }
}

sub Unique {
    my %seen = ();
    my @uniq = grep { ! $seen{$_} ++ } @_;
    return @uniq;
}
