#!/usr/bin/perl -w

# it runs ace2Fasta.perl with the most recent version of ace file.
# then it will harvest the contigs.

# Copyright 2006, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20100225
#   Fixed to deal with th new behavior of consed-19.0

use File::Find;
use File::Basename;

use Getopt::Std;
getopts('hr') || die $usage;
 
die $usage if (defined($opt_h));
die "$usage\nERROR: Please specify at least one source directory\n" 
    unless (@ARGV);

my @fileName =();
my @sourceDirName;

$binA2F = "ace2Fasta.perl";  # the name of executable.

foreach my $dir (@ARGV) {
    $dir =~ s@/$@@;              # Strip any trailing slash
    if (-d $dir) {
        push @sourceDirName, $dir;
	RunAce2Fasta($dir);
    } else {
        warn "Don't know how to handle argument '$dir', ignoring\n";
        next;
   }
}

# extract the plain files and symlinks
push @fileName, ListRegFilesRecursive(@sourceDirName);

# get the names of *.contigs file, but excluding fasta.screen.contigs
my @contigFiles = grep { /edit_dir\/[^\/]+\.contigs$/ } @fileName;
@contigFiles = grep { ! /\.fasta\.screen\.contigs$/ } @contigFiles;

# exclude miniassembly ace files.
@contigFiles = grep { ! /^mini\.\d+\.\d+\.fasta\.screen\.ace\.\d+$/ } @contigFiles;

@contigFiles = RemoveNewAndOld(@contigFiles);

# We got the list of files, go through each file
my %name = ();
foreach my $file (@contigFiles) {
    if ( -z $file) {
	warn "WARN: the file $file is empty.  Ignoring...\n";
	next;
    }

    my $bn = basename($file);
    $bn =~ s/\.contigs$//;
    my $sampleID = $bn;
    if (defined $name{$bn}) {
	$sampleID = $sampleID . "." . $name{$bn};
	warn "WARN: Duplicated sample names found. Using sample name " .
	    "\"$sampleID\" for contigs in $file.\n";
	$name{$bn}++;
    } else {
	$name{$bn}++;
    }

    # now modify the fasta sequence name and print out
    open (IN, "<$file") || die "ERROR: can't open $file\n";
    while (<IN>) {
	chomp;
	s/^\s+//; s/\s+$//;
	s/^>\s*Contig1\s*$/>$sampleID/;
	s/^>\s*Contig(\d+)\s*$/>${sampleID}_c$1/;
	print; print "\n";
    }
    close (IN);
}

exit (0);

# take a list of directories and returns the names of plain files and sym links
sub ListRegFilesRecursive {
    my @names =();
    find sub {push @names, $File::Find::name if (-f $_ || -l $_) }, @_;
    return @names;
}


# When phredPhrap got run multiple times, *New.contigs and *Old.contigs
# get created.  Remove these if correponding *.contigs exists.
#
# With phredPhrap came with consed 19.0, it creates *.YYMMDD_HHMMSS.contigs
# everytime it is executed.
# YYMMDD is the date and HHMMSS is the time when phredPhrap was run.
sub RemoveNewAndOld {
    my @result = ();

    # make a filename database
    my %seen = ();
    foreach my $key (@_) {
	$seen{$key} = 1;
    }
    
    foreach my $file (@_) {
	if ($file =~ /^(.+)New\.contigs$/ || $file =~ /^(.+)Old\.contigs/) {
	    next if ($seen{"$1" . ".contigs"}); #ignore these
	}

	# *.contigs with date
	if ($file =~ /^(.+)\.\d{6}_\d{6}\.contigs$/) {
	    next if ($seen{"$1" . ".contigs"}); #ignore these
	}
	push @result, $file;
    }

    return @result;
}


# take a directory name as an argument, and update *.contigs file
# by running ace2Fasta.perl.
sub RunAce2Fasta {
    my $dirName = shift;
    
    my $aceFile = 
	FindAceFile("$dirName/edit_dir");  # get the recent version of ace

    if($aceFile eq "") {  # no acefile
	warn "INFO: no ace file in $dirName, $binA2F was not run\n";
	return;
    }

    system("cd $dirName/edit_dir; $binA2F $aceFile > /dev/null 2>&1");

    return;
}

# return the most recent version of ace file in the directory.
sub FindAceFile {
    my $dir = shift;
    
    opendir (DIR, $dir) or die "Can't opendir $dir:$!";
    my $maxVersion = 1;
    my $name = "";
    while (defined($file = readdir(DIR))) {
	# ignore miniaseembly ace files
	next if ($file =~ /^mini\.\d+\.\d+\.fasta\.screen\.ace\.\d+$/);

	if ($file =~ /^(.+\.fasta\.screen\.ace)\.(\d+)$/) {
	    if ($name eq "") {
		$name = $1;
		$maxVersion = $2;
	    } else {
		if ($name ne $1) {
		    die "ERROR: more than 2 ace files with different name\n";
		}
		if ($maxVersion < $2) {
		    $maxVersion = $2;
		}
	    }
	}
    }

    if ($name eq "") {
	return ("");
    } else {
	return ("$name.$maxVersion");
    }
}
