#!/usr/bin/perl -w

# Copyright 2013, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version: 20130612

my $usage=
    "Usage: $0 [-hr] [-d genomicDNA_SeqName] [-m mRNA_SeqName] [fastaFile]\n".
    " -h: help\n".
    " -d seqName: sequence name of genomic DNA\n".
    " -m seqName: sequence name of mRNA\n".
    " -r: returned range is using the site positions of genomicDNA \n".
    "     instead of aligned absolute position" .
    "Compare the genomicDNA and its mRNA, and find out the range specifier\n".
    "of exons.  The output can be fed to -s option of selectSites.pl\n".
    "If name of input file (fastaFile) is not given, STDIN is used.\n" .
    "If the fastaFile contains only two sequences, the longer sequence after\n".
    "removing all gaps are considered as the genomic DNA\n".
    "If the fastaFile contains more than two sequences, -d and -m are mandatory.\n".
    "If mRNA doesn't exactly match with genomicDNA, results may not be reliable.";

   
my $sep = "\t";
my $lineWidth = 70;   # used to break the long sequences into lines.

our($opt_h, $opt_r, $opt_m, $opt_d);

use Getopt::Std;
getopts('hm:d:r') || die "$usage\n";
die "$usage\n" if (defined($opt_h));

@ARGV = ('-') unless @ARGV; # take STDIN when no arg.
my $seqFile = shift @ARGV;
my @dat = ReadInFASTA($seqFile);

my $numSeq = @dat;

my @nam = GetSeqName(@dat);

if (defined($opt_m) && defined($opt_d)) {
    @selectedSeqs = ($opt_d, $opt_m);
    
    @dat = SelectSeqs(\@dat, \@selectedSeqs);
    
    if (@dat != 2) {
	die "ERROR: Please make sure that sequence names: $opt_m and $opt_d exist in".
	    "input file\n";
    }
} else {
    if ($numSeq == 2) {
	my @tmpDat = GetSeqDat(@dat);
	$tmpDat[0] =~ s/-//g;
	$tmpDat[1] =~ s/-//g;
	my $len1 = length($tmpDat[0]);
	my $len2 = length($tmpDat[1]);
	if ($len2 > $len1) {
	    warn ("INFO: 1st sequence is considered as mRNA\n");
	    @dat = ($dat[1], $dat[0]);  # flip the order
	} else {
	    warn ("INFO: 2nd sequence is considered as mRNA\n");
	}
    } else {
	die "ERROR: there are more than two sequences in the input file,\n".
	    "Please specify mRNA with -m and genomic DNA with -g\n\n$usage\n ";
    }
}


# @dat contains (genomic, mRNA)
my @seqDat = GetSeqDat(@dat);
my @genomicDNA = split //, $seqDat[0];
my @mRNA = split //, $seqDat[1];
my @exonAbs = ();
my @exonRel = ();
my @intronAbs = ();
my @intronRel = ();
my @weird = (); # sites where genomic is - but mRNA is not -
my @mismatch = ();
my $state = 'p';  # p = before genomicDNA starts, i = intron, e = exon

# Find the index to the last base of genomicDNA
my $last = $#genomicDNA;
while ($genomicDNA[$last] eq '-') {
    $last--;
}

my $genomicCntr = 0;
foreach my $i (0..$last) {
    if ($state eq 'p') {  # processing before genomic starts
	next if ($genomicDNA[$i] eq '-');
	$state = 'e';
    }

    my $ggg = $genomicDNA[$i];
    my $mmm = $mRNA[$i];

    if ($ggg eq '-') {
	if ($state eq 'i') {
	    push @intronAbs, $i+1;
	} else {
	    push @exonAbs, $i+1;
	}
	next if ($mmm eq '-');
	push @weird, $i;
	next;
    }

    # genomic is not '-'
    $genomicCntr++;
    if ($ggg eq $mmm) { # exon
	push @exonAbs, $i+1;
	push @exonRel, $genomicCntr;
	$state = 'e';
    } elsif ($mmm eq '-') { # intron
	push @intronAbs, $i+1;
	push @intronRel, $genomicCntr;
	$state = 'i';
    } else { # mismatch
	push @mismatch, $i+1;
	push @exonAbs, $i+1;
	push @exonRel, $genomicCntr;
	$state = 'e';
    }
}

#print "@exonAbs\n";

# handle weird sites
if (@weird > 0) {
    warn "INFO: following sites had gaps in genomic DNA, and non-gaps in mRNA, " .
	" non-gap chars in mRNA were considered as gaps:  " . 
	join(",", @weird) . "\n";
}
if (@mismatch > 0) {
    warn "INFO: following sites had mismatch between genomic DNA vs mRNA, ".
	"sites were considered as exon:  " . join(",", @mismatch) . "\n";
}

## final output
if (defined($opt_r)) {
    print CondenseIndex(@exonRel), "\n";

} else {
    print CondenseIndex(@exonAbs), "\n";
}

exit(0);

# Takes an array of integer, and create a range string.
# E.g. (2,3,4,6,9,11,12,13)  => "2-4,6,9,11-13"
# assumes an array is already sorted
sub CondenseIndex {
    my @arr = @_;
    my $prev = shift @arr;
    my $result = $prev;
    my $begin = $prev;
    
    foreach my $num (@arr) {
	if ($num == $prev + 1) {
	    $prev = $num;
	    next;
	} else {  # not continuous
	    if ($prev eq $begin) {
		$result .= ",$num";
	    } else {
		$result .= "-$prev,$num"
	    }
	    $begin = $num;
	    $prev = $num;
	}
    }

    # deal with the last element
    if ($arr[$#arr] == $arr[$#arr-1]+1) {
	$result .= "-$arr[$#arr]";
    }
    return $result;
}

# takes an arg; name of a file from which data are read Then read in
# the data and make an array.  Each element of this array corresponds
# to a sequence, name tab data.
sub ReadInFASTA {
    my $infile = shift;
    my @line;
    my $i = -1;
    my @result = ();
    my @seqName = ();
    my @seqDat = ();

    open (INFILE, "<$infile") || die "Can't open $infile\n";

    while (<INFILE>) {
        chomp;
        if (/^>/) {  # name line in fasta format
            $i++;
            s/^>\s*//; s/^\s+//; s/\s+$//;
            $seqName[$i] = $_;
            $seqDat[$i] = "";
        } else {
            s/^\s+//; s/\s+$//;
	    s/\s+//g;                  # get rid of any spaces
            next if (/^$/);            # skip empty line
            s/[uU]/T/g;                  # change U to T
            $seqDat[$i] = $seqDat[$i] . uc($_);
        }

	# checking no occurence of internal separator $sep.
	die ("ERROR: \"$sep\" is an internal separator.  Line $. of " .
	     "the input FASTA file contains this charcter. Make sure this " . 
	     "separator character is not used in your data file or modify " .
	     "variable \$sep in this script to some other character.\n")
	    if (/$sep/);

    }
    close(INFILE);

    foreach my $i (0..$#seqName) {
	$result[$i] = $seqName[$i] . $sep . $seqDat[$i];
    }
    return (@result);
}

sub GetSeqDat {
    my @data = @_;
    my @line;
    my @result = ();

    foreach my $i (@data) {
	@line = split (/$sep/, $i);
	push @result, $line[1];
    }

    return (@result)
}

sub GetSeqName {
    my @data = @_;
    my @line;
    my @result = ();

    foreach my $i (@data) {
	@line = split (/$sep/, $i);
	push @result, $line[0];
    }
    return (@result)
}

sub SelectSeqs {
    my ($seqARef, $nameARef) = @_;
    my @seqName = GetSeqName(@$seqARef);
    my @seqDat = GetSeqDat(@$seqARef);

    # make a hash table
    my %seqHash = ();
    for my $i (0..$#seqName) {
	if (exists($seqHash{$seqName[$i]})) {
	    die "ERROR: In fasta file, there are more than 1 entry " .
		"which has the name $seqName[$i]\n";
	} else {
	    $seqHash{$seqName[$i]} = $seqDat[$i];
	}
    }

    # select the specified seqs
    foreach my $name (@$nameARef) {
	if (exists($seqHash{$name})) {
	    my $tmp = $name . $sep . $seqHash{$name};
	    push @result, $tmp;
	} else {
	    warn "WARN: $name didn't occur in the input file\n";
	}
    }
    return @result;
}
