#!/usr/bin/perl -w

# Copyright 2014, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Version 20140911

# ChangeLog
# * Sep 11, 2014
# - deprecated get_Stream_by_batch() was replaced by get_Stream_by_id()
# * May 25, 2012
# - When ExtractCDS() can't find CDS feature, it returns original, unmodified
#   object instead of empty array.  So intead of printing nothing, it prints
#   unmodified (not spliced) sequences.

my $usage="Usage: $0 [-hcp] [accessionFile]\n" .
  " -c: print cds\n" .
  " -p: print translated proteins\n" .
  " STDIN is used as the input if no fastaFile is given\n" .
  "accessionFile should be tab delimited.  1st column (accesion No) " .
  "and 2nd column (seqName to be used) are used. " .
  "The other columns are ignored.  Lines with # are ignored\n";

my $accessionSep = "\t";
my $sep = "\t";

use Bio::DB::GenBank;
use Bio::SeqIO;
#use Bio::SeqFeature::Gene::GeneStructure;
#use Bio::SeqFeature::Gene::NC_Feature;

use Getopt::Std;

# handle optional flags
my %opts = ();
getopts('cpho:', \%opts) || die "$usage\n";

die "$usage\n" if ($opts{h});

my $outFormat = $opts{o} || 'fasta';
my $outFH = Bio::SeqIO->newFh ( -fh => \*STDOUT, -format => $outFormat );

# get the accession number file
@ARGV = ('-') unless @ARGV; # take STDIN when no arg
my $accessionFile = shift;

my @accession = ReadInAccessionFile ($accessionFile);
my @accNums = Column (1, @accession);
my @seqNames = Column (2, @accession);

my @seqs = DownloadSeqs(@accNums);

if (@seqs != @seqNames) {
    die "number of downloaded sequences are: " . scalar(@seqs) .
	" smaller than specified: " . scalar(@seqNames);
}

# print out
for my $i (0..$#seqs) {

    my @cds = ($opts{c} || $opts{p}) ? ExtractCDS($seqs[$i]) : ($seqs[$i]);

    my $numCDS = @cds;
    
    if ($numCDS > 1) {
	warn $seqs[$i]->accession_number ."(". $seqNames[$i].")". 
	    " has " . $numCDS . " CDS.\n";
    }
    
    for my $c (@cds) {
	print ">", $seqNames[$i];
	print "|" . $c->desc() if ($numCDS > 1); #, $c->seq());
	
	my $seqString = ($opts{p}) ? $c->translate->seq : $c->seq;
	if ($opts{p}) {
	    $seqString =~ s/\*\s*$//; # remove the termination codon at the end
	}
	print "\n$seqString\n";
    }
}

exit(0);
 
#Coderet($seqs[0], "cds");

# readin a file, and return an array.
# Each element contains accessionNo . $sep . seqName
sub ReadInAccessionFile {
    my $infile = shift;
    open (INFILE, "<$infile") || die "Can't open $infile\n";

    my @result = ();
    while (<INFILE>) {
	chomp;
	s/#.+$//; # remove comments;
	s/^\s+//; s/\s+$//;
	next if /^$/;
	my @line = split /$accessionSep/;
	if (@line < 2) {
	    warn "WARN: line $. contains < 2 columns. " .
		"1st column is accessionNo, 2nd column is seqName to be used.".
		    "  Skipping this line\n";
	    next;
	}
	push @result, $line[0] . $sep . $line[1]; # accession  seqName
    }
    return @result;
}

sub DownloadSeqs {
    my $batch = 1;       # 1 for batch downloading (quicker)
    my $batchSize = 20;  # 20 seems to be the max of Entrez batch download
    my @accNumArray = @_;
    my $gb = new Bio::DB::GenBank();
    my @seqArray = ();

    if ($batch) {
	while (my @tmpAccNums = splice (@accNumArray, 0, $batchSize)) {
	    my $seqio = $gb->get_Stream_by_id(\@tmpAccNums);
	    while (my $seq = $seqio->next_seq()) {
		push @seqArray, $seq;
	    }
	}
    } else {
	for my $acc (@accNumArray) {
	    $seq = $gb->get_Seq_by_acc($acc);
	    push @seqArray, $seq;
	}
    }
    return @seqArray;
}

# this is supposed to drive EMBOSS coderet, but it doesn't work.
# sub Coderet {
#     my $seq = shift;
#     my $type = shift; # should be cds|mrna|translation
#     use Bio::Factory::EMBOSS;
#     my $f = Bio::Factory::EMBOSS -> new();
#     my $coderet = $f->program('coderet');
    
#     my $tmpOutfile = '/tmp/coderet.out';
#     $coderet->run( { '-seqall' => $seq,
# 		     '-seqout' => $tmpOutfile
# 		     } );

#     #-mrna -translation
# #    my $seqIO = Bio::SeqIO->new( -file => $tmpOutfile);
# }

# returns an array of seq objects, each of them are CDS.
sub ExtractCDS {  # get seq object
    my $entry = shift;
    my $entryAcc = $entry->accession_number();  #$entry->id();

    my @result =();
    my $cdsNotFound = 1;
    foreach my $feat ($entry->top_SeqFeatures()) {
	next if ($feat->primary_tag() ne 'CDS');

	$cdsNotFound = 0;

	my $id = $feat->has_tag('gene') ? 
	    join(' ', $feat->each_tag_value('gene')) : 'no_gene';
	my $acc = $feat->has_tag('protein_id') ? 
	    join(' ', $feat->each_tag_value('protein_id')) : 'no_pid';
	my $desc = $feat->has_tag('product') ?
	    join(' ', $feat->each_tag_value('product')) : 'no_product';
	$desc="$id($acc)|$desc|from $entryAcc";
#	my $featseq;
#	eval { $featseq = $feat->seq(); };
#	if ( $@ ) {
#	    print STDERR 
#		"** can't get the correct sequence of CDS $id|$acc **\n";
#	    print STDERR "** skip it **\n";
#	}
	
	my $loc = $feat->location();
	my $splicedSeq = "";
	if($loc->isa('Bio::Location::SplitLocationI')) {
	    foreach my $subloc ($loc->sub_Location()) {
		my $subSeqFrag = $entry->trunc($subloc->start,$subloc->end);
		$subSeqFrag = $subSeqFrag->revcom if ($subloc->strand == -1);
		# $subloc->seq_id()
		$splicedSeq .= $subSeqFrag->seq();
	    }
	} else {
	    $splicedSeq = $entry->subseq($loc->start,$loc->end);
	}

	# adjust the reading frame
	my $codonStart = 1;
	if ($feat->has_tag('codon_start')) {
	    my @tmpCodonStarts = $feat->each_tag_value('codon_start');
	    warn "Weird, more than 1 codon_start\n" if (@tmpCodonStarts > 1);
	    $codonStart = $tmpCodonStarts[0];
	}
	$splicedSeq = substr($splicedSeq, $codonStart - 1);

	my $seqclass;
	if($entry->can_call_new()) {
	    $seqclass = ref($entry);
	} else {
	    $seqclass = 'Bio::PrimarySeq';
	    $entry->_attempt_to_load_Seq();
	}
	
	my $thisCDS = 
	    $seqclass->new('-seq' => $splicedSeq,
			   '-display_id' =>$entry->display_id() . "|$id($acc)",
			   '-accession_number' => $entryAcc,
			   '-alphabet' => $entry->alphabet,
			   '-desc' => $desc,
			   );
	
	push @result, (($loc->strand == -1) ? $thisCDS->revcom : $thisCDS);
    }

    if ($cdsNotFound || @result == 0) {
	warn ("WARN: in extractCDS, CDS feature not found for ", 
	      $entry->accession_number, " returning the original Sequence\n");
	@result = ($entry);
    }
    return @result;
}

sub Column {
    my $col = shift;
    my @array = @_;
    my @result = ();
    for my $i (@array) {
	my @line = split /$sep/, $i;
	push @result, ((@line >= $col) ? $line[$col-1] : "NA") ;
    }
    return @result;
}

sub simpleExamples {
    # this returns a Seq object :
    my $seqobj = $gb->get_Seq_by_id('MUSIGHBA1');
    # this returns a Seq object :
    my $seqobj2 = $gb->get_Seq_by_acc('AF303112');

    print $seqobj->seq(), "\n";
    
    # other methods
    $seqobj->display_id(); # the human read-able id of the sequence
    $seqobj->seq();        # string of sequence
    $seqobj->subseq(5,10); # part of the sequence as a string
    $seqobj->accession_number(); # when there, the accession number
    $seqobj->alphabet();   # one of 'dna','rna','protein'
    $seqobj->primary_id(); # a unique id for this sequence irregardless
                           # of its display_id or accession number
    $seqobj->desc();       # a description of the sequence

    # this returns a SeqIO object :
    my $seqio = $gb->get_Stream_by_id([ qw(J00522 AF303112 2981014)]);

    my $out = Bio::SeqIO->new(-file => ">/tmp/junk" , '-format' => 'FASTA');
    while (my $seq = $seqio->next_seq()) {
	print "Sequence ",$seq->id," first 10 bases ",$seq->subseq(1,10),"\n";
	$out->write_seq($seq);
    }
}

sub extCDSOld {
    my $entry = shift;
    my $entryid = $entry->id();

    my @result =();
    foreach my $feat ($entry->top_SeqFeatures()) {
	if ($feat->primary_tag() eq 'CDS') {
	    my $id = $feat->has_tag('gene') ? 
		join(' ', $feat->each_tag_value('gene')) : 'no_gene';
	    my $acc = $feat->has_tag('protein_id') ? 
		join(' ', $feat->each_tag_value('protein_id')) : 'no_pid';
	    my $desc = $feat->has_tag('product') ?
		join(' ', $feat->each_tag_value('product')) : 'no_product';

	    my $featseq;
	    eval { $featseq = $feat->seq(); };
	    if (  $@ ) {
		print STDERR 
		    "** can't get the correct sequence of CDS $id|$acc **\n";
		print STDERR "** skip it **\n";
		next;
	    }

	    $featseq->id("$id|$acc");
	    
	    $featseq->desc("$desc |from $entryid");
	    
	    push @result, $featseq;
	}
    }
    return @result;
}


sub PrintFeature {
    my $seqfile = shift;
    my $format = "genbank";
    my $seqio  = Bio::SeqIO->new (-format => $format , -file => $seqfile);
    my $seqobj = $seqio->next_seq();
    
    foreach my $feat ( $seqobj->all_SeqFeatures() ) {
	if ($feat->has_tag('translation')) {
	    if ($feat->has_tag('protein_id')) {
		my @protein_id = $feat->each_tag_value('protein_id');
		my $protein_id = join " ", @protein_id;
	    } elsif ((! $show_gene) && $feat->has_tag('gene')) {
		@gene = $feat->each_tag_value('gene');
		$protein_id = join " ", @gene;
	    } else {
		$protein_id = "";
	    }
	    
	    if ($show_gene && $feat->has_tag('gene')) {
		@gene = $feat->each_tag_value('gene');
		$gene = join " ", @gene;
	    } else {
		$gene = "";
	    }
	    
	    if ($feat->has_tag('product')) {
		@product = $feat->each_tag_value('product');
		$product = join " ", @product;
	    } else {
		$product = "";
	    }
	    
	    $desc = "$protein_id $gene $product";
	    $desc .= " (" . $feat->start . "," . $feat->end . ")";

	    $desc =~ s/\s+/ /g;
	    
	    @translation = $feat->each_tag_value('translation');
	    
	    print ">$desc\n", $translation[0],"\n";
	}
    }
}

sub printFT {
    my $dummy = shift;
    my @feat = $dummy->all_SeqFeatures();
#$dummy->transcripts();
#$dummy->add_transcript_as_features(@feat);
    for $i (@feat) {
	print $i->primary_tag, "\n";
	if ($i->primary_tag eq 'CDS') {
	    $loc = $i->location();
	    @sublocs = $loc->sub_Location();
	    foreach my $j (@sublocs) {
		print $j->start, "++", $j->end, "\n";
	    }
#	$out = Bio::SeqIO->new(-file => ">/tmp/junk" , '-format' => 'FASTA');
#	$out->write_seq($spSeq);
	}
    }
}
