#!/usr/bin/perl -w use Bio::SeqIO; my ($inFile, $outFile) = @ARGV; my $seqIN = Bio::SeqIO->new('-format' => 'genbank', '-file' => "$inFile"); my $seqOUT = Bio::SeqIO->new('-format' => 'fasta', '-file' => ">$outFile"); while(my $seq = $seqIN->next_seq()) { $seqOUT->write_seq($seq); }
# locate Bio/SeqIO.pm /usr/lib/perl5/vendor_perl/5.8.5/Bio/SeqIO.pmClass definition of Bio::SeqIO is included in the file.
Passing an object $seq of class Bio::Seq, which contains a single sequence data, to the method of ``output robot'' $seqOUT. This will write a single entry in fasta format.
perldoc Bio::Seq perldoc Bio::SeqIO
_
seq.
_
seq
#!/usr/bin/perl -w use Bio::SeqIO; use Bio::DB::GenBank; $genBank = new Bio::DB::GenBank; # This object knows how to talk to GenBank my $seq = $genBank->get_Seq_by_acc('AF060485'); # get a record by accession my $seqOut = new Bio::SeqIO(-format => 'genbank'); $seqOut->write_seq($seq);
#!/usr/bin/perl -w use Bio::SeqIO; use Bio::DB::GenBank; $genBank = new Bio::DB::GenBank; my $seq = $genBank->get_Seq_by_acc('AF060485'); # get a record by accession my $dna = $seq->seq(); # get the sequence as a string my $id = $seq->display_id(); # identifier my $acc = $seq->accession; # accession number my $desc = $seq->desc; # get the description # Note () after method is optional when there is no # argument/option is required print "ID: $id\naccession: $acc\nDescription: $desc\n$dna\n";
accession_number() | get the accession number |
display_id() | get identifier string |
description() or desc() | get description string |
seq() | get the sequence as a string |
length() | get the sequence length |
subseq($start, $end) | get a subsequence (char string) |
translate() | translate to protein (seq obj) |
revcom() | reverse complement (seq obj) |
species() | Returns an Bio::Species object |
There are many other methods to access the information.
#!/usr/bin/perl -w use Bio::SeqIO; use Bio::DB::GenBank; $genBank = new Bio::DB::GenBank; my $seq = $genBank->get_Seq_by_acc('AF060485'); # get a record by accession # Bio::SeqIO object use ">display_id desc" as the name line of FASTA $seq->display_id("ThalianaMedea"); $seq->desc(""); # take only first 200 bp my $shortened = $seq->subseq(1,200); $seq->seq($shortened); my $outObj = Bio::SeqIO->new(-format=>'fasta'); $outObj->write_seq($seq);
AF328996 aly13-01 # A. lyrata SRK, allele 1
AY186763 aly13-02 # A. lyrata SRK, allele 2
:
There are three tab-delimited columns: the first is GenBank accession number, the second is shortName you want to use for the sequence, and the third column contains any comments, which will be ignored.
getSeq.pl processes the input file, download the sequnce of each accession number, and print the all sequences in FASTA format (use the corresponding shortName in the second column).
_
transcript'', ``gene'', ``CDS'', ``exon'', and ``intron'' are
called primary tag. Following code will print out the available primary tags.
#!/usr/bin/perl -w use Bio::DB::GenBank; $genBank = new Bio::DB::GenBank; my $seq = $genBank->get_Seq_by_acc('Z19602'); # go through each primary feature tags foreach my $featObj ($seq->get_SeqFeatures) { print "# Primary tag: ", $featObj->primary_tag, "\n"; }
CDS join(27..186,284..582,835..914,1005..1320) /gene="HAT4" /codon_start=1 /protein_id="CAA79670.1" /db_xref="GI:22759" /db_xref="GOA:Q05466" /db_xref="UniProtKB/Swiss-Prot:Q05466" /translation="MMFEKDDLGLSLGLNFPKKQINLKSNPSVSVTPSSSSFGLFRRS ....."
foreach my $featObj ($seq->get_SeqFeatures) { if($featObj->primary_tag eq "CDS") { # extract the spliced sequence my $splicedSeqObj = $featObj->spliced_seq; $seqOut->write_seq($splicedSeqObj); # extract the exons and introns (= 27..1320 ) my $geneSeqObj = $featObj->seq; $seqOut->write_seq($geneSeqObj); # all sequence data (= 1..1352) my $allSeqObj = $featObj->entire_seq; $seqOut->write_seq($allSeqObj); } }
#!/usr/bin/perl -w use Bio::DB::GenBank; use Bio::Restriction::EnzymeCollection; use Bio::Restriction::Analysis; $genBank = new Bio::DB::GenBank; my $seq = $genBank->get_Seq_by_acc('AF060485'); # get a record by accession my $all_collection = Bio::Restriction::EnzymeCollection->new(); my $six_cutter_collection = $all_collection->cutters(6); my $analysis = Bio::Restriction::Analysis->new(-seq => $seq); # $seq is the Bio::Seq object for the DNA to be cut # Check the cut by each $enzyme (Bio::Restriction::Enzyme object) foreach my $enzyme ($six_cutter_collection->each_enzyme()) { @fragments = $analysis->fragments($enzyme); # returns an array of strings $numFrag = @fragments; # number of fragments if ($numFrag > 1) { # print the name of enzyme which cut this $seq print $enzyme->name(), "\t$numFrag\n"; } }
#!/usr/bin/perl -w use Bio::AlignIO; my ($inFile, $outFile) = @ARGV; $in = Bio::AlignIO->new(-file => "$inFile" , -format => 'fasta'); $out = Bio::AlignIO->new(-file => ">$outFile", -format => 'nexus'); my $aln = $in->next_aln(); # get entire alignment data $out->write_aln($aln);
#!/usr/bin/perl -w use Bio::AlignIO; my ($inFile) = @ARGV; $in = Bio::AlignIO->new(-file => "$inFile" , -format => 'fasta'); my $alnObj = $in->next_aln(); # get entire alignment data foreach my $seqObj ($alnObj->each_seq) { print $seqObj->display_id, "\n"; }
perldoc Bio::Align::AlignI for manipulating Align objects.
conv.pl inFormat outFormat inFileName
and print out to the stdout (use -file => \*STDOUT
, or omit
-
file option).
The following code reads in a genbank file with multiple sequences (unaligned), run Muscle alignment program, and print out the nexus file.
#!/usr/bin/perl -w use Bio::SeqIO; use BIO::AlignIO; use Bio::Tools::Run::Alignment::Muscle; my ($inFile) = @ARGV; my $seqIN = Bio::SeqIO->new('-format' => 'genbank', '-file' => "$inFile"); # read in the all sequences, and make an array of seq object my @seqObjArr = (); while(my $seq = $seqIN->next_seq()) { push @seqObjArr, $seq; } # prepare the interface to external program my $alignFactory = Bio::Tools::Run::Alignment::Muscle->new(); my $arrRef = \@seqObjArr; # taking the reference (address) of array my $alnObj = $alignFactory->align($arrRef); # print out the aligned file $out = Bio::AlignIO->new( -format => 'nexus'); $out->write_aln($alnObj);
bioperl-run (distributed separately from bioperl) contains modules to run/interface with external programs and documented here.
For multiple sequence alignment, bioperl can drive other popular programs such as clustalw or T-coffee. Additional tutorial to use these two programs. If you want to align a EST, cDNA, mRNA with a genomic sequnce, you might be interested in driving sim4
Also, EMBOSS contains many stand-alone sequence analysis programs. Bio::Factory::EMBOSS is included in bioperl-run, and you can drive these programs from bioperl. The documentationhttp://doc.bioperl.org/bioperl-run/Bio/Factory/EMBOSS.html is rather spartan at this moment.
When you are dealing with aligning coding region, it is frequently
convenient to translate the DNA, align at the aa, and insert the
corresponding gaps into the dna sequence. Take a look at
aa_to_dna_aln
in perlddoc Bio::Align::Utilities.
Here is an example.
SeqIO | FASTA, EMBL, GenBank, ... | |
AlignIO | ClustalW, Phylip, Nexus, Mega, ... | |
TreeIO | Newick, Nexus, lintree, ... | |
SearchIO | BLAST, FASTA, HMMER, ... (Results of database searches) | |
MapIO | MapMaker (Results of Genetic Map) | |
Matrix::IO | Phylip (e.g. Distance matrix). | |
Assembly::IO | ace (phrap, assembles contigs from sequence fragments) |
Here I show an example to access the remote blast sites with BioPerl.
This script reads in a fasta file (which can contain several sequences), access the NCBI blast via HTTP, select the records with expect value lower than specified, and return the accession numbers of all matching sequences. You can feed the output to getSeq.pl to retrieve the sequences.
./batchBlast.pl input.fasta > output.acc
#!/usr/bin/perl -w # batchBlast.pl # Takes a fasta file of DNA sequence(s), and get the accession number of hits use Bio::Tools::Run::RemoteBlast; $infile = shift @ARGV; my $prog = "blastn"; my $db = "nr"; my $e_val = "1e-5"; my $remoteBlast = Bio::Tools::Run::RemoteBlast->new(-prog => $prog, -data => $db, -expect => $e_val); my $r = $remoteBlast->submit_blast($infile); while (my @reqIDs = $remoteBlast->each_rid ) { print STDERR join(" ", "\nINFO: RIDs: ", @reqIDs), "\n"; foreach my $reqID (@reqIDs) { # each search results my $rc = $remoteBlast->retrieve_blast($reqID); if (! ref ($rc)) { if ($rc < 0) { # no match $remoteBlast->remove_rid($reqID); } # Search is not done yet, wait 10 sec, and try to retrieve again print STDERR "."; sleep (10); } else { # got some blast hit my $result = $rc->next_result; # get the blast output while(my $hit = $result->next_hit) { # print out the accession etc of all hits print $hit->accession, "\t\t# ", $hit->name, " ", $hit->description, ", e-Val: ", $hit->significance, "\n"; } print STDERR "\nINFO: removing $reqID\n"; $remoteBlast->remove_rid($reqID); # remove this RID since we # already got the results } } }
_
blast, an array of request IDs (RIDs) is
kept in the RemoteBlast object. The number of elements in the
array is equal to the number of sequences in the input fasta file.
_
rid method will returns the array of request IDs.
_
rid).
Bio::Tools::Run::RemoteBlast
Bio::Tools::BPlite
Bio::Tools::Blast
If you want to drive blast+, see this HOWTO.