#!/usr/bin/perl -w # Copyright 2013, Naoki Takebayashi # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # Version: 20130612 my $usage = "\nUsage: $0 [-f columnNumber] [-hds] sourceDir1 [sourceDir2 ...]\n" . " -h: help\n" . " -s: Survey the directories. Print out the summary of used Primer names\n". " and sample IDs and names\n". " -d: Print out the internal database of known values\n". "sourceDir contains *.ab1 files (output from ABI sequencer). This " . "program assumes that the *.ab1 files follows this naming convention: " . "sampleID.templateType.ab1. The portion before the 1st dot corresponds " . "the name of sample. You can put whatever info in the templateType " . "section. St. Louis naming convention follows this pattern.\n\n" . "Specifically in our lab, we use sampleID and templateType in following " . "format:\n " . " sp-pop-indID.[gb]-[dc]-pcrPrimerSet-tissue-cloneID-seqPrimer-seqDate.ab1\n" . "cloneID has 3 dot-delimited fields: e.g. 041006:3.1.7, which means \n" . "sample (tube) number 3 of 2004/10/6 DNA extraction, \n". "clone # 7 from 1st PCR amplification. \n\n". "This program has a database of possible values for primer Names, and \n" . "correspondence between sampleID (tube # and DNA extraction date).\n". "The program recursively go down the sourceDirs to find *.ab1 files, and\n". "check if the filename is legitimate. The script also check if there is ". "any files with duplicated names."; my $debug = 0; use File::Find; use File::Basename; use Getopt::Std; getopts('hf:ds') || die $usage; die $usage if (defined($opt_h)); #die "$usage\nERROR: Please specify at least one source directory\n" # unless @ARGV; %nameDB = (); # key is sample id, value is sp-pop-indID %revNameDB = (); %pcrPrimerSetDB = (); # key is the posiible values in pcrPrimers section %seqPrimerDB = (); #if (defined($opt_t)) { # -t genCodeTbl was given # open (ALLOWED_TBL, "<$opt_t") || die "ERROR: Can't open $opt_t\n"; # %aminoAcid = InitDB(*ALLOWED_TBL); # close (ALLOWED_TBL); #} else { # use the default standard table supplied at the end # %aminoAcid = InitDB(*DATA); #} InitDB(); # print database and exit if (defined($opt_d)) { # print the database of known names, primers etc. PrintDB(); exit(0); } # start processing arguments my @fileName =(); my @sourceDirName; if (defined($opt_f)) { # arguments are the file my $colNum; if ($opt_f > 1) { $colNum = $opt_f - 1; } else { die "ERROR: -f requires an integer, corresponding to the column\n"; } foreach my $file (@ARGV) { open STDIN, "<$file" || die "ERROR: Can't open $file\n"; while () { s/^\s+//; s/\s+$//; next if (/^$/); my @line = split /\t/; push @fileName, $line[$colNum]; } close STDIN; } } else { # arguments are directories foreach my $dir (@ARGV) { $dir =~ s@/$@@; # Strip any trailing slash if (-d $dir) { push @sourceDirName, $dir; } elsif (-f $dir) { push @fileName, $dir; } else { warn "Don't know how to handle argument '$dir'\n"; next; } } # extract the plain files and symlinks push @fileName, ListRegFilesRecursive(@sourceDirName); # print join "\n", @fileName, "\n"; # for debug } @fileName = SelectAB1Files (@fileName); my @duped = CheckDuplicatedBasename (@fileName); if (@duped > 0) { warn "ERROR: the filenames should be unique\n"; warn (join "\n", @duped); warn ("\n"); die; } ## if (defined($opt_s)) { SurveyUsedNames(@fileName); exit(0); } ## processing each file name foreach my $f (@fileName) { my %infoH = Filename2Hash($f); CheckFilename(%infoH); } # if some files are already in the destination, they are removed from the list #@fileName = CheckDestination(@fileName); # Now it should be safe to make the symlinks. #MakeSymlinks (@fileName); #if (defined ($opt_r)) { # RunPhredPhrap (@fileName); #} exit (0); # take a list of directories and returns the names of plain files and sym links sub ListRegFilesRecursive { my @names =(); find sub {push @names, $File::Find::name if (-f $_ || -l $_) }, @_; return @names; } # only select ab1 files sub SelectAB1Files { my @result = (); foreach $file (@_) { my $id = GetIDFromFilename($file); if ($id eq "") { warn "INFO: $file does not follow the naming convention " . "(i.e., templateID.type.ab1). Ignoring ..." if ($debug); next; } push @result, $file; } return @result; } sub GetIDFromFilename { my $name = shift; my $bn = basename $name; # "+?" is stingy match if ($bn =~ /^(.+?)\..*\.ab1$/) { return $1; } else { return ""; } } sub CheckDuplicatedBasename { my %seen = (); my @dupPairs = (); foreach my $name (@_) { $base=basename($name); if ($seen{$base}) { push @dupPairs, "$seen{$base} = $name"; } else { $seen{$base} = $name; } } return (@dupPairs); } sub Unique { my %seen = (); my @uniq = grep { ! $seen{$_} ++ } @_; return @uniq; } # Take a filename as an argument. # Make a hash with information extracted from the basename sub Filename2Hash { my ($templateName, $info); my $file = shift; my %result = (); my $fn = $file; $fn =~ s/\.ab1$//; $fn = basename $fn; # +? is stingy match if ($fn =~ /(.+?)\.(.+)/) { $templateName = $1; $info = $2; } else { warn "$file doesn't have '.' in the name. Skipping...\n"; } my @idArray = split /-/, $templateName; my @infoArray = split /-/, $info; if (@idArray != 3 || @infoArray != 7) { warn "$file doesn't conform to the format. Is there extra '-'?\n"; } $result{'species'} = $idArray[0]; $result{'population'} = $idArray[1]; $result{'individualID'} = $idArray[2]; $result{'chemistry'} = $infoArray[0]; $result{'dnaRna'} = $infoArray[1]; $result{'pcrPrimerSet'} = $infoArray[2]; $result{'tissueOrigin'} = $infoArray[3]; $result{'cloneID'} = $infoArray[4]; $result{'sampleID'} = $infoArray[4]; $result{'sampleID'} =~ s/^(.+?)\..+$/$1/; # taking previous to 1st dot. $result{'seqPrimer'} = $infoArray[5]; $result{'seqDate'} = $infoArray[6]; $result{'fileName'} = $file; return %result; } sub CheckFilename { my %hash = @_; my @problem = (); my $sampleID = $hash{'sampleID'}; my $id = "$hash{'species'}-$hash{'population'}-$hash{'individualID'}"; if (defined($nameDB{$sampleID})) { if (($nameDB{$sampleID} ne $id)) { if (defined($revNameDB{$id})) { push @problem, "- Mislabeling?\n" . " $nameDB{$sampleID} <- $sampleID and \n". " $id -> $revNameDB{$id} in database"; } else { push @problem, "- Misspell in $id? $sampleID should be $nameDB{$sampleID}."; } } } else { # sample ID is not in DB if (defined($revNameDB{$id})) { # other sample ID correspond to name push @problem, "- $sampleID not in the database. But $id " . "-> $revNameDB{$id} in database."; } else { push @problem, "- $sampleID & $id are not in the database. Is ". "database updated in the script $0 ??"; } } if ($hash{'chemistry'} !~ /^[gb]$/ ) { push @problem, "- chemistry code is weird: $hash{'chemistry'}."; } if ($hash{'dnaRna'} !~ /^[dc]$/ ) { push @problem, "- DNA (d) or cDNA (c)?: $hash{'dnaRna'} is given."; } # not checking tissue Origin unless(defined($seqPrimerDB{$hash{'seqPrimer'}})) { push @problem, "- Sequencing Primer: $hash{'seqPrimer'} not in the Database."; } unless(defined($pcrPrimerSetDB{$hash{'pcrPrimerSet'}})) { push @problem, "- PCR Primers: $hash{'pcrPrimerSet'} not in the Database."; } my @clID = split /\./, $hash{'cloneID'}; if (@clID != 3) { push @problem, "- clonedID ($hash{'cloneID'}) doesn't have 3 dot-delimited fields."; } # seqDate not checked if (@problem >=1) { print "$hash{'fileName'}\n", join("\n", @problem), "\n"; } } # each entry in array contains 1 value, or 2 values delimited by spaces (\s+) sub DBArray2Hash { my @arr = @_; my %result = (); foreach my $i (@arr) { $i =~ s/#.*$//; # remove comments $i =~ s/^\s+//; $i =~ s/\s+$//; next if ($i =~ /^$/); # empty line my @entry = split /\s+/, $i; if (@entry == 1) { push @entry, 1; # add a dummy value 1 } elsif (@entry > 2) { die "ERROR: \"Possible values\" database should be space(s) ". "delimited, with 1-2 columns.\n" . "$i contains more than 2 columns.\n"; } if (defined ($result{$entry[0]})) { # check if hash key exists if ($result{$entry[0]} eq $entry[1]) { warn "WARN: ignoring duplicated entries for possible-value ". "database: $i\n"; } else { die "ERROR: inconsistency in possible-values database. " . "For the key $entry[0], 2 values defined: " . "$result{$entry[0]} and $entry[1]\n"; } } else { $result{$entry[0]} = $entry[1]; } } return (%result); } # initialize 3 global hash tables: # %nameDB = (); # key is sample id, value is sp-pop-indID # %revNameDB; # key is sp.pop-indID, value is sample id('s) # %pcrPrimerSetDB = (); # key is the posiible values in pcrPrimers section # %seqPrimerDB = (); sub InitDB { my @db = ReadDefaultNameDB(); %nameDB = DBArray2Hash(@db); @db = ReadDefaultPcrPrimerSetDB(); %pcrPrimerSetDB = DBArray2Hash(@db); @db = ReadDefaultSeqPrimerDB(); %seqPrimerDB = DBArray2Hash(@db); foreach my $key (keys(%nameDB)) { if(defined($revNameDB{$nameDB{$key}})) { $revNameDB{$nameDB{$key}} = $revNameDB{$nameDB{$key}} . " $key"; } else { $revNameDB{$nameDB{$key}} = $key; } } } sub PrintDB{ my $key; print "Known PCR primer sets:\n"; foreach $key (sort(keys(%pcrPrimerSetDB))) { print "$key\n"; } print "\nKnown Sequencing primers:\n"; foreach $key (sort(keys(%seqPrimerDB))) { print "$key\n"; } print "\nKnown sampleID and individual names:\n"; foreach $key (sort(keys(%revNameDB))) { print "$key\t$revNameDB{$key}\n"; } } sub PrintSortedKeys { my %hash = @_; foreach my $key (sort(keys(%hash))) { print "$key\n"; } } # go through the directories, and summarize the ab1 files by printing # all observed samples, primer sets, seq primers, etc. # take a list of ab1 filenames as the argument sub SurveyUsedNames { # my %seenName =(); my %seenSampleID = (); my %seenPcrPrimerSet = (); my %seenSeqPrimer =(); my %seenSeqDate = (); foreach my $f (@_) { my %hash = Filename2Hash($f); my $id = "$hash{'species'}-$hash{'population'}-$hash{'individualID'}"; # unless(defined($seenName{$id})) { # $seenName{$id} = 1; # } if(defined($seenSampleID{$hash{'sampleID'}})) { if ($seenSampleID{$hash{'sampleID'}} ne $id) { my $i = 1; do { $tmpID = $hash{'sampleID'} . "dup" . $i; $i++; } while (defined($seenSampleID{$tmpID}) && ($seenSampleID{$tmpID} ne $id)); $seenSampleID{$tmpID} = $id; } } else { $seenSampleID{$hash{'sampleID'}} = $id; } unless(defined($seenPcrPrimerSet{$hash{'pcrPrimerSet'}})) { $seenPcrPrimerSet{$hash{'pcrPrimerSet'}} = 1; } unless(defined($seenSeqPrimer{$hash{'seqPrimer'}})) { $seenSeqPrimer{$hash{'seqPrimer'}} = 1; } unless(defined($seenSeqDate{$hash{'seqDate'}})) { $seenSeqDate{$hash{'seqDate'}} = 1; } } print "## Observed samples\n"; foreach my $key (sort(keys(%seenSampleID))) { print "$key\t$seenSampleID{$key}\n"; } print "\n## Sequencing date, observed in all ab1 files\n"; PrintSortedKeys(%seenSeqDate); print "\n## PCR primer sets, observed in all ab1 files\n"; PrintSortedKeys(%seenPcrPrimerSet); print "\n## Sequencing primer, observed in all ab1 files\n"; PrintSortedKeys(%seenSeqPrimer); } ############################################################################# ### Allowed values sub ReadDefaultNameDB { my $a = <<'NAMEDB'; 040617:25 kamc-portage-3B 040617:26 kamc-darling-5A 040617:27 kamc-savage-6 040617:28 kamc-cParker-A 040617:29 kamc-cParker-B 041006:1 petr-plech-C 041006:2 petr-durn-A 041006:3 lyra-NY-A 041006:4 lyra-WI-A 041006:5 petr-plech-A 041006:6 petr-exeter-Z 041006:7 petr-plech-B 041006:8 petr-exeter-A 041006:9 lyra-WI-C 041006:10 kamc-portage-5AA 041006:11 kamc-savage-5 041006:12 kamc-bearCr-5C 041006:13 kamc-portage-2B 041006:14 kamc-bearCr-3B 041006:15 kamc-portage-4B 041006:16 kamc-portage-2A 041006:19 kamc-portage-1Z 041006:20 kamc-portage-4X 041006:21 kamc-darling-1AA 041006:22 kamc-savage-4A 041006:24 kamc-bearCr-3AA 041006:25 kamc-portage-3AA 041006:26 kamc-chena-13AA 041006:27 kamc-chena-20AA 041006:29 kamc-chena-9A 041006:64 kamc-chena-20B 041006:65 petr-reykjavik-NoID 041130:3 thal-kashmirIndia-CS6751 041130:5 thal-burrenIreland-CS1028 041130:6 thal-micklesFellUK-CS1362 041130:7 thal-miramareItaly-CS1379 041130:8 thal-sanElenoSpain-CS1502 041130:9 thal-marthasVineyard-CS1387 041130:10 thal-turinItaly-CS6875 041130:11 thal-yosemite-CS1622 041130:12 thal-toledo-CS8026 041130:14 kamc-goodnews-A 041130:15 kamc-goodnews-B 041130:16 kamc-goodnews-C 041130:19 thal-pamiroTadjikistan-CS57924 041130:21 thal-IbelMorocco-CS1244 041130:22 thal-oysteseNorway-CS1436 041130:23 thal-denmark-CS1220 041130:25 thal-ukraine-CS22623 041130:26 thal-yosemite-CS1622 041130:27 thal-japan-CS1564 041130:28 thal-czech-CS22589 041130:29 thal-lithuania-CS925 041130:30 thal-sampoMtRussia-CS22492 041130:31 thal-chisdraRussia-CS1076 041130:32 thal-rschewRussia-CS1491 # These are using 4 digits for the year 20041130:3 thal-kashmirIndia-CS6751 20041130:5 thal-burrenIreland-CS1028 20041130:6 thal-micklesFellUK-CS1362 20041130:7 thal-miramareItaly-CS1379 20041130:8 thal-sanElenoSpain-CS1502 20041130:9 thal-marthasVineyard-CS1387 20041130:21 thal-IbelMorocco-CS1244 20041130:25 thal-ukraine-CS22623 20041130:26 thal-yosemite-CS1622 050310:1 petr-esjaMtIceland-A 050310:4 petr-esjaMtIceland-B 050310:5 petr-reykjavik-B 050310:7 petr-austria-NoID 050310:9 drum-galenaAk-A 050310:10 drum-galenaAk-B 050310:14 kamc-rainbow:2004-3I 050310:15 kamc-rainbow:2004-1C 050310:16 kamc-rainbow:2004-11G 050310:18 kamc-exitGl:2004-2A 050310:19 kamc-exitGl:2004-25A 050310:20 kamc-ptarmiganCr-3C 050310:21 kamc-ptarmiganCr-2C 050310:22 kamc-ptarmiganCr-5F 050310:23 kamc-darling:2004-2E 050310:24 kamc-portage1:2004-11C 050428:1 petr-kurhu5Russ-NoID 050428:2 petr-braemarScotland-B 050428:3 petr-austria-NoID 050428:4 petr-braemarScotland-A 050428:5 petr-esjaMtIceland-B 050428:6 kawa-biwaJapan-NoID 050428:7 petr-esjaMtIceland-A 050428:8 petr-reykjavik-B 050428:9 kamc-goodnews-A 050428:10 kamc-goodnews-D 050428:11 kamc-goodnews-E 050428:12 kamc-goodnews-F 050428:18 lyra-WI-C 050524:3 kamc-ptarmiganCr-3D 050525:6 petr-reykjavik-B 050525:7 petr-esjaMtIceland-A 050525:15 kawa-biwaJapan-NoID 050525:18 drum-galena-C 050525:33 kamc-goodnews-H 050525:34 kamc-cParker-A 050525:35 kamc-portage-3AA 050525:36 kamc-portage-4:1 050525:37 kamc-chena-21E 050525:38 kamc-darling-1AA 050525:39 kamc-savage-4 050525:40 kamc-bearCr-3AA 050525:41 unkn-ALAV136266-NoID 050525:42 unkn-ALAV136266-NoID 20050310:10 drum-galenaAk-B 20050310:21 kamc-ptarmiganCr-2C 20050310:9 drum-galenaAk-A #Following need to be renamed At20041130:21 thal-IbelMorocco-CS1244 At20041130:22 thal-oysteseNorway-CS1436 At20041130:25 thal-ukraine-CS22623 At20041130:27 thal-japan-CS1564 At20041130:28 thal-czech-CS22589 At20041130:29 thal-lithuania-CS925 At20 thal-catalinaItaly-CS1094 At21 thal-berghaunBushschlag-CS1008 At23 thal-coiumbriaPortugal-CS1086 At24 thal-baselSwitz-CS1000 At50 thal-ukraine-CS22623 At51 thal-durhamOldOxford-1 At52 thal-durhamWaterford-noID At53 thal-yosemite-CS1622 At54 thal-playaDeAroSpain-CS6916 At55 thal-marthasVineyard-CS1387 At56 thal-turinItaly-CS6875 At57 thal-toledo-CS8026 At58 thal-sanElenoSpain-CS1502 At59 thal-osthammarSweden-CS1430 7 thal-miramareItaly-CS1379 21 thal-berghaunBushschlag-CS1008 21dup1 thal-IbelMorocco-CS1244 55 thal-marthasVineyard-CS1387 56 thal-turinItaly-CS6875 59 thal-osthammarSweden-CS1430 NAMEDB return (split /\n/, $a); } sub ReadDefaultPcrPrimerSetDB { my $a = <<'PCRDB'; Mea6fMea26r # thal media promoter Mea36fMea25r # thal media promoter Mea36fMea26r # thal media promoter Mea36fMea27r # thal media promoter Mea17fMea16r # thal media coding # lyrata media #Mea6fMea26r Mea6f36f26r25r #Mea17fMea16r #Mea36fMea25r Mea36f25r9r Mea36fMea44r Mea48fMea40r Mea51fMea41r Mea52fMea40r Mea54fMea42r # CHS CHSFOR1REV5 Chs1fChs1r # HAT4 Hat1fHat2r Hat3fHat3r PCRDB return (split /\n/, $a); } sub ReadDefaultSeqPrimerDB { my $a = <<'SEQDB'; M13F M13R # thaliana media promoter Mea6f Mea37F Mea33R Mea26r # thaliana media coding Mea17f MeaTMOf MeaTMOr Mea36R Mea44F Mea11R Mea12R Mea45F Mea14R Mea42F Mea16R # lyrata Mea12r Mea14r Mea16f Mea20r Mea37r Mea41f Mea43f Mea45f Mea36f Mea45r Mea21r ## need to be revised below this 11R 14R 33R 36R 37F 42F 44F 45F M13F M13R Mea11R Mea12R Mea12r Mea14R Mea14r Mea16F Mea16R Mea16f Mea16r Mea20R Mea20r Mea25r Mea26R Mea33R Mea34r Mea35r Mea36R Mea36f Mea36r Mea37F Mea37R Mea37r Mea38r Mea39f Mea40f Mea40r Mea41F Mea41f Mea41r Mea42F Mea42f Mea42r Mea43F Mea43f Mea44F Mea44F.2 Mea44r Mea45F Mea45F.2 Mea45R Mea45f Mea46f Mea48f Mea4f Mea51f Mea52f Mea54f MeaTMOf MeaTMOr TMOF TMOR TMOf TMOr mea11R mea12r mea14R mea14r mea16f mea20r mea36R mea37r mea41f mea42F mea43f mea44F mea45F mea45f meaTMOF meaTMOR # For chs CHSFOR1 CHSREV3 CHSFOR3 CHSREV5 Chs1f Chs1r # For HAT4 Hat1f Hat1r Hat2f Hat2r Hat3f Hat3r Hat4f SEQDB return (split /\n/, $a); }