#!/usr/bin/perl -w ########################################################################### # Quickly extract many subsequences from fasta files. Makes # critical assumption that all data lines (aside from the last) in the # FASTA file(s) are the same length! # # Copyright 2010 Philip Johnson. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published # by the Free Software Foundation, either version 3 of the License, # or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License at for # more details. # # # Input data: # 1. file w/ ranges -- [rest..] # 2. either fasta files OR file containing index of fasta files # (will build on demand if supply index parameter & file doesn't exist) # [3.] Optional alias file contains mapping for ids from first col # to second col; will be added to index: # ||... (special cased to process chr_NC_gi files # from NCBI) # # Output data: # either tab delimited: # [rest..] # or if -fastaOut: # >:- [rest..] # ########################################################################### use strict; use Getopt::Long; use lib <~>.'/bin'; use FaIndex; my ($idxFile, $aliasFile, $rangesFile, $justSeq, $ucBases, $fastaOut); if (!GetOptions('idx=s' => \$idxFile, 'aliases=s' => \$aliasFile, 'i=s' => \$rangesFile, 'justSeq' => \$justSeq, 'uc' => \$ucBases, 'fastaOut' => \$fastaOut) || !defined($rangesFile) || (@ARGV == 0 && !defined($idxFile))) { print "usage: $0 -i [-idx ] [-aliases ] [-justSeq] [-uc] [-fastaOut] \n". "\n". "Where:\n". "\t-alias file defines extra names for sequences (col1 maps to col2\n". "\t with the latter split by |); special case handling for\n". "\t 'chr_NC_gi' files from NCBI.\n". "\t-justSeq will only output sequence and not descriptive columns\n". "\t (chr, start, stop)\n". "\t-uc will upper case all bases (eliminating lc masking)\n". "\t-fastaOut will give output in fasta-format\n". "\n". "Must supply either index filename or fasta files. If both given,\n". "then index will be created (or updated if necessary) & saved to\n". "specified file.\n". "\n". "Input : [...]\n". "Output: [...]\n". " -or\n". " >:- [...]\n". " \n". "\n". "Note: coordinates are 1-based. Extraction is much faster if input ranges are grouped such that identical accessions\nare adjacent.\n". "\n"; exit 1; } my $idx; $idx = FaIndex->new(idx => $idxFile, fa => \@ARGV, alias => $aliasFile, undefOutOfRange => 1); open INFILE, $rangesFile; while () { chomp; my ($id, $sep1, $start, $sep2, $stop, $sep3, $remainder) = split /(:|,|(?<=\d)-|\s)/, $_, 4; if (!defined $stop && $start =~ /^[0-9]+$/) { $stop = $start; #ASSume requesting only one position $sep2 = $sep1; } die "fa_extract_many: invalid range ($_) around input line $.\n" if (!defined($stop)); my $seq = $idx->GetSeq($id, $start, $stop); if (!defined $seq) { warn "Not reporting data for $id:$start-$stop -- no data or off end of $id.\n"; next; } $remainder = defined($remainder) ? $sep3.$remainder : ''; if (defined $justSeq) { print STDOUT (defined $ucBases) ? uc $$seq : $$seq, "\n"; } elsif (defined $fastaOut) { print STDOUT ">$id:$start-$stop $remainder\n", (defined $ucBases) ? uc $seq : $$seq, "\n"; } else { print STDOUT $id, $sep1, $start, $sep2, $stop, "\t", (defined $ucBases) ? uc $seq : $$seq, $remainder, "\n"; } } close INFILE;