#!/usr/bin/perl -w
###########################################################################
# Quickly extract many subsequences from fasta files. Makes
# critical assumption that all data lines (aside from the last) in the
# FASTA file(s) are the same length!
#
# Copyright 2010 Philip Johnson.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License at for
# more details.
#
#
# Input data:
# 1. file w/ ranges -- [rest..]
# 2. either fasta files OR file containing index of fasta files
# (will build on demand if supply index parameter & file doesn't exist)
# [3.] Optional alias file contains mapping for ids from first col
# to second col; will be added to index:
# ||... (special cased to process chr_NC_gi files
# from NCBI)
#
# Output data:
# either tab delimited:
# [rest..]
# or if -fastaOut:
# >:- [rest..]
#
###########################################################################
use strict;
use Getopt::Long;
use lib <~>.'/bin';
use FaIndex;
my ($idxFile, $aliasFile, $rangesFile, $justSeq, $ucBases, $fastaOut);
if (!GetOptions('idx=s' => \$idxFile,
'aliases=s' => \$aliasFile,
'i=s' => \$rangesFile,
'justSeq' => \$justSeq,
'uc' => \$ucBases,
'fastaOut' => \$fastaOut) ||
!defined($rangesFile) ||
(@ARGV == 0 && !defined($idxFile))) {
print "usage: $0 -i [-idx ] [-aliases ] [-justSeq] [-uc] [-fastaOut] \n".
"\n".
"Where:\n".
"\t-alias file defines extra names for sequences (col1 maps to col2\n".
"\t with the latter split by |); special case handling for\n".
"\t 'chr_NC_gi' files from NCBI.\n".
"\t-justSeq will only output sequence and not descriptive columns\n".
"\t (chr, start, stop)\n".
"\t-uc will upper case all bases (eliminating lc masking)\n".
"\t-fastaOut will give output in fasta-format\n".
"\n".
"Must supply either index filename or fasta files. If both given,\n".
"then index will be created (or updated if necessary) & saved to\n".
"specified file.\n".
"\n".
"Input : [...]\n".
"Output: [...]\n".
" -or\n".
" >:- [...]\n".
" \n".
"\n".
"Note: coordinates are 1-based. Extraction is much faster if input ranges are grouped such that identical accessions\nare adjacent.\n".
"\n";
exit 1;
}
my $idx;
$idx = FaIndex->new(idx => $idxFile,
fa => \@ARGV,
alias => $aliasFile,
undefOutOfRange => 1);
open INFILE, $rangesFile;
while () {
chomp;
my ($id, $sep1, $start, $sep2, $stop, $sep3, $remainder) =
split /(:|,|(?<=\d)-|\s)/, $_, 4;
if (!defined $stop && $start =~ /^[0-9]+$/) {
$stop = $start; #ASSume requesting only one position
$sep2 = $sep1;
}
die "fa_extract_many: invalid range ($_) around input line $.\n" if (!defined($stop));
my $seq = $idx->GetSeq($id, $start, $stop);
if (!defined $seq) {
warn "Not reporting data for $id:$start-$stop -- no data or off end of $id.\n";
next;
}
$remainder = defined($remainder) ? $sep3.$remainder : '';
if (defined $justSeq) {
print STDOUT (defined $ucBases) ? uc $$seq : $$seq, "\n";
} elsif (defined $fastaOut) {
print STDOUT ">$id:$start-$stop $remainder\n",
(defined $ucBases) ? uc $seq : $$seq, "\n";
} else {
print STDOUT $id, $sep1, $start, $sep2, $stop, "\t",
(defined $ucBases) ? uc $seq : $$seq, $remainder, "\n";
}
}
close INFILE;