#!/usr/bin/perl -w
###########################################################################
#
# Lift over single coordinates from one genome to another -- but do it
# waay faster than UCSC's liftOver program
#
# Copyright 2010 Philip Johnson.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License at for
# more details.
#
# Two requirements:
# 1) query coordinates must be sorted by chromosome and position.
# 2) chain file must be split into non-overlapping blocks and
# sorted according to the same key as the query coords
#
# 24 Aug 2010: ug..added stranding to output (assume input is +)
#
# Input data:
# [other fields optional]
# Output data (remapped):
# [other fields optional]
#
###########################################################################
use strict;
use Getopt::Long;
use File::Temp;
my ($posFile, $chainFile, $reSort, $contigBases);
$contigBases = 1;
if (!GetOptions('pos=s' => \$posFile,
'chain=s' => \$chainFile,
'reSort' => \$reSort,
'contiguousBases=i' => \$contigBases) ||
!defined($posFile && $chainFile)) {
print "usage: $0 -pos -chain [-contig ] [-reSort]\n".
"Where\n".
"\t-pos <> file w/ coordinates to be lifted\n".
"\t-chain <> chain file must be sorted!\n".
"\t-contig <> specifies a minimum number of contigous mapped bases\n".
"\t centered around the coordinate to be lifted\n".
"\t-reSort sorts the positions to be lifted, then resorts the output\n".
"\t back to the original order (i.e. potentially slow)\n".
"\n".
"Input data (must be sorted or specify -reSort; coordinates are 1-based):
[other fields optional]
Output data (mapped on new genome):
[other fields propagated through]
-or-
Unmappable:
\n";
exit 1;
}
$contigBases = int($contigBases / 2); #convert to +/- window
my $OUTFILE = \*STDOUT;
if (defined $reSort) {
SortPosfile($posFile);
$OUTFILE = new File::Temp;
}
my ($tName, $tSize, $tStrand, $tStart, $tEnd,
$qName, $qSize, $qStrand, $qStart, $qEnd);
my ($size, $dt, $dq);
open POS, $posFile or die $!;
open CHAIN, $chainFile or die $!;
my ($prevAcc, $prevPos) = ('', 0);
$tName = '';
$tStart = 0;
while (defined($_ = )) {
chomp;
my ($fromAcc, $fromPos, $remainder) = split /\s+/, $_, 3;
--$fromPos; #convert to zero-based
if ($fromAcc lt $prevAcc || #protect against forgetfulness
($fromAcc eq $prevAcc && $fromPos < $prevPos)) {
die "Input file not sorted! fastLift requires sorted input.\n";
}
$prevPos = $fromPos;
$prevAcc = $fromAcc;
while (!eof(CHAIN) && ($tName lt $fromAcc ||
($tName eq $fromAcc && $tEnd <= $fromPos)) &&
LoadNextBlock()) {
#print "$tName, $fromAcc; $fromPos, $tStart, $tEnd\n";
}
if ($tName ne $fromAcc || $fromPos < $tStart+$contigBases || $fromPos > $tEnd-$contigBases) {
print "Unmappable:\t$fromAcc\t", $fromPos+1, "\n";
next;
}
my $toAcc = $qName;
my $toPos = ($qStrand eq '+') ?
$qStart + ($fromPos - $tStart):
$qSize - ($qStart + ($fromPos - $tStart)) - 1;
++$toPos; #convert to one-based
$remainder = (defined $remainder) ? "\t".$remainder : '';
print $OUTFILE join("\t", $toAcc, $toPos, $qStrand), $remainder, "\n";
}
close CHAIN;
close POS;
if (defined $reSort) {
ResortOutfile($OUTFILE);
}
sub LoadNextBlock {
my $l = ;
while (defined $l && $l =~ /^$/) { #skip blank lines separating chains
$l = ;
}
return 0 if !defined $l;
if ($l =~ /^c/) {
(undef, undef, $tName, $tSize, $tStrand, $tStart, $tEnd,
$qName, $qSize, $qStrand, $qStart, $qEnd) = split /\s+/, $l;
$l = ;
} else {
$tStart += $size + $dt;
$qStart += $size + $dq;
}
($size, $dt, $dq) = split /\s+/, $l;
$tEnd = $tStart + $size;
$qEnd = $qStart + $size;
return 1;
}
#------------------------------------------------------------------------------
sub SortPosfile {
my ($origFile) = @_;
my $tmpfile = new File::Temp;
system(q*perl -ne 'chomp; print "$_\t$.\n"' *." $origFile | sort -k1,1 -k2n,2 > $tmpfile") == 0 or die;
$_[0] = $tmpfile;
}
sub ResortOutfile {
my ($outFile) = @_;
my $cmd = q*perl -nae 'print scalar(@F)' *.$outFile;
my $field = `$cmd`;
system("sort -k${field}n,$field $outFile | ".q*perl -nae 'print join("\t", @F[0..($#F-1)]), "\n"'*) == 0 or die;
}