###########################################################################
# FF_Index --> Flat-File index package. Written by Philip Johnson.
# Version 3 which ties to hash
#
# Copyright 2010 Philip Johnson.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License at for
# more details.
#
###########################################################################
=head1 NAME
FF_Index - Flat file indexer ver. 3.1
=head1 SYNOPSIS
use lib <~>.'/bin';
use FF_Index;
my $idx = FF_Index->new($filename, \&FindNextCB, inMemory => 1);
print $idx->{key} if defined $idx->{key};
=head1 DESCRIPTION
FF_Index creates an on-disk index (named $filename.idx) of records stored in $filename. Records are defined by the supplied callback function ("FindNextCB" above). If the index is small enough, tell FF_Index to keep it in memory via the "inMemory" option.
=head1 CONSTRUCTOR
=head2 FF_Index->new()
$idx = FF_Index->new($filename, \&FindNextCB);
$idx = FF_Index->new($filename, \&FindNextCB,
cmpCB => \&ComparisionCB,
|=~-optional arguments below-~=|
inMemory => 1,
returnPos => 1,
multi => 0,
idx => '/tmp/myfile.idx');
Creates a new index (if necessary) and returns a reference to a tied hash that can be used to access the data.
=over 4
=item filename
The data file
=item FindNextCB
Callback function that takes two parameters: a filehandle and a hash reference. Returns true upon success and hash contains a "key" and a "pos" (tell() position within the filehandle). If eof, returns false.
=item cmpCB
Callback function for comparing two keys. Defaults to 'cmp' operator.
=item inMemory
If == 0 (default), then uses disk-based binary search.
If >= 1, then stores index in memory.
If >= 2, then stores *entire* data file in memory. Make sure you have enough RAM!
=item returnPos
When querying a key, returns position (byte #) in file; otherwise returns an open file handle pointing to this position (the default).
=item multi
Allow multiple records with the same key (return list when querying). Otherwise will throw an error when indexing.
=item idxEE
Use the specified filename (and path, if supplied) for the index; otherwise defaults to ".idx".
=item forceNoUpdateIdx
Dangerous!! Will *NOT* update index even if timestamp on data file is younger.
=back
=head1 CALLBACK EXAMPLE
sub FindNext($$) {
my ($fh, $record) = @_;
while (defined($_ = <$fh>) && !/^>/) {} #skip to next
if (defined($_)) {
($record->{key}) = /^>(\w+)/;
$record->{pos} = tell($fh) - length($_);
return 1;
} else {
return 0;
}
}
=head1 OTHER METHODS
=head2 GetKeys()
my @keys = tied(%$idx)->GetKeys();
Note this ONLY works if using an in-memory index.
=head1 AUTHOR - Philip Johnson
=cut
package FF_Index;
use strict;
use Carp;
use IO::File;
our($VERSION) = 3.1;
our($IDXVERSION) = 3.1;
#-----------------------------------------------------------------------------
# PRE : filename of flat file, reference to comparison function
# POST: FF_Index object constructed
sub new($$$%) {
my ($class, $filename, $findNextCB, %args) = @_;
my $this = {};
$this->{m_Filename} = $filename;
$this->{m_CmpCB} = $args{cmpCB};
$this->{m_MemIndex} = (defined($args{inMemory}) && $args{inMemory} > 0 ?
{} : undef);
$this->{m_MemData} = (defined($args{inMemory}) && $args{inMemory} > 1);
$this->{m_ReturnPos} = (defined($args{returnPos}) && $args{returnPos});
$this->{m_Multi} = (defined($args{multi}) && $args{multi});
$this->{m_IdxFilename} = defined($args{idx}) ? $args{idx} :
$filename.'.idx';
$args{forceNoUpdateIdx} = 0 if !defined($args{forceNoUpdateIdx});
if ($this->{m_MemData}) {
open DATAFILE, $filename or croak "'$filename': $!";
read DATAFILE, $this->{m_MemData}, (-s $filename) or
croak "Error trying to load '$filename' into memory: $!";
close DATAFILE;
$this->{m_FileHandle}=IO::File->new(\$this->{m_MemData}, '<:')
or croak "$!";
} else {
$this->{m_FileHandle} = IO::File->new($filename) or
croak "'$filename': $!";
}
if (!(-e $this->{m_IdxFilename}) ||
((-M $this->{m_IdxFilename} > -M $filename) &&
!$args{forceNoUpdateIdx})) {
if (!defined $findNextCB) {
croak "No index found for '$filename' and no callback provided.\n";
}
CreateIndex($this, $findNextCB);
};
if (defined $this->{m_MemIndex}) {
LoadIndex($this);
}
my $dummy = {};
tie %$dummy, $class, $this;
return $dummy;
}
#-----------------------------------------------------------------------------
# PRE : none
# POST: FF_Index object destroyed
sub DESTROY {
my $this = shift;
close $this->{m_FileHandle} if defined $this->{m_FileHandle};
}
#-----------------------------------------------------------------------------
# PRE : reference to function that scans for the next record in the file
# POST: key/position index created & saved to disk
sub CreateIndex($$) {
my ($this, $findNextCB) = @_;
seek($this->{m_FileHandle}, 0, 0);
my @index;
my %record;
my $c=0;
while (&$findNextCB($this->{m_FileHandle}, \%record)) {
print STDERR "Indexing $c\r" if (++$c % 10000 == 0);
if (!defined $record{key} || !defined $record{pos}) {
croak "Error in FINDNEXT callback function\n";
}
push @index, [$record{key}, $record{pos}];
}
open INDEX, ">$this->{m_IdxFilename}" or
confess "Error opening '$this->{m_IdxFilename}': $!\n";
print INDEX "#FF_Index.pm index version $IDXVERSION\n";
my $prev;
foreach my $record
(defined($this->{m_CmpCB}) ?
sort {$this->{m_CmpCB}($a->[0], $b->[0])} @index :
sort {$a->[0] cmp $b->[0]} @index)
{
if (defined $prev && $record->[0] eq $prev) {
if ($this->{m_Multi}) {
print INDEX ",", $record->[1];
} else {
confess("Multiple records with key '$prev' and '-multi' ".
"option not specified to FF_Index.");
}
} else {
print INDEX "\n" if defined $prev;
print INDEX join("\t", @$record);
}
$prev = $record->[0];
}
print INDEX "\n";
close INDEX or confess "$!\n";
}
#-----------------------------------------------------------------------------
# PRE : index exists on disk
# POST: index loaded into memory
sub LoadIndex($) {
my ($this) = @_;
$this->{m_MemIndex} = defined($this->{m_CmpCB}) ? [] : {};
open INDEX, $this->{m_IdxFilename} or
confess "'$this->{m_IdxFilename}': $!";
while () {
next if /^#/;
chomp;
my ($key, $pos) = split;
if (!defined($this->{m_CmpCB})) { #hash if possible
$this->{m_MemIndex}->{$key} = $pos;
} else { #array otherwise
push @{$this->{m_MemIndex}}, [$key, $pos];
}
}
close INDEX;
}
#-----------------------------------------------------------------------------
# PRE : memory index
# POST: list of keys
sub GetKeys($) {
my ($this) = @_;
croak "FF_Index::GetKeys called without a memory-based index!"
if !defined($this->{m_MemIndex});
return keys(%{$this->{m_MemIndex}});
}
#-----------------------------------------------------------------------------
# PRE : two scalars
# POST: the larger of the two
sub max($$) {
return ($_[0] > $_[1]) ? $_[0] : $_[1];
}
#-----------------------------------------------------------------------------
# PRE : open filehandle
# POST: seeked to beginning of current line
sub SeekLineBegin($) {
my ($fh) = @_;
my ($buff, $pos);
do {
my $prevPos = tell($fh);
my $newPos = max(0, $prevPos - 50);
seek($fh, $newPos, 0);
read($fh, $buff, $prevPos - $newPos);
$pos = rindex($buff, "\n");
seek($fh, $newPos, 0);
} until ($pos >= 0 || tell($fh) == 0);
seek(INDEX, tell($fh)+$pos+1, 0);
}
#-----------------------------------------------------------------------------
# PRE : key
# POST: location from index on disk
sub UseDiskIndex($$) {
my ($this, $target) = @_;
my $fwd = 0;
my $rev = (-s $this->{m_IdxFilename}) - 1;
open INDEX, $this->{m_IdxFilename} or
confess "'$this->{m_IdxFilename}': $!";
while ($fwd <= $rev) {
my $mid = int(($fwd + $rev) / 2);
seek(INDEX, $mid,0);
SeekLineBegin(\*INDEX);
my $line = ;
#print STDERR "$mid\t$line";
chomp $line;
my ($key, $pos) = split /\t/, $line;
my $res = (defined $this->{m_CmpCB} ?
&{$this->{m_CmpCB}}($target, $key) :
$target cmp $key);
if ($res < 0) {
if ($rev == $mid) {
#$fwd -= 2;
#print STDERR "yep--$target\n"
}
$rev = $mid-1;#tell(INDEX) - length($line);
} elsif ($res > 0) {
if ($fwd == $mid) {
#print STDERR "hmmm--$target\n";
}
$fwd = $mid+1;#tell(INDEX) - length($line);
} else {
close INDEX;
return $pos;
}
}
close INDEX;
return undef;
}
#-----------------------------------------------------------------------------
# PRE : key
# POST: location from index in memory
sub UseMemIndex($$) {
my ($this, $target) = @_;
if (!defined $this->{m_CmpCB}) { #hash if possible; otherwise binary search
return $this->{m_MemIndex}->{$target};
}
my $fwd = 0;
my $rev = @{$this->{m_MemIndex}}-1;
while ($fwd <= $rev) {
my $mid = int(($fwd + $rev) / 2);
my $entry = ${$this->{m_MemIndex}}[$mid];
my $res = &{$this->{m_CmpCB}}($target, $entry->[0]);
if ($res < 0) {
$rev = $mid-1;
} elsif ($res > 0) {
$fwd = $mid+1;
} else {
return $entry->[1];
}
}
return undef;
}
#-----------------------------------------------------------------------------
# PRE : none
# POST: filehandle open on data
sub CreateFH($) {
my ($this) = @_;
if ($this->{m_MemData}) {
return IO::File->new(\$this->{m_MemData}, '<:')
or croak "$!";
} else {
return IO::File->new($this->{m_Filename}) or
croak "'$this->{m_Filename}': $!";
}
}
#-----------------------------------------------------------------------------
# PRE : key
# POST: filehandle pointing to this record (undef if doesn't exist)
sub GetRecord($$) {
my ($this, $target) = @_;
croak "GetRecord called with undefined target!" if !defined $target;
my $pos = (!defined $this->{m_MemIndex}) ?
$this->UseDiskIndex($target) :
$this->UseMemIndex($target);
if (!defined $pos) {
return undef;
}
if ($this->{m_ReturnPos}) {
return $this->{m_Multi} ? [split(/,/, $pos)] : $pos;
} # if not m_ReturnPos, then return an open filehandle pointing to this pos
if (!defined fileno($this->{m_FileHandle})) {
$this->{m_FileHandle} = $this->CreateFH();
# if ($this->{m_MemData}) {
# $this->{m_FileHandle}=IO::File->new(\$this->{m_MemData}, '<:')
# or croak "$!";
# } else {
# $this->{m_FileHandle} = IO::File->new($this->{m_Filename}) or
# croak "'$this->{m_Filename}': $!";
# }
}
my @pos = split(/,/, $pos);
seek($this->{m_FileHandle}, shift(@pos), 0) or confess "$!";
if (!$this->{m_Multi}) {
return $this->{m_FileHandle};
} else {
my $fh = [$this->{m_FileHandle}];
foreach my $p (@pos) {
push @$fh, $this->CreateFH();
seek($fh->[-1], $p, 0) or die $!;
}
return $fh;
}
}
#-----------------------------------------------------------------------------
# Tied methods
sub TIEHASH {
my ($class, $this) = @_;
return bless $this, $class;
}
sub FETCH {
my ($this, $target) = @_;
return $this->GetRecord($target);
}
#perl wants modules to return "TRUE"
1;