########################################################################### # FF_Index --> Flat-File index package. Written by Philip Johnson. # Version 3 which ties to hash # # Copyright 2010 Philip Johnson. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published # by the Free Software Foundation, either version 3 of the License, # or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License at for # more details. # ########################################################################### =head1 NAME FF_Index - Flat file indexer ver. 3.1 =head1 SYNOPSIS use lib <~>.'/bin'; use FF_Index; my $idx = FF_Index->new($filename, \&FindNextCB, inMemory => 1); print $idx->{key} if defined $idx->{key}; =head1 DESCRIPTION FF_Index creates an on-disk index (named $filename.idx) of records stored in $filename. Records are defined by the supplied callback function ("FindNextCB" above). If the index is small enough, tell FF_Index to keep it in memory via the "inMemory" option. =head1 CONSTRUCTOR =head2 FF_Index->new() $idx = FF_Index->new($filename, \&FindNextCB); $idx = FF_Index->new($filename, \&FindNextCB, cmpCB => \&ComparisionCB, |=~-optional arguments below-~=| inMemory => 1, returnPos => 1, multi => 0, idx => '/tmp/myfile.idx'); Creates a new index (if necessary) and returns a reference to a tied hash that can be used to access the data. =over 4 =item filename The data file =item FindNextCB Callback function that takes two parameters: a filehandle and a hash reference. Returns true upon success and hash contains a "key" and a "pos" (tell() position within the filehandle). If eof, returns false. =item cmpCB Callback function for comparing two keys. Defaults to 'cmp' operator. =item inMemory If == 0 (default), then uses disk-based binary search. If >= 1, then stores index in memory. If >= 2, then stores *entire* data file in memory. Make sure you have enough RAM! =item returnPos When querying a key, returns position (byte #) in file; otherwise returns an open file handle pointing to this position (the default). =item multi Allow multiple records with the same key (return list when querying). Otherwise will throw an error when indexing. =item idxEE Use the specified filename (and path, if supplied) for the index; otherwise defaults to ".idx". =item forceNoUpdateIdx Dangerous!! Will *NOT* update index even if timestamp on data file is younger. =back =head1 CALLBACK EXAMPLE sub FindNext($$) { my ($fh, $record) = @_; while (defined($_ = <$fh>) && !/^>/) {} #skip to next if (defined($_)) { ($record->{key}) = /^>(\w+)/; $record->{pos} = tell($fh) - length($_); return 1; } else { return 0; } } =head1 OTHER METHODS =head2 GetKeys() my @keys = tied(%$idx)->GetKeys(); Note this ONLY works if using an in-memory index. =head1 AUTHOR - Philip Johnson =cut package FF_Index; use strict; use Carp; use IO::File; our($VERSION) = 3.1; our($IDXVERSION) = 3.1; #----------------------------------------------------------------------------- # PRE : filename of flat file, reference to comparison function # POST: FF_Index object constructed sub new($$$%) { my ($class, $filename, $findNextCB, %args) = @_; my $this = {}; $this->{m_Filename} = $filename; $this->{m_CmpCB} = $args{cmpCB}; $this->{m_MemIndex} = (defined($args{inMemory}) && $args{inMemory} > 0 ? {} : undef); $this->{m_MemData} = (defined($args{inMemory}) && $args{inMemory} > 1); $this->{m_ReturnPos} = (defined($args{returnPos}) && $args{returnPos}); $this->{m_Multi} = (defined($args{multi}) && $args{multi}); $this->{m_IdxFilename} = defined($args{idx}) ? $args{idx} : $filename.'.idx'; $args{forceNoUpdateIdx} = 0 if !defined($args{forceNoUpdateIdx}); if ($this->{m_MemData}) { open DATAFILE, $filename or croak "'$filename': $!"; read DATAFILE, $this->{m_MemData}, (-s $filename) or croak "Error trying to load '$filename' into memory: $!"; close DATAFILE; $this->{m_FileHandle}=IO::File->new(\$this->{m_MemData}, '<:') or croak "$!"; } else { $this->{m_FileHandle} = IO::File->new($filename) or croak "'$filename': $!"; } if (!(-e $this->{m_IdxFilename}) || ((-M $this->{m_IdxFilename} > -M $filename) && !$args{forceNoUpdateIdx})) { if (!defined $findNextCB) { croak "No index found for '$filename' and no callback provided.\n"; } CreateIndex($this, $findNextCB); }; if (defined $this->{m_MemIndex}) { LoadIndex($this); } my $dummy = {}; tie %$dummy, $class, $this; return $dummy; } #----------------------------------------------------------------------------- # PRE : none # POST: FF_Index object destroyed sub DESTROY { my $this = shift; close $this->{m_FileHandle} if defined $this->{m_FileHandle}; } #----------------------------------------------------------------------------- # PRE : reference to function that scans for the next record in the file # POST: key/position index created & saved to disk sub CreateIndex($$) { my ($this, $findNextCB) = @_; seek($this->{m_FileHandle}, 0, 0); my @index; my %record; my $c=0; while (&$findNextCB($this->{m_FileHandle}, \%record)) { print STDERR "Indexing $c\r" if (++$c % 10000 == 0); if (!defined $record{key} || !defined $record{pos}) { croak "Error in FINDNEXT callback function\n"; } push @index, [$record{key}, $record{pos}]; } open INDEX, ">$this->{m_IdxFilename}" or confess "Error opening '$this->{m_IdxFilename}': $!\n"; print INDEX "#FF_Index.pm index version $IDXVERSION\n"; my $prev; foreach my $record (defined($this->{m_CmpCB}) ? sort {$this->{m_CmpCB}($a->[0], $b->[0])} @index : sort {$a->[0] cmp $b->[0]} @index) { if (defined $prev && $record->[0] eq $prev) { if ($this->{m_Multi}) { print INDEX ",", $record->[1]; } else { confess("Multiple records with key '$prev' and '-multi' ". "option not specified to FF_Index."); } } else { print INDEX "\n" if defined $prev; print INDEX join("\t", @$record); } $prev = $record->[0]; } print INDEX "\n"; close INDEX or confess "$!\n"; } #----------------------------------------------------------------------------- # PRE : index exists on disk # POST: index loaded into memory sub LoadIndex($) { my ($this) = @_; $this->{m_MemIndex} = defined($this->{m_CmpCB}) ? [] : {}; open INDEX, $this->{m_IdxFilename} or confess "'$this->{m_IdxFilename}': $!"; while () { next if /^#/; chomp; my ($key, $pos) = split; if (!defined($this->{m_CmpCB})) { #hash if possible $this->{m_MemIndex}->{$key} = $pos; } else { #array otherwise push @{$this->{m_MemIndex}}, [$key, $pos]; } } close INDEX; } #----------------------------------------------------------------------------- # PRE : memory index # POST: list of keys sub GetKeys($) { my ($this) = @_; croak "FF_Index::GetKeys called without a memory-based index!" if !defined($this->{m_MemIndex}); return keys(%{$this->{m_MemIndex}}); } #----------------------------------------------------------------------------- # PRE : two scalars # POST: the larger of the two sub max($$) { return ($_[0] > $_[1]) ? $_[0] : $_[1]; } #----------------------------------------------------------------------------- # PRE : open filehandle # POST: seeked to beginning of current line sub SeekLineBegin($) { my ($fh) = @_; my ($buff, $pos); do { my $prevPos = tell($fh); my $newPos = max(0, $prevPos - 50); seek($fh, $newPos, 0); read($fh, $buff, $prevPos - $newPos); $pos = rindex($buff, "\n"); seek($fh, $newPos, 0); } until ($pos >= 0 || tell($fh) == 0); seek(INDEX, tell($fh)+$pos+1, 0); } #----------------------------------------------------------------------------- # PRE : key # POST: location from index on disk sub UseDiskIndex($$) { my ($this, $target) = @_; my $fwd = 0; my $rev = (-s $this->{m_IdxFilename}) - 1; open INDEX, $this->{m_IdxFilename} or confess "'$this->{m_IdxFilename}': $!"; while ($fwd <= $rev) { my $mid = int(($fwd + $rev) / 2); seek(INDEX, $mid,0); SeekLineBegin(\*INDEX); my $line = ; #print STDERR "$mid\t$line"; chomp $line; my ($key, $pos) = split /\t/, $line; my $res = (defined $this->{m_CmpCB} ? &{$this->{m_CmpCB}}($target, $key) : $target cmp $key); if ($res < 0) { if ($rev == $mid) { #$fwd -= 2; #print STDERR "yep--$target\n" } $rev = $mid-1;#tell(INDEX) - length($line); } elsif ($res > 0) { if ($fwd == $mid) { #print STDERR "hmmm--$target\n"; } $fwd = $mid+1;#tell(INDEX) - length($line); } else { close INDEX; return $pos; } } close INDEX; return undef; } #----------------------------------------------------------------------------- # PRE : key # POST: location from index in memory sub UseMemIndex($$) { my ($this, $target) = @_; if (!defined $this->{m_CmpCB}) { #hash if possible; otherwise binary search return $this->{m_MemIndex}->{$target}; } my $fwd = 0; my $rev = @{$this->{m_MemIndex}}-1; while ($fwd <= $rev) { my $mid = int(($fwd + $rev) / 2); my $entry = ${$this->{m_MemIndex}}[$mid]; my $res = &{$this->{m_CmpCB}}($target, $entry->[0]); if ($res < 0) { $rev = $mid-1; } elsif ($res > 0) { $fwd = $mid+1; } else { return $entry->[1]; } } return undef; } #----------------------------------------------------------------------------- # PRE : none # POST: filehandle open on data sub CreateFH($) { my ($this) = @_; if ($this->{m_MemData}) { return IO::File->new(\$this->{m_MemData}, '<:') or croak "$!"; } else { return IO::File->new($this->{m_Filename}) or croak "'$this->{m_Filename}': $!"; } } #----------------------------------------------------------------------------- # PRE : key # POST: filehandle pointing to this record (undef if doesn't exist) sub GetRecord($$) { my ($this, $target) = @_; croak "GetRecord called with undefined target!" if !defined $target; my $pos = (!defined $this->{m_MemIndex}) ? $this->UseDiskIndex($target) : $this->UseMemIndex($target); if (!defined $pos) { return undef; } if ($this->{m_ReturnPos}) { return $this->{m_Multi} ? [split(/,/, $pos)] : $pos; } # if not m_ReturnPos, then return an open filehandle pointing to this pos if (!defined fileno($this->{m_FileHandle})) { $this->{m_FileHandle} = $this->CreateFH(); # if ($this->{m_MemData}) { # $this->{m_FileHandle}=IO::File->new(\$this->{m_MemData}, '<:') # or croak "$!"; # } else { # $this->{m_FileHandle} = IO::File->new($this->{m_Filename}) or # croak "'$this->{m_Filename}': $!"; # } } my @pos = split(/,/, $pos); seek($this->{m_FileHandle}, shift(@pos), 0) or confess "$!"; if (!$this->{m_Multi}) { return $this->{m_FileHandle}; } else { my $fh = [$this->{m_FileHandle}]; foreach my $p (@pos) { push @$fh, $this->CreateFH(); seek($fh->[-1], $p, 0) or die $!; } return $fh; } } #----------------------------------------------------------------------------- # Tied methods sub TIEHASH { my ($class, $this) = @_; return bless $this, $class; } sub FETCH { my ($this, $target) = @_; return $this->GetRecord($target); } #perl wants modules to return "TRUE" 1;