wordpos/tools/fastIndex.js

167 lines
4.2 KiB
JavaScript
Raw Normal View History

2012-05-20 18:29:10 +00:00
/**
* fastIndex.js
*
* override natural.WordNet's IndexFile to use fast index data
*
*/
var _ = require('underscore')._,
util = require('util'),
path = require('path'),
fs = require('fs'),
KEY_LENGTH = 3;
/**
 * Load the fast index bucket data for one index file.
 *
 * The data is read synchronously from `<dir>/fast-<name>.json`.
 *
 * @param {string} dir - directory that holds the fast index JSON files
 * @param {string} name - index name used to build the file name
 * @returns {Object|null} the parsed JSON, or null when the file cannot be
 *   read or parsed (the error is reported on stderr)
 */
function loadFastIndex(dir, name) {
  var jsonFile = path.join(dir, 'fast-' + name + '.json');

  try {
    var contents = fs.readFileSync(jsonFile, 'utf8');
    return JSON.parse(contents);
  } catch (e) {
    // a missing or corrupt file is not fatal: the caller falls back on null
    console.error('Error with fast index file %s\n ', jsonFile, e);
    return null;
  }
}
/**
 * Read the slice of the main index file that belongs to one fast-index key.
 *
 * `index.fastIndex.offsets[key]` is a pair `[offset, nextKey]`; the slice runs
 * from this key's offset up to the next key's offset, minus the final byte
 * (presumably the newline that ends the bucket's last line).
 *
 * @param {string} key - bucket key; must exist in `index.fastIndex.offsets`
 * @param {Object} index - carries `fastIndex` (bucket data) and `fd` (open
 *   file descriptor of the index file)
 * @param {function(Buffer)} callback - invoked once the read has completed,
 *   with exactly the bytes that were read; receives an empty buffer when the
 *   read fails so that callers waiting on the result are never left hanging
 */
function readIndexForKey(key, index, callback) {
  var data = index.fastIndex,
      offset = data.offsets[key][0],
      nextKey = data.offsets[key][1],
      nextOffset = data.offsets[nextKey][0],
      len = nextOffset - offset - 1,
      // Buffer.alloc replaces the deprecated constructor; keep the old form
      // only as a fallback for runtimes that predate it
      buffer = typeof Buffer.alloc === 'function' ? Buffer.alloc(len) : new Buffer(len);

  fs.read(index.fd, buffer, 0, len, offset, function (err, count) {
    if (err) {
      console.error(err);
      // still answer the caller: an empty slice yields no matching lines
      return callback(buffer.slice(0, 0));
    }
    // a short read must not expose the unread tail of the buffer
    callback(count < len ? buffer.slice(0, count) : buffer);
  });
}
/**
 * Fast-index lookup, meant to be called with an index object as `this`
 * (it reads `fastIndex`, `cache`, `fd`, `filePath` and `refcount` from it).
 *
 * The first KEY_LENGTH characters of `search` select a bucket; the bucket's
 * lines are read from the index file and binary-searched for `search`.
 * Concurrent lookups for the same bucket share a single file read.
 *
 * @param {string} search - the word to look up
 * @param {function(Object)} callback - receives `{status: 'miss'}` or
 *   `{status: 'hit', key, line, tokens}`
 */
function find(search, callback) {
  var self = this,
      data = this.fastIndex,
      readCallbacks = this.cache,
      miss = {status: 'miss'},
      args = [search, callback];

  var key = search.slice(0, KEY_LENGTH);
  if (!(key in data.offsets)) return callback(miss);

  // queue up if already reading file for this key
  if (key in readCallbacks) {
    readCallbacks[key].push(args);
    return;
  }
  readCallbacks[key] = [args];

  if (!this.fd) {
    this.fd = fs.openSync(this.filePath, 'r');
  }

  // ref count so we know when to close the main index file
  ++this.refcount;

  readIndexForKey(key, this, function (buffer) {
    var lines = buffer.toString().split('\n'),
        keys = lines.map(function (line) {
          return line.substring(0, line.indexOf(' '));
        }),
        waiting = readCallbacks[key];

    // Detach the queue BEFORE dispatching: a callback that looks up another
    // word of the same bucket must start a fresh read instead of being pushed
    // onto a list that is already being iterated and then thrown away.
    delete readCallbacks[key];
    waiting.forEach(test);

    if (--self.refcount === 0) {
      var fd = self.fd;
      self.fd = null;
      // fs.close without a callback throws on several Node versions
      fs.close(fd, function (err) {
        if (err) console.error(err);
      });
    }

    // resolve one queued [search, callback] pair against the bucket's lines
    function test(item) {
      var word = item[0],
          done = item[1],
          ind = _.indexOf(keys, word, /*isSorted*/ true); // binary search!
      if (ind === -1) return done(miss);

      var tokens = lines[ind].split(/\s+/);
      done({status: 'hit', key: tokens[0], 'line': lines[ind], tokens: tokens});
    }
  });
}
/**
 * Alternative lookup that delegates the search to `this._findAt`, restricted
 * to the bucket selected by the first KEY_LENGTH characters of `search`.
 *
 * NOTE(review): nothing in this file references this function; it looks like
 * an earlier experiment kept next to `find` — confirm before relying on it.
 *
 * @param {string} search - the word to look up
 * @param {function(Object)} callback - receives `{status: 'miss'}` when the
 *   bucket is unknown, otherwise whatever `_findAt` reports
 */
function find____(search, callback) {
  var self = this,
      data = this.fastIndex,
      miss = {status: 'miss'};

  var key = search.slice(0, KEY_LENGTH);
  if (!(key in data.offsets)) return callback(miss);

  if (!this.fd) {
    this.fd = fs.openSync(this.filePath, 'r');
  }

  // ref count so we know when to close the main index file
  ++this.refcount;

  var offset = data.offsets[key][0],
      nextKey = data.offsets[key][1],
      nextOffset = data.offsets[nextKey][0],
      len = nextOffset - offset - 1,
      pos = Math.ceil(len / 2);

  // call base class's _findAt to search only relevant portion
  this._findAt(this.fd,   // fd
    offset + len,         // size (more like 'end' of buffer)
    offset + pos,         // pos
    null,                 // lastPos
    pos,                  // adjustment
    search,               // key
    done);                // callback

  function done(result) {
    if (--self.refcount === 0) {
      var fd = self.fd;
      self.fd = null;
      // fs.close without a callback throws on several Node versions
      fs.close(fd, function (err) {
        if (err) console.error(err);
      });
    }
    callback(result);
  }
}
// cache of fast index data across instances of WordPOS class
var cache = {};
module.exports = {
find: function(index){
var key = index.filePath,
data;
if (!(key in cache)) {
data = loadFastIndex(index.dataDir, index.fileName);
cache[key] = data;
}
// if no fast index data was found or was corrupt, use original find
if (!cache[key]) return index.find;
index.fastIndex = cache[key];
index.refcount = 0;
index.cache = {};
return find;
}
};