added fastIndex feature. v0.1.4

This commit is contained in:
moos 2012-05-20 11:29:10 -07:00
parent a8ae4c3f13
commit 6652265ef0
6 changed files with 762 additions and 7 deletions

View File

@ -197,7 +197,12 @@ WordPOS.defaults = {
/** /**
* enable profiling, time in msec returned as second argument in callback * enable profiling, time in msec returned as second argument in callback
*/ */
profile: false profile: false,
/**
* use fast index if available
*/
fastIndex: true
}; };
``` ```
To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive a second argument that is the execution time in msec of the call. To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive a second argument that is the execution time in msec of the call.
@ -208,6 +213,8 @@ To override, pass an options hash to the constructor. With the `profile` option,
// true 29 // true 29
``` ```
Version 0.1.4 introduces the `fastIndex` option. This uses a secondary index on the index files and is much faster. It is on by default. Secondary index files are generated at install time and placed in the same directory as WNdb.path. Details can be found in tools/stat.js.
Benchmark Benchmark
---------- ----------
@ -225,7 +232,7 @@ Single word lookup:
getAdverbs : 137 ops/s { iterations: 10, elapsed: 73 } getAdverbs : 137 ops/s { iterations: 10, elapsed: 73 }
``` ```
128-word lookup: 128-word lookup (orig) :
``` ```
getPOS : 0 ops/s { iterations: 1, elapsed: 2210 } getPOS : 0 ops/s { iterations: 1, elapsed: 2210 }
getNouns : 2 ops/s { iterations: 1, elapsed: 666 } getNouns : 2 ops/s { iterations: 1, elapsed: 666 }
@ -234,9 +241,17 @@ Single word lookup:
getAdverbs : 2 ops/s { iterations: 1, elapsed: 407 } getAdverbs : 2 ops/s { iterations: 1, elapsed: 407 }
``` ```
128-word lookup (fastIndex) :
```
getPOS : 36 ops/s { iterations: 1, elapsed: 28 }
getNouns : 125 ops/s { iterations: 1, elapsed: 8 }
getVerbs : 500 ops/s { iterations: 1, elapsed: 2 }
getAdjectives : 500 ops/s { iterations: 1, elapsed: 2 }
getAdverbs : 1000 ops/s { iterations: 1, elapsed: 1 }
```
On a win7/64-bit/dual-core/3GHz. getPOS() is slowest as it searches through all four index files. On a win7/64-bit/dual-core/3GHz. getPOS() is slowest as it searches through all four index files.
There is probably room for optimization in the underlying library.
License License
------- -------

View File

@ -3,7 +3,7 @@
"author": "Moos <mooster@42at.com>", "author": "Moos <mooster@42at.com>",
"keywords": ["natural", "language", "wordnet", "pos"], "keywords": ["natural", "language", "wordnet", "pos"],
"description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.", "description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.",
"version": "0.1.3", "version": "0.1.4",
"homepage": "https://github.com/moos/wordpos", "homepage": "https://github.com/moos/wordpos",
"engines": { "engines": {
"node": ">=0.4.10" "node": ">=0.4.10"
@ -20,5 +20,8 @@
"type" : "git", "type" : "git",
"url" : "git://github.com/moos/wordpos.git" "url" : "git://github.com/moos/wordpos.git"
}, },
"main": "./wordpos.js" "main": "./wordpos.js",
"scripts": {
"postinstall": "node tools/stat.js --no-stats index.adv index.adj index.verb index.noun"
}
} }

402
tools/buffered-reader.js Normal file
View File

@ -0,0 +1,402 @@
/**
* @name BufferedReader.
* @description Fully configurable buffered reader for node.js.
*
* @author Gabriel Llamas
* @created 10/04/2012
* @modified 01/05/2012
* @version 0.2.0
*
* Forked: https://github.com/moos/Node-BufferedReader
*/
"use strict";

// Node built-ins used throughout the reader.
var EVENTS = require ("events");
var FS = require ("fs");

// Default internal buffer size (16 KiB) when settings.bufferSize is not given.
var BUFFER_SIZE = 16384;

// Validation error messages thrown by the constructor and seek().
var INVALID_BUFFER_SIZE = "The buffer size must be greater than 0.";
var INVALID_START_OFFSET = "The start offset must be greater than or equals to 0.";
var INVALID_END_OFFSET = "The end offset must be greater than or equals to 0.";
var INVALID_RANGE_OFFSET = "The end offset must be greater than or equals to the start offset.";
var INVALID_BYTES_RANGE_ERROR = "The number of bytes to read must be greater than 0.";
var INVALID_SEEK_OFFSET = "The offset must be greater than or equals to 0.";
var NO_FILE_ERROR = "The source is not a file.";
/**
 * BufferedReader — an EventEmitter over a file that supports both streaming
 * reads (read()) and random-access buffered reads (readBytes/seek/skip).
 *
 * @param {String} fileName  path of the file to read
 * @param {Object} [settings]  optional:
 *   bufferSize {Number}  internal buffer size in bytes (default 16384; must be > 0)
 *   encoding   {String}  stream encoding; null means raw Buffers
 *   start      {Number}  first byte offset to read (default 0)
 *   end        {Number}  last byte offset (inclusive); defaults to EOF (set in _init)
 * @throws {Error} when bufferSize/start/end are invalid
 */
var BufferedReader = function (fileName, settings){
	EVENTS.EventEmitter.call (this);

	settings = settings || {};
	// A bufferSize of exactly 0 is coerced to -1 so it fails the "< 1" check
	// below instead of silently falling back to the default via `|| BUFFER_SIZE`.
	if (settings.bufferSize === 0) settings.bufferSize = -1;
	this._settings = {
		bufferSize: settings.bufferSize || BUFFER_SIZE,
		encoding: settings.encoding || null,
		start: settings.start || 0,
		end: settings.end              // may be undefined; clamped to file size in _init()
	};

	if (this._settings.bufferSize < 1) throw new Error (INVALID_BUFFER_SIZE);
	if (this._settings.start < 0) throw new Error (INVALID_START_OFFSET);
	// NOTE: when end is undefined these comparisons are false, so no throw — by design.
	if (this._settings.end < 0) throw new Error (INVALID_END_OFFSET);
	if (this._settings.end < this._settings.start) throw new Error (INVALID_RANGE_OFFSET);

	this._fileName = fileName;
	this._fd = null;                       // fd opened lazily by readBytes()
	this._buffer = null;                   // internal read buffer
	this._fileOffset = this._settings.start;   // next file position to read a buffer from
	this._bufferOffset = 0;                // read position inside the internal buffer
	this._dataOffset = 0;                  // write position inside the caller's result buffer
	this._realOffset = this._settings.start;   // logical read cursor, in file coordinates
	this._fileSize = null;                 // set by _init()
	this._initialized = false;             // _init() run yet?
	this._interrupted = false;             // set by interrupt()
	this._isEOF = false;                   // logical end (past settings.end) reached
	this._noMoreBuffers = false;           // physical end of file reached by _read()
	this._needRead = false;                // internal buffer exhausted; refill before next copy
};
// Inherit EventEmitter: read() emits "character", "byte", "line", "buffer",
// "end" and "error" events.
BufferedReader.prototype = Object.create (EVENTS.EventEmitter.prototype);
BufferedReader.prototype.constructor = BufferedReader;

// Ask an in-progress read() to stop; takes effect at the next data chunk,
// after which "end" is emitted.
BufferedReader.prototype.interrupt = function (){
	this._interrupted = true;
};
/**
 * Streams the whole file (honoring settings.start/end/encoding), emitting:
 *   "character" (only when an encoding is set)  — one char at a time; "\r" is
 *                normalized to "\n"
 *   "byte"      (only when NO encoding is set)  — one byte at a time
 *   "line"      (only when an encoding is set)  — (line, byteOffsetOfLine, lineNumber);
 *                handles "\n", "\r" and "\r\n" terminators
 *   "buffer"    — every raw chunk with its byte offset
 *   "end" / "error"
 *
 * The per-item loop only runs if at least one character/line/byte listener
 * is registered; otherwise only "buffer" events are emitted.
 */
BufferedReader.prototype.read = function (){
	var stream = FS.createReadStream (this._fileName, this._settings);
	var lastChunk;          // partial line carried over between data chunks
	var buffer;
	var me = this;
	var lineOffset = 0,     // byte offset of the start of the current line
		lineCount = 0,
		byteOffset = 0;     // byte offset of the start of the current chunk

	// Snapshot listener presence once; drives whether the char loop runs at all.
	var onChar = this.listeners ("character").length !== 0,
		onLine = this.listeners ("line").length !== 0,
		onByte = this.listeners ("byte").length !== 0,
		loop = onChar || onLine || onByte;

	stream.on ("data", function (data){
		buffer = data;
		var offset = 0;     // start (within this chunk) of the current line
		var chunk;
		var character;
		var len = data.length;

		if (loop){
			for (var i=0; i<len; i++){
				if (me._interrupted) break;
				character = data[i];
				if (stream.encoding){
					// Normalize CR to LF for the character event.
					onChar && me.emit ("character", character === "\r" ? "\n" : character, byteOffset + i);
				}else{
					// Raw mode: bytes only — no line splitting possible.
					onByte && me.emit ("byte", character, byteOffset + i);
					continue;
				}
				if (!onLine) continue;
				if (character === "\n" || character === "\r"){
					chunk = data.slice (offset, i);
					if (lastChunk){
						// Prepend the partial line left over from the previous chunk.
						chunk = lastChunk.concat (chunk);
					}
					// Swallow the LF of a CRLF pair so it doesn't emit an empty line.
					if (i + 1 !== len && character === "\r" && data[i + 1] === "\n"){
						i++;
					}
					me.emit ("line", chunk, lineOffset + offset, ++lineCount);
					offset = i + 1;
					if (lastChunk){
						lineOffset += lastChunk.length;
						lastChunk = null;
					}
				}
			}
			// Stash any unterminated tail of this chunk for the next one.
			if (stream.encoding && offset !== len){
				var s = offset === 0 ? data : data.slice (offset);
				lastChunk = lastChunk ? lastChunk.concat (s) : s;
			}
			lineOffset += offset;
		}

		me.emit ("buffer", data, byteOffset);

		if (me._interrupted){
			me._interrupted = false;
			stream.destroy ();
			me.emit ("end");
		}
		byteOffset += len;
	});
	stream.on ("end", function (){
		me._interrupted = false;
		// Flush the final line if the file didn't end with a terminator.
		if (loop && lastChunk){
			me.emit ("line", lastChunk);
		}
		me.emit ("end");
	});
	stream.on ("error", function (error){
		me._interrupted = false;
		me.emit ("error", error);
	});
};
/**
 * Lazy one-time initialization for random-access reads: stats the file,
 * records its size, clamps settings.end to the last valid offset, and
 * flags EOF immediately when settings.start is past the end of the file.
 *
 * @param {Function} cb - cb(error); error is NO_FILE_ERROR for non-files
 */
BufferedReader.prototype._init = function (cb){
	var me = this;
	FS.stat (this._fileName, function (error, stats){
		if (error) return cb (error);
		if (stats.isFile ()){
			if (me._settings.start >= stats.size){
				me._isEOF = true;
				return cb (null);
			}
			// end was left undefined (but an explicit 0 is respected): read to EOF.
			if (!me._settings.end && me._settings.end !== 0){
				me._settings.end = stats.size;
			}
			// Clamp end to the last valid byte offset.
			if (me._settings.end >= stats.size){
				me._settings.end = stats.size - 1;
			}
			me._fileSize = stats.size;
			cb (null);
		}else{
			cb (new Error (NO_FILE_ERROR));
		}
	});
};
/**
 * Refills the internal buffer with the next chunk starting at the current
 * file offset. Sets _noMoreBuffers once the physical end of the file is
 * reached, and trims the buffer after a short read so its length always
 * reflects valid data.
 *
 * @param {Function} cb - cb(error)
 */
BufferedReader.prototype._read = function (cb){
	var self = this;
	var chunkSize = this._settings.bufferSize;
	FS.read (this._fd, this._buffer, 0, chunkSize, this._fileOffset, function (error, bytesRead){
		if (error) return cb (error);
		self._fileOffset += bytesRead;
		// Physical EOF reached?
		if (self._fileOffset === self._fileSize) self._noMoreBuffers = true;
		// Short read: shrink the buffer to the bytes actually read.
		if (bytesRead < chunkSize) self._buffer = self._buffer.slice (0, bytesRead);
		cb (null);
	});
};
/**
 * Internal workhorse for readBytes(): copies `bytes` bytes starting at the
 * logical cursor (_realOffset) into a freshly allocated Buffer, refilling the
 * internal buffer from disk as needed. The request is first clamped to
 * settings.end. On success cb(null, data, byteCount); a short final read
 * returns a sliced buffer with byteCount < bytes; on error cb(error, null, -1).
 *
 * NOTE(review): the statement order here is load-bearing (offset bookkeeping
 * interleaves with async refills) — treat with care.
 *
 * @param {Number} bytes - number of bytes requested
 * @param {Function} cb - cb(error, buffer, bytesRead)
 */
BufferedReader.prototype._readBytes = function (bytes, cb){
	// Buffer was fully consumed on a previous call: refill, then retry.
	if (this._needRead){
		this._needRead = false;
		var me = this;
		this._read (function (error){
			if (error) return cb (error, null, -1);
			me._readBytes (bytes, cb);
		});
		return;
	}

	// fill(): multi-buffer copy loop. Copies as much as is available from the
	// internal buffer into `data`, refilling from disk until either the request
	// is satisfied or the file runs out.
	var fill = function (){
		var endData = bytes - me._dataOffset;                 // bytes still wanted
		var endBuffer = me._buffer.length - me._bufferOffset; // bytes available
		var end = endBuffer <= endData ? endBuffer : endData; // copy the smaller

		me._buffer.copy (data, me._dataOffset, me._bufferOffset, me._bufferOffset + end);
		me._bufferOffset += end;
		me._realOffset += end;

		// Internal buffer exhausted: next call must refill first.
		if (me._bufferOffset === me._buffer.length){
			me._bufferOffset = 0;
			me._needRead = true;
		}

		me._dataOffset += end;
		if (me._dataOffset === bytes){
			// Request fully satisfied.
			me._dataOffset = 0;
			me._isEOF = me._noMoreBuffers;
			cb (null, data, bytes);
		}else{
			if (me._noMoreBuffers){
				// File exhausted mid-request: return the partial result.
				me._isEOF = true;
				end = me._dataOffset;
				me._dataOffset = 0;
				cb (null, data.slice (0, end), end);
			}else{
				me._needRead = false;
				me._read (function (error){
					if (error) return cb (error, null, -1);
					fill ();
				});
			}
		}
	};

	var me = this;
	// Clamp the request so we never read past settings.end (inclusive).
	var max = me._settings.end - me._realOffset + 1;
	bytes = max < bytes ? max : bytes;
	if (bytes === 0) return cb (null, null, 0);
	var data = new Buffer (bytes);
	var len = me._buffer.length;

	if (bytes <= len){
		// Fast path: the request fits within one internal buffer's worth.
		var end = me._bufferOffset + bytes;
		if (end <= len){
			// Entirely satisfiable from data already buffered.
			me._buffer.copy (data, 0, me._bufferOffset, end);
			me._bufferOffset = end;
			me._realOffset += bytes;
			cb (null, data, bytes);
		}else{
			// Straddles the buffer boundary: copy the tail, refill, copy the rest.
			var last = len - me._bufferOffset;
			me._realOffset += last;
			if (last !== 0){
				me._buffer.copy (data, 0, me._bufferOffset, me._bufferOffset + last);
			}
			if (me._noMoreBuffers){
				me._isEOF = true;
				return cb (null, data.slice (0, last), last);
			}
			me._read (function (error){
				if (error) return cb (error, null, -1);
				len = me._buffer.length;
				var remaining = bytes - last;
				if (len <= remaining){
					// Refill couldn't cover the remainder: short (final) result.
					me._realOffset += len;
					me._isEOF = true;
					me._buffer.copy (data, last, 0, len);
					var lastChunk = last + len;
					cb (null, data.slice (0, lastChunk), lastChunk);
				}else{
					me._realOffset += remaining;
					me._bufferOffset = remaining;
					me._buffer.copy (data, last, 0, me._bufferOffset);
					cb (null, data, bytes);
				}
			});
		}
	}else{
		// Request larger than the internal buffer: iterate with fill().
		fill ();
	}
};
/**
 * Closes the underlying file descriptor (if any) and drops the internal
 * buffer. Safe to call when nothing is open.
 *
 * @param {Function} [cb] - cb(error), invoked with `this` bound to the reader
 */
BufferedReader.prototype.close = function (cb){
	var self = this;
	if (cb) cb = cb.bind (this);
	if (!this._fd){
		// Nothing open — report success when a callback was supplied.
		cb && cb (null);
		return;
	}
	FS.close (this._fd, function (error){
		self._fd = null;
		self._buffer = null;
		cb && cb (error);
	});
};
/**
 * Public random-access read: delivers up to `bytes` bytes from the current
 * logical position. Lazily runs _init() on first use and opens the file
 * descriptor on demand. cb(error, buffer, bytesRead) with `this` bound to
 * the reader; (null, null, 0) signals EOF or a non-positive request.
 *
 * @param {Number} bytes - number of bytes to read (must be >= 1)
 * @param {Function} cb - cb(error, buffer, bytesRead)
 */
BufferedReader.prototype.readBytes = function (bytes, cb){
	cb = cb.bind (this);
	if (bytes < 1 || this._isEOF) return cb (null, null, 0);

	// Opens the fd, allocates the internal buffer, primes it with the first
	// chunk, then performs the actual read.
	var open = function (){
		if (me._isEOF) return cb (null, null, 0);
		FS.open (me._fileName, "r", function (error, fd){
			if (error) return cb (error, null, -1);
			me._fd = fd;
			me._buffer = new Buffer (me._settings.bufferSize);
			me._read (function (error){
				if (error) return cb (error, null, -1);
				me._readBytes (bytes, cb);
			});
		});
	};

	var me = this;
	if (!this._initialized){
		this._init (function (error){
			if (error) return cb (error, null);
			me._initialized = true;
			open ();
		});
	}else{
		if (!this._fd) return open ();
		this._readBytes (bytes, cb);
	}
};
/**
 * Moves the logical read cursor to `offset`, measured relative to
 * settings.start. Seeking past settings.end flags EOF. When the target
 * still lies inside the currently buffered window, only the buffer offset
 * moves; otherwise the buffer is invalidated and the next read refills
 * from the new position.
 *
 * @param {Number} offset - target position relative to settings.start (>= 0)
 * @param {Function} cb - cb(error), `this` bound to the reader
 */
BufferedReader.prototype.seek = function (offset, cb){
	cb = cb.bind (this);
	if (offset < 0) return cb (new Error (INVALID_SEEK_OFFSET));

	var seek = function (){
		// Convert to absolute file coordinates.
		offset += me._settings.start;
		if (offset >= me._settings.end + 1){
			me._isEOF = true;
		}else{
			me._isEOF = false;
			// Absolute offset of the first byte currently held in the buffer.
			var start = me._fileOffset - (me._buffer ? me._buffer.length : 0);
			if (offset >= start && offset < me._fileOffset){
				// Target is inside the buffered window: cheap in-buffer seek.
				me._bufferOffset = offset - start;
				me._realOffset = offset;
			}else{
				// Outside the window: force a refill from the new position
				// (only if a fd is already open — otherwise open() will read).
				me._needRead = me._fd ? true : false;
				me._noMoreBuffers = false;
				me._fileOffset = offset;
				me._bufferOffset = 0;
				me._realOffset = offset;
			}
		}
		cb (null);
	};

	var me = this;
	if (!this._initialized){
		this._init (function (error){
			if (error) return cb (error, null);
			me._initialized = true;
			seek ();
		});
	}else{
		seek ();
	}
};
/**
 * Advances the logical cursor by up to `bytes` bytes (clamped to
 * settings.end) by delegating to seek(). cb(error, skippedBytes) with
 * `this` bound to the reader; (null, 0) when already at EOF or the
 * request is non-positive.
 *
 * @param {Number} bytes - number of bytes to skip (must be >= 1)
 * @param {Function} cb - cb(error, skippedBytes)
 */
BufferedReader.prototype.skip = function (bytes, cb){
	cb = cb.bind (this);
	if (bytes < 1 || this._isEOF) return cb (null, 0);

	var skip = function (){
		// Clamp so we never skip past the configured end offset.
		var remaining = me._settings.end - me._realOffset + 1;
		bytes = bytes <= remaining ? bytes : remaining;
		me.seek (me._realOffset - me._settings.start + bytes, function (){
			cb (null, bytes);
		});
	};

	var me = this;
	if (!this._initialized){
		this._init (function (error){
			if (error) return cb (error, null);
			me._initialized = true;
			skip ();
		});
	}else{
		skip ();
	}
};

module.exports = BufferedReader;

166
tools/fastIndex.js Normal file
View File

@ -0,0 +1,166 @@
/**
* fastIndex.js
*
* override natural.WordNet's IndexFile to use fast index data
*
*/
var _ = require('underscore')._,
util = require('util'),
path = require('path'),
fs = require('fs'),
KEY_LENGTH = 3;
// load fast index bucket data
// Loads the pre-computed fast-index bucket data ('fast-<name>.json', written
// by tools/stat.js) from `dir`. Returns the parsed object, or null when the
// file is missing or unparseable (error is logged, not thrown).
function loadFastIndex(dir, name) {
  var file = path.join(dir, 'fast-' + name + '.json');
  var parsed = null;
  try {
    var raw = fs.readFileSync(file, 'utf8');
    parsed = JSON.parse(raw);
    //console.log('loaded %d buckets for %s', parsed.stats.buckets, parsed.name);
  } catch (err) {
    // Absent or corrupt fast-index data: report and let the caller fall back.
    console.error('Error with fast index file %s\n ', file, err);
  }
  return parsed;
}
// Reads the raw bucket for `key` from the open index file. The bucket spans
// from this key's byte offset up to (but excluding) the next key's offset,
// minus the trailing newline. Invokes callback(buffer) with the raw bytes.
function readIndexForKey(key, index, callback) {
  var offsets = index.fastIndex.offsets;
  var start = offsets[key][0];
  var nextKey = offsets[key][1];          // key of the following bucket
  var length = offsets[nextKey][0] - start - 1;
  var chunk = new Buffer(length);
  fs.read(index.fd, chunk, 0, length, start, function (err, bytesRead) {
    // NOTE(review): on error the callback never fires, only a log — confirm
    // callers (find's queued lookups) tolerate this.
    if (err) return console.log(err);
    //console.log(' read %d bytes for <%s>', bytesRead, key);
    callback(chunk);
  });
}
/**
 * Fast replacement for IndexFile.find(): resolves `search` via the
 * pre-computed 3-char bucket offsets in this.fastIndex instead of scanning
 * the whole index file. Installed onto an IndexFile instance, so `this` is
 * that instance.
 *
 * Concurrent lookups hitting the same bucket are queued in this.cache so the
 * bucket is read from disk only once; a refcount closes the shared fd once
 * all in-flight buckets have been answered.
 *
 * @param {String} search - word to look up
 * @param {Function} callback - receives {status:'miss'} or
 *   {status:'hit', key, line, tokens}
 */
function find(search, callback) {
  var self = this,
    data = this.fastIndex,
    readCallbacks = this.cache,
    miss = {status: 'miss'},
    args = [search, callback];

  var key = search.slice(0, KEY_LENGTH);
  // No bucket for this prefix => the word cannot be in the index.
  if (!(key in data.offsets)) return callback(miss);

  // queue up if already reading file for this key
  if (key in readCallbacks){
    readCallbacks[key].push(args);
    return;
  }
  readCallbacks[key] = [args];

  if (!this.fd) {
    //console.log(' ... opening', this.filePath);
    this.fd = fs.openSync(this.filePath, 'r');
  }

  // ref count so we know when to close the main index file
  ++this.refcount;

  readIndexForKey(key, this, function (buffer){
    // Split the bucket into lines and extract each line's head word.
    var lines = buffer.toString().split('\n'),
      keys = lines.map(function(line){
        return line.substring(0,line.indexOf(' '));
      });

    // Answer every lookup queued for this bucket, then release the queue.
    readCallbacks[key].forEach( test );
    delete readCallbacks[key];

    if (--self.refcount == 0) {
      //console.log(' ... closing', self.filePath);
      // NOTE(review): fs.close called without a callback — verify this is
      // accepted on the target Node version.
      fs.close(self.fd);
      self.fd = null;
    }

    // Resolve one queued (search, callback) pair against the bucket's lines.
    function test(item) {
      var search = item[0],
        callback = item[1],
        ind = _.indexOf(keys, search, /*isSorted*/ true); // binary search!

      //console.log(' %s is %d', search, ind);
      if (ind == -1) return callback(miss);

      var tokens = lines[ind].split(/\s+/),
        key = tokens[0],
        result = {status: 'hit', key: key, 'line': lines[ind], tokens: tokens};

      callback(result);
    }
  });
}
/**
 * Alternative fast-find variant — presumably experimental/shelved (note the
 * trailing underscores; nothing in this file references it). Instead of
 * reading the bucket and binary-searching it locally, it delegates to the
 * base class's _findAt() over just the bucket's byte range.
 *
 * NOTE(review): relies on an IndexFile._findAt(fd, size, pos, lastPos,
 * adjustment, key, cb) method not visible here — confirm against
 * natural.WordNet before reviving this path. The console.log below is
 * leftover debug output.
 */
function find____(search, callback) {
  // console.log(' >> ', search, this.fileName, this.fd);
  var self = this,
    data = this.fastIndex,
    miss = {status: 'miss'};

  var key = search.slice(0, KEY_LENGTH);
  // No bucket for this prefix => definite miss.
  if (!(key in data.offsets)) return callback(miss);

  if (!this.fd) {
    // console.log(' ... opening', this.filePath);
    this.fd = fs.openSync(this.filePath, 'r');
  }

  // ref count so we know when to close the main index file
  ++this.refcount;

  var offset = data.offsets[key][0],
    nextKey = data.offsets[key][1],
    nextOffset = data.offsets[nextKey][0],
    len = nextOffset - offset - 1,
    buffer = new Buffer(len),
    pos = Math.ceil(len / 2) - 0;   // start probing at the bucket's midpoint

  console.log('--', offset, len, offset+len, offset+pos);

  // call base class's _findAt to search only relevant portion
  this._findAt(this.fd, // fd
    offset+len * 1, // size (more like 'end' of buffer)
    offset+pos, // pos
    null, // lastPos
    pos * 1, // adjustment
    search, // key
    done); // callback

  function done(result) {
    //console.log(self.refcount, search, result && result.line);
    if (--self.refcount == 0) {
      //console.log(' ... closing', self.filePath);
      fs.close(self.fd);
      self.fd = null;
    }
    callback(result);
  }
}
// cache of fast index data across instances of WordPOS class
var cache = {};
module.exports = {
find: function(index){
var key = index.filePath,
data;
if (!(key in cache)) {
data = loadFastIndex(index.dataDir, index.fileName);
cache[key] = data;
}
// if no fast index data was found or was corrupt, use original find
if (!cache[key]) return index.find;
index.fastIndex = cache[key];
index.refcount = 0;
index.cache = {};
return find;
}
};

149
tools/stat.js Normal file
View File

@ -0,0 +1,149 @@
/**
* generate fast index for WordNet index files
*
* Usage:
* node stat [--no-stats] index.adv ...
*
* --no-stats prevents writing stat data to file
* Fast index is based on buckets keyed off first THREE characters in the index word,
* eg, 'awesome' goes into bucket 'awe'
*
* Format of the fast index:
* {
* "firstKey":".22", // first key value
* "keyLength":3, // #characters in key
* "version":"3.0", // WNdb version
* "name":"index.adj", // index file name
* "stats":{
* "buckets":2326, // # of buckets
* "words":21479, // total # words
* "biggest":310, // #words in biggest bucket
* "avg":"9.23", // average #words per bucket
* "median":3 // median #words per bucket
* },
* "offsets":{
* "100":[2271,"101"], // "100" is the key,
* // value=[byte offset in index file, next key]
* ...
* }
* }
*
* To lookup a word:
*
* find key (first <keyLength> chars of word)
* look it up in <offsets> O(1)
* if it exists
* get offset of key and offset of next key
* read index file between the two offsets
* binary search read data O(log avg)
*/
// Generate the fast index (fast-<name>.json) for each WordNet index file
// given on the command line, plus an optional bucket-size histogram TSV.
// See the file header for the fast-index JSON format and lookup algorithm.
var
  WNdb = require('../wordpos').WNdb,
  util = require('util'),
  BufferedReader = require ("./buffered-reader"),
  _ = require('underscore')._,
  fs = require('fs'),
  path = require('path'),
  KEY_LENGTH = 3,        // bucket key = first 3 chars of the index word
  stats = true,          // write the histogram TSV unless --no-stats is given
  eofKey = '_EOF_';      // sentinel key for the end-of-file offset; should be unique

console.log('DB folder: ', WNdb.path);

if (process.argv.length < 3) {
  // FIX: usage now mentions --no-stats, matching the header and package.json.
  console.log('#Usage:\nnode stat [--no-stats] index.adv ...');
  process.exit(1);
}

_(process.argv.slice(2)).filter(function(arg){
  // Consume the --no-stats flag; it is not an index file name.
  if (arg == '--no-stats') {
    stats = false;
    return false;
  }
  return true;
}).forEach(function(basename){
  var indexFile = path.join(WNdb.path, basename),
    jsonFile = path.join(WNdb.path, 'fast-' + basename + '.json'),
    countFile = 'fast-' + basename + '.tsv',
    endOffset = fs.statSync(indexFile).size,
    buckets = {},      // key -> number of words in that bucket
    lastKey = null,    // previous bucket key, to link "next key" pointers
    offsets = {},      // key -> [byte offset of first word, next key]
    firstKey = null;   // first key seen in the file

  new BufferedReader (indexFile, {encoding: "utf8"})
    .on ("error", function (error){
      console.log ("error: %s", indexFile, error);
    })
    .on ("line", function (line, offset){
      // skip license info (WordNet license lines start with a space)
      if (line[0] == ' ') return;

      var key = line.substring(0, Math.min(line.indexOf(' '), KEY_LENGTH));
      if (firstKey === null) firstKey = key;
      if (key in buckets) {
        ++buckets[key];
        return;
      }
      // First word of a new bucket: record its byte offset, and link the
      // previous bucket to it.
      buckets[key] = 1;
      offsets[key] = [offset];
      if (lastKey !== null) {
        offsets[lastKey].push(key); // current key is the 'next key' for the previous key
      }
      lastKey = key;
    })
    .on ("end", function (){
      // Terminate the chain: the last real bucket points at the EOF sentinel,
      // whose offset is the file size.
      offsets[lastKey].push(eofKey);
      offsets[eofKey] = [endOffset, null];

      var size = _.size(buckets),
        sum = _.reduce(buckets, function(memo, num){ return memo + num; }, 0),
        sorted = _.sortBy(buckets, function(val){ return val }),
        median = sorted[Math.floor(size/2)],
        max = sorted[sorted.length-1],
        maxkey = _.reduce(buckets, function(memo, val, key){ return memo + (val == max ? key : '') }, ''),
        avg = (sum/size).toFixed(2),
        info = util.format('buckets %d, max %d at %s, sum %d, avg %d, median %d', size, max, maxkey, sum, avg, median);

      console.log(basename, info);

      if (stats) {
        // Histogram of bucket sizes in groups of 10 (1-10, 11-20, ...).
        var grouped = _.groupBy(buckets, function(num){ return 1 + 10*(Math.floor((num-1)/10) ) });
        _(grouped).each(function(arr, key, list){
          list[key] = arr.length;
        });
        // FIX: `str` was an implicit global (missing var) — would leak across
        // scope and break under "use strict".
        var str = '';
        _.each(grouped, function(value, key){ str += key+"\t"+value+"\n" });
        fs.writeFileSync(countFile, '#'+info+'\n'
          + '#bucket_size (1-10, 11-20, etc.) \t #buckets\n'
          + str, 'utf8');
      }

      // Offset data — serialized in the format described in the file header.
      var data = {
        firstKey: firstKey,
        keyLength: KEY_LENGTH,
        version: WNdb.version,
        name: basename,
        stats: {
          buckets: size,
          words: sum,
          biggest: max,
          avg: avg,
          median: median
        },
        offsets: offsets
      };
      fs.writeFileSync(jsonFile, JSON.stringify(data), 'utf8');
    })
    .read();
});

View File

@ -4,6 +4,8 @@
* Node.js part-of-speech utilities using natural's WordNet module. * Node.js part-of-speech utilities using natural's WordNet module.
* *
* Copyright (c) 2012 mooster@42at.com * Copyright (c) 2012 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license * Released under MIT license
*/ */
@ -13,7 +15,12 @@ var _ = require('underscore')._,
WordNet = natural.WordNet, WordNet = natural.WordNet,
tokenizer = new natural.WordTokenizer(), tokenizer = new natural.WordTokenizer(),
stopwords = ' '+ natural.stopwords.join(' ') +' ', stopwords = ' '+ natural.stopwords.join(' ') +' ',
WNdb = require('WNdb'); WNdb = require('WNdb'),
fastIndex = null;
try {
fastIndex = require('./tools/fastIndex');
} catch(e) {}
function normalize(word) { function normalize(word) {
return word.toLowerCase().replace(/\s+/g, '_'); return word.toLowerCase().replace(/\s+/g, '_');
@ -95,6 +102,14 @@ var WordPOS = function(options) {
WordPOS.super_.apply(this, arguments); WordPOS.super_.apply(this, arguments);
} }
this.options = _.defaults({}, _.isObject(options) && options || {}, WordPOS.defaults); this.options = _.defaults({}, _.isObject(options) && options || {}, WordPOS.defaults);
if (this.options.fastIndex && fastIndex) {
// override find
this.nounIndex.find = fastIndex.find(this.nounIndex);
this.verbIndex.find = fastIndex.find(this.verbIndex);
this.adjIndex.find = fastIndex.find(this.adjIndex);
this.advIndex.find = fastIndex.find(this.advIndex);
}
}; };
util.inherits(WordPOS, WordNet); util.inherits(WordPOS, WordNet);
@ -102,7 +117,12 @@ WordPOS.defaults = {
/** /**
* enable profiling, time in msec returned as second argument in callback * enable profiling, time in msec returned as second argument in callback
*/ */
profile: false profile: false,
/**
* use fast index if available
*/
fastIndex: true
}; };
var wordposProto = WordPOS.prototype; var wordposProto = WordPOS.prototype;