diff --git a/README.md b/README.md index 6a010bc..ae6697b 100644 --- a/README.md +++ b/README.md @@ -197,7 +197,12 @@ WordPOS.defaults = { /** * enable profiling, time in msec returned as second argument in callback */ - profile: false + profile: false, + + /** + * use fast index if available + */ + fastIndex: true }; ``` To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive a second argument that is the execution time in msec of the call. @@ -208,6 +213,8 @@ To override, pass an options hash to the constructor. With the `profile` option, // true 29 ``` +Version 0.1.4 introduces the `fastIndex` option. This uses a secondary index on the index files and is much faster. It is on by default. Secondary index files are generated at install time and placed in the same directory as WNdb.path. Details can be found in tools/stat.js. + Benchmark ---------- @@ -225,7 +232,7 @@ Single word lookup: getAdverbs : 137 ops/s { iterations: 10, elapsed: 73 } ``` -128-word lookup: +128-word lookup (orig) : ``` getPOS : 0 ops/s { iterations: 1, elapsed: 2210 } getNouns : 2 ops/s { iterations: 1, elapsed: 666 } @@ -234,9 +241,17 @@ Single word lookup: getAdverbs : 2 ops/s { iterations: 1, elapsed: 407 } ``` +128-word lookup (fastIndex) : +``` + getPOS : 36 ops/s { iterations: 1, elapsed: 28 } + getNouns : 125 ops/s { iterations: 1, elapsed: 8 } + getVerbs : 500 ops/s { iterations: 1, elapsed: 2 } + getAdjectives : 500 ops/s { iterations: 1, elapsed: 2 } + getAdverbs : 1000 ops/s { iterations: 1, elapsed: 1 } +``` + On a win7/64-bit/dual-core/3GHz. getPOS() is slowest as it searches through all four index files. -There is probably room for optimization in the underlying library. 
License ------- diff --git a/package.json b/package.json index 1c46ccf..9c7c5c6 100644 --- a/package.json +++ b/package.json @@ -3,7 +3,7 @@ "author": "Moos ", "keywords": ["natural", "language", "wordnet", "pos"], "description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.", - "version": "0.1.3", + "version": "0.1.4", "homepage": "https://github.com/moos/wordpos", "engines": { "node": ">=0.4.10" @@ -20,5 +20,8 @@ "type" : "git", "url" : "git://github.com/moos/wordpos.git" }, - "main": "./wordpos.js" + "main": "./wordpos.js", + "scripts": { + "postinstall": "node tools/stat.js --no-stats index.adv index.adj index.verb index.noun" + } } diff --git a/tools/buffered-reader.js b/tools/buffered-reader.js new file mode 100644 index 0000000..12db383 --- /dev/null +++ b/tools/buffered-reader.js @@ -0,0 +1,402 @@ +/** + * @name BufferedReader. + * @description Fully configurable buffered reader for node.js. + * + * @author Gabriel Llamas + * @created 10/04/2012 + * @modified 01/05/2012 + * @version 0.2.0 + * + * Forked: https://github.com/moos/Node-BufferedReader + */ +"use strict"; + +var EVENTS = require ("events"); +var FS = require ("fs"); + +var BUFFER_SIZE = 16384; + +var INVALID_BUFFER_SIZE = "The buffer size must be greater than 0."; +var INVALID_START_OFFSET = "The start offset must be greater than or equals to 0."; +var INVALID_END_OFFSET = "The end offset must be greater than or equals to 0."; +var INVALID_RANGE_OFFSET = "The end offset must be greater than or equals to the start offset."; +var INVALID_BYTES_RANGE_ERROR = "The number of bytes to read must be greater than 0."; +var INVALID_SEEK_OFFSET = "The offset must be greater than or equals to 0."; +var NO_FILE_ERROR = "The source is not a file."; + +var BufferedReader = function (fileName, settings){ + EVENTS.EventEmitter.call (this); + + settings = settings || {}; + + if (settings.bufferSize === 0) settings.bufferSize = -1; + this._settings = { + bufferSize: 
settings.bufferSize || BUFFER_SIZE, + encoding: settings.encoding || null, + start: settings.start || 0, + end: settings.end + }; + + if (this._settings.bufferSize < 1) throw new Error (INVALID_BUFFER_SIZE); + if (this._settings.start < 0) throw new Error (INVALID_START_OFFSET); + if (this._settings.end < 0) throw new Error (INVALID_END_OFFSET); + if (this._settings.end < this._settings.start) throw new Error (INVALID_RANGE_OFFSET); + + this._fileName = fileName; + this._fd = null; + this._buffer = null; + + this._fileOffset = this._settings.start; + this._bufferOffset = 0; + this._dataOffset = 0; + this._realOffset = this._settings.start; + + this._fileSize = null; + this._initialized = false; + this._interrupted = false; + this._isEOF = false; + this._noMoreBuffers = false; + this._needRead = false; +}; + +BufferedReader.prototype = Object.create (EVENTS.EventEmitter.prototype); +BufferedReader.prototype.constructor = BufferedReader; + +BufferedReader.prototype.interrupt = function (){ + this._interrupted = true; +}; + +BufferedReader.prototype.read = function (){ + var stream = FS.createReadStream (this._fileName, this._settings); + + var lastChunk; + var buffer; + var me = this; + var lineOffset = 0, + lineCount = 0, + byteOffset = 0; + + var onChar = this.listeners ("character").length !== 0, + onLine = this.listeners ("line").length !== 0, + onByte = this.listeners ("byte").length !== 0, + loop = onChar || onLine || onByte; + + stream.on ("data", function (data){ + buffer = data; + var offset = 0; + var chunk; + var character; + var len = data.length; + + if (loop){ + for (var i=0; i= stats.size){ + me._isEOF = true; + return cb (null); + } + if (!me._settings.end && me._settings.end !== 0){ + me._settings.end = stats.size; + } + if (me._settings.end >= stats.size){ + me._settings.end = stats.size - 1; + } + me._fileSize = stats.size; + cb (null); + }else{ + cb (new Error (NO_FILE_ERROR)); + } + }); +}; + +BufferedReader.prototype._read = function (cb){ + var 
me = this; + var size = this._settings.bufferSize; + FS.read (this._fd, this._buffer, 0, size, this._fileOffset, function (error, bytesRead){ + if (error) return cb (error); + + me._fileOffset += bytesRead; + if (me._fileOffset === me._fileSize){ + me._noMoreBuffers = true; + } + if (bytesRead < size){ + me._buffer = me._buffer.slice (0, bytesRead); + } + cb (null); + }); +}; + +BufferedReader.prototype._readBytes = function (bytes, cb){ + if (this._needRead){ + this._needRead = false; + var me = this; + this._read (function (error){ + if (error) return cb (error, null, -1); + me._readBytes (bytes, cb); + }); + return; + } + + var fill = function (){ + var endData = bytes - me._dataOffset; + var endBuffer = me._buffer.length - me._bufferOffset; + var end = endBuffer <= endData ? endBuffer : endData; + + me._buffer.copy (data, me._dataOffset, me._bufferOffset, me._bufferOffset + end); + me._bufferOffset += end; + me._realOffset += end; + + if (me._bufferOffset === me._buffer.length){ + me._bufferOffset = 0; + me._needRead = true; + } + me._dataOffset += end; + + if (me._dataOffset === bytes){ + me._dataOffset = 0; + me._isEOF = me._noMoreBuffers; + cb (null, data, bytes); + }else{ + if (me._noMoreBuffers){ + me._isEOF = true; + end = me._dataOffset; + me._dataOffset = 0; + cb (null, data.slice (0, end), end); + }else{ + me._needRead = false; + me._read (function (error){ + if (error) return cb (error, null, -1); + + fill (); + }); + } + } + }; + + var me = this; + + var max = me._settings.end - me._realOffset + 1; + bytes = max < bytes ? 
max : bytes; + if (bytes === 0) return cb (null, null, 0); + + var data = new Buffer (bytes); + var len = me._buffer.length; + + if (bytes <= len){ + var end = me._bufferOffset + bytes; + + if (end <= len){ + me._buffer.copy (data, 0, me._bufferOffset, end); + me._bufferOffset = end; + me._realOffset += bytes; + cb (null, data, bytes); + }else{ + var last = len - me._bufferOffset; + me._realOffset += last; + + if (last !== 0){ + me._buffer.copy (data, 0, me._bufferOffset, me._bufferOffset + last); + } + if (me._noMoreBuffers){ + me._isEOF = true; + return cb (null, data.slice (0, last), last); + } + + me._read (function (error){ + if (error) return cb (error, null, -1); + + len = me._buffer.length; + var remaining = bytes - last; + if (len <= remaining){ + me._realOffset += len; + me._isEOF = true; + me._buffer.copy (data, last, 0, len); + var lastChunk = last + len; + cb (null, data.slice (0, lastChunk), lastChunk); + }else{ + me._realOffset += remaining; + me._bufferOffset = remaining; + me._buffer.copy (data, last, 0, me._bufferOffset); + cb (null, data, bytes); + } + }); + } + }else{ + fill (); + } +}; + +BufferedReader.prototype.close = function (cb){ + if (cb) cb = cb.bind (this); + if (!this._fd){ + if (cb) cb (null); + return; + } + + var me = this; + FS.close (this._fd, function (error){ + me._fd = null; + me._buffer = null; + if (cb) cb (error); + }); +}; + +BufferedReader.prototype.readBytes = function (bytes, cb){ + cb = cb.bind (this); + if (bytes < 1 || this._isEOF) return cb (null, null, 0); + + var open = function (){ + if (me._isEOF) return cb (null, null, 0); + FS.open (me._fileName, "r", function (error, fd){ + if (error) return cb (error, null, -1); + + me._fd = fd; + me._buffer = new Buffer (me._settings.bufferSize); + me._read (function (error){ + if (error) return cb (error, null, -1); + me._readBytes (bytes, cb); + }); + }); + }; + + var me = this; + if (!this._initialized){ + this._init (function (error){ + if (error) return cb (error, 
null); + me._initialized = true; + open (); + }); + }else{ + if (!this._fd) return open (); + this._readBytes (bytes, cb); + } +}; + +BufferedReader.prototype.seek = function (offset, cb){ + cb = cb.bind (this); + if (offset < 0) return cb (new Error (INVALID_SEEK_OFFSET)); + + var seek = function (){ + offset += me._settings.start; + if (offset >= me._settings.end + 1){ + me._isEOF = true; + }else{ + me._isEOF = false; + var start = me._fileOffset - (me._buffer ? me._buffer.length : 0); + if (offset >= start && offset < me._fileOffset){ + me._bufferOffset = offset - start; + me._realOffset = offset; + }else{ + me._needRead = me._fd ? true : false; + me._noMoreBuffers = false; + me._fileOffset = offset; + me._bufferOffset = 0; + me._realOffset = offset; + } + } + cb (null); + }; + + var me = this; + if (!this._initialized){ + this._init (function (error){ + if (error) return cb (error, null); + me._initialized = true; + seek (); + }); + }else{ + seek (); + } +}; + +BufferedReader.prototype.skip = function (bytes, cb){ + cb = cb.bind (this); + if (bytes < 1 || this._isEOF) return cb (null, 0); + + var skip = function (){ + var remaining = me._settings.end - me._realOffset + 1; + bytes = bytes <= remaining ? 
bytes : remaining; + me.seek (me._realOffset - me._settings.start + bytes, function (){ + cb (null, bytes); + }); + }; + + var me = this; + if (!this._initialized){ + this._init (function (error){ + if (error) return cb (error, null); + me._initialized = true; + skip (); + }); + }else{ + skip (); + } +}; + +module.exports = BufferedReader; \ No newline at end of file diff --git a/tools/fastIndex.js b/tools/fastIndex.js new file mode 100644 index 0000000..9e03b7c --- /dev/null +++ b/tools/fastIndex.js @@ -0,0 +1,166 @@ +/** + * fastIndex.js + * + * override natural.WordNet's IndexFile to use fast index data + * + */ + +var _ = require('underscore')._, + util = require('util'), + path = require('path'), + fs = require('fs'), + KEY_LENGTH = 3; + +// load fast index bucket data +function loadFastIndex(dir, name) { + var jsonFile = path.join(dir, 'fast-' + name + '.json'), + data = null; + try{ + data = JSON.parse( fs.readFileSync(jsonFile,'utf8') ); + //console.log('loaded %d buckets for %s', data.stats.buckets, data.name); + } catch(e) { + console.error('Error with fast index file %s\n ', jsonFile, e); + } + return data; +} + +function readIndexForKey(key, index, callback) { + var data = index.fastIndex, + offset = data.offsets[key][0], + nextKey = data.offsets[key][1], + nextOffset = data.offsets[nextKey][0], + len = nextOffset - offset - 1, + buffer = new Buffer(len); + + fs.read(index.fd, buffer, 0, len, offset, function(err, count){ + if (err) return console.log(err); + //console.log(' read %d bytes for <%s>', count, key); + callback(buffer); + }); +} + +function find(search, callback) { + var self = this, + data = this.fastIndex, + readCallbacks = this.cache, + miss = {status: 'miss'}, + args = [search, callback]; + + var key = search.slice(0, KEY_LENGTH); + if (!(key in data.offsets)) return callback(miss); + + // queue up if already reading file for this key + if (key in readCallbacks){ + readCallbacks[key].push(args); + return; + } + readCallbacks[key] = 
[args]; + if (!this.fd) { + //console.log(' ... opening', this.filePath); + this.fd = fs.openSync(this.filePath, 'r'); + } + + // ref count so we know when to close the main index file + ++this.refcount; + + readIndexForKey(key, this, function (buffer){ + var lines = buffer.toString().split('\n'), + keys = lines.map(function(line){ + return line.substring(0,line.indexOf(' ')); + }); + + readCallbacks[key].forEach( test ); + delete readCallbacks[key]; + + if (--self.refcount == 0) { + //console.log(' ... closing', self.filePath); + fs.close(self.fd); + self.fd = null; + } + + function test(item) { + var search = item[0], + callback = item[1], + ind = _.indexOf(keys, search, /*isSorted*/ true); // binary search! + //console.log(' %s is %d', search, ind); + if (ind == -1) return callback(miss); + + var tokens = lines[ind].split(/\s+/), + key = tokens[0], + result = {status: 'hit', key: key, 'line': lines[ind], tokens: tokens}; + + callback(result); + } + }); +} + +function find____(search, callback) { +// console.log(' >> ', search, this.fileName, this.fd); + var self = this, + data = this.fastIndex, + miss = {status: 'miss'}; + + var key = search.slice(0, KEY_LENGTH); + if (!(key in data.offsets)) return callback(miss); + + if (!this.fd) { +// console.log(' ... 
opening', this.filePath); + this.fd = fs.openSync(this.filePath, 'r'); + } + + // ref count so we know when to close the main index file + ++this.refcount; + + var offset = data.offsets[key][0], + nextKey = data.offsets[key][1], + nextOffset = data.offsets[nextKey][0], + len = nextOffset - offset - 1, + buffer = new Buffer(len), + pos = Math.ceil(len / 2) - 0; + + console.log('--', offset, len, offset+len, offset+pos); + + // call base class's _findAt to search only relevant portion + this._findAt(this.fd, // fd + offset+len * 1, // size (more like 'end' of buffer) + offset+pos, // pos + null, // lastPos + pos * 1, // adjustment + search, // key + done); // callback + + function done(result) { + //console.log(self.refcount, search, result && result.line); + if (--self.refcount == 0) { + //console.log(' ... closing', self.filePath); + fs.close(self.fd); + self.fd = null; + } + callback(result); + } +} + +// cache of fast index data across instances of WordPOS class +var cache = {}; + +module.exports = { + find: function(index){ + + var key = index.filePath, + data; + + if (!(key in cache)) { + data = loadFastIndex(index.dataDir, index.fileName); + cache[key] = data; + } + + // if no fast index data was found or was corrupt, use original find + if (!cache[key]) return index.find; + + index.fastIndex = cache[key]; + index.refcount = 0; + index.cache = {}; + + return find; + } +}; diff --git a/tools/stat.js b/tools/stat.js new file mode 100644 index 0000000..6d429f3 --- /dev/null +++ b/tools/stat.js @@ -0,0 +1,149 @@ +/** + * generate fast index for WordNet index files + * + * Usage: + * node stat [--no-stats] index.adv ... 
+ * + * --no-stats prevents writing stat data to file + * Fast index is based on buckets keyed off first THREE characters in the index word, + * eg, 'awesome' goes into bucket 'awe' + * + * Format of the fast index: + * { + * "firstKey":".22", // first key value + * "keyLength":3, // #characters in key + * "version":"3.0", // WNdb version + * "name":"index.adj", // index file name + * "stats":{ + * "buckets":2326, // # of buckets + * "words":21479, // total # words + * "biggest":310, // #words in biggest bucket + * "avg":"9.23", // average #words per bucket + * "median":3 // median #words per bucket + * }, + * "offsets":{ + * "100":[2271,"101"], // "100" is the key, + * // value=[byte offset in index file, next key] + * ... + * } + * } + * + * To lookup a word: + * + * find key (first chars of word) + * look it up in O(1) + * if it exists + * get offset of key and offset of next key + * read index file between the two offsets + * binary search read data O(log avg) + */ +var + WNdb = require('../wordpos').WNdb, + util = require('util'), + BufferedReader = require ("./buffered-reader"), + _ = require('underscore')._, + fs = require('fs'), + path = require('path'), + KEY_LENGTH = 3, + stats = true, + eofKey = '_EOF_'; // should be unique + +console.log('DB folder: ', WNdb.path); +if (process.argv.length < 3) { + console.log('#Usage:\nnode stat index.adv ...'); + process.exit(1); +} + +_(process.argv.slice(2)).filter(function(arg){ + // disable writing stats file + if (arg == '--no-stats') { + stats = false; + return false; + } + return true; +}).forEach(function(basename){ + + var indexFile = path.join(WNdb.path, basename), + jsonFile = path.join(WNdb.path, 'fast-' + basename + '.json'), + countFile = 'fast-' + basename + '.tsv', + endOffset = fs.statSync(indexFile).size, + buckets = {}, + lastKey = null, + offsets = {}, + firstKey = null; + + new BufferedReader (indexFile, {encoding: "utf8"}) + .on ("error", function (error){ + console.log ("error: %s", indexFile, 
error); + }) + .on ("line", function (line, offset){ + // skip license info + if (line[0] == ' ') return; + + // if (++i > 225) return this.interrupt(); + var key = line.substring(0, Math.min(line.indexOf(' '), KEY_LENGTH)); + if (firstKey === null) firstKey = key; + + if (key in buckets) { + ++buckets[key]; + return; + } + + buckets[key] = 1; + offsets[key] = [offset]; + (lastKey !== null) && offsets[lastKey].push(key); // current key is the 'next key' for the previous key + lastKey = key; + }) + .on ("end", function (){ + + // add EOF offset + offsets[lastKey].push(eofKey); + offsets[eofKey] = [endOffset, null]; + + var size = _.size(buckets), + sum = _.reduce(buckets, function(memo, num){ return memo + num; }, 0), + sorted = _.sortBy(buckets, function(val){ return val }), + median = sorted[Math.floor(size/2)], + max = sorted[sorted.length-1], // _.max(buckets), + maxkey = _.reduce(buckets, function(memo, val, key){ return memo + (val == max ? key : '') }, ''), + avg = (sum/size).toFixed(2), + info = util.format('buckets %d, max %d at %s, sum %d, avg %d, median %d', size, max, maxkey, sum, avg, median); + +// console.log(sorted); +// return; + + console.log(basename, info); + + if (stats) { + // distribution in groups of 10 + var grouped = _.groupBy(buckets, function(num){ return 1 + 10*(Math.floor((num-1)/10) ) }); + _(grouped).each(function(arr, key, list){ + list[key] = arr.length; + }); + str = ''; + _.each(grouped, function(value, key){ str += key+"\t"+value+"\n" }); + fs.writeFileSync(countFile, '#'+info+'\n' + + '#bucket_size (1-10, 11-20, etc.) 
\t #buckets\n' + + str, 'utf8'); + } + + // offset data + var data = { + firstKey: firstKey, + keyLength: KEY_LENGTH, + version: WNdb.version, + name: basename, + stats: { + buckets: size, + words: sum, + biggest: max, + avg: avg, + median: median + }, + offsets: offsets + }; + + fs.writeFileSync(jsonFile, JSON.stringify(data), 'utf8'); + }) + .read(); +}); diff --git a/wordpos.js b/wordpos.js index c584334..073a426 100644 --- a/wordpos.js +++ b/wordpos.js @@ -4,6 +4,8 @@ * Node.js part-of-speech utilities using natural's WordNet module. * * Copyright (c) 2012 mooster@42at.com +* https://github.com/moos/wordpos +* * Released under MIT license */ @@ -13,7 +15,12 @@ var _ = require('underscore')._, WordNet = natural.WordNet, tokenizer = new natural.WordTokenizer(), stopwords = ' '+ natural.stopwords.join(' ') +' ', - WNdb = require('WNdb'); + WNdb = require('WNdb'), + fastIndex = null; + +try { + fastIndex = require('./tools/fastIndex'); +} catch(e) {} function normalize(word) { return word.toLowerCase().replace(/\s+/g, '_'); @@ -95,6 +102,14 @@ var WordPOS = function(options) { WordPOS.super_.apply(this, arguments); } this.options = _.defaults({}, _.isObject(options) && options || {}, WordPOS.defaults); + + if (this.options.fastIndex && fastIndex) { + // override find + this.nounIndex.find = fastIndex.find(this.nounIndex); + this.verbIndex.find = fastIndex.find(this.verbIndex); + this.adjIndex.find = fastIndex.find(this.adjIndex); + this.advIndex.find = fastIndex.find(this.advIndex); + } }; util.inherits(WordPOS, WordNet); @@ -102,7 +117,12 @@ WordPOS.defaults = { /** * enable profiling, time in msec returned as second argument in callback */ - profile: false + profile: false, + + /** + * use fast index if available + */ + fastIndex: true }; var wordposProto = WordPOS.prototype;