added fastIndex feature. v0.1.4

This commit is contained in:
moos 2012-05-20 11:29:10 -07:00
parent a8ae4c3f13
commit 6652265ef0
6 changed files with 762 additions and 7 deletions

View File

@ -197,7 +197,12 @@ WordPOS.defaults = {
/** /**
* enable profiling, time in msec returned as second argument in callback * enable profiling, time in msec returned as second argument in callback
*/ */
profile: false profile: false,
/**
* use fast index if available
*/
fastIndex: true
}; };
``` ```
To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive a second argument that is the execution time in msec of the call. To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive a second argument that is the execution time in msec of the call.
@ -208,6 +213,8 @@ To override, pass an options hash to the constructor. With the `profile` option,
// true 29 // true 29
``` ```
Version 0.1.4 introduces the `fastIndex` option. This uses a secondary index on the index files and is much faster. It is on by default. Secondary index files are generated at install time and placed in the same directory as WNdb.path. Details can be found in tools/stat.js.
Benchmark Benchmark
---------- ----------
@ -225,7 +232,7 @@ Single word lookup:
getAdverbs : 137 ops/s { iterations: 10, elapsed: 73 } getAdverbs : 137 ops/s { iterations: 10, elapsed: 73 }
``` ```
128-word lookup: 128-word lookup (orig) :
``` ```
getPOS : 0 ops/s { iterations: 1, elapsed: 2210 } getPOS : 0 ops/s { iterations: 1, elapsed: 2210 }
getNouns : 2 ops/s { iterations: 1, elapsed: 666 } getNouns : 2 ops/s { iterations: 1, elapsed: 666 }
@ -234,9 +241,17 @@ Single word lookup:
getAdverbs : 2 ops/s { iterations: 1, elapsed: 407 } getAdverbs : 2 ops/s { iterations: 1, elapsed: 407 }
``` ```
128-word lookup (fastIndex) :
```
getPOS : 36 ops/s { iterations: 1, elapsed: 28 }
getNouns : 125 ops/s { iterations: 1, elapsed: 8 }
getVerbs : 500 ops/s { iterations: 1, elapsed: 2 }
getAdjectives : 500 ops/s { iterations: 1, elapsed: 2 }
getAdverbs : 1000 ops/s { iterations: 1, elapsed: 1 }
```
On a win7/64-bit/dual-core/3GHz. getPOS() is slowest as it searches through all four index files. On a win7/64-bit/dual-core/3GHz. getPOS() is slowest as it searches through all four index files.
There is probably room for optimization in the underlying library.
License License
------- -------

View File

@ -3,7 +3,7 @@
"author": "Moos <mooster@42at.com>", "author": "Moos <mooster@42at.com>",
"keywords": ["natural", "language", "wordnet", "pos"], "keywords": ["natural", "language", "wordnet", "pos"],
"description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.", "description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.",
"version": "0.1.3", "version": "0.1.4",
"homepage": "https://github.com/moos/wordpos", "homepage": "https://github.com/moos/wordpos",
"engines": { "engines": {
"node": ">=0.4.10" "node": ">=0.4.10"
@ -20,5 +20,8 @@
"type" : "git", "type" : "git",
"url" : "git://github.com/moos/wordpos.git" "url" : "git://github.com/moos/wordpos.git"
}, },
"main": "./wordpos.js" "main": "./wordpos.js",
"scripts": {
"postinstall": "node tools/stat.js --no-stats index.adv index.adj index.verb index.noun"
}
} }

402
tools/buffered-reader.js Normal file
View File

@ -0,0 +1,402 @@
/**
* @name BufferedReader.
* @description Fully configurable buffered reader for node.js.
*
* @author Gabriel Llamas
* @created 10/04/2012
* @modified 01/05/2012
* @version 0.2.0
*
* Forked: https://github.com/moos/Node-BufferedReader
*/
"use strict";

// Node built-ins used throughout the reader.
var EVENTS = require ("events");
var FS = require ("fs");

// Default internal buffer size (16 KiB) when settings.bufferSize is not given.
var BUFFER_SIZE = 16384;

// Validation error messages thrown by the constructor and seek().
var INVALID_BUFFER_SIZE = "The buffer size must be greater than 0.";
var INVALID_START_OFFSET = "The start offset must be greater than or equals to 0.";
var INVALID_END_OFFSET = "The end offset must be greater than or equals to 0.";
var INVALID_RANGE_OFFSET = "The end offset must be greater than or equals to the start offset.";
var INVALID_BYTES_RANGE_ERROR = "The number of bytes to read must be greater than 0.";
var INVALID_SEEK_OFFSET = "The offset must be greater than or equals to 0.";
var NO_FILE_ERROR = "The source is not a file.";
/**
 * BufferedReader — an EventEmitter over a file that supports both streaming
 * reads (read()) and random-access buffered reads (readBytes/seek/skip).
 *
 * @param {String} fileName  path of the file to read
 * @param {Object} [settings]  optional:
 *   bufferSize {Number}  internal buffer size in bytes (default 16384; must be > 0)
 *   encoding   {String}  stream encoding; null means raw Buffers
 *   start      {Number}  first byte offset to read (default 0)
 *   end        {Number}  last byte offset (inclusive); defaults to EOF (set in _init)
 * @throws {Error} when bufferSize/start/end are invalid
 */
var BufferedReader = function (fileName, settings){
	EVENTS.EventEmitter.call (this);

	settings = settings || {};
	// A bufferSize of exactly 0 is coerced to -1 so it fails the "< 1" check
	// below instead of silently falling back to the default via `|| BUFFER_SIZE`.
	if (settings.bufferSize === 0) settings.bufferSize = -1;
	this._settings = {
		bufferSize: settings.bufferSize || BUFFER_SIZE,
		encoding: settings.encoding || null,
		start: settings.start || 0,
		end: settings.end              // may be undefined; clamped to file size in _init()
	};

	if (this._settings.bufferSize < 1) throw new Error (INVALID_BUFFER_SIZE);
	if (this._settings.start < 0) throw new Error (INVALID_START_OFFSET);
	// NOTE: when end is undefined these comparisons are false, so no throw — by design.
	if (this._settings.end < 0) throw new Error (INVALID_END_OFFSET);
	if (this._settings.end < this._settings.start) throw new Error (INVALID_RANGE_OFFSET);

	this._fileName = fileName;
	this._fd = null;                       // fd opened lazily by readBytes()
	this._buffer = null;                   // internal read buffer
	this._fileOffset = this._settings.start;   // next file position to read a buffer from
	this._bufferOffset = 0;                // read position inside the internal buffer
	this._dataOffset = 0;                  // write position inside the caller's result buffer
	this._realOffset = this._settings.start;   // logical read cursor, in file coordinates
	this._fileSize = null;                 // set by _init()
	this._initialized = false;             // _init() run yet?
	this._interrupted = false;             // set by interrupt()
	this._isEOF = false;                   // logical end (past settings.end) reached
	this._noMoreBuffers = false;           // physical end of file reached by _read()
	this._needRead = false;                // internal buffer exhausted; refill before next copy
};
// Inherit EventEmitter: read() emits "character", "byte", "line", "buffer",
// "end" and "error" events.
BufferedReader.prototype = Object.create (EVENTS.EventEmitter.prototype);
BufferedReader.prototype.constructor = BufferedReader;

// Ask an in-progress read() to stop; takes effect at the next data chunk,
// after which "end" is emitted.
BufferedReader.prototype.interrupt = function (){
	this._interrupted = true;
};
/**
 * Streams the whole file (honoring settings.start/end/encoding), emitting:
 *   "character" (only when an encoding is set)  — one char at a time; "\r" is
 *                normalized to "\n"
 *   "byte"      (only when NO encoding is set)  — one byte at a time
 *   "line"      (only when an encoding is set)  — (line, byteOffsetOfLine, lineNumber);
 *                handles "\n", "\r" and "\r\n" terminators
 *   "buffer"    — every raw chunk with its byte offset
 *   "end" / "error"
 *
 * The per-item loop only runs if at least one character/line/byte listener
 * is registered; otherwise only "buffer" events are emitted.
 */
BufferedReader.prototype.read = function (){
	var stream = FS.createReadStream (this._fileName, this._settings);
	var lastChunk;          // partial line carried over between data chunks
	var buffer;
	var me = this;
	var lineOffset = 0,     // byte offset of the start of the current line
		lineCount = 0,
		byteOffset = 0;     // byte offset of the start of the current chunk

	// Snapshot listener presence once; drives whether the char loop runs at all.
	var onChar = this.listeners ("character").length !== 0,
		onLine = this.listeners ("line").length !== 0,
		onByte = this.listeners ("byte").length !== 0,
		loop = onChar || onLine || onByte;

	stream.on ("data", function (data){
		buffer = data;
		var offset = 0;     // start (within this chunk) of the current line
		var chunk;
		var character;
		var len = data.length;

		if (loop){
			for (var i=0; i<len; i++){
				if (me._interrupted) break;
				character = data[i];
				if (stream.encoding){
					// Normalize CR to LF for the character event.
					onChar && me.emit ("character", character === "\r" ? "\n" : character, byteOffset + i);
				}else{
					// Raw mode: bytes only — no line splitting possible.
					onByte && me.emit ("byte", character, byteOffset + i);
					continue;
				}
				if (!onLine) continue;
				if (character === "\n" || character === "\r"){
					chunk = data.slice (offset, i);
					if (lastChunk){
						// Prepend the partial line left over from the previous chunk.
						chunk = lastChunk.concat (chunk);
					}
					// Swallow the LF of a CRLF pair so it doesn't emit an empty line.
					if (i + 1 !== len && character === "\r" && data[i + 1] === "\n"){
						i++;
					}
					me.emit ("line", chunk, lineOffset + offset, ++lineCount);
					offset = i + 1;
					if (lastChunk){
						lineOffset += lastChunk.length;
						lastChunk = null;
					}
				}
			}
			// Stash any unterminated tail of this chunk for the next one.
			if (stream.encoding && offset !== len){
				var s = offset === 0 ? data : data.slice (offset);
				lastChunk = lastChunk ? lastChunk.concat (s) : s;
			}
			lineOffset += offset;
		}

		me.emit ("buffer", data, byteOffset);

		if (me._interrupted){
			me._interrupted = false;
			stream.destroy ();
			me.emit ("end");
		}
		byteOffset += len;
	});
	stream.on ("end", function (){
		me._interrupted = false;
		// Flush the final line if the file didn't end with a terminator.
		if (loop && lastChunk){
			me.emit ("line", lastChunk);
		}
		me.emit ("end");
	});
	stream.on ("error", function (error){
		me._interrupted = false;
		me.emit ("error", error);
	});
};
/**
 * Lazy one-time initialization for random-access reads: stats the file,
 * records its size, clamps settings.end to the last valid offset, and
 * flags EOF immediately when settings.start is past the end of the file.
 *
 * @param {Function} cb - cb(error); error is NO_FILE_ERROR for non-files
 */
BufferedReader.prototype._init = function (cb){
	var me = this;
	FS.stat (this._fileName, function (error, stats){
		if (error) return cb (error);
		if (stats.isFile ()){
			if (me._settings.start >= stats.size){
				me._isEOF = true;
				return cb (null);
			}
			// end was left undefined (but an explicit 0 is respected): read to EOF.
			if (!me._settings.end && me._settings.end !== 0){
				me._settings.end = stats.size;
			}
			// Clamp end to the last valid byte offset.
			if (me._settings.end >= stats.size){
				me._settings.end = stats.size - 1;
			}
			me._fileSize = stats.size;
			cb (null);
		}else{
			cb (new Error (NO_FILE_ERROR));
		}
	});
};
/**
 * Refills the internal buffer with the next chunk starting at the current
 * file offset. Sets _noMoreBuffers once the physical end of the file is
 * reached, and trims the buffer after a short read so its length always
 * reflects valid data.
 *
 * @param {Function} cb - cb(error)
 */
BufferedReader.prototype._read = function (cb){
	var self = this;
	var chunkSize = this._settings.bufferSize;
	FS.read (this._fd, this._buffer, 0, chunkSize, this._fileOffset, function (error, bytesRead){
		if (error) return cb (error);
		self._fileOffset += bytesRead;
		// Physical EOF reached?
		if (self._fileOffset === self._fileSize) self._noMoreBuffers = true;
		// Short read: shrink the buffer to the bytes actually read.
		if (bytesRead < chunkSize) self._buffer = self._buffer.slice (0, bytesRead);
		cb (null);
	});
};
/**
 * Internal workhorse for readBytes(): copies `bytes` bytes starting at the
 * logical cursor (_realOffset) into a freshly allocated Buffer, refilling the
 * internal buffer from disk as needed. The request is first clamped to
 * settings.end. On success cb(null, data, byteCount); a short final read
 * returns a sliced buffer with byteCount < bytes; on error cb(error, null, -1).
 *
 * NOTE(review): the statement order here is load-bearing (offset bookkeeping
 * interleaves with async refills) — treat with care.
 *
 * @param {Number} bytes - number of bytes requested
 * @param {Function} cb - cb(error, buffer, bytesRead)
 */
BufferedReader.prototype._readBytes = function (bytes, cb){
	// Buffer was fully consumed on a previous call: refill, then retry.
	if (this._needRead){
		this._needRead = false;
		var me = this;
		this._read (function (error){
			if (error) return cb (error, null, -1);
			me._readBytes (bytes, cb);
		});
		return;
	}

	// fill(): multi-buffer copy loop. Copies as much as is available from the
	// internal buffer into `data`, refilling from disk until either the request
	// is satisfied or the file runs out.
	var fill = function (){
		var endData = bytes - me._dataOffset;                 // bytes still wanted
		var endBuffer = me._buffer.length - me._bufferOffset; // bytes available
		var end = endBuffer <= endData ? endBuffer : endData; // copy the smaller

		me._buffer.copy (data, me._dataOffset, me._bufferOffset, me._bufferOffset + end);
		me._bufferOffset += end;
		me._realOffset += end;

		// Internal buffer exhausted: next call must refill first.
		if (me._bufferOffset === me._buffer.length){
			me._bufferOffset = 0;
			me._needRead = true;
		}

		me._dataOffset += end;
		if (me._dataOffset === bytes){
			// Request fully satisfied.
			me._dataOffset = 0;
			me._isEOF = me._noMoreBuffers;
			cb (null, data, bytes);
		}else{
			if (me._noMoreBuffers){
				// File exhausted mid-request: return the partial result.
				me._isEOF = true;
				end = me._dataOffset;
				me._dataOffset = 0;
				cb (null, data.slice (0, end), end);
			}else{
				me._needRead = false;
				me._read (function (error){
					if (error) return cb (error, null, -1);
					fill ();
				});
			}
		}
	};

	var me = this;
	// Clamp the request so we never read past settings.end (inclusive).
	var max = me._settings.end - me._realOffset + 1;
	bytes = max < bytes ? max : bytes;
	if (bytes === 0) return cb (null, null, 0);
	var data = new Buffer (bytes);
	var len = me._buffer.length;

	if (bytes <= len){
		// Fast path: the request fits within one internal buffer's worth.
		var end = me._bufferOffset + bytes;
		if (end <= len){
			// Entirely satisfiable from data already buffered.
			me._buffer.copy (data, 0, me._bufferOffset, end);
			me._bufferOffset = end;
			me._realOffset += bytes;
			cb (null, data, bytes);
		}else{
			// Straddles the buffer boundary: copy the tail, refill, copy the rest.
			var last = len - me._bufferOffset;
			me._realOffset += last;
			if (last !== 0){
				me._buffer.copy (data, 0, me._bufferOffset, me._bufferOffset + last);
			}
			if (me._noMoreBuffers){
				me._isEOF = true;
				return cb (null, data.slice (0, last), last);
			}
			me._read (function (error){
				if (error) return cb (error, null, -1);
				len = me._buffer.length;
				var remaining = bytes - last;
				if (len <= remaining){
					// Refill couldn't cover the remainder: short (final) result.
					me._realOffset += len;
					me._isEOF = true;
					me._buffer.copy (data, last, 0, len);
					var lastChunk = last + len;
					cb (null, data.slice (0, lastChunk), lastChunk);
				}else{
					me._realOffset += remaining;
					me._bufferOffset = remaining;
					me._buffer.copy (data, last, 0, me._bufferOffset);
					cb (null, data, bytes);
				}
			});
		}
	}else{
		// Request larger than the internal buffer: iterate with fill().
		fill ();
	}
};
/**
 * Closes the underlying file descriptor (if any) and drops the internal
 * buffer. Safe to call when nothing is open.
 *
 * @param {Function} [cb] - cb(error), invoked with `this` bound to the reader
 */
BufferedReader.prototype.close = function (cb){
	var self = this;
	if (cb) cb = cb.bind (this);
	if (!this._fd){
		// Nothing open — report success when a callback was supplied.
		cb && cb (null);
		return;
	}
	FS.close (this._fd, function (error){
		self._fd = null;
		self._buffer = null;
		cb && cb (error);
	});
};
/**
 * Public random-access read: delivers up to `bytes` bytes from the current
 * logical position. Lazily runs _init() on first use and opens the file
 * descriptor on demand. cb(error, buffer, bytesRead) with `this` bound to
 * the reader; (null, null, 0) signals EOF or a non-positive request.
 *
 * @param {Number} bytes - number of bytes to read (must be >= 1)
 * @param {Function} cb - cb(error, buffer, bytesRead)
 */
BufferedReader.prototype.readBytes = function (bytes, cb){
	cb = cb.bind (this);
	if (bytes < 1 || this._isEOF) return cb (null, null, 0);

	// Opens the fd, allocates the internal buffer, primes it with the first
	// chunk, then performs the actual read.
	var open = function (){
		if (me._isEOF) return cb (null, null, 0);
		FS.open (me._fileName, "r", function (error, fd){
			if (error) return cb (error, null, -1);
			me._fd = fd;
			me._buffer = new Buffer (me._settings.bufferSize);
			me._read (function (error){
				if (error) return cb (error, null, -1);
				me._readBytes (bytes, cb);
			});
		});
	};

	var me = this;
	if (!this._initialized){
		this._init (function (error){
			if (error) return cb (error, null);
			me._initialized = true;
			open ();
		});
	}else{
		if (!this._fd) return open ();
		this._readBytes (bytes, cb);
	}
};
/**
 * Moves the logical read cursor to `offset`, measured relative to
 * settings.start. Seeking past settings.end flags EOF. When the target
 * still lies inside the currently buffered window, only the buffer offset
 * moves; otherwise the buffer is invalidated and the next read refills
 * from the new position.
 *
 * @param {Number} offset - target position relative to settings.start (>= 0)
 * @param {Function} cb - cb(error), `this` bound to the reader
 */
BufferedReader.prototype.seek = function (offset, cb){
	cb = cb.bind (this);
	if (offset < 0) return cb (new Error (INVALID_SEEK_OFFSET));

	var seek = function (){
		// Convert to absolute file coordinates.
		offset += me._settings.start;
		if (offset >= me._settings.end + 1){
			me._isEOF = true;
		}else{
			me._isEOF = false;
			// Absolute offset of the first byte currently held in the buffer.
			var start = me._fileOffset - (me._buffer ? me._buffer.length : 0);
			if (offset >= start && offset < me._fileOffset){
				// Target is inside the buffered window: cheap in-buffer seek.
				me._bufferOffset = offset - start;
				me._realOffset = offset;
			}else{
				// Outside the window: force a refill from the new position
				// (only if a fd is already open — otherwise open() will read).
				me._needRead = me._fd ? true : false;
				me._noMoreBuffers = false;
				me._fileOffset = offset;
				me._bufferOffset = 0;
				me._realOffset = offset;
			}
		}
		cb (null);
	};

	var me = this;
	if (!this._initialized){
		this._init (function (error){
			if (error) return cb (error, null);
			me._initialized = true;
			seek ();
		});
	}else{
		seek ();
	}
};
/**
 * Advances the logical cursor by up to `bytes` bytes (clamped to
 * settings.end) by delegating to seek(). cb(error, skippedBytes) with
 * `this` bound to the reader; (null, 0) when already at EOF or the
 * request is non-positive.
 *
 * @param {Number} bytes - number of bytes to skip (must be >= 1)
 * @param {Function} cb - cb(error, skippedBytes)
 */
BufferedReader.prototype.skip = function (bytes, cb){
	cb = cb.bind (this);
	if (bytes < 1 || this._isEOF) return cb (null, 0);

	var skip = function (){
		// Clamp so we never skip past the configured end offset.
		var remaining = me._settings.end - me._realOffset + 1;
		bytes = bytes <= remaining ? bytes : remaining;
		me.seek (me._realOffset - me._settings.start + bytes, function (){
			cb (null, bytes);
		});
	};

	var me = this;
	if (!this._initialized){
		this._init (function (error){
			if (error) return cb (error, null);
			me._initialized = true;
			skip ();
		});
	}else{
		skip ();
	}
};

module.exports = BufferedReader;

166
tools/fastIndex.js Normal file
View File

@ -0,0 +1,166 @@
/**
* fastIndex.js
*
* override natural.WordNet's IndexFile to use fast index data
*
*/
var _ = require('underscore')._,
util = require('util'),
path = require('path'),
fs = require('fs'),
KEY_LENGTH = 3;
// load fast index bucket data
// Loads the pre-computed fast-index bucket data ('fast-<name>.json', written
// by tools/stat.js) from `dir`. Returns the parsed object, or null when the
// file is missing or unparseable (error is logged, not thrown).
function loadFastIndex(dir, name) {
  var file = path.join(dir, 'fast-' + name + '.json');
  var parsed = null;
  try {
    var raw = fs.readFileSync(file, 'utf8');
    parsed = JSON.parse(raw);
    //console.log('loaded %d buckets for %s', parsed.stats.buckets, parsed.name);
  } catch (err) {
    // Absent or corrupt fast-index data: report and let the caller fall back.
    console.error('Error with fast index file %s\n ', file, err);
  }
  return parsed;
}
// Reads the raw bucket for `key` from the open index file. The bucket spans
// from this key's byte offset up to (but excluding) the next key's offset,
// minus the trailing newline. Invokes callback(buffer) with the raw bytes.
function readIndexForKey(key, index, callback) {
  var offsets = index.fastIndex.offsets;
  var start = offsets[key][0];
  var nextKey = offsets[key][1];          // key of the following bucket
  var length = offsets[nextKey][0] - start - 1;
  var chunk = new Buffer(length);
  fs.read(index.fd, chunk, 0, length, start, function (err, bytesRead) {
    // NOTE(review): on error the callback never fires, only a log — confirm
    // callers (find's queued lookups) tolerate this.
    if (err) return console.log(err);
    //console.log(' read %d bytes for <%s>', bytesRead, key);
    callback(chunk);
  });
}
/**
 * Fast replacement for IndexFile.find(): resolves `search` via the
 * pre-computed 3-char bucket offsets in this.fastIndex instead of scanning
 * the whole index file. Installed onto an IndexFile instance, so `this` is
 * that instance.
 *
 * Concurrent lookups hitting the same bucket are queued in this.cache so the
 * bucket is read from disk only once; a refcount closes the shared fd once
 * all in-flight buckets have been answered.
 *
 * @param {String} search - word to look up
 * @param {Function} callback - receives {status:'miss'} or
 *   {status:'hit', key, line, tokens}
 */
function find(search, callback) {
  var self = this,
    data = this.fastIndex,
    readCallbacks = this.cache,
    miss = {status: 'miss'},
    args = [search, callback];

  var key = search.slice(0, KEY_LENGTH);
  // No bucket for this prefix => the word cannot be in the index.
  if (!(key in data.offsets)) return callback(miss);

  // queue up if already reading file for this key
  if (key in readCallbacks){
    readCallbacks[key].push(args);
    return;
  }
  readCallbacks[key] = [args];

  if (!this.fd) {
    //console.log(' ... opening', this.filePath);
    this.fd = fs.openSync(this.filePath, 'r');
  }

  // ref count so we know when to close the main index file
  ++this.refcount;

  readIndexForKey(key, this, function (buffer){
    // Split the bucket into lines and extract each line's head word.
    var lines = buffer.toString().split('\n'),
      keys = lines.map(function(line){
        return line.substring(0,line.indexOf(' '));
      });

    // Answer every lookup queued for this bucket, then release the queue.
    readCallbacks[key].forEach( test );
    delete readCallbacks[key];

    if (--self.refcount == 0) {
      //console.log(' ... closing', self.filePath);
      // NOTE(review): fs.close called without a callback — verify this is
      // accepted on the target Node version.
      fs.close(self.fd);
      self.fd = null;
    }

    // Resolve one queued (search, callback) pair against the bucket's lines.
    function test(item) {
      var search = item[0],
        callback = item[1],
        ind = _.indexOf(keys, search, /*isSorted*/ true); // binary search!

      //console.log(' %s is %d', search, ind);
      if (ind == -1) return callback(miss);

      var tokens = lines[ind].split(/\s+/),
        key = tokens[0],
        result = {status: 'hit', key: key, 'line': lines[ind], tokens: tokens};

      callback(result);
    }
  });
}
/**
 * Alternative fast-find variant — presumably experimental/shelved (note the
 * trailing underscores; nothing in this file references it). Instead of
 * reading the bucket and binary-searching it locally, it delegates to the
 * base class's _findAt() over just the bucket's byte range.
 *
 * NOTE(review): relies on an IndexFile._findAt(fd, size, pos, lastPos,
 * adjustment, key, cb) method not visible here — confirm against
 * natural.WordNet before reviving this path. The console.log below is
 * leftover debug output.
 */
function find____(search, callback) {
  // console.log(' >> ', search, this.fileName, this.fd);
  var self = this,
    data = this.fastIndex,
    miss = {status: 'miss'};

  var key = search.slice(0, KEY_LENGTH);
  // No bucket for this prefix => definite miss.
  if (!(key in data.offsets)) return callback(miss);

  if (!this.fd) {
    // console.log(' ... opening', this.filePath);
    this.fd = fs.openSync(this.filePath, 'r');
  }

  // ref count so we know when to close the main index file
  ++this.refcount;

  var offset = data.offsets[key][0],
    nextKey = data.offsets[key][1],
    nextOffset = data.offsets[nextKey][0],
    len = nextOffset - offset - 1,
    buffer = new Buffer(len),
    pos = Math.ceil(len / 2) - 0;   // start probing at the bucket's midpoint

  console.log('--', offset, len, offset+len, offset+pos);

  // call base class's _findAt to search only relevant portion
  this._findAt(this.fd, // fd
    offset+len * 1, // size (more like 'end' of buffer)
    offset+pos, // pos
    null, // lastPos
    pos * 1, // adjustment
    search, // key
    done); // callback

  function done(result) {
    //console.log(self.refcount, search, result && result.line);
    if (--self.refcount == 0) {
      //console.log(' ... closing', self.filePath);
      fs.close(self.fd);
      self.fd = null;
    }
    callback(result);
  }
}
// cache of fast index data across instances of WordPOS class
var cache = {};
module.exports = {
find: function(index){
var key = index.filePath,
data;
if (!(key in cache)) {
data = loadFastIndex(index.dataDir, index.fileName);
cache[key] = data;
}
// if no fast index data was found or was corrupt, use original find
if (!cache[key]) return index.find;
index.fastIndex = cache[key];
index.refcount = 0;
index.cache = {};
return find;
}
};

149
tools/stat.js Normal file
View File

@ -0,0 +1,149 @@
/**
* generate fast index for WordNet index files
*
* Usage:
* node stat [--no-stats] index.adv ...
*
* --no-stats prevents writing stat data to file
* Fast index is based on buckets keyed off first THREE characters in the index word,
* eg, 'awesome' goes into bucket 'awe'
*
* Format of the fast index:
* {
* "firstKey":".22", // first key value
* "keyLength":3, // #characters in key
* "version":"3.0", // WNdb version
* "name":"index.adj", // index file name
* "stats":{
* "buckets":2326, // # of buckets
* "words":21479, // total # words
* "biggest":310, // #words in biggest bucket
* "avg":"9.23", // average #words per bucket
* "median":3 // median #words per bucket
* },
* "offsets":{
* "100":[2271,"101"], // "100" is the key,
* // value=[byte offset in index file, next key]
* ...
* }
* }
*
* To lookup a word:
*
* find key (first <keyLength> chars of word)
* look it up in <offsets> O(1)
* if it exists
* get offset of key and offset of next key
* read index file between the two offsets
* binary search read data O(log avg)
*/
// Generate the fast index (fast-<name>.json) for each WordNet index file
// given on the command line, plus an optional bucket-size histogram TSV.
// See the file header for the fast-index JSON format and lookup algorithm.
var
  WNdb = require('../wordpos').WNdb,
  util = require('util'),
  BufferedReader = require ("./buffered-reader"),
  _ = require('underscore')._,
  fs = require('fs'),
  path = require('path'),
  KEY_LENGTH = 3,        // bucket key = first 3 chars of the index word
  stats = true,          // write the histogram TSV unless --no-stats is given
  eofKey = '_EOF_';      // sentinel key for the end-of-file offset; should be unique

console.log('DB folder: ', WNdb.path);

if (process.argv.length < 3) {
  // FIX: usage now mentions --no-stats, matching the header and package.json.
  console.log('#Usage:\nnode stat [--no-stats] index.adv ...');
  process.exit(1);
}

_(process.argv.slice(2)).filter(function(arg){
  // Consume the --no-stats flag; it is not an index file name.
  if (arg == '--no-stats') {
    stats = false;
    return false;
  }
  return true;
}).forEach(function(basename){
  var indexFile = path.join(WNdb.path, basename),
    jsonFile = path.join(WNdb.path, 'fast-' + basename + '.json'),
    countFile = 'fast-' + basename + '.tsv',
    endOffset = fs.statSync(indexFile).size,
    buckets = {},      // key -> number of words in that bucket
    lastKey = null,    // previous bucket key, to link "next key" pointers
    offsets = {},      // key -> [byte offset of first word, next key]
    firstKey = null;   // first key seen in the file

  new BufferedReader (indexFile, {encoding: "utf8"})
    .on ("error", function (error){
      console.log ("error: %s", indexFile, error);
    })
    .on ("line", function (line, offset){
      // skip license info (WordNet license lines start with a space)
      if (line[0] == ' ') return;

      var key = line.substring(0, Math.min(line.indexOf(' '), KEY_LENGTH));
      if (firstKey === null) firstKey = key;
      if (key in buckets) {
        ++buckets[key];
        return;
      }
      // First word of a new bucket: record its byte offset, and link the
      // previous bucket to it.
      buckets[key] = 1;
      offsets[key] = [offset];
      if (lastKey !== null) {
        offsets[lastKey].push(key); // current key is the 'next key' for the previous key
      }
      lastKey = key;
    })
    .on ("end", function (){
      // Terminate the chain: the last real bucket points at the EOF sentinel,
      // whose offset is the file size.
      offsets[lastKey].push(eofKey);
      offsets[eofKey] = [endOffset, null];

      var size = _.size(buckets),
        sum = _.reduce(buckets, function(memo, num){ return memo + num; }, 0),
        sorted = _.sortBy(buckets, function(val){ return val }),
        median = sorted[Math.floor(size/2)],
        max = sorted[sorted.length-1],
        maxkey = _.reduce(buckets, function(memo, val, key){ return memo + (val == max ? key : '') }, ''),
        avg = (sum/size).toFixed(2),
        info = util.format('buckets %d, max %d at %s, sum %d, avg %d, median %d', size, max, maxkey, sum, avg, median);

      console.log(basename, info);

      if (stats) {
        // Histogram of bucket sizes in groups of 10 (1-10, 11-20, ...).
        var grouped = _.groupBy(buckets, function(num){ return 1 + 10*(Math.floor((num-1)/10) ) });
        _(grouped).each(function(arr, key, list){
          list[key] = arr.length;
        });
        // FIX: `str` was an implicit global (missing var) — would leak across
        // scope and break under "use strict".
        var str = '';
        _.each(grouped, function(value, key){ str += key+"\t"+value+"\n" });
        fs.writeFileSync(countFile, '#'+info+'\n'
          + '#bucket_size (1-10, 11-20, etc.) \t #buckets\n'
          + str, 'utf8');
      }

      // Offset data — serialized in the format described in the file header.
      var data = {
        firstKey: firstKey,
        keyLength: KEY_LENGTH,
        version: WNdb.version,
        name: basename,
        stats: {
          buckets: size,
          words: sum,
          biggest: max,
          avg: avg,
          median: median
        },
        offsets: offsets
      };
      fs.writeFileSync(jsonFile, JSON.stringify(data), 'utf8');
    })
    .read();
});

View File

@ -4,6 +4,8 @@
* Node.js part-of-speech utilities using natural's WordNet module. * Node.js part-of-speech utilities using natural's WordNet module.
* *
* Copyright (c) 2012 mooster@42at.com * Copyright (c) 2012 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license * Released under MIT license
*/ */
@ -13,7 +15,12 @@ var _ = require('underscore')._,
WordNet = natural.WordNet, WordNet = natural.WordNet,
tokenizer = new natural.WordTokenizer(), tokenizer = new natural.WordTokenizer(),
stopwords = ' '+ natural.stopwords.join(' ') +' ', stopwords = ' '+ natural.stopwords.join(' ') +' ',
WNdb = require('WNdb'); WNdb = require('WNdb'),
fastIndex = null;
try {
fastIndex = require('./tools/fastIndex');
} catch(e) {}
function normalize(word) { function normalize(word) {
return word.toLowerCase().replace(/\s+/g, '_'); return word.toLowerCase().replace(/\s+/g, '_');
@ -95,6 +102,14 @@ var WordPOS = function(options) {
WordPOS.super_.apply(this, arguments); WordPOS.super_.apply(this, arguments);
} }
this.options = _.defaults({}, _.isObject(options) && options || {}, WordPOS.defaults); this.options = _.defaults({}, _.isObject(options) && options || {}, WordPOS.defaults);
if (this.options.fastIndex && fastIndex) {
// override find
this.nounIndex.find = fastIndex.find(this.nounIndex);
this.verbIndex.find = fastIndex.find(this.verbIndex);
this.adjIndex.find = fastIndex.find(this.adjIndex);
this.advIndex.find = fastIndex.find(this.advIndex);
}
}; };
util.inherits(WordPOS, WordNet); util.inherits(WordPOS, WordNet);
@ -102,7 +117,12 @@ WordPOS.defaults = {
/** /**
* enable profiling, time in msec returned as second argument in callback * enable profiling, time in msec returned as second argument in callback
*/ */
profile: false profile: false,
/**
* use fast index if available
*/
fastIndex: true
}; };
var wordposProto = WordPOS.prototype; var wordposProto = WordPOS.prototype;