Added seek() method and lexName property. Bump to 1.1.

2016-04-09 13:46:36 -07:00 · 2016-04-09 13:46:36 -07:00 · 0abe5a9010
parent 57c3340130
commit 0abe5a9010
7 changed files with 296 additions and 32 deletions
--- a/README.md
+++ b/README.md
@ -29,7 +29,7 @@ wordpos.isAdjective('awesome', function(result){
 // true 'awesome'
 ```
-Command-line: (see [CLI](bin))
+Command-line: (see [CLI](bin) for full command list)
 ```bash
 $ wordpos def git
 git
@ -71,7 +71,7 @@ WordPOS.defaults = {
  stopwords: true
 };
 ```
-To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive a last argument that is the execution time in msec of the call.
+To override, pass an options hash to the constructor. With the `profile` option, most callbacks receive a last argument that is the execution time in msec of the call.
 ```js
    wordpos = new WordPOS({profile: true});
@ -165,19 +165,33 @@ Example:
 ```js
 wordpos.lookupAdjective('awesome', console.log);
 // output:
-[ { synsetOffset: 1282510,
+[ { synsetOffset: 1285602,
    lexFilenum: 0,
    lexName: 'adj.all',
    pos: 's',
    wCnt: 5,
    lemma: 'amazing',
    synonyms: [ 'amazing', 'awe-inspiring', 'awesome', 'awful', 'awing' ],
    lexId: '0',
    ptrs: [],
-    gloss: 'inspiring awe or admiration or wonder; <snip> awing majesty, so vast, so high, so silent"  ' 
+    gloss: 'inspiring awe or admiration or wonder; [...] awing majesty, so vast, so high, so silent"  '
    def: 'inspiring awe or admiration or wonder',     
    ...
 } ], 'awesome'
 ```
-In this case only one lookup was found, but there could be several.
+In this case only one lookup was found, but there could be several.  
 Version 1.1 adds the `lexName` property, which maps the `lexFilenum` to one of [45 lexicographer domains](https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html).
 #### seek(offset, pos, callback)
 Version 1.1 introduces the seek method to look up a record directly from the synsetOffset for a given POS.  Unlike other methods, callback (if provided) receives `(err, result)` arguments.
 Examples:
 ```js
 wordpos.seek(1285602, 'a').then(console.log)
 // same result as wordpos.lookupAdjective('awesome', console.log);
 ```
 #### rand(options, callback)
 #### randNoun(options, callback)
@ -214,6 +228,7 @@ wordpos.rand({starsWith: 'zzz'}, console.log)
 Getting random POS (`randNoun()`, etc.) is generally faster than `rand()`, which may look at multiple POS files until `count` requirement is met.
 #### parse(text) 
 Returns tokenized array of words in `text`, less duplicates and stopwords. This method is called on all getX() calls internally.
@ -274,6 +289,10 @@ See [bench/README](bench).
 ## Changes
 1.1 - 
 - added seek() method
 - added lexName property
 1.0.1
 - Removed npm dependency on Natural.  Certain modules are included in /lib.
 - Add support for ES6 Promises.
--- a/bin/README.md
+++ b/bin/README.md
@ -19,6 +19,8 @@ $ wordpos
    syn        lookup synonyms
    exp        lookup examples
    seek       get record at synset offset. Must include one of POS -n, -a, -v, -r
    rand       get random words (starting with [word]). If first arg is a number, returns
               that many random words. Valid options are -b, -f, -j, -s, -i.
@ -222,6 +224,49 @@ $ wordpos rand --adj foot
 foot-shaped
 ```
 #### Seek a synset offset
 Seek offset as adjective:
 ```sh
 $ wordpos seek 1285602 -a
 { '1285602':
   { synsetOffset: 1285602,
       lexFilenum: 0,
       lexName: 'adj.all',
       pos: 's',
       wCnt: 5,
       lemma: 'amazing',
       synonyms: [ 'amazing', 'awe-inspiring', 'awesome', 'awful', 'awing' ],
       lexId: '0',
       ptrs:
        [ { pointerSymbol: '&',
            synsetOffset: 1285124,
            pos: 'a',
 ...            
 ```
 Same as verb (not found!):
 ```sh
 $ wordpos seek 1285602 -v
 { '1285602': {} }
 ```
 Multiple offsets from same POS:
 ```sh
 $ wordpos seek 1285602 1285124 -a
 { '1285124':
   { synsetOffset: 1285124,
       lexFilenum: 0,
       ...
   },
  '1285602':
    { synsetOffset: 1285602,
        lexFilenum: 0,
        ...
    }
 ```
 Note that results are always returned in `--full` format.  To get compact JSON format, add the `-j` option.
 #### Stopwords
 List stopwords (brief):
 ```bash
--- a/bin/wordpos-cli.js
+++ b/bin/wordpos-cli.js
@ -5,9 +5,9 @@
 * command-line interface to wordpos
 *
 * Usage:
- *    wordpos [options] <get|parse|def|rand|syn|exp> <stdin|words*>
+ *    wordpos [options] <get|parse|def|rand|syn|exp|seek> <stdin|words*>
 *
- * Copyright (c) 2012 mooster@42at.com
+ * Copyright (c) 2012, 2016 mooster@42at.com
 * https://github.com/moos/wordpos
 *
 * Released under MIT license
@ -17,6 +17,7 @@ var program = require('commander'),
  _ = require('underscore')._,
  fs = require('fs'),
  POS = {noun:'Noun', adj:'Adjective', verb:'Verb', adv:'Adverb'},
  POS_abbr = {noun:'n', adj:'a', verb:'v', adv:'r'},
  version = JSON.parse(fs.readFileSync(__dirname + '/../package.json', 'utf8')).version,
  rawCmd = '',
  RAND_PLACEHOLDER = '__',
@ -67,6 +68,19 @@ program.command('exp')
    exec.apply(this, arguments);
  });
 program.command('seek')
  .description('get record at synset offset. Must include one of POS -n, -a, -v, -r')
  .action(function(){
    // Group the POS option values and read how many of them are `true`;
    // the count is undefined when no POS flag was given.
    var posFlags = ['noun', 'adj', 'adv', 'verb'],
      tally = _.chain(program).pick(posFlags).countBy().value(),
      selected = tally.true;

    // seek needs exactly one POS to know which data file to read
    if (!selected || selected > 1) {
      console.error('Must include one and only one of -n, -a, -v, -r');
      process.exit(-1);
    }

    // seek results are always printed in full output mode
    program.full = 1;
    exec.apply(this, arguments);
  });
 program.command('rand')
  .description('get random words (starting with [word]). If first arg is a number, returns ' +
    'that many random words. Valid options are -b, -f, -j, -s, -i.')
@ -80,12 +94,10 @@ program.command('rand')
      args.shift();
      program.num = num;
    }
    // no startsWith given, add a placeholder
    if (args.length === 1){
      args.unshift(RAND_PLACEHOLDER);
    }
    exec.apply(this, args);
  });
@ -150,23 +162,24 @@ function read_stdin(callback) {
 }
 function optToFn() {
-  var fns = _.reject(POS, function(fn, opt) { return !program[opt] });
+  var
    map = cmd === 'seek' ? POS_abbr : POS,
    fns = _.reject(map, function(fn, opt) { return !program[opt] });
  if (!fns.length && cmd === 'rand') return fns = ['']; // run rand()
-  if (!fns.length) fns = _.values(POS); //default to all if no POS given
+  if (!fns.length) fns = _.values(map); //default to all if no POS given
  return fns;
 }
 function run(data) {
  var
    opts = {stopwords: !program.withStopwords},
    wordpos = new WordPos(opts),
-    words = wordpos.parse(data),
+    seek = cmd === 'seek',
    words = seek ? data.split(' ') : wordpos.parse(data),
    fns = optToFn(),
-    plural = (cmd=='get' ? 's':''),
+    plural = (cmd === 'get' ? 's':''),
    results = {},
-    finale = _.after(
+    finale = _.after(plural ? fns.length : words.length * fns.length,
        plural ? fns.length : words.length * fns.length,
        _.bind(output, null, results)),
    collect = function(what, result, word){
      if (word) {	// lookup
@ -184,13 +197,20 @@ function run(data) {
  _(fns).each(function(fn){
    var method = cmd + fn + plural,
      cb = _.bind(collect, null, fn);
-    if (cmd == 'get') {
+    if (cmd === 'get') {
      wordpos[method](words, cb);
-    } else if (cmd == 'rand') {
+    } else if (cmd === 'rand') {
      if (words[0] === RAND_PLACEHOLDER) words[0] = '';
      words.forEach(function(word){
        wordpos[method]({startsWith: word, count: program.num || 1}, cb);
      });
    } else if (seek) {
      words.forEach(function(offset){
        wordpos.seek(offset, fn, function(err, result){
          results[offset.trim()] = result;
          finale();
        });
      });
    } else {
      words.forEach(function(word){
        wordpos[method](word, cb);
@ -227,8 +247,9 @@ function sprint(results) {
    }, '');
  default:
    return _.reduce(results, function(memo, v, k){
-      var pre = program.brief ? '' : util.format('# %s %d:%s', k,  v.length, sep);
+      var pre = program.brief ? '' : util.format('# %s %d:%s', k,  v.length, sep),
-      return memo + (v.length && util.format('%s%s%s\n', pre, v.join(sep), sep) || '');
+        res = v.length ? v.join(sep) : '';
      return memo + (v.length && util.format('%s%s%s\n', pre, res, sep) || '');
    }, '');
  }
--- a/package.json
+++ b/package.json
@ -11,7 +11,7 @@
    "verbs"
  ],
  "description": "wordpos is a set of part-of-speech utilities for Node.js using the WordNet database.",
-  "version": "1.0.1",
+  "version": "1.1",
  "homepage": "https://github.com/moos/wordpos",
  "engines": {
    "node": ">=0.12"
--- a/src/dataFile.js
+++ b/src/dataFile.js
@ -13,6 +13,17 @@ var fs = require('fs'),
  path = require('path'),
  _ = require('underscore');
 /**
 * sanity check read data - line must start with zero-padded location
 *
 * Locations shorter than 8 digits are left-padded with zeros. Longer
 * locations are compared as-is (never truncated), so an out-of-range
 * location cannot accidentally match a line that belongs to another offset.
 *
 * @param line {string} - line data read
 * @param location {number|string} - offset the line was read from
 * @return {boolean} true if line data is good
 */
 function dataCheck(line, location) {
  var pad = '00000000', // 8 zeros
    digits = String(location),
    // pad only when needed; slicing a longer value would drop its leading
    // digits (e.g. 101285602 -> '01285602') and produce a false match
    padded = digits.length < pad.length ? (pad + digits).slice(-pad.length) : digits;
  return line.indexOf(padded) === 0;
 }
 /**
 * parse a single data file line, returning data object
@ -22,7 +33,9 @@ var fs = require('fs'),
 *
 * Credit for this routine to https://github.com/NaturalNode/natural
 */
-function lineDataToJSON(line) {
+function lineDataToJSON(line, location) {
  if (!dataCheck(line, location)) return new Error('Bad data at location ' + location);
  var data = line.split('| '),
    tokens = data[0].split(/\s+/),
    ptrs = [],
@ -48,6 +61,7 @@ function lineDataToJSON(line) {
  var glossArray = data[1].split("; ");
  var definition = glossArray[0];
  var examples = glossArray.slice(1);
  var lexFilenum = parseInt(tokens[1], 10);
  for (var k = 0; k < examples.length; k++) {
    examples[k] = examples[k].replace(/\"/g,'').replace(/\s\s+/g,'');
@ -55,7 +69,8 @@ function lineDataToJSON(line) {
  return {
    synsetOffset: parseInt(tokens[0], 10),
-    lexFilenum: parseInt(tokens[1], 10),
+    lexFilenum: lexFilenum,
    lexName: DataFile.LEX_NAMES[ lexFilenum ],
    pos: tokens[2],
    wCnt: wCnt,
    lemma: tokens[4],
@ -85,12 +100,12 @@ function readLocation(location, callback) {
  readChunk(location, function(err, count) {
    if (err) {
-      console.log(err);
+      //console.log(err);
      callback(err);
      return;
    }
    //console.log('  read %d bytes at <%d>', count, location);
-    callback(null, lineDataToJSON(str));
+    callback(null, lineDataToJSON(str, location));
  });
  function readChunk(pos, cb) {
@ -98,12 +113,13 @@ function readLocation(location, callback) {
      str += buffer.toString('ascii');
      var eol = str.indexOf('\n');
      //console.log('  -- read %d bytes at <%d>', count, pos, eol);
-      if (eol === -1 && len < file.maxLineLength) {
+      if (count && eol === -1 && len < file.maxLineLength) {
        // continue reading
        return readChunk(pos + count, cb);
      }
      str = str.substr(0, eol);
      if (str === '' && !err) err = new Error('no data at offset ' + pos);
      cb(err, count);
    });
  }
@ -112,15 +128,16 @@ function readLocation(location, callback) {
 /**
 * main lookup function
 *
- * @param record {object} - record to lookup, obtained from index.find()
+ * @param offsets {array} - array of offsets to lookup (obtained from index.find())
 * @param callback{function} (optional) - callback function
 * @returns {Promise}
 */
-function lookup(record, callback) {
+function lookup(offsets, callback) {
  var results = [],
    self = this,
-    offsets = record.synsetOffset;
+    single = !_.isArray(offsets);
  if (single) offsets = [offsets];
  return new Promise(function(resolve, reject) {
    offsets
      .map(function (offset) {
@ -134,9 +151,10 @@ function lookup(record, callback) {
    function done(lastResult) {
      closeFile();
      if (lastResult instanceof Error) {
-        callback && callback(lastResult, []);
+        callback && callback(lastResult, single ? {} :[]);
        reject(lastResult);
      } else {
        if (single) results = results[0];
        callback && callback(null, results);
        resolve(results);
      }
@ -233,5 +251,58 @@ DataFile.MAX_LINE_LENGTH = {
  adv: 638
 };
 /**
 * map of lexFilenum to lex names
 *
 * The array index is the lexFilenum: lineDataToJSON() resolves a record's
 * lexName as DataFile.LEX_NAMES[lexFilenum], so the order of the 45 entries
 * (indexes 0-44) must not change.
 *
 * @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html
 * @type {string[]}
 */
 DataFile.LEX_NAMES = [
  'adj.all',
  'adj.pert',
  'adv.all',
  'noun.Tops',
  'noun.act',
  'noun.animal',
  'noun.artifact',
  'noun.attribute',
  'noun.body',
  'noun.cognition',
  'noun.communication',
  'noun.event',
  'noun.feeling',
  'noun.food',
  'noun.group',
  'noun.location',
  'noun.motive',
  'noun.object',
  'noun.person',
  'noun.phenomenon',
  'noun.plant',
  'noun.possession',
  'noun.process',
  'noun.quantity',
  'noun.relation',
  'noun.shape',
  'noun.state',
  'noun.substance',
  'noun.time',
  'verb.body',
  'verb.change',
  'verb.cognition',
  'verb.communication',
  'verb.competition',
  'verb.consumption',
  'verb.contact',
  'verb.creation',
  'verb.emotion',
  'verb.motion',
  'verb.perception',
  'verb.possession',
  'verb.social',
  'verb.stative',
  'verb.weather',
  'adj.ppl'
 ];
 module.exports = DataFile;
--- a/src/wordpos.js
+++ b/src/wordpos.js
@ -63,7 +63,7 @@ function lookup(pos) {
      .then(function(result) {
        if (result) {
          // lookup data
-          return files.data.lookup(result).then(done);
+          return files.data.lookup(result.synsetOffset).then(done);
        } else {
          // not found in index
          return done([]);
@ -362,6 +362,31 @@ wordposProto.getVerbs = get('isVerb');
 wordposProto.parse = prepText;
 /**
 * seek - get record at offset for pos
 *
 * Validation failures (invalid offset or unknown POS) are reported both to
 * the callback, as (err, {}), and through the returned rejected promise.
 *
 * @param offset {number} - synset offset; must be a positive integer (numeric strings are converted)
 * @param pos {string} - POS a/r/n/v
 * @param callback {function} - optional callback, receives (err, result)
 * @returns Promise
 */
 wordposProto.seek = function(offset, pos, callback){
  offset = Number(offset);
  // reject NaN, non-positive, fractional and infinite values (Infinity % 1 is NaN)
  if (_.isNaN(offset) || offset <= 0 || offset % 1 !== 0) return error('offset must be valid positive number.');

  // guard the lookup so an unknown POS yields the error below rather than a TypeError
  var files = this.getFilesFor(pos),
    data = files && files.data;
  if (!data) return error('Incorrect POS - 2nd argument must be a, r, n or v.');

  return data.lookup(offset, callback);

  function error(msg) {
    var err = new Error(msg),
      rejected = Promise.reject(err);
    if (callback) {
      // Callback-style callers do not chain on the returned promise; attach a
      // no-op handler so the rejection is not reported as unhandled. The
      // returned promise itself still rejects for anyone who does chain on it.
      rejected.catch(function(){});
      callback(err, {});
    }
    return rejected;
  }
 };
 /**
 * access to WordNet DB
 * @type {object}
--- a/test/wordpos_test.js
+++ b/test/wordpos_test.js
@ -21,6 +21,7 @@
 var
  chai = require('chai'),
  _ = require('underscore'),
  assert = chai.assert,
  WordPOS = require('../src/wordpos'),
  wordpos = new WordPOS({profile: false});
@ -35,7 +36,9 @@ var str = "The angry bear chased the frightened little squirrel",
    adverbs: [ 'little' ],
    rest: [ 'The' ]
  },
-  garble = 'garblegarble';	// expect not to find word
+  garble = 'garblegarble',	// expect not to find word
  offset = 1285602,
  offset_pos ='a';
@ -356,6 +359,62 @@ describe('randX()...', function() {
 });
 // Callback-style tests for seek(): each test signals completion with done()
 // from inside the seek callback so that its assertions are actually evaluated.
 describe('seek()...', function() {
  it('should handle bad offset', function(done) {
    wordpos.seek('foobar', 'a', function(err, result){
      assert(err instanceof Error);
      assert.equal(err.message, 'offset must be valid positive number.');
      done();
    });
  });
  it('should handle wrong offset', function(done) {
    var bad_offset = offset + 1;
    wordpos.seek(bad_offset, offset_pos, function(err, result) {
      assert(err instanceof Error);
      assert.equal(err.message, 'Bad data at location ' + bad_offset);
      assert.deepEqual(result, {});
      done();
    });
  });
  it('should handle very large offset', function(done) {
    var bad_offset = offset + 100000000;
    wordpos.seek(bad_offset, offset_pos, function(err, result) {
      assert(err instanceof Error);
      assert.equal(err.message, 'no data at offset ' + bad_offset);
      assert.deepEqual(result, {});
      done();
    });
  });
  it('should handle bad pos', function(done) {
    wordpos.seek(offset, 'g', function(err, result) {
      assert(err instanceof Error);
      assert(/Incorrect POS/.test(err.message));
      done();
    });
  });
  it('should handle wrong pos', function(done) {
    wordpos.seek(offset, 'v', function(err, result){
      assert(err instanceof Error);
      assert.equal(err.message, 'Bad data at location ' + offset);
      // done() belongs inside the callback; calling it after seek() returns
      // would end the test before the assertions above have run
      done();
    });
  });
  it('should seek offset', function(done) {
    wordpos.seek(offset, offset_pos, function(err, result) {
      assert.equal(result.synsetOffset, offset);
      assert.equal(result.pos, 's');
      assert.equal(result.lemma, 'amazing');
      done();
    });
  });
 });
 describe('Promise pattern', function() {
@ -413,4 +472,28 @@ describe('Promise pattern', function() {
      assert.equal(result[0].indexOf('foo'), 0);
    });
  });
  // Promise form of seek(): resolves with the record stored at the offset.
  it('seek()', function () {
    var expected = [
      ['synsetOffset', offset],
      ['pos', 's'],
      ['lemma', 'amazing']
    ];
    return wordpos.seek(offset, offset_pos).then(function (record) {
      expected.forEach(function (pair) {
        assert.equal(record[pair[0]], pair[1]);
      });
    });
  });
  // An offset that does not start a record must reject the promise.
  it('seek() - wrong offset', function () {
    return wordpos.seek(offset + 1, offset_pos).then(function () {
      // with only a .catch() handler an unexpected success would pass silently
      throw new Error('expected seek() to reject');
    }, function (err) {
      assert(err instanceof Error);
      assert.equal(err.message, 'Bad data at location ' + (offset+1));
    });
  });
  // A non-numeric offset must reject the promise with the validation error.
  it('seek() - bad offset', function () {
    return wordpos.seek('foobar', offset_pos).then(function () {
      // with only a .catch() handler an unexpected success would pass silently
      throw new Error('expected seek() to reject');
    }, function (err) {
      assert(err instanceof Error);
      assert.equal(err.message, 'offset must be valid positive number.');
    });
  });
 });