Added seek() method and lexName property. Bump to 1.1.

This commit is contained in:
Moos 2016-04-09 13:46:36 -07:00
parent 57c3340130
commit 0abe5a9010
7 changed files with 296 additions and 32 deletions

View File

@ -29,7 +29,7 @@ wordpos.isAdjective('awesome', function(result){
// true 'awesome' // true 'awesome'
``` ```
Command-line: (see [CLI](bin)) Command-line: (see [CLI](bin) for full command list)
```bash ```bash
$ wordpos def git $ wordpos def git
git git
@ -71,7 +71,7 @@ WordPOS.defaults = {
stopwords: true stopwords: true
}; };
``` ```
To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive a last argument that is the execution time in msec of the call. To override, pass an options hash to the constructor. With the `profile` option, most callbacks receive a last argument that is the execution time in msec of the call.
```js ```js
wordpos = new WordPOS({profile: true}); wordpos = new WordPOS({profile: true});
@ -165,19 +165,33 @@ Example:
```js ```js
wordpos.lookupAdjective('awesome', console.log); wordpos.lookupAdjective('awesome', console.log);
// output: // output:
[ { synsetOffset: 1282510, [ { synsetOffset: 1285602,
lexFilenum: 0, lexFilenum: 0,
lexName: 'adj.all',
pos: 's', pos: 's',
wCnt: 5, wCnt: 5,
lemma: 'amazing', lemma: 'amazing',
synonyms: [ 'amazing', 'awe-inspiring', 'awesome', 'awful', 'awing' ], synonyms: [ 'amazing', 'awe-inspiring', 'awesome', 'awful', 'awing' ],
lexId: '0', lexId: '0',
ptrs: [], ptrs: [],
gloss: 'inspiring awe or admiration or wonder; <snip> awing majesty, so vast, so high, so silent" ' gloss: 'inspiring awe or admiration or wonder; [...] awing majesty, so vast, so high, so silent" '
def: 'inspiring awe or admiration or wonder',
...
} ], 'awesome' } ], 'awesome'
``` ```
In this case only one lookup was found, but there could be several. In this case only one lookup was found, but there could be several.
Version 1.1 adds the `lexName` property, which maps `lexFilenum` to one of the [45 lexicographer domains](https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html).
#### seek(offset, pos, callback)
Version 1.1 introduces the `seek()` method to look up a record directly from its synsetOffset for a given POS. Unlike other methods, the callback (if provided) receives `(err, result)` arguments.
Examples:
```js
wordpos.seek(1285602, 'a').then(console.log)
// resolves with the single record that wordpos.lookupAdjective('awesome', console.log) returns inside its result array
```
#### rand(options, callback) #### rand(options, callback)
#### randNoun(options, callback) #### randNoun(options, callback)
@ -214,6 +228,7 @@ wordpos.rand({starsWith: 'zzz'}, console.log)
Getting random POS (`randNoun()`, etc.) is generally faster than `rand()`, which may look at multiple POS files until `count` requirement is met. Getting random POS (`randNoun()`, etc.) is generally faster than `rand()`, which may look at multiple POS files until `count` requirement is met.
#### parse(text) #### parse(text)
Returns tokenized array of words in `text`, less duplicates and stopwords. This method is called on all getX() calls internally. Returns tokenized array of words in `text`, less duplicates and stopwords. This method is called on all getX() calls internally.
@ -274,6 +289,10 @@ See [bench/README](bench).
## Changes ## Changes
1.1 -
- added seek() method
- added lexName property
1.0.1 1.0.1
- Removed npm dependency on Natural. Certain modules are included in /lib. - Removed npm dependency on Natural. Certain modules are included in /lib.
- Add support for ES6 Promises. - Add support for ES6 Promises.

View File

@ -20,6 +20,8 @@ $ wordpos
exp lookup examples exp lookup examples
seek get record at synset offset. Must include one of POS -n, -a, -v, -r
rand get random words (starting with [word]). If first arg is a number, returns rand get random words (starting with [word]). If first arg is a number, returns
that many random words. Valid options are -b, -f, -j, -s, -i. that many random words. Valid options are -b, -f, -j, -s, -i.
@ -222,6 +224,49 @@ $ wordpos rand --adj foot
foot-shaped foot-shaped
``` ```
#### Seek a synset offset
Seek offset as adjective:
```sh
$ wordpos seek 1285602 -a
{ '1285602':
{ synsetOffset: 1285602,
lexFilenum: 0,
lexName: 'adj.all',
pos: 's',
wCnt: 5,
lemma: 'amazing',
synonyms: [ 'amazing', 'awe-inspiring', 'awesome', 'awful', 'awing' ],
lexId: '0',
ptrs:
[ { pointerSymbol: '&',
synsetOffset: 1285124,
pos: 'a',
...
```
Same offset sought as a verb (not found!):
```sh
$ wordpos seek 1285602 -v
{ '1285602': {} }
```
Multiple offsets from same POS:
```sh
$ wordpos seek 1285602 1285124 -a
{ '1285124':
{ synsetOffset: 1285124,
lexFilenum: 0,
...
},
'1285602':
{ synsetOffset: 1285602,
lexFilenum: 0,
...
}
```
Note that results are always returned in `--full` format. To get compact JSON format, add the `-j` option.
#### Stopwords #### Stopwords
List stopwords (brief): List stopwords (brief):
```bash ```bash

View File

@ -5,9 +5,9 @@
* command-line interface to wordpos * command-line interface to wordpos
* *
* Usage: * Usage:
* wordpos [options] <get|parse|def|rand|syn|exp> <stdin|words*> * wordpos [options] <get|parse|def|rand|syn|exp|seek> <stdin|words*>
* *
* Copyright (c) 2012 mooster@42at.com * Copyright (c) 2012, 2016 mooster@42at.com
* https://github.com/moos/wordpos * https://github.com/moos/wordpos
* *
* Released under MIT license * Released under MIT license
@ -17,6 +17,7 @@ var program = require('commander'),
_ = require('underscore')._, _ = require('underscore')._,
fs = require('fs'), fs = require('fs'),
POS = {noun:'Noun', adj:'Adjective', verb:'Verb', adv:'Adverb'}, POS = {noun:'Noun', adj:'Adjective', verb:'Verb', adv:'Adverb'},
POS_abbr = {noun:'n', adj:'a', verb:'v', adv:'r'},
version = JSON.parse(fs.readFileSync(__dirname + '/../package.json', 'utf8')).version, version = JSON.parse(fs.readFileSync(__dirname + '/../package.json', 'utf8')).version,
rawCmd = '', rawCmd = '',
RAND_PLACEHOLDER = '__', RAND_PLACEHOLDER = '__',
@ -67,6 +68,19 @@ program.command('exp')
exec.apply(this, arguments); exec.apply(this, arguments);
}); });
// `seek` sub-command: look up records by synset offset. Exactly one POS flag
// (-n, -a, -v, -r) must be given so the offset can be resolved in one data file.
program.command('seek')
  .description('get record at synset offset. Must include one of POS -n, -a, -v, -r')
  .action(function(){
    // group the POS flags present on `program` by value; the `true` bucket
    // holds the number of flags that were switched on
    var posFlags = _.pick(program, ['noun', 'adj', 'adv', 'verb']),
      tally = _.countBy(posFlags),
      selected = tally.true;

    if (!selected || selected > 1) {
      console.error('Must include one and only one of -n, -a, -v, -r');
      process.exit(-1);
    }

    // force full output mode
    program.full = 1;
    exec.apply(this, arguments);
  });
program.command('rand') program.command('rand')
.description('get random words (starting with [word]). If first arg is a number, returns ' + .description('get random words (starting with [word]). If first arg is a number, returns ' +
'that many random words. Valid options are -b, -f, -j, -s, -i.') 'that many random words. Valid options are -b, -f, -j, -s, -i.')
@ -80,12 +94,10 @@ program.command('rand')
args.shift(); args.shift();
program.num = num; program.num = num;
} }
// no startsWith given, add a placeholder // no startsWith given, add a placeholder
if (args.length === 1){ if (args.length === 1){
args.unshift(RAND_PLACEHOLDER); args.unshift(RAND_PLACEHOLDER);
} }
exec.apply(this, args); exec.apply(this, args);
}); });
@ -150,23 +162,24 @@ function read_stdin(callback) {
} }
function optToFn() { function optToFn() {
var fns = _.reject(POS, function(fn, opt) { return !program[opt] }); var
map = cmd === 'seek' ? POS_abbr : POS,
fns = _.reject(map, function(fn, opt) { return !program[opt] });
if (!fns.length && cmd === 'rand') return fns = ['']; // run rand() if (!fns.length && cmd === 'rand') return fns = ['']; // run rand()
if (!fns.length) fns = _.values(POS); //default to all if no POS given if (!fns.length) fns = _.values(map); //default to all if no POS given
return fns; return fns;
} }
function run(data) { function run(data) {
var var
opts = {stopwords: !program.withStopwords}, opts = {stopwords: !program.withStopwords},
wordpos = new WordPos(opts), wordpos = new WordPos(opts),
words = wordpos.parse(data), seek = cmd === 'seek',
words = seek ? data.split(' ') : wordpos.parse(data),
fns = optToFn(), fns = optToFn(),
plural = (cmd=='get' ? 's':''), plural = (cmd === 'get' ? 's':''),
results = {}, results = {},
finale = _.after( finale = _.after(plural ? fns.length : words.length * fns.length,
plural ? fns.length : words.length * fns.length,
_.bind(output, null, results)), _.bind(output, null, results)),
collect = function(what, result, word){ collect = function(what, result, word){
if (word) { // lookup if (word) { // lookup
@ -184,13 +197,20 @@ function run(data) {
_(fns).each(function(fn){ _(fns).each(function(fn){
var method = cmd + fn + plural, var method = cmd + fn + plural,
cb = _.bind(collect, null, fn); cb = _.bind(collect, null, fn);
if (cmd == 'get') { if (cmd === 'get') {
wordpos[method](words, cb); wordpos[method](words, cb);
} else if (cmd == 'rand') { } else if (cmd === 'rand') {
if (words[0] === RAND_PLACEHOLDER) words[0] = ''; if (words[0] === RAND_PLACEHOLDER) words[0] = '';
words.forEach(function(word){ words.forEach(function(word){
wordpos[method]({startsWith: word, count: program.num || 1}, cb); wordpos[method]({startsWith: word, count: program.num || 1}, cb);
}); });
} else if (seek) {
words.forEach(function(offset){
wordpos.seek(offset, fn, function(err, result){
results[offset.trim()] = result;
finale();
});
});
} else { } else {
words.forEach(function(word){ words.forEach(function(word){
wordpos[method](word, cb); wordpos[method](word, cb);
@ -227,8 +247,9 @@ function sprint(results) {
}, ''); }, '');
default: default:
return _.reduce(results, function(memo, v, k){ return _.reduce(results, function(memo, v, k){
var pre = program.brief ? '' : util.format('# %s %d:%s', k, v.length, sep); var pre = program.brief ? '' : util.format('# %s %d:%s', k, v.length, sep),
return memo + (v.length && util.format('%s%s%s\n', pre, v.join(sep), sep) || ''); res = v.length ? v.join(sep) : '';
return memo + (v.length && util.format('%s%s%s\n', pre, res, sep) || '');
}, ''); }, '');
} }

View File

@ -11,7 +11,7 @@
"verbs" "verbs"
], ],
"description": "wordpos is a set of part-of-speech utilities for Node.js using the WordNet database.", "description": "wordpos is a set of part-of-speech utilities for Node.js using the WordNet database.",
"version": "1.0.1", "version": "1.1",
"homepage": "https://github.com/moos/wordpos", "homepage": "https://github.com/moos/wordpos",
"engines": { "engines": {
"node": ">=0.12" "node": ">=0.12"

View File

@ -13,6 +13,17 @@ var fs = require('fs'),
path = require('path'), path = require('path'),
_ = require('underscore'); _ = require('underscore');
/**
 * sanity check read data - line must start with zero-padded location
 *
 * Locations shorter than 8 digits are left-padded with zeros. Longer locations
 * are compared in full (never truncated), so an out-of-range location cannot
 * falsely match a line on its last 8 digits alone.
 *
 * @param line {string} - line data read
 * @param location {number} - location (offset) the line was read from
 * @return {boolean} true if line data is good
 */
function dataCheck(line, location) {
  var pad = '00000000', // 8 zeros
    digits = String(location),
    padded = digits.length < pad.length
      ? (pad + digits).slice(-pad.length)
      : digits;

  return line.indexOf(padded) === 0;
}
/** /**
* parse a single data file line, returning data object * parse a single data file line, returning data object
@ -22,7 +33,9 @@ var fs = require('fs'),
* *
* Credit for this routine to https://github.com/NaturalNode/natural * Credit for this routine to https://github.com/NaturalNode/natural
*/ */
function lineDataToJSON(line) { function lineDataToJSON(line, location) {
if (!dataCheck(line, location)) return new Error('Bad data at location ' + location);
var data = line.split('| '), var data = line.split('| '),
tokens = data[0].split(/\s+/), tokens = data[0].split(/\s+/),
ptrs = [], ptrs = [],
@ -48,6 +61,7 @@ function lineDataToJSON(line) {
var glossArray = data[1].split("; "); var glossArray = data[1].split("; ");
var definition = glossArray[0]; var definition = glossArray[0];
var examples = glossArray.slice(1); var examples = glossArray.slice(1);
var lexFilenum = parseInt(tokens[1], 10);
for (var k = 0; k < examples.length; k++) { for (var k = 0; k < examples.length; k++) {
examples[k] = examples[k].replace(/\"/g,'').replace(/\s\s+/g,''); examples[k] = examples[k].replace(/\"/g,'').replace(/\s\s+/g,'');
@ -55,7 +69,8 @@ function lineDataToJSON(line) {
return { return {
synsetOffset: parseInt(tokens[0], 10), synsetOffset: parseInt(tokens[0], 10),
lexFilenum: parseInt(tokens[1], 10), lexFilenum: lexFilenum,
lexName: DataFile.LEX_NAMES[ lexFilenum ],
pos: tokens[2], pos: tokens[2],
wCnt: wCnt, wCnt: wCnt,
lemma: tokens[4], lemma: tokens[4],
@ -85,12 +100,12 @@ function readLocation(location, callback) {
readChunk(location, function(err, count) { readChunk(location, function(err, count) {
if (err) { if (err) {
console.log(err); //console.log(err);
callback(err); callback(err);
return; return;
} }
//console.log(' read %d bytes at <%d>', count, location); //console.log(' read %d bytes at <%d>', count, location);
callback(null, lineDataToJSON(str)); callback(null, lineDataToJSON(str, location));
}); });
function readChunk(pos, cb) { function readChunk(pos, cb) {
@ -98,12 +113,13 @@ function readLocation(location, callback) {
str += buffer.toString('ascii'); str += buffer.toString('ascii');
var eol = str.indexOf('\n'); var eol = str.indexOf('\n');
//console.log(' -- read %d bytes at <%d>', count, pos, eol); //console.log(' -- read %d bytes at <%d>', count, pos, eol);
if (eol === -1 && len < file.maxLineLength) { if (count && eol === -1 && len < file.maxLineLength) {
// continue reading // continue reading
return readChunk(pos + count, cb); return readChunk(pos + count, cb);
} }
str = str.substr(0, eol); str = str.substr(0, eol);
if (str === '' && !err) err = new Error('no data at offset ' + pos);
cb(err, count); cb(err, count);
}); });
} }
@ -112,15 +128,16 @@ function readLocation(location, callback) {
/** /**
* main lookup function * main lookup function
* *
* @param record {object} - record to lookup, obtained from index.find() * @param offsets {array} - array of offsets to lookup (obtained from index.find())
* @param callback{function} (optional) - callback function * @param callback{function} (optional) - callback function
* @returns {Promise} * @returns {Promise}
*/ */
function lookup(record, callback) { function lookup(offsets, callback) {
var results = [], var results = [],
self = this, self = this,
offsets = record.synsetOffset; single = !_.isArray(offsets);
if (single) offsets = [offsets];
return new Promise(function(resolve, reject) { return new Promise(function(resolve, reject) {
offsets offsets
.map(function (offset) { .map(function (offset) {
@ -134,9 +151,10 @@ function lookup(record, callback) {
function done(lastResult) { function done(lastResult) {
closeFile(); closeFile();
if (lastResult instanceof Error) { if (lastResult instanceof Error) {
callback && callback(lastResult, []); callback && callback(lastResult, single ? {} :[]);
reject(lastResult); reject(lastResult);
} else { } else {
if (single) results = results[0];
callback && callback(null, results); callback && callback(null, results);
resolve(results); resolve(results);
} }
@ -233,5 +251,58 @@ DataFile.MAX_LINE_LENGTH = {
adv: 638 adv: 638
}; };
/**
 * Lexicographer file names, indexed by lexFilenum (array position === lexFilenum).
 * Used when parsing a data line to derive the record's lexName.
 *
 * @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html
 * @type {string[]}
 */
DataFile.LEX_NAMES = [
  // 0 - 2
  'adj.all', 'adj.pert', 'adv.all',

  // 3 - 28: noun.*
  'noun.Tops', 'noun.act', 'noun.animal', 'noun.artifact', 'noun.attribute',
  'noun.body', 'noun.cognition', 'noun.communication', 'noun.event', 'noun.feeling',
  'noun.food', 'noun.group', 'noun.location', 'noun.motive', 'noun.object',
  'noun.person', 'noun.phenomenon', 'noun.plant', 'noun.possession', 'noun.process',
  'noun.quantity', 'noun.relation', 'noun.shape', 'noun.state', 'noun.substance',
  'noun.time',

  // 29 - 43: verb.*
  'verb.body', 'verb.change', 'verb.cognition', 'verb.communication', 'verb.competition',
  'verb.consumption', 'verb.contact', 'verb.creation', 'verb.emotion', 'verb.motion',
  'verb.perception', 'verb.possession', 'verb.social', 'verb.stative', 'verb.weather',

  // 44
  'adj.ppl'
];
module.exports = DataFile; module.exports = DataFile;

View File

@ -63,7 +63,7 @@ function lookup(pos) {
.then(function(result) { .then(function(result) {
if (result) { if (result) {
// lookup data // lookup data
return files.data.lookup(result).then(done); return files.data.lookup(result.synsetOffset).then(done);
} else { } else {
// not found in index // not found in index
return done([]); return done([]);
@ -362,6 +362,31 @@ wordposProto.getVerbs = get('isVerb');
wordposProto.parse = prepText; wordposProto.parse = prepText;
/**
 * seek - fetch the record stored at a synset offset in the data file of a POS
 *
 * On invalid input the callback (if any) is invoked with `(err, {})` and the
 * returned promise is rejected with the same error. Otherwise the work is
 * delegated to the POS data file's lookup(), which receives the callback.
 *
 * @param offset {number} - synset offset (anything Number() converts to a positive number)
 * @param pos {string} - POS a/r/n/v
 * @param callback {function} - optional callback
 * @returns Promise
 */
wordposProto.seek = function(offset, pos, callback){
  var position = Number(offset),
    dataFile;

  // report a validation failure through both the callback and the promise
  function fail(message) {
    var failure = new Error(message);
    if (callback) callback(failure, {});
    return Promise.reject(failure);
  }

  if (_.isNaN(position) || position <= 0) {
    return fail('offset must be valid positive number.');
  }

  dataFile = this.getFilesFor(pos).data;
  if (!dataFile) {
    return fail('Incorrect POS - 2nd argument must be a, r, n or v.');
  }

  return dataFile.lookup(position, callback);
};
/** /**
* access to WordNet DB * access to WordNet DB
* @type {object} * @type {object}

View File

@ -21,6 +21,7 @@
var var
chai = require('chai'), chai = require('chai'),
_ = require('underscore'),
assert = chai.assert, assert = chai.assert,
WordPOS = require('../src/wordpos'), WordPOS = require('../src/wordpos'),
wordpos = new WordPOS({profile: false}); wordpos = new WordPOS({profile: false});
@ -35,7 +36,9 @@ var str = "The angry bear chased the frightened little squirrel",
adverbs: [ 'little' ], adverbs: [ 'little' ],
rest: [ 'The' ] rest: [ 'The' ]
}, },
garble = 'garblegarble'; // expect not to find word garble = 'garblegarble', // expect not to find word
offset = 1285602,
offset_pos ='a';
@ -356,6 +359,62 @@ describe('randX()...', function() {
}); });
// Callback-style tests for seek(). `offset` / `offset_pos` are the known-good
// adjective record declared with the other fixtures at the top of this file.
describe('seek()...', function() {

  it('should handle bad offset', function(done) {
    wordpos.seek('foobar', 'a', function(err, result){
      assert(err instanceof Error);
      assert.equal(err.message, 'offset must be valid positive number.');
      done();
    });
  });

  it('should handle wrong offset', function(done) {
    var bad_offset = offset + 1;
    wordpos.seek(bad_offset, offset_pos, function(err, result) {
      assert(err instanceof Error);
      assert.equal(err.message, 'Bad data at location ' + bad_offset);
      assert.deepEqual(result, {});
      done();
    });
  });

  it('should handle very large offset', function(done) {
    var bad_offset = offset + 100000000;
    wordpos.seek(bad_offset, offset_pos, function(err, result) {
      assert(err instanceof Error);
      assert.equal(err.message, 'no data at offset ' + bad_offset);
      assert.deepEqual(result, {});
      done();
    });
  });

  it('should handle bad pos', function(done) {
    wordpos.seek(offset, 'g', function(err, result) {
      assert(err instanceof Error);
      assert(/Incorrect POS/.test(err.message));
      done();
    });
  });

  it('should handle wrong pos', function(done) {
    wordpos.seek(offset, 'v', function(err, result){
      assert.equal(err.message, 'Bad data at location ' + offset);
      // done() must be called from inside the callback; calling it right after
      // seek() returns lets the test finish before the assertion has run
      done();
    });
  });

  it('should seek offset', function(done) {
    wordpos.seek(offset, offset_pos, function(err, result) {
      assert.equal(result.synsetOffset, offset);
      assert.equal(result.pos, 's');
      assert.equal(result.lemma, 'amazing');
      done();
    });
  });

});
describe('Promise pattern', function() { describe('Promise pattern', function() {
@ -413,4 +472,28 @@ describe('Promise pattern', function() {
assert.equal(result[0].indexOf('foo'), 0); assert.equal(result[0].indexOf('foo'), 0);
}); });
}); });
// promise form of seek(): resolves with the single record stored at `offset`
it('seek()', function () {
  function verifyRecord(record) {
    assert.equal(record.synsetOffset, offset);
    assert.equal(record.pos, 's');
    assert.equal(record.lemma, 'amazing');
  }

  var pending = wordpos.seek(offset, offset_pos);
  return pending.then(verifyRecord);
});
// promise form of seek(): an offset that is not the start of a record must reject
it('seek() - wrong offset', function () {
  // two-argument then(): if the promise unexpectedly fulfils, the test fails
  // instead of passing without running any assertion (as a bare .catch() would)
  return wordpos.seek(offset + 1, offset_pos).then(function () {
    throw new Error('expected seek() to reject for a wrong offset');
  }, function (err) {
    assert(err instanceof Error);
    assert.equal(err.message, 'Bad data at location ' + (offset+1));
  });
});
// promise form of seek(): a non-numeric offset must reject
it('seek() - bad offset', function () {
  // two-argument then(): if the promise unexpectedly fulfils, the test fails
  // instead of passing without running any assertion (as a bare .catch() would)
  return wordpos.seek('foobar', offset_pos).then(function () {
    throw new Error('expected seek() to reject for a non-numeric offset');
  }, function (err) {
    assert(err instanceof Error);
    assert.equal(err.message, 'offset must be valid positive number.');
  });
});
}); });