Added seek() method and lexName property. Bump to 1.1.

This commit is contained in:
Moos 2016-04-09 13:46:36 -07:00
parent 57c3340130
commit 0abe5a9010
7 changed files with 296 additions and 32 deletions

View File

@ -29,7 +29,7 @@ wordpos.isAdjective('awesome', function(result){
// true 'awesome'
```
Command-line: (see [CLI](bin))
Command-line: (see [CLI](bin) for full command list)
```bash
$ wordpos def git
git
@ -71,7 +71,7 @@ WordPOS.defaults = {
stopwords: true
};
```
To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive a last argument that is the execution time in msec of the call.
To override, pass an options hash to the constructor. With the `profile` option, most callbacks receive a last argument that is the execution time in msec of the call.
```js
wordpos = new WordPOS({profile: true});
@ -165,19 +165,33 @@ Example:
```js
wordpos.lookupAdjective('awesome', console.log);
// output:
[ { synsetOffset: 1282510,
[ { synsetOffset: 1285602,
lexFilenum: 0,
lexName: 'adj.all',
pos: 's',
wCnt: 5,
lemma: 'amazing',
synonyms: [ 'amazing', 'awe-inspiring', 'awesome', 'awful', 'awing' ],
lexId: '0',
ptrs: [],
gloss: 'inspiring awe or admiration or wonder; <snip> awing majesty, so vast, so high, so silent" '
gloss: 'inspiring awe or admiration or wonder; [...] awing majesty, so vast, so high, so silent" '
def: 'inspiring awe or admiration or wonder',
...
} ], 'awesome'
```
In this case only one lookup was found, but there could be several.
Version 1.1 adds the `lexName` property, which maps the lexFilenum to one of [45 lexicographer domains](https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html).
#### seek(offset, pos, callback)
Version 1.1 introduces the seek method to look up a record directly from the synsetOffset for a given POS. Unlike other methods, the callback (if provided) receives `(err, result)` arguments.
Examples:
```js
wordpos.seek(1285602, 'a').then(console.log)
// same result as wordpos.lookupAdjective('awesome', console.log);
```
#### rand(options, callback)
#### randNoun(options, callback)
@ -214,6 +228,7 @@ wordpos.rand({starsWith: 'zzz'}, console.log)
Getting random POS (`randNoun()`, etc.) is generally faster than `rand()`, which may look at multiple POS files until `count` requirement is met.
#### parse(text)
Returns tokenized array of words in `text`, less duplicates and stopwords. This method is called on all getX() calls internally.
@ -274,6 +289,10 @@ See [bench/README](bench).
## Changes
1.1 -
- added seek() method
- added lexName property
1.0.1
- Removed npm dependency on Natural. Certain modules are included in /lib.
- Add support for ES6 Promises.

View File

@ -20,6 +20,8 @@ $ wordpos
exp lookup examples
seek get record at synset offset. Must include one of POS -n, -a, -v, -r
rand get random words (starting with [word]). If first arg is a number, returns
that many random words. Valid options are -b, -f, -j, -s, -i.
@ -222,6 +224,49 @@ $ wordpos rand --adj foot
foot-shaped
```
#### Seek a synset offset
Seek offset as adjective:
```sh
$ wordpos seek 1285602 -a
{ '1285602':
{ synsetOffset: 1285602,
lexFilenum: 0,
lexName: 'adj.all',
pos: 's',
wCnt: 5,
lemma: 'amazing',
synonyms: [ 'amazing', 'awe-inspiring', 'awesome', 'awful', 'awing' ],
lexId: '0',
ptrs:
[ { pointerSymbol: '&',
synsetOffset: 1285124,
pos: 'a',
...
```
Same as verb (not found!):
```sh
$ wordpos seek 1285602 -v
{ '1285602': {} }
```
Multiple offsets from same POS:
```sh
$ wordpos seek 1285602 1285124 -a
{ '1285124':
{ synsetOffset: 1285124,
lexFilenum: 0,
...
},
'1285602':
{ synsetOffset: 1285602,
lexFilenum: 0,
...
}
```
Note that results are always returned in `--full` format. To get compact JSON format, add the `-j` option.
#### Stopwords
List stopwords (brief):
```bash

View File

@ -5,9 +5,9 @@
* command-line interface to wordpos
*
* Usage:
* wordpos [options] <get|parse|def|rand|syn|exp> <stdin|words*>
* wordpos [options] <get|parse|def|rand|syn|exp|seek> <stdin|words*>
*
* Copyright (c) 2012 mooster@42at.com
* Copyright (c) 2012, 2016 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
@ -17,6 +17,7 @@ var program = require('commander'),
_ = require('underscore')._,
fs = require('fs'),
POS = {noun:'Noun', adj:'Adjective', verb:'Verb', adv:'Adverb'},
POS_abbr = {noun:'n', adj:'a', verb:'v', adv:'r'},
version = JSON.parse(fs.readFileSync(__dirname + '/../package.json', 'utf8')).version,
rawCmd = '',
RAND_PLACEHOLDER = '__',
@ -67,6 +68,19 @@ program.command('exp')
exec.apply(this, arguments);
});
// 'seek' sub-command: look up records by synset offset. Exactly one POS flag
// must accompany it, and output is always rendered in full mode.
program.command('seek')
  .description('get record at synset offset. Must include one of POS -n, -a, -v, -r')
  .action(function(){
    // count how many of the POS flags were switched on for this invocation
    var posFlags = ['noun', 'adj', 'adv', 'verb'],
      selected = _.filter(posFlags, function(flag){
        return String(program[flag]) === 'true';
      }).length;

    if (selected !== 1) {
      console.error('Must include one and only one of -n, -a, -v, -r');
      process.exit(-1);
    }

    // force full output mode
    program.full = 1;
    exec.apply(this, arguments);
  });
program.command('rand')
.description('get random words (starting with [word]). If first arg is a number, returns ' +
'that many random words. Valid options are -b, -f, -j, -s, -i.')
@ -80,12 +94,10 @@ program.command('rand')
args.shift();
program.num = num;
}
// no startsWith given, add a placeholder
if (args.length === 1){
args.unshift(RAND_PLACEHOLDER);
}
exec.apply(this, args);
});
@ -150,23 +162,24 @@ function read_stdin(callback) {
}
function optToFn() {
var fns = _.reject(POS, function(fn, opt) { return !program[opt] });
var
map = cmd === 'seek' ? POS_abbr : POS,
fns = _.reject(map, function(fn, opt) { return !program[opt] });
if (!fns.length && cmd === 'rand') return fns = ['']; // run rand()
if (!fns.length) fns = _.values(POS); //default to all if no POS given
if (!fns.length) fns = _.values(map); //default to all if no POS given
return fns;
}
function run(data) {
var
opts = {stopwords: !program.withStopwords},
wordpos = new WordPos(opts),
words = wordpos.parse(data),
seek = cmd === 'seek',
words = seek ? data.split(' ') : wordpos.parse(data),
fns = optToFn(),
plural = (cmd=='get' ? 's':''),
plural = (cmd === 'get' ? 's':''),
results = {},
finale = _.after(
plural ? fns.length : words.length * fns.length,
finale = _.after(plural ? fns.length : words.length * fns.length,
_.bind(output, null, results)),
collect = function(what, result, word){
if (word) { // lookup
@ -184,13 +197,20 @@ function run(data) {
_(fns).each(function(fn){
var method = cmd + fn + plural,
cb = _.bind(collect, null, fn);
if (cmd == 'get') {
if (cmd === 'get') {
wordpos[method](words, cb);
} else if (cmd == 'rand') {
} else if (cmd === 'rand') {
if (words[0] === RAND_PLACEHOLDER) words[0] = '';
words.forEach(function(word){
wordpos[method]({startsWith: word, count: program.num || 1}, cb);
});
} else if (seek) {
words.forEach(function(offset){
wordpos.seek(offset, fn, function(err, result){
results[offset.trim()] = result;
finale();
});
});
} else {
words.forEach(function(word){
wordpos[method](word, cb);
@ -227,8 +247,9 @@ function sprint(results) {
}, '');
default:
return _.reduce(results, function(memo, v, k){
var pre = program.brief ? '' : util.format('# %s %d:%s', k, v.length, sep);
return memo + (v.length && util.format('%s%s%s\n', pre, v.join(sep), sep) || '');
var pre = program.brief ? '' : util.format('# %s %d:%s', k, v.length, sep),
res = v.length ? v.join(sep) : '';
return memo + (v.length && util.format('%s%s%s\n', pre, res, sep) || '');
}, '');
}

View File

@ -11,7 +11,7 @@
"verbs"
],
"description": "wordpos is a set of part-of-speech utilities for Node.js using the WordNet database.",
"version": "1.0.1",
"version": "1.1",
"homepage": "https://github.com/moos/wordpos",
"engines": {
"node": ">=0.12"

View File

@ -13,6 +13,17 @@ var fs = require('fs'),
path = require('path'),
_ = require('underscore');
/**
 * sanity check read data - line must start with zero-padded location
 *
 * @param line {string} - line data read
 * @param location {number|string} - location the line was read from
 * @return {boolean} true if line data is good
 */
function dataCheck(line, location) {
  var pad = '00000000', // 8 zeros
    digits = String(location),
    // Only left-pad locations shorter than the pad width. Slicing a longer
    // location would drop its leading digits and could wrongly validate a
    // line that belongs to a different location.
    padded = digits.length < pad.length
      ? (pad + digits).slice(-pad.length)
      : digits;
  return line.indexOf(padded) === 0;
}
/**
* parse a single data file line, returning data object
@ -22,7 +33,9 @@ var fs = require('fs'),
*
* Credit for this routine to https://github.com/NaturalNode/natural
*/
function lineDataToJSON(line) {
function lineDataToJSON(line, location) {
if (!dataCheck(line, location)) return new Error('Bad data at location ' + location);
var data = line.split('| '),
tokens = data[0].split(/\s+/),
ptrs = [],
@ -48,6 +61,7 @@ function lineDataToJSON(line) {
var glossArray = data[1].split("; ");
var definition = glossArray[0];
var examples = glossArray.slice(1);
var lexFilenum = parseInt(tokens[1], 10);
for (var k = 0; k < examples.length; k++) {
examples[k] = examples[k].replace(/\"/g,'').replace(/\s\s+/g,'');
@ -55,7 +69,8 @@ function lineDataToJSON(line) {
return {
synsetOffset: parseInt(tokens[0], 10),
lexFilenum: parseInt(tokens[1], 10),
lexFilenum: lexFilenum,
lexName: DataFile.LEX_NAMES[ lexFilenum ],
pos: tokens[2],
wCnt: wCnt,
lemma: tokens[4],
@ -85,12 +100,12 @@ function readLocation(location, callback) {
readChunk(location, function(err, count) {
if (err) {
console.log(err);
//console.log(err);
callback(err);
return;
}
//console.log(' read %d bytes at <%d>', count, location);
callback(null, lineDataToJSON(str));
callback(null, lineDataToJSON(str, location));
});
function readChunk(pos, cb) {
@ -98,12 +113,13 @@ function readLocation(location, callback) {
str += buffer.toString('ascii');
var eol = str.indexOf('\n');
//console.log(' -- read %d bytes at <%d>', count, pos, eol);
if (eol === -1 && len < file.maxLineLength) {
if (count && eol === -1 && len < file.maxLineLength) {
// continue reading
return readChunk(pos + count, cb);
}
str = str.substr(0, eol);
if (str === '' && !err) err = new Error('no data at offset ' + pos);
cb(err, count);
});
}
@ -112,15 +128,16 @@ function readLocation(location, callback) {
/**
* main lookup function
*
* @param record {object} - record to lookup, obtained from index.find()
* @param offsets {array} - array of offsets to lookup (obtained from index.find())
* @param callback{function} (optional) - callback function
* @returns {Promise}
*/
function lookup(record, callback) {
function lookup(offsets, callback) {
var results = [],
self = this,
offsets = record.synsetOffset;
single = !_.isArray(offsets);
if (single) offsets = [offsets];
return new Promise(function(resolve, reject) {
offsets
.map(function (offset) {
@ -134,9 +151,10 @@ function lookup(record, callback) {
function done(lastResult) {
closeFile();
if (lastResult instanceof Error) {
callback && callback(lastResult, []);
callback && callback(lastResult, single ? {} :[]);
reject(lastResult);
} else {
if (single) results = results[0];
callback && callback(null, results);
resolve(results);
}
@ -233,5 +251,58 @@ DataFile.MAX_LINE_LENGTH = {
adv: 638
};
/**
 * map of lexFilenum to lex names
 *
 * The array is positional: the entry at index i is the name for lexFilenum i,
 * so the order of the entries must not be changed.
 *
 * @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html
 * @type {string[]}
 */
DataFile.LEX_NAMES = [
// 0-1: adjective files
'adj.all',
'adj.pert',
// 2: adverb file
'adv.all',
// 3-28: noun files
'noun.Tops',
'noun.act',
'noun.animal',
'noun.artifact',
'noun.attribute',
'noun.body',
'noun.cognition',
'noun.communication',
'noun.event',
'noun.feeling',
'noun.food',
'noun.group',
'noun.location',
'noun.motive',
'noun.object',
'noun.person',
'noun.phenomenon',
'noun.plant',
'noun.possession',
'noun.process',
'noun.quantity',
'noun.relation',
'noun.shape',
'noun.state',
'noun.substance',
'noun.time',
// 29-43: verb files
'verb.body',
'verb.change',
'verb.cognition',
'verb.communication',
'verb.competition',
'verb.consumption',
'verb.contact',
'verb.creation',
'verb.emotion',
'verb.motion',
'verb.perception',
'verb.possession',
'verb.social',
'verb.stative',
'verb.weather',
// 44: remaining adjective file
'adj.ppl'
];
module.exports = DataFile;

View File

@ -63,7 +63,7 @@ function lookup(pos) {
.then(function(result) {
if (result) {
// lookup data
return files.data.lookup(result).then(done);
return files.data.lookup(result.synsetOffset).then(done);
} else {
// not found in index
return done([]);
@ -362,6 +362,31 @@ wordposProto.getVerbs = get('isVerb');
wordposProto.parse = prepText;
/**
 * seek - fetch the record stored at a synset offset for the given POS
 *
 * Validation failures are reported both ways: the callback (when supplied) is
 * invoked with (err, {}) and the returned promise is rejected with the same error.
 *
 * @param offset {number} - synset offset
 * @param pos {string} - POS a/r/n/v
 * @param callback {function} - optional callback, receives (err, result)
 * @returns Promise
 */
wordposProto.seek = function(offset, pos, callback){
  // report a validation failure through both the callback and the promise
  var fail = function(message) {
    var failure = new Error(message);
    if (callback) callback(failure, {});
    return Promise.reject(failure);
  };

  var location = Number(offset);
  if (_.isNaN(location) || location <= 0) {
    return fail('offset must be valid positive number.');
  }

  var dataFile = this.getFilesFor(pos).data;
  if (!dataFile) {
    return fail('Incorrect POS - 2nd argument must be a, r, n or v.');
  }

  return dataFile.lookup(location, callback);
};
/**
* access to WordNet DB
* @type {object}

View File

@ -21,6 +21,7 @@
var
chai = require('chai'),
_ = require('underscore'),
assert = chai.assert,
WordPOS = require('../src/wordpos'),
wordpos = new WordPOS({profile: false});
@ -35,7 +36,9 @@ var str = "The angry bear chased the frightened little squirrel",
adverbs: [ 'little' ],
rest: [ 'The' ]
},
garble = 'garblegarble'; // expect not to find word
garble = 'garblegarble', // expect not to find word
offset = 1285602,
offset_pos ='a';
@ -356,6 +359,62 @@ describe('randX()...', function() {
});
describe('seek()...', function() {
it('should handle bad offset', function(done) {
wordpos.seek('foobar', 'a', function(err, result){
assert(err instanceof Error);
assert.equal(err.message, 'offset must be valid positive number.');
done();
});
});
it('should handle wrong offset', function(done) {
var bad_offset = offset + 1;
wordpos.seek(bad_offset, offset_pos, function(err, result) {
assert(err instanceof Error);
assert.equal(err.message, 'Bad data at location ' + bad_offset);
assert.deepEqual(result, {});
done();
});
});
it('should handle very large offset', function(done) {
var bad_offset = offset + 100000000;
wordpos.seek(bad_offset, offset_pos, function(err, result) {
assert(err instanceof Error);
assert.equal(err.message, 'no data at offset ' + bad_offset);
assert.deepEqual(result, {});
done();
});
});
it('should handle bad pos', function(done) {
wordpos.seek(offset, 'g', function(err, result) {
assert(err instanceof Error);
assert(/Incorrect POS/.test(err.message));
done();
});
});
// Seeking a valid adjective offset in the verb data must report bad data.
it('should handle wrong pos', function(done) {
  wordpos.seek(offset, 'v', function(err, result){
    assert(err instanceof Error);
    assert.equal(err.message, 'Bad data at location ' + offset);
    // signal completion only after the assertions have run; calling done()
    // outside the callback let the test finish before anything was checked
    done();
  });
});
it('should seek offset', function(done) {
wordpos.seek(offset, offset_pos, function(err, result) {
assert.equal(result.synsetOffset, offset);
assert.equal(result.pos, 's');
assert.equal(result.lemma, 'amazing');
done();
});
});
});
describe('Promise pattern', function() {
@ -413,4 +472,28 @@ describe('Promise pattern', function() {
assert.equal(result[0].indexOf('foo'), 0);
});
});
it('seek()', function () {
return wordpos.seek(offset, offset_pos).then(function (result) {
assert.equal(result.synsetOffset, offset);
assert.equal(result.pos, 's');
assert.equal(result.lemma, 'amazing');
});
});
// The promise returned for a misaligned offset must reject with a bad-data error.
it('seek() - wrong offset', function () {
  return wordpos.seek(offset + 1, offset_pos).then(function () {
    // with a bare .catch() an unexpected resolve would pass silently
    throw new Error('expected seek() to reject');
  }, function (err) {
    assert(err instanceof Error);
    assert.equal(err.message, 'Bad data at location ' + (offset+1));
  });
});
// The promise returned for a non-numeric offset must reject with a validation error.
it('seek() - bad offset', function () {
  return wordpos.seek('foobar', offset_pos).then(function () {
    // with a bare .catch() an unexpected resolve would pass silently
    throw new Error('expected seek() to reject');
  }, function (err) {
    assert(err instanceof Error);
    assert.equal(err.message, 'offset must be valid positive number.');
  });
});
});