Added seek() method and lexName property. Bump to 1.1.
This commit is contained in:
parent
57c3340130
commit
0abe5a9010
29
README.md
29
README.md
|
@ -29,7 +29,7 @@ wordpos.isAdjective('awesome', function(result){
|
|||
// true 'awesome'
|
||||
```
|
||||
|
||||
Command-line: (see [CLI](bin))
|
||||
Command-line: (see [CLI](bin) for full command list)
|
||||
```bash
|
||||
$ wordpos def git
|
||||
git
|
||||
|
@ -71,7 +71,7 @@ WordPOS.defaults = {
|
|||
stopwords: true
|
||||
};
|
||||
```
|
||||
To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive a last argument that is the execution time in msec of the call.
|
||||
To override, pass an options hash to the constructor. With the `profile` option, most callbacks receive a last argument that is the execution time in msec of the call.
|
||||
|
||||
```js
|
||||
wordpos = new WordPOS({profile: true});
|
||||
|
@ -165,19 +165,33 @@ Example:
|
|||
```js
|
||||
wordpos.lookupAdjective('awesome', console.log);
|
||||
// output:
|
||||
[ { synsetOffset: 1282510,
|
||||
[ { synsetOffset: 1285602,
|
||||
lexFilenum: 0,
|
||||
lexName: 'adj.all',
|
||||
pos: 's',
|
||||
wCnt: 5,
|
||||
lemma: 'amazing',
|
||||
synonyms: [ 'amazing', 'awe-inspiring', 'awesome', 'awful', 'awing' ],
|
||||
lexId: '0',
|
||||
ptrs: [],
|
||||
gloss: 'inspiring awe or admiration or wonder; <snip> awing majesty, so vast, so high, so silent" '
|
||||
gloss: 'inspiring awe or admiration or wonder; [...] awing majesty, so vast, so high, so silent" '
|
||||
def: 'inspiring awe or admiration or wonder',
|
||||
...
|
||||
} ], 'awesome'
|
||||
```
|
||||
In this case only one lookup was found, but there could be several.
|
||||
In this case only one lookup was found, but there could be several.
|
||||
|
||||
Version 1.1 adds the `lexName` property to each lookup result, which maps the `lexFilenum` to one of [45 lexicographer domains](https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html).
|
||||
|
||||
|
||||
#### seek(offset, pos, callback)
|
||||
Version 1.1 introduces the `seek` method to look up a record directly from the synsetOffset for a given POS. Unlike other methods, the callback (if provided) receives `(err, result)` arguments.
|
||||
|
||||
Examples:
|
||||
```js
|
||||
wordpos.seek(1285602, 'a').then(console.log)
|
||||
// same result as wordpos.lookupAdjective('awesome', console.log);
|
||||
```
|
||||
|
||||
#### rand(options, callback)
|
||||
#### randNoun(options, callback)
|
||||
|
@ -214,6 +228,7 @@ wordpos.rand({starsWith: 'zzz'}, console.log)
|
|||
|
||||
Getting random POS (`randNoun()`, etc.) is generally faster than `rand()`, which may look at multiple POS files until `count` requirement is met.
|
||||
|
||||
|
||||
#### parse(text)
|
||||
Returns tokenized array of words in `text`, less duplicates and stopwords. This method is called on all getX() calls internally.
|
||||
|
||||
|
@ -274,6 +289,10 @@ See [bench/README](bench).
|
|||
|
||||
## Changes
|
||||
|
||||
1.1 -
|
||||
- added seek() method
|
||||
- added lexName property
|
||||
|
||||
1.0.1
|
||||
- Removed npm dependency on Natural. Certain modules are included in /lib.
|
||||
- Add support for ES6 Promises.
|
||||
|
|
|
@ -19,6 +19,8 @@ $ wordpos
|
|||
syn lookup synonyms
|
||||
|
||||
exp lookup examples
|
||||
|
||||
seek get record at synset offset. Must include one of POS -n, -a, -v, -r
|
||||
|
||||
rand get random words (starting with [word]). If first arg is a number, returns
|
||||
that many random words. Valid options are -b, -f, -j, -s, -i.
|
||||
|
@ -222,6 +224,49 @@ $ wordpos rand --adj foot
|
|||
foot-shaped
|
||||
```
|
||||
|
||||
#### Seek a synset offset
|
||||
Seek offset as adjective:
|
||||
```sh
|
||||
$ wordpos seek 1285602 -a
|
||||
{ '1285602':
|
||||
{ synsetOffset: 1285602,
|
||||
lexFilenum: 0,
|
||||
lexName: 'adj.all',
|
||||
pos: 's',
|
||||
wCnt: 5,
|
||||
lemma: 'amazing',
|
||||
synonyms: [ 'amazing', 'awe-inspiring', 'awesome', 'awful', 'awing' ],
|
||||
lexId: '0',
|
||||
ptrs:
|
||||
[ { pointerSymbol: '&',
|
||||
synsetOffset: 1285124,
|
||||
pos: 'a',
|
||||
...
|
||||
```
|
||||
|
||||
Same as verb (not found!):
|
||||
```sh
|
||||
$ wordpos seek 1285602 -v
|
||||
{ '1285602': {} }
|
||||
```
|
||||
|
||||
Multiple offsets from same POS:
|
||||
```sh
|
||||
$ wordpos seek 1285602 1285124 -a
|
||||
{ '1285124':
|
||||
{ synsetOffset: 1285124,
|
||||
lexFilenum: 0,
|
||||
...
|
||||
},
|
||||
'1285602':
|
||||
{ synsetOffset: 1285602,
|
||||
lexFilenum: 0,
|
||||
...
|
||||
}
|
||||
```
|
||||
Note that results are always returned as `--full` format. To get compact JSON format, add the `-j` option.
|
||||
|
||||
|
||||
#### Stopwords
|
||||
List stopwords (brief):
|
||||
```bash
|
||||
|
|
|
@ -5,9 +5,9 @@
|
|||
* command-line interface to wordpos
|
||||
*
|
||||
* Usage:
|
||||
* wordpos [options] <get|parse|def|rand|syn|exp> <stdin|words*>
|
||||
* wordpos [options] <get|parse|def|rand|syn|exp|seek> <stdin|words*>
|
||||
*
|
||||
* Copyright (c) 2012 mooster@42at.com
|
||||
* Copyright (c) 2012, 2016 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
*
|
||||
* Released under MIT license
|
||||
|
@ -17,6 +17,7 @@ var program = require('commander'),
|
|||
_ = require('underscore')._,
|
||||
fs = require('fs'),
|
||||
POS = {noun:'Noun', adj:'Adjective', verb:'Verb', adv:'Adverb'},
|
||||
POS_abbr = {noun:'n', adj:'a', verb:'v', adv:'r'},
|
||||
version = JSON.parse(fs.readFileSync(__dirname + '/../package.json', 'utf8')).version,
|
||||
rawCmd = '',
|
||||
RAND_PLACEHOLDER = '__',
|
||||
|
@ -67,6 +68,19 @@ program.command('exp')
|
|||
exec.apply(this, arguments);
|
||||
});
|
||||
|
||||
program.command('seek')
  .description('get record at synset offset. Must include one of POS -n, -a, -v, -r')
  .action(function(){
    // count how many of the POS flags were switched on for this invocation
    var posFlags = 'noun adj adv verb'.split(' '),
      selected = _.countBy(_.pick(program, posFlags)).true;

    // seek needs exactly one POS to know which data file to read
    if (!selected || selected > 1) {
      console.error('Must include one and only one of -n, -a, -v, -r');
      process.exit(-1);
    }

    // seek results are always printed in full output mode
    program.full = 1;
    exec.apply(this, arguments);
  });
|
||||
|
||||
program.command('rand')
|
||||
.description('get random words (starting with [word]). If first arg is a number, returns ' +
|
||||
'that many random words. Valid options are -b, -f, -j, -s, -i.')
|
||||
|
@ -80,12 +94,10 @@ program.command('rand')
|
|||
args.shift();
|
||||
program.num = num;
|
||||
}
|
||||
|
||||
// no startsWith given, add a placeholder
|
||||
if (args.length === 1){
|
||||
args.unshift(RAND_PLACEHOLDER);
|
||||
}
|
||||
|
||||
exec.apply(this, args);
|
||||
});
|
||||
|
||||
|
@ -150,23 +162,24 @@ function read_stdin(callback) {
|
|||
}
|
||||
|
||||
function optToFn() {
|
||||
var fns = _.reject(POS, function(fn, opt) { return !program[opt] });
|
||||
var
|
||||
map = cmd === 'seek' ? POS_abbr : POS,
|
||||
fns = _.reject(map, function(fn, opt) { return !program[opt] });
|
||||
if (!fns.length && cmd === 'rand') return fns = ['']; // run rand()
|
||||
if (!fns.length) fns = _.values(POS); //default to all if no POS given
|
||||
if (!fns.length) fns = _.values(map); //default to all if no POS given
|
||||
return fns;
|
||||
}
|
||||
|
||||
|
||||
function run(data) {
|
||||
var
|
||||
opts = {stopwords: !program.withStopwords},
|
||||
wordpos = new WordPos(opts),
|
||||
words = wordpos.parse(data),
|
||||
seek = cmd === 'seek',
|
||||
words = seek ? data.split(' ') : wordpos.parse(data),
|
||||
fns = optToFn(),
|
||||
plural = (cmd=='get' ? 's':''),
|
||||
plural = (cmd === 'get' ? 's':''),
|
||||
results = {},
|
||||
finale = _.after(
|
||||
plural ? fns.length : words.length * fns.length,
|
||||
finale = _.after(plural ? fns.length : words.length * fns.length,
|
||||
_.bind(output, null, results)),
|
||||
collect = function(what, result, word){
|
||||
if (word) { // lookup
|
||||
|
@ -184,13 +197,20 @@ function run(data) {
|
|||
_(fns).each(function(fn){
|
||||
var method = cmd + fn + plural,
|
||||
cb = _.bind(collect, null, fn);
|
||||
if (cmd == 'get') {
|
||||
if (cmd === 'get') {
|
||||
wordpos[method](words, cb);
|
||||
} else if (cmd == 'rand') {
|
||||
} else if (cmd === 'rand') {
|
||||
if (words[0] === RAND_PLACEHOLDER) words[0] = '';
|
||||
words.forEach(function(word){
|
||||
wordpos[method]({startsWith: word, count: program.num || 1}, cb);
|
||||
});
|
||||
} else if (seek) {
|
||||
words.forEach(function(offset){
|
||||
wordpos.seek(offset, fn, function(err, result){
|
||||
results[offset.trim()] = result;
|
||||
finale();
|
||||
});
|
||||
});
|
||||
} else {
|
||||
words.forEach(function(word){
|
||||
wordpos[method](word, cb);
|
||||
|
@ -227,8 +247,9 @@ function sprint(results) {
|
|||
}, '');
|
||||
default:
|
||||
return _.reduce(results, function(memo, v, k){
|
||||
var pre = program.brief ? '' : util.format('# %s %d:%s', k, v.length, sep);
|
||||
return memo + (v.length && util.format('%s%s%s\n', pre, v.join(sep), sep) || '');
|
||||
var pre = program.brief ? '' : util.format('# %s %d:%s', k, v.length, sep),
|
||||
res = v.length ? v.join(sep) : '';
|
||||
return memo + (v.length && util.format('%s%s%s\n', pre, res, sep) || '');
|
||||
}, '');
|
||||
}
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
"verbs"
|
||||
],
|
||||
"description": "wordpos is a set of part-of-speech utilities for Node.js using the WordNet database.",
|
||||
"version": "1.0.1",
|
||||
"version": "1.1.0",
|
||||
"homepage": "https://github.com/moos/wordpos",
|
||||
"engines": {
|
||||
"node": ">=0.12"
|
||||
|
|
|
@ -13,6 +13,17 @@ var fs = require('fs'),
|
|||
path = require('path'),
|
||||
_ = require('underscore');
|
||||
|
||||
/**
 * sanity check read data - line must start with the location it was read
 * from, zero-padded to 8 digits
 *
 * @param line {string} - line data read
 * @param location {number|string} - offset the line was read from
 * @return {boolean} true if line data is good
 */
function dataCheck(line, location) {
  var pad = '00000000', // 8 zeros
    digits = String(location),
    // Only left-pad offsets shorter than 8 digits. A longer offset is compared
    // whole rather than being truncated to its low 8 digits, which could
    // otherwise match an unrelated line by accident.
    padded = digits.length < pad.length ? (pad + digits).slice(-pad.length) : digits;

  return line.indexOf(padded) === 0;
}
|
||||
|
||||
/**
|
||||
* parse a single data file line, returning data object
|
||||
|
@ -22,7 +33,9 @@ var fs = require('fs'),
|
|||
*
|
||||
* Credit for this routine to https://github.com/NaturalNode/natural
|
||||
*/
|
||||
function lineDataToJSON(line) {
|
||||
function lineDataToJSON(line, location) {
|
||||
if (!dataCheck(line, location)) return new Error('Bad data at location ' + location);
|
||||
|
||||
var data = line.split('| '),
|
||||
tokens = data[0].split(/\s+/),
|
||||
ptrs = [],
|
||||
|
@ -48,6 +61,7 @@ function lineDataToJSON(line) {
|
|||
var glossArray = data[1].split("; ");
|
||||
var definition = glossArray[0];
|
||||
var examples = glossArray.slice(1);
|
||||
var lexFilenum = parseInt(tokens[1], 10);
|
||||
|
||||
for (var k = 0; k < examples.length; k++) {
|
||||
examples[k] = examples[k].replace(/\"/g,'').replace(/\s\s+/g,'');
|
||||
|
@ -55,7 +69,8 @@ function lineDataToJSON(line) {
|
|||
|
||||
return {
|
||||
synsetOffset: parseInt(tokens[0], 10),
|
||||
lexFilenum: parseInt(tokens[1], 10),
|
||||
lexFilenum: lexFilenum,
|
||||
lexName: DataFile.LEX_NAMES[ lexFilenum ],
|
||||
pos: tokens[2],
|
||||
wCnt: wCnt,
|
||||
lemma: tokens[4],
|
||||
|
@ -85,12 +100,12 @@ function readLocation(location, callback) {
|
|||
|
||||
readChunk(location, function(err, count) {
|
||||
if (err) {
|
||||
console.log(err);
|
||||
//console.log(err);
|
||||
callback(err);
|
||||
return;
|
||||
}
|
||||
//console.log(' read %d bytes at <%d>', count, location);
|
||||
callback(null, lineDataToJSON(str));
|
||||
callback(null, lineDataToJSON(str, location));
|
||||
});
|
||||
|
||||
function readChunk(pos, cb) {
|
||||
|
@ -98,12 +113,13 @@ function readLocation(location, callback) {
|
|||
str += buffer.toString('ascii');
|
||||
var eol = str.indexOf('\n');
|
||||
//console.log(' -- read %d bytes at <%d>', count, pos, eol);
|
||||
if (eol === -1 && len < file.maxLineLength) {
|
||||
if (count && eol === -1 && len < file.maxLineLength) {
|
||||
// continue reading
|
||||
return readChunk(pos + count, cb);
|
||||
}
|
||||
|
||||
str = str.substr(0, eol);
|
||||
if (str === '' && !err) err = new Error('no data at offset ' + pos);
|
||||
cb(err, count);
|
||||
});
|
||||
}
|
||||
|
@ -112,15 +128,16 @@ function readLocation(location, callback) {
|
|||
/**
|
||||
* main lookup function
|
||||
*
|
||||
* @param record {object} - record to lookup, obtained from index.find()
|
||||
* @param offsets {array} - array of offsets to lookup (obtained from index.find())
|
||||
* @param callback{function} (optional) - callback function
|
||||
* @returns {Promise}
|
||||
*/
|
||||
function lookup(record, callback) {
|
||||
function lookup(offsets, callback) {
|
||||
var results = [],
|
||||
self = this,
|
||||
offsets = record.synsetOffset;
|
||||
single = !_.isArray(offsets);
|
||||
|
||||
if (single) offsets = [offsets];
|
||||
return new Promise(function(resolve, reject) {
|
||||
offsets
|
||||
.map(function (offset) {
|
||||
|
@ -134,9 +151,10 @@ function lookup(record, callback) {
|
|||
function done(lastResult) {
|
||||
closeFile();
|
||||
if (lastResult instanceof Error) {
|
||||
callback && callback(lastResult, []);
|
||||
callback && callback(lastResult, single ? {} :[]);
|
||||
reject(lastResult);
|
||||
} else {
|
||||
if (single) results = results[0];
|
||||
callback && callback(null, results);
|
||||
resolve(results);
|
||||
}
|
||||
|
@ -233,5 +251,58 @@ DataFile.MAX_LINE_LENGTH = {
|
|||
adv: 638
|
||||
};
|
||||
|
||||
/**
|
||||
* map of lexFilenum to lex names
|
||||
*
|
||||
* @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html
|
||||
* @type {string[]}
|
||||
*/
|
||||
DataFile.LEX_NAMES = [
|
||||
'adj.all',
|
||||
'adj.pert',
|
||||
'adv.all',
|
||||
'noun.Tops',
|
||||
'noun.act',
|
||||
'noun.animal',
|
||||
'noun.artifact',
|
||||
'noun.attribute',
|
||||
'noun.body',
|
||||
'noun.cognition',
|
||||
'noun.communication',
|
||||
'noun.event',
|
||||
'noun.feeling',
|
||||
'noun.food',
|
||||
'noun.group',
|
||||
'noun.location',
|
||||
'noun.motive',
|
||||
'noun.object',
|
||||
'noun.person',
|
||||
'noun.phenomenon',
|
||||
'noun.plant',
|
||||
'noun.possession',
|
||||
'noun.process',
|
||||
'noun.quantity',
|
||||
'noun.relation',
|
||||
'noun.shape',
|
||||
'noun.state',
|
||||
'noun.substance',
|
||||
'noun.time',
|
||||
'verb.body',
|
||||
'verb.change',
|
||||
'verb.cognition',
|
||||
'verb.communication',
|
||||
'verb.competition',
|
||||
'verb.consumption',
|
||||
'verb.contact',
|
||||
'verb.creation',
|
||||
'verb.emotion',
|
||||
'verb.motion',
|
||||
'verb.perception',
|
||||
'verb.possession',
|
||||
'verb.social',
|
||||
'verb.stative',
|
||||
'verb.weather',
|
||||
'adj.ppl'
|
||||
];
|
||||
|
||||
module.exports = DataFile;
|
||||
|
|
|
@ -63,7 +63,7 @@ function lookup(pos) {
|
|||
.then(function(result) {
|
||||
if (result) {
|
||||
// lookup data
|
||||
return files.data.lookup(result).then(done);
|
||||
return files.data.lookup(result.synsetOffset).then(done);
|
||||
} else {
|
||||
// not found in index
|
||||
return done([]);
|
||||
|
@ -362,6 +362,31 @@ wordposProto.getVerbs = get('isVerb');
|
|||
wordposProto.parse = prepText;
|
||||
|
||||
|
||||
/**
 * seek - fetch the data record stored at a synset offset for a given POS
 *
 * Errors are reported both ways: the callback (if given) is called with
 * (err, {}) and the returned promise is rejected with the same error.
 *
 * @param offset {number} - synset offset
 * @param pos {string} - POS a/r/n/v
 * @param callback {function} - optional callback, receives (err, result)
 * @returns Promise
 */
wordposProto.seek = function(offset, pos, callback){
  // report a failure through the callback (when present) and a rejected promise
  var fail = function(msg) {
    var err = new Error(msg);
    callback && callback(err, {});
    return Promise.reject(err);
  };

  var position = Number(offset);
  if (_.isNaN(position) || position <= 0) {
    return fail('offset must be valid positive number.');
  }

  var dataFile = this.getFilesFor(pos).data;
  if (!dataFile) {
    return fail('Incorrect POS - 2nd argument must be a, r, n or v.');
  }

  return dataFile.lookup(position, callback);
};
|
||||
|
||||
|
||||
/**
|
||||
* access to WordNet DB
|
||||
* @type {object}
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
|
||||
var
|
||||
chai = require('chai'),
|
||||
_ = require('underscore'),
|
||||
assert = chai.assert,
|
||||
WordPOS = require('../src/wordpos'),
|
||||
wordpos = new WordPOS({profile: false});
|
||||
|
@ -35,7 +36,9 @@ var str = "The angry bear chased the frightened little squirrel",
|
|||
adverbs: [ 'little' ],
|
||||
rest: [ 'The' ]
|
||||
},
|
||||
garble = 'garblegarble'; // expect not to find word
|
||||
garble = 'garblegarble', // expect not to find word
|
||||
offset = 1285602,
|
||||
offset_pos ='a';
|
||||
|
||||
|
||||
|
||||
|
@ -356,6 +359,62 @@ describe('randX()...', function() {
|
|||
});
|
||||
|
||||
|
||||
describe('seek()...', function() {
|
||||
|
||||
it('should handle bad offset', function(done) {
|
||||
wordpos.seek('foobar', 'a', function(err, result){
|
||||
assert(err instanceof Error);
|
||||
assert.equal(err.message, 'offset must be valid positive number.');
|
||||
done();
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle wrong offset', function(done) {
|
||||
var bad_offset = offset + 1;
|
||||
wordpos.seek(bad_offset, offset_pos, function(err, result) {
|
||||
assert(err instanceof Error);
|
||||
assert.equal(err.message, 'Bad data at location ' + bad_offset);
|
||||
assert.deepEqual(result, {});
|
||||
done();
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle very large offset', function(done) {
|
||||
var bad_offset = offset + 100000000;
|
||||
wordpos.seek(bad_offset, offset_pos, function(err, result) {
|
||||
assert(err instanceof Error);
|
||||
assert.equal(err.message, 'no data at offset ' + bad_offset);
|
||||
assert.deepEqual(result, {});
|
||||
done();
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle bad pos', function(done) {
|
||||
wordpos.seek(offset, 'g', function(err, result) {
|
||||
assert(err instanceof Error);
|
||||
assert(/Incorrect POS/.test(err.message));
|
||||
done();
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle wrong pos', function(done) {
  wordpos.seek(offset, 'v', function(err, result){
    assert.equal(err.message, 'Bad data at location ' + offset);
    // signal completion from inside the callback so the test only finishes
    // after the assertion above has actually run
    done();
  });
});
|
||||
|
||||
it('should seek offset', function(done) {
|
||||
wordpos.seek(offset, offset_pos, function(err, result) {
|
||||
assert.equal(result.synsetOffset, offset);
|
||||
assert.equal(result.pos, 's');
|
||||
assert.equal(result.lemma, 'amazing');
|
||||
done();
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
|
||||
describe('Promise pattern', function() {
|
||||
|
||||
|
@ -413,4 +472,28 @@ describe('Promise pattern', function() {
|
|||
assert.equal(result[0].indexOf('foo'), 0);
|
||||
});
|
||||
});
|
||||
|
||||
it('seek()', function () {
|
||||
return wordpos.seek(offset, offset_pos).then(function (result) {
|
||||
assert.equal(result.synsetOffset, offset);
|
||||
assert.equal(result.pos, 's');
|
||||
assert.equal(result.lemma, 'amazing');
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
it('seek() - wrong offset', function () {
  return wordpos.seek(offset + 1, offset_pos).then(function () {
    // a resolved promise means the bad offset went undetected - fail loudly
    // instead of letting the test pass without running any assertion
    throw new Error('expected seek() to reject');
  }, function (err) {
    assert(err instanceof Error);
    assert.equal(err.message, 'Bad data at location ' + (offset+1));
  });
});
|
||||
|
||||
it('seek() - bad offset', function () {
  return wordpos.seek('foobar', offset_pos).then(function () {
    // a resolved promise means the invalid offset was accepted - fail loudly
    // instead of letting the test pass without running any assertion
    throw new Error('expected seek() to reject');
  }, function (err) {
    assert(err instanceof Error);
    assert.equal(err.message, 'offset must be valid positive number.');
  });
});
|
||||
|
||||
});
|
Loading…
Reference in New Issue