added ranX() methods

This commit is contained in:
Moos 2014-05-03 14:41:39 -07:00
parent 45a9a836d4
commit 889e336097
6 changed files with 406 additions and 171 deletions

View File

@ -3,6 +3,7 @@ wordpos
wordpos is a set of part-of-speech (POS) utilities for Node.js using [natural's](http://github.com/NaturalNode/natural) WordNet module.
*Update*: New version 0.1.10 - get random word(s).
## Usage
@ -171,6 +172,55 @@ wordpos.lookup('great', console.log);
// ...
```
### randX()
Get random words.
```
wordpos.rand([options,] callback)
wordpos.randNoun([options,] callback)
wordpos.randVerb([options,] callback)
wordpos.randAdjective([options,] callback)
wordpos.randAdverb([options,] callback)
```
Callback receives an array of random words and the `startsWith` option.
Options, if given, are:
```
{
startsWith : <string> -- get random words starting with string
count : <number> -- number of words to return (default = 1)
}
```
Examples:
```js
wordpos.rand(console.log)
// ['wulfila'] ''
wordpos.randNoun(console.log)
// ['bamboo_palm'] ''
// with options:
wordpos.rand({startsWith: 'foo'}, console.log)
// ['foot'] 'foo'
wordpos.rand({startsWith: 'foo', count: 3}, console.log)
// ['footsure', 'foolish', 'footsore'] 'foo'
wordpos.randVerb({startsWith: 'bar', count: 3}, console.log)
// ['barge', 'barf', 'barter_away'] 'bar'
wordpos.rand({startsWith: 'zzz'}, console.log)
// [] 'zzz'
```
Note on performance: random lookups could involve heavy disk reads. It is better to use the 'count' option to get words
in batches. This may benefit from the cached reads of similarly keyed entries as well as shared open/close of the file.
Getting random POS (randX) is generally faster than rand(), which may look at multiple POS files until 'count' requirement
is met.
### Other methods/properties
```
@ -287,6 +337,32 @@ lexId":"0","ptrs":[],"gloss":"a person who is deemed to be despicable or contemp
would do that\"; \"kill the rat\"; \"throw the bum out\"; \"you cowardly little pukes!\"; \"the British
call a contemptible person a `git'\" "}]}
```
Get random words:
```bash
$ wordpos rand
# 1:
hopelessly
$ wordpos rand -N 2 foot
# foot 2:
footprint
footlights
$ wordpos rand -N 2 foot hand
# foot 2:
footlocker
footmark
# hand 2:
hand-hewn
handstitched
$ wordpos rand --adj foot
# foot 1:
foot-shaped
```
Usage:
```bash
$ wordpos
@ -304,6 +380,9 @@ $ wordpos
parse
show parsed words, deduped and less stopwords
rand
get random words (optionally starting with 'word')
Options:
-h, --help output usage information
@ -312,12 +391,13 @@ $ wordpos
-a, --adj Get adjectives
-v, --verb Get verbs
-r, --adv Get adverbs
-c, --count count only (noun, adj, verb, adv, total parsed words)
-c, --count get counts only (noun, adj, verb, adv, total parsed words)
-b, --brief brief output (all on one line, no headers)
-f, --full full results object
-j, --json full results object as JSON
-i, --file <file> input file
-s, --stopwords include stopwords
-N, --num <num> number of random words to get
```
## Benchmark

View File

@ -5,7 +5,7 @@
* command-line interface to wordpos
*
* Usage:
* wordpos [options] <get|parse|def> <stdin|words*>
* wordpos [options] <get|parse|def|rand> <stdin|words*>
*
* Copyright (c) 2012 mooster@42at.com
* https://github.com/moos/wordpos
@ -35,6 +35,7 @@ program
.option('-j, --json', 'full results object as JSON')
.option('-i, --file <file>', 'input file')
.option('-s, --stopwords', 'include stopwords')
.option('-N, --num <num>', 'number of random words to return')
;
program.command('get')
@ -53,7 +54,7 @@ program.command('parse')
.action(exec);
program.command('rand')
.description('get random words')
.description('get random words (starting with word, optionally)')
.action(exec);
var
@ -100,6 +101,7 @@ function read_stdin(callback) {
/**
 * Pick the POS entries to run, based on the POS flags set on `program`.
 *
 * @returns {Array} - values of POS whose option flag is set on `program`;
 *   when none is set: [''] for the 'rand' command (i.e. run plain rand()),
 *   otherwise every value of POS.
 */
function optToFn() {
  var selected = _.filter(POS, function(fn, opt) { return !!program[opt]; });

  if (selected.length) {
    return selected;
  }
  // no POS flag given: rand runs the generic rand(), everything else defaults to all POS
  return cmd === 'rand' ? [''] : _.values(POS);
}
@ -117,9 +119,6 @@ function run(data) {
plural ? fns.length : words.length * fns.length,
_.bind(output, null, results)),
collect = function(what, result, word){
console.log('collect ----', arguments);
if (word) { // lookup
results[word] = [].concat(results[word] || [], result);
} else { // get
@ -138,11 +137,12 @@ function run(data) {
if (cmd == 'get') {
wordpos[method](words, cb);
} else if (cmd == 'rand') {
words.forEach(function(word){
wordpos[method]({startsWith: word, count: program.num || 1}, cb);
});
} else {
words.forEach(function(word){
console.log(' calling rand', method, word);
wordpos[method](word, cb);
});
}

View File

@ -1,16 +1,16 @@
{
"name": "wordpos",
"author": "Moos <mooster@42at.com>",
"keywords": ["natural", "language", "wordnet", "pos"],
"keywords": ["natural", "language", "wordnet", "adjectives", "nouns", "adverbs", "verbs"],
"description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.",
"version": "0.1.9",
"version": "0.1.10",
"homepage": "https://github.com/moos/wordpos",
"engines": {
"node": ">=0.6"
},
"bin": "./bin/wordpos-cli.js",
"dependencies": {
"natural": "latest",
"natural": "NaturalNode/natural",
"underscore": ">=1.3.1",
"WNdb": "latest",
"commander": "1.1.1"

View File

@ -26,6 +26,9 @@ var str = "The angry bear chased the frightened little squirrel",
garble = 'garblegarble'; // expect not to find word
// do-nothing function: takes no arguments and returns undefined
function noop(){}
describe('getX()...', function() {
beforeEach(function() {
@ -240,5 +243,67 @@ describe('nested callbacks on same index key', function() {
});
});
function noop(){}
/**
 * Specs for the generic wordpos.rand().
 * In every spec the callback receives (array of words, startsWith).
 */
describe('rand()...', function() {
  it('should get random word', function(done) {
    // call rand() itself with no options; an array is always truthy,
    // so assert on its length (default count is 1)
    wordpos.rand(function(result) {
      expect(result.length).toEqual(1);
      done();
    });
  });

  it('should get N random words', function(done) {
    wordpos.rand({count: 3}, function(result) {
      expect(result.length).toEqual(3);
      done();
    });
  });

  it('should get random word starting with', function(done) {
    wordpos.rand({startsWith: 'foo'}, function(result, startsWith) {
      expect(result[0].indexOf('foo')).toEqual(0);
      // the startsWith option is echoed back as the second callback argument
      expect(startsWith).toEqual('foo');
      done();
    });
  });

  it('should get nothing starting with not found', function(done) {
    // no word starts with 'zzzz': expect an empty array
    wordpos.rand({startsWith: 'zzzz'}, function(result) {
      expect(result.length).toEqual(0);
      done();
    });
  });
});
/**
 * Specs for the per-POS random getters (randNoun, randVerb, randAdjective, randAdverb).
 */
describe('randX()...', function() {
  // [spec title, wordpos method] - each getter should yield exactly one word by default
  var singleWordSpecs = [
    ['should get random noun', 'randNoun'],
    ['should get random verb', 'randVerb'],
    ['should get random adjective', 'randAdjective'],
    ['should get random adverb', 'randAdverb']
  ];

  singleWordSpecs.forEach(function(spec) {
    var title = spec[0],
      method = spec[1];

    it(title, function(done) {
      wordpos[method](function(words) {
        expect(words.length).toEqual(1);
        done();
      });
    });
  });

  // not found
  it('should NOT get random noun starting with', function(done) {
    var options = {startsWith: 'zzzz'};

    wordpos.randNoun(options, function(words) {
      expect(words.length).toEqual(0);
      done();
    });
  });
});

View File

@ -1,9 +1,9 @@
/**
/*!
* fastIndex.js
*
* override natural.WordNet's IndexFile to use fast index data
*
* Copyright (c) 2012 mooster@42at.com
* Copyright (c) 2012-2014 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
@ -17,9 +17,10 @@ var _ = require('underscore')._,
/**
* load fast index bucket data
* @param dir - dir path of index files
* @param name - name of index file, eg, 'index.verb'
* @returns Object - fast index data object
*
* @param dir {string} - dir path of index files
* @param name {string} - name of index file, eg, 'index.verb'
* @returns {Object} - fast index data object
*/
function loadFastIndex(dir, name) {
var jsonFile = path.join(dir, 'fast-' + name + '.json'),
@ -36,8 +37,9 @@ function loadFastIndex(dir, name) {
/**
* read index file using fast index data at key
*
* @param key - 3-char key into fast index
* @param index - index file name (eg, 'index.verb')
* @param index - index object
* @param callback - function receives buffer of data read
* @returns none
*/
@ -57,16 +59,25 @@ function readIndexForKey(key, index, callback) {
}
/**
* read index file using fast index data at keyStart to keyEnd (inclusive)
*
* @param keyStart {string} - 3-char key into fast index to begin at
* @param keyEnd {string|null} - 3-char key into fast index to end at. If null, reads to next key.
* @param index - index object
* @param callback - function receives buffer of data read
* @returns none
*/
function readIndexBetweenKeys(keyStart, keyEnd, index, callback) {
var data = index.fastIndex,
offset = data.offsets[keyStart][0],
nextKey = keyEnd || data.offsets[keyStart][1],
end = keyEnd || keyStart,
nextKey = data.offsets[end][1],
nextOffset = data.offsets[nextKey][0],
len = nextOffset - offset - 1,
buffer = new Buffer(len);
console.log('### readIndexBetweenKeys', keyStart, keyEnd, nextKey, len, index.fd, offset)
//console.log('### readIndexBetweenKeys', keyStart, keyEnd, nextKey, len)
fs.read(index.fd, buffer, 0, len, offset, function(err, count){
if (err) return console.log(err);
// console.log(' read %d bytes for <%s>', count, keyStart);
@ -75,22 +86,23 @@ function readIndexBetweenKeys(keyStart, keyEnd, index, callback) {
}
/**
* run single 'task' method sharing callbacks. method MUST take callback as LAST arg.
* run single 'task' method sharing callbacks. Method MUST take callback as LAST arg.
* piper is bound to an index.
*
* @param task {string} - task name unique to method!
* @param method {function} - method to execute, gets (key, ... , callback)
* @param args {arrray} - args to pass to method
* @param context {object} - other params to remember
* @param method {function} - method to execute, gets (args, ... , callback)
* @param args {array} - args to pass to method
* @param context {object} - other params to remember and sent to callback
* @param callback {function} - result callback
*/
function piper(task, method, args, context, callback){
var readCallbacks = this.cache,
var readCallbacks = this.callbackQueue,
memoArgs = _.rest(arguments, 2),
wrappedCallback; //_.partial(piper.wrapper, this, task, context, callback);
wrappedCallback;
console.log('piper', task, args[0], args[1], context[0]);
// console.log('piper', task, args[0], context[0]);
// queue up if already reading file for this task:key
// queue up if already reading file for this task
if (task in readCallbacks){
readCallbacks[task].push(memoArgs);
return;
@ -98,7 +110,7 @@ function piper(task, method, args, context, callback){
readCallbacks[task] = [memoArgs];
if (!this.fd) {
console.log(' ... opening', this.filePath);
//console.log(' ... opening', this.filePath);
this.fd = fs.openSync(this.filePath, 'r');
}
@ -112,24 +124,24 @@ function piper(task, method, args, context, callback){
}
// result is the *same* for same task
piper.wrapper = function(self, task, result){//, context, callback, result){
var readCallbacks = self.cache,
piper.wrapper = function(self, task, result){
var readCallbacks = self.callbackQueue,
callback, args;
// live access callbacks cache in case nested cb's
// add to the array.
while (args = readCallbacks[task].shift()) {
callback = args.pop();
// console.log('>>>> pper wrapper', self.fastIndex.name, task, result.toString())
callback = args.pop(); // last arg MUST be callback
callback.apply(null, [].concat(_.flatten(args), result));
// console.log('>>>> pper wrapper', self.fastIndex.name, task, result.toString())
callback.apply(null, [].concat(_.flatten(args, /*shallow*/true), result));
}
// now done - delete cb cache
delete readCallbacks[task];
if (--self.refcount == 0) {
console.log(' ... closing', self.filePath);
if (--self.refcount === 0) {
//console.log(' ... closing', self.filePath);
fs.close(self.fd);
self.fd = null;
}
@ -140,61 +152,36 @@ piper.wrapper = function(self, task, result){//, context, callback, result){
*
* calls to same bucket are queued for callback.
*
* @param search - word to search for
* @param callback - callback receives found line and tokens
* @param search {string} - word to search for
* @param callback {function} - callback receives found line and tokens
* @returns none
*/
function find(search, callback) {
var self = this,
data = this.fastIndex,
readCallbacks = this.cache,
miss = {status: 'miss'},
args = [search, callback];
readCallbacks = this.callbackQueue,
miss = {status: 'miss'};
var key = search.slice(0, KEY_LENGTH);
if (!(key in data.offsets)) return process.nextTick(function(){ callback(miss) });
// queue up if already reading file for this key
if (key in readCallbacks){
readCallbacks[key].push(args);
return;
}
readCallbacks[key] = [args];
if (!this.fd) {
//console.log(' ... opening', this.filePath);
this.fd = fs.openSync(this.filePath, 'r');
}
// prepare the piper
var task = 'find' + key,
args = [key, this],
context = [search, callback]; // last arg MUST be callback
// ref count so we know when to close the main index file
++this.refcount;
// pay the piper
this.piper(task, readIndexForKey, args, context, collector);
readIndexForKey(key, this, function (buffer){
function collector(key, index, search, callback, buffer){
var lines = buffer.toString().split('\n'),
keys = lines.map(function(line){
return line.substring(0,line.indexOf(' '));
});
// live access callbacks cache in case nested cb's
// add to the array.
while (readCallbacks[key].length) {
test(readCallbacks[key].shift());
}
// now done - delete cb cache
delete readCallbacks[key];
if (--self.refcount == 0) {
//console.log(' ... closing', self.filePath);
fs.close(self.fd);
self.fd = null;
}
function test(item) {
var search = item[0],
callback = item[1],
}),
ind = _.indexOf(keys, search, /*isSorted*/ true); // binary search!
//console.log(' %s is %d', search, ind);
if (ind == -1) return callback(miss);
if (ind === -1) return callback(miss);
var tokens = lines[ind].split(/\s+/),
key = tokens[0],
@ -202,79 +189,116 @@ function find(search, callback) {
callback(result);
}
});
}
function rand(startsWith, callback){
/**
* rand function (bound to index)
*
* @param startsWith {string} - get random word(s) that start with this, or ''
* @param num {number} - number of words to return
* @param callback {function} - callback function, receives words array and startsWith
*/
function rand(startsWith, num, callback){
var self = this,
key, nextKey = null, ikey;
nextKey = null,
trie = this.fastIndex.trie,
key, keys;
//console.log('-- ', startsWith, num, self.fastIndex.indexKeys.length);
if (startsWith){
key = startsWith.slice(0, KEY_LENGTH);
console.log('-- ', startsWith, key, self.indexKeys.length);
/**
* if key is 'a' or 'ab' (<3 chars), search for ALL keys starting with that.
*/
if (key.length < KEY_LENGTH) {
if (!(key in self.fastIndex.offsets)) return process.nextTick(function(){ callback('not found') });
// calc trie if haven't done so yet
if (!trie){
var natural = require('natural');
// 'a' -> nextKey 'b', 'go' -> 'gp'
if (key.length < 3) {
nextKey = key.replace(/.$/, String.fromCharCode( key.charCodeAt(key.length-1) + 1));
ikey = _.sortedIndex(self.indexKeys, nextKey);
nextKey = self.indexKeys[ ikey ]; // assures nextKey is in index keys
trie = new natural.Trie();
trie.addStrings(self.fastIndex.indexKeys);
this.fastIndex.trie = trie;
//console.log(' +++ Trie calc ');
}
try{
// trie throws if not found!!!!!
keys = trie.keysWithPrefix( startsWith );
} catch(e){
keys = [];
}
// read all keys then select random word.
// May be large disk read!
key = keys[0];
nextKey = _.last(keys);
}
if (!key || !(key in self.fastIndex.offsets)) return process.nextTick(function(){ callback([], startsWith) });
} else {
// no startWith given - random select among keys
keys = _.sample( this.fastIndex.indexKeys, num );
// if num > 1, run each key independently and collect results
if (num > 1){
var results = [], ii = 0;
_(keys).each(function(startsWith){
self.rand(startsWith, 1, function(result){
results.push(result[0]);
if (++ii == num) {
callback(results, '');
}
})
});
return;
}
key = keys;
}
// console.log(' using key', key, nextKey);
// prepare the piper
var args = [key, nextKey, this],
task = 'rand' + key + nextKey,
context = arguments;
this.piper(task, readIndexBetweenKeys, args, context, function(key, nextKey, index, context, buffer){
console.log(context, '====', key, nextKey);
var startsWith = context[0],
callback = context[1];
context = [startsWith, num, callback]; // last arg MUST be callback
// pay the piper
this.piper(task, readIndexBetweenKeys, args, context, collector);
function collector(key, nextKey, index, startsWith, num, callback, buffer){
var lines = buffer.toString().split('\n'),
r = Math.floor(Math.random() * lines.length),
line = lines[r],
word = line.substring(0, line.indexOf(' ')),
keys;
console.log(' got lines ', lines.length);
console.log(11111, self.natural)
if (startsWith !== key) {
keys = lines.map(function(line){
matches = lines.map(function(line){
return line.substring(0,line.indexOf(' '));
});
var ind = _.sortedIndex(keys, startsWith);
//console.log(' got lines for key ', key, lines.length);
console.log(3333 ,ind, keys[ind], startsWith)
// we got bunch of matches for key - now search within for startsWith
if (startsWith !== key){
if (ind >= lines.length || keys[ind].indexOf(startsWith) === -1){
return callback('not found', startsWith);
// binary search for startsWith within set of matches
var ind = _.sortedIndex(matches, startsWith);
if (ind >= lines.length || matches[ind].indexOf(startsWith) === -1){
return callback([], startsWith);
}
var natural = require('natural')
// trie = new natural.Trie();
// FIXME --- using Trie's new keysWithPrefix not yet pushed to npm.
// see https://github.com/NaturalNode/natural/commit/5fc86c42e41c1314bfc6a37384dd14acf5f4bb7b
console.log(4444, natural.Trie)
// trie.addStrings(keys);
// console.log(55555, trie.keysWithPrefix( startsWith ));
var natural = require('natural'),
trie = new natural.Trie();
trie.addStrings(matches);
//console.log('Trie > ', trie.matchesWithPrefix( startsWith ));
matches = trie.keysWithPrefix( startsWith );
}
callback(word, startsWith);
});
var words = _.sample(matches, num);
callback(words, startsWith);
}
}
// cache of fast index data across instances of WordPOS class
@ -284,8 +308,8 @@ module.exports = {
/**
* loads fast index data and return fast index find function
*
* @param index is the IndexFile instance
* @returns function - fast index find or origin find if errors
* @param index {object} - the IndexFile instance
* @returns {function} - fast index find or original find if errors
*/
find: function(index){
@ -300,22 +324,25 @@ module.exports = {
// if no fast index data was found or was corrupt, use original find
if (!cache[key]) return index.find;
index.indexKeys = Object.keys(cache[key].offsets);
index.fastIndex = cache[key];
index.refcount = 0;
index.cache = {};
index.fastIndex.indexKeys = Object.keys(index.fastIndex.offsets);
index.fastIndex.trie = null; // calc on demand
index.refcount = 0;
index.callbackQueue = {};
index.piper = _.bind(piper, index);
return find;
},
/**
* bind rand() to index
*
* @param index {object} - the IndexFile instance
* @returns {function} - bound rand function for index
*/
rand: function(index){
if (!index.fastIndex) throw 'rand requires fastIndex';
return _.bind(rand, index);
}
};

View File

@ -3,7 +3,7 @@
*
* Node.js part-of-speech utilities using natural's WordNet module.
*
* Copyright (c) 2012 mooster@42at.com
* Copyright (c) 2012-2014 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
@ -77,14 +77,20 @@ function is(pos){
function rand(pos){
return function(startsWith, callback, _noprofile) {
return function(opts, callback, _noprofile) {
// disable profiling when isX() used internally
var profile = this.options.profile && !_noprofile,
start = profile && new Date(),
args = [],
index = this.getIndexFile(pos);
// word = normalize(word);
index.rand(startsWith, function(record) {
index = this.getIndexFile(pos),
startsWith = opts && opts.startsWith || '',
count = opts && opts.count || 1;
if (typeof opts === 'function') {
callback = opts;
}
index.rand(startsWith, count, function(record) {
args.push(record, startsWith);
profile && args.push(new Date() - start);
callback.apply(null, args);
@ -241,8 +247,8 @@ if (!wordposProto.getIndexFile) {
* getPOS()
* Find all POS for all words in given string
*
* @param string text - words to lookup for POS
* @param function callback - receives object with words broken into POS or 'rest':
* @param {string} text - words to lookup for POS
* @param {function} callback - receives object with words broken into POS or 'rest', ie,
* Object: {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}
* @return none
*/
@ -289,8 +295,65 @@ wordposProto.getPOS = function(text, callback) {
return nWords;
};
/**
 * rand()
 * Get random word(s) from any part of speech.
 *
 * Parts of speech are visited in random order, one randX() call at a time,
 * until `count` unique words are collected or every part has been tried.
 *
 * @param {object|function} [opts] - options, or the callback when no options are given
 * @param {string} [opts.startsWith] - only get words starting with this string (default '')
 * @param {number} [opts.count] - number of words to return (default 1)
 * @param {function} callback - receives (array of words, startsWith) plus the elapsed
 *    time in ms as a third argument when this.options.profile is set
 * @return none
 */
wordposProto.rand = function(opts, callback) {
  if (typeof opts === 'function') {
    // callback-only form: rand(callback).
    // Hand randX() a real options object: forwarding the user's callback as `opts`
    // makes randX() adopt it as its own callback, bypassing partCallback()/done().
    callback = opts;
    opts = {};
  } else {
    // private copy so the caller's object is never touched; tolerate null/undefined
    opts = _.clone(opts) || {};
  }

  var
    self = this,
    profile = this.options.profile,
    start = profile && new Date(),
    results = [],
    startsWith = opts.startsWith || '',
    count = opts.count || 1,
    args = [null, startsWith],
    parts = 'Noun Verb Adjective Adverb'.split(' '),
    done = function(){
      profile && (args.push(new Date() - start));
      args[0] = results;
      callback.apply(null, args);
    };

  // TODO -- or loop count times each time getting 1 from random part!!
  // slower but more random.

  // shuffle the parts; tryPart() consumes them from the end
  var doParts = _.sample(parts, parts.length);
  tryPart();

  // query the next not-yet-tried part of speech
  function tryPart(){
    var rand = 'rand' + doParts.pop();
    self[ rand ](opts, partCallback);
  }

  // merge one part's words, then either try another part or finish
  function partCallback(result){
    if (result) {
      results = _.uniq(results.concat(result));  // make sure it's unique!
    }

    if (results.length < count && doParts.length) {
      // count is NOT reduced for the next part: asking for fewer words
      // could come back with duplicates and leave us short
      return tryPart();
    }

    // trim excess
    if (results.length > count) {
      results.length = count;
    }
    done();
  }
};
WordPOS.WNdb = WNdb;
WordPOS.natural = natural;
module.exports = WordPOS;