added ranX() methods
This commit is contained in:
parent
45a9a836d4
commit
889e336097
82
README.md
82
README.md
|
@ -3,6 +3,7 @@ wordpos
|
|||
|
||||
wordpos is a set of part-of-speech (POS) utilities for Node.js using [natural's](http://github.com/NaturalNode/natural) WordNet module.
|
||||
|
||||
*Update*: New version 0.1.10 - get random word(s).
|
||||
|
||||
## Usage
|
||||
|
||||
|
@ -171,6 +172,55 @@ wordpos.lookup('great', console.log);
|
|||
// ...
|
||||
```
|
||||
|
||||
### randX()
|
||||
|
||||
Get random words.
|
||||
|
||||
```
|
||||
wordpos.rand([options,] callback)
|
||||
wordpos.randNoun([options,] callback)
|
||||
wordpos.randVerb([options,] callback)
|
||||
wordpos.randAdjective([options,] callback)
|
||||
wordpos.randAdverb([options,] callback)
|
||||
```
|
||||
Callback receives an array of random words and the startsWith option.
|
||||
Options, if given, is:
|
||||
```
|
||||
{
|
||||
startsWith : <string> -- get random words starting with string
|
||||
count : <number> -- number of words to return (default = 1)
|
||||
}
|
||||
```
|
||||
Examples:
|
||||
```js
|
||||
wordpos.rand(console.log)
|
||||
// ['wulfila'] ''
|
||||
|
||||
wordpos.randNoun(console.log)
|
||||
// ['bamboo_palm'] ''
|
||||
|
||||
// with options:
|
||||
|
||||
wordpos.rand({startsWith: 'foo'}, console.log)
|
||||
// ['foot'] 'foo'
|
||||
|
||||
wordpos.rand({startsWith: 'foo', count: 3}, console.log)
|
||||
// ['footsure', 'foolish', 'footsore'] 'foo'
|
||||
|
||||
wordpos.randVerb({startsWith: 'bar', count: 3}, console.log)
|
||||
// ['barge', 'barf', 'barter_away'] 'bar'
|
||||
|
||||
wordpos.rand({startsWith: 'zzz'}, console.log)
|
||||
// [] 'zzz'
|
||||
```
|
||||
|
||||
Note on performance: random lookups could involve heavy disk reads. It is better to use the 'count' option to get words
|
||||
in batches. This may benefit from the cached reads of similarly keyed entries as well as shared open/close of the file.
|
||||
|
||||
Getting random POS (randX) is generally faster than rand(), which may look at multiple POS files until 'count' requirement
|
||||
is met.
|
||||
|
||||
|
||||
### Other methods/properties
|
||||
|
||||
```
|
||||
|
@ -287,6 +337,32 @@ lexId":"0","ptrs":[],"gloss":"a person who is deemed to be despicable or contemp
|
|||
would do that\"; \"kill the rat\"; \"throw the bum out\"; \"you cowardly little pukes!\"; \"the British
|
||||
call a contemptible person a `git'\" "}]}
|
||||
```
|
||||
|
||||
Get random words:
|
||||
```bash
|
||||
$ wordpos rand
|
||||
# 1:
|
||||
hopelessly
|
||||
|
||||
$ wordpos rand -N 2 foot
|
||||
# foot 2:
|
||||
footprint
|
||||
footlights
|
||||
|
||||
$ wordpos rand -N 2 foot hand
|
||||
# foot 2:
|
||||
footlocker
|
||||
footmark
|
||||
|
||||
# hand 2:
|
||||
hand-hewn
|
||||
handstitched
|
||||
|
||||
$ wordpos rand --adj foot
|
||||
# foot 1:
|
||||
foot-shaped
|
||||
```
|
||||
|
||||
Usage:
|
||||
```bash
|
||||
$ wordpos
|
||||
|
@ -304,6 +380,9 @@ $ wordpos
|
|||
parse
|
||||
show parsed words, deduped and less stopwords
|
||||
|
||||
rand
|
||||
get random words (optionally starting with 'word')
|
||||
|
||||
Options:
|
||||
|
||||
-h, --help output usage information
|
||||
|
@ -312,12 +391,13 @@ $ wordpos
|
|||
-a, --adj Get adjectives
|
||||
-v, --verb Get verbs
|
||||
-r, --adv Get adverbs
|
||||
-c, --count count only (noun, adj, verb, adv, total parsed words)
|
||||
-c, --count get counts only (noun, adj, verb, adv, total parsed words)
|
||||
-b, --brief brief output (all on one line, no headers)
|
||||
-f, --full full results object
|
||||
-j, --json full results object as JSON
|
||||
-i, --file <file> input file
|
||||
-s, --stopwords include stopwords
|
||||
-N, --num <num> number of random words to get
|
||||
```
|
||||
|
||||
## Benchmark
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
* command-line interface to wordpos
|
||||
*
|
||||
* Usage:
|
||||
* wordpos [options] <get|parse|def> <stdin|words*>
|
||||
* wordpos [options] <get|parse|def|rand> <stdin|words*>
|
||||
*
|
||||
* Copyright (c) 2012 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
|
@ -35,6 +35,7 @@ program
|
|||
.option('-j, --json', 'full results object as JSON')
|
||||
.option('-i, --file <file>', 'input file')
|
||||
.option('-s, --stopwords', 'include stopwords')
|
||||
.option('-N, --num <num>', 'number of random words to return')
|
||||
;
|
||||
|
||||
program.command('get')
|
||||
|
@ -53,7 +54,7 @@ program.command('parse')
|
|||
.action(exec);
|
||||
|
||||
program.command('rand')
|
||||
.description('get random words')
|
||||
.description('get random words (starting with word, optionally)')
|
||||
.action(exec);
|
||||
|
||||
var
|
||||
|
@ -100,6 +101,7 @@ function read_stdin(callback) {
|
|||
|
||||
/**
 * Pick the POS entries selected by command-line flags.
 *
 * Keeps every value of POS whose key is set (truthy) on `program`.
 * When no flag is set, returns [''] for the 'rand' command (so plain
 * rand() is run) and all POS values for any other command.
 *
 * @returns {Array} - selected POS values, [''] or all POS values
 */
function optToFn() {
  var selected = _.reject(POS, function(fn, opt) {
    return !program[opt];
  });

  if (selected.length) {
    return selected;
  }

  // no POS flag was given on the command line
  if (cmd === 'rand') {
    return [''];          // run rand()
  }
  return _.values(POS);   // default to all if no POS given
}
|
||||
|
@ -117,9 +119,6 @@ function run(data) {
|
|||
plural ? fns.length : words.length * fns.length,
|
||||
_.bind(output, null, results)),
|
||||
collect = function(what, result, word){
|
||||
|
||||
console.log('collect ----', arguments);
|
||||
|
||||
if (word) { // lookup
|
||||
results[word] = [].concat(results[word] || [], result);
|
||||
} else { // get
|
||||
|
@ -138,11 +137,12 @@ function run(data) {
|
|||
|
||||
if (cmd == 'get') {
|
||||
wordpos[method](words, cb);
|
||||
} else if (cmd == 'rand') {
|
||||
words.forEach(function(word){
|
||||
wordpos[method]({startsWith: word, count: program.num || 1}, cb);
|
||||
});
|
||||
} else {
|
||||
words.forEach(function(word){
|
||||
|
||||
console.log(' calling rand', method, word);
|
||||
|
||||
wordpos[method](word, cb);
|
||||
});
|
||||
}
|
||||
|
|
|
@ -1,16 +1,16 @@
|
|||
{
|
||||
"name": "wordpos",
|
||||
"author": "Moos <mooster@42at.com>",
|
||||
"keywords": ["natural", "language", "wordnet", "pos"],
|
||||
"keywords": ["natural", "language", "wordnet", "adjectives", "nouns", "adverbs", "verbs"],
|
||||
"description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.",
|
||||
"version": "0.1.9",
|
||||
"version": "0.1.10",
|
||||
"homepage": "https://github.com/moos/wordpos",
|
||||
"engines": {
|
||||
"node": ">=0.6"
|
||||
},
|
||||
"bin": "./bin/wordpos-cli.js",
|
||||
"dependencies": {
|
||||
"natural": "latest",
|
||||
"natural": "NaturalNode/natural",
|
||||
"underscore": ">=1.3.1",
|
||||
"WNdb": "latest",
|
||||
"commander": "1.1.1"
|
||||
|
|
|
@ -26,6 +26,9 @@ var str = "The angry bear chased the frightened little squirrel",
|
|||
garble = 'garblegarble'; // expect not to find word
|
||||
|
||||
|
||||
// empty function: takes no arguments and does nothing
function noop(){}
|
||||
|
||||
|
||||
describe('getX()...', function() {
|
||||
|
||||
beforeEach(function() {
|
||||
|
@ -240,5 +243,67 @@ describe('nested callbacks on same index key', function() {
|
|||
});
|
||||
});
|
||||
|
||||
// empty function: takes no arguments and does nothing
function noop(){}
|
||||
|
||||
// rand(): random words drawn from any part of speech
describe('rand()...', function() {

  it('should get random word', function(done) {
    // exercise rand() itself with a bare callback (no options);
    // randNoun() has its own spec in the randX() suite
    wordpos.rand(function(result) {
      expect(result).toBeTruthy();
      done();
    });
  });

  it('should get N random words', function(done) {
    wordpos.rand({count: 3}, function(result) {
      expect(result.length).toEqual(3);
      done();
    });
  });

  it('should get random word starting with', function(done) {
    // callback also receives the startsWith option that was used
    wordpos.rand({startsWith: 'foo'}, function(result, startsWith) {
      expect(result[0].indexOf('foo')).toEqual(0);
      expect(startsWith).toEqual('foo');
      done();
    });
  });

  it('should get nothing starting with not found', function(done) {
    // no word starts with 'zzzz': expect an empty array
    wordpos.rand({startsWith: 'zzzz'}, function(result) {
      expect(result.length).toEqual(0);
      done();
    });
  });
});
|
||||
|
||||
// randX(): random words restricted to a single part of speech
describe('randX()...', function() {
  var kinds = ['Noun', 'Verb', 'Adjective', 'Adverb'];

  // one spec per part of speech: randNoun, randVerb, randAdjective, randAdverb
  kinds.forEach(function(kind) {
    it('should get random ' + kind.toLowerCase(), function(done) {
      wordpos['rand' + kind](function(words) {
        expect(words.length).toEqual(1);
        done();
      });
    });
  });

  // not found
  it('should NOT get random noun starting with', function(done) {
    wordpos.randNoun({startsWith: 'zzzz'}, function(words) {
      expect(words.length).toEqual(0);
      done();
    });
  });
});
|
261
src/fastIndex.js
261
src/fastIndex.js
|
@ -1,9 +1,9 @@
|
|||
/**
|
||||
/*!
|
||||
* fastIndex.js
|
||||
*
|
||||
* override natural.WordNet's IndexFile to use fast index data
|
||||
*
|
||||
* Copyright (c) 2012 mooster@42at.com
|
||||
* Copyright (c) 2012-2014 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
*
|
||||
* Released under MIT license
|
||||
|
@ -17,9 +17,10 @@ var _ = require('underscore')._,
|
|||
|
||||
/**
|
||||
* load fast index bucket data
|
||||
* @param dir - dir path of index files
|
||||
* @param name - name of index file, eg, 'index.verb'
|
||||
* @returns Object - fast index data object
|
||||
*
|
||||
* @param dir {string} - dir path of index files
|
||||
* @param name {string} - name of index file, eg, 'index.verb'
|
||||
* @returns {Object} - fast index data object
|
||||
*/
|
||||
function loadFastIndex(dir, name) {
|
||||
var jsonFile = path.join(dir, 'fast-' + name + '.json'),
|
||||
|
@ -36,8 +37,9 @@ function loadFastIndex(dir, name) {
|
|||
|
||||
/**
|
||||
* read index file using fast index data at key
|
||||
*
|
||||
* @param key - 3-char key into fast index
|
||||
* @param index - index file name (eg, 'index.verb')
|
||||
* @param index - index object
|
||||
* @param callback - function receives buffer of data read
|
||||
* @returns none
|
||||
*/
|
||||
|
@ -57,16 +59,25 @@ function readIndexForKey(key, index, callback) {
|
|||
}
|
||||
|
||||
|
||||
/**
|
||||
* read index file using fast index data at keyStart to keyEnd (inclusive)
|
||||
*
|
||||
* @param keyStart {string} - 3-char key into fast index to begin at
|
||||
* @param keyEnd {string|null} - 3-char key into fast index to end at. If null, reads to next key.
|
||||
* @param index - index object
|
||||
* @param callback - function receives buffer of data read
|
||||
* @returns none
|
||||
*/
|
||||
function readIndexBetweenKeys(keyStart, keyEnd, index, callback) {
|
||||
var data = index.fastIndex,
|
||||
offset = data.offsets[keyStart][0],
|
||||
nextKey = keyEnd || data.offsets[keyStart][1],
|
||||
end = keyEnd || keyStart,
|
||||
nextKey = data.offsets[end][1],
|
||||
nextOffset = data.offsets[nextKey][0],
|
||||
len = nextOffset - offset - 1,
|
||||
buffer = new Buffer(len);
|
||||
|
||||
console.log('### readIndexBetweenKeys', keyStart, keyEnd, nextKey, len, index.fd, offset)
|
||||
|
||||
//console.log('### readIndexBetweenKeys', keyStart, keyEnd, nextKey, len)
|
||||
fs.read(index.fd, buffer, 0, len, offset, function(err, count){
|
||||
if (err) return console.log(err);
|
||||
// console.log(' read %d bytes for <%s>', count, keyStart);
|
||||
|
@ -75,22 +86,23 @@ function readIndexBetweenKeys(keyStart, keyEnd, index, callback) {
|
|||
}
|
||||
|
||||
/**
|
||||
* run single 'task' method sharing callbacks. method MUST take callback as LAST arg.
|
||||
* run single 'task' method sharing callbacks. Method MUST take callback as LAST arg.
|
||||
* piper is bound to an index.
|
||||
*
|
||||
* @param task {string} - task name unique to method!
|
||||
* @param method {function} - method to execute, gets (key, ... , callback)
|
||||
* @param args {arrray} - args to pass to method
|
||||
* @param context {object} - other params to remember
|
||||
* @param method {function} - method to execute, gets (args, ... , callback)
|
||||
* @param args {array} - args to pass to method
|
||||
* @param context {object} - other params to remember and sent to callback
|
||||
* @param callback {function} - result callback
|
||||
*/
|
||||
function piper(task, method, args, context, callback){
|
||||
var readCallbacks = this.cache,
|
||||
var readCallbacks = this.callbackQueue,
|
||||
memoArgs = _.rest(arguments, 2),
|
||||
wrappedCallback; //_.partial(piper.wrapper, this, task, context, callback);
|
||||
wrappedCallback;
|
||||
|
||||
console.log('piper', task, args[0], args[1], context[0]);
|
||||
// console.log('piper', task, args[0], context[0]);
|
||||
|
||||
// queue up if already reading file for this task:key
|
||||
// queue up if already reading file for this task
|
||||
if (task in readCallbacks){
|
||||
readCallbacks[task].push(memoArgs);
|
||||
return;
|
||||
|
@ -98,7 +110,7 @@ function piper(task, method, args, context, callback){
|
|||
readCallbacks[task] = [memoArgs];
|
||||
|
||||
if (!this.fd) {
|
||||
console.log(' ... opening', this.filePath);
|
||||
//console.log(' ... opening', this.filePath);
|
||||
this.fd = fs.openSync(this.filePath, 'r');
|
||||
}
|
||||
|
||||
|
@ -112,24 +124,24 @@ function piper(task, method, args, context, callback){
|
|||
}
|
||||
|
||||
// result is the *same* for same task
|
||||
piper.wrapper = function(self, task, result){//, context, callback, result){
|
||||
var readCallbacks = self.cache,
|
||||
piper.wrapper = function(self, task, result){
|
||||
var readCallbacks = self.callbackQueue,
|
||||
callback, args;
|
||||
|
||||
// live access callbacks cache in case nested cb's
|
||||
// add to the array.
|
||||
while (args = readCallbacks[task].shift()) {
|
||||
callback = args.pop();
|
||||
// console.log('>>>> pper wrapper', self.fastIndex.name, task, result.toString())
|
||||
callback = args.pop(); // last arg MUST be callback
|
||||
|
||||
callback.apply(null, [].concat(_.flatten(args), result));
|
||||
// console.log('>>>> pper wrapper', self.fastIndex.name, task, result.toString())
|
||||
callback.apply(null, [].concat(_.flatten(args, /*shallow*/true), result));
|
||||
}
|
||||
|
||||
// now done - delete cb cache
|
||||
delete readCallbacks[task];
|
||||
|
||||
if (--self.refcount == 0) {
|
||||
console.log(' ... closing', self.filePath);
|
||||
if (--self.refcount === 0) {
|
||||
//console.log(' ... closing', self.filePath);
|
||||
fs.close(self.fd);
|
||||
self.fd = null;
|
||||
}
|
||||
|
@ -140,61 +152,36 @@ piper.wrapper = function(self, task, result){//, context, callback, result){
|
|||
*
|
||||
* calls to same bucket are queued for callback.
|
||||
*
|
||||
* @param search - word to search for
|
||||
* @param callback - callback receives found line and tokens
|
||||
* @param search {string} - word to search for
|
||||
* @param callback {function} - callback receives found line and tokens
|
||||
* @returns none
|
||||
*/
|
||||
function find(search, callback) {
|
||||
var self = this,
|
||||
data = this.fastIndex,
|
||||
readCallbacks = this.cache,
|
||||
miss = {status: 'miss'},
|
||||
args = [search, callback];
|
||||
readCallbacks = this.callbackQueue,
|
||||
miss = {status: 'miss'};
|
||||
|
||||
var key = search.slice(0, KEY_LENGTH);
|
||||
if (!(key in data.offsets)) return process.nextTick(function(){ callback(miss) });
|
||||
|
||||
// queue up if already reading file for this key
|
||||
if (key in readCallbacks){
|
||||
readCallbacks[key].push(args);
|
||||
return;
|
||||
}
|
||||
readCallbacks[key] = [args];
|
||||
if (!this.fd) {
|
||||
//console.log(' ... opening', this.filePath);
|
||||
this.fd = fs.openSync(this.filePath, 'r');
|
||||
}
|
||||
// prepare the piper
|
||||
var task = 'find' + key,
|
||||
args = [key, this],
|
||||
context = [search, callback]; // last arg MUST be callback
|
||||
|
||||
// ref count so we know when to close the main index file
|
||||
++this.refcount;
|
||||
// pay the piper
|
||||
this.piper(task, readIndexForKey, args, context, collector);
|
||||
|
||||
readIndexForKey(key, this, function (buffer){
|
||||
function collector(key, index, search, callback, buffer){
|
||||
var lines = buffer.toString().split('\n'),
|
||||
keys = lines.map(function(line){
|
||||
return line.substring(0,line.indexOf(' '));
|
||||
});
|
||||
|
||||
// live access callbacks cache in case nested cb's
|
||||
// add to the array.
|
||||
while (readCallbacks[key].length) {
|
||||
test(readCallbacks[key].shift());
|
||||
}
|
||||
|
||||
// now done - delete cb cache
|
||||
delete readCallbacks[key];
|
||||
|
||||
if (--self.refcount == 0) {
|
||||
//console.log(' ... closing', self.filePath);
|
||||
fs.close(self.fd);
|
||||
self.fd = null;
|
||||
}
|
||||
|
||||
function test(item) {
|
||||
var search = item[0],
|
||||
callback = item[1],
|
||||
}),
|
||||
ind = _.indexOf(keys, search, /*isSorted*/ true); // binary search!
|
||||
|
||||
//console.log(' %s is %d', search, ind);
|
||||
if (ind == -1) return callback(miss);
|
||||
if (ind === -1) return callback(miss);
|
||||
|
||||
var tokens = lines[ind].split(/\s+/),
|
||||
key = tokens[0],
|
||||
|
@ -202,79 +189,116 @@ function find(search, callback) {
|
|||
|
||||
callback(result);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
function rand(startsWith, callback){
|
||||
|
||||
/**
|
||||
* rand function (bound to index)
|
||||
*
|
||||
* @param startsWith {string} - get random word(s) that start with this, or ''
|
||||
* @param num {number} - number of words to return
|
||||
* @param callback {function} - callback function, receives words array and startsWith
|
||||
*/
|
||||
function rand(startsWith, num, callback){
|
||||
var self = this,
|
||||
key, nextKey = null, ikey;
|
||||
nextKey = null,
|
||||
trie = this.fastIndex.trie,
|
||||
key, keys;
|
||||
|
||||
//console.log('-- ', startsWith, num, self.fastIndex.indexKeys.length);
|
||||
if (startsWith){
|
||||
key = startsWith.slice(0, KEY_LENGTH);
|
||||
|
||||
console.log('-- ', startsWith, key, self.indexKeys.length);
|
||||
/**
|
||||
* if key is 'a' or 'ab' (<3 chars), search for ALL keys starting with that.
|
||||
*/
|
||||
if (key.length < KEY_LENGTH) {
|
||||
|
||||
if (!(key in self.fastIndex.offsets)) return process.nextTick(function(){ callback('not found') });
|
||||
// calc trie if haven't done so yet
|
||||
if (!trie){
|
||||
var natural = require('natural');
|
||||
|
||||
// 'a' -> nextKey 'b', 'go' -> 'gp'
|
||||
if (key.length < 3) {
|
||||
nextKey = key.replace(/.$/, String.fromCharCode( key.charCodeAt(key.length-1) + 1));
|
||||
ikey = _.sortedIndex(self.indexKeys, nextKey);
|
||||
nextKey = self.indexKeys[ ikey ]; // assures nextKey is in index keys
|
||||
trie = new natural.Trie();
|
||||
trie.addStrings(self.fastIndex.indexKeys);
|
||||
this.fastIndex.trie = trie;
|
||||
//console.log(' +++ Trie calc ');
|
||||
}
|
||||
|
||||
try{
|
||||
// trie throws if not found!!!!!
|
||||
keys = trie.keysWithPrefix( startsWith );
|
||||
} catch(e){
|
||||
keys = [];
|
||||
}
|
||||
|
||||
// read all keys then select random word.
|
||||
// May be large disk read!
|
||||
key = keys[0];
|
||||
nextKey = _.last(keys);
|
||||
}
|
||||
|
||||
if (!key || !(key in self.fastIndex.offsets)) return process.nextTick(function(){ callback([], startsWith) });
|
||||
|
||||
} else {
|
||||
// no startWith given - random select among keys
|
||||
keys = _.sample( this.fastIndex.indexKeys, num );
|
||||
|
||||
// if num > 1, run each key independently and collect results
|
||||
if (num > 1){
|
||||
var results = [], ii = 0;
|
||||
_(keys).each(function(startsWith){
|
||||
self.rand(startsWith, 1, function(result){
|
||||
results.push(result[0]);
|
||||
if (++ii == num) {
|
||||
callback(results, '');
|
||||
}
|
||||
})
|
||||
});
|
||||
return;
|
||||
}
|
||||
key = keys;
|
||||
}
|
||||
// console.log(' using key', key, nextKey);
|
||||
|
||||
// prepare the piper
|
||||
var args = [key, nextKey, this],
|
||||
task = 'rand' + key + nextKey,
|
||||
context = arguments;
|
||||
|
||||
this.piper(task, readIndexBetweenKeys, args, context, function(key, nextKey, index, context, buffer){
|
||||
|
||||
console.log(context, '====', key, nextKey);
|
||||
|
||||
var startsWith = context[0],
|
||||
callback = context[1];
|
||||
context = [startsWith, num, callback]; // last arg MUST be callback
|
||||
|
||||
// pay the piper
|
||||
this.piper(task, readIndexBetweenKeys, args, context, collector);
|
||||
|
||||
function collector(key, nextKey, index, startsWith, num, callback, buffer){
|
||||
var lines = buffer.toString().split('\n'),
|
||||
r = Math.floor(Math.random() * lines.length),
|
||||
line = lines[r],
|
||||
word = line.substring(0, line.indexOf(' ')),
|
||||
keys;
|
||||
|
||||
console.log(' got lines ', lines.length);
|
||||
|
||||
console.log(11111, self.natural)
|
||||
|
||||
|
||||
if (startsWith !== key) {
|
||||
keys = lines.map(function(line){
|
||||
matches = lines.map(function(line){
|
||||
return line.substring(0,line.indexOf(' '));
|
||||
});
|
||||
|
||||
var ind = _.sortedIndex(keys, startsWith);
|
||||
//console.log(' got lines for key ', key, lines.length);
|
||||
|
||||
console.log(3333 ,ind, keys[ind], startsWith)
|
||||
// we got bunch of matches for key - now search within for startsWith
|
||||
if (startsWith !== key){
|
||||
|
||||
if (ind >= lines.length || keys[ind].indexOf(startsWith) === -1){
|
||||
return callback('not found', startsWith);
|
||||
// binary search for startsWith within set of matches
|
||||
var ind = _.sortedIndex(matches, startsWith);
|
||||
if (ind >= lines.length || matches[ind].indexOf(startsWith) === -1){
|
||||
return callback([], startsWith);
|
||||
}
|
||||
|
||||
var natural = require('natural')
|
||||
// trie = new natural.Trie();
|
||||
// FIXME --- using Trie's new keysWithPrefix not yet pushed to npm.
|
||||
// see https://github.com/NaturalNode/natural/commit/5fc86c42e41c1314bfc6a37384dd14acf5f4bb7b
|
||||
|
||||
console.log(4444, natural.Trie)
|
||||
// trie.addStrings(keys);
|
||||
// console.log(55555, trie.keysWithPrefix( startsWith ));
|
||||
var natural = require('natural'),
|
||||
trie = new natural.Trie();
|
||||
|
||||
trie.addStrings(matches);
|
||||
//console.log('Trie > ', trie.matchesWithPrefix( startsWith ));
|
||||
|
||||
matches = trie.keysWithPrefix( startsWith );
|
||||
}
|
||||
|
||||
|
||||
callback(word, startsWith);
|
||||
});
|
||||
var words = _.sample(matches, num);
|
||||
callback(words, startsWith);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// cache of fast index data across instances of WordPOS class
|
||||
|
@ -284,8 +308,8 @@ module.exports = {
|
|||
/**
|
||||
* loads fast index data and return fast index find function
|
||||
*
|
||||
* @param index is the IndexFile instance
|
||||
* @returns function - fast index find or origin find if errors
|
||||
* @param index {object} - the IndexFile instance
|
||||
* @returns {function} - fast index find or original find if errors
|
||||
*/
|
||||
find: function(index){
|
||||
|
||||
|
@ -300,22 +324,25 @@ module.exports = {
|
|||
// if no fast index data was found or was corrupt, use original find
|
||||
if (!cache[key]) return index.find;
|
||||
|
||||
|
||||
index.indexKeys = Object.keys(cache[key].offsets);
|
||||
|
||||
index.fastIndex = cache[key];
|
||||
index.refcount = 0;
|
||||
index.cache = {};
|
||||
index.fastIndex.indexKeys = Object.keys(index.fastIndex.offsets);
|
||||
index.fastIndex.trie = null; // calc on demand
|
||||
|
||||
index.refcount = 0;
|
||||
index.callbackQueue = {};
|
||||
index.piper = _.bind(piper, index);
|
||||
|
||||
return find;
|
||||
},
|
||||
|
||||
/**
|
||||
* bind rand() to index
|
||||
*
|
||||
* @param index {object} - the IndexFile instance
|
||||
* @returns {function} - bound rand function for index
|
||||
*/
|
||||
rand: function(index){
|
||||
|
||||
if (!index.fastIndex) throw 'rand requires fastIndex';
|
||||
|
||||
return _.bind(rand, index);
|
||||
}
|
||||
};
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
*
|
||||
* Node.js part-of-speech utilities using natural's WordNet module.
|
||||
*
|
||||
* Copyright (c) 2012 mooster@42at.com
|
||||
* Copyright (c) 2012-2014 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
*
|
||||
* Released under MIT license
|
||||
|
@ -77,14 +77,20 @@ function is(pos){
|
|||
|
||||
|
||||
function rand(pos){
|
||||
return function(startsWith, callback, _noprofile) {
|
||||
return function(opts, callback, _noprofile) {
|
||||
// disable profiling when isX() used internally
|
||||
var profile = this.options.profile && !_noprofile,
|
||||
start = profile && new Date(),
|
||||
args = [],
|
||||
index = this.getIndexFile(pos);
|
||||
// word = normalize(word);
|
||||
index.rand(startsWith, function(record) {
|
||||
index = this.getIndexFile(pos),
|
||||
startsWith = opts && opts.startsWith || '',
|
||||
count = opts && opts.count || 1;
|
||||
|
||||
if (typeof opts === 'function') {
|
||||
callback = opts;
|
||||
}
|
||||
|
||||
index.rand(startsWith, count, function(record) {
|
||||
args.push(record, startsWith);
|
||||
profile && args.push(new Date() - start);
|
||||
callback.apply(null, args);
|
||||
|
@ -234,15 +240,15 @@ if (!wordposProto.getIndexFile) {
|
|||
case 'r':
|
||||
return this.advIndex;
|
||||
}
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* getPOS()
|
||||
* Find all POS for all words in given string
|
||||
*
|
||||
* @param string text - words to lookup for POS
|
||||
* @param function callback - receives object with words broken into POS or 'rest':
|
||||
* @param {string} text - words to lookup for POS
|
||||
* @param {function} callback - receives object with words broken into POS or 'rest', ie,
|
||||
* Object: {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}
|
||||
* @return none
|
||||
*/
|
||||
|
@ -289,8 +295,65 @@ wordposProto.getPOS = function(text, callback) {
|
|||
return nWords;
|
||||
};
|
||||
|
||||
/**
 * rand()
 * Get random word(s) from any part of speech.
 *
 * The four parts of speech are visited in random order. Each one is queried
 * through its randX() method until `count` unique words have been collected
 * or every part has been tried.
 *
 * @param {object|function} [opts] - options, or the callback when options are omitted:
 *        startsWith {string} - get random words starting with string (default '')
 *        count {number} - number of words to return (default 1)
 * @param {function} callback - receives the array of words and startsWith,
 *        plus the elapsed time (ms) when the profile option is on
 * @return none
 */
wordposProto.rand = function(opts, callback) {
  if (typeof opts === 'function') {
    // rand(callback): normalize to an options object. Handing the function
    // itself down to randX() would make randX() take it as its own callback,
    // bypassing partCallback (no aggregation, de-dupe, trimming or profiling).
    callback = opts;
    opts = {};
  } else {
    // work on a copy; _.clone() returns non-objects (eg, undefined) as is
    opts = _.clone(opts) || {};
  }

  var
    self = this,
    profile = this.options.profile,
    start = profile && new Date(),
    results = [],
    startsWith = opts.startsWith || '',
    count = opts.count || 1,
    parts = 'Noun Verb Adjective Adverb'.split(' '),
    // TODO -- or loop count times each time getting 1 from random part!!
    // slower but more random.

    // shuffle the parts so they are looked at in random order
    doParts = _.sample(parts, parts.length);

  tryPart();

  // query the next part of speech, eg, this.randNoun(opts, partCallback)
  function tryPart(){
    var rand = 'rand' + doParts.pop();
    self[ rand ](opts, partCallback);
  }

  // collect words from one part; move on to the next part if still short
  function partCallback(result){
    if (result) {
      results = _.uniq(results.concat(result));  // make sure it's unique!
    }

    if (results.length < count && doParts.length) {
      // do NOT reduce opts.count for the next part: some of its words may be
      // duplicates that get dropped, which would leave us short again
      return tryPart();
    }

    // trim excess
    if (results.length > count) {
      results.length = count;
    }
    done();
  }

  // hand the final result to the caller
  function done(){
    var args = [results, startsWith];
    profile && args.push(new Date() - start);
    callback.apply(null, args);
  }
};
|
||||
|
||||
|
||||
WordPOS.WNdb = WNdb;
|
||||
WordPOS.natural = natural;
|
||||
|
||||
|
||||
module.exports = WordPOS;
|
||||
|
|
Loading…
Reference in New Issue