Major update - first v1.0 checkin with support for Promise, remove natural dependency, and more.
This commit is contained in:
parent
2001182b7a
commit
b27c49fd01
|
@ -3,5 +3,3 @@ node_js:
|
|||
- '5'
|
||||
- '4'
|
||||
- '0.12'
|
||||
before_script:
|
||||
- npm install -g jasmine-node
|
107
README.md
107
README.md
|
@ -6,7 +6,7 @@ wordpos
|
|||
|
||||
wordpos is a set of *fast* part-of-speech (POS) utilities for Node.js using fast lookup in the WordNet database.
|
||||
|
||||
Version 1.x is a mojor update with no direct depedence on [natural's](http://github.com/NaturalNode/natural), with support for Promises, and roughly 5x speed improvement over previous version.
|
||||
Version 1.x is a major update with no direct dependence on [natural's](http://github.com/NaturalNode/natural), with support for [Promises](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise), and roughly 5x speed improvement over previous version.
|
||||
|
||||
**CAUTION** The WordNet database [wordnet-db](https://github.com/moos/wordnet-db) comprises [155,287 words](http://wordnet.princeton.edu/wordnet/man/wnstats.7WN.html) (3.0 numbers) which uncompress to over **30 MB** of data in several *un*[browserify](https://github.com/substack/node-browserify)-able files. It is *not* meant for the browser environment.
|
||||
|
||||
|
@ -104,7 +104,7 @@ wordpos.getPOS(text, callback) -- callback receives a result object:
|
|||
```
|
||||
|
||||
If you're only interested in a certain POS (say, adjectives), using the particular getX() is faster
|
||||
than getPOS() which looks up the word in all index files. [stopwords](https://github.com/moos/wordpos/lib/natural/util/stopwords.js)are stripped out from text before lookup.
|
||||
than getPOS() which looks up the word in all index files. [stopwords](lib/natural/util/stopwords.js) are stripped out from text before lookup.
|
||||
|
||||
If `text` is an *array*, all words are looked-up -- no deduplication, stopword filtering or tokenization is applied.
|
||||
|
||||
|
@ -127,8 +127,7 @@ wordpos.getPOS('The angry bear chased the frightened little squirrel.', console.
|
|||
}
|
||||
|
||||
```
|
||||
This has no relation to correct grammar of given sentence, where here only 'bear' and 'squirrel'
|
||||
would be considered nouns.
|
||||
This has no relation to correct grammar of given sentence, where here only 'bear' and 'squirrel' would be considered nouns.
|
||||
|
||||
#### isNoun(word, callback)
|
||||
#### isVerb(word, callback)
|
||||
|
@ -228,7 +227,33 @@ Access the array of stopwords.
|
|||
|
||||
## Promises
|
||||
|
||||
TODO
|
||||
As of v1.0, all `get`, `is`, `rand`, and `lookup` methods return a standard ES6 [Promise](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise).
|
||||
|
||||
```js
|
||||
wordpos.isVerb('fish').then(console.log);
|
||||
// true
|
||||
```
|
||||
|
||||
Compound, with error handler:
|
||||
|
||||
```js
|
||||
wordpos.isVerb('fish')
|
||||
.then(console.log)
|
||||
.then(doSomethingElse)
|
||||
.catch(console.error);
|
||||
```
|
||||
|
||||
Callbacks, if given, are executed _before_ the Promise is resolved.
|
||||
|
||||
```js
|
||||
wordpos.isVerb('fish', console.log)
|
||||
.then(console.log)
|
||||
.catch(console.error);
|
||||
// true 'fish' 13
|
||||
// true
|
||||
```
|
||||
Note that callback receives full arguments (including profile, if enabled), while the Promise receives only the result of the call. Also, beware that exceptions in the _callback_ will result in the Promise being _rejected_ and caught by `catch()`, if provided.
|
||||
|
||||
|
||||
## Fast Index
|
||||
|
||||
|
@ -236,7 +261,7 @@ Version 0.1.4 introduces `fastIndex` option. This uses a secondary index on the
|
|||
|
||||
Fast index improves performance **30x** over Natural's native methods. See blog article [Optimizing WordPos](http://blog.42at.com/optimizing-wordpos).
|
||||
|
||||
As of version 1.0, the fast index option is always on and cannot be turned off.
|
||||
As of version 1.0, fast index is always on and cannot be turned off.
|
||||
|
||||
## Command-line: CLI
|
||||
|
||||
|
@ -245,73 +270,15 @@ For CLI usage and examples, see [bin/README](bin).
|
|||
|
||||
## Benchmark
|
||||
|
||||
Note: `wordpos-bench.js` requires a [forked uubench](https://github.com/moos/uubench) module.
|
||||
|
||||
cd bench
|
||||
node wordpos-bench.js
|
||||
|
||||
|
||||
512-word corpus (< v0.1.4, comparable to Natural) :
|
||||
```
|
||||
getPOS : 0 ops/s { iterations: 1, elapsed: 9039 }
|
||||
getNouns : 0 ops/s { iterations: 1, elapsed: 2347 }
|
||||
getVerbs : 0 ops/s { iterations: 1, elapsed: 2434 }
|
||||
getAdjectives : 1 ops/s { iterations: 1, elapsed: 1698 }
|
||||
getAdverbs : 0 ops/s { iterations: 1, elapsed: 2698 }
|
||||
done in 20359 msecs
|
||||
```
|
||||
|
||||
512-word corpus (as of v0.1.4, with fastIndex) :
|
||||
```
|
||||
getPOS : 18 ops/s { iterations: 1, elapsed: 57 }
|
||||
getNouns : 48 ops/s { iterations: 1, elapsed: 21 }
|
||||
getVerbs : 125 ops/s { iterations: 1, elapsed: 8 }
|
||||
getAdjectives : 111 ops/s { iterations: 1, elapsed: 9 }
|
||||
getAdverbs : 143 ops/s { iterations: 1, elapsed: 7 }
|
||||
done in 1375 msecs
|
||||
```
|
||||
|
||||
220 words are looked-up (less stopwords and duplicates) on a win7/64-bit/dual-core/3GHz. getPOS() is slowest as it searches through all four index files.
|
||||
|
||||
### Version 1.0 Benchmark
|
||||
|
||||
Re-run v0.1.16:
|
||||
```
|
||||
getPOS : 11 ops/s { iterations: 1, elapsed: 90 }
|
||||
getNouns : 21 ops/s { iterations: 1, elapsed: 47 }
|
||||
getVerbs : 53 ops/s { iterations: 1, elapsed: 19 }
|
||||
getAdjectives : 29 ops/s { iterations: 1, elapsed: 34 }
|
||||
getAdverbs : 83 ops/s { iterations: 1, elapsed: 12 }
|
||||
lookup : 1 ops/s { iterations: 1, elapsed: 720 }
|
||||
lookupNoun : 1 ops/s { iterations: 1, elapsed: 676 }
|
||||
|
||||
looked up 220 words
|
||||
done in 2459 msecs
|
||||
```
|
||||
|
||||
V1.0:
|
||||
```
|
||||
getPOS : 14 ops/s { iterations: 1, elapsed: 73 }
|
||||
getNouns : 26 ops/s { iterations: 1, elapsed: 38 }
|
||||
getVerbs : 42 ops/s { iterations: 1, elapsed: 24 }
|
||||
getAdjectives : 24 ops/s { iterations: 1, elapsed: 42 }
|
||||
getAdverbs : 26 ops/s { iterations: 1, elapsed: 38 }
|
||||
lookup : 6 ops/s { iterations: 1, elapsed: 159 }
|
||||
lookupNoun : 13 ops/s { iterations: 1, elapsed: 77 }
|
||||
|
||||
looked up 221 words
|
||||
done in 1274 msecs
|
||||
```
|
||||
That's roughly **2x** better across the board. Functions that read the data files see much improved performance: `lookup` about **5x** and `lookupNoun` over **8x**.
|
||||
|
||||
See [benchmark](benchmark/README).
|
||||
|
||||
## Changes
|
||||
|
||||
1.0.1
|
||||
- Removed direct dependency on Natural. Certain modules are included in /lib.
|
||||
- Add support for Promises.
|
||||
- Improved data file reads for up to **5x** performance increase.
|
||||
- Tests are now mocha-based with assert interface.
|
||||
1.0.0
|
||||
- Removed npm dependency on Natural. Certain modules are included in /lib.
|
||||
- Add support for ES6 Promises.
|
||||
- Improved data file reads for up to **5x** performance increase compared to previous version.
|
||||
- Tests are now [mocha](https://mochajs.org/)-based with [chai](http://chaijs.com/) assert interface.
|
||||
|
||||
0.1.16
|
||||
- Changed dependency to wordnet-db (renamed from WNdb)
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
## Benchmark
|
||||
|
||||
```bash
|
||||
cd bench
|
||||
node wordpos-bench.js
|
||||
```
|
||||
|
||||
### Version 1.0 Benchmark
|
||||
|
||||
The following benchmarks were run on a Win8.1/Core i7/3.5GHz machine on a Seagate 500GB SATA II, 7200 RPM disk. The corpus was a 512-word text, with stopwords and duplicates removed, resulting in 220 words looked-up.
|
||||
|
||||
#### Pre v0.1.4 (comparable to Natural)
|
||||
```
|
||||
getPOS : 1 ops/s { iterations: 1, elapsed: 1514 }
|
||||
getNouns : 2 ops/s { iterations: 1, elapsed: 409 }
|
||||
getVerbs : 2 ops/s { iterations: 1, elapsed: 418 }
|
||||
getAdjectives : 3 ops/s { iterations: 1, elapsed: 332 }
|
||||
getAdverbs : 4 ops/s { iterations: 1, elapsed: 272 }
|
||||
lookup : 1 ops/s { iterations: 1, elapsed: 1981 }
|
||||
lookupNoun : 0 ops/s { iterations: 1, elapsed: 2016 }
|
||||
|
||||
looked up 220 words
|
||||
done in 7770 msecs
|
||||
```
|
||||
|
||||
#### v0.1.16 (with fastIndex):
|
||||
```
|
||||
getPOS : 11 ops/s { iterations: 1, elapsed: 90 }
|
||||
getNouns : 21 ops/s { iterations: 1, elapsed: 47 }
|
||||
getVerbs : 53 ops/s { iterations: 1, elapsed: 19 }
|
||||
getAdjectives : 29 ops/s { iterations: 1, elapsed: 34 }
|
||||
getAdverbs : 83 ops/s { iterations: 1, elapsed: 12 }
|
||||
lookup : 1 ops/s { iterations: 1, elapsed: 720 }
|
||||
lookupNoun : 1 ops/s { iterations: 1, elapsed: 676 }
|
||||
|
||||
looked up 220 words
|
||||
done in 2459 msecs
|
||||
```
|
||||
|
||||
#### v1.0:
|
||||
```
|
||||
getPOS : 14 ops/s { iterations: 1, elapsed: 73 }
|
||||
getNouns : 26 ops/s { iterations: 1, elapsed: 38 }
|
||||
getVerbs : 42 ops/s { iterations: 1, elapsed: 24 }
|
||||
getAdjectives : 24 ops/s { iterations: 1, elapsed: 42 }
|
||||
getAdverbs : 26 ops/s { iterations: 1, elapsed: 38 }
|
||||
lookup : 6 ops/s { iterations: 1, elapsed: 159 }
|
||||
lookupNoun : 13 ops/s { iterations: 1, elapsed: 77 }
|
||||
|
||||
looked up 221 words
|
||||
done in 1274 msecs
|
||||
```
|
||||
|
||||
These are **3.5x** better compared to v0.1.16 and **15x** better compared to pre v0.1.4, overall. Functions that read the data files see much improved performance: `lookup` about **13x** and `lookupNoun` **26x** compared to pre v0.1.4.
|
||||
|
||||
|
||||
### Old benchmark
|
||||
|
||||
512-word corpus (< v0.1.4, comparable to Natural) :
|
||||
```
|
||||
getPOS : 0 ops/s { iterations: 1, elapsed: 9039 }
|
||||
getNouns : 0 ops/s { iterations: 1, elapsed: 2347 }
|
||||
getVerbs : 0 ops/s { iterations: 1, elapsed: 2434 }
|
||||
getAdjectives : 1 ops/s { iterations: 1, elapsed: 1698 }
|
||||
getAdverbs : 0 ops/s { iterations: 1, elapsed: 2698 }
|
||||
done in 20359 msecs
|
||||
```
|
||||
|
||||
512-word corpus (as of v0.1.4, with fastIndex) :
|
||||
```
|
||||
getPOS : 18 ops/s { iterations: 1, elapsed: 57 }
|
||||
getNouns : 48 ops/s { iterations: 1, elapsed: 21 }
|
||||
getVerbs : 125 ops/s { iterations: 1, elapsed: 8 }
|
||||
getAdjectives : 111 ops/s { iterations: 1, elapsed: 9 }
|
||||
getAdverbs : 143 ops/s { iterations: 1, elapsed: 7 }
|
||||
done in 1375 msecs
|
||||
```
|
||||
|
||||
220 words are looked-up (less stopwords and duplicates) on a win7/64-bit/dual-core/3GHz. getPOS() is slowest as it searches through all four index files.
|
||||
|
|
@ -1,15 +1,23 @@
|
|||
/**
|
||||
* wordpos-bench.js
|
||||
*
|
||||
* Copyright (c) 2012-2016 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
*
|
||||
* Released under MIT license
|
||||
*/
|
||||
|
||||
var uubench = require('uubench'), // from: https://github.com/moos/uubench
|
||||
var Bench = require('mini-bench'),
|
||||
fs = require('fs'),
|
||||
_ = require('underscore')._,
|
||||
WordPOS = require('../src/wordpos'),
|
||||
wordpos = new WordPOS();
|
||||
|
||||
|
||||
suite = new uubench.Suite({
|
||||
suite = new Bench.Suite({
|
||||
type: 'fixed',
|
||||
iterations: 1,
|
||||
sync: true, // important!
|
||||
async: false, // important!
|
||||
|
||||
start: function(tests){
|
||||
console.log('starting %d tests', tests.length);
|
||||
|
@ -110,6 +118,7 @@ suite.section('--512 words--', function(next){
|
|||
suite.options.iterations = 1;
|
||||
next();
|
||||
});
|
||||
|
||||
suite.bench('getPOS', getPOS);
|
||||
suite.bench('getNouns', getNouns);
|
||||
suite.bench('getVerbs', getVerbs);
|
||||
|
@ -118,6 +127,4 @@ suite.bench('getAdverbs', getAdverbs);
|
|||
suite.bench('lookup', lookup);
|
||||
suite.bench('lookupNoun', lookupNoun);
|
||||
|
||||
|
||||
|
||||
suite.run();
|
||||
|
|
22
package.json
22
package.json
|
@ -1,7 +1,15 @@
|
|||
{
|
||||
"name": "wordpos",
|
||||
"author": "Moos <mooster@42at.com>",
|
||||
"keywords": ["natural", "language", "wordnet", "adjectives", "nouns", "adverbs", "verbs"],
|
||||
"keywords": [
|
||||
"natural",
|
||||
"language",
|
||||
"wordnet",
|
||||
"adjectives",
|
||||
"nouns",
|
||||
"adverbs",
|
||||
"verbs"
|
||||
],
|
||||
"description": "wordpos is a set of part-of-speech utilities for Node.js using the WordNet database.",
|
||||
"version": "1.0.0-RC1",
|
||||
"homepage": "https://github.com/moos/wordpos",
|
||||
|
@ -10,18 +18,18 @@
|
|||
},
|
||||
"bin": "./bin/wordpos-cli.js",
|
||||
"dependencies": {
|
||||
"commander": "^2.0.0",
|
||||
"underscore": ">=1.3.1",
|
||||
"wordnet-db": "latest",
|
||||
"commander": "^2.0.0"
|
||||
"wordnet-db": "latest"
|
||||
},
|
||||
"devDependencies": {
|
||||
"uubench": "git://github.com/moos/uubench.git",
|
||||
"mini-bench": "^1.0.0",
|
||||
"chai": "*",
|
||||
"mocha": "*"
|
||||
},
|
||||
"repository" : {
|
||||
"type" : "git",
|
||||
"url" : "git://github.com/moos/wordpos.git"
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git://github.com/moos/wordpos.git"
|
||||
},
|
||||
"main": "./src/wordpos.js",
|
||||
"scripts": {
|
||||
|
|
|
@ -1,24 +1,41 @@
|
|||
/*!
|
||||
* dataFile.js
|
||||
*
|
||||
* Copyright (c) 2012-2016 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
*
|
||||
* Portions: Copyright (c) 2011, Chris Umbel
|
||||
*
|
||||
* Released under MIT license
|
||||
*/
|
||||
|
||||
var fs = require('fs'),
|
||||
path = require('path'),
|
||||
_ = require('underscore');
|
||||
|
||||
|
||||
// courtesy of natural.WordNet
|
||||
// TODO link
|
||||
/**
|
||||
* parse a single data file line, returning data object
|
||||
*
|
||||
* @param line {string} - a single line from WordNet data file
|
||||
* @returns {object}
|
||||
*
|
||||
* Credit for this routine to https://github.com/NaturalNode/natural
|
||||
*/
|
||||
function lineDataToJSON(line) {
|
||||
var data = line.split('| '),
|
||||
tokens = data[0].split(/\s+/),
|
||||
ptrs = [],
|
||||
wCnt = parseInt(tokens[3], 16),
|
||||
synonyms = [];
|
||||
synonyms = [],
|
||||
i;
|
||||
|
||||
for(var i = 0; i < wCnt; i++) {
|
||||
for(i = 0; i < wCnt; i++) {
|
||||
synonyms.push(tokens[4 + i * 2]);
|
||||
}
|
||||
|
||||
var ptrOffset = (wCnt - 1) * 2 + 6;
|
||||
for(var i = 0; i < parseInt(tokens[ptrOffset], 10); i++) {
|
||||
for(i = 0; i < parseInt(tokens[ptrOffset], 10); i++) {
|
||||
ptrs.push({
|
||||
pointerSymbol: tokens[ptrOffset + 1 + i * 4],
|
||||
synsetOffset: parseInt(tokens[ptrOffset + 2 + i * 4], 10),
|
||||
|
@ -51,10 +68,15 @@ function lineDataToJSON(line) {
|
|||
};
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* read data file at location (bound to a data file).
|
||||
* Reads nominal length and checks for EOL. Continue reading until EOL.
|
||||
*
|
||||
* @param location {Number} - seek location
|
||||
* @param callback {function} - callback function
|
||||
*/
|
||||
function readLocation(location, callback) {
|
||||
//console.log('## read location ', this.fileName, location);
|
||||
|
||||
var
|
||||
file = this,
|
||||
str = '',
|
||||
|
@ -68,8 +90,6 @@ function readLocation(location, callback) {
|
|||
return;
|
||||
}
|
||||
//console.log(' read %d bytes at <%d>', count, location);
|
||||
//console.log(str);
|
||||
|
||||
callback(null, lineDataToJSON(str));
|
||||
});
|
||||
|
||||
|
@ -77,10 +97,9 @@ function readLocation(location, callback) {
|
|||
fs.read(file.fd, buffer, 0, len, pos, function (err, count) {
|
||||
str += buffer.toString('ascii');
|
||||
var eol = str.indexOf('\n');
|
||||
|
||||
//console.log(' -- read %d bytes at <%d>', count, pos, eol);
|
||||
|
||||
if (eol === -1 && len < file.maxLineLength) {
|
||||
// continue reading
|
||||
return readChunk(pos + count, cb);
|
||||
}
|
||||
|
||||
|
@ -90,14 +109,19 @@ function readLocation(location, callback) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* main lookup function
|
||||
*
|
||||
* @param record {object} - record to lookup, obtained from index.find()
|
||||
 * @param callback {function} (optional) - callback function
|
||||
* @returns {Promise}
|
||||
*/
|
||||
function lookup(record, callback) {
|
||||
var results = [],
|
||||
self = this,
|
||||
offsets = record.synsetOffset;
|
||||
|
||||
return new Promise(function(resolve, reject) {
|
||||
//console.log('data lookup', record);
|
||||
|
||||
offsets
|
||||
.map(function (offset) {
|
||||
return _.partial(readLocation.bind(self), offset);
|
||||
|
@ -109,7 +133,6 @@ function lookup(record, callback) {
|
|||
|
||||
function done(lastResult) {
|
||||
closeFile();
|
||||
//console.log('done promise -- ');
|
||||
if (lastResult instanceof Error) {
|
||||
callback && callback(lastResult, []);
|
||||
reject(lastResult);
|
||||
|
@ -129,7 +152,6 @@ function lookup(record, callback) {
|
|||
//console.log(' ... opening', self.filePath);
|
||||
self.fd = fs.openSync(self.filePath, 'r');
|
||||
}
|
||||
|
||||
// ref count so we know when to close the main index file
|
||||
++self.refcount;
|
||||
return Promise.resolve();
|
||||
|
@ -145,13 +167,17 @@ function lookup(record, callback) {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* turn ordinary function into a promising one!
|
||||
*
|
||||
* @param collect {Array} - used to collect results
|
||||
* @returns {Function}
|
||||
*/
|
||||
function promisifyInto(collect) {
|
||||
return function(fn) {
|
||||
return function() {
|
||||
return new Promise(function (resolve, reject) {
|
||||
fn(function (error, result) { // Note callback signature!
|
||||
//console.log('cb from get', arguments)
|
||||
fn(function (error, result) { // Note: callback signature!
|
||||
if (error) {
|
||||
reject(error);
|
||||
}
|
||||
|
@ -166,7 +192,13 @@ function promisifyInto(collect) {
|
|||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* DataFile class
|
||||
*
|
||||
* @param dictPath {string} - path to dict folder
|
||||
* @param name {string} - POS name
|
||||
* @constructor
|
||||
*/
|
||||
var DataFile = function(dictPath, name) {
|
||||
this.dictPath = dictPath;
|
||||
this.fileName = 'data.' + name;
|
||||
|
@ -177,13 +209,23 @@ var DataFile = function(dictPath, name) {
|
|||
this.refcount = 0;
|
||||
};
|
||||
|
||||
// maximum read length at a time
|
||||
/**
|
||||
* maximum read length at a time
|
||||
* @type {Number}
|
||||
*/
|
||||
var MAX_SINGLE_READ_LENGTH = 512;
|
||||
|
||||
//DataFile.prototype.get = get;
|
||||
/**
|
||||
* lookup
|
||||
*/
|
||||
DataFile.prototype.lookup = lookup;
|
||||
|
||||
// e.g.: wc -L data.adv as of v3.1
|
||||
|
||||
/**
|
||||
* maximum line length in each data file - used to optimize reads
|
||||
*
|
||||
* wc -L data.adv as of v3.1
|
||||
*/
|
||||
DataFile.MAX_LINE_LENGTH = {
|
||||
noun: 12972,
|
||||
verb: 7713,
|
||||
|
@ -191,4 +233,5 @@ DataFile.MAX_LINE_LENGTH = {
|
|||
adv: 638
|
||||
};
|
||||
|
||||
|
||||
module.exports = DataFile;
|
||||
|
|
|
@ -6,6 +6,8 @@
|
|||
* Copyright (c) 2012-2016 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
*
|
||||
* Portions: Copyright (c) 2011, Chris Umbel
|
||||
*
|
||||
* Released under MIT license
|
||||
*/
|
||||
|
||||
|
@ -16,6 +18,7 @@ var _ = require('underscore')._,
|
|||
piper = require('./piper'),
|
||||
KEY_LENGTH = 3;
|
||||
|
||||
|
||||
/**
|
||||
* load fast index bucket data
|
||||
*
|
||||
|
@ -112,7 +115,7 @@ function find(search, callback) {
|
|||
// pay the piper
|
||||
this.piper(task, readIndexForKey, args, context, collector);
|
||||
|
||||
function collector(key, index, search, callback, buffer){
|
||||
function collector(_key, index, search, callback, buffer){
|
||||
var lines = buffer.toString().split('\n'),
|
||||
keys = lines.map(function(line){
|
||||
return line.substring(0,line.indexOf(' '));
|
||||
|
@ -136,21 +139,24 @@ function find(search, callback) {
|
|||
* @param word {string} - search word
|
||||
* @param callback {function} - callback function receives result
|
||||
* @returns none
|
||||
*
|
||||
* Credit for this routine to https://github.com/NaturalNode/natural
|
||||
*/
|
||||
function lookup(word, callback) {
|
||||
var self = this;
|
||||
|
||||
return new Promise(function(resolve, reject){
|
||||
self.find(word, function (record) {
|
||||
var indexRecord = null;
|
||||
var indexRecord = null,
|
||||
i;
|
||||
|
||||
if (record.status == 'hit') {
|
||||
var ptrs = [], offsets = [];
|
||||
|
||||
for (var i = 0; i < parseInt(record.tokens[3]); i++)
|
||||
for (i = 0; i < parseInt(record.tokens[3]); i++)
|
||||
ptrs.push(record.tokens[i]);
|
||||
|
||||
for (var i = 0; i < parseInt(record.tokens[2]); i++)
|
||||
for (i = 0; i < parseInt(record.tokens[2]); i++)
|
||||
offsets.push(parseInt(record.tokens[ptrs.length + 6 + i], 10));
|
||||
|
||||
indexRecord = {
|
||||
|
|
|
@ -12,7 +12,6 @@
|
|||
|
||||
var _ = require('underscore')._,
|
||||
util = require('util'),
|
||||
path = require('path'),
|
||||
fs = require('fs');
|
||||
|
||||
/**
|
||||
|
@ -21,7 +20,7 @@ var _ = require('underscore')._,
|
|||
*
|
||||
* @param task {string} - task name unique to method!
|
||||
* @param method {function} - method to execute, gets (args, ... , callback)
|
||||
* @param args {array} - args to pass to method
|
||||
* @param args {Array} - args to pass to method
|
||||
* @param context {object} - other params to remember and sent to callback
|
||||
* @param callback {function} - result callback
|
||||
*/
|
||||
|
|
230
src/rand.js
230
src/rand.js
|
@ -36,10 +36,10 @@ function makeRandX(pos){
|
|||
callback = opts;
|
||||
}
|
||||
|
||||
index.rand(startsWith, count, function(record) {
|
||||
return index.rand(startsWith, count, function (record) {
|
||||
args.push(record, startsWith);
|
||||
profile && args.push(new Date() - start);
|
||||
callback.apply(null, args);
|
||||
callback && callback.apply(null, args);
|
||||
});
|
||||
};
|
||||
}
|
||||
|
@ -50,6 +50,7 @@ function makeRandX(pos){
|
|||
* @param startsWith {string} - get random word(s) that start with this, or ''
|
||||
* @param num {number} - number of words to return
|
||||
* @param callback {function} - callback function, receives words array and startsWith
|
||||
* @returns Promise
|
||||
*/
|
||||
function rand(startsWith, num, callback){
|
||||
var self = this,
|
||||
|
@ -57,102 +58,115 @@ function rand(startsWith, num, callback){
|
|||
trie = this.fastIndex.trie,
|
||||
key, keys;
|
||||
|
||||
//console.log('-- ', startsWith, num, self.fastIndex.indexKeys.length);
|
||||
if (startsWith){
|
||||
key = startsWith.slice(0, KEY_LENGTH);
|
||||
return new Promise(function(resolve, reject) {
|
||||
|
||||
/**
|
||||
* if key is 'a' or 'ab' (<3 chars), search for ALL keys starting with that.
|
||||
*/
|
||||
if (key.length < KEY_LENGTH) {
|
||||
//console.log('-- ', startsWith, num, self.fastIndex.indexKeys.length);
|
||||
if (startsWith) {
|
||||
key = startsWith.slice(0, KEY_LENGTH);
|
||||
|
||||
// calc trie if haven't done so yet
|
||||
if (!trie){
|
||||
trie = new Trie();
|
||||
trie.addStrings(self.fastIndex.indexKeys);
|
||||
this.fastIndex.trie = trie;
|
||||
//console.log(' +++ Trie calc ');
|
||||
/**
|
||||
* if key is 'a' or 'ab' (<3 chars), search for ALL keys starting with that.
|
||||
*/
|
||||
if (key.length < KEY_LENGTH) {
|
||||
|
||||
// calc trie if haven't done so yet
|
||||
if (!trie) {
|
||||
trie = new Trie();
|
||||
trie.addStrings(self.fastIndex.indexKeys);
|
||||
self.fastIndex.trie = trie;
|
||||
//console.log(' +++ Trie calc ');
|
||||
}
|
||||
|
||||
try {
|
||||
// trie throws if not found!!!!!
|
||||
keys = trie.keysWithPrefix(startsWith);
|
||||
} catch (e) {
|
||||
keys = [];
|
||||
}
|
||||
|
||||
// read all keys then select random word.
|
||||
// May be large disk read!
|
||||
key = keys[0];
|
||||
nextKey = _.last(keys);
|
||||
}
|
||||
|
||||
try{
|
||||
// trie throws if not found!!!!!
|
||||
keys = trie.keysWithPrefix( startsWith );
|
||||
} catch(e){
|
||||
keys = [];
|
||||
if (!key || !(key in self.fastIndex.offsets)) {
|
||||
callback && callback([], startsWith);
|
||||
resolve([]);
|
||||
}
|
||||
|
||||
// read all keys then select random word.
|
||||
// May be large disk read!
|
||||
key = keys[0];
|
||||
nextKey = _.last(keys);
|
||||
} else {
|
||||
// no startWith given - random select among keys
|
||||
keys = _.sample(self.fastIndex.indexKeys, num);
|
||||
|
||||
// if num > 1, run each key independently and collect results
|
||||
if (num > 1) {
|
||||
var results = [], ii = 0;
|
||||
_(keys).each(function (startsWith) {
|
||||
self.rand(startsWith, 1, function (result) {
|
||||
results.push(result[0]);
|
||||
if (++ii == num) {
|
||||
callback && callback(results, '');
|
||||
resolve(results);
|
||||
}
|
||||
});
|
||||
});
|
||||
return;
|
||||
}
|
||||
key = keys;
|
||||
}
|
||||
|
||||
if (!key || !(key in self.fastIndex.offsets)) return process.nextTick(function(){ callback([], startsWith) });
|
||||
// prepare the piper
|
||||
var args = [key, nextKey, self],
|
||||
task = 'rand:' + key + nextKey,
|
||||
context = [startsWith, num, callback]; // last arg MUST be callback
|
||||
|
||||
} else {
|
||||
// no startWith given - random select among keys
|
||||
keys = _.sample( this.fastIndex.indexKeys, num );
|
||||
// pay the piper
|
||||
self.piper(task, IndexFile.readIndexBetweenKeys, args, context, collector);
|
||||
|
||||
// if num > 1, run each key independently and collect results
|
||||
if (num > 1){
|
||||
var results = [], ii = 0;
|
||||
_(keys).each(function(startsWith){
|
||||
self.rand(startsWith, 1, function(result){
|
||||
results.push(result[0]);
|
||||
if (++ii == num) {
|
||||
callback(results, '');
|
||||
}
|
||||
})
|
||||
});
|
||||
return;
|
||||
}
|
||||
key = keys;
|
||||
}
|
||||
// console.log(' using key', key, nextKey);
|
||||
function collector(key, nextKey, index, startsWith, num, callback, buffer) {
|
||||
var lines = buffer.toString().split('\n'),
|
||||
matches = lines.map(function (line) {
|
||||
return line.substring(0, line.indexOf(' '));
|
||||
});
|
||||
//console.log(' got lines for key ', key, lines.length);
|
||||
|
||||
// prepare the piper
|
||||
var args = [key, nextKey, this],
|
||||
task = 'rand:' + key + nextKey,
|
||||
context = [startsWith, num, callback]; // last arg MUST be callback
|
||||
// we got bunch of matches for key - now search within for startsWith
|
||||
if (startsWith !== key) {
|
||||
// binary search for startsWith within set of matches
|
||||
var ind = _.sortedIndex(matches, startsWith);
|
||||
if (ind >= lines.length || matches[ind].indexOf(startsWith) === -1) {
|
||||
callback && callback([], startsWith);
|
||||
resolve([]);
|
||||
return;
|
||||
}
|
||||
|
||||
// pay the piper
|
||||
this.piper(task, IndexFile.readIndexBetweenKeys, args, context, collector);
|
||||
|
||||
function collector(key, nextKey, index, startsWith, num, callback, buffer){
|
||||
var lines = buffer.toString().split('\n'),
|
||||
matches = lines.map(function(line){
|
||||
return line.substring(0,line.indexOf(' '));
|
||||
});
|
||||
|
||||
//console.log(' got lines for key ', key, lines.length);
|
||||
|
||||
// we got bunch of matches for key - now search within for startsWith
|
||||
if (startsWith !== key){
|
||||
|
||||
// binary search for startsWith within set of matches
|
||||
var ind = _.sortedIndex(matches, startsWith);
|
||||
if (ind >= lines.length || matches[ind].indexOf(startsWith) === -1){
|
||||
return callback([], startsWith);
|
||||
var trie = new Trie();
|
||||
trie.addStrings(matches);
|
||||
//console.log('Trie > ', trie.matchesWithPrefix( startsWith ));
|
||||
matches = trie.keysWithPrefix(startsWith);
|
||||
}
|
||||
|
||||
// FIXME --- using Trie's new keysWithPrefix not yet pushed to npm.
|
||||
// see https://github.com/NaturalNode/natural/commit/5fc86c42e41c1314bfc6a37384dd14acf5f4bb7b
|
||||
|
||||
var trie = new Trie();
|
||||
|
||||
trie.addStrings(matches);
|
||||
//console.log('Trie > ', trie.matchesWithPrefix( startsWith ));
|
||||
|
||||
matches = trie.keysWithPrefix( startsWith );
|
||||
var words = _.sample(matches, num);
|
||||
callback && callback(words, startsWith);
|
||||
resolve(words);
|
||||
}
|
||||
|
||||
var words = _.sample(matches, num);
|
||||
callback(words, startsWith);
|
||||
}
|
||||
}); // Promise
|
||||
}
|
||||
|
||||
// relative weight of each POS word count (DB 3.1 numbers)
|
||||
var POS_factor = {
|
||||
Noun: 26,
|
||||
Verb: 3,
|
||||
Adjective: 5,
|
||||
Adverb: 1,
|
||||
Total: 37
|
||||
};
|
||||
|
||||
/**
|
||||
* rand() - for all Index files
|
||||
 * @returns {Promise}
|
||||
*/
|
||||
function randAll(opts, callback) {
|
||||
var
|
||||
|
@ -163,12 +177,7 @@ function randAll(opts, callback) {
|
|||
count = opts && opts.count || 1,
|
||||
args = [null, startsWith],
|
||||
parts = 'Noun Verb Adjective Adverb'.split(' '),
|
||||
self = this,
|
||||
done = function(){
|
||||
profile && (args.push(new Date() - start));
|
||||
args[0] = results;
|
||||
callback.apply(null, args)
|
||||
};
|
||||
self = this;
|
||||
|
||||
if (typeof opts === 'function') {
|
||||
callback = opts;
|
||||
|
@ -176,36 +185,45 @@ function randAll(opts, callback) {
|
|||
opts = _.clone(opts);
|
||||
}
|
||||
|
||||
// TODO -- or loop count times each time getting 1 from random part!!
|
||||
// slower but more random.
|
||||
|
||||
// select at random a part to look at
|
||||
var doParts = _.sample(parts, parts.length);
|
||||
tryPart();
|
||||
return new Promise(function(resolve, reject) {
|
||||
// select at random a POS to look at
|
||||
var doParts = _.sample(parts, parts.length);
|
||||
tryPart();
|
||||
|
||||
function tryPart(){
|
||||
var rand = 'rand' + doParts.pop();
|
||||
self[ rand ](opts, partCallback);
|
||||
}
|
||||
function tryPart() {
|
||||
var part = doParts.pop(),
|
||||
rand = 'rand' + part,
|
||||
factor = POS_factor[part],
|
||||
weight = factor / POS_factor.Total;
|
||||
|
||||
function partCallback(result){
|
||||
if (result) {
|
||||
results = _.uniq(results.concat(result)); // make sure it's unique!
|
||||
// pick count according to relative weight
|
||||
opts.count = Math.ceil(count * weight * 1.1); // guard against dupes
|
||||
self[rand](opts, partCallback);
|
||||
}
|
||||
|
||||
//console.log(result);
|
||||
if (results.length < count && doParts.length) {
|
||||
// reduce count for next part -- NO! may get duplicates
|
||||
// opts.count = count - results.length;
|
||||
return tryPart();
|
||||
function partCallback(result) {
|
||||
if (result) {
|
||||
results = _.uniq(results.concat(result)); // make sure it's unique!
|
||||
}
|
||||
|
||||
if (results.length < count && doParts.length) {
|
||||
return tryPart();
|
||||
}
|
||||
|
||||
// final random and trim excess
|
||||
results = _.sample(results, count);
|
||||
done();
|
||||
}
|
||||
|
||||
// trim excess
|
||||
if (results.length > count) {
|
||||
results.length = count;
|
||||
function done() {
|
||||
profile && (args.push(new Date() - start));
|
||||
args[0] = results;
|
||||
callback && callback.apply(null, args);
|
||||
resolve(results);
|
||||
}
|
||||
done();
|
||||
}
|
||||
|
||||
}); // Promise
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/**
|
||||
/*!
|
||||
* wordpos.js
|
||||
*
|
||||
* Node.js part-of-speech utilities using WordNet database.
|
||||
|
@ -149,11 +149,11 @@ function get(isFn) {
|
|||
};
|
||||
}
|
||||
|
||||
// setImmediate executes callback AFTER promise handlers.
|
||||
// Without it, exceptions in callback may be caught by Promise.
|
||||
function nextTick(fn, args) {
|
||||
if (fn) {
|
||||
setImmediate(function(){
|
||||
fn.apply(null, args);
|
||||
});
|
||||
fn.apply(null, args);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -216,7 +216,7 @@ var wordposProto = WordPOS.prototype;
|
|||
* lookup a word in all indexes
|
||||
*
|
||||
* @param word {string} - search word
|
||||
* @param callback {Functino} (optional) - callback with (results, word) signature
|
||||
* @param callback {Function} (optional) - callback with (results, word) signature
|
||||
* @returns {Promise}
|
||||
*/
|
||||
wordposProto.lookup = function(word, callback) {
|
||||
|
@ -362,7 +362,17 @@ wordposProto.getVerbs = get('isVerb');
|
|||
wordposProto.parse = prepText;
|
||||
|
||||
|
||||
/**
|
||||
* access to WordNet DB
|
||||
* @type {object}
|
||||
*/
|
||||
WordPOS.WNdb = WNdb;
|
||||
|
||||
/**
|
||||
* access to stopwords
|
||||
* @type {Array}
|
||||
*/
|
||||
WordPOS.stopwords = stopwords;
|
||||
|
||||
|
||||
module.exports = WordPOS;
|
||||
|
|
40
test.js
40
test.js
|
@ -1,40 +0,0 @@
|
|||
var
|
||||
WordPOS = require('./src/wordpos'),
|
||||
wordpos = new WordPOS({profile: true}),
|
||||
getAllPOS = wordpos.getPOS
|
||||
;
|
||||
|
||||
|
||||
console.log(1111,
|
||||
wordpos.lookup('foot')
|
||||
//wordpos.getPOS('was doing the work the ashtray closer Also known as inject and foldl, reduce boils down a list of values into a single value', console.log
|
||||
.then(function(result){
|
||||
console.log(' xxx - ', result)
|
||||
})
|
||||
.catch(function(result){
|
||||
console.log(' error xxx - ', result)
|
||||
}));
|
||||
|
||||
//wordpos.rand({count: 3},console.log)
|
||||
|
||||
return;
|
||||
|
||||
|
||||
//getAllPOS('se', console.log)
|
||||
wordpos.getPOS('se', console.log)
|
||||
|
||||
|
||||
|
||||
|
||||
a=wordpos.getPOS('se', function(res) {
|
||||
console.log(1, res)
|
||||
wordpos.getPOS('sea hey who work', function(res) {
|
||||
console.log(2, res)
|
||||
wordpos.getPOS('sear done work ', function(res) {
|
||||
console.log(3, res)
|
||||
console.log('all done');
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
console.log(a)
|
|
@ -1,7 +1,7 @@
|
|||
/**
|
||||
* validate_test.js
|
||||
*
|
||||
* Run validate on all four main index files
|
||||
* Run validate on all four main index files
|
||||
*
|
||||
* Copyright (c) 2012-2016 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
/**
|
||||
* wordpos_spec.js
|
||||
* wordpos_test.js
|
||||
*
|
||||
* test file for main wordpos functionality
|
||||
* test file for main wordpos functionality
|
||||
*
|
||||
* Usage:
|
||||
* npm install mocha -g
|
||||
* mocha wordpos_spec.js --verbose
|
||||
* mocha wordpos_test.js
|
||||
*
|
||||
* or
|
||||
*
|
||||
|
@ -388,4 +388,29 @@ describe('Promise pattern', function() {
|
|||
assert.equal(result, true);
|
||||
});
|
||||
});
|
||||
|
||||
it('rand()', function () {
|
||||
return wordpos.rand({count: 5}).then(function (result) {
|
||||
assert.equal(result.length, 5);
|
||||
});
|
||||
});
|
||||
|
||||
it('randNoun()', function () {
|
||||
return wordpos.randNoun().then(function (result) {
|
||||
assert.equal(result.length, 1);
|
||||
});
|
||||
});
|
||||
|
||||
it('randNoun({count: 3})', function () {
|
||||
return wordpos.randNoun({count: 3}).then(function (result) {
|
||||
assert.equal(result.length, 3);
|
||||
});
|
||||
});
|
||||
|
||||
it('randNoun({startsWith: "foo"})', function () {
|
||||
return wordpos.randNoun({startsWith: 'foo'}).then(function (result) {
|
||||
assert.equal(result.length, 1);
|
||||
assert.equal(result[0].indexOf('foo'), 0);
|
||||
});
|
||||
});
|
||||
});
|
|
@ -1,7 +1,7 @@
|
|||
/**
|
||||
* stat.js
|
||||
*
|
||||
* generate fast index for WordNet index files
|
||||
* generate fast index for WordNet index files
|
||||
*
|
||||
* Usage:
|
||||
* node stat [--no-stats] index.adv ...
|
||||
|
@ -40,7 +40,7 @@
|
|||
* read index file between the two offsets
|
||||
* binary search read data O(log avg)
|
||||
*
|
||||
* Copyright (c) 2012 mooster@42at.com
|
||||
* Copyright (c) 2012-2016 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
*
|
||||
* Released under MIT license
|
||||
|
@ -48,7 +48,7 @@
|
|||
var
|
||||
WNdb = require('../src/wordpos').WNdb,
|
||||
util = require('util'),
|
||||
BufferedReader = require ("./buffered-reader"),
|
||||
BufferedReader = require ('./buffered-reader'),
|
||||
_ = require('underscore')._,
|
||||
fs = require('fs'),
|
||||
path = require('path'),
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
/**
|
||||
* validate.js
|
||||
*
|
||||
* read each index.<pos> file, and look up using wordpos and confirm find all words
|
||||
* read each index.<pos> file, and look up using wordpos and confirm find all words
|
||||
*
|
||||
* Usage:
|
||||
* node validate index.adv
|
||||
*
|
||||
* Copyright (c) 2012 mooster@42at.com
|
||||
* Copyright (c) 2012-2016 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
*
|
||||
* Released under MIT license
|
||||
|
|
Loading…
Reference in New Issue