Merge pull request #28 from moos/browser

Browser support
This commit is contained in:
Moos 2019-05-31 05:56:11 -07:00 committed by GitHub
commit 03c0b1aa54
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
30 changed files with 3658 additions and 925 deletions

36
.babelrc Normal file

@ -0,0 +1,36 @@
{
"ignore": [
"./test/dict"
],
"presets": [
"@babel/preset-env"
],
"plugins": [
"@babel/plugin-proposal-class-properties",
"babel-plugin-dynamic-import-node",
"@babel/plugin-syntax-dynamic-import",
"@babel/plugin-syntax-import-meta",
"@babel/plugin-proposal-json-strings",
[
"@babel/plugin-proposal-decorators",
{
"legacy": true
}
],
"@babel/plugin-proposal-function-sent",
"@babel/plugin-proposal-export-namespace-from",
"@babel/plugin-proposal-numeric-separator",
"@babel/plugin-proposal-throw-expressions",
"@babel/plugin-proposal-export-default-from",
"@babel/plugin-proposal-logical-assignment-operators",
"@babel/plugin-proposal-optional-chaining",
[
"@babel/plugin-proposal-pipeline-operator",
{
"proposal": "minimal"
}
],
"@babel/plugin-proposal-nullish-coalescing-operator",
"@babel/plugin-proposal-do-expressions"
]
}

5
.gitignore vendored

@ -1,4 +1,7 @@
dict
node_modules node_modules
.idea .idea
*.iml *.iml
.cache
build
dict
dist

.travis.yml

@ -1,6 +1,7 @@
language: node_js language: node_js
node_js: node_js:
- '12'
- '11'
- '10' - '10'
- '8' - '8'
- '6' - '6'
- '4'

101
README.md

@ -4,12 +4,22 @@ wordpos
[![NPM version](https://img.shields.io/npm/v/wordpos.svg)](https://www.npmjs.com/package/wordpos) [![NPM version](https://img.shields.io/npm/v/wordpos.svg)](https://www.npmjs.com/package/wordpos)
[![Build Status](https://img.shields.io/travis/moos/wordpos/master.svg)](https://travis-ci.org/moos/wordpos) [![Build Status](https://img.shields.io/travis/moos/wordpos/master.svg)](https://travis-ci.org/moos/wordpos)
wordpos is a set of *fast* part-of-speech (POS) utilities for Node.js using fast lookup in the WordNet database. wordpos is a set of *fast* part-of-speech (POS) utilities for Node.js **and** browser using fast lookup in the WordNet database.
Version 1.x is a major update with no direct dependence on [natural's](https://github.com/NaturalNode/natural#wordnet) WordNet module, with support for [Promises](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise), and roughly 5x speed improvement over previous version. Version 1.x is a major update with no direct dependence on [natural's](https://github.com/NaturalNode/natural#wordnet) WordNet module, with support for [Promises](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise), and roughly 5x speed improvement over previous version.
**CAUTION** The WordNet database [wordnet-db](https://github.com/moos/wordnet-db) comprises [155,287 words](https://wordnet.princeton.edu/documentation/wnstats7wn) (3.0 numbers) which uncompress to over **30 MB** of data in several *un*[browserify](https://github.com/substack/node-browserify)-able files. It is *not* meant for the browser environment. > ~~**CAUTION** The WordNet database [wordnet-db](https://github.com/moos/wordnet-db) comprises [155,287 words](https://wordnet.princeton.edu/documentation/wnstats7wn) (3.0 numbers) which uncompress to over **30 MB** of data in several *un*[browserify](https://github.com/substack/node-browserify)-able files. It is *not* meant for the browser environment.~~
:zap: v2.x can work in browsers -- to try it out `npm i wordpos@beta` or [see it in action](https://moos.github.io/wordpos). See below for usage.
## Installation
npm install -g wordpos
To run tests (or just: npm test):
npm install -g mocha
mocha test
## Quick usage ## Quick usage
@ -33,7 +43,7 @@ Command-line: (see [CLI](bin) for full command list)
```bash ```bash
$ wordpos def git $ wordpos def git
git git
n: a person who is deemed to be despicable or contemptible; "only a rotter would do that"; "kill the rat"; "throw the bum out"; "you cowardly little pukes!"; "the British call a contemptible person a `git'" n: a person who is deemed to be despicable or contemptible; "only a rotter would do that"; "kill the rat"; "throw the bum out"; "you cowardly little pukes!"; "the British call a contemptible person a 'git'"
$ wordpos def git | wordpos get --adj $ wordpos def git | wordpos get --adj
# Adjective 6: # Adjective 6:
@ -45,16 +55,8 @@ little
British British
``` ```
## Installation
npm install -g wordpos ## Options
To run test: (or just: npm test)
npm install -g mocha
mocha test
### Options
```js ```js
WordPOS.defaults = { WordPOS.defaults = {
@ -68,7 +70,30 @@ WordPOS.defaults = {
* if array, stopwords to exclude, eg, ['all','of','this',...] * if array, stopwords to exclude, eg, ['all','of','this',...]
* if false, do not filter any stopwords. * if false, do not filter any stopwords.
*/ */
stopwords: true stopwords: true,
/**
* preload files (in browser only)
* true - preload all POS
* false - do not preload any POS
* 'a' - preload adj
* ['a','v'] - preload adj & verb
* @type {boolean|string|Array}
*/
preload: false,
/**
* include data files in preload
* @type {boolean}
*/
includeData: false, // WIP
/**
* set to true to enable debug logging
* @type {boolean}
*/
debug: false
}; };
``` ```
To override, pass an options hash to the constructor. With the `profile` option, most callbacks receive a last argument that is the execution time in msec of the call. To override, pass an options hash to the constructor. With the `profile` option, most callbacks receive a last argument that is the execution time in msec of the call.
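For example, a quick sketch (values are illustrative; with `profile` on, the elapsed time arrives as the last callback argument):

```js
const WordPOS = require('wordpos');
const wordpos = new WordPOS({ profile: true, stopwords: false });

wordpos.isNoun('engine', (result, word, time) => {
  console.log(result, word, time); // true 'engine' <msec>
});
```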
@ -224,7 +249,7 @@ wordpos.rand({starsWith: 'zzz'}, console.log)
// [] 'zzz' // [] 'zzz'
``` ```
**Note on performance**: random lookups could involve heavy disk reads. It is better to use the `count` option to get words in batches. This may benefit from the cached reads of similarly keyed entries as well as shared open/close of the index files. **Note on performance**: (node only) random lookups could involve heavy disk reads. It is better to use the `count` option to get words in batches. This may benefit from the cached reads of similarly keyed entries as well as shared open/close of the index files.
Getting random POS (`randNoun()`, etc.) is generally faster than `rand()`, which may look at multiple POS files until `count` requirement is met. Getting random POS (`randNoun()`, etc.) is generally faster than `rand()`, which may look at multiple POS files until `count` requirement is met.
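For instance, a sketch of batched random lookups (counts and prefix are illustrative):

```js
// one batched call instead of many single random lookups
wordpos.rand({count: 5}, words => console.log(words));

// random nouns sharing a prefix; single-POS lookups are generally faster than rand()
wordpos.randNoun({startsWith: 'spo', count: 3}, (words, prefix) => console.log(prefix, words));
```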
@ -269,8 +294,43 @@ wordpos.isVerb('fish', console.log)
``` ```
Note that callback receives full arguments (including profile, if enabled), while the Promise receives only the result of the call. Also, beware that exceptions in the _callback_ will result in the Promise being _rejected_ and caught by `catch()`, if provided. Note that callback receives full arguments (including profile, if enabled), while the Promise receives only the result of the call. Also, beware that exceptions in the _callback_ will result in the Promise being _rejected_ and caught by `catch()`, if provided.
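A quick sketch contrasting the two styles:

```js
// callback: gets full arguments (result, word, and time in msec when profile is on)
wordpos.isVerb('fish', (result, word, time) => console.log(result, word, time));

// promise: resolves with the result only; a throw inside a callback rejects the promise
wordpos.isVerb('fish')
  .then(result => console.log(result)) // true
  .catch(err => console.error(err));
```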
## Running inside the browser
v2.0 introduces the capability of running wordpos in the browser. The dictionary files are optimized for fast access (lookup by lemma), but they must be fetched, parsed and loaded into browser memory. The files are loaded on demand (unless the `preload: true` option is given).
The dict files can be served locally or from a CDN (see [samples/cdn](samples/cdn/) for code, or [see it in action](https://moos.github.io/wordpos)). Include the following scripts in your `index.html`:
```html
<script src="wordpos/dist/wordpos.min.js"></script>
<script>
let wordpos = new WordPOS({
// preload: true,
dictPath: '/wordpos/dict',
profile: true
});
wordpos.getAdverbs('this is is lately a likely tricky business this is')
.then(res => {
console.log(res); // ["lately", "likely"]
});
</script>
```
The above assumes wordpos is installed in the `./wordpos` directory. `./wordpos/dict` holds the WordNet index and data files generated for the web by a postinstall script.
See [samples/self-hosted](samples/self-hosted/).
To run the samples locally, install [parcel](https://github.com/parcel-bundler/parcel) if you don't already have it (`npm i -g parcel`), then:
```bash
$ npm run start-self
Server running at http://localhost:1234
...
$ npm run start-cdn
Server running at http://localhost:1234
...
```
and open your browser to that URL.
## Fast Index (node)
Version 0.1.4 introduces `fastIndex` option. This uses a secondary index on the index files and is much faster. It is on by default. Secondary index files are generated at install time and placed in the same directory as WNdb.path. Details can be found in tools/stat.js. Version 0.1.4 introduces `fastIndex` option. This uses a secondary index on the index files and is much faster. It is on by default. Secondary index files are generated at install time and placed in the same directory as WNdb.path. Details can be found in tools/stat.js.
@ -287,8 +347,17 @@ For CLI usage and examples, see [bin/README](bin).
See [bench/README](bench). See [bench/README](bench).
## TODO
- implement `includeData` option for preload
## Changes ## Changes
**2.0.0**
- Support for running wordpos in browser (no breaking change for node environment)
- Dropped support for node 4.x.
1.2.0 1.2.0
- Fix `new Buffer()` deprecation warning. - Fix `new Buffer()` deprecation warning.
- Fix npm audit vulnerabilities - Fix npm audit vulnerabilities
@ -347,4 +416,4 @@ License
(The MIT License) (The MIT License)
Copyright (c) 2012, 2014, 2016 mooster@42at.com Copyright (c) 2012-2019 mooster@42at.com

53
docs/cdn/index.html Normal file

@ -0,0 +1,53 @@
<!doctype html>
<html>
<head>
<meta http-equiv="Content-Security-Policy" content="script-src https: http: 'unsafe-inline' 'unsafe-eval'">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/github.min.css" />
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/highlight.min.js"></script>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/languages/javascript.min.js"></script>
<!-- cdn source -->
<script src="https://unpkg.com/wordpos@2.0.0-beta/dist/wordpos.min.js"></script>
<script>
let wordpos = window.wordpos = new WordPOS({
// preload: true,
dictPath: 'https://unpkg.com/wordpos@2.0.0-beta/dict',
profile: true,
// stopwords: false
});
</script>
<script src="../main.js" name="main"></script>
<style>
pre {
padding: 2em;
display: block;
}
</style>
</head>
<body>
<h1>CDN WordPOS sample</h1>
Open console to see results.
<pre><code> </code></pre>
<script>
var el = document.querySelector('code');
if (fetch) {
fetch('../main.js')
.then(res => res.text())
.then(txt => {
el.innerText = txt;
window.hljs && hljs.initHighlightingOnLoad();
});
} else {
el.innerHTML = 'Open <a href="../main.js">main.js</a>.';
}
</script>
</body>
</html>

32
docs/main.js Normal file

@ -0,0 +1,32 @@
let assertLikely = (r) => {
console.assert(r.def === 'with considerable certainty');
console.assert(r.pos === 'r');
console.assert(r.synsetOffset === '00139421');
};
console.group('Likely');
wordpos.isAdverb('likely').then(res => console.assert(res));
wordpos.isAdverb('likely', (res, ...profile) => console.log('callback with profile', res, profile));
wordpos.getAdverbs('this is is lately a likely tricky business this is')
.then(res => {
let expect = {lately: 1, likely: 1};
console.log('getAdverbs:', res);
console.assert(res[0] in expect); // NOTE: order is NOT guaranteed!
console.assert(res[1] in expect);
});
wordpos.lookupAdverb('likely')
.then(res => {
console.log('lookupAdverb:', res[0]);
assertLikely(res[0]);
});
// wordpos.lookup('likely').then(res, console.log('lookup ===', res))
wordpos.seek('00139421', 'r')
.then(res => {
console.log('seek:', res);
assertLikely(res);
});
setTimeout(() => console.groupEnd('Likely'), 1000);

docs/self-hosted/index.html

@ -0,0 +1,52 @@
<!doctype html>
<html>
<head>
<meta http-equiv="Content-Security-Policy" content="script-src https: http: 'unsafe-inline' 'unsafe-eval'">
<title>Wordpos in the browser</title>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/github.min.css" />
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/highlight.min.js"></script>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/languages/javascript.min.js"></script>
<script src="/dist/wordpos.min.js"></script>
<script>
let wordpos = window.wordpos = new WordPOS({
// preload: true,
dictPath: '/samples/self-hosted/dict',
profile: true,
// stopwords: false
});
</script>
<script src="../main.js" name="main"></script>
<style>
pre {
padding: 2em;
display: block;
}
</style>
</head>
<body>
<h1>Self-hosted WordPOS sample</h1>
Open console to see results.
<pre><code> </code></pre>
<script>
var el = document.querySelector('code');
if (fetch) {
fetch('../main.js')
.then(res => res.text())
.then(txt => {
el.innerText = txt;
window.hljs && hljs.initHighlightingOnLoad();
});
} else {
el.innerHTML = 'Open <a href="../main.js">main.js</a>.';
}
</script>
</body>
</html>

1651
package-lock.json generated

File diff suppressed because it is too large.

package.json

@ -1,7 +1,7 @@
{ {
"name": "wordpos", "name": "wordpos",
"version": "1.2.0", "version": "2.0.0-beta.2",
"description": "wordpos is a set of part-of-speech utilities for Node.js using the WordNet database.", "description": "wordpos is a set of part-of-speech utilities for Node.js & browser using the WordNet database.",
"author": "Moos <mooster@42at.com>", "author": "Moos <mooster@42at.com>",
"keywords": [ "keywords": [
"natural", "natural",
@ -14,16 +14,47 @@
], ],
"homepage": "https://github.com/moos/wordpos", "homepage": "https://github.com/moos/wordpos",
"engines": { "engines": {
"node": ">=4" "node": ">=6"
}, },
"files": ["bench","bin","lib","src","test","tools"], "files": [
"bench",
"bin",
"dict",
"dist",
"lib",
"src",
"scripts",
"test",
"!test/dict",
"tools"
],
"bin": "./bin/wordpos-cli.js", "bin": "./bin/wordpos-cli.js",
"dependencies": { "dependencies": {
"commander": "^2.0.0", "commander": "^2.0.0",
"symlink-dir": "1.1.3",
"underscore": ">=1.3.1", "underscore": ">=1.3.1",
"wordnet-db": "^3.1.6" "wordnet-db": "^3.1.11"
}, },
"devDependencies": { "devDependencies": {
"@babel/core": "^7.0.0",
"@babel/plugin-proposal-class-properties": "^7.0.0",
"@babel/plugin-proposal-decorators": "^7.0.0",
"@babel/plugin-proposal-do-expressions": "^7.0.0",
"@babel/plugin-proposal-export-default-from": "^7.0.0",
"@babel/plugin-proposal-export-namespace-from": "^7.0.0",
"@babel/plugin-proposal-function-sent": "^7.0.0",
"@babel/plugin-proposal-json-strings": "^7.0.0",
"@babel/plugin-proposal-logical-assignment-operators": "^7.0.0",
"@babel/plugin-proposal-nullish-coalescing-operator": "^7.0.0",
"@babel/plugin-proposal-numeric-separator": "^7.0.0",
"@babel/plugin-proposal-optional-chaining": "^7.0.0",
"@babel/plugin-proposal-pipeline-operator": "^7.0.0",
"@babel/plugin-proposal-throw-expressions": "^7.0.0",
"@babel/plugin-syntax-dynamic-import": "^7.0.0",
"@babel/plugin-syntax-import-meta": "^7.0.0",
"@babel/preset-env": "^7.0.0",
"@babel/register": "^7.0.0",
"babel-plugin-dynamic-import-node": "^2.2.0",
"chai": "^4.0.2", "chai": "^4.0.2",
"mini-bench": "^1.0.0", "mini-bench": "^1.0.0",
"mocha": "^5.2.0" "mocha": "^5.2.0"
@ -32,10 +63,23 @@
"type": "git", "type": "git",
"url": "git://github.com/moos/wordpos.git" "url": "git://github.com/moos/wordpos.git"
}, },
"main": "./src/wordpos.js", "main": "./src/node/index.js",
"browser": "./src/browser/index.js",
"scripts": { "scripts": {
"postinstall": "node tools/stat.js --no-stats index.adv index.adj index.verb index.noun", "postinstall": "npm run postinstall-web && npm run postinstall-node",
"test": "mocha test" "postinstall-node": "node tools/stat.js --no-stats index.adv index.adj index.verb index.noun",
"postinstall-web": "node scripts/makeJsonDict.js index data",
"build": "parcel build --detailed-report -d dist -o wordpos.min.js --global WordPOS -t browser src/browser/index.js",
"postbuild": "sed -i 's/ES6_IMPORT/import/' dist/wordpos.min.js",
"test": "npm run test-node && npm run test-browser",
"test-node": "mocha test",
"test-browser": "mocha test/wordpos_test --require @babel/register",
"prestart": "symlink-dir dict samples/self-hosted/dict",
"start": "npm run build && http-server",
"prestart-dev": "rm -rf build && mkdir build && symlink-dir dict build/dict && cp samples/main.js build/main.txt",
"start-dev": "npm run start-self -- -d build",
"start-self": "parcel samples/self-hosted/index.html",
"start-cdn": "parcel samples/cdn/index.html"
}, },
"license": "MIT" "license": "MIT"
} }

53
samples/cdn/index.html Normal file

@ -0,0 +1,53 @@
<!doctype html>
<html>
<head>
<meta http-equiv="Content-Security-Policy" content="script-src https: http: 'unsafe-inline' 'unsafe-eval'">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/github.min.css" />
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/highlight.min.js"></script>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/languages/javascript.min.js"></script>
<!-- cdn source -->
<script src="https://unpkg.com/wordpos@2.0.0-beta/dist/wordpos.min.js"></script>
<script>
let wordpos = window.wordpos = new WordPOS({
// preload: true,
dictPath: 'https://unpkg.com/wordpos@2.0.0-beta/dict',
profile: true,
// stopwords: false
});
</script>
<script src="../main.js" name="main"></script>
<style>
pre {
padding: 2em;
display: block;
}
</style>
</head>
<body>
<h1>CDN WordPOS sample</h1>
Open console to see results.
<pre><code> </code></pre>
<script>
var el = document.querySelector('code');
if (fetch) {
fetch('../main.js')
.then(res => res.text())
.then(txt => {
el.innerText = txt;
window.hljs && hljs.initHighlightingOnLoad();
});
} else {
el.innerHTML = 'Open <a href="../main.js">main.js</a>.';
}
</script>
</body>
</html>

32
samples/main.js Normal file

@ -0,0 +1,32 @@
let assertLikely = (r) => {
console.assert(r.def === 'with considerable certainty');
console.assert(r.pos === 'r');
console.assert(r.synsetOffset === '00139421');
};
console.group('Likely');
wordpos.isAdverb('likely').then(res => console.assert(res));
wordpos.isAdverb('likely', (res, ...profile) => console.log('callback with profile', res, profile));
wordpos.getAdverbs('this is is lately a likely tricky business this is')
.then(res => {
let expect = {lately: 1, likely: 1};
console.log('getAdverbs:', res);
console.assert(res[0] in expect); // NOTE: order is NOT guaranteed!
console.assert(res[1] in expect);
});
wordpos.lookupAdverb('likely')
.then(res => {
console.log('lookupAdverb:', res[0]);
assertLikely(res[0]);
});
// wordpos.lookup('likely').then(res, console.log('lookup ===', res))
wordpos.seek('00139421', 'r')
.then(res => {
console.log('seek:', res);
assertLikely(res);
});
setTimeout(() => console.groupEnd('Likely'), 1000);

samples/self-hosted/index.html

@ -0,0 +1,52 @@
<!doctype html>
<html>
<head>
<meta http-equiv="Content-Security-Policy" content="script-src https: http: 'unsafe-inline' 'unsafe-eval'">
<title>Wordpos in the browser</title>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/github.min.css" />
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/highlight.min.js"></script>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/languages/javascript.min.js"></script>
<script src="/dist/wordpos.min.js"></script>
<script>
let wordpos = window.wordpos = new WordPOS({
// preload: true,
dictPath: '/samples/self-hosted/dict',
profile: true,
// stopwords: false
});
</script>
<script src="../main.js" name="main"></script>
<style>
pre {
padding: 2em;
display: block;
}
</style>
</head>
<body>
<h1>Self-hosted WordPOS sample</h1>
Open console to see results.
<pre><code> </code></pre>
<script>
var el = document.querySelector('code');
if (fetch) {
fetch('../main.js')
.then(res => res.text())
.then(txt => {
el.innerText = txt;
window.hljs && hljs.initHighlightingOnLoad();
});
} else {
el.innerHTML = 'Open <a href="../main.js">main.js</a>.';
}
</script>
</body>
</html>

98
scripts/makeJsonDict.js Normal file

@ -0,0 +1,98 @@
#!/usr/bin/env node
/**
* takes original WordNet index & data files and converts to
* exported JSON format with lemma as the key.
*/
let fs = require('fs');
let path = require('path');
let outPath = './dict'; // browser-use files
let testPath = './test/dict'; // mocha files in CJS format
let posExt = ['adj', 'adv', 'noun', 'verb'];
let dictRoot = require('wordnet-db').path; // source files
const fileTypes = {
data: true,
index: true
};
const [,, ...args] = process.argv;
if (!args.length || args.filter(p => !(p in fileTypes)).length) {
console.log('Converts wordnet-db index & data files to JSON format for use in the browser.');
console.log('\nUsage: makeJsonDict.js index|data');
process.exit(1);
}
function uniq(arr) {
return arr.filter((v, i) => arr.indexOf(v) === i);
}
console.time('Done');
// create out directory
const ensurePath = (path) => {
try {
fs.statSync(path);
} catch (e) {
fs.mkdirSync(path);
}
};
ensurePath(outPath);
ensurePath(testPath);
function processFile(name) {
// read the file as text
function loadFile(pos) {
console.time(' load');
let inPath = path.resolve(dictRoot, name + '.' + pos);
let text = fs.readFileSync(inPath, 'utf8');
console.timeEnd(' load');
return text;
}
// convert raw text to JSON and write to file
function processText(pos, text) {
let obj = {};
let sp = ' ';
console.time(' process');
text.split('\n').forEach(line => {
if (!line || line[0] === sp) return;
let spi = line.indexOf(sp);
let key = line.substr(0, spi);
line = line.substring(1 + spi, line.lastIndexOf(sp + sp))
obj[key] = line;
});
console.timeEnd(' process');
return obj;
}
function writeFile(pos, obj) {
console.time(' write');
let text = JSON.stringify(obj);
fs.writeFileSync(path.resolve(outPath, name + '.' + pos + '.js'),
'export default ' + text);
// also write for mocha tests
fs.writeFileSync(path.resolve(testPath, name + '.' + pos + '.js'),
'module.exports.default = ' + text);
console.timeEnd(' write');
}
posExt.forEach(pos => {
console.log('\n', name, pos, ':');
let text = loadFile(pos);
let obj = processText(pos, text);
writeFile(pos, obj);
});
}
uniq(args).forEach(processFile);
console.log('\nWritten to', path.resolve(outPath));
console.timeEnd('Done');
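// Output format note (names illustrative): each generated file is an ES module of the form
//   export default {"<lemma>": "<remainder of the original WordNet line>", ...}
// and is meant to be pulled in with a dynamic import on the browser side, e.g.:
//   import('./dict/index.noun.js').then(mod => {
//     const index = mod.default;
//     if ('dog' in index) console.log(index['dog']); // raw index-line data for 'dog'
//   });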

62
src/browser/baseFile.js Normal file

@ -0,0 +1,62 @@
/**
* browser/baseFile.js
*
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
let isTest = window.__mocha;
class BaseFile {
/**
* file contents - in browser it's just a string & not a file!
* @type {Object}
*/
file = {};
/**
* constructor
* @param {type} type - 'index' or 'data'
* @param {string} dictPath - path to dict db
* @param {string} posName - one of 'noun', 'verb', 'adj', 'adv'
* @param {object} [options] - @see WordPOS options
*/
constructor(type, dictPath, posName, options) {
this.type = type;
this.filePath = `${dictPath}/${type}.${posName}.js`;
this.posName = posName;
this.loadError = null;
this.options = Object.assign({}, options);
}
load() {
if (this.loadError) return Promise.reject(this.loadError);
this.options.debug && console.time('index load ' + this.posName);
let promise = isTest
? Promise.resolve(require(this.filePath))
: ES6_IMPORT(`${this.filePath}`); // prevent parcel from clobbering dynamic import
return promise
.then(exports => {
this.options.debug && console.timeEnd('index load ' + this.posName);
this.file = exports.default;
})
.catch(err => {
console.error(`Error loading "${this.type}" file ${this.filePath}.`, err);
this.loadError = err;
throw err;
});
}
ready(fn, args) {
return this.load().then(() => fn.apply(this, args));
}
}
// export default BaseFile;
module.exports = BaseFile;

90
src/browser/dataFile.js Normal file

@ -0,0 +1,90 @@
/**
* browser/dataFile.js
*
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Portions: Copyright (c) 2011, Chris Umbel
*
* Released under MIT license
*/
const { lineDataToJSON, LEX_NAMES } = require('../common');
const { zeroPad } = require('../util');
const BaseFile = require('./baseFile');
/**
* get parsed line from data file
*
* @param {string} offset The offset key
* @return {object} Data record object
* @this DataFile
*/
function seek(offset) {
let str = this.file[offset];
if (!str) return {};
// offset was extracted for the key - add it back to line data
return lineDataToJSON(offset + ' ' + str);
}
/**
* lookup offsets in data file
*
* @param offsets {array} - array of offsets to lookup (obtained from index.find())
* @param callback{function} (optional) - callback function
* @returns {Promise.[<Object>]} array of or single data record
* @this DataFile
*/
function lookup(offsets, callback) {
var results = [],
self = this,
readLine = seek.bind(this),
valid = (item => item.pos),
single = !Array.isArray(offsets);
if (single) offsets = [offsets];
return new Promise(function(resolve, reject) {
results = offsets
.map(zeroPad)
.map(readLine)
.filter(valid);
if (!results.length) {
let err = new RangeError(`No data at offsets ${offsets.join()} in ${self.filePath}.`);
callback && callback(err, single ? {} :[]);
reject(err);
} else {
if (single) results = results[0];
callback && callback(null, results);
resolve(results);
}
});
}
/**
* DataFile class
*
* @param dictPath {string} - path to dict folder
* @param posName {string} - POS name
* @constructor
*/
class DataFile extends BaseFile {
constructor(dictPath, posName) {
super('data', dictPath, posName);
}
lookup() {
return this.ready(lookup, arguments);
}
}
/**
* map of lexFilenum to lex names
*
* @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html
* @type {string[]}
*/
DataFile.LEX_NAMES = LEX_NAMES;
module.exports = DataFile;
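// Example (dict path illustrative; the offset is the 'likely' adverb used in samples/main.js):
//   const data = new DataFile('/dict', 'adv');
//   data.lookup('00139421')
//     .then(rec => console.log(rec.def)); // 'with considerable certainty'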

188
src/browser/index.js Normal file

@ -0,0 +1,188 @@
/**
* browser/index.js
*
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
const { stopwords, prepText, makeStopwordString } = require('../util');
const { is, get, getPOS, lookup, seek, lookupPOS } = require('../common');
const { randX, rand } = require('../rand');
const IndexFile = require('./indexFile');
const DataFile = require('./dataFile');
const POS = {
n: 'noun',
v: 'verb',
a: 'adj',
r: 'adv'
};
class WordPOS {
options = {};
constructor(config) {
this.options = Object.assign({}, WordPOS.defaults, config);
this.initFiles();
if (Array.isArray(this.options.stopwords)) {
this.options.stopwords = makeStopwordString(this.options.stopwords);
}
}
initFiles() {
const keys = Object.keys(POS);
const loadOne = (Comp, pos) => new Comp(this.options.dictPath, POS[pos], this.options);
const loader = (Comp) => keys.map(loadOne.bind(null, Comp));
const reducer = (arr) => arr.reduce((coll, item, i) => (coll[keys[i]] = item, coll), {});
this.indexFiles = reducer(loader(IndexFile));
this.dataFiles = reducer(loader(DataFile));
if (this.options.preload) {
this.loaded = this.preloadIndexes(this.options.preload);
}
}
getFilesFor(pos) {
return {
index: this.indexFiles[pos],
data: this.dataFiles[pos]
};
}
/**
* loads index files
*
* @param {string|Array} [pos] POS to load (default: all)
* @return {Promise.<index data>}
*/
preloadIndexes(pos) {
let files = this.indexFiles;
let load = p => files[p].load();
let promise;
if (!pos || pos === true) { // preload all
promise = Promise.all(Object.keys(POS).map(load));
}
else if (typeof pos === 'string' && files[pos]) {
promise = load(pos);
}
else if (pos instanceof Array) {
promise = Promise.all(pos.filter(p => p in files).map(load));
}
// TODO includeData
return promise || Promise.reject(new RangeError(`Unknown POS "${pos}" for preload.`));
}
parse = prepText;
seek = seek;
/**
* isX() - Test if word is given POS
* @see is
*/
isAdjective = is('a');
isAdverb = is('r');
isNoun = is('n');
isVerb = is('v');
/**
* getX() - Find all words in string that are given POS
* @see get
*/
getPOS = getPOS;
getAdjectives = get('isAdjective');
getAdverbs = get('isAdverb');
getNouns = get('isNoun');
getVerbs = get('isVerb');
/**
* lookupX() - Lookup word definition if already know POS
* @see lookup
*/
lookup = lookupPOS;
lookupAdjective = lookup('a');
lookupAdverb = lookup('r');
lookupNoun = lookup('n');
lookupVerb = lookup('v');
/**
* define randX()
* @see randX
*/
rand = rand;
randAdjective = randX('a');
randAdverb = randX('r');
randNoun = randX('n');
randVerb = randX('v');
}
WordPOS.defaults = {
/**
* path to WordNet data (override only if not using wordnet-db)
* @type {string}
*/
dictPath: '',
/**
* enable profiling, time in msec returned as second argument in callback
* @type {boolean}
*/
profile: false,
/**
* if true, exclude standard stopwords.
* if array, stopwords to exclude, eg, ['all','of','this',...]
* if false, do not filter any stopwords.
* @type {boolean}
*/
stopwords: true,
/**
* preload files.
* true - preload all POS
* false - do not preload any POS
* 'a' - preload adj
* ['a','v'] - preload adj & verb
* @type {boolean|string|Array}
*/
preload: false,
/**
* include data files in preload
* @type {boolean}
*/
includeData: false,
/**
* set to true to enable debug logging
* @type {boolean}
*/
debug: false
};
/**
* access to WordNet DB
* @type {object}
*/
// WordPOS.WNdb = WNdb; // FIXME
/**
* access to stopwords
* @type {Array}
*/
WordPOS.stopwords = stopwords;
// Export as CJS handled by Parcel, otherwise will get WordPOS.default
// if use: export default WordPOS;
module.exports = WordPOS;
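// Example (dictPath illustrative):
//   const wordpos = new WordPOS({ dictPath: '/dict', preload: ['a', 'r'] });
//   // `loaded` is set when the preload option is given; it resolves once those indexes are in memory
//   wordpos.loaded.then(() => wordpos.isAdverb('likely')).then(console.log); // true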

135
src/browser/indexFile.js Normal file

@ -0,0 +1,135 @@
/**
* browser/indexFile.js
*
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
const { indexLookup } = require('../common');
const { sample } = require('../util');
const BaseFile = require('./baseFile');
const Trie = require('../../lib/natural/trie/trie');
/**
* find a search term in an index file (using fast index)
*
* Calls to same bucket are queued for callback using the piper.
*
* @param search {string} - word to search for
* @param callback {function} - callback receives found line and tokens
* @returns none
* @this IndexFile
*/
function find(search, callback) {
var miss = {status: 'miss'};
if (!(search in this.file)) {
callback(miss);
return;
}
var
line = this.file[search],
tokens = line.split(/\s+/),
result = {
status: 'hit',
key: search,
line: line,
tokens: tokens
};
result.tokens.unshift(search);
callback(result);
}
/**
* Select <count> words at random for POS
*
* @param {string} startsWith - string that results should start with
* @param {integer} count - number of results to return
* @param {Function} callback - receives (results, startsWith)
* @return {Promise} receives results
* @this IndexFile
*/
function rand(startsWith, count, callback) {
const done = (res) => {
callback(res, startsWith || '');
return Promise.resolve(res);
};
const doSample = (values) => {
let res = sample(values, count);
// console.timeEnd('getkeys')
return done(res);
};
const time = (label) => {
this.options.debug && console.time(label + ' ' + this.posName);
};
const timeEnd = (label) => {
this.options.debug && console.timeEnd(label + ' ' + this.posName);
};
if (!startsWith) {
// console.time('getkeys')
return doSample(this.getKeys());
}
// calc trie if haven't done so yet
if (!this.trie) {
time('Trie');
this.trie = new Trie();
this.trie.addStrings(this.getKeys());
timeEnd('Trie');
}
let keys = [];
time('trie-withprefix');
keys = this.trie.keysWithPrefix(startsWith);
timeEnd('trie-withprefix');
// TODO cache results?
return keys.length ? doSample(keys) : done([]);
}
/**
* IndexFile class
*/
class IndexFile extends BaseFile {
keys = null;
/**
* @param dictPath {string} - WordNet db dict path
* @param posName {string} - name of index: noun, verb, adj, adv
* @param {object} [options] - @see WordPOS options
* @constructor
*/
constructor(dictPath, posName, options) {
super('index', dictPath, posName, options);
this.options = Object.assign({}, options);
this.posName = posName;
}
getKeys() {
return this.keys || (this.keys = Object.keys(this.file));
}
lookup() {
return this.ready(indexLookup, arguments);
}
find() {
return this.ready(find, arguments);
}
rand() {
return this.ready(rand, arguments);
}
}
module.exports = IndexFile;
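// Example (dict path and prefix illustrative):
//   const index = new IndexFile('/dict', 'adv', {debug: false});
//   index.lookup('likely')
//     .then(rec => console.log(rec && rec.synsetOffset)); // e.g. ['00139421']
//   index.rand('li', 2, (words, startsWith) => console.log(startsWith, words));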

403
src/common.js Normal file

@ -0,0 +1,403 @@
/**
* common.js
*
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Portions: Copyright (c) 2011, Chris Umbel
*
* Released under MIT license
*/
var { normalize, nextTick, isString, uniq, sample, diff, flat } = require('./util');
function error(err, callback) {
if (isString(err)) err = new RangeError(err);
callback && callback(err, {});
return Promise.reject(err);
}
/**
* factory for main lookup function
*
* @param pos {string} - n/v/a/r
* @returns {Function} - lookup function bound to POS
* @this WordPOS
*/
function lookup(pos) {
return function(word, callback) {
var profile = this.options.profile,
start = profile && new Date(),
files = this.getFilesFor(pos),
args = [];
word = normalize(word);
// lookup index
return files.index.lookup(word)
.then(function(result) {
if (result) {
// lookup data
return files.data.lookup(result.synsetOffset).then(done);
} else {
// not found in index
return done([]);
}
})
.catch(done);
function done(results) {
if (results instanceof Error) {
args.push([], word);
} else {
args.push(results, word);
}
//console.log(3333, args)
profile && args.push(new Date() - start);
nextTick(callback, args);
return results;
}
};
}
/**
* find a word and prepare its lexical record
*
* @param word {string} - search word
* @param callback {function} - callback function receives result
* @returns {Promise.<IndexRecord>}
* @this IndexFile
*
* Credit for this routine to https://github.com/NaturalNode/natural
*/
function indexLookup(word, callback) {
var self = this;
return new Promise(function(resolve, reject){
self.find(word, function (record) {
var indexRecord = null,
i;
if (record.status == 'hit') {
var ptrs = [], offsets = [];
let n = parseInt(record.tokens[3]);
for (i = 0; i < n; i++) {
ptrs.push(record.tokens[i]);
}
n = parseInt(record.tokens[2]);
for (i = 0; i < n; i++) {
offsets.push(record.tokens[ptrs.length + 6 + i]);
}
indexRecord = {
lemma : record.tokens[0],
pos : record.tokens[1],
ptrSymbol : ptrs,
senseCnt : parseInt(record.tokens[ptrs.length + 4], 10),
tagsenseCnt : parseInt(record.tokens[ptrs.length + 5], 10),
synsetOffset: offsets
};
}
callback && callback(indexRecord);
resolve(indexRecord);
});
});
}
/**
* lookup a word in all indexes
*
* @param word {string} - search word
* @param callback {Function} (optional) - callback with (results, word) signature
* @returns {Promise}
* @this WordPOS
*/
function lookupPOS(word, callback) {
var self = this,
results = [],
profile = this.options.profile,
start = profile && new Date(),
methods = ['lookupAdverb', 'lookupAdjective', 'lookupVerb', 'lookupNoun'];
return Promise
.all(methods.map(exec))
.then(done)
.catch(error);
function exec(method) {
return self[ method ]
.call(self, word)
.then(function collect(result){
results = results.concat(result);
});
}
function done() {
var args = [results, word];
profile && args.push(new Date() - start);
nextTick(callback, args);
return results;
}
function error(err) {
nextTick(callback, [[], word]);
throw err;
}
}
/**
* getX() factory function
*
* @param isFn {function} - an isX() function
* @returns {Function}
* @this IndexFile
*/
function get(isFn) {
return function(text, callback, _noprofile) {
var profile = this.options.profile && !_noprofile,
start = profile && new Date(),
words = this.parse(text),
results = [],
self = this,
first = words.shift();
// test one first & check for error, otherwise
// map is oblivious to errors!
return exec(first)
.then(() => Promise.all(words.map(exec)))
.then(done)
.catch(err => {
// done(); // callback signature is same! // FIXME
return Promise.reject(err);
});
function exec(word) {
return self[isFn]
.call(self, word, null, /*_noprofile*/ true)
.then(function collect(result) {
result && results.push(word);
});
}
function done(){
var args = [results];
profile && args.push(new Date() - start);
nextTick(callback, args);
return results;
}
};
}
/**
* getPOS() - Find all POS for all words in given string
*
* @param text {string} - words to lookup for POS
* @param callback {function} (optional) - receives object with words broken into POS or 'rest', ie,
* Object: {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}
* @return Promise - resolve function receives data object
*/
function getPOS(text, callback) {
var self = this,
data = {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]},
profile = this.options.profile,
start = profile && new Date(),
words = this.parse(text),
methods = ['getAdverbs', 'getAdjectives', 'getVerbs', 'getNouns'];
return Promise
.all(methods.map(exec))
.then(done)
.catch(error);
function exec(method) {
return self[ method ]
.call(self, text, null, true)
.then(function collect(results) {
// getAdjectives --> adjectives
var pos = method.replace('get','').toLowerCase();
data[ pos ] = results;
});
}
function done() {
var args = [data];
var matches = uniq(flat(Object.values(data)));
data.rest = diff(words, matches);
profile && args.push(new Date() - start);
nextTick(callback, args);
return data;
}
function error(err) {
nextTick(callback, []);
throw err;
}
}
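// Example (word lists depend on the input text):
//   wordpos.getPOS('this is a likely tricky business', console.log);
//   // -> { nouns: [...], verbs: [...], adjectives: [...], adverbs: [...], rest: [...] }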
/**
* isX() factory function
*
* @param pos {string} - n/v/a/r
* @returns {Function}
* @this WordPOS
*/
function is(pos){
return function(word, callback, _noprofile) {
// disable profiling when isX() used internally
var profile = this.options.profile && !_noprofile,
start = profile && new Date(),
args = [],
index = this.getFilesFor(pos).index;
word = normalize(word);
return index
.lookup(word)
.then(function(record) {
var result = !!record;
args.push(result, word);
profile && args.push(new Date() - start);
nextTick(callback, args);
return result;
});
};
}
/**
* parse a single data file line, returning data object
*
* @param line {string} - a single line from WordNet data file
* @returns {object}
*
* Credit for this routine to https://github.com/NaturalNode/natural
*/
function lineDataToJSON(line, location) {
var data = line.split('| '),
tokens = data[0].split(/\s+/),
ptrs = [],
wCnt = parseInt(tokens[3], 16),
synonyms = [],
i;
for(i = 0; i < wCnt; i++) {
synonyms.push(tokens[4 + i * 2]);
}
var ptrOffset = (wCnt - 1) * 2 + 6;
let n = parseInt(tokens[ptrOffset], 10);
for(i = 0; i < n; i++) {
ptrs.push({
pointerSymbol: tokens[ptrOffset + 1 + i * 4],
synsetOffset: tokens[ptrOffset + 2 + i * 4],
pos: tokens[ptrOffset + 3 + i * 4],
sourceTarget: tokens[ptrOffset + 4 + i * 4]
});
}
// break "gloss" into definition vs. examples
var glossArray = data[1].split('; ');
var definition = glossArray[0];
var examples = glossArray.slice(1);
var lexFilenum = parseInt(tokens[1], 10);
for (var k = 0; k < examples.length; k++) {
examples[k] = examples[k].replace(/\"/g,'').replace(/\s\s+/g,'');
}
return {
synsetOffset: tokens[0],
lexFilenum: lexFilenum,
lexName: LEX_NAMES[ lexFilenum ],
pos: tokens[2],
wCnt: wCnt,
lemma: tokens[4],
synonyms: synonyms,
lexId: tokens[5],
ptrs: ptrs,
gloss: data[1],
def: definition,
exp: examples
};
}
/**
* seek - get record at offset for pos
*
* @param offset {number} - synset offset
* @param pos {string} - POS a/r/n/v
* @param callback {function} - optional callback
* @returns Promise
* @this WordPOS
*/
function seek(offset, pos, callback){
var offsetTmp = Number(offset);
if (isNaN(offsetTmp) || offsetTmp <= 0) return error('Offset must be valid positive number: ' + offset, callback);
var data = this.getFilesFor(pos).data;
if (!data) return error('Incorrect POS - 2nd argument must be a, r, n or v.', callback);
return data.lookup(offset, callback);
}
const LEX_NAMES = [
'adj.all',
'adj.pert',
'adv.all',
'noun.Tops',
'noun.act',
'noun.animal',
'noun.artifact',
'noun.attribute',
'noun.body',
'noun.cognition',
'noun.communication',
'noun.event',
'noun.feeling',
'noun.food',
'noun.group',
'noun.location',
'noun.motive',
'noun.object',
'noun.person',
'noun.phenomenon',
'noun.plant',
'noun.possession',
'noun.process',
'noun.quantity',
'noun.relation',
'noun.shape',
'noun.state',
'noun.substance',
'noun.time',
'verb.body',
'verb.change',
'verb.cognition',
'verb.communication',
'verb.competition',
'verb.consumption',
'verb.contact',
'verb.creation',
'verb.emotion',
'verb.motion',
'verb.perception',
'verb.possession',
'verb.social',
'verb.stative',
'verb.weather',
'adj.ppl'
];
module.exports = {
indexLookup,
is,
get,
seek,
getPOS,
lineDataToJSON,
LEX_NAMES,
lookup,
lookupPOS
}

src/node/dataFile.js

@ -1,7 +1,7 @@
/*! /*!
* dataFile.js * dataFile.js
* *
* Copyright (c) 2012-2018 mooster@42at.com * Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos * https://github.com/moos/wordpos
* *
* Portions: Copyright (c) 2011, Chris Umbel * Portions: Copyright (c) 2011, Chris Umbel
@ -11,7 +11,12 @@
var fs = require('fs'), var fs = require('fs'),
path = require('path'), path = require('path'),
_ = require('underscore'); _ = require('underscore'),
{ zeroPad } = require('../util'),
{
lineDataToJSON,
LEX_NAMES
} = require('../common');
/** /**
* sanity check read data - line must start with zero-padded location * sanity check read data - line must start with zero-padded location
@ -20,67 +25,7 @@ var fs = require('fs'),
* @return {boolean} true if line data is good * @return {boolean} true if line data is good
*/ */
function dataCheck(line, location) { function dataCheck(line, location) {
var pad = '00000000', // 8 zeros return line.indexOf(zeroPad(location)) === 0;
padded = String(pad + location).slice( - pad.length);
return line.indexOf(padded) === 0;
}
/**
* parse a single data file line, returning data object
*
* @param line {string} - a single line from WordNet data file
* @returns {object}
*
* Credit for this routine to https://github.com/NaturalNode/natural
*/
function lineDataToJSON(line, location) {
if (!dataCheck(line, location)) return new Error('Bad data at location ' + location);
var data = line.split('| '),
tokens = data[0].split(/\s+/),
ptrs = [],
wCnt = parseInt(tokens[3], 16),
synonyms = [],
i;
for(i = 0; i < wCnt; i++) {
synonyms.push(tokens[4 + i * 2]);
}
var ptrOffset = (wCnt - 1) * 2 + 6;
for(i = 0; i < parseInt(tokens[ptrOffset], 10); i++) {
ptrs.push({
pointerSymbol: tokens[ptrOffset + 1 + i * 4],
synsetOffset: parseInt(tokens[ptrOffset + 2 + i * 4], 10),
pos: tokens[ptrOffset + 3 + i * 4],
sourceTarget: tokens[ptrOffset + 4 + i * 4]
});
}
// break "gloss" into definition vs. examples
var glossArray = data[1].split("; ");
var definition = glossArray[0];
var examples = glossArray.slice(1);
var lexFilenum = parseInt(tokens[1], 10);
for (var k = 0; k < examples.length; k++) {
examples[k] = examples[k].replace(/\"/g,'').replace(/\s\s+/g,'');
}
return {
synsetOffset: parseInt(tokens[0], 10),
lexFilenum: lexFilenum,
lexName: DataFile.LEX_NAMES[ lexFilenum ],
pos: tokens[2],
wCnt: wCnt,
lemma: tokens[4],
synonyms: synonyms,
lexId: tokens[5],
ptrs: ptrs,
gloss: data[1],
def: definition,
exp: examples
};
} }
/** /**
@ -98,6 +43,7 @@ function readLocation(location, callback) {
len = file.nominalLineLength, len = file.nominalLineLength,
buffer = new Buffer.alloc(len); buffer = new Buffer.alloc(len);
location = Number(location);
readChunk(location, function(err, count) { readChunk(location, function(err, count) {
if (err) { if (err) {
//console.log(err); //console.log(err);
@ -105,11 +51,13 @@ function readLocation(location, callback) {
return; return;
} }
//console.log(' read %d bytes at <%d>', count, location); //console.log(' read %d bytes at <%d>', count, location);
if (!dataCheck(str, location)) return callback(new RangeError('No data at offset ' + location));
callback(null, lineDataToJSON(str, location)); callback(null, lineDataToJSON(str, location));
}); });
function readChunk(pos, cb) { function readChunk(pos, cb) {
var nonDataErr = new Error('no data at offset ' + pos); var nonDataErr = new RangeError('No data at offset ' + pos);
fs.read(file.fd, buffer, 0, len, pos, function (err, count) { fs.read(file.fd, buffer, 0, len, pos, function (err, count) {
if (!count) return cb(nonDataErr, count); if (!count) return cb(nonDataErr, count);
@ -213,7 +161,6 @@ function promisifyInto(collect) {
} }
} }
/** /**
* DataFile class * DataFile class
* *
@ -258,55 +205,8 @@ DataFile.MAX_LINE_LENGTH = {
/** /**
* map of lexFilenum to lex names * map of lexFilenum to lex names
* *
* @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html
* @type {string[]} * @type {string[]}
*/ */
DataFile.LEX_NAMES = [ DataFile.LEX_NAMES = LEX_NAMES;
'adj.all',
'adj.pert',
'adv.all',
'noun.Tops',
'noun.act',
'noun.animal',
'noun.artifact',
'noun.attribute',
'noun.body',
'noun.cognition',
'noun.communication',
'noun.event',
'noun.feeling',
'noun.food',
'noun.group',
'noun.location',
'noun.motive',
'noun.object',
'noun.person',
'noun.phenomenon',
'noun.plant',
'noun.possession',
'noun.process',
'noun.quantity',
'noun.relation',
'noun.shape',
'noun.state',
'noun.substance',
'noun.time',
'verb.body',
'verb.change',
'verb.cognition',
'verb.communication',
'verb.competition',
'verb.consumption',
'verb.contact',
'verb.creation',
'verb.emotion',
'verb.motion',
'verb.perception',
'verb.possession',
'verb.social',
'verb.stative',
'verb.weather',
'adj.ppl'
];
module.exports = DataFile; module.exports = DataFile;

190
src/node/index.js Normal file

@ -0,0 +1,190 @@
/*!
* node/index.js
*
* Node.js part-of-speech utilities using WordNet database.
*
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
var
_ = require('underscore')._,
util = require('util'),
stopwordsStr,
WNdb = require('wordnet-db'),
DataFile = require('./dataFile'),
IndexFile = require('./indexFile'),
{
nextTick,
normalize,
tokenizer,
prepText,
makeStopwordString,
stopwords
} = require('../util'),
{
is,
get,
getPOS,
seek,
lookup,
lookupPOS
} = require('../common');
stopwordsStr = makeStopwordString(stopwords);
/**
* @class WordPOS
* @param options {object} -- @see WordPOS.defaults
* @constructor
*/
var WordPOS = function(options) {
var dictPath;
this.options = _.defaults({}, _.isObject(options) && options || {}, {
dictPath: WNdb.path
}, WordPOS.defaults);
dictPath = this.options.dictPath;
this.nounIndex = new IndexFile(dictPath, 'noun');
this.verbIndex = new IndexFile(dictPath, 'verb');
this.adjIndex = new IndexFile(dictPath, 'adj');
this.advIndex = new IndexFile(dictPath, 'adv');
this.nounData = new DataFile(dictPath, 'noun');
this.verbData = new DataFile(dictPath, 'verb');
this.adjData = new DataFile(dictPath, 'adj');
this.advData = new DataFile(dictPath, 'adv');
// define randX() functions
require('./rand').init(this);
if (_.isArray(this.options.stopwords)) {
this.options.stopwords = makeStopwordString(this.options.stopwords);
}
};
WordPOS.defaults = {
/**
* path to WordNet data (override only if not using wordnet-db)
*/
dictPath: '',
/**
* enable profiling, time in msec returned as second argument in callback
*/
profile: false,
/**
* if true, exclude standard stopwords.
* if array, stopwords to exclude, eg, ['all','of','this',...]
* if false, do not filter any stopwords.
*/
stopwords: true
};
var wordposProto = WordPOS.prototype;
/**
* lookup a word in all indexes
*
* @param word {string} - search word
* @param callback {Function} (optional) - callback with (results, word) signature
* @returns {Promise}
*/
wordposProto.lookup = lookupPOS;
/**
* getPOS() - Find all POS for all words in given string
*
* @param text {string} - words to lookup for POS
* @param callback {function} (optional) - receives object with words broken into POS or 'rest', ie,
* Object: {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}
* @return Promise - resolve function receives data object
*/
wordposProto.getPOS = getPOS;
/**
* get index and data files for given pos
*
* @param pos {string} - n/v/a/r
* @returns {object} - keys {index, data}
*/
wordposProto.getFilesFor = function (pos) {
switch(pos) {
case 'n':
return {index: this.nounIndex, data: this.nounData};
case 'v':
return {index: this.verbIndex, data: this.verbData};
case 'a': case 's':
return {index: this.adjIndex, data: this.adjData};
case 'r':
return {index: this.advIndex, data: this.advData};
}
return {};
};
/**
* lookupX() - Lookup word definition if already know POS
* @see lookup
*/
wordposProto.lookupAdjective = lookup('a');
wordposProto.lookupAdverb = lookup('r');
wordposProto.lookupNoun = lookup('n');
wordposProto.lookupVerb = lookup('v');
/**
* isX() - Test if word is given POS
* @see is
*/
wordposProto.isAdjective = is('a');
wordposProto.isAdverb = is('r');
wordposProto.isNoun = is('n');
wordposProto.isVerb = is('v');
/**
* getX() - Find all words in string that are given POS
* @see get
*/
wordposProto.getAdjectives = get('isAdjective');
wordposProto.getAdverbs = get('isAdverb');
wordposProto.getNouns = get('isNoun');
wordposProto.getVerbs = get('isVerb');
/**
* parse - get deduped, less stopwords
*
* @param text {string|array} - string of words to parse. If an array is given, it is left intact.
* @returns {array}
*/
wordposProto.parse = prepText;
/**
* seek - get record at offset for pos
*
* @param offset {number} - synset offset
* @param pos {string} - POS a/r/n/v
* @param callback {function} - optional callback
* @returns Promise
*/
wordposProto.seek = seek;
/**
* access to WordNet DB
* @type {object}
*/
WordPOS.WNdb = WNdb;
/**
* access to stopwords
* @type {Array}
*/
WordPOS.stopwords = stopwords;
module.exports = WordPOS;
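// Example (mirrors the README quick usage):
//   const WordPOS = require('wordpos');
//   const wordpos = new WordPOS(); // dictPath defaults to wordnet-db's path
//   wordpos.getAdverbs('this is is lately a likely tricky business this is')
//     .then(res => console.log(res)); // ['lately', 'likely'] (order not guaranteed)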

src/node/indexFile.js

@ -1,9 +1,9 @@
/*! /*!
* indexFile.js * node/indexFile.js
* *
* implements fast index lookup of WordNet's index files * implements fast index lookup of WordNet's index files
* *
* Copyright (c) 2012-2018 mooster@42at.com * Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos * https://github.com/moos/wordpos
* *
* Portions: Copyright (c) 2011, Chris Umbel * Portions: Copyright (c) 2011, Chris Umbel
@ -16,6 +16,7 @@ var _ = require('underscore')._,
path = require('path'), path = require('path'),
fs = require('fs'), fs = require('fs'),
piper = require('./piper'), piper = require('./piper'),
{ indexLookup } = require('../common'),
KEY_LENGTH = 3; KEY_LENGTH = 3;
@ -133,49 +134,6 @@ function find(search, callback) {
} }
} }
/**
* find a word and prepare its lexical record
*
* @param word {string} - search word
* @param callback {function} - callback function receives result
* @returns none
*
* Credit for this routine to https://github.com/NaturalNode/natural
*/
function lookup(word, callback) {
var self = this;
return new Promise(function(resolve, reject){
self.find(word, function (record) {
var indexRecord = null,
i;
if (record.status == 'hit') {
var ptrs = [], offsets = [];
for (i = 0; i < parseInt(record.tokens[3]); i++)
ptrs.push(record.tokens[i]);
for (i = 0; i < parseInt(record.tokens[2]); i++)
offsets.push(parseInt(record.tokens[ptrs.length + 6 + i], 10));
indexRecord = {
lemma : record.tokens[0],
pos : record.tokens[1],
ptrSymbol : ptrs,
senseCnt : parseInt(record.tokens[ptrs.length + 4], 10),
tagsenseCnt : parseInt(record.tokens[ptrs.length + 5], 10),
synsetOffset: offsets
};
}
callback && callback(indexRecord);
resolve(indexRecord);
});
});
}
/** /**
* loads fast index data and return fast index find function * loads fast index data and return fast index find function
* *
@ -216,7 +174,7 @@ var IndexFile = function(dictPath, name) {
initIndex(this); initIndex(this);
}; };
IndexFile.prototype.lookup = lookup; IndexFile.prototype.lookup = indexLookup;
IndexFile.prototype.find = find; IndexFile.prototype.find = find;
/** /**

src/node/piper.js

@ -4,7 +4,7 @@
* executes multiple async i/o tasks and pools similar callbacks, * executes multiple async i/o tasks and pools similar callbacks,
* calling i/o open/close when all incoming tasks are done. * calling i/o open/close when all incoming tasks are done.
* *
* Copyright (c) 2012-2016 mooster@42at.com * Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos * https://github.com/moos/wordpos
* *
* Released under MIT license * Released under MIT license
@ -79,4 +79,3 @@ piper.wrapper = function(self, task /*, result...*/){
module.exports = piper; module.exports = piper;

164
src/node/rand.js Normal file

@ -0,0 +1,164 @@
/*!
* node/rand.js
*
* define rand() and randX() functions on wordpos
*
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
var _ = require('underscore')._,
{ randX, rand } = require('../rand'),
Trie = require('../../lib/natural/trie/trie'),
IndexFile = require(`./indexFile`),
KEY_LENGTH = 3;
/**
* rand function (bound to index)
*
* @param startsWith {string} - get random word(s) that start with this, or ''
* @param num {number} - number of words to return
* @param callback {function} - callback function, receives words array and startsWith
* @returns Promise
* @this IndexFile
*/
function randomizer(startsWith, num, callback){
var self = this,
nextKey = null,
trie = this.fastIndex.trie,
key, keys;
return new Promise(function(resolve, reject) {
// console.log('-- ', startsWith, num, self.fastIndex.indexKeys.length);
if (startsWith) {
key = startsWith.slice(0, KEY_LENGTH);
/**
* if key is 'a' or 'ab' (<3 chars), search for ALL keys starting with that.
*/
if (key.length < KEY_LENGTH) {
// calc trie if haven't done so yet
if (!trie) {
// console.time('trie');
trie = new Trie();
trie.addStrings(self.fastIndex.indexKeys);
self.fastIndex.trie = trie;
//console.log(' +++ Trie calc ');
// console.timeEnd('trie')
}
try {
// trie throws if not found!!!!!
keys = trie.keysWithPrefix(startsWith);
} catch (e) {
keys = [];
}
// read all keys then select random word.
// May be large disk read!
key = keys[0];
nextKey = _.last(keys);
}
if (!key || !(key in self.fastIndex.offsets)) {
callback && callback([], startsWith);
resolve([]);
}
} else {
// no startWith given - random select among keys
keys = _.sample(self.fastIndex.indexKeys, num);
// if num > 1, run each key independently and collect results
if (num > 1) {
var results = [], ii = 0;
_(keys).each(function (startsWith) {
self.rand(startsWith, 1, function (result) {
results.push(result[0]);
if (++ii == num) {
callback && callback(results, '');
resolve(results);
}
});
});
return;
}
key = keys;
}
// prepare the piper
var args = [key, nextKey, self],
task = 'rand:' + key + nextKey,
context = [startsWith, num, callback]; // last arg MUST be callback
// pay the piper
self.piper(task, IndexFile.readIndexBetweenKeys, args, context, collector);
function collector(key, nextKey, index, startsWith, num, callback, buffer) {
var lines = buffer.toString().split('\n'),
matches = lines.map(function (line) {
return line.substring(0, line.indexOf(' '));
});
//console.log(' got lines for key ', key, lines.length);
// we got bunch of matches for key - now search within for startsWith
if (startsWith !== key) {
// binary search for startsWith within set of matches
var ind = _.sortedIndex(matches, startsWith);
if (ind >= lines.length || matches[ind].indexOf(startsWith) === -1) {
callback && callback([], startsWith);
resolve([]);
return;
}
var trie = new Trie();
trie.addStrings(matches);
//console.log('Trie > ', trie.matchesWithPrefix( startsWith ));
matches = trie.keysWithPrefix(startsWith);
}
var words = _.sample(matches, num);
callback && callback(words, startsWith);
resolve(words);
}
}); // Promise
}
/**
* bind rand() to index
*
* @param index {object} - the IndexFile instance
* @returns {function} - bound rand function for index
*/
function randomify(index){
if (!index.fastIndex) throw new Error('rand requires fastIndex');
index.rand = _.bind(randomizer, index);
}
module.exports = {
init: function(wordposProto) {
randomify(wordposProto.nounIndex);
randomify(wordposProto.verbIndex);
randomify(wordposProto.adjIndex);
randomify(wordposProto.advIndex);
/**
* define rand() (all POS)
*/
wordposProto.rand = rand;
/**
* define randX()
*/
wordposProto.randAdjective = randX('a');
wordposProto.randAdverb = randX('r');
wordposProto.randNoun = randX('n');
wordposProto.randVerb = randX('v');
}
};

src/rand.js

@ -1,28 +1,22 @@
-/*!
+/**
  * rand.js
  *
- * define rand() and randX() functions on wordpos
- *
- * Copyright (c) 2012-2016 mooster@42at.com
+ * Copyright (c) 2012-2019 mooster@42at.com
  * https://github.com/moos/wordpos
  *
  * Released under MIT license
  */
-var _ = require('underscore')._,
-  util = require('util'),
-  Trie = require('../lib/natural/trie/trie'),
-  IndexFile = require('./indexFile'),
-  KEY_LENGTH = 3;
+var { uniq, sample } = require('./util');
/**
 * factory function for randX()
 *
 * @param pos {string} - a,r,n,v
 * @returns {Function} - rand function bound to an index file
+ * @this WordPOS
 */
-function makeRandX(pos){
+function randX(pos){
  return function(opts, callback, _noprofile) {
    // disable profiling when isX() used internally
    var profile = this.options.profile && !_noprofile,
@ -44,154 +38,40 @@ function makeRandX(pos){
  };
}
/**
* rand function (bound to index)
*
* @param startsWith {string} - get random word(s) that start with this, or ''
* @param num {number} - number of words to return
* @param callback {function} - callback function, receives words array and startsWith
* @returns Promise
*/
function rand(startsWith, num, callback){
var self = this,
nextKey = null,
trie = this.fastIndex.trie,
key, keys;
return new Promise(function(resolve, reject) {
//console.log('-- ', startsWith, num, self.fastIndex.indexKeys.length);
if (startsWith) {
key = startsWith.slice(0, KEY_LENGTH);
/**
* if key is 'a' or 'ab' (<3 chars), search for ALL keys starting with that.
*/
if (key.length < KEY_LENGTH) {
// calc trie if haven't done so yet
if (!trie) {
trie = new Trie();
trie.addStrings(self.fastIndex.indexKeys);
self.fastIndex.trie = trie;
//console.log(' +++ Trie calc ');
}
try {
// trie throws if not found!!!!!
keys = trie.keysWithPrefix(startsWith);
} catch (e) {
keys = [];
}
// read all keys then select random word.
// May be large disk read!
key = keys[0];
nextKey = _.last(keys);
}
if (!key || !(key in self.fastIndex.offsets)) {
callback && callback([], startsWith);
resolve([]);
}
} else {
// no startWith given - random select among keys
keys = _.sample(self.fastIndex.indexKeys, num);
// if num > 1, run each key independently and collect results
if (num > 1) {
var results = [], ii = 0;
_(keys).each(function (startsWith) {
self.rand(startsWith, 1, function (result) {
results.push(result[0]);
if (++ii == num) {
callback && callback(results, '');
resolve(results);
}
});
});
return;
}
key = keys;
}
// prepare the piper
var args = [key, nextKey, self],
task = 'rand:' + key + nextKey,
context = [startsWith, num, callback]; // last arg MUST be callback
// pay the piper
self.piper(task, IndexFile.readIndexBetweenKeys, args, context, collector);
function collector(key, nextKey, index, startsWith, num, callback, buffer) {
var lines = buffer.toString().split('\n'),
matches = lines.map(function (line) {
return line.substring(0, line.indexOf(' '));
});
//console.log(' got lines for key ', key, lines.length);
// we got bunch of matches for key - now search within for startsWith
if (startsWith !== key) {
// binary search for startsWith within set of matches
var ind = _.sortedIndex(matches, startsWith);
if (ind >= lines.length || matches[ind].indexOf(startsWith) === -1) {
callback && callback([], startsWith);
resolve([]);
return;
}
var trie = new Trie();
trie.addStrings(matches);
//console.log('Trie > ', trie.matchesWithPrefix( startsWith ));
matches = trie.keysWithPrefix(startsWith);
}
var words = _.sample(matches, num);
callback && callback(words, startsWith);
resolve(words);
}
}); // Promise
}
// relative weight of each POS word count (DB 3.1 numbers)
var POS_factor = {
Noun: 26,
Verb: 3,
Adjective: 5,
Adverb: 1,
Total: 37
};
/**
 * rand() - for all Index files
- * @returns Promise
+ *
+ * @param [opts] {object} options
+ * @param opts.startsWith {string} string random words should start with
+ * @param opts.count {integer} number of random words to return
+ * @param callback {function} - callback receives (results, startsWith, profile)
+ * @returns {Promise} receives results
+ * @this WordPOS
 */
-function randAll(opts, callback) {
+function rand(opts, callback) {
  if (typeof opts === 'function') {
    callback = opts;
    opts = {};
  } else {
-    opts = _.clone(opts || {});
+    opts = Object.assign({
+      startsWith: '',
+      count: 1
+    }, opts);
  }
  var
    profile = this.options.profile,
    start = profile && new Date(),
    results = [],
-    startsWith = opts && opts.startsWith || '',
-    count = opts && opts.count || 1,
-    args = [null, startsWith],
+    count = opts.count,
+    args = [null, opts.startsWith],
    parts = 'Noun Verb Adjective Adverb'.split(' '),
    self = this;
  return new Promise(function(resolve, reject) {
    // select at random a POS to look at
-    var doParts = _.sample(parts, parts.length);
+    var doParts = sample(parts, parts.length);
    tryPart();
    function tryPart() {
@ -207,7 +87,7 @@ function randAll(opts, callback) {
    function partCallback(result) {
      if (result) {
-        results = _.uniq(results.concat(result)); // make sure it's unique!
+        results = uniq(results.concat(result)); // make sure it's unique!
      }
      if (results.length < count && doParts.length) {
@ -215,7 +95,7 @@ function randAll(opts, callback) {
      }
      // final random and trim excess
-      results = _.sample(results, count);
+      results = sample(results, count);
      done();
    }
@ -229,39 +109,17 @@ function randAll(opts, callback) {
  }); // Promise
}
-/**
- * bind rand() to index
- *
- * @param index {object} - the IndexFile instance
- * @returns {function} - bound rand function for index
- */
-function randomify(index){
-  if (!index.fastIndex) throw 'rand requires fastIndex';
-  return _.bind(rand, index);
-}
-module.exports = {
-  init: function(wordposProto) {
-    wordposProto.nounIndex.rand = randomify(wordposProto.nounIndex);
-    wordposProto.verbIndex.rand = randomify(wordposProto.verbIndex);
-    wordposProto.adjIndex.rand = randomify(wordposProto.adjIndex);
-    wordposProto.advIndex.rand = randomify(wordposProto.advIndex);
-    /**
-     * define rand()
-     */
-    wordposProto.rand = randAll;
-    /**
-     * define randX()
-     */
-    wordposProto.randAdjective = makeRandX('a');
-    wordposProto.randAdverb = makeRandX('r');
-    wordposProto.randNoun = makeRandX('n');
-    wordposProto.randVerb = makeRandX('v');
-  }
-};
+// relative weight of each POS word count (DB 3.1 numbers)
+const POS_factor = {
+  Noun: 26,
+  Verb: 3,
+  Adjective: 5,
+  Adverb: 1,
+  Total: 37
+};
+module.exports = {
+  randX,
+  rand
+};
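
For context, a minimal sketch of how the exported `rand()` and the `randX()`-built methods are typically called once mounted on a WordPOS instance, following the JSDoc in the hunk above (the option names `startsWith` and `count` come from that hunk; output values are illustrative):

```js
// Sketch only: public rand()/randNoun() usage per the signatures documented above.
const WordPOS = require('wordpos');
const wordpos = new WordPOS();

wordpos.rand({ startsWith: 'foo', count: 2 }, (results, startsWith) => {
  console.log(results, startsWith);   // e.g. [ 'foolery', 'footloose' ] 'foo'
});

// randX()-built helpers accept the callback directly, as the tests below do,
// and also resolve a Promise with the result array.
wordpos.randNoun(([word]) => console.log(word));
```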

src/util.js (new file, 117 lines)

@ -0,0 +1,117 @@
/**
* util.js
*
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
let stopwords = require('../lib/natural/util/stopwords').words;
let stopwordsStr = makeStopwordString(stopwords);
function makeStopwordString(stopwords) {
return ' ' + stopwords.join(' ') + ' ';
}
// setImmediate executes callback AFTER promise handlers.
// Without it, exceptions in callback may be caught by Promise.
function nextTick(fn, args) {
if (fn) {
fn.apply(null, args);
}
}
// offsets must be zero-padded to 8 chars
function zeroPad(str) {
var pad = '00000000'; // 8 zeros
return String(pad + str).slice(-pad.length);
}
function normalize(word) {
return word.toLowerCase().replace(/\s+/g, '_');
}
function isStopword(stopwordsStr, word) {
return stopwordsStr.indexOf(' '+word+' ') >= 0;
}
function tokenizer(str) {
return str.split(/\W+/);
}
function uniq(arr) {
return arr.filter((v, i) => arr.indexOf(v) === i);
}
function diff(arr, subArr) {
return arr.filter(x => !subArr.includes(x));
}
// flatten an array - 1-deep only!
function flat(arr) {
return [].concat.apply([], arr);
}
// get random sample from array (note: count << array.length)
// https://stackoverflow.com/a/37834217
function sample(array, count) {
var indices = [];
var result = new Array(count);
for (let i = 0; i < count; i++ ) {
let j = Math.floor(Math.random() * (array.length - i) + i);
let val = array[indices[j] === undefined ? j : indices[j]];
if (val === undefined) {
result.length = i;
break;
}
result[i] = val;
indices[j] = indices[i] === undefined ? i : indices[i];
}
return result;
}
function isString(s) {
return typeof s === 'string';
}
function reject(arr, predicate) {
return arr.filter(item => !predicate(item))
}
function prepText(text) {
if (Array.isArray(text)) return text;
var deduped = uniq(tokenizer(text));
if (!this.options.stopwords) return deduped;
return reject(deduped, isStopword.bind(null,
isString(this.options.stopwords) ? this.options.stopwords : stopwordsStr
));
}
// node <= 6 polyfill
// @see https://github.com/tc39/proposal-object-values-entries/blob/master/polyfill.js
const reduce = Function.bind.call(Function.call, Array.prototype.reduce);
const isEnumerable = Function.bind.call(Function.call, Object.prototype.propertyIsEnumerable);
const concat = Function.bind.call(Function.call, Array.prototype.concat);
const keys = Reflect.ownKeys;
if (!Object.values) {
Object.values = function values(O) {
return reduce(keys(O), (v, k) => concat(v, typeof k === 'string' && isEnumerable(O, k) ? [O[k]] : []), []);
};
}
module.exports = {
isString,
zeroPad,
stopwords,
nextTick,
normalize,
tokenizer,
prepText,
makeStopwordString,
uniq,
diff,
flat,
sample
};
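
To make the helper semantics concrete, a small usage sketch (the require path is as used within `src/`; the values in comments are examples of the kind of output these helpers produce, not fixtures from the repo):

```js
const { uniq, sample, zeroPad, normalize, tokenizer } = require('./util');

uniq(['bear', 'angry', 'bear']);   // ['bear', 'angry']
sample(['a', 'b', 'c', 'd'], 2);   // two distinct random entries, e.g. ['c', 'a']
zeroPad(1285602);                  // '01285602' (8-char synset offset)
normalize('Little Squirrel');      // 'little_squirrel'
tokenizer('The angry bear!');      // ['The', 'angry', 'bear', ''] (note trailing '')
```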


@ -3,401 +3,14 @@
 *
 * Node.js part-of-speech utilities using WordNet database.
 *
- * Copyright (c) 2012-2016 mooster@42at.com
+ * Copyright (c) 2012-2019 mooster@42at.com
 * https://github.com/moos/wordpos
 *
 * Released under MIT license
 */
+if (process.browser) {
+  module.exports = require('./browser');
+} else {
+  module.exports = require('./node');
+}
-var _ = require('underscore')._,
-  util = require('util'),
stopwords = require('../lib/natural/util/stopwords').words,
stopwordsStr = makeStopwordString(stopwords),
WNdb = require('wordnet-db'),
DataFile = require('./dataFile'),
IndexFile = require('./indexFile');
function normalize(word) {
return word.toLowerCase().replace(/\s+/g, '_');
}
function makeStopwordString(stopwords) {
return ' '+ stopwords.join(' ') +' ';
}
function isStopword(stopwords, word) {
return stopwords.indexOf(' '+word+' ') >= 0;
}
function tokenizer(str) {
return str.split(/\W+/); //_.without(results,'',' ')
}
function prepText(text) {
if (_.isArray(text)) return text;
var deduped = _.uniq(tokenizer(text));
if (!this.options.stopwords) return deduped;
return _.reject(deduped, _.bind(isStopword, null,
_.isString(this.options.stopwords) ? this.options.stopwords : stopwordsStr
));
}
/**
* factory for main lookup function
*
* @param pos {string} - n/v/a/r
* @returns {Function} - lookup function bound to POS
*/
function lookup(pos) {
return function(word, callback) {
var profile = this.options.profile,
start = profile && new Date(),
files = this.getFilesFor(pos),
args = [];
word = normalize(word);
// lookup index
return files.index.lookup(word)
.then(function(result) {
if (result) {
// lookup data
return files.data.lookup(result.synsetOffset).then(done);
} else {
// not found in index
return done([]);
}
})
.catch(done);
function done(results) {
if (results instanceof Error) {
args.push([], word);
} else {
args.push(results, word);
}
//console.log(3333, args)
profile && args.push(new Date() - start);
nextTick(callback, args);
return results;
}
};
}
/**
* isX() factory function
*
* @param pos {string} - n/v/a/r
* @returns {Function}
*/
function is(pos){
return function(word, callback, _noprofile) {
// disable profiling when isX() used internally
var profile = this.options.profile && !_noprofile,
start = profile && new Date(),
args = [],
index = this.getFilesFor(pos).index;
word = normalize(word);
return index
.lookup(word)
.then(function(record) {
var result = !!record;
args.push(result, word);
profile && args.push(new Date() - start);
nextTick(callback, args);
return result;
});
};
}
/**
* getX() factory function
*
* @param isFn {function} - an isX() function
* @returns {Function}
*/
function get(isFn) {
return function(text, callback, _noprofile) {
var profile = this.options.profile && !_noprofile,
start = profile && new Date(),
words = this.parse(text),
results = [],
self = this;
//if (!n) return (process.nextTick(done),0);
return Promise
.all(words.map(exec))
.then(done);
function exec(word) {
return self[isFn]
.call(self, word, null, /*_noprofile*/ true)
.then(function collect(result) {
result && results.push(word);
});
}
function done(){
var args = [results];
profile && args.push(new Date() - start);
nextTick(callback, args);
return results;
}
};
}
// setImmediate executes callback AFTER promise handlers.
// Without it, exceptions in callback may be caught by Promise.
function nextTick(fn, args) {
if (fn) {
fn.apply(null, args);
}
}
/**
* @class WordPOS
* @param options {object} -- @see WordPOS.defaults
* @constructor
*/
var WordPOS = function(options) {
var dictPath;
this.options = _.defaults({}, _.isObject(options) && options || {}, {
dictPath: WNdb.path
}, WordPOS.defaults);
dictPath = this.options.dictPath;
this.nounIndex = new IndexFile(dictPath, 'noun');
this.verbIndex = new IndexFile(dictPath, 'verb');
this.adjIndex = new IndexFile(dictPath, 'adj');
this.advIndex = new IndexFile(dictPath, 'adv');
this.nounData = new DataFile(dictPath, 'noun');
this.verbData = new DataFile(dictPath, 'verb');
this.adjData = new DataFile(dictPath, 'adj');
this.advData = new DataFile(dictPath, 'adv');
// define randX() functions
require('./rand').init(this);
if (_.isArray(this.options.stopwords)) {
this.options.stopwords = makeStopwordString(this.options.stopwords);
}
};
WordPOS.defaults = {
/**
* path to WordNet data (override only if not using wordnet-db)
*/
dictPath: '',
/**
* enable profiling, time in msec returned as second argument in callback
*/
profile: false,
/**
* if true, exclude standard stopwords.
* if array, stopwords to exclude, eg, ['all','of','this',...]
* if false, do not filter any stopwords.
*/
stopwords: true
};
var wordposProto = WordPOS.prototype;
/**
* lookup a word in all indexes
*
* @param word {string} - search word
* @param callback {Function} (optional) - callback with (results, word) signature
* @returns {Promise}
*/
wordposProto.lookup = function(word, callback) {
var self = this,
results = [],
profile = this.options.profile,
start = profile && new Date(),
methods = ['lookupAdverb', 'lookupAdjective', 'lookupVerb', 'lookupNoun'];
return Promise
.all(methods.map(exec))
.then(done)
.catch(error);
function exec(method) {
return self[ method ]
.call(self, word)
.then(function collect(result){
results = results.concat(result);
});
}
function done() {
var args = [results, word];
profile && args.push(new Date() - start);
nextTick(callback, args);
return results;
}
function error(err) {
nextTick(callback, [[], word]);
throw err;
}
};
/**
* getPOS() - Find all POS for all words in given string
*
* @param text {string} - words to lookup for POS
* @param callback {function} (optional) - receives object with words broken into POS or 'rest', ie,
* Object: {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}
* @return Promise - resolve function receives data object
*/
wordposProto.getPOS = function(text, callback) {
var self = this,
data = {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]},
profile = this.options.profile,
start = profile && new Date(),
words = this.parse(text),
methods = ['getAdverbs', 'getAdjectives', 'getVerbs', 'getNouns'];
return Promise
.all(methods.map(exec))
.then(done)
.catch(error);
function exec(method) {
return self[ method ]
.call(self, text, null, true)
.then(function collect(results) {
// getAdjectives --> adjectives
var pos = method.replace('get','').toLowerCase();
data[ pos ] = results;
});
}
function done() {
var matches = _(data).chain()
.values()
.flatten()
.uniq()
.value(),
args = [data];
data.rest = _(words).difference(matches);
profile && args.push(new Date() - start);
nextTick(callback, args);
return data;
}
function error(err) {
nextTick(callback, []);
throw err;
}
};
/**
* get index and data files for given pos
*
* @param pos {string} - n/v/a/r
* @returns {object} - keys {index, data}
*/
wordposProto.getFilesFor = function (pos) {
switch(pos) {
case 'n':
return {index: this.nounIndex, data: this.nounData};
case 'v':
return {index: this.verbIndex, data: this.verbData};
case 'a': case 's':
return {index: this.adjIndex, data: this.adjData};
case 'r':
return {index: this.advIndex, data: this.advData};
}
return {};
};
/**
* lookupX() - Lookup word definition if already know POS
* @see lookup
*/
wordposProto.lookupAdjective = lookup('a');
wordposProto.lookupAdverb = lookup('r');
wordposProto.lookupNoun = lookup('n');
wordposProto.lookupVerb = lookup('v');
/**
* isX() - Test if word is given POS
* @see is
*/
wordposProto.isAdjective = is('a');
wordposProto.isAdverb = is('r');
wordposProto.isNoun = is('n');
wordposProto.isVerb = is('v');
/**
* getX() - Find all words in string that are given POS
* @see get
*/
wordposProto.getAdjectives = get('isAdjective');
wordposProto.getAdverbs = get('isAdverb');
wordposProto.getNouns = get('isNoun');
wordposProto.getVerbs = get('isVerb');
/**
* parse - get deduped, less stopwords
*
* @param text {string|array} - string of words to parse. If array is given, it is left in tact.
* @returns {array}
*/
wordposProto.parse = prepText;
/**
* seek - get record at offset for pos
*
* @param offset {number} - synset offset
* @param pos {string} - POS a/r/n/v
* @param callback {function} - optional callback
* @returns Promise
*/
wordposProto.seek = function(offset, pos, callback){
offset = Number(offset);
if (_.isNaN(offset) || offset <= 0) return error('offset must be valid positive number.');
var data = this.getFilesFor(pos).data;
if (!data) return error('Incorrect POS - 2nd argument must be a, r, n or v.');
return data.lookup(offset, callback);
function error(msg) {
var err = new Error(msg);
callback && callback(err, {});
return Promise.reject(err);
}
};
/**
* access to WordNet DB
* @type {object}
*/
WordPOS.WNdb = WNdb;
/**
* access to stopwords
* @type {Array}
*/
WordPOS.stopwords = stopwords;
module.exports = WordPOS;
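
For reference, a minimal sketch of the public WordPOS API defined above as used from Node; the same methods are exercised by the test file that follows, and the sample values are taken from those tests.

```js
const WordPOS = require('wordpos');
const wordpos = new WordPOS();

wordpos.getPOS('The angry bear chased the frightened little squirrel', (result) => {
  // result: { nouns: [...], verbs: [...], adjectives: [...], adverbs: [...], rest: [...] }
  console.log(result.nouns);
});

wordpos.isNoun('squirrel').then(isNoun => console.log(isNoun));   // true

wordpos.lookup('hegemony').then(defs => {
  console.log(defs[0].pos, defs[0].lemma, defs[0].synonyms);      // 'n' 'hegemony' [ ... ]
});

wordpos.seek(1285602, 'a', (err, rec) => console.log(rec.lemma)); // 'amazing'
```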


@ -1,7 +1,7 @@
/**
 * wordpos_test.js
 *
- * test file for main wordpos functionality
+ * test file for main wordpos functionality (both node and browser)
 *
 * Usage:
 *   npm install mocha -g
@ -11,21 +11,43 @@
 *
 *   npm test
 *
- * Copyright (c) 2012-2016 mooster@42at.com
+ * Copyright (c) 2012-2019 mooster@42at.com
 * https://github.com/moos/wordpos
 *
 * Released under MIT license
 */
-//import {describe, it} from 'mocha/lib/mocha.js';
+// used in src code to signal test mode
+global.window = global.window || {};
+global.window.__mocha = true;
var
  chai = require('chai'),
  _ = require('underscore'),
  assert = chai.assert,
+  browser = process.browser = process.argv.includes('@babel/register'),
  WordPOS = require('../src/wordpos'),
-  wordpos = new WordPOS({profile: false});
+  path = require('path'),
+  dictPath = browser ? path.resolve('./test/dict') : undefined,
+  wordpos = new WordPOS({
+    profile: false,
+    dictPath: dictPath,
+    // debug: true
+  });
+
+const assertNoData = (err) => {
+  assert(err instanceof RangeError);
+  assert(/No data at offset/.test(err.message));
+};
+
+const assertOffsetErr = (err) => {
+  assert(err instanceof RangeError);
+  assert.equal(err.message, 'Offset must be valid positive number: foobar');
+};
+
+console.log('Running', browser ? 'browser' : 'node', 'tests');

chai.config.showDiff = true;

var str = "The angry bear chased the frightened little squirrel",
@ -40,25 +62,23 @@ var str = "The angry bear chased the frightened little squirrel",
  offset = 1285602;

describe('lookup', function() {
-  it('with callback', function (done) {
-    wordpos.lookup('hegemony', function (result) {
+  it('with callback', function () {
+    return wordpos.lookup('hegemony', function (result) {
      assert.equal(result.length, 1);
      assert.equal(result[0].pos, 'n');
      assert.equal(result[0].lemma, 'hegemony');
      assert.equal(result[0].synonyms.length, 1);
-      done();
    });
  });

-  it('with Promise', function (done) {
-    wordpos.lookup('hegemony').then(function (result) {
+  it('with Promise', function () {
+    return wordpos.lookup('hegemony').then(function (result) {
      assert.equal(result.length, 1);
      assert.equal(result[0].pos, 'n');
      assert.equal(result[0].lemma, 'hegemony');
      assert.equal(result[0].synonyms.length, 1);
-      done();
    });
  });
});
@ -83,42 +103,38 @@ describe('options passed to constructor', function() {

describe('getX()...', function() {
-  it('should get all POS', function(done) {
-    wordpos.getPOS(str, function(result) {
+  it('should get all POS', function() {
+    return wordpos.getPOS(str, function(result) {
      assert.sameMembers(result.nouns, expected.nouns);
      assert.sameMembers(result.verbs, expected.verbs);
      assert.sameMembers(result.adjectives, expected.adjectives);
      assert.sameMembers(result.adverbs, expected.adverbs);
      assert.sameMembers(result.rest, expected.rest);
-      done();
    });
  });

-  it('should get nouns', function(done) {
-    wordpos.getNouns(str, function(result) {
+  it('should get nouns', function() {
+    return wordpos.getNouns(str, function(result) {
      assert.sameMembers(result, expected.nouns);
-      done();
    });
  });

-  it('should get verbs', function(done) {
-    wordpos.getVerbs(str, function(result) {
+  it('should get verbs', function() {
+    return wordpos.getVerbs(str, function(result) {
      assert.sameMembers(result, expected.verbs);
-      done();
    });
  });

-  it('should get adjectives', function(done) {
-    wordpos.getAdjectives(str, function(result) {
+  it('should get adjectives', function() {
+    return wordpos.getAdjectives(str, function(result) {
      assert.sameMembers(result, expected.adjectives);
-      done();
    });
  });

-  it('should get adverbs', function(done) {
-    wordpos.getAdverbs(str, function(result) {
+  it('should get adverbs', function() {
+    return wordpos.getAdverbs(str, function(result) {
      assert.sameMembers(result, expected.adverbs);
-      done();
    });
  });
});
@ -223,7 +239,7 @@ describe('lookupX()...', function() {

describe('profile option', function() {
-  var wp = new WordPOS({profile : true});
+  var wp = new WordPOS({profile : true, dictPath: dictPath});

  it('should return time argument for isX()', function(done){
    wp.isNoun(garble, function(result, word, time) {
@ -248,7 +264,7 @@ describe('profile option', function() {
  });

  it('should disable stopword filtering', function(done){
-    var wp = new WordPOS({stopwords : false}),
+    var wp = new WordPOS({stopwords : false, dictPath: dictPath}),
      strWithStopwords = 'about after all'; // 3 adjective stopwords
    wp.getAdjectives(strWithStopwords, function(result){
      assert.equal(result.length, 3);
@ -257,7 +273,7 @@ describe('profile option', function() {
  });

  it('should use custom stopwords', function(done){
-    var wp = new WordPOS({stopwords : ['all']}),
+    var wp = new WordPOS({stopwords : ['all'], dictPath: dictPath}),
      strWithStopwords = 'about after all'; // 3 adjective stopwords
    // 'all' should be filtered
    wp.getAdjectives(strWithStopwords, function(result){
@ -269,7 +285,7 @@ describe('profile option', function() {

describe('nested callbacks on same index key', function() {
-  var wp = new WordPOS(),
+  var wp = new WordPOS({dictPath: dictPath}),
    word1 = 'head',
    word2 = word1 + 'er';
@ -288,128 +304,86 @@ describe('nested callbacks on same index key', function() {

describe('rand()...', function() {
-  it('should get random word', function(done) {
-    wordpos.rand(function(result) {
+  it('should get random word', function() {
+    return wordpos.rand(function(result) {
      assert.equal(result.length, 1);
-      done();
    });
  });

-  it('should get N random words', function(done) {
-    wordpos.rand({count: 3}, function(result) {
+  it('should get N random words', function() {
+    return wordpos.rand({count: 3}, function(result) {
      assert.equal(result.length, 3);
-      done();
    });
  });

-  it('should get random word starting with', function(done) {
-    wordpos.rand({startsWith: 'foo'}, function(result, startsWith) {
+  it('should get random word starting with', function() {
+    return wordpos.rand({startsWith: 'foo'}, function(result, startsWith) {
      assert.equal(result[0].indexOf('foo'), 0);
      assert.equal(startsWith, 'foo');
-      done();
    });
  });

-  it('should get nothing starting with not found', function(done) {
-    wordpos.rand({startsWith: 'zzzz'}, function(result) {
+  it('should get nothing starting with not found', function() {
+    return wordpos.rand({startsWith: 'zzzz'}, function(result) {
      assert.equal(result.length, 0);
-      done();
    });
  });
});

describe('randX()...', function() {
-  it('should get random noun', function(done) {
-    wordpos.randNoun(function(result) {
-      assert.equal(result.length, 1);
-      done();
-    });
-  });
-
-  it('should get random verb', function(done) {
-    wordpos.randVerb(function(result) {
-      assert.equal(result.length, 1);
-      done();
-    });
-  });
-
-  it('should get random adjective', function(done) {
-    wordpos.randAdjective(function(result) {
-      assert.equal(result.length, 1);
-      done();
-    });
-  });
-
-  it('should get random adverb', function(done) {
-    wordpos.randAdverb(function(result) {
-      assert.equal(result.length, 1);
-      done();
-    });
-  });
+  let assertOneResult = (res) => {
+    assert.equal(res.length, 1);
+  };
+
+  it('should get random noun', () => wordpos.randNoun(assertOneResult));
+  it('should get random verb', () => wordpos.randVerb(assertOneResult));
+  it('should get random adjective', () => wordpos.randAdjective(assertOneResult));
+  it('should get random adverb', () => wordpos.randAdverb(assertOneResult));

  // not found
-  it('should NOT get random noun starting with', function(done) {
-    wordpos.randNoun({startsWith: 'zzzz'},function(result, startsWith) {
-      assert.equal(result.length, 0);
-      done();
-    });
-  });
+  it('should NOT get random noun starting with', () =>
+    wordpos.randNoun({startsWith: 'zzzz'}, (result, startsWith) =>
+      assert.equal(result.length, 0)
+    )
+  );
});

describe('seek()...', function() {
-  it('should seek offset', function(done) {
-    wordpos.seek(offset, 'a', function(err, result) {
+  it('should seek offset', function() {
+    return wordpos.seek(offset, 'a', function(err, result) {
      assert.equal(result.synsetOffset, offset);
      assert.equal(result.pos, 's');
      assert.equal(result.lemma, 'amazing');
-      done();
    });
  });

-  it('should handle bad offset', function(done) {
-    wordpos.seek('foobar', 'a', function(err, result){
-      assert(err instanceof Error);
-      assert.equal(err.message, 'offset must be valid positive number.');
-      done();
-    }).catch(_.noop); // UnhandledPromiseRejectionWarning
+  it('should handle bad offset', function() {
+    return wordpos.seek('foobar', 'a', assertOffsetErr).catch(assertOffsetErr);
  });

-  it('should handle wrong offset', function(done) {
-    var bad_offset = offset + 1;
-    wordpos.seek(bad_offset, 'a', function(err, result) {
-      assert(err instanceof Error);
-      assert.equal(err.message, 'Bad data at location ' + bad_offset);
-      assert.deepEqual(result, {});
-      done();
-    }).catch(_.noop); // UnhandledPromiseRejectionWarning;
+  it('should handle wrong offset', function() {
+    const bad_offset = offset + 1;
+    return wordpos.seek(bad_offset, 'a', assertNoData).catch(assertNoData);
  });

-  it('should handle very large offset', function(done) {
-    var bad_offset = offset + 100000000;
-    wordpos.seek(bad_offset, 'a', function(err, result) {
-      assert(err instanceof Error);
-      assert.equal(err.message, 'no data at offset ' + bad_offset);
-      assert.deepEqual(result, {});
-      done();
-    }).catch(_.noop); // UnhandledPromiseRejectionWarning;
+  it('should handle very large offset', function() {
+    const bad_offset = offset + 999999999;
+    return wordpos.seek(bad_offset, 'a', assertNoData).catch(assertNoData);
  });

-  it('should handle bad POS', function(done) {
-    wordpos.seek(offset, 'g', function(err, result) {
-      assert(err instanceof Error);
-      assert(/Incorrect POS/.test(err.message));
-      done();
-    }).catch(_.noop); // UnhandledPromiseRejectionWarning;
+  it('should handle bad POS', function() {
+    const assertErr = err => {
+      assert(err instanceof Error);
+      assert(/Incorrect POS/.test(err.message));
+    };
+    return wordpos.seek(offset, 'g', assertErr).catch(assertErr);
  });

-  it('should handle wrong POS', function(done) {
-    wordpos.seek(offset, 'v', function(err, result){
-      assert.equal(err.message, 'Bad data at location ' + offset);
-    }).catch(_.noop); // UnhandledPromiseRejectionWarning;
-    done();
+  it('should handle wrong POS', function() {
+    return wordpos.seek(offset, 'v', assertNoData).catch(assertNoData);
  });
});
@ -489,17 +463,11 @@ describe('Promise pattern', function() {
  });

  it('seek() - wrong offset', function () {
-    return wordpos.seek(offset + 1, 'a').catch(function (err) {
-      assert(err instanceof Error);
-      assert.equal(err.message, 'Bad data at location ' + (offset+1));
-    });
+    return wordpos.seek(offset + 1, 'a').catch(assertNoData);
  });

  it('seek() - bad offset', function () {
-    return wordpos.seek('foobar', 'a').catch(function (err) {
-      assert(err instanceof Error);
-      assert.equal(err.message, 'offset must be valid positive number.');
-    });
+    return wordpos.seek('foobar', 'a').catch(assertOffsetErr);
  });
});


@ -46,7 +46,7 @@
 * Released under MIT license
 */
var
-  WNdb = require('../src/wordpos').WNdb,
+  WNdb = require('wordnet-db'),
  util = require('util'),
  BufferedReader = require ('./buffered-reader'),
  _ = require('underscore')._,