From 2001182b7a95c87d72b1311270d2875ab232d84c Mon Sep 17 00:00:00 2001 From: Moos Date: Mon, 18 Jan 2016 00:09:56 -0800 Subject: [PATCH] Major update - first v1.0 checkin. --- .travis.yml | 4 +- README.md | 104 ++-- bench/text-128.txt | 13 - bench/wordpos-bench.js | 54 +- bin/wordpos-cli.js | 5 +- lib/natural/trie/trie.js | 231 +++++++++ lib/natural/util/stopwords.js | 41 ++ package.json | 15 +- spec/validate_spec.js | 53 -- src/dataFile.js | 194 ++++++++ src/fastIndex.js | 349 ------------- src/indexFile.js | 231 +++++++++ src/piper.js | 83 ++++ src/rand.js | 246 ++++++++++ src/wordpos.js | 487 ++++++++++--------- test.js | 40 ++ test/validate_test.js | 53 ++ spec/wordpos_spec.js => test/wordpos_test.js | 248 ++++++---- 18 files changed, 1647 insertions(+), 804 deletions(-) delete mode 100644 bench/text-128.txt create mode 100644 lib/natural/trie/trie.js create mode 100644 lib/natural/util/stopwords.js delete mode 100644 spec/validate_spec.js create mode 100644 src/dataFile.js delete mode 100644 src/fastIndex.js create mode 100644 src/indexFile.js create mode 100644 src/piper.js create mode 100644 src/rand.js create mode 100644 test.js create mode 100644 test/validate_test.js rename spec/wordpos_spec.js => test/wordpos_test.js (53%) diff --git a/.travis.yml b/.travis.yml index 0213cd0..b7484b6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,7 @@ language: node_js node_js: - - "stable" + - '5' + - '4' + - '0.12' before_script: - npm install -g jasmine-node \ No newline at end of file diff --git a/README.md b/README.md index 493b899..097c301 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,10 @@ wordpos [![NPM version](https://img.shields.io/npm/v/wordpos.svg)](https://www.npmjs.com/package/wordpos) [![Build Status](https://img.shields.io/travis/moos/wordpos/master.svg)](https://travis-ci.org/moos/wordpos) -wordpos is a set of *fast* part-of-speech (POS) utilities for Node.js using [natural's](http://github.com/NaturalNode/natural) WordNet module, offering **30x** performance over natural. +wordpos is a set of *fast* part-of-speech (POS) utilities for Node.js using fast lookup in the WordNet database. +Version 1.x is a mojor update with no direct depedence on [natural's](http://github.com/NaturalNode/natural), with support for Promises, and roughly 5x speed improvement over previous version. + **CAUTION** The WordNet database [wordnet-db](https://github.com/moos/wordnet-db) comprises [155,287 words](http://wordnet.princeton.edu/wordnet/man/wnstats.7WN.html) (3.0 numbers) which uncompress to over **30 MB** of data in several *un*[browserify](https://github.com/substack/node-browserify)-able files. It is *not* meant for the browser environment. @@ -47,12 +49,10 @@ British npm install -g wordpos -To run spec: (or just: npm test) +To run test: (or just: npm test) - npm install -g jasmine-node - cd spec - jasmine-node wordpos_spec.js --verbose - jasmine-node validate_spec.js --verbose + npm install -g mocha + mocha test ### Options @@ -63,11 +63,6 @@ WordPOS.defaults = { */ profile: false, - /** - * use fast index if available - */ - fastIndex: true, - /** * if true, exclude standard stopwords. * if array, stopwords to exclude, eg, ['all','of','this',...] @@ -86,7 +81,7 @@ To override, pass an options hash to the constructor. With the `profile` option, ## API -Please note: all API are *async* since the underlying WordNet library is async. WordPOS is a subclass of natural's [WordNet class](https://github.com/NaturalNode/natural#wordnet) and inherits all its methods. 
+Please note: all API are *async* since the underlying WordNet library is async. #### getPOS(text, callback) #### getNouns(text, callback) @@ -99,22 +94,21 @@ Get part-of-speech from `text`. `callback(results)` receives and array of words ``` wordpos.getPOS(text, callback) -- callback receives a result object: { - nouns:[], Array of text words that are nouns - verbs:[], Array of text words that are verbs - adjectives:[], Array of text words that are adjectives - adverbs:[], Array of text words that are adverbs - rest:[] Array of text words that are not in dict or could not be categorized as a POS + nouns:[], Array of words that are nouns + verbs:[], Array of words that are verbs + adjectives:[], Array of words that are adjectives + adverbs:[], Array of words that are adverbs + rest:[] Array of words that are not in dict or could not be categorized as a POS } Note: a word may appear in multiple POS (eg, 'great' is both a noun and an adjective) ``` If you're only interested in a certain POS (say, adjectives), using the particular getX() is faster -than getPOS() which looks up the word in all index files. [stopwords](https://github.com/NaturalNode/natural/blob/master/lib/natural/util/stopwords.js) -are stripped out from text before lookup. +than getPOS() which looks up the word in all index files. [stopwords](https://github.com/moos/wordpos/lib/natural/util/stopwords.js)are stripped out from text before lookup. -If `text` is an *array*, all words are looked-up -- no deduplication, stopword filter or tokenization is applied. +If `text` is an *array*, all words are looked-up -- no deduplication, stopword filtering or tokenization is applied. -getX() functions (immediately) return the *number* of parsed words that *will be* looked up (less duplicates and stopwords). +getX() functions return a Promise. Example: @@ -141,7 +135,7 @@ would be considered nouns. #### isAdjective(word, callback) #### isAdverb(word, callback) -Determine if `word` is a particular POS. `callback(result, word)` receives true/false as first argument and the looked-up word as the second argument. +Determine if `word` is a particular POS. `callback(result, word)` receives true/false as first argument and the looked-up word as the second argument. The resolved Promise receives true/false. Examples: @@ -159,13 +153,13 @@ wordpos.isAdverb('fishly', console.log); // false 'fishly' ``` +#### lookup(word, callback) #### lookupNoun(word, callback) #### lookupVerb(word, callback) #### lookupAdjective(word, callback) #### lookupAdverb(word, callback) -These calls are similar to natural's [lookup()](https://github.com/NaturalNode/natural#wordnet) call, except they can be faster if you -already know the POS of the word. Signature of the callback is `callback(result, word)` where `result` is an *array* of lookup object(s). +Get complete definition object for `word`. The lookupX() variants can be faster if you already know the POS of the word. Signature of the callback is `callback(result, word)` where `result` is an *array* of lookup object(s). Example: @@ -183,14 +177,8 @@ wordpos.lookupAdjective('awesome', console.log); gloss: 'inspiring awe or admiration or wonder; awing majesty, so vast, so high, so silent" ' } ], 'awesome' ``` -In this case only one lookup was found. But there could be several. +In this case only one lookup was found, but there could be several. -Or use WordNet's (slower) inherited method: - -```js -wordpos.lookup('great', console.log); -// ... 
-``` #### rand(options, callback) #### randNoun(options, callback) @@ -223,11 +211,9 @@ wordpos.rand({starsWith: 'zzz'}, console.log) // [] 'zzz' ``` -**Note on performance**: random lookups could involve heavy disk reads. It is better to use the `count` option to get words -in batches. This may benefit from the cached reads of similarly keyed entries as well as shared open/close of the index files. +**Note on performance**: random lookups could involve heavy disk reads. It is better to use the `count` option to get words in batches. This may benefit from the cached reads of similarly keyed entries as well as shared open/close of the index files. -Getting random POS (`randNoun()`, etc.) is generally faster than `rand()`, which may look at multiple POS files until `count` requirement -is met. +Getting random POS (`randNoun()`, etc.) is generally faster than `rand()`, which may look at multiple POS files until `count` requirement is met. #### parse(text) Returns tokenized array of words in `text`, less duplicates and stopwords. This method is called on all getX() calls internally. @@ -236,16 +222,22 @@ Returns tokenized array of words in `text`, less duplicates and stopwords. This #### WordPOS.WNdb Access to the [wordnet-db](https://github.com/moos/wordnet-db) object containing the dictionary & index files. -#### WordPOS.natural -Access to underlying [natural](https://github.com/NaturalNode/natural) module. For example, WordPOS.natural.stopwords is the list of stopwords. +#### WordPOS.stopwords +Access the array of stopwords. +## Promises + +TODO + ## Fast Index Version 0.1.4 introduces `fastIndex` option. This uses a secondary index on the index files and is much faster. It is on by default. Secondary index files are generated at install time and placed in the same directory as WNdb.path. Details can be found in tools/stat.js. Fast index improves performance **30x** over Natural's native methods. See blog article [Optimizing WordPos](http://blog.42at.com/optimizing-wordpos). +As of version 1.0, the fast index option is always on and cannot be turned off. + ## Command-line: CLI For CLI usage and examples, see [bin/README](bin). @@ -281,8 +273,46 @@ done in 1375 msecs 220 words are looked-up (less stopwords and duplicates) on a win7/64-bit/dual-core/3GHz. getPOS() is slowest as it searches through all four index files. +### Version 1.0 Benchmark + +Re-run v0.1.16: +``` + getPOS : 11 ops/s { iterations: 1, elapsed: 90 } + getNouns : 21 ops/s { iterations: 1, elapsed: 47 } + getVerbs : 53 ops/s { iterations: 1, elapsed: 19 } + getAdjectives : 29 ops/s { iterations: 1, elapsed: 34 } + getAdverbs : 83 ops/s { iterations: 1, elapsed: 12 } + lookup : 1 ops/s { iterations: 1, elapsed: 720 } + lookupNoun : 1 ops/s { iterations: 1, elapsed: 676 } + +looked up 220 words +done in 2459 msecs +``` + +V1.0: +``` + getPOS : 14 ops/s { iterations: 1, elapsed: 73 } + getNouns : 26 ops/s { iterations: 1, elapsed: 38 } + getVerbs : 42 ops/s { iterations: 1, elapsed: 24 } + getAdjectives : 24 ops/s { iterations: 1, elapsed: 42 } + getAdverbs : 26 ops/s { iterations: 1, elapsed: 38 } + lookup : 6 ops/s { iterations: 1, elapsed: 159 } + lookupNoun : 13 ops/s { iterations: 1, elapsed: 77 } + +looked up 221 words +done in 1274 msecs +``` +That's roughly **2x** better across the board. Functions that read the data files see much improved performance: `lookup` about **5x** and `lookupNoun` over **8x**. + + ## Changes +1.0.1 + - Removed direct dependency on Natural. Certain modules are included in /lib. 
+ - Add support for Promises. + - Improved data file reads for up to **5x** performance increase. + - Tests are now mocha-based with assert interface. + 0.1.16 - Changed dependency to wordnet-db (renamed from WNdb) diff --git a/bench/text-128.txt b/bench/text-128.txt deleted file mode 100644 index d31e5b5..0000000 --- a/bench/text-128.txt +++ /dev/null @@ -1,13 +0,0 @@ -That's why, working with our military leaders, I have proposed a new -defense strategy that ensures we maintain the finest military in the -world, while saving nearly half a trillion dollars in our budget. To -stay one step ahead of our adversaries, I have already sent this -Congress legislation that will secure our country from the growing -danger of cyber-threats. - -Above all, our freedom endures because of the men and women in uniform -who defend it. As they come home, we must serve them as well as they -served us. That includes giving them the care and benefits they have -earned – which is why we've increased annual VA spending every year -I've been President. And it means enlisting our veterans in the work -of rebuilding our Nation. diff --git a/bench/wordpos-bench.js b/bench/wordpos-bench.js index 2821c74..715cb6c 100644 --- a/bench/wordpos-bench.js +++ b/bench/wordpos-bench.js @@ -5,9 +5,10 @@ var uubench = require('uubench'), // from: https://github.com/moos/uubench WordPOS = require('../src/wordpos'), wordpos = new WordPOS(); + suite = new uubench.Suite({ type: 'fixed', - iterations: 10, + iterations: 1, sync: true, // important! start: function(tests){ @@ -23,7 +24,7 @@ suite = new uubench.Suite({ }, done: function(time){ - console.log('looked up %d words', nwords); + console.log('looked up %d words, %d found', nwords, found); console.log('done in %d msecs', time ); }, @@ -34,20 +35,21 @@ suite = new uubench.Suite({ function out(res){ - return _(res).keys().map(function(k){ return k + ':' + res[k].length }); + return _(res).keys().map(function(k){ + return k + ':' + res[k].length + }); } - -var text1 = 'laksasdf', -// text128 = fs.readFileSync('text-128.txt', 'utf8'), - text512 = fs.readFileSync('text-512.txt', 'utf8'), - text, nwords, +var + text = fs.readFileSync('text-512.txt', 'utf8'), + parsedText = wordpos.parse(text), + nwords = parsedText.length, pos; function getPOS(next){ - nwords = wordpos.getPOS(text, function(res){ + wordpos.getPOS(text, function(res){ pos = res; next(); }); @@ -81,20 +83,31 @@ function getAdverbs(next){ }); } -suite.section('--1 word--', function(next){ - text = text1; - next(); -}); -suite.bench('getPOS', getPOS); -suite.bench('getNouns', getNouns); -suite.bench('getVerbs', getVerbs); -suite.bench('getAdjectives', getAdjectives); -suite.bench('getAdverbs', getAdverbs); +function lookup(next){ + var count = nwords; + found = 0; + parsedText.forEach(function(word) { + wordpos.lookup(word, function (res) { + res.length && ++found; + if (--count === 0) next(); + }); + }); +} + +function lookupNoun(next){ + var count = nwords; + found = 0; + parsedText.forEach(function(word) { + wordpos.lookupNoun(word, function (res) { + res.length && ++found; + if (--count === 0) next(); + }); + }); +} suite.section('--512 words--', function(next){ suite.options.iterations = 1; - text = text512; next(); }); suite.bench('getPOS', getPOS); @@ -102,6 +115,9 @@ suite.bench('getNouns', getNouns); suite.bench('getVerbs', getVerbs); suite.bench('getAdjectives', getAdjectives); suite.bench('getAdverbs', getAdverbs); +suite.bench('lookup', lookup); +suite.bench('lookupNoun', lookupNoun); + suite.run(); diff 
--git a/bin/wordpos-cli.js b/bin/wordpos-cli.js index cbd9fb0..18f983a 100644 --- a/bin/wordpos-cli.js +++ b/bin/wordpos-cli.js @@ -98,7 +98,7 @@ program.command('stopwords') .action(function(){ cmd = _.last(arguments)._name; rawCmd = rawCmd || cmd; - var stopwords = WordPos.natural.stopwords; + var stopwords = WordPos.stopwords; if (program.json) output(stopwords); @@ -184,7 +184,6 @@ function run(data) { _(fns).each(function(fn){ var method = cmd + fn + plural, cb = _.bind(collect, null, fn); - if (cmd == 'get') { wordpos[method](words, cb); } else if (cmd == 'rand') { @@ -194,7 +193,7 @@ function run(data) { }); } else { words.forEach(function(word){ - wordpos [method](word, cb); + wordpos[method](word, cb); }); } }); diff --git a/lib/natural/trie/trie.js b/lib/natural/trie/trie.js new file mode 100644 index 0000000..ab01621 --- /dev/null +++ b/lib/natural/trie/trie.js @@ -0,0 +1,231 @@ +/* +Copyright (c) 2014 Ken Koch + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** + * The basis of the TRIE structure. + **/ +function Trie(caseSensitive) { + this.dictionary = {}; + this.$ = false; + + if(typeof caseSensitive === "undefined") { + caseSensitive = true; + } + + this.cs = caseSensitive; +} + +/** + * Add a single string to the TRIE, returns true if the word was already in the + * trie. + **/ +Trie.prototype.addString = function(string) { + if(this.cs === false) { + string = string.toLowerCase(); + } + + // If the string has only one letter, mark this as a word. 
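+  // (An empty remaining string means every character of the word has been
+  // consumed along this path, so this node is marked as the end of a word.)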
+ if(string.length === 0) { + var wasWord = this.$; + this.$ = true; + return wasWord; + } + + // Make sure theres a Trie node in our dictionary + var next = this.dictionary[string[0]]; + + if(!next) { + this.dictionary[string[0]] = new Trie(this.cs); + next = this.dictionary[string[0]]; + } + + // Continue adding the string + return next.addString(string.substring(1)); +}; + +/** + * Add multiple strings to the TRIE + **/ +Trie.prototype.addStrings = function(list) { + for(var i in list) { + this.addString(list[i]); + } +}; + +/** + * A function to search the TRIE and return an array of + * words which have same prefix + * for example if we had the following words in our database: + * a, ab, bc, cd, abc, abd + * and we search the string: a + * we will get : + * [a, ab, abc, abd] + **/ +Trie.prototype.keysWithPrefix = function(prefix) { + if(this.caseSensitive === false) { + prefix = prefix.toLowerCase(); + } + + function isEmpty (object) { + for (var key in object) if (object.hasOwnProperty(key)) return false; + return true; + } + + function get (node, word) { + if(!node) return null; + if(word.length == 0) return node; + return get(node.dictionary[word[0]], word.substring(1)); + } + + function recurse ( node, stringAgg, resultsAgg) { + if (!node) return; + + // Check if this is a word + if (node.$) { + resultsAgg.push(stringAgg); + } + + if (isEmpty(node.dictionary)) { + return ; + } + + for (var c in node.dictionary) { + recurse (node.dictionary[c],stringAgg + c, resultsAgg); + } + } + + var results = []; + recurse (get(this, prefix), prefix, results); + return results; +}; + +/** + * A function to search the given string and return true if it lands + * on on a word. Essentially testing if the word exists in the database. + **/ +Trie.prototype.contains = function(string) { + if(this.cs === false) { + string = string.toLowerCase(); + } + + if(string.length === 0) { + return this.$; + } + + // Otherwise, we need to continue searching + var firstLetter = string[0]; + var next = this.dictionary[firstLetter]; + + // If we don't have a node, this isn't a word + if(!next) { + return false; + } + + // Otherwise continue the search at the next node + return next.contains(string.substring(1)); +} + +/** + * A function to search the TRIE and return an array of words which were encountered along the way. + * This will only return words with full prefix matches. + * for example if we had the following words in our database: + * a, ab, bc, cd, abc + * and we searched the string: abcd + * we would get only: + * [a, ab, abc] + **/ +Trie.prototype.findMatchesOnPath = function(search) { + if(this.cs === false) { + search = search.toLowerCase(); + } + + function recurse(node, search, stringAgg, resultsAgg) { + // Check if this is a word. + if(node.$) { + resultsAgg.push(stringAgg); + } + + // Check if the have completed the seearch + if(search.length === 0) { + return resultsAgg; + } + + // Otherwise, continue searching + var next = node.dictionary[search[0]]; + if(!next) { + return resultsAgg; + } + return recurse(next, search.substring(1), stringAgg + search[0], resultsAgg); + }; + + return recurse(this, search, "", []); +}; + +/** + * Returns the longest match and the remaining part that could not be matched. + * inspired by [NLTK containers.trie.find_prefix](http://nltk.googlecode.com/svn-/trunk/doc/api/nltk.containers.Trie-class.html). 
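+ *
+ * Returns a two-element array: [longest matched word or null, unmatched remainder].
+ * Illustrative example: with 'a', 'ab' and 'abc' in the trie, findPrefix('abcd')
+ * yields ['abc', 'd']; if no prefix matches, the first element is null.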
+ **/ +Trie.prototype.findPrefix = function(search) { + if(this.cs === false) { + search = search.toLowerCase(); + } + + function recurse(node, search, stringAgg, lastWord) { + // Check if this is a word + if(node.$) { + lastWord = stringAgg; + } + + // Check if we have no more to search + if(search.length === 0) { + return [lastWord, search]; + } + + // Continue searching + var next = node.dictionary[search[0]]; + if(!next) { + return [lastWord, search]; + } + return recurse(next, search.substring(1), stringAgg + search[0], lastWord); + }; + + return recurse(this, search, "", null); +}; + +/** + * Computes the number of actual nodes from this node to the end. + * Note: This involves traversing the entire structure and may not be + * good for frequent use. + **/ +Trie.prototype.getSize = function() { + var total = 1; + for(var c in this.dictionary) { + total += this.dictionary[c].getSize(); + } + return total; +}; + +/** + * EXPORT THE TRIE + **/ +module.exports = Trie; + diff --git a/lib/natural/util/stopwords.js b/lib/natural/util/stopwords.js new file mode 100644 index 0000000..099b7d0 --- /dev/null +++ b/lib/natural/util/stopwords.js @@ -0,0 +1,41 @@ +/* +Copyright (c) 2011, Chris Umbel + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +// a list of commonly used words that have little meaning and can be excluded +// from analysis. +var words = [ + 'about', 'after', 'all', 'also', 'am', 'an', 'and', 'another', 'any', 'are', 'as', 'at', 'be', + 'because', 'been', 'before', 'being', 'between', 'both', 'but', 'by', 'came', 'can', + 'come', 'could', 'did', 'do', 'each', 'for', 'from', 'get', 'got', 'has', 'had', + 'he', 'have', 'her', 'here', 'him', 'himself', 'his', 'how', 'if', 'in', 'into', + 'is', 'it', 'like', 'make', 'many', 'me', 'might', 'more', 'most', 'much', 'must', + 'my', 'never', 'now', 'of', 'on', 'only', 'or', 'other', 'our', 'out', 'over', + 'said', 'same', 'see', 'should', 'since', 'some', 'still', 'such', 'take', 'than', + 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they', 'this', 'those', + 'through', 'to', 'too', 'under', 'up', 'very', 'was', 'way', 'we', 'well', 'were', + 'what', 'where', 'which', 'while', 'who', 'with', 'would', 'you', 'your', + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', + 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '$', '1', + '2', '3', '4', '5', '6', '7', '8', '9', '0', '_']; + +// tell the world about the noise words. 
+exports.words = words; diff --git a/package.json b/package.json index 5779e82..9d54917 100644 --- a/package.json +++ b/package.json @@ -2,21 +2,22 @@ "name": "wordpos", "author": "Moos ", "keywords": ["natural", "language", "wordnet", "adjectives", "nouns", "adverbs", "verbs"], - "description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.", - "version": "0.1.16", + "description": "wordpos is a set of part-of-speech utilities for Node.js using the WordNet database.", + "version": "1.0.0-RC1", "homepage": "https://github.com/moos/wordpos", "engines": { - "node": ">=0.6" + "node": ">=0.12" }, "bin": "./bin/wordpos-cli.js", "dependencies": { - "natural": "~0.1", "underscore": ">=1.3.1", "wordnet-db": "latest", - "commander": "1.1.1" + "commander": "^2.0.0" }, "devDependencies": { - "uubench": "git://github.com/moos/uubench.git" + "uubench": "git://github.com/moos/uubench.git", + "chai": "*", + "mocha": "*" }, "repository" : { "type" : "git", @@ -25,7 +26,7 @@ "main": "./src/wordpos.js", "scripts": { "postinstall": "node tools/stat.js --no-stats index.adv index.adj index.verb index.noun", - "test": "jasmine-node spec/wordpos_spec.js spec/validate_spec.js --verbose" + "test": "mocha test" }, "license": "MIT" } diff --git a/spec/validate_spec.js b/spec/validate_spec.js deleted file mode 100644 index de178c2..0000000 --- a/spec/validate_spec.js +++ /dev/null @@ -1,53 +0,0 @@ -/** - * validate_spec.js - * - * Run validate on all four main index files - * - * Usage: - * npm install jasmine-node -g - * jasmine-node validate_spec.js --verbose - * - * Copyright (c) 2012 mooster@42at.com - * https://github.com/moos/wordpos - * - * Released under MIT license - */ -var - exec = require('child_process').exec, - cmd = 'node ' + __dirname + '/../tools/validate '; - - -// increase timeout -jasmine.asyncSpecWait.timeout = 20 * 1000; - - -describe('validate isX() using fastIndex', function() { - - it('should validate index.noun', function() { - exec(cmd + 'index.noun', callback); - asyncSpecWait(); - }); - - it('should validate index.verb', function() { - exec(cmd + 'index.verb', callback); - asyncSpecWait(); - }); - - it('should validate index.adv', function() { - exec(cmd + 'index.adv', callback); - asyncSpecWait(); - }); - - it('should validate index.adj', function() { - exec(cmd + 'index.adj', callback); - asyncSpecWait(); - }); - -}); - -function callback(error, stdout, stderr) { - expect(error).toBe(null); - console.log(stdout); - console.error(stderr); - asyncSpecDone(); -} \ No newline at end of file diff --git a/src/dataFile.js b/src/dataFile.js new file mode 100644 index 0000000..63a870c --- /dev/null +++ b/src/dataFile.js @@ -0,0 +1,194 @@ + +var fs = require('fs'), + path = require('path'), + _ = require('underscore'); + + +// courtesy of natural.WordNet +// TODO link +function lineDataToJSON(line) { + var data = line.split('| '), + tokens = data[0].split(/\s+/), + ptrs = [], + wCnt = parseInt(tokens[3], 16), + synonyms = []; + + for(var i = 0; i < wCnt; i++) { + synonyms.push(tokens[4 + i * 2]); + } + + var ptrOffset = (wCnt - 1) * 2 + 6; + for(var i = 0; i < parseInt(tokens[ptrOffset], 10); i++) { + ptrs.push({ + pointerSymbol: tokens[ptrOffset + 1 + i * 4], + synsetOffset: parseInt(tokens[ptrOffset + 2 + i * 4], 10), + pos: tokens[ptrOffset + 3 + i * 4], + sourceTarget: tokens[ptrOffset + 4 + i * 4] + }); + } + + // break "gloss" into definition vs. 
examples + var glossArray = data[1].split("; "); + var definition = glossArray[0]; + var examples = glossArray.slice(1); + + for (var k = 0; k < examples.length; k++) { + examples[k] = examples[k].replace(/\"/g,'').replace(/\s\s+/g,''); + } + + return { + synsetOffset: parseInt(tokens[0], 10), + lexFilenum: parseInt(tokens[1], 10), + pos: tokens[2], + wCnt: wCnt, + lemma: tokens[4], + synonyms: synonyms, + lexId: tokens[5], + ptrs: ptrs, + gloss: data[1], + def: definition, + exp: examples + }; +} + + +function readLocation(location, callback) { + //console.log('## read location ', this.fileName, location); + + var + file = this, + str = '', + len = file.nominalLineLength, + buffer = new Buffer(len); + + readChunk(location, function(err, count) { + if (err) { + console.log(err); + callback(err); + return; + } + //console.log(' read %d bytes at <%d>', count, location); + //console.log(str); + + callback(null, lineDataToJSON(str)); + }); + + function readChunk(pos, cb) { + fs.read(file.fd, buffer, 0, len, pos, function (err, count) { + str += buffer.toString('ascii'); + var eol = str.indexOf('\n'); + + //console.log(' -- read %d bytes at <%d>', count, pos, eol); + + if (eol === -1 && len < file.maxLineLength) { + return readChunk(pos + count, cb); + } + + str = str.substr(0, eol); + cb(err, count); + }); + } +} + +function lookup(record, callback) { + var results = [], + self = this, + offsets = record.synsetOffset; + + return new Promise(function(resolve, reject) { + //console.log('data lookup', record); + + offsets + .map(function (offset) { + return _.partial(readLocation.bind(self), offset); + }) + .map(promisifyInto(results)) + .reduce(serialize, openFile()) + .then(done) + .catch(done); + + function done(lastResult) { + closeFile(); + //console.log('done promise -- '); + if (lastResult instanceof Error) { + callback && callback(lastResult, []); + reject(lastResult); + } else { + callback && callback(null, results); + resolve(results); + } + } + }); + + function serialize(prev, next) { + return prev.then(next); + } + + function openFile() { + if (!self.fd) { + //console.log(' ... opening', self.filePath); + self.fd = fs.openSync(self.filePath, 'r'); + } + + // ref count so we know when to close the main index file + ++self.refcount; + return Promise.resolve(); + } + + function closeFile() { + if (--self.refcount === 0) { + //console.log(' ... closing', self.filePath); + fs.close(self.fd); + self.fd = null; + } + return Promise.resolve(); + } +} + + +function promisifyInto(collect) { + return function(fn) { + return function() { + return new Promise(function (resolve, reject) { + fn(function (error, result) { // Note callback signature! + //console.log('cb from get', arguments) + if (error) { + reject(error); + } + else { + collect && collect.push(result); + resolve(result); + } + }); + }); + }; + } +} + + + +var DataFile = function(dictPath, name) { + this.dictPath = dictPath; + this.fileName = 'data.' 
+ name; + this.filePath = path.join(this.dictPath, this.fileName); + + this.maxLineLength = DataFile.MAX_LINE_LENGTH[ name ]; + this.nominalLineLength = MAX_SINGLE_READ_LENGTH; + this.refcount = 0; +}; + +// maximum read length at a time +var MAX_SINGLE_READ_LENGTH = 512; + +//DataFile.prototype.get = get; +DataFile.prototype.lookup = lookup; + +// e.g.: wc -L data.adv as of v3.1 +DataFile.MAX_LINE_LENGTH = { + noun: 12972, + verb: 7713, + adj: 2794, + adv: 638 +}; + +module.exports = DataFile; diff --git a/src/fastIndex.js b/src/fastIndex.js deleted file mode 100644 index c13d191..0000000 --- a/src/fastIndex.js +++ /dev/null @@ -1,349 +0,0 @@ -/*! - * fastIndex.js - * - * override natural.WordNet's IndexFile to use fast index data - * - * Copyright (c) 2012-2014 mooster@42at.com - * https://github.com/moos/wordpos - * - * Released under MIT license - */ - -var _ = require('underscore')._, - util = require('util'), - path = require('path'), - fs = require('fs'), - KEY_LENGTH = 3; - -/** - * load fast index bucket data - * - * @param dir {string} - dir path of index files - * @param name {string} - name of index file, eg, 'index.verb' - * @returns {Object} - fast index data object - */ -function loadFastIndex(dir, name) { - var jsonFile = path.join(dir, 'fast-' + name + '.json'), - data = null; - try{ - data = JSON.parse( fs.readFileSync(jsonFile,'utf8') ); - //console.log('loaded %d buckets for %s', data.stats.buckets, data.name); - } catch(e) { - console.error('Error with fast index file. Try reinstalling from npm!'); - throw e; - } - return data; -} - -/** - * read index file using fast index data at key - * - * @param key - 3-char key into fast index - * @param index - index object - * @param callback - function receives buffer of data read - * @returns none - */ -function readIndexForKey(key, index, callback) { - var data = index.fastIndex, - offset = data.offsets[key][0], - nextKey = data.offsets[key][1], - nextOffset = data.offsets[nextKey][0], - len = nextOffset - offset - 1, - buffer = new Buffer(len); - - fs.read(index.fd, buffer, 0, len, offset, function(err, count){ - if (err) return console.log(err); - //console.log(' read %d bytes for <%s>', count, key); - callback(buffer); - }); -} - - -/** - * read index file using fast index data at keyStart to keyEnd (inclusive) - * - * @param keyStart {string} - 3-char key into fast index to begin at - * @param keyEnd {string|null} - 3-char key into fast index to end at. If null, reads to next key. - * @param index - index object - * @param callback - function receives buffer of data read - * @returns none - */ -function readIndexBetweenKeys(keyStart, keyEnd, index, callback) { - var data = index.fastIndex, - offset = data.offsets[keyStart][0], - end = keyEnd || keyStart, - nextKey = data.offsets[end][1], - nextOffset = data.offsets[nextKey][0], - len = nextOffset - offset - 1, - buffer = new Buffer(len); - - //console.log('### readIndexBetweenKeys', keyStart, keyEnd, nextKey, len) - fs.read(index.fd, buffer, 0, len, offset, function(err, count){ - if (err) return console.log(err); - // console.log(' read %d bytes for <%s>', count, keyStart); - callback(buffer); - }); -} - -/** - * run single 'task' method sharing callbacks. Method MUST take callback as LAST arg. - * piper is bound to an index. - * - * @param task {string} - task name unique to method! - * @param method {function} - method to execute, gets (args, ... 
, callback) - * @param args {array} - args to pass to method - * @param context {object} - other params to remember and sent to callback - * @param callback {function} - result callback - */ -function piper(task, method, args, context, callback){ - var readCallbacks = this.callbackQueue, - memoArgs = _.rest(arguments, 2), - wrappedCallback; - - // console.log('piper', task, args[0], context[0]); - - // queue up if already reading file for this task - if (task in readCallbacks){ - readCallbacks[task].push(memoArgs); - return; - } - readCallbacks[task] = [memoArgs]; - - if (!this.fd) { - //console.log(' ... opening', this.filePath); - this.fd = fs.openSync(this.filePath, 'r'); - } - - // ref count so we know when to close the main index file - ++this.refcount; - - wrappedCallback = _.partial(piper.wrapper, this, task); - - // call method -- replace original callback (last arg) with wrapped one - method.apply(null, [].concat( args, wrappedCallback )); -} - -// result is the *same* for same task -piper.wrapper = function(self, task, result){ - var readCallbacks = self.callbackQueue, - callback, args; - - // live access callbacks cache in case nested cb's - // add to the array. - while (args = readCallbacks[task].shift()) { - callback = args.pop(); // last arg MUST be callback - -// console.log('>>>> pper wrapper', self.fastIndex.name, task, result.toString()) - callback.apply(null, [].concat(_.flatten(args, /*shallow*/true), result)); - } - - // now done - delete cb cache - delete readCallbacks[task]; - - if (--self.refcount === 0) { - //console.log(' ... closing', self.filePath); - fs.close(self.fd); - self.fd = null; - } -}; - -/** - * function that overrides WordNet's IndexFile.find() - * - * calls to same bucket are queued for callback. - * - * @param search {string} - word to search for - * @param callback {function} - callback receives found line and tokens - * @returns none - */ -function find(search, callback) { - var self = this, - data = this.fastIndex, - readCallbacks = this.callbackQueue, - miss = {status: 'miss'}; - - var key = search.slice(0, KEY_LENGTH); - if (!(key in data.offsets)) return process.nextTick(function(){ callback(miss) }); - - // prepare the piper - var task = 'find' + key, - args = [key, this], - context = [search, callback]; // last arg MUST be callback - - // pay the piper - this.piper(task, readIndexForKey, args, context, collector); - - function collector(key, index, search, callback, buffer){ - var lines = buffer.toString().split('\n'), - keys = lines.map(function(line){ - return line.substring(0,line.indexOf(' ')); - }), - ind = _.indexOf(keys, search, /*isSorted*/ true); // binary search! - - //console.log(' %s is %d', search, ind); - if (ind === -1) return callback(miss); - - var tokens = lines[ind].split(/\s+/), - key = tokens[0], - result = {status: 'hit', key: key, 'line': lines[ind], tokens: tokens}; - - callback(result); - } -} - -/** - * rand function (bound to index) - * - * @param startsWith {string} - get random word(s) that start with this, or '' - * @param num {number} - number of words to return - * @param callback {function} - callback function, receives words array and startsWith - */ -function rand(startsWith, num, callback){ - var self = this, - nextKey = null, - trie = this.fastIndex.trie, - key, keys; - - //console.log('-- ', startsWith, num, self.fastIndex.indexKeys.length); - if (startsWith){ - key = startsWith.slice(0, KEY_LENGTH); - - /** - * if key is 'a' or 'ab' (<3 chars), search for ALL keys starting with that. 
- */ - if (key.length < KEY_LENGTH) { - - // calc trie if haven't done so yet - if (!trie){ - var natural = require('natural'); - - trie = new natural.Trie(); - trie.addStrings(self.fastIndex.indexKeys); - this.fastIndex.trie = trie; - //console.log(' +++ Trie calc '); - } - - try{ - // trie throws if not found!!!!! - keys = trie.keysWithPrefix( startsWith ); - } catch(e){ - keys = []; - } - - // read all keys then select random word. - // May be large disk read! - key = keys[0]; - nextKey = _.last(keys); - } - - if (!key || !(key in self.fastIndex.offsets)) return process.nextTick(function(){ callback([], startsWith) }); - - } else { - // no startWith given - random select among keys - keys = _.sample( this.fastIndex.indexKeys, num ); - - // if num > 1, run each key independently and collect results - if (num > 1){ - var results = [], ii = 0; - _(keys).each(function(startsWith){ - self.rand(startsWith, 1, function(result){ - results.push(result[0]); - if (++ii == num) { - callback(results, ''); - } - }) - }); - return; - } - key = keys; - } -// console.log(' using key', key, nextKey); - - // prepare the piper - var args = [key, nextKey, this], - task = 'rand' + key + nextKey, - context = [startsWith, num, callback]; // last arg MUST be callback - - // pay the piper - this.piper(task, readIndexBetweenKeys, args, context, collector); - - function collector(key, nextKey, index, startsWith, num, callback, buffer){ - var lines = buffer.toString().split('\n'), - matches = lines.map(function(line){ - return line.substring(0,line.indexOf(' ')); - }); - - //console.log(' got lines for key ', key, lines.length); - - // we got bunch of matches for key - now search within for startsWith - if (startsWith !== key){ - - // binary search for startsWith within set of matches - var ind = _.sortedIndex(matches, startsWith); - if (ind >= lines.length || matches[ind].indexOf(startsWith) === -1){ - return callback([], startsWith); - } - - // FIXME --- using Trie's new keysWithPrefix not yet pushed to npm. 
- // see https://github.com/NaturalNode/natural/commit/5fc86c42e41c1314bfc6a37384dd14acf5f4bb7b - - var natural = require('natural'), - trie = new natural.Trie(); - - trie.addStrings(matches); - //console.log('Trie > ', trie.matchesWithPrefix( startsWith )); - - matches = trie.keysWithPrefix( startsWith ); - } - - var words = _.sample(matches, num); - callback(words, startsWith); - } -} - -// cache of fast index data across instances of WordPOS class -var cache = {}; - -module.exports = { - /** - * loads fast index data and return fast index find function - * - * @param index {object} - the IndexFile instance - * @returns {function} - fast index find or original find if errors - */ - find: function(index){ - - var key = index.filePath, - data; - - if (!(key in cache)) { - data = loadFastIndex(index.dataDir, index.fileName); - cache[key] = data; - } - - // if no fast index data was found or was corrupt, use original find - if (!cache[key]) return index.find; - - index.fastIndex = cache[key]; - index.fastIndex.indexKeys = Object.keys(index.fastIndex.offsets); - index.fastIndex.trie = null; // calc on demand - - index.refcount = 0; - index.callbackQueue = {}; - index.piper = _.bind(piper, index); - - return find; - }, - - /** - * bind rand() to index - * - * @param index {object} - the IndexFile instance - * @returns {function} - bound rand function for index - */ - rand: function(index){ - if (!index.fastIndex) throw 'rand requires fastIndex'; - return _.bind(rand, index); - } -}; - diff --git a/src/indexFile.js b/src/indexFile.js new file mode 100644 index 0000000..1e7ed06 --- /dev/null +++ b/src/indexFile.js @@ -0,0 +1,231 @@ +/*! + * indexFile.js + * + * implements fast index lookup of WordNet's index files + * + * Copyright (c) 2012-2016 mooster@42at.com + * https://github.com/moos/wordpos + * + * Released under MIT license + */ + +var _ = require('underscore')._, + util = require('util'), + path = require('path'), + fs = require('fs'), + piper = require('./piper'), + KEY_LENGTH = 3; + +/** + * load fast index bucket data + * + * @param dir {string} - dir path of index files + * @param name {string} - name of index file, eg, 'index.verb' + * @returns {Object} - fast index data object + */ +function loadFastIndex(dir, name) { + var jsonFile = path.join(dir, 'fast-' + name + '.json'), + data = null; + try{ + data = JSON.parse( fs.readFileSync(jsonFile,'utf8') ); + //console.log('loaded %d buckets for %s', data.stats.buckets, data.name); + } catch(e) { + console.error('Error with fast index file. Try reinstalling from npm!'); + throw e; + } + return data; +} + +/** + * read index file using fast index data at key + * + * @param key {string} - 3-char key into fast index + * @param index {object} - index file object + * @param callback {function} - function receives buffer of data read + * @returns none + */ +function readIndexForKey(key, index, callback) { + var data = index.fastIndex, + offset = data.offsets[key][0], + nextKey = data.offsets[key][1], + nextOffset = data.offsets[nextKey][0], + len = nextOffset - offset - 1, + buffer = new Buffer(len); + + fs.read(index.fd, buffer, 0, len, offset, function(err, count){ + if (err) return console.log(err); + //console.log(' read %d bytes for <%s>', count, key); + callback(buffer); + }); +} + + +/** + * read index file using fast index data at keyStart to keyEnd (inclusive) + * + * @param keyStart {string} - 3-char key into fast index to begin at + * @param keyEnd {string|null} - 3-char key into fast index to end at. If null, reads to next key. 
+ * @param index {object} - index file object + * @param callback - function receives buffer of data read + * @returns none + */ +function readIndexBetweenKeys(keyStart, keyEnd, index, callback) { + var data = index.fastIndex, + offset = data.offsets[keyStart][0], + end = keyEnd || keyStart, + nextKey = data.offsets[end][1], + nextOffset = data.offsets[nextKey][0], + len = nextOffset - offset - 1, + buffer = new Buffer(len); + + //console.log('### readIndexBetweenKeys', keyStart, keyEnd, nextKey, len) + fs.read(index.fd, buffer, 0, len, offset, function(err, count){ + if (err) return console.log(err); + // console.log(' read %d bytes for <%s>', count, keyStart); + callback(buffer); + }); +} + +/** + * find a search term in an index file (using fast index) + * + * Calls to same bucket are queued for callback using the piper. + * + * @param search {string} - word to search for + * @param callback {function} - callback receives found line and tokens + * @returns none + */ +function find(search, callback) { + var self = this, + data = this.fastIndex, + readCallbacks = this.callbackQueue, + miss = {status: 'miss'}; + + var key = search.slice(0, KEY_LENGTH); + if (!(key in data.offsets)) return process.nextTick(function(){ callback(miss) }); + + // prepare the piper + var task = 'find:' + key, + args = [key, this], + context = [search, callback]; // last arg MUST be callback + + // pay the piper + this.piper(task, readIndexForKey, args, context, collector); + + function collector(key, index, search, callback, buffer){ + var lines = buffer.toString().split('\n'), + keys = lines.map(function(line){ + return line.substring(0,line.indexOf(' ')); + }), + ind = _.indexOf(keys, search, /*isSorted*/ true); // binary search! + + //console.log(' %s is %d', search, ind); + if (ind === -1) return callback(miss); + + var tokens = lines[ind].split(/\s+/), + key = tokens[0], + result = {status: 'hit', key: key, 'line': lines[ind], tokens: tokens}; + + callback(result); + } +} + +/** + * find a word and prepare its lexical record + * + * @param word {string} - search word + * @param callback {function} - callback function receives result + * @returns none + */ +function lookup(word, callback) { + var self = this; + + return new Promise(function(resolve, reject){ + self.find(word, function (record) { + var indexRecord = null; + + if (record.status == 'hit') { + var ptrs = [], offsets = []; + + for (var i = 0; i < parseInt(record.tokens[3]); i++) + ptrs.push(record.tokens[i]); + + for (var i = 0; i < parseInt(record.tokens[2]); i++) + offsets.push(parseInt(record.tokens[ptrs.length + 6 + i], 10)); + + indexRecord = { + lemma : record.tokens[0], + pos : record.tokens[1], + ptrSymbol : ptrs, + senseCnt : parseInt(record.tokens[ptrs.length + 4], 10), + tagsenseCnt : parseInt(record.tokens[ptrs.length + 5], 10), + synsetOffset: offsets + }; + } + + callback && callback(indexRecord); + resolve(indexRecord); + }); + }); +} + + +/** + * loads fast index data and return fast index find function + * + * @param index {object} - the IndexFile instance + */ +function initIndex(index){ + var key = index.filePath, + data; + + if (!(key in cache)) { + data = loadFastIndex(index.dictPath, index.fileName); + cache[key] = data; + } + + // if no fast index data was found or was corrupt, throw + if (!cache[key]) throw new Error('Unable to load fastIndex file: ' + index.filePath); + + index.fastIndex = cache[key]; + index.fastIndex.indexKeys = Object.keys(index.fastIndex.offsets); + index.fastIndex.trie = null; // calc on demand + + 
index.refcount = 0; + index.callbackQueue = {}; + index.piper = _.bind(piper, index); +} + +/** + * IndexFile class + * + * @param dictPath {string} - WordNet db dict path + * @param name {string} - name of index: noun, verb, adj, adv + * @constructor + */ +var IndexFile = function(dictPath, name) { + this.dictPath = dictPath; + this.fileName = 'index.' + name; + this.filePath = path.join(this.dictPath, this.fileName); + initIndex(this); +}; + +IndexFile.prototype.lookup = lookup; +IndexFile.prototype.find = find; + +/** + * export static method + * @type {readIndexBetweenKeys} + */ +IndexFile.readIndexBetweenKeys = readIndexBetweenKeys; + +/** + * cache of fast index data across instances of WordPOS class + * + * @type {object} + */ +var cache = {}; + + + +module.exports = IndexFile; diff --git a/src/piper.js b/src/piper.js new file mode 100644 index 0000000..a5c18b5 --- /dev/null +++ b/src/piper.js @@ -0,0 +1,83 @@ +/*! + * piper.js + * + * executes multiple async i/o tasks and pools similar callbacks, + * calling i/o open/close when all incoming tasks are done. + * + * Copyright (c) 2012-2016 mooster@42at.com + * https://github.com/moos/wordpos + * + * Released under MIT license + */ + +var _ = require('underscore')._, + util = require('util'), + path = require('path'), + fs = require('fs'); + +/** + * run single 'task' method sharing callbacks. Method MUST take callback as LAST arg. + * piper is bound to an IndexFile. + * + * @param task {string} - task name unique to method! + * @param method {function} - method to execute, gets (args, ... , callback) + * @param args {array} - args to pass to method + * @param context {object} - other params to remember and sent to callback + * @param callback {function} - result callback + */ +function piper(task, method, args, context, callback){ + var readCallbacks = this.callbackQueue, + memoArgs = _.rest(arguments, 2), + wrappedCallback; + + //console.log('piper', task, [method]); + + // queue up if already reading file for this task + if (task in readCallbacks){ + readCallbacks[task].push(memoArgs); + return; + } + readCallbacks[task] = [memoArgs]; + + if (!this.fd) { + //console.log(' ... opening', this.filePath); + this.fd = fs.openSync(this.filePath, 'r'); + } + + // ref count so we know when to close the main index file + ++this.refcount; + + wrappedCallback = _.partial(piper.wrapper, this, task); + + // call method -- replace original callback (last arg) with wrapped one + method.apply(null, [].concat( args, wrappedCallback )); +} + +// result is the *same* for same task +piper.wrapper = function(self, task /*, result...*/){ + var readCallbacks = self.callbackQueue, + result = _.rest(arguments, 2), + callback, args; + + // live access callbacks cache in case nested cb's + // add to the array. + while (args = readCallbacks[task].shift()) { + callback = args.pop(); // last arg MUST be callback + +// console.log('>>>> pper wrapper', self.fastIndex.name, task, result.toString()) + callback.apply(null, [].concat(_.flatten(args, /*shallow*/true), result)); + } + + // now done - delete cb cache + delete readCallbacks[task]; + + if (--self.refcount === 0) { + //console.log(' ... closing', self.filePath); + fs.close(self.fd); + self.fd = null; + } +}; + + +module.exports = piper; + diff --git a/src/rand.js b/src/rand.js new file mode 100644 index 0000000..ec29047 --- /dev/null +++ b/src/rand.js @@ -0,0 +1,246 @@ +/*! 
+ * rand.js + * + * define rand() and randX() functions on wordpos + * + * Copyright (c) 2012-2016 mooster@42at.com + * https://github.com/moos/wordpos + * + * Released under MIT license + */ + +var _ = require('underscore')._, + util = require('util'), + Trie = require('../lib/natural/trie/trie'), + IndexFile = require('./indexFile'), + KEY_LENGTH = 3; + + +/** + * factory function for randX() + * + * @param pos {string} - a,r,n,v + * @returns {Function} - rand function bound to an index file + */ +function makeRandX(pos){ + return function(opts, callback, _noprofile) { + // disable profiling when isX() used internally + var profile = this.options.profile && !_noprofile, + start = profile && new Date(), + args = [], + index = this.getFilesFor(pos).index, + startsWith = opts && opts.startsWith || '', + count = opts && opts.count || 1; + + if (typeof opts === 'function') { + callback = opts; + } + + index.rand(startsWith, count, function(record) { + args.push(record, startsWith); + profile && args.push(new Date() - start); + callback.apply(null, args); + }); + }; +} + +/** + * rand function (bound to index) + * + * @param startsWith {string} - get random word(s) that start with this, or '' + * @param num {number} - number of words to return + * @param callback {function} - callback function, receives words array and startsWith + */ +function rand(startsWith, num, callback){ + var self = this, + nextKey = null, + trie = this.fastIndex.trie, + key, keys; + + //console.log('-- ', startsWith, num, self.fastIndex.indexKeys.length); + if (startsWith){ + key = startsWith.slice(0, KEY_LENGTH); + + /** + * if key is 'a' or 'ab' (<3 chars), search for ALL keys starting with that. + */ + if (key.length < KEY_LENGTH) { + + // calc trie if haven't done so yet + if (!trie){ + trie = new Trie(); + trie.addStrings(self.fastIndex.indexKeys); + this.fastIndex.trie = trie; + //console.log(' +++ Trie calc '); + } + + try{ + // trie throws if not found!!!!! + keys = trie.keysWithPrefix( startsWith ); + } catch(e){ + keys = []; + } + + // read all keys then select random word. + // May be large disk read! 
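+      // keys[0] and the last matching key bound the span of the index file
+      // that readIndexBetweenKeys() reads below via the piper task.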
+ key = keys[0]; + nextKey = _.last(keys); + } + + if (!key || !(key in self.fastIndex.offsets)) return process.nextTick(function(){ callback([], startsWith) }); + + } else { + // no startWith given - random select among keys + keys = _.sample( this.fastIndex.indexKeys, num ); + + // if num > 1, run each key independently and collect results + if (num > 1){ + var results = [], ii = 0; + _(keys).each(function(startsWith){ + self.rand(startsWith, 1, function(result){ + results.push(result[0]); + if (++ii == num) { + callback(results, ''); + } + }) + }); + return; + } + key = keys; + } +// console.log(' using key', key, nextKey); + + // prepare the piper + var args = [key, nextKey, this], + task = 'rand:' + key + nextKey, + context = [startsWith, num, callback]; // last arg MUST be callback + + // pay the piper + this.piper(task, IndexFile.readIndexBetweenKeys, args, context, collector); + + function collector(key, nextKey, index, startsWith, num, callback, buffer){ + var lines = buffer.toString().split('\n'), + matches = lines.map(function(line){ + return line.substring(0,line.indexOf(' ')); + }); + + //console.log(' got lines for key ', key, lines.length); + + // we got bunch of matches for key - now search within for startsWith + if (startsWith !== key){ + + // binary search for startsWith within set of matches + var ind = _.sortedIndex(matches, startsWith); + if (ind >= lines.length || matches[ind].indexOf(startsWith) === -1){ + return callback([], startsWith); + } + + // FIXME --- using Trie's new keysWithPrefix not yet pushed to npm. + // see https://github.com/NaturalNode/natural/commit/5fc86c42e41c1314bfc6a37384dd14acf5f4bb7b + + var trie = new Trie(); + + trie.addStrings(matches); + //console.log('Trie > ', trie.matchesWithPrefix( startsWith )); + + matches = trie.keysWithPrefix( startsWith ); + } + + var words = _.sample(matches, num); + callback(words, startsWith); + } +} + +/** + * rand() - for all Index files + */ +function randAll(opts, callback) { + var + profile = this.options.profile, + start = profile && new Date(), + results = [], + startsWith = opts && opts.startsWith || '', + count = opts && opts.count || 1, + args = [null, startsWith], + parts = 'Noun Verb Adjective Adverb'.split(' '), + self = this, + done = function(){ + profile && (args.push(new Date() - start)); + args[0] = results; + callback.apply(null, args) + }; + + if (typeof opts === 'function') { + callback = opts; + } else { + opts = _.clone(opts); + } + + // TODO -- or loop count times each time getting 1 from random part!! + // slower but more random. + + // select at random a part to look at + var doParts = _.sample(parts, parts.length); + tryPart(); + + function tryPart(){ + var rand = 'rand' + doParts.pop(); + self[ rand ](opts, partCallback); + } + + function partCallback(result){ + if (result) { + results = _.uniq(results.concat(result)); // make sure it's unique! + } + + //console.log(result); + if (results.length < count && doParts.length) { + // reduce count for next part -- NO! 
may get duplicates + // opts.count = count - results.length; + return tryPart(); + } + + // trim excess + if (results.length > count) { + results.length = count; + } + done(); + } +} + +/** + * bind rand() to index + * + * @param index {object} - the IndexFile instance + * @returns {function} - bound rand function for index + */ +function randomify(index){ + if (!index.fastIndex) throw 'rand requires fastIndex'; + return _.bind(rand, index); +} + + + +module.exports = { + + init: function(wordposProto) { + wordposProto.nounIndex.rand = randomify(wordposProto.nounIndex); + wordposProto.verbIndex.rand = randomify(wordposProto.verbIndex); + wordposProto.adjIndex.rand = randomify(wordposProto.adjIndex); + wordposProto.advIndex.rand = randomify(wordposProto.advIndex); + + /** + * define rand() + */ + wordposProto.rand = randAll; + + /** + * define randX() + */ + wordposProto.randAdjective = makeRandX('a'); + wordposProto.randAdverb = makeRandX('r'); + wordposProto.randNoun = makeRandX('n'); + wordposProto.randVerb = makeRandX('v'); + } +}; + diff --git a/src/wordpos.js b/src/wordpos.js index 0ff104d..666c21a 100644 --- a/src/wordpos.js +++ b/src/wordpos.js @@ -1,9 +1,9 @@ /** * wordpos.js * -* Node.js part-of-speech utilities using natural's WordNet module. +* Node.js part-of-speech utilities using WordNet database. * -* Copyright (c) 2012-2014 mooster@42at.com +* Copyright (c) 2012-2016 mooster@42at.com * https://github.com/moos/wordpos * * Released under MIT license @@ -11,16 +11,12 @@ var _ = require('underscore')._, util = require('util'), - natural = require('natural'), - WordNet = natural.WordNet, - tokenizer = new natural.WordTokenizer(), - natural_stopwords = makeStopwordString(natural.stopwords), + stopwords = require('../lib/natural/util/stopwords').words, + stopwordsStr = makeStopwordString(stopwords), WNdb = require('wordnet-db'), - fastIndex = null; + DataFile = require('./dataFile'), + IndexFile = require('./indexFile'); -try { - fastIndex = require('./fastIndex'); -} catch(e) {} function normalize(word) { return word.toLowerCase().replace(/\s+/g, '_'); @@ -34,139 +30,178 @@ function isStopword(stopwords, word) { return stopwords.indexOf(' '+word+' ') >= 0; } -function prepText(text) { - if (_.isArray(text)) return text; - var deduped = _.uniq(tokenizer.tokenize(text)); - if (!this.options.stopwords) return deduped; - return _.reject(deduped, _.bind(isStopword, null, - _.isString(this.options.stopwords) ? this.options.stopwords : natural_stopwords - )); +function tokenizer(str) { + return str.split(/\W+/); //_.without(results,'',' ') } +function prepText(text) { + if (_.isArray(text)) return text; + var deduped = _.uniq(tokenizer(text)); + if (!this.options.stopwords) return deduped; + return _.reject(deduped, _.bind(isStopword, null, + _.isString(this.options.stopwords) ? 
this.options.stopwords : stopwordsStr + )); +} + +/** + * factory for main lookup function + * + * @param pos {string} - n/v/a/r + * @returns {Function} - lookup function bound to POS + */ function lookup(pos) { return function(word, callback) { var profile = this.options.profile, start = profile && new Date(), + files = this.getFilesFor(pos), args = []; + word = normalize(word); - this.lookupFromFiles([ - {index: this.getIndexFile(pos), data: this.getDataFile(pos)} - ], [], word, function(results){ + + // lookup index + return files.index.lookup(word) + .then(function(result) { + if (result) { + // lookup data + return files.data.lookup(result).then(done); + } else { + // not found in index + return done([]); + } + }) + .catch(done); + + function done(results) { + if (results instanceof Error) { + args.push([], word); + } else { args.push(results, word); - profile && args.push(new Date() - start); - callback.apply(null, args); - }); + } + //console.log(3333, args) + profile && args.push(new Date() - start); + nextTick(callback, args); + return results; + } }; } +/** + * isX() factory function + * + * @param pos {string} - n/v/a/r + * @returns {Function} + */ function is(pos){ return function(word, callback, _noprofile) { // disable profiling when isX() used internally var profile = this.options.profile && !_noprofile, start = profile && new Date(), args = [], - index = this.getIndexFile(pos); + index = this.getFilesFor(pos).index; word = normalize(word); - index.lookup(word, function(record) { - args.push(!!record, word); - profile && args.push(new Date() - start); - callback.apply(null, args); - }); + + return index + .lookup(word) + .then(function(record) { + var result = !!record; + args.push(result, word); + profile && args.push(new Date() - start); + nextTick(callback, args); + return result; + }); }; } -function rand(pos){ - return function(opts, callback, _noprofile) { - // disable profiling when isX() used internally +/** + * getX() factory function + * + * @param isFn {function} - an isX() function + * @returns {Function} + */ +function get(isFn) { + return function(text, callback, _noprofile) { var profile = this.options.profile && !_noprofile, start = profile && new Date(), - args = [], - index = this.getIndexFile(pos), - startsWith = opts && opts.startsWith || '', - count = opts && opts.count || 1; + words = this.parse(text), + results = [], + self = this; - if (typeof opts === 'function') { - callback = opts; + //if (!n) return (process.nextTick(done),0); + return Promise + .all(words.map(exec)) + .then(done); + + function exec(word) { + return self[isFn] + .call(self, word, null, /*_noprofile*/ true) + .then(function collect(result) { + result && results.push(word); + }); } - index.rand(startsWith, count, function(record) { - args.push(record, startsWith); + function done(){ + var args = [results]; profile && args.push(new Date() - start); - callback.apply(null, args); - }); + nextTick(callback, args); + return results; + } }; } - -function get(isFn) { - return function(text, callback) { - var profile = this.options.profile, - start = profile && new Date(), - words = this.parse(text), - n = words.length, - i = 0, - self = this, - results = [], - args = [results], - done = function(){ - profile && (args[1] = new Date() - start); - callback.apply(null, args) - }; - if (!n) return (process.nextTick(done),0); - words.forEach(function(word,j){ - self[isFn](word, function(yes){ - yes && results.push(word); - (++i==n) && done(); - }, /*_noprofile*/ true); +function nextTick(fn, 
args) { + if (fn) { + setImmediate(function(){ + fn.apply(null, args); }); - return n; - }; + } } + /** * @class WordPOS + * @param options {object} -- @see WordPOS.defaults * @constructor */ var WordPOS = function(options) { - if (arguments.length == 0 || _.isObject(options)) { - WordPOS.super_.call(this, WNdb.path); - } else { - WordPOS.super_.apply(this, arguments); - } - this.options = _.defaults({}, _.isObject(options) && options || {}, WordPOS.defaults); + var dictPath; - if (this.options.fastIndex && fastIndex) { - // override find - this.nounIndex.find = fastIndex.find(this.nounIndex); - this.verbIndex.find = fastIndex.find(this.verbIndex); - this.adjIndex.find = fastIndex.find(this.adjIndex); - this.advIndex.find = fastIndex.find(this.advIndex); + this.options = _.defaults({}, _.isObject(options) && options || {}, { + dictPath: WNdb.path + }, WordPOS.defaults); - // rand - this.nounIndex.rand = fastIndex.rand(this.nounIndex); - this.verbIndex.rand = fastIndex.rand(this.verbIndex); - this.adjIndex.rand = fastIndex.rand(this.adjIndex); - this.advIndex.rand = fastIndex.rand(this.advIndex); - } + dictPath = this.options.dictPath; + + this.nounIndex = new IndexFile(dictPath, 'noun'); + this.verbIndex = new IndexFile(dictPath, 'verb'); + this.adjIndex = new IndexFile(dictPath, 'adj'); + this.advIndex = new IndexFile(dictPath, 'adv'); + + this.nounData = new DataFile(dictPath, 'noun'); + this.verbData = new DataFile(dictPath, 'verb'); + this.adjData = new DataFile(dictPath, 'adj'); + this.advData = new DataFile(dictPath, 'adv'); + + // define randX() functions + require('./rand').init(this); if (_.isArray(this.options.stopwords)) { this.options.stopwords = makeStopwordString(this.options.stopwords); } }; -util.inherits(WordPOS, WordNet); + WordPOS.defaults = { + /** + * path to WordNet data (override only if not using wordnet-db) + */ + dictPath: '', + /** * enable profiling, time in msec returned as second argument in callback */ profile: false, - /** - * use fast index if available - */ - fastIndex: true, - /** * if true, exclude standard stopwords. * if array, stopwords to exclude, eg, ['all','of','this',...] 
@@ -177,14 +212,123 @@ WordPOS.defaults = { var wordposProto = WordPOS.prototype; -// fast POS lookups (only look in specified file) /** - * lookupX() - * Lookup word definition if already know POS + * lookup a word in all indexes * - * @param string word - word to lookup in given POS - * @param function callback receives array of definition objects or empty - * @return none + * @param word {string} - search word + * @param callback {Functino} (optional) - callback with (results, word) signature + * @returns {Promise} + */ +wordposProto.lookup = function(word, callback) { + var self = this, + results = [], + profile = this.options.profile, + start = profile && new Date(), + methods = ['lookupAdverb', 'lookupAdjective', 'lookupVerb', 'lookupNoun']; + + return Promise + .all(methods.map(exec)) + .then(done) + .catch(error); + + function exec(method) { + return self[ method ] + .call(self, word) + .then(function collect(result){ + results = results.concat(result); + }); + } + + function done() { + var args = [results, word]; + profile && args.push(new Date() - start); + nextTick(callback, args); + return results; + } + + function error(err) { + nextTick(callback, [[], word]); + throw err; + } +}; + + +/** + * getPOS() - Find all POS for all words in given string + * + * @param text {string} - words to lookup for POS + * @param callback {function} (optional) - receives object with words broken into POS or 'rest', ie, + * Object: {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]} + * @return Promise - resolve function receives data object + */ +wordposProto.getPOS = function(text, callback) { + var self = this, + data = {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}, + profile = this.options.profile, + start = profile && new Date(), + words = this.parse(text), + methods = ['getAdverbs', 'getAdjectives', 'getVerbs', 'getNouns']; + + return Promise + .all(methods.map(exec)) + .then(done) + .catch(error); + + function exec(method) { + return self[ method ] + .call(self, text, null, true) + .then(function collect(results) { + // getAdjectives --> adjectives + var pos = method.replace('get','').toLowerCase(); + data[ pos ] = results; + }); + } + + function done() { + var matches = _(data).chain() + .values() + .flatten() + .uniq() + .value(), + args = [data]; + + data.rest = _(words).difference(matches); + + profile && args.push(new Date() - start); + nextTick(callback, args); + return data; + } + + function error(err) { + nextTick(callback, []); + throw err; + } +}; + +/** + * get index and data files for given pos + * + * @param pos {string} - n/v/a/r + * @returns {object} - keys {index, data} + */ +wordposProto.getFilesFor = function (pos) { + switch(pos) { + case 'n': + return {index: this.nounIndex, data: this.nounData}; + case 'v': + return {index: this.verbIndex, data: this.verbData}; + case 'a': case 's': + return {index: this.adjIndex, data: this.adjData}; + case 'r': + return {index: this.advIndex, data: this.advData}; + } + return {}; +}; + + +/** + * lookupX() - Lookup word definition if already know POS + * @see lookup */ wordposProto.lookupAdjective = lookup('a'); wordposProto.lookupAdverb = lookup('r'); @@ -192,12 +336,8 @@ wordposProto.lookupNoun = lookup('n'); wordposProto.lookupVerb = lookup('v'); /** - * isX() - * Test if word is given POS - * - * @param string word - word to test for given POS - * @param function Callback receives true or false if word is given POS - * @return none + * isX() - Test if word is given POS + * @see is */ wordposProto.isAdjective = 
is('a'); wordposProto.isAdverb = is('r'); @@ -205,155 +345,24 @@ wordposProto.isNoun = is('n'); wordposProto.isVerb = is('v'); /** - * randX() - */ -wordposProto.randAdjective = rand('a'); -wordposProto.randAdverb = rand('r'); -wordposProto.randNoun = rand('n'); -wordposProto.randVerb = rand('v'); - - -/** - * getX() - * Find all words in string that are given POS - * - * @param string Text Words to search - * @param function callback Receives array of words that are given POS - * @return none + * getX() - Find all words in string that are given POS + * @see get */ wordposProto.getAdjectives = get('isAdjective'); wordposProto.getAdverbs = get('isAdverb'); wordposProto.getNouns = get('isNoun'); wordposProto.getVerbs = get('isVerb'); -wordposProto.parse = prepText; - -if (!wordposProto.getIndexFile) { - wordposProto.getIndexFile = function getIndexFile(pos) { - switch(pos) { - case 'n': - return this.nounIndex; - case 'v': - return this.verbIndex; - case 'a': case 's': - return this.adjIndex; - case 'r': - return this.advIndex; - } -}; -} - /** - * getPOS() - * Find all POS for all words in given string + * parse - get deduped, less stopwords * - * @param {string} text - words to lookup for POS - * @param {function} callback - receives object with words broken into POS or 'rest', ie, - * Object: {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]} - * @return none + * @param text {string|array} - string of words to parse. If array is given, it is left in tact. + * @returns {array} */ -wordposProto.getPOS = function(text, callback) { - var data = {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}, - profile = this.options.profile, - start = profile && new Date(), - args = [data], - testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '), - parts = 'nouns verbs adjectives adverbs'.split(' '), - words = this.parse(text), - nTests = testFns.length, - nWords = words.length, - self = this, - c = 0, - done = function(){ - profile && (args[1] = new Date() - start); - callback.apply(null, args) - }; - - if (!nWords) return (process.nextTick(done),0); - words.forEach(lookup); - - function lookup(word){ - var any = false, - t=0; - testFns.forEach(lookupPOS); - - function lookupPOS(isFn,i,list){ - self[isFn](word, function(yes){ - yes && data[parts[i]].push(word); - any |= yes; - donePOS(); - }); - } - - function donePOS() { - if (++t == nTests) { - !any && data['rest'].push(word); - (++c == nWords) && done(); - } - } - } - return nWords; -}; - -/** - * rand() - */ -wordposProto.rand = function(opts, callback) { - var - profile = this.options.profile, - start = profile && new Date(), - results = [], - startsWith = opts && opts.startsWith || '', - count = opts && opts.count || 1, - args = [null, startsWith], - parts = 'Noun Verb Adjective Adverb'.split(' '), - self = this, - done = function(){ - profile && (args.push(new Date() - start)); - args[0] = results; - callback.apply(null, args) - }; - - if (typeof opts === 'function') { - callback = opts; - } else { - opts = _.clone(opts); - } - - // TODO -- or loop count times each time getting 1 from random part!! - // slower but more random. - - // select at random a part to look at - var doParts = _.sample(parts, parts.length); - tryPart(); - - function tryPart(){ - var rand = 'rand' + doParts.pop(); - self[ rand ](opts, partCallback); - } - - function partCallback(result){ - if (result) { - results = _.uniq(results.concat(result)); // make sure it's unique! 
- } - - //console.log(result); - if (results.length < count && doParts.length) { - // reduce count for next part -- NO! may get duplicates - // opts.count = count - results.length; - return tryPart(); - } - - // trim excess - if (results.length > count) { - results.length = count; - } - done(); - } -}; +wordposProto.parse = prepText; WordPOS.WNdb = WNdb; -WordPOS.natural = natural; +WordPOS.stopwords = stopwords; module.exports = WordPOS; diff --git a/test.js b/test.js new file mode 100644 index 0000000..45a6324 --- /dev/null +++ b/test.js @@ -0,0 +1,40 @@ +var + WordPOS = require('./src/wordpos'), + wordpos = new WordPOS({profile: true}), + getAllPOS = wordpos.getPOS + ; + + +console.log(1111, + wordpos.lookup('foot') + //wordpos.getPOS('was doing the work the ashtray closer Also known as inject and foldl, reduce boils down a list of values into a single value', console.log + .then(function(result){ + console.log(' xxx - ', result) + }) + .catch(function(result){ + console.log(' error xxx - ', result) + })); + +//wordpos.rand({count: 3},console.log) + +return; + + +//getAllPOS('se', console.log) +wordpos.getPOS('se', console.log) + + + + + a=wordpos.getPOS('se', function(res) { + console.log(1, res) + wordpos.getPOS('sea hey who work', function(res) { + console.log(2, res) + wordpos.getPOS('sear done work ', function(res) { + console.log(3, res) + console.log('all done'); + }); + }); + }); + + console.log(a) \ No newline at end of file diff --git a/test/validate_test.js b/test/validate_test.js new file mode 100644 index 0000000..df42d61 --- /dev/null +++ b/test/validate_test.js @@ -0,0 +1,53 @@ +/** + * validate_test.js + * + * Run validate on all four main index files + * + * Copyright (c) 2012-2016 mooster@42at.com + * https://github.com/moos/wordpos + * + * Released under MIT license + */ + +var + chai = require('chai'), + assert = chai.assert, + exec = require('child_process').exec, + cmd = 'node ' + __dirname + '/../tools/validate ', + TIMEOUT_SEC = 25 * 1000, + gDone; + + +describe('validate isX() using fastIndex', function() { + + this.timeout(TIMEOUT_SEC); + this.slow(1500); + + it('should validate index.verb', function(done) { + gDone = done; + exec(cmd + 'index.verb', callback); + }); + + it('should validate index.adv', function(done) { + gDone = done; + exec(cmd + 'index.adv', callback); + }); + + it('should validate index.adj', function(done) { + gDone = done; + exec(cmd + 'index.adj', callback); + }); + + it('should validate index.noun', function(done) { + gDone = done; + exec(cmd + 'index.noun', callback); + }); + +}); + +function callback(error, stdout, stderr) { + assert.isNull(error); + console.log(stdout); + console.error(stderr); + gDone(); +} \ No newline at end of file diff --git a/spec/wordpos_spec.js b/test/wordpos_test.js similarity index 53% rename from spec/wordpos_spec.js rename to test/wordpos_test.js index dfe4667..b349f83 100644 --- a/spec/wordpos_spec.js +++ b/test/wordpos_test.js @@ -1,19 +1,31 @@ /** * wordpos_spec.js * - * spec file for main wordpos functionality + * test file for main wordpos functionality * * Usage: - * npm install jasmine-node -g - * jasmine-node wordpos_spec.js --verbose + * npm install mocha -g + * mocha wordpos_spec.js --verbose * - * Copyright (c) 2012 mooster@42at.com + * or + * + * npm test + * + * Copyright (c) 2012-2016 mooster@42at.com * https://github.com/moos/wordpos * * Released under MIT license */ -var WordPOS = require('../src/wordpos'), - wordpos = new WordPOS(); + +//import {describe, it} from 'mocha/lib/mocha.js'; + 
+var + chai = require('chai'), + assert = chai.assert, + WordPOS = require('../src/wordpos'), + wordpos = new WordPOS({profile: false}); + +chai.config.showDiff = true; var str = "The angry bear chased the frightened little squirrel", expected = { @@ -21,207 +33,236 @@ var str = "The angry bear chased the frightened little squirrel", verbs: [ 'bear' ], adjectives: [ 'little', 'angry', 'frightened' ], adverbs: [ 'little' ], - rest: [ 'the' ] + rest: [ 'The' ] }, garble = 'garblegarble'; // expect not to find word -function noop(){} - -describe('getX()...', function() { - - beforeEach(function() { - this.addMatchers({ - // unordered (multiset) comparison -- NOTE: doesn't handle deep! - toEqualUnordered: function(expected) { - var mismatchKeys=[], - mismatchValues=[], - result = this.env.compareObjects_(this.actual, expected, mismatchKeys, mismatchValues); - return result || (mismatchKeys.length == 0 && mismatchValues.length > 0); - } +describe('lookup', function() { + it('with callback', function (done) { + wordpos.lookup('hegemony', function (result) { + assert.equal(result.length, 1); + assert.equal(result[0].pos, 'n'); + assert.equal(result[0].lemma, 'hegemony'); + assert.equal(result[0].synonyms.length, 1); + done(); }); }); + it('with Promise', function (done) { + wordpos.lookup('hegemony').then(function (result) { + assert.equal(result.length, 1); + assert.equal(result[0].pos, 'n'); + assert.equal(result[0].lemma, 'hegemony'); + assert.equal(result[0].synonyms.length, 1); + done(); + }); + }); +}); + + +describe('options passed to constructor', function() { + var wp, + origProfile = WordPOS.defaults.profile; + + it('should override default option', function(){ + wp = new WordPOS({profile:123}); + assert.equal(wp.options.profile, 123); + assert.equal(WordPOS.defaults.profile, origProfile); + }); + + it('should not erase default option', function(){ + wp = new WordPOS({aaa:123}); + assert.equal(wp.options.aaa, 123); + assert.equal(wp.options.profile, WordPOS.defaults.profile); + }); +}); + + +describe('getX()...', function() { it('should get all POS', function(done) { wordpos.getPOS(str, function(result) { - expect(result.nouns).toEqualUnordered(expected.nouns); - expect(result.verbs).toEqualUnordered(expected.verbs); - expect(result.adjectives).toEqualUnordered(expected.adjectives); - expect(result.adverbs).toEqualUnordered(expected.adverbs); - expect(result.rest).toEqualUnordered(expected.rest); + assert.sameMembers(result.nouns, expected.nouns); + assert.sameMembers(result.verbs, expected.verbs); + assert.sameMembers(result.adjectives, expected.adjectives); + assert.sameMembers(result.adverbs, expected.adverbs); + assert.sameMembers(result.rest, expected.rest); done(); }); }); it('should get nouns', function(done) { wordpos.getNouns(str, function(result) { - expect(result).toEqualUnordered(expected.nouns); + assert.sameMembers(result, expected.nouns); done(); }); }); it('should get verbs', function(done) { wordpos.getVerbs(str, function(result) { - expect(result).toEqualUnordered(expected.verbs); + assert.sameMembers(result, expected.verbs); done(); }); }); it('should get adjectives', function(done) { wordpos.getAdjectives(str, function(result) { - expect(result).toEqualUnordered(expected.adjectives); + assert.sameMembers(result, expected.adjectives); done(); }); }); it('should get adverbs', function(done) { wordpos.getAdverbs(str, function(result) { - expect(result).toEqualUnordered(expected.adverbs); + assert.sameMembers(result, expected.adverbs); done(); }); }); }); + 
describe('isX()...', function() { it('should check if noun', function(done) { wordpos.isNoun(expected.nouns[0], function(result) { - expect(result).toBeTruthy(); + assert.ok(result); done(); }); }); it('should check if verb', function(done) { wordpos.isVerb(expected.verbs[0], function(result) { - expect(result).toBeTruthy(); + assert.ok(result); done(); }); }); it('should check if adjective', function(done) { wordpos.isAdjective(expected.adjectives[0], function(result) { - expect(result).toBeTruthy(); + assert.ok(result); done(); }); }); it('should check if adverb', function(done) { wordpos.isAdverb(expected.adverbs[0], function(result) { - expect(result).toBeTruthy(); + assert.ok(result); done(); }); }); }); + describe('!isX()...', function() { it('should check if !noun', function(done) { wordpos.isNoun(garble, function(result) { - expect(result).not.toBeTruthy(); + assert.notOk(result); done(); }); }); + it('should check if !verb', function(done) { wordpos.isVerb(garble, function(result) { - expect(result).not.toBeTruthy(); + assert.notOk(result); done(); }); }); + it('should check if !adjective', function(done) { wordpos.isAdjective(garble, function(result) { - expect(result).not.toBeTruthy(); + assert.notOk(result); done(); }); }); + it('should check if !adverb', function(done) { wordpos.isAdverb(garble, function(result) { - expect(result).not.toBeTruthy(); + assert.notOk(result); done(); }); }); }); + describe('lookupX()...', function() { it('should lookup noun', function(done) { wordpos.lookupNoun('squirrel', function(result) { - expect(result[0].pos).toBe('n'); - expect(result[0].lemma).toBe('squirrel'); + assert.equal(result.length, 2); + assert.equal(result[0].pos, 'n'); + assert.equal(result[0].lemma, 'squirrel'); done(); }); }); + it('should lookup verb', function(done) { wordpos.lookupVerb('bear', function(result) { - expect(result[0].pos).toBe('v'); - expect(result[0].lemma).toBe('have_a_bun_in_the_oven'); + assert.equal(result.length, 13); + assert.equal(result[0].pos, 'v'); + assert.equal(result[0].lemma, 'bear'); done(); }); }); + it('should lookup adjective', function(done) { wordpos.lookupAdjective('angry', function(result) { - expect(result[0].pos).toBe('s'); - expect(result[0].lemma).toBe('angry'); + assert.equal(result.length, 3); + assert.equal(result[0].pos, 'a'); + assert.equal(result[0].lemma, 'angry'); done(); }); }); + it('should lookup adverb', function(done) { wordpos.lookupAdverb('little', function(result) { - expect(result[0].pos).toBe('r'); - expect(result[0].lemma).toBe('little'); + assert.equal(result.length, 1); + assert.equal(result[0].pos, 'r'); + assert.equal(result[0].lemma, 'little'); done(); }); }); }); -describe('options passed to constructor', function() { - var wp, origProfile = WordPOS.defaults.profile; - - it('should override default option', function(){ - wp = new WordPOS({profile:123}); - expect(wp.options.profile).toEqual(123); - expect(WordPOS.defaults.profile).toEqual(origProfile); - }); - - it('should not erase default option', function(){ - wp = new WordPOS({aaa:123}); - expect(wp.options.aaa).toEqual(123); - expect(wp.options.profile).toEqual(WordPOS.defaults.profile); - }); -}); describe('profile option', function() { var wp = new WordPOS({profile : true}); it('should return time argument for isX()', function(done){ wp.isNoun(garble, function(result, word, time) { - expect(word).toEqual(garble); - expect(time).toBeDefined(); + assert.equal(word, garble); + assert.isDefined(time); done(); }); }); it('should return time argument for 
getX()', function(done){ wp.getNouns(garble, function(result, time) { - expect(time).toBeDefined(); + assert.isDefined(time); done(); }); }); it('should return time argument for lookupX()', function(done){ wp.isNoun(garble, function(result, time) { - expect(time).toBeDefined(); + assert.isDefined(time); done(); }); }); - it('should disable stopword filtering', function(){ + it('should disable stopword filtering', function(done){ var wp = new WordPOS({stopwords : false}), strWithStopwords = 'about after all'; // 3 adjective stopwords - expect(wp.getAdjectives(strWithStopwords, noop)).toBe(3); + wp.getAdjectives(strWithStopwords, function(result){ + assert.equal(result.length, 3); + done(); + }); }); - it('should use custom stopwords', function(){ + it('should use custom stopwords', function(done){ var wp = new WordPOS({stopwords : ['all']}), strWithStopwords = 'about after all'; // 3 adjective stopwords // 'all' should be filtered - expect(wp.getAdjectives(strWithStopwords, noop)).toBe(2); + wp.getAdjectives(strWithStopwords, function(result){ + assert.equal(result.length, 2); + done(); + }); }); - }); @@ -232,11 +273,11 @@ describe('nested callbacks on same index key', function() { it('should call inner callback', function(done){ wp.getPOS(word1, function(result) { - expect(result.nouns[0]).toEqual(word1); + assert.equal(result.nouns[0], word1); // inner call on word2 wp.getPOS(word2, function(result) { - expect(result.nouns[0]).toEqual(word2); + assert.equal(result.nouns[0], word2); done(); }); }); @@ -246,54 +287,61 @@ describe('nested callbacks on same index key', function() { describe('rand()...', function() { it('should get random word', function(done) { - wordpos.randNoun(function(result) { - expect(result).toBeTruthy(); + wordpos.rand(function(result) { + assert.equal(result.length, 1); done(); }); }); + it('should get N random words', function(done) { wordpos.rand({count: 3}, function(result) { - expect(result.length).toEqual(3); + assert.equal(result.length, 3); done(); }); }); + it('should get random word starting with', function(done) { wordpos.rand({startsWith: 'foo'}, function(result, startsWith) { - expect(result[0].indexOf('foo')).toEqual(0); - expect(startsWith).toEqual('foo'); + assert.equal(result[0].indexOf('foo'), 0); + assert.equal(startsWith, 'foo'); done(); }); }); - it('should get nothing starting with not fount', function(done) { + + it('should get nothing starting with not found', function(done) { wordpos.rand({startsWith: 'zzzz'}, function(result) { - expect(result.length).toEqual(0); + assert.equal(result.length, 0); done(); }); }); }); + describe('randX()...', function() { it('should get random noun', function(done) { wordpos.randNoun(function(result) { - expect(result.length).toEqual(1); + assert.equal(result.length, 1); done(); }); }); + it('should get random verb', function(done) { wordpos.randVerb(function(result) { - expect(result.length).toEqual(1); + assert.equal(result.length, 1); done(); }); }); + it('should get random adjective', function(done) { wordpos.randAdjective(function(result) { - expect(result.length).toEqual(1); + assert.equal(result.length, 1); done(); }); }); + it('should get random adverb', function(done) { wordpos.randAdverb(function(result) { - expect(result.length).toEqual(1); + assert.equal(result.length, 1); done(); }); }); @@ -301,9 +349,43 @@ describe('randX()...', function() { // not found it('should NOT get random noun starting with', function(done) { wordpos.randNoun({startsWith: 'zzzz'},function(result, startsWith) { - 
expect(result.length).toEqual(0); + assert.equal(result.length, 0); done(); }); }); +}); + + +describe('Promise pattern', function() { + + it('lookup()', function () { + return wordpos.lookup('hegemony').then(function (result) { + assert.equal(result.length, 1); + }); + }); + + it('lookupX()', function () { + return wordpos.lookupNoun('hegemony').then(function (result) { + assert.equal(result.length, 1); + }); + }); + + it('getPOS()', function () { + return wordpos.getPOS('hegemony').then(function (result) { + assert.equal(result.nouns.length, 1); + }); + }); + + it('getX()', function () { + return wordpos.getVerbs('bear').then(function (result) { + assert.equal(result.length, 1); + }); + }); + + it('isX()', function () { + return wordpos.isAdjective('little').then(function (result) { + assert.equal(result, true); + }); + }); }); \ No newline at end of file
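
For reference, below is a minimal usage sketch of the Promise-based API exercised by the tests above. It assumes wordpos 1.x is installed along with its wordnet-db data; the result shapes follow the assertions in test/wordpos_test.js, and this is an illustration under those assumptions, not part of the patch itself.

```js
var WordPOS = require('wordpos'),
    wordpos = new WordPOS();

// getPOS() resolves to an object: {nouns, verbs, adjectives, adverbs, rest}
wordpos.getPOS('The angry bear chased the frightened little squirrel')
  .then(function (result) {
    console.log(result.adjectives);           // e.g. ['angry', 'frightened', 'little']
  });

// isX() resolves to a boolean
wordpos.isAdjective('little').then(function (yes) {
  console.log(yes);                           // true
});

// lookup() resolves to an array of definition objects ({pos, lemma, synonyms, ...})
wordpos.lookup('hegemony').then(function (defs) {
  console.log(defs[0].pos, defs[0].lemma);    // 'n' 'hegemony'
});
```

Each of these methods also accepts an optional callback, as used throughout test/wordpos_test.js, in addition to returning a Promise.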