add stopword option, pass array to getX(), lookupX() cb gets lookup word

This commit is contained in:
moos 2012-05-30 14:58:18 -07:00
parent 8c3ec4ea8a
commit 75a51beccd
3 changed files with 74 additions and 23 deletions

View File

@ -18,7 +18,7 @@ wordpos.getAdjectives('The angry bear chased the frightened little squirrel.', f
wordpos.isAdjective('awesome', function(result){ wordpos.isAdjective('awesome', function(result){
console.log(result); console.log(result);
}); });
// true // true 'awesome'
``` ```
See `wordpos_spec.js` for full usage. See `wordpos_spec.js` for full usage.
@ -46,31 +46,33 @@ Please note: all API are async since the underlying WordNet library is async. Wo
Get POS from text. Get POS from text.
``` ```
wordpos.getPOS(str, callback) -- callback receives a result object: wordpos.getPOS(text, callback) -- callback receives a result object:
{ {
nouns:[], Array of str words that are nouns nouns:[], Array of text words that are nouns
verbs:[], Array of str words that are verbs verbs:[], Array of text words that are verbs
adjectives:[], Array of str words that are adjectives adjectives:[], Array of text words that are adjectives
adverbs:[], Array of str words that are adverbs adverbs:[], Array of text words that are adverbs
rest:[] Array of str words that are not in dict or could not be categorized as a POS rest:[] Array of text words that are not in dict or could not be categorized as a POS
} }
Note: a word may appear in multiple POS (eg, 'great' is both a noun and an adjective) Note: a word may appear in multiple POS (eg, 'great' is both a noun and an adjective)
wordpos.getNouns(str, callback) -- callback receives an array of nouns in str wordpos.getNouns(text, callback) -- callback receives an array of nouns in text
wordpos.getVerbs(str, callback) -- callback receives an array of verbs in str wordpos.getVerbs(text, callback) -- callback receives an array of verbs in text
wordpos.getAdjectives(str, callback) -- callback receives an array of adjectives in str wordpos.getAdjectives(text, callback) -- callback receives an array of adjectives in text
wordpos.getAdverbs(str, callback) -- callback receives an array of adverbs in str wordpos.getAdverbs(text, callback) -- callback receives an array of adverbs in text
``` ```
If you're only interested in a certain POS (say, adjectives), using the particular getX() is faster If you're only interested in a certain POS (say, adjectives), using the particular getX() is faster
than getPOS() which looks up the word in all index files. [stopwords](https://github.com/NaturalNode/natural/blob/master/lib/natural/util/stopwords.js) than getPOS() which looks up the word in all index files. [stopwords](https://github.com/NaturalNode/natural/blob/master/lib/natural/util/stopwords.js)
are stripped out from str before lookup. are stripped out from text before lookup.
All getX() functions return the number of parsed words that will be looked up (less duplicates and stopwords). If text is an array, all words are looked up -- no deduplication, stopword filtering or tokenization is applied.
getX() functions return the number of parsed words that will be looked up (less duplicates and stopwords).
Example: Example:
@ -151,6 +153,8 @@ wordpos.lookupAdjective(word, callback) -- callback receives array of lookup obj
wordpos.lookupAdverb(word, callback) -- callback receives array of lookup objects for an adverb wordpos.lookupAdverb(word, callback) -- callback receives array of lookup objects for an adverb
``` ```
lookupX() methods pass the looked-up word as the second argument to the callback.
Example: Example:
```js ```js
@ -166,7 +170,7 @@ wordpos.lookupAdjective('awesome', console.log);
ptrs: [], ptrs: [],
gloss: 'inspiring awe or admiration or wonder; "New York is an amazing city"; "the Grand Canyon is an awe-inspiring gloss: 'inspiring awe or admiration or wonder; "New York is an amazing city"; "the Grand Canyon is an awe-inspiring
sight"; "the awesome complexity of the universe"; "this sea, whose gently awful stirrings seem to speak of some hidden s sight"; "the awesome complexity of the universe"; "this sea, whose gently awful stirrings seem to speak of some hidden s
oul beneath"- Melville; "Westminster Hall\'s awing majesty, so vast, so high, so silent" ' } ] oul beneath"- Melville; "Westminster Hall\'s awing majesty, so vast, so high, so silent" ' } ], 'awesome'
``` ```
In this case only one lookup was found. But there could be several. In this case only one lookup was found. But there could be several.
@ -182,9 +186,12 @@ wordpos.lookup('great', console.log);
``` ```
WordPOS.WNdb -- access to the WNdb object WordPOS.WNdb -- access to the WNdb object
WordPOS.natural -- access to underlying 'natural' module
wordpos.parse(str) -- returns tokenized array of words, less duplicates and stopwords. This method is called on all getX() calls internally. wordpos.parse(str) -- returns tokenized array of words, less duplicates and stopwords. This method is called on all getX() calls internally.
``` ```
E.g., WordPOS.natural.stopwords is the list of stopwords.
### Options ### Options
@ -198,7 +205,13 @@ WordPOS.defaults = {
/** /**
* use fast index if available * use fast index if available
*/ */
fastIndex: true fastIndex: true,
/**
* if true, exclude the standard stopwords; if an array, exclude those stop words instead.
* Set to false to disable stopword filtering.
*/
stopwords: true
}; };
``` ```
To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive the execution time in msec of the call as an additional last argument (e.g., after the looked-up word for lookupX()). To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive the execution time in msec of the call as an additional last argument (e.g., after the looked-up word for lookupX()).

View File

@ -206,5 +206,22 @@ describe('profile option', function() {
done(); done();
}); });
}); });
it('should disable stopword filtering', function(){
var wp = new WordPOS({stopwords : false}),
strWithStopwords = 'about after all'; // 3 adjective stopwords
expect(wp.getAdjectives(strWithStopwords, noop)).toBe(3);
});
it('should use custom stopwords', function(){
var wp = new WordPOS({stopwords : ['all']}),
strWithStopwords = 'about after all'; // 3 adjective stopwords
// 'all' should be filtered
expect(wp.getAdjectives(strWithStopwords, noop)).toBe(2);
});
}); });
/**
 * Do-nothing callback for specs that only assert on the synchronous
 * return value of a getX() call and ignore the async result.
 */
function noop() {
  // intentionally empty
}

View File

@ -14,7 +14,7 @@ var _ = require('underscore')._,
natural = require('natural'), natural = require('natural'),
WordNet = natural.WordNet, WordNet = natural.WordNet,
tokenizer = new natural.WordTokenizer(), tokenizer = new natural.WordTokenizer(),
stopwords = ' '+ natural.stopwords.join(' ') +' ', natural_stopwords = makeStopwordString(natural.stopwords),
WNdb = require('WNdb'), WNdb = require('WNdb'),
fastIndex = null; fastIndex = null;
@ -26,12 +26,21 @@ function normalize(word) {
return word.toLowerCase().replace(/\s+/g, '_'); return word.toLowerCase().replace(/\s+/g, '_');
} }
function isStopword(word) { function makeStopwordString(stopwords) {
return ' '+ stopwords.join(' ') +' ';
}
function isStopword(stopwords, word) {
return stopwords.indexOf(' '+word+' ') >= 0; return stopwords.indexOf(' '+word+' ') >= 0;
} }
function prepText(text) { function prepText(text) {
return _.reject(_.uniq(tokenizer.tokenize(text)), isStopword); if (_.isArray(text)) return text;
var deduped = _.uniq(tokenizer.tokenize(text));
if (!this.options.stopwords) return deduped;
return _.reject(deduped, _.bind(isStopword, null,
_.isString(this.options.stopwords) ? this.options.stopwords : natural_stopwords
));
} }
function lookup(pos) { function lookup(pos) {
@ -43,7 +52,7 @@ function lookup(pos) {
this.lookupFromFiles([ this.lookupFromFiles([
{index: this.getIndexFile(pos), data: this.getDataFile(pos)} {index: this.getIndexFile(pos), data: this.getDataFile(pos)}
], [], word, function(results){ ], [], word, function(results){
args.push(results); args.push(results, word);
profile && args.push(new Date() - start); profile && args.push(new Date() - start);
callback.apply(null, args); callback.apply(null, args);
}); });
@ -70,7 +79,7 @@ function get(isFn) {
return function(text, callback) { return function(text, callback) {
var profile = this.options.profile, var profile = this.options.profile,
start = profile && new Date(), start = profile && new Date(),
words = prepText(text), words = this.parse(text),
n = words.length, n = words.length,
i = 0, i = 0,
self = this, self = this,
@ -110,6 +119,10 @@ var WordPOS = function(options) {
this.adjIndex.find = fastIndex.find(this.adjIndex); this.adjIndex.find = fastIndex.find(this.adjIndex);
this.advIndex.find = fastIndex.find(this.advIndex); this.advIndex.find = fastIndex.find(this.advIndex);
} }
if (_.isArray(this.options.stopwords)) {
this.options.stopwords = makeStopwordString(this.options.stopwords);
}
}; };
util.inherits(WordPOS, WordNet); util.inherits(WordPOS, WordNet);
@ -122,7 +135,13 @@ WordPOS.defaults = {
/** /**
* use fast index if available * use fast index if available
*/ */
fastIndex: true fastIndex: true,
/**
* if true, exclude the standard stopwords; if an array, exclude those stop words instead.
* Set to false to disable stopword filtering.
*/
stopwords: true
}; };
var wordposProto = WordPOS.prototype; var wordposProto = WordPOS.prototype;
@ -200,7 +219,7 @@ wordposProto.getPOS = function(text, callback) {
args = [data], args = [data],
testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '), testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
parts = 'nouns verbs adjectives adverbs'.split(' '), parts = 'nouns verbs adjectives adverbs'.split(' '),
words = prepText(text), words = this.parse(text),
nTests = testFns.length, nTests = testFns.length,
nWords = words.length, nWords = words.length,
self = this, self = this,
@ -237,5 +256,7 @@ wordposProto.getPOS = function(text, callback) {
}; };
WordPOS.WNdb = WNdb; WordPOS.WNdb = WNdb;
WordPOS.natural = natural;
module.exports = WordPOS; module.exports = WordPOS;