add stopword option, pass array to getX(), lookupX() cb gets lookup word
This commit is contained in:
parent
8c3ec4ea8a
commit
75a51beccd
45
README.md
45
README.md
|
@ -18,7 +18,7 @@ wordpos.getAdjectives('The angry bear chased the frightened little squirrel.', f
|
|||
wordpos.isAdjective('awesome', function(result){
|
||||
console.log(result);
|
||||
});
|
||||
// true
|
||||
// true 'awesome'
|
||||
```
|
||||
|
||||
See `wordpos_spec.js` for full usage.
|
||||
|
@ -46,31 +46,33 @@ Please note: all API are async since the underlying WordNet library is async. Wo
|
|||
Get POS from text.
|
||||
|
||||
```
|
||||
wordpos.getPOS(str, callback) -- callback receives a result object:
|
||||
wordpos.getPOS(text, callback) -- callback receives a result object:
|
||||
{
|
||||
nouns:[], Array of str words that are nouns
|
||||
verbs:[], Array of str words that are verbs
|
||||
adjectives:[], Array of str words that are adjectives
|
||||
adverbs:[], Array of str words that are adverbs
|
||||
rest:[] Array of str words that are not in dict or could not be categorized as a POS
|
||||
nouns:[], Array of text words that are nouns
|
||||
verbs:[], Array of text words that are verbs
|
||||
adjectives:[], Array of text words that are adjectives
|
||||
adverbs:[], Array of text words that are adverbs
|
||||
rest:[] Array of text words that are not in dict or could not be categorized as a POS
|
||||
}
|
||||
|
||||
Note: a word may appear in multiple POS (eg, 'great' is both a noun and an adjective)
|
||||
|
||||
wordpos.getNouns(str, callback) -- callback receives an array of nouns in str
|
||||
wordpos.getNouns(text, callback) -- callback receives an array of nouns in text
|
||||
|
||||
wordpos.getVerbs(str, callback) -- callback receives an array of verbs in str
|
||||
wordpos.getVerbs(text, callback) -- callback receives an array of verbs in text
|
||||
|
||||
wordpos.getAdjectives(str, callback) -- callback receives an array of adjectives in str
|
||||
wordpos.getAdjectives(text, callback) -- callback receives an array of adjectives in text
|
||||
|
||||
wordpos.getAdverbs(str, callback) -- callback receives an array of adverbs in str
|
||||
wordpos.getAdverbs(text, callback) -- callback receives an array of adverbs in text
|
||||
```
|
||||
|
||||
If you're only interested in a certain POS (say, adjectives), using the particular getX() is faster
|
||||
than getPOS(), which looks up the word in all index files. [Stopwords](https://github.com/NaturalNode/natural/blob/master/lib/natural/util/stopwords.js)
|
||||
are stripped out from str before lookup.
|
||||
are stripped out from text before lookup.
|
||||
|
||||
All getX() functions return the number of parsed words that will be looked up (less duplicates and stopwords).
|
||||
If text is an array, all of its words are looked up as-is -- no deduplication, stopword filtering or tokenization is applied.
|
||||
|
||||
getX() functions return the number of parsed words that will be looked up (less duplicates and stopwords).
|
||||
|
||||
Example:
|
||||
|
||||
|
@ -151,6 +153,8 @@ wordpos.lookupAdjective(word, callback) -- callback receives array of lookup obj
|
|||
wordpos.lookupAdverb(word, callback) -- callback receives array of lookup objects for an adverb
|
||||
```
|
||||
|
||||
lookupX() methods pass the looked-up word as the second argument to the callback.
|
||||
|
||||
Example:
|
||||
|
||||
```js
|
||||
|
@ -166,7 +170,7 @@ wordpos.lookupAdjective('awesome', console.log);
|
|||
ptrs: [],
|
||||
gloss: 'inspiring awe or admiration or wonder; "New York is an amazing city"; "the Grand Canyon is an awe-inspiring
|
||||
sight"; "the awesome complexity of the universe"; "this sea, whose gently awful stirrings seem to speak of some hidden s
|
||||
oul beneath"- Melville; "Westminster Hall\'s awing majesty, so vast, so high, so silent" ' } ]
|
||||
oul beneath"- Melville; "Westminster Hall\'s awing majesty, so vast, so high, so silent" ' } ], 'awesome'
|
||||
```
|
||||
In this case only one lookup was found. But there could be several.
|
||||
|
||||
|
@ -182,9 +186,12 @@ wordpos.lookup('great', console.log);
|
|||
|
||||
```
|
||||
WordPOS.WNdb -- access to the WNdb object
|
||||
|
||||
WordPOS.natural -- access to underlying 'natural' module
|
||||
wordpos.parse(str) -- returns tokenized array of words, less duplicates and stopwords. This method is called on all getX() calls internally.
|
||||
|
||||
```
|
||||
E.g., WordPOS.natural.stopwords is the list of stopwords.
|
||||
|
||||
|
||||
### Options
|
||||
|
||||
|
@ -198,7 +205,13 @@ WordPOS.defaults = {
|
|||
/**
|
||||
* use fast index if available
|
||||
*/
|
||||
fastIndex: true
|
||||
fastIndex: true,
|
||||
|
||||
/**
|
||||
* if true, exclude the standard stopwords; if an array, exclude the listed words instead.
|
||||
* Set to false to not filter for any stopwords.
|
||||
*/
|
||||
stopwords: true
|
||||
};
|
||||
```
|
||||
To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive the execution time of the call (in msec) as an additional last argument (e.g., for lookupX() it follows the looked-up word).
|
||||
|
|
|
@ -206,5 +206,22 @@ describe('profile option', function() {
|
|||
done();
|
||||
});
|
||||
});
|
||||
|
||||
it('should disable stopword filtering', function(){
|
||||
var wp = new WordPOS({stopwords : false}),
|
||||
strWithStopwords = 'about after all'; // 3 adjective stopwords
|
||||
expect(wp.getAdjectives(strWithStopwords, noop)).toBe(3);
|
||||
});
|
||||
|
||||
it('should use custom stopwords', function(){
|
||||
var wp = new WordPOS({stopwords : ['all']}),
|
||||
strWithStopwords = 'about after all'; // 3 adjective stopwords
|
||||
// 'all' should be filtered
|
||||
expect(wp.getAdjectives(strWithStopwords, noop)).toBe(2);
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
|
||||
function noop(){}
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ var _ = require('underscore')._,
|
|||
natural = require('natural'),
|
||||
WordNet = natural.WordNet,
|
||||
tokenizer = new natural.WordTokenizer(),
|
||||
stopwords = ' '+ natural.stopwords.join(' ') +' ',
|
||||
natural_stopwords = makeStopwordString(natural.stopwords),
|
||||
WNdb = require('WNdb'),
|
||||
fastIndex = null;
|
||||
|
||||
|
@ -26,12 +26,21 @@ function normalize(word) {
|
|||
return word.toLowerCase().replace(/\s+/g, '_');
|
||||
}
|
||||
|
||||
function isStopword(word) {
|
||||
/**
 * Joins an array of stopwords into a single space-delimited string with one
 * leading and one trailing space, so that every entry is surrounded by spaces.
 * isStopword() relies on this padding to match whole words with indexOf.
 *
 * @param {Array} stopwords - list of stopword strings
 * @returns {string} space-padded, space-delimited stopword string
 */
function makeStopwordString(stopwords) {
|
||||
return ' '+ stopwords.join(' ') +' ';
|
||||
}
|
||||
|
||||
/**
 * Tests whether a word is present in a stopword string.
 *
 * @param {string} stopwords - space-padded stopword string, as produced by
 *   makeStopwordString()
 * @param {string} word - word to test
 * @returns {boolean} true if the word occurs as a complete entry
 */
function isStopword(stopwords, word) {
|
||||
// Wrap the word in spaces so only whole entries match, not substrings of longer stopwords.
return stopwords.indexOf(' '+word+' ') >= 0;
|
||||
}
|
||||
|
||||
function prepText(text) {
|
||||
return _.reject(_.uniq(tokenizer.tokenize(text)), isStopword);
|
||||
if (_.isArray(text)) return text;
|
||||
var deduped = _.uniq(tokenizer.tokenize(text));
|
||||
if (!this.options.stopwords) return deduped;
|
||||
return _.reject(deduped, _.bind(isStopword, null,
|
||||
_.isString(this.options.stopwords) ? this.options.stopwords : natural_stopwords
|
||||
));
|
||||
}
|
||||
|
||||
function lookup(pos) {
|
||||
|
@ -43,7 +52,7 @@ function lookup(pos) {
|
|||
this.lookupFromFiles([
|
||||
{index: this.getIndexFile(pos), data: this.getDataFile(pos)}
|
||||
], [], word, function(results){
|
||||
args.push(results);
|
||||
args.push(results, word);
|
||||
profile && args.push(new Date() - start);
|
||||
callback.apply(null, args);
|
||||
});
|
||||
|
@ -70,7 +79,7 @@ function get(isFn) {
|
|||
return function(text, callback) {
|
||||
var profile = this.options.profile,
|
||||
start = profile && new Date(),
|
||||
words = prepText(text),
|
||||
words = this.parse(text),
|
||||
n = words.length,
|
||||
i = 0,
|
||||
self = this,
|
||||
|
@ -110,6 +119,10 @@ var WordPOS = function(options) {
|
|||
this.adjIndex.find = fastIndex.find(this.adjIndex);
|
||||
this.advIndex.find = fastIndex.find(this.advIndex);
|
||||
}
|
||||
|
||||
if (_.isArray(this.options.stopwords)) {
|
||||
this.options.stopwords = makeStopwordString(this.options.stopwords);
|
||||
}
|
||||
};
|
||||
util.inherits(WordPOS, WordNet);
|
||||
|
||||
|
@ -122,7 +135,13 @@ WordPOS.defaults = {
|
|||
/**
|
||||
* use fast index if available
|
||||
*/
|
||||
fastIndex: true
|
||||
fastIndex: true,
|
||||
|
||||
/**
|
||||
* if true, exclude the standard stopwords; if an array, exclude the listed words instead.
|
||||
* Set to false to not filter for any stopwords.
|
||||
*/
|
||||
stopwords: true
|
||||
};
|
||||
|
||||
var wordposProto = WordPOS.prototype;
|
||||
|
@ -200,7 +219,7 @@ wordposProto.getPOS = function(text, callback) {
|
|||
args = [data],
|
||||
testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
|
||||
parts = 'nouns verbs adjectives adverbs'.split(' '),
|
||||
words = prepText(text),
|
||||
words = this.parse(text),
|
||||
nTests = testFns.length,
|
||||
nWords = words.length,
|
||||
self = this,
|
||||
|
@ -237,5 +256,7 @@ wordposProto.getPOS = function(text, callback) {
|
|||
};
|
||||
|
||||
WordPOS.WNdb = WNdb;
|
||||
WordPOS.natural = natural;
|
||||
|
||||
|
||||
module.exports = WordPOS;
|
||||
|
|
Loading…
Reference in New Issue