add stopword option, pass array to getX(), lookupX() cb gets lookup word
This commit is contained in:
parent
8c3ec4ea8a
commit
75a51beccd
45
README.md
45
README.md
|
@ -18,7 +18,7 @@ wordpos.getAdjectives('The angry bear chased the frightened little squirrel.', f
|
||||||
wordpos.isAdjective('awesome', function(result){
|
wordpos.isAdjective('awesome', function(result){
|
||||||
console.log(result);
|
console.log(result);
|
||||||
});
|
});
|
||||||
// true
|
// true 'awesome'
|
||||||
```
|
```
|
||||||
|
|
||||||
See `wordpos_spec.js` for full usage.
|
See `wordpos_spec.js` for full usage.
|
||||||
|
@ -46,31 +46,33 @@ Please note: all API are async since the underlying WordNet library is async. Wo
|
||||||
Get POS from text.
|
Get POS from text.
|
||||||
|
|
||||||
```
|
```
|
||||||
wordpos.getPOS(str, callback) -- callback receives a result object:
|
wordpos.getPOS(text, callback) -- callback receives a result object:
|
||||||
{
|
{
|
||||||
nouns:[], Array of str words that are nouns
|
nouns:[], Array of text words that are nouns
|
||||||
verbs:[], Array of str words that are verbs
|
verbs:[], Array of text words that are verbs
|
||||||
adjectives:[], Array of str words that are adjectives
|
adjectives:[], Array of text words that are adjectives
|
||||||
adverbs:[], Array of str words that are adverbs
|
adverbs:[], Array of text words that are adverbs
|
||||||
rest:[] Array of str words that are not in dict or could not be categorized as a POS
|
rest:[] Array of text words that are not in dict or could not be categorized as a POS
|
||||||
}
|
}
|
||||||
|
|
||||||
Note: a word may appear in multiple POS (eg, 'great' is both a noun and an adjective)
|
Note: a word may appear in multiple POS (eg, 'great' is both a noun and an adjective)
|
||||||
|
|
||||||
wordpos.getNouns(str, callback) -- callback receives an array of nouns in str
|
wordpos.getNouns(text, callback) -- callback receives an array of nouns in text
|
||||||
|
|
||||||
wordpos.getVerbs(str, callback) -- callback receives an array of verbs in str
|
wordpos.getVerbs(text, callback) -- callback receives an array of verbs in text
|
||||||
|
|
||||||
wordpos.getAdjectives(str, callback) -- callback receives an array of adjectives in str
|
wordpos.getAdjectives(text, callback) -- callback receives an array of adjectives in text
|
||||||
|
|
||||||
wordpos.getAdverbs(str, callback) -- callback receives an array of adverbs in str
|
wordpos.getAdverbs(text, callback) -- callback receives an array of adverbs in text
|
||||||
```
|
```
|
||||||
|
|
||||||
If you're only interested in a certain POS (say, adjectives), using the particular getX() is faster
|
If you're only interested in a certain POS (say, adjectives), using the particular getX() is faster
|
||||||
than getPOS() which looks up the word in all index files. [stopwords] (https://github.com/NaturalNode/natural/blob/master/lib/natural/util/stopwords.js)
|
than getPOS() which looks up the word in all index files. [stopwords] (https://github.com/NaturalNode/natural/blob/master/lib/natural/util/stopwords.js)
|
||||||
are stripped out from str before lookup.
|
are stripped out from text before lookup.
|
||||||
|
|
||||||
All getX() functions return the number of parsed words that will be looked up (less duplicates and stopwords).
|
If text is an array, all words are looked up -- no deduplication, stopword filtering, or tokenization is applied.
|
||||||
|
|
||||||
|
getX() functions return the number of parsed words that will be looked up (less duplicates and stopwords).
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
|
@ -151,6 +153,8 @@ wordpos.lookupAdjective(word, callback) -- callback receives array of lookup obj
|
||||||
wordpos.lookupAdverb(word, callback) -- callback receives array of lookup objects for an adverb
|
wordpos.lookupAdverb(word, callback) -- callback receives array of lookup objects for an adverb
|
||||||
```
|
```
|
||||||
|
|
||||||
|
lookupX() methods pass the looked-up word as the second argument to the callback.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
```js
|
```js
|
||||||
|
@ -166,7 +170,7 @@ wordpos.lookupAdjective('awesome', console.log);
|
||||||
ptrs: [],
|
ptrs: [],
|
||||||
gloss: 'inspiring awe or admiration or wonder; "New York is an amazing city"; "the Grand Canyon is an awe-inspiring
|
gloss: 'inspiring awe or admiration or wonder; "New York is an amazing city"; "the Grand Canyon is an awe-inspiring
|
||||||
sight"; "the awesome complexity of the universe"; "this sea, whose gently awful stirrings seem to speak of some hidden s
|
sight"; "the awesome complexity of the universe"; "this sea, whose gently awful stirrings seem to speak of some hidden s
|
||||||
oul beneath"- Melville; "Westminster Hall\'s awing majesty, so vast, so high, so silent" ' } ]
|
oul beneath"- Melville; "Westminster Hall\'s awing majesty, so vast, so high, so silent" ' } ], 'awesome'
|
||||||
```
|
```
|
||||||
In this case only one lookup was found. But there could be several.
|
In this case only one lookup was found. But there could be several.
|
||||||
|
|
||||||
|
@ -182,9 +186,12 @@ wordpos.lookup('great', console.log);
|
||||||
|
|
||||||
```
|
```
|
||||||
WordPOS.WNdb -- access to the WNdb object
|
WordPOS.WNdb -- access to the WNdb object
|
||||||
|
WordPOS.natural -- access to underlying 'natural' module
|
||||||
wordpos.parse(str) -- returns tokenized array of words, less duplicates and stopwords. This method is called on all getX() calls internally.
|
wordpos.parse(str) -- returns tokenized array of words, less duplicates and stopwords. This method is called on all getX() calls internally.
|
||||||
|
|
||||||
```
|
```
|
||||||
|
E.g., WordPOS.natural.stopwords is the list of stopwords.
|
||||||
|
|
||||||
|
|
||||||
### Options
|
### Options
|
||||||
|
|
||||||
|
@ -198,7 +205,13 @@ WordPOS.defaults = {
|
||||||
/**
|
/**
|
||||||
* use fast index if available
|
* use fast index if available
|
||||||
*/
|
*/
|
||||||
fastIndex: true
|
fastIndex: true,
|
||||||
|
|
||||||
|
/**
|
||||||
|
* if true, exclude the standard stopwords; if an array, exclude only the stop words it lists.
|
||||||
|
* Set to false to disable stopword filtering.
|
||||||
|
*/
|
||||||
|
stopwords: true
|
||||||
};
|
};
|
||||||
```
|
```
|
||||||
To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive a second argument that is the execution time in msec of the call.
|
To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive a second argument that is the execution time in msec of the call.
|
||||||
|
|
|
@ -206,5 +206,22 @@ describe('profile option', function() {
|
||||||
done();
|
done();
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('should disable stopword filtering', function(){
|
||||||
|
var wp = new WordPOS({stopwords : false}),
|
||||||
|
strWithStopwords = 'about after all'; // 3 adjective stopwords
|
||||||
|
expect(wp.getAdjectives(strWithStopwords, noop)).toBe(3);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should use custom stopwords', function(){
|
||||||
|
var wp = new WordPOS({stopwords : ['all']}),
|
||||||
|
strWithStopwords = 'about after all'; // 3 adjective stopwords
|
||||||
|
// 'all' should be filtered
|
||||||
|
expect(wp.getAdjectives(strWithStopwords, noop)).toBe(2);
|
||||||
|
});
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
function noop(){}
|
||||||
|
|
||||||
|
|
|
@ -14,7 +14,7 @@ var _ = require('underscore')._,
|
||||||
natural = require('natural'),
|
natural = require('natural'),
|
||||||
WordNet = natural.WordNet,
|
WordNet = natural.WordNet,
|
||||||
tokenizer = new natural.WordTokenizer(),
|
tokenizer = new natural.WordTokenizer(),
|
||||||
stopwords = ' '+ natural.stopwords.join(' ') +' ',
|
natural_stopwords = makeStopwordString(natural.stopwords),
|
||||||
WNdb = require('WNdb'),
|
WNdb = require('WNdb'),
|
||||||
fastIndex = null;
|
fastIndex = null;
|
||||||
|
|
||||||
|
@ -26,12 +26,21 @@ function normalize(word) {
|
||||||
return word.toLowerCase().replace(/\s+/g, '_');
|
return word.toLowerCase().replace(/\s+/g, '_');
|
||||||
}
|
}
|
||||||
|
|
||||||
function isStopword(word) {
|
function makeStopwordString(stopwords) {
|
||||||
|
return ' '+ stopwords.join(' ') +' ';
|
||||||
|
}
|
||||||
|
|
||||||
|
function isStopword(stopwords, word) {
|
||||||
return stopwords.indexOf(' '+word+' ') >= 0;
|
return stopwords.indexOf(' '+word+' ') >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
function prepText(text) {
|
function prepText(text) {
|
||||||
return _.reject(_.uniq(tokenizer.tokenize(text)), isStopword);
|
if (_.isArray(text)) return text;
|
||||||
|
var deduped = _.uniq(tokenizer.tokenize(text));
|
||||||
|
if (!this.options.stopwords) return deduped;
|
||||||
|
return _.reject(deduped, _.bind(isStopword, null,
|
||||||
|
_.isString(this.options.stopwords) ? this.options.stopwords : natural_stopwords
|
||||||
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
function lookup(pos) {
|
function lookup(pos) {
|
||||||
|
@ -43,7 +52,7 @@ function lookup(pos) {
|
||||||
this.lookupFromFiles([
|
this.lookupFromFiles([
|
||||||
{index: this.getIndexFile(pos), data: this.getDataFile(pos)}
|
{index: this.getIndexFile(pos), data: this.getDataFile(pos)}
|
||||||
], [], word, function(results){
|
], [], word, function(results){
|
||||||
args.push(results);
|
args.push(results, word);
|
||||||
profile && args.push(new Date() - start);
|
profile && args.push(new Date() - start);
|
||||||
callback.apply(null, args);
|
callback.apply(null, args);
|
||||||
});
|
});
|
||||||
|
@ -70,7 +79,7 @@ function get(isFn) {
|
||||||
return function(text, callback) {
|
return function(text, callback) {
|
||||||
var profile = this.options.profile,
|
var profile = this.options.profile,
|
||||||
start = profile && new Date(),
|
start = profile && new Date(),
|
||||||
words = prepText(text),
|
words = this.parse(text),
|
||||||
n = words.length,
|
n = words.length,
|
||||||
i = 0,
|
i = 0,
|
||||||
self = this,
|
self = this,
|
||||||
|
@ -110,6 +119,10 @@ var WordPOS = function(options) {
|
||||||
this.adjIndex.find = fastIndex.find(this.adjIndex);
|
this.adjIndex.find = fastIndex.find(this.adjIndex);
|
||||||
this.advIndex.find = fastIndex.find(this.advIndex);
|
this.advIndex.find = fastIndex.find(this.advIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (_.isArray(this.options.stopwords)) {
|
||||||
|
this.options.stopwords = makeStopwordString(this.options.stopwords);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
util.inherits(WordPOS, WordNet);
|
util.inherits(WordPOS, WordNet);
|
||||||
|
|
||||||
|
@ -122,7 +135,13 @@ WordPOS.defaults = {
|
||||||
/**
|
/**
|
||||||
* use fast index if available
|
* use fast index if available
|
||||||
*/
|
*/
|
||||||
fastIndex: true
|
fastIndex: true,
|
||||||
|
|
||||||
|
/**
|
||||||
|
* if true, exclude the standard stopwords; if an array, exclude only the stop words it lists.
|
||||||
|
* Set to false to disable stopword filtering.
|
||||||
|
*/
|
||||||
|
stopwords: true
|
||||||
};
|
};
|
||||||
|
|
||||||
var wordposProto = WordPOS.prototype;
|
var wordposProto = WordPOS.prototype;
|
||||||
|
@ -200,7 +219,7 @@ wordposProto.getPOS = function(text, callback) {
|
||||||
args = [data],
|
args = [data],
|
||||||
testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
|
testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
|
||||||
parts = 'nouns verbs adjectives adverbs'.split(' '),
|
parts = 'nouns verbs adjectives adverbs'.split(' '),
|
||||||
words = prepText(text),
|
words = this.parse(text),
|
||||||
nTests = testFns.length,
|
nTests = testFns.length,
|
||||||
nWords = words.length,
|
nWords = words.length,
|
||||||
self = this,
|
self = this,
|
||||||
|
@ -237,5 +256,7 @@ wordposProto.getPOS = function(text, callback) {
|
||||||
};
|
};
|
||||||
|
|
||||||
WordPOS.WNdb = WNdb;
|
WordPOS.WNdb = WNdb;
|
||||||
|
WordPOS.natural = natural;
|
||||||
|
|
||||||
|
|
||||||
module.exports = WordPOS;
|
module.exports = WordPOS;
|
||||||
|
|
Loading…
Reference in New Issue