diff --git a/README.md b/README.md index 9d0d758..c0c7e03 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ wordpos.getAdjectives('The angry bear chased the frightened little squirrel.', f wordpos.isAdjective('awesome', function(result){ console.log(result); }); -// true +// true 'awesome' ``` See `wordpos_spec.js` for full usage. @@ -46,31 +46,33 @@ Please note: all API are async since the underlying WordNet library is async. Wo Get POS from text. ``` -wordpos.getPOS(str, callback) -- callback receives a result object: +wordpos.getPOS(text, callback) -- callback receives a result object: { - nouns:[], Array of str words that are nouns - verbs:[], Array of str words that are verbs - adjectives:[], Array of str words that are adjectives - adverbs:[], Array of str words that are adverbs - rest:[] Array of str words that are not in dict or could not be categorized as a POS + nouns:[], Array of text words that are nouns + verbs:[], Array of text words that are verbs + adjectives:[], Array of text words that are adjectives + adverbs:[], Array of text words that are adverbs + rest:[] Array of text words that are not in dict or could not be categorized as a POS } Note: a word may appear in multiple POS (eg, 'great' is both a noun and an adjective) -wordpos.getNouns(str, callback) -- callback receives an array of nouns in str +wordpos.getNouns(text, callback) -- callback receives an array of nouns in text -wordpos.getVerbs(str, callback) -- callback receives an array of verbs in str +wordpos.getVerbs(text, callback) -- callback receives an array of verbs in text -wordpos.getAdjectives(str, callback) -- callback receives an array of adjectives in str +wordpos.getAdjectives(text, callback) -- callback receives an array of adjectives in text -wordpos.getAdverbs(str, callback) -- callback receives an array of adverbs in str +wordpos.getAdverbs(text, callback) -- callback receives an array of adverbs in text ``` If you're only interested in a certain POS (say, 
adjectives), using the particular getX() is faster than getPOS() which looks up the word in all index files. [stopwords] (https://github.com/NaturalNode/natural/blob/master/lib/natural/util/stopwords.js) -are stripped out from str before lookup. +are stripped out from text before lookup. -All getX() functions return the number of parsed words that will be looked up (less duplicates and stopwords). +If text is an array, all words are looked up -- no deduplication, stopword filtering, or tokenization is applied. + +getX() functions return the number of parsed words that will be looked up (less duplicates and stopwords). Example: @@ -151,6 +153,8 @@ wordpos.lookupAdjective(word, callback) -- callback receives array of lookup obj wordpos.lookupAdverb(word, callback) -- callback receives array of lookup objects for an adverb ``` +lookupX() methods pass the looked-up word as the second argument to the callback. + Example: ```js @@ -166,7 +170,7 @@ wordpos.lookupAdjective('awesome', console.log); ptrs: [], gloss: 'inspiring awe or admiration or wonder; "New York is an amazing city"; "the Grand Canyon is an awe-inspiring sight"; "the awesome complexity of the universe"; "this sea, whose gently awful stirrings seem to speak of some hidden s -oul beneath"- Melville; "Westminster Hall\'s awing majesty, so vast, so high, so silent" ' } ] +oul beneath"- Melville; "Westminster Hall\'s awing majesty, so vast, so high, so silent" ' } ], 'awesome' ``` In this case only one lookup was found. But there could be several. @@ -182,9 +186,12 @@ wordpos.lookup('great', console.log); ``` WordPOS.WNdb -- access to the WNdb object - +WordPOS.natural -- access to the underlying 'natural' module wordpos.parse(str) -- returns tokenized array of words, less duplicates and stopwords. This method is called on all getX() calls internally. + ``` +E.g., WordPOS.natural.stopwords is the list of stopwords. 
+ ### Options @@ -198,7 +205,13 @@ WordPOS.defaults = { /** * use fast index if available */ - fastIndex: true + fastIndex: true, + + /** + * if true, exclude standard stopwords, or array of stop words to exclude. + * Set to false to not filter for any stopwords. + */ + stopwords: true }; ``` To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive a second argument that is the execution time in msec of the call. diff --git a/spec/wordpos_spec.js b/spec/wordpos_spec.js index 88efab0..0cf7583 100644 --- a/spec/wordpos_spec.js +++ b/spec/wordpos_spec.js @@ -206,5 +206,22 @@ describe('profile option', function() { done(); }); }); + + it('should disable stopword filtering', function(){ + var wp = new WordPOS({stopwords : false}), + strWithStopwords = 'about after all'; // 3 adjective stopwords + expect(wp.getAdjectives(strWithStopwords, noop)).toBe(3); + }); + + it('should use custom stopwords', function(){ + var wp = new WordPOS({stopwords : ['all']}), + strWithStopwords = 'about after all'; // 3 adjective stopwords + // 'all' should be filtered + expect(wp.getAdjectives(strWithStopwords, noop)).toBe(2); + }); + }); + +function noop(){} + diff --git a/src/wordpos.js b/src/wordpos.js index 1b41888..ff538ae 100644 --- a/src/wordpos.js +++ b/src/wordpos.js @@ -14,7 +14,7 @@ var _ = require('underscore')._, natural = require('natural'), WordNet = natural.WordNet, tokenizer = new natural.WordTokenizer(), - stopwords = ' '+ natural.stopwords.join(' ') +' ', + natural_stopwords = makeStopwordString(natural.stopwords), WNdb = require('WNdb'), fastIndex = null; @@ -26,12 +26,21 @@ function normalize(word) { return word.toLowerCase().replace(/\s+/g, '_'); } -function isStopword(word) { +function makeStopwordString(stopwords) { + return ' '+ stopwords.join(' ') +' '; +} + +function isStopword(stopwords, word) { return stopwords.indexOf(' '+word+' ') >= 0; } function prepText(text) { - return _.reject(_.uniq(tokenizer.tokenize(text)), 
isStopword); + if (_.isArray(text)) return text; + var deduped = _.uniq(tokenizer.tokenize(text)); + if (!this.options.stopwords) return deduped; + return _.reject(deduped, _.bind(isStopword, null, + _.isString(this.options.stopwords) ? this.options.stopwords : natural_stopwords + )); } function lookup(pos) { @@ -43,7 +52,7 @@ function lookup(pos) { this.lookupFromFiles([ {index: this.getIndexFile(pos), data: this.getDataFile(pos)} ], [], word, function(results){ - args.push(results); + args.push(results, word); profile && args.push(new Date() - start); callback.apply(null, args); }); @@ -70,7 +79,7 @@ function get(isFn) { return function(text, callback) { var profile = this.options.profile, start = profile && new Date(), - words = prepText(text), + words = this.parse(text), n = words.length, i = 0, self = this, @@ -110,6 +119,10 @@ var WordPOS = function(options) { this.adjIndex.find = fastIndex.find(this.adjIndex); this.advIndex.find = fastIndex.find(this.advIndex); } + + if (_.isArray(this.options.stopwords)) { + this.options.stopwords = makeStopwordString(this.options.stopwords); + } }; util.inherits(WordPOS, WordNet); @@ -122,7 +135,13 @@ WordPOS.defaults = { /** * use fast index if available */ - fastIndex: true + fastIndex: true, + + /** + * if true, exclude standard stopwords, or array of stop words to exclude. + * Set to false to not filter for any stopwords. + */ + stopwords: true }; var wordposProto = WordPOS.prototype; @@ -200,7 +219,7 @@ wordposProto.getPOS = function(text, callback) { args = [data], testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '), parts = 'nouns verbs adjectives adverbs'.split(' '), - words = prepText(text), + words = this.parse(text), nTests = testFns.length, nWords = words.length, self = this, @@ -237,5 +256,7 @@ wordposProto.getPOS = function(text, callback) { }; WordPOS.WNdb = WNdb; +WordPOS.natural = natural; + module.exports = WordPOS;