add stopword option, pass array to getX(), lookupX() cb gets lookup word

2012-05-30 14:58:18 -07:00 · 2012-05-30 14:58:18 -07:00 · 75a51beccd
parent 8c3ec4ea8a
commit 75a51beccd
3 changed files with 74 additions and 23 deletions
--- a/README.md
+++ b/README.md
@ -18,7 +18,7 @@ wordpos.getAdjectives('The angry bear chased the frightened little squirrel.', f
 wordpos.isAdjective('awesome', function(result){
    console.log(result);
 });
-// true
+// true 'awesome'
 ```
 See `wordpos_spec.js` for full usage.
@ -46,31 +46,33 @@ Please note: all API are async since the underlying WordNet library is async. Wo
 Get POS from text.
 ```
-wordpos.getPOS(str, callback) -- callback receives a result object:
+wordpos.getPOS(text, callback) -- callback receives a result object:
    {
-      nouns:[],       Array of str words that are nouns
+      nouns:[],       Array of text words that are nouns
-      verbs:[],       Array of str words that are verbs
+      verbs:[],       Array of text words that are verbs
-      adjectives:[],  Array of str words that are adjectives
+      adjectives:[],  Array of text words that are adjectives
-      adverbs:[],     Array of str words that are adverbs
+      adverbs:[],     Array of text words that are adverbs
-      rest:[]         Array of str words that are not in dict or could not be categorized as a POS
+      rest:[]         Array of text words that are not in dict or could not be categorized as a POS
    }
    Note: a word may appear in multiple POS (eg, 'great' is both a noun and an adjective)
-wordpos.getNouns(str, callback) -- callback receives an array of nouns in str
+wordpos.getNouns(text, callback) -- callback receives an array of nouns in text
-wordpos.getVerbs(str, callback) -- callback receives an array of verbs in str
+wordpos.getVerbs(text, callback) -- callback receives an array of verbs in text
-wordpos.getAdjectives(str, callback) -- callback receives an array of adjectives in str
+wordpos.getAdjectives(text, callback) -- callback receives an array of adjectives in text
-wordpos.getAdverbs(str, callback) -- callback receives an array of adverbs in str
+wordpos.getAdverbs(text, callback) -- callback receives an array of adverbs in text
 ```
 If you're only interested in a certain POS (say, adjectives), using the particular getX() is faster
 than getPOS(), which looks up the word in all index files. [stopwords](https://github.com/NaturalNode/natural/blob/master/lib/natural/util/stopwords.js)
-are stripped out from str before lookup.
+are stripped out from text before lookup.
-All getX() functions return the number of parsed words that will be looked up (less duplicates and stopwords).
+If text is an array, all words are looked up -- no deduplication, stopword filtering, or tokenization is applied.
 getX() functions return the number of parsed words that will be looked up (less duplicates and stopwords).
 Example:
@ -151,6 +153,8 @@ wordpos.lookupAdjective(word, callback) -- callback receives array of lookup obj
 wordpos.lookupAdverb(word, callback) -- callback receives array of lookup objects for an adverb
 ```
lookupX() methods pass the looked-up word as the second argument to the callback.
 Example:
 ```js
@ -166,7 +170,7 @@ wordpos.lookupAdjective('awesome', console.log);
    ptrs: [],
    gloss: 'inspiring awe or admiration or wonder; "New York is an amazing city"; "the Grand Canyon is an awe-inspiring
 sight"; "the awesome complexity of the universe"; "this sea, whose gently awful stirrings seem to speak of some hidden s
-oul beneath"- Melville; "Westminster Hall\'s awing majesty, so vast, so high, so silent"  ' } ]
+oul beneath"- Melville; "Westminster Hall\'s awing majesty, so vast, so high, so silent"  ' } ], 'awesome'
 ```
 In this case only one lookup was found.  But there could be several.
@ -182,9 +186,12 @@ wordpos.lookup('great', console.log);
 ```
 WordPOS.WNdb -- access to the WNdb object
-
+WordPOS.natural -- access to underlying 'natural' module
 wordpos.parse(str) -- returns tokenized array of words, less duplicates and stopwords.  This method is called on all getX() calls internally.
 ```
 E.g., WordPOS.natural.stopwords is the list of stopwords.
 ### Options
@ -198,7 +205,13 @@ WordPOS.defaults = {
  /**
   * use fast index if available
   */
-  fastIndex: true
+  fastIndex: true,
  /**
   * if true, exclude the standard stopwords; if an array, exclude the stop words
   * it lists instead. Set to false to disable stopword filtering.
   */
  stopwords: true
 };
 ```
 To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive the execution time of the call, in msec, as an additional last argument (after the looked-up word, where one is passed).
--- a/spec/wordpos_spec.js
+++ b/spec/wordpos_spec.js
@ -206,5 +206,22 @@ describe('profile option', function() {
      done();
    });
  });
  // With stopword filtering switched off, every word of the input is counted.
  it('should disable stopword filtering', function(){
    var text = 'about after all';  // 3 adjective stopwords
    var wordpos = new WordPOS({stopwords : false});
    var lookupCount = wordpos.getAdjectives(text, noop);
    expect(lookupCount).toBe(3);
  });
  // A custom stopword array is used in place of the standard list.
  it('should use custom stopwords', function(){
    var text = 'about after all';  // 3 adjective stopwords
    var wordpos = new WordPOS({stopwords : ['all']});
    // only 'all' is in the custom list, so it alone is filtered out
    var lookupCount = wordpos.getAdjectives(text, noop);
    expect(lookupCount).toBe(2);
  });
 });
 /** Empty callback for specs that only check the synchronous return value of getX(). */
 function noop() {
   return;
 }
--- a/src/wordpos.js
+++ b/src/wordpos.js
@ -14,7 +14,7 @@ var _ = require('underscore')._,
  natural = require('natural'),
  WordNet = natural.WordNet,
  tokenizer = new natural.WordTokenizer(),
-  stopwords = ' '+ natural.stopwords.join(' ') +' ',
+  natural_stopwords = makeStopwordString(natural.stopwords),
  WNdb = require('WNdb'),
  fastIndex = null;
@ -26,12 +26,21 @@ function normalize(word) {
  return word.toLowerCase().replace(/\s+/g, '_');
 }
-function isStopword(word) {
+function makeStopwordString(stopwords) {
  // Join the stopword array into one space-separated string, padded with a
  // space at both ends so every entry (including the first and last) is
  // surrounded by spaces; isStopword() searches for ' '+word+' '.
  return ' '+ stopwords.join(' ') +' ';
 }
 /**
  * Test whether a word occurs in a space-delimited stopword string.
  *
  * @param {string} stopwords  stopword list as built by makeStopwordString()
  * @param {string} word       word to test; matching is exact (case-sensitive)
  * @return {boolean} true if ' '+word+' ' is found in stopwords
  */
 function isStopword(stopwords, word) {
   var needle = ' ' + word + ' ';
   return stopwords.indexOf(needle) !== -1;
 }
 /**
  * Build the list of words to look up from `text`.
  *
  * Reads this.options.stopwords, so it must be called with a receiver that
  * has an `options` object. NOTE(review): the visible callers invoke
  * this.parse(text); presumably `parse` is this function -- confirm.
  *
  * @param {string|Array} text  raw text, or an array of words
  * @return {Array} words to look up
  */
 function prepText(text) {
-  return _.reject(_.uniq(tokenizer.tokenize(text)), isStopword);
+  if (_.isArray(text)) return text;
  // An array (handled above) is returned as-is: no tokenizing, de-duplication
  // or stopword filtering. Anything else is tokenized and de-duplicated here.
  var deduped = _.uniq(tokenizer.tokenize(text));
  // A falsy stopwords option disables filtering altogether.
  if (!this.options.stopwords) return deduped;
  // Filter against the option's own stopword string when it is a string;
  // for any other truthy value fall back to the standard natural stopwords.
  return _.reject(deduped, _.bind(isStopword, null,
      _.isString(this.options.stopwords) ? this.options.stopwords : natural_stopwords
      ));
 }
 function lookup(pos) {
@ -43,7 +52,7 @@ function lookup(pos) {
    this.lookupFromFiles([
        {index: this.getIndexFile(pos), data: this.getDataFile(pos)}
        ], [], word, function(results){
-        args.push(results);
+        args.push(results, word);
        profile && args.push(new Date() - start);
        callback.apply(null, args);
    });
@ -70,7 +79,7 @@ function get(isFn) {
  return function(text, callback) {
    var profile = this.options.profile,
      start = profile && new Date(),
-      words = prepText(text),
+      words = this.parse(text),
      n = words.length,
      i = 0,
      self = this,
@ -110,6 +119,10 @@ var WordPOS = function(options) {
    this.adjIndex.find = fastIndex.find(this.adjIndex);
    this.advIndex.find = fastIndex.find(this.advIndex);
  }
  if (_.isArray(this.options.stopwords)) {
    this.options.stopwords = makeStopwordString(this.options.stopwords);
  }
 };
 util.inherits(WordPOS, WordNet);
@ -122,7 +135,13 @@ WordPOS.defaults = {
  /**
   * use fast index if available
   */
-  fastIndex: true
+  fastIndex: true,
  /**
   * if true, exclude the standard stopwords; if an array, exclude the stop words
   * it lists instead. Set to false to disable stopword filtering.
   */
  stopwords: true
 };
 var wordposProto = WordPOS.prototype;
@ -200,7 +219,7 @@ wordposProto.getPOS = function(text, callback) {
    args = [data],
    testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
    parts = 'nouns verbs adjectives adverbs'.split(' '),
-    words = prepText(text),
+    words = this.parse(text),
    nTests = testFns.length,
    nWords = words.length,
    self = this,
@ -237,5 +256,7 @@ wordposProto.getPOS = function(text, callback) {
 };
 WordPOS.WNdb = WNdb;
 WordPOS.natural = natural;
 module.exports = WordPOS;