added package.json and cleanup

2012-05-04 12:23:28 -07:00 · 2012-05-04 12:23:28 -07:00 · 518725c189
parent f084e31994
commit 518725c189
5 changed files with 332 additions and 293 deletions
--- a/README.md
+++ b/README.md
@@ -30,10 +30,10 @@ Installation

 Get the script `wordpos.js` and use it.  (npm module may be coming.)

-You may also want to manually download WordNet files from [here](http://wordnet.princeton.edu/wordnet/download/current-version/).  Unpack into folder (say `dict`).  [natural](http://github.com/NaturalNode/natural) will auto-download WordNet files -- 
-but I've found this to be unreliable as some of the files get truncated, leading the core program to hang.
+You may also want to manually download [WordNet files](http://wordnet.princeton.edu/wordnet/download/current-version/).  Unpack into folder (say `dict`).  [natural](http://github.com/NaturalNode/natural) will auto-download WordNet files --
+but I've found this to be unreliable as some of the files get truncated, leading the program to hang.

-Note: `wordpos-bench` requires a customized [uubench](https://github.com/moos/uubench) module (forthcoming). 
+Note: `wordpos-bench.js` requires a [forked uubench](https://github.com/moos/uubench) module.


 API
@@ -48,7 +48,7 @@ WordPOS is a subclass of natural's [WordNet class](https://github.com/NaturalNod

 Get POS from text.

-```js
+```
 wordpos.getPOS(str, callback) -- callback receives a result object:
    {
      nouns:[],       Array of str words that are nouns
@@ -111,7 +111,7 @@ would be considered nouns.  (see http://nltk.googlecode.com/svn/trunk/doc/book/c

 Determine if a word is a particular POS.

-```js
+```
 wordpos.isNoun(word, callback) -- callback receives result (true/false) if word is a noun.

 wordpos.isVerb(word, callback) -- callback receives result (true/false) if word is a verb.
@@ -142,7 +142,7 @@ wordpos.isAdverb('fishly', console.log);
 These calls are similar to natural's [lookup()](https://github.com/NaturalNode/natural#wordnet) call, except they can be faster if you
 already know the POS of the word.

-```js
+```
 wordpos.lookupNoun(word, callback) -- callback receives array of lookup objects for a noun

 wordpos.lookupVerb(word, callback) -- callback receives array of lookup objects for a verb
@@ -185,12 +185,22 @@ Benchmark
 Generally slow as it requires loading and searching large WordNet index files.

 Single word lookup:
+```
+  getPOS : 30 ops/s { iterations: 10, elapsed: 329 }
+  getNouns : 106 ops/s { iterations: 10, elapsed: 94 }
+  getVerbs : 111 ops/s { iterations: 10, elapsed: 90 }
+  getAdjectives : 132 ops/s { iterations: 10, elapsed: 76 }
+  getAdverbs : 137 ops/s { iterations: 10, elapsed: 73 }
+```

-    getPOS : 22 ops/s { iterations: 10, elapsed: 451 }
-    getNouns : 66 ops/s { iterations: 10, elapsed: 152 }
-    getVerbs : 66 ops/s { iterations: 10, elapsed: 152 }
-    getAdjectives : 67 ops/s { iterations: 10, elapsed: 150 }
-    getAdverbs : 83 ops/s { iterations: 10, elapsed: 120 }
+128-word lookup:
+```
+  getPOS : 0 ops/s { iterations: 1, elapsed: 2210 }
+  getNouns : 2 ops/s { iterations: 1, elapsed: 666 }
+  getVerbs : 2 ops/s { iterations: 1, elapsed: 638 }
+  getAdjectives : 2 ops/s { iterations: 1, elapsed: 489 }
+  getAdverbs : 2 ops/s { iterations: 1, elapsed: 407 }
+```

 On a win7/64-bit/dual-core/3GHz.  getPOS() is slowest as it searches through all four index files.

--- a/package.json
+++ b/package.json
@@ -0,0 +1,23 @@
+{
+  "name": "wordpos",
+  "description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.",
+  "version": "0.1.0",
+  "homepage": "https://github.com/moos/wordpos",
+  "engines": {
+    "node": ">=0.4.10"
+  },
+  "dependencies": {
+    "natural": "latest",
+    "underscore": ">=1.3.1"
+  },
+  "devDependencies": {
+    "uubench": "git://github.com/moos/uubench.git"
+  },
+  "repository" : {
+    "type" : "git",
+    "url" : "http://github.com/moos/wordpos.git"
+  },
+  "author": "Moos <mooster@42at.com>",
+  "keywords": ["natural", "language", "wordnet", "pos"],
+  "main": "./wordpos.js"
+}
--- a/wordpos-bench.js
+++ b/wordpos-bench.js
@@ -1,15 +1,14 @@

-var uubench = require('uubench'),
-	fs = require('fs'),
-	_ = require('underscore')._,
-	WordPOS = require('./wordpos'),
-	wordpos = new WordPOS('dict');
+var uubench = require('uubench'), // from: https://github.com/moos/uubench
+  fs = require('fs'),
+  _ = require('underscore')._,
+  WordPOS = require('./wordpos'),
+  wordpos = new WordPOS('dict');

 suite = new uubench.Suite({
  type: 'fixed',
  iterations: 10,
-  //delay: 750,
-  sync: true,
+  sync: true,	// important!

  start: function(tests){
    console.log('starting %d tests', tests.length);
@@ -20,7 +19,7 @@ suite = new uubench.Suite({
      , ops = .5 + stats.iterations * persec;

    console.log('  \033[90m%s : \033[36m%d \033[90mops/s\033[0m', name, ops | 0, stats);
-	pos && console.log(out(pos));
+    pos && console.log(out(pos));
  },

  done: function(time){
@@ -28,65 +27,64 @@ suite = new uubench.Suite({
  },

  section: function(name, stats) {
-	console.log('\033[35m%s\033[0m',name);
+    console.log('\033[35m%s\033[0m',name);
  }
 });


 function out(res){
-	return _(res).keys().map(function(k){ return k + ':' + res[k].length });
+  return _(res).keys().map(function(k){ return k + ':' + res[k].length });
 }



 var	text1 = 'laksasdf',
-	text128 = fs.readFileSync('text-128.txt', 'utf8'),
-	text,
-	pos,
-	str = "This is some sample text. This text can contain multiple sentences. It also works with urls like.";
+  text128 = fs.readFileSync('text-128.txt', 'utf8'),
+  text,
+  pos;


 function getPOS(next){
-	wordpos.getPOS(text, function(res){
-		pos = res;
-		next();
-	});
+  wordpos.getPOS(text, function(res){
+    pos = res;
+    next();
+  });
 }

 function getNouns(next){
-	wordpos.getNouns(text, function(res){
-		pos = {nouns: res};
-		next();
-	});
+  wordpos.getNouns(text, function(res){
+    pos = {nouns: res};
+    next();
+  });
 }

 function getVerbs(next){
-	wordpos.getVerbs(text, function(res){
-		pos = {verbs: res};
-		next();
-	});
+  wordpos.getVerbs(text, function(res){
+    pos = {verbs: res};
+    next();
+  });
 }

 function getAdjectives(next){
-	wordpos.getAdjectives(text, function(res){
-		pos = {adjectives: res};
-		next();
-	});
+  wordpos.getAdjectives(text, function(res){
+    pos = {adjectives: res};
+    next();
+  });
 }

 function getAdverbs(next){
-	wordpos.getAdverbs(text, function(res){
-		pos = {adverbs: res};
-		next();
-	});
+  wordpos.getAdverbs(text, function(res){
+    pos = {adverbs: res};
+    next();
+  });
 }

 /*
 * one word
 */
 suite.section('--1 word--', function(next){
-	text = text1;
-	next();
+  text = text1;
+  next();
 });
 suite.bench('getPOS', getPOS);
 suite.bench('getNouns', getNouns);
@@ -99,9 +97,9 @@ suite.bench('getAdverbs', getAdverbs);
 * 128 words
 */
 suite.section('--128 words--', function(next){
-	suite.options.iterations = 1;
-	text = text128;
-	next();
+  suite.options.iterations = 1;
+  text = text128;
+  next();
 });
 suite.bench('getPOS', getPOS);
 suite.bench('getNouns', getNouns);
--- a/wordpos.js
+++ b/wordpos.js
@@ -1,71 +1,71 @@
-/*!
+/**
 * wordpos
 *
-*    part-of-speech utilities using natural's wordnet module. 
+*    Node.js part-of-speech utilities using natural's WordNet module.
 *
 * Copyright (c) 2012 mooster@42at.com
 * Released under MIT license
 */

 var _ = require('underscore')._,
-	util = require('util'),
-	natural = require('./lib/natural'),
-	WordNet = natural.WordNet,
-  	tokenizer = new natural.WordTokenizer(),
-	stopwords = ' '+ natural.stopwords.join(' ') +' ';
+  util = require('util'),
+  natural = require('natural'),
+  WordNet = natural.WordNet,
+  tokenizer = new natural.WordTokenizer(),
+  stopwords = ' '+ natural.stopwords.join(' ') +' ';

 function normalize(word) {
  return word.toLowerCase().replace(/\s+/g, '_');
 }

 function isStopword(word) {
-	return stopwords.indexOf(' '+word+' ') >= 0;
+  return stopwords.indexOf(' '+word+' ') >= 0;
 }

 function prepText(text) {
-	return _.reject(_.uniq(tokenizer.tokenize(text)), isStopword);
+  return _.reject(_.uniq(tokenizer.tokenize(text)), isStopword);
 }

 function lookup(pos) {
-	return function(word, callback) {
-		word = normalize(word);
-		this.lookupFromFiles([
-	      {index: this.getIndexFile(pos), data: this.getDataFile(pos)}
-	      ], [], word, callback);
-	};
+  return function(word, callback) {
+    word = normalize(word);
+    this.lookupFromFiles([
+        {index: this.getIndexFile(pos), data: this.getDataFile(pos)}
+        ], [], word, callback);
+  };
 }

 function is(pos){
-	return function(word, callback) {
-		var index = this.getIndexFile(pos);
-		word = normalize(word);
-		index.lookup(word, function(record) {
-			callback(!!record);
-		});  
-	};
+  return function(word, callback) {
+    var index = this.getIndexFile(pos);
+    word = normalize(word);
+    index.lookup(word, function(record) {
+      callback(!!record);
+    });
+  };
 }

 function get(isFn) {
-	return function(text, callback) {
-	  var words = prepText(text),
-	    n = words.length,
-	    i = 0,
-	    self = this,
-  		results = [];
+  return function(text, callback) {
+    var words = prepText(text),
+      n = words.length,
+      i = 0,
+      self = this,
+      results = [];

-	  if (!n) return callback(results);
-	  words.forEach(function(word,j){
-		  self[isFn](word, function(yes){
-			  yes && results.push(word);
-			  (++i==n) && callback(results);
-		  });
-	  });
-	};
+    if (!n) return callback(results);
+    words.forEach(function(word,j){
+      self[isFn](word, function(yes){
+        yes && results.push(word);
+        (++i==n) && callback(results);
+      });
+    });
+  };
 }


 var WordPOS = function() {
-	WordPOS.super_.apply(this, arguments);
+  WordPOS.super_.apply(this, arguments);
 };
 util.inherits(WordPOS, WordNet);

@@ -111,19 +111,20 @@ wordposProto.getAdverbs = get('isAdverb');
 wordposProto.getNouns = get('isNoun');
 wordposProto.getVerbs = get('isVerb');

-if (!wordposProto.getIndexFile)
-  wordposProto.getIndexFile = function getIndexFile(pos) {
-	    switch(pos) {
-	      case 'n':
-	        return this.nounIndex;
-	      case 'v':
-	        return this.verbIndex;
-	      case 'a': case 's':
-	        return this.adjIndex;
-	      case 'r':
-	        return this.advIndex;
-	    }
-	};
+if (!wordposProto.getIndexFile) {
+    wordposProto.getIndexFile = function getIndexFile(pos) {
+      switch(pos) {
+        case 'n':
+          return this.nounIndex;
+        case 'v':
+          return this.verbIndex;
+        case 'a': case 's':
+          return this.adjIndex;
+        case 'r':
+          return this.advIndex;
+      }
+  };
+}

 /**
 * getPOS()
@@ -136,10 +137,10 @@ if (!wordposProto.getIndexFile)
 */
 wordposProto.getPOS = function(text, callback) {
  var data = {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]},
-  	testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
+    testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
    parts = 'nouns verbs adjectives adverbs'.split(' '),
    words = prepText(text),
-  	nTests = testFns.length,
+    nTests = testFns.length,
    nWords = words.length,
    self = this,
    c = 0;
@@ -148,31 +149,31 @@ wordposProto.getPOS = function(text, callback) {
  words.forEach(lookup);

  function lookup(word){
-	  var any = false,	
-	  	t=0;
-	  word = normalize(word);
-	  testFns.forEach(lookupPOS);
+    var any = false,
+      t=0;
+    word = normalize(word);
+    testFns.forEach(lookupPOS);

-	  function lookupPOS(isFn,i,list){
-		  self[isFn](word, function(yes){
-			  yes && data[parts[i]].push(word);
-			  any |= yes;
-			  donePOS();
-		  });
-	  }
+    function lookupPOS(isFn,i,list){
+      self[isFn](word, function(yes){
+        yes && data[parts[i]].push(word);
+        any |= yes;
+        donePOS();
+      });
+    }

-	  function donePOS() {
-		  if (++t == nTests) {
-			  !any && data['rest'].push(word);
-			  done();
-		  }
-	  }
+    function donePOS() {
+      if (++t == nTests) {
+        !any && data['rest'].push(word);
+        done();
+      }
+    }
  }

  function done(){
-	  if (++c == nWords) {
-		  callback(data);
-	  }
+    if (++c == nWords) {
+      callback(data);
+    }
  }
 };

--- a/wordpos_spec.js
+++ b/wordpos_spec.js
@@ -1,30 +1,36 @@
+// npm install jasmine-node -g
+// jasmine-node wordpos_spec.js --verbose
+
+/* Note: 'dict' folder should contain WordNet files.
+ * Download and unpack manually from http://wordnet.princeton.edu/wordnet/download/current-version/
+ */

 var WordPOS = require('./wordpos'),
-	wordpos = new WordPOS('dict');
+  wordpos = new WordPOS('dict');

 var str = "The angry bear chased the frightened little squirrel",
-	expected = { 
-	  nouns: [ 'bear', 'squirrel', 'little', 'chased' ],
-	  verbs: [ 'bear' ],
-	  adjectives: [ 'little', 'angry', 'frightened' ],
-	  adverbs: [ 'little' ],
-	  rest: [ 'the' ]
-	},
-    garble = 'garblegarble';	// expect not to find word
+  expected = {
+    nouns: [ 'bear', 'squirrel', 'little', 'chased' ],
+    verbs: [ 'bear' ],
+    adjectives: [ 'little', 'angry', 'frightened' ],
+    adverbs: [ 'little' ],
+    rest: [ 'the' ]
+  },
+  garble = 'garblegarble';	// expect not to find word


 describe('get POS', function() {

  beforeEach(function() {
-	  this.addMatchers({
-		// unordered (multiset) comparison -- NOTE: doesn't handle deep!
-		toEqualUnordered: function(expected) {
-		  var mismatchKeys=[], 
-		  	mismatchValues=[],
-		  	result = this.env.compareObjects_(this.actual, expected, mismatchKeys, mismatchValues);
-	      return result || (mismatchKeys.length == 0 && mismatchValues.length > 0);
-	    }
-	  });
+    this.addMatchers({
+    // unordered (multiset) comparison -- NOTE: doesn't handle deep!
+    toEqualUnordered: function(expected) {
+      var mismatchKeys=[],
+        mismatchValues=[],
+        result = this.env.compareObjects_(this.actual, expected, mismatchKeys, mismatchValues);
+        return result || (mismatchKeys.length == 0 && mismatchValues.length > 0);
+      }
+    });
  });

  it('should get all POS', function() {
@@ -33,41 +39,42 @@ describe('get POS', function() {
      expect(result.verbs).toEqualUnordered(expected.verbs);
      expect(result.adjectives).toEqualUnordered(expected.adjectives);
      expect(result.adverbs).toEqualUnordered(expected.adverbs);
+      expect(result.rest).toEqualUnordered(expected.rest);
      asyncSpecDone();
    });
    asyncSpecWait();
  });

  it('should get nouns', function() {
-	  wordpos.getNouns(str, function(result) {
-		  expect(result).toEqualUnordered(expected.nouns);
-		  asyncSpecDone();
-	  });
-	  asyncSpecWait();
+    wordpos.getNouns(str, function(result) {
+      expect(result).toEqualUnordered(expected.nouns);
+      asyncSpecDone();
+    });
+    asyncSpecWait();
  });

  it('should get verbs', function() {
-	  wordpos.getVerbs(str, function(result) {
-		  expect(result).toEqualUnordered(expected.verbs);
-		  asyncSpecDone();
-	  });
-	  asyncSpecWait();
+    wordpos.getVerbs(str, function(result) {
+      expect(result).toEqualUnordered(expected.verbs);
+      asyncSpecDone();
+    });
+    asyncSpecWait();
  });

  it('should get adjectives', function() {
-	  wordpos.getAdjectives(str, function(result) {
-		  expect(result).toEqualUnordered(expected.adjectives);
-		  asyncSpecDone();
-	  });
-	  asyncSpecWait();
+    wordpos.getAdjectives(str, function(result) {
+      expect(result).toEqualUnordered(expected.adjectives);
+      asyncSpecDone();
+    });
+    asyncSpecWait();
  });

  it('should get adverbs', function() {
-	  wordpos.getAdverbs(str, function(result) {
-		  expect(result).toEqualUnordered(expected.adverbs);
-		  asyncSpecDone();
-	  });
-	  asyncSpecWait();
+    wordpos.getAdverbs(str, function(result) {
+      expect(result).toEqualUnordered(expected.adverbs);
+      asyncSpecDone();
+    });
+    asyncSpecWait();
  });
 });

@@ -80,91 +87,91 @@ describe('is POS', function() {
    asyncSpecWait();
  });
  it('should check if verb', function() {
-	  wordpos.isVerb(expected.verbs[0], function(result) {
-		  expect(result).toBeTruthy();
-		  asyncSpecDone();
-	  });
-	  asyncSpecWait();
+    wordpos.isVerb(expected.verbs[0], function(result) {
+      expect(result).toBeTruthy();
+      asyncSpecDone();
+    });
+    asyncSpecWait();
  });
  it('should check if adjective', function() {
-	  wordpos.isAdjective(expected.adjectives[0], function(result) {
-		  expect(result).toBeTruthy();
-		  asyncSpecDone();
-	  });
-	  asyncSpecWait();
+    wordpos.isAdjective(expected.adjectives[0], function(result) {
+      expect(result).toBeTruthy();
+      asyncSpecDone();
+    });
+    asyncSpecWait();
  });
  it('should check if adverb', function() {
-	  wordpos.isAdverb(expected.adverbs[0], function(result) {
-		  expect(result).toBeTruthy();
-		  asyncSpecDone();
-	  });
-	  asyncSpecWait();
+    wordpos.isAdverb(expected.adverbs[0], function(result) {
+      expect(result).toBeTruthy();
+      asyncSpecDone();
+    });
+    asyncSpecWait();
  });
 });

 describe('is !POS', function() {
-	it('should check if !noun', function() {
-		wordpos.isNoun(garble, function(result) {
-			expect(result).not.toBeTruthy();
-			asyncSpecDone();
-		});
-		asyncSpecWait();
-	});
-	it('should check if !verb', function() {
-		wordpos.isVerb(garble, function(result) {
-			expect(result).not.toBeTruthy();
-			asyncSpecDone();
-		});
-		asyncSpecWait();
-	});
-	it('should check if !adjective', function() {
-		wordpos.isAdjective(garble, function(result) {
-			expect(result).not.toBeTruthy();
-			asyncSpecDone();
-		});
-		asyncSpecWait();
-	});
-	it('should check if !adverb', function() {
-		wordpos.isAdverb(garble, function(result) {
-			expect(result).not.toBeTruthy();
-			asyncSpecDone();
-		});
-		asyncSpecWait();
-	});
+  it('should check if !noun', function() {
+    wordpos.isNoun(garble, function(result) {
+      expect(result).not.toBeTruthy();
+      asyncSpecDone();
+    });
+    asyncSpecWait();
+  });
+  it('should check if !verb', function() {
+    wordpos.isVerb(garble, function(result) {
+      expect(result).not.toBeTruthy();
+      asyncSpecDone();
+    });
+    asyncSpecWait();
+  });
+  it('should check if !adjective', function() {
+    wordpos.isAdjective(garble, function(result) {
+      expect(result).not.toBeTruthy();
+      asyncSpecDone();
+    });
+    asyncSpecWait();
+  });
+  it('should check if !adverb', function() {
+    wordpos.isAdverb(garble, function(result) {
+      expect(result).not.toBeTruthy();
+      asyncSpecDone();
+    });
+    asyncSpecWait();
+  });
 });

 describe('lookup POS', function() {
-	it('should lookup noun', function() {
-		wordpos.lookupNoun('squirrel', function(result) {
-			expect(result[0].pos).toBe('n');
-			expect(result[0].lemma).toBe('squirrel');
-			asyncSpecDone();
-		});
-		asyncSpecWait();
-	});
-	it('should lookup verb', function() {
-		wordpos.lookupVerb('bear', function(result) {
-			expect(result[0].pos).toBe('v');
-			expect(result[0].lemma).toBe('have_a_bun_in_the_oven');
-			asyncSpecDone();
-		});
-		asyncSpecWait();
-	});
-	it('should lookup adjective', function() {
-		wordpos.lookupAdjective('angry', function(result) {
-			expect(result[0].pos).toBe('s');
-			expect(result[0].lemma).toBe('angry');
-			asyncSpecDone();
-		});
-		asyncSpecWait();
-	});
-	it('should lookup adverb', function() {
-		wordpos.lookupAdverb('little', function(result) {
-			expect(result[0].pos).toBe('r');
-			expect(result[0].lemma).toBe('little');
-			asyncSpecDone();
-		});
-		asyncSpecWait();
-	});
+  it('should lookup noun', function() {
+    wordpos.lookupNoun('squirrel', function(result) {
+      expect(result[0].pos).toBe('n');
+      expect(result[0].lemma).toBe('squirrel');
+      asyncSpecDone();
+    });
+    asyncSpecWait();
+  });
+  it('should lookup verb', function() {
+    wordpos.lookupVerb('bear', function(result) {
+      expect(result[0].pos).toBe('v');
+      expect(result[0].lemma).toBe('have_a_bun_in_the_oven');
+      asyncSpecDone();
+    });
+    asyncSpecWait();
+  });
+  it('should lookup adjective', function() {
+    wordpos.lookupAdjective('angry', function(result) {
+      expect(result[0].pos).toBe('s');
+      expect(result[0].lemma).toBe('angry');
+      asyncSpecDone();
+    });
+    asyncSpecWait();
+  });
+  it('should lookup adverb', function() {
+    wordpos.lookupAdverb('little', function(result) {
+      expect(result[0].pos).toBe('r');
+      expect(result[0].lemma).toBe('little');
+      asyncSpecDone();
+    });
+    asyncSpecWait();
+  });
 });