added package.json and cleanup

This commit is contained in:
moos 2012-05-04 12:23:28 -07:00
parent f084e31994
commit 518725c189
5 changed files with 332 additions and 293 deletions

View File

@ -30,10 +30,10 @@ Installation
Get the script `wordpos.js` and use it. (npm module may be coming.) Get the script `wordpos.js` and use it. (npm module may be coming.)
You may also want to manually download WordNet files from [here](http://wordnet.princeton.edu/wordnet/download/current-version/). Unpack into folder (say `dict`). [natural](http://github.com/NaturalNode/natural) will auto-download WordNet files -- You may also want to manually download [WordNet files](http://wordnet.princeton.edu/wordnet/download/current-version/). Unpack into folder (say `dict`). [natural](http://github.com/NaturalNode/natural) will auto-download WordNet files --
but I've found this to be unreliable as some of the files get truncated, leading the core program to hang. but I've found this to be unreliable as some of the files get truncated, leading the program to hang.
Note: `wordpos-bench` requires a customized [uubench](https://github.com/moos/uubench) module (forthcoming). Note: `wordpos-bench.js` requires a [forked uubench](https://github.com/moos/uubench) module.
API API
@ -48,7 +48,7 @@ WordPOS is a subclass of natural's [WordNet class](https://github.com/NaturalNod
Get POS from text. Get POS from text.
```js ```
wordpos.getPOS(str, callback) -- callback receives a result object: wordpos.getPOS(str, callback) -- callback receives a result object:
{ {
nouns:[], Array of str words that are nouns nouns:[], Array of str words that are nouns
@ -111,7 +111,7 @@ would be considered nouns. (see http://nltk.googlecode.com/svn/trunk/doc/book/c
Determine if a word is a particular POS. Determine if a word is a particular POS.
```js ```
wordpos.isNoun(word, callback) -- callback receives result (true/false) if word is a noun. wordpos.isNoun(word, callback) -- callback receives result (true/false) if word is a noun.
wordpos.isVerb(word, callback) -- callback receives result (true/false) if word is a verb. wordpos.isVerb(word, callback) -- callback receives result (true/false) if word is a verb.
@ -142,7 +142,7 @@ wordpos.isAdverb('fishly', console.log);
These calls are similar to natural's [lookup()](https://github.com/NaturalNode/natural#wordnet) call, except they can be faster if you These calls are similar to natural's [lookup()](https://github.com/NaturalNode/natural#wordnet) call, except they can be faster if you
already know the POS of the word. already know the POS of the word.
```js ```
wordpos.lookupNoun(word, callback) -- callback receives array of lookup objects for a noun wordpos.lookupNoun(word, callback) -- callback receives array of lookup objects for a noun
wordpos.lookupVerb(word, callback) -- callback receives array of lookup objects for a verb wordpos.lookupVerb(word, callback) -- callback receives array of lookup objects for a verb
@ -185,12 +185,22 @@ Benchmark
Generally slow as it requires loading and searching large WordNet index files. Generally slow as it requires loading and searching large WordNet index files.
Single word lookup: Single word lookup:
```
getPOS : 30 ops/s { iterations: 10, elapsed: 329 }
getNouns : 106 ops/s { iterations: 10, elapsed: 94 }
getVerbs : 111 ops/s { iterations: 10, elapsed: 90 }
getAdjectives : 132 ops/s { iterations: 10, elapsed: 76 }
getAdverbs : 137 ops/s { iterations: 10, elapsed: 73 }
```
getPOS : 22 ops/s { iterations: 10, elapsed: 451 } 128-word lookup:
getNouns : 66 ops/s { iterations: 10, elapsed: 152 } ```
getVerbs : 66 ops/s { iterations: 10, elapsed: 152 } getPOS : 0 ops/s { iterations: 1, elapsed: 2210 }
getAdjectives : 67 ops/s { iterations: 10, elapsed: 150 } getNouns : 2 ops/s { iterations: 1, elapsed: 666 }
getAdverbs : 83 ops/s { iterations: 10, elapsed: 120 } getVerbs : 2 ops/s { iterations: 1, elapsed: 638 }
getAdjectives : 2 ops/s { iterations: 1, elapsed: 489 }
getAdverbs : 2 ops/s { iterations: 1, elapsed: 407 }
```
On a win7/64-bit/dual-core/3GHz. getPOS() is slowest as it searches through all four index files. On a win7/64-bit/dual-core/3GHz. getPOS() is slowest as it searches through all four index files.

23
package.json Normal file
View File

@ -0,0 +1,23 @@
{
"name": "wordpos",
"description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.",
"version": "0.1.0",
"homepage": "https://github.com/moos/wordpos",
"engines": {
"node": ">=0.4.10"
},
"dependencies": {
"natural": "latest",
"underscore": ">=1.3.1"
},
"devDependencies": {
"uubench": "git://github.com/moos/uubench.git"
},
"repository" : {
"type" : "git",
"url" : "http://github.com/moos/wordpos.git"
},
"author": "Moos <mooster@42at.com>",
"keywords": ["natural", "language", "wordnet", "pos"],
"main": "./wordpos.js"
}

View File

@ -1,15 +1,14 @@
var uubench = require('uubench'), var uubench = require('uubench'), // from: https://github.com/moos/uubench
fs = require('fs'), fs = require('fs'),
_ = require('underscore')._, _ = require('underscore')._,
WordPOS = require('./wordpos'), WordPOS = require('./wordpos'),
wordpos = new WordPOS('dict'); wordpos = new WordPOS('dict');
suite = new uubench.Suite({ suite = new uubench.Suite({
type: 'fixed', type: 'fixed',
iterations: 10, iterations: 10,
//delay: 750, sync: true, // important!
sync: true,
start: function(tests){ start: function(tests){
console.log('starting %d tests', tests.length); console.log('starting %d tests', tests.length);
@ -20,7 +19,7 @@ suite = new uubench.Suite({
, ops = .5 + stats.iterations * persec; , ops = .5 + stats.iterations * persec;
console.log(' \033[90m%s : \033[36m%d \033[90mops/s\033[0m', name, ops | 0, stats); console.log(' \033[90m%s : \033[36m%d \033[90mops/s\033[0m', name, ops | 0, stats);
pos && console.log(out(pos)); pos && console.log(out(pos));
}, },
done: function(time){ done: function(time){
@ -28,65 +27,64 @@ suite = new uubench.Suite({
}, },
section: function(name, stats) { section: function(name, stats) {
console.log('\033[35m%s\033[0m',name); console.log('\033[35m%s\033[0m',name);
} }
}); });
function out(res){ function out(res){
return _(res).keys().map(function(k){ return k + ':' + res[k].length }); return _(res).keys().map(function(k){ return k + ':' + res[k].length });
} }
var text1 = 'laksasdf', var text1 = 'laksasdf',
text128 = fs.readFileSync('text-128.txt', 'utf8'), text128 = fs.readFileSync('text-128.txt', 'utf8'),
text, text,
pos, pos;
str = "This is some sample text. This text can contain multiple sentences. It also works with urls like.";
function getPOS(next){ function getPOS(next){
wordpos.getPOS(text, function(res){ wordpos.getPOS(text, function(res){
pos = res; pos = res;
next(); next();
}); });
} }
function getNouns(next){ function getNouns(next){
wordpos.getNouns(text, function(res){ wordpos.getNouns(text, function(res){
pos = {nouns: res}; pos = {nouns: res};
next(); next();
}); });
} }
function getVerbs(next){ function getVerbs(next){
wordpos.getVerbs(text, function(res){ wordpos.getVerbs(text, function(res){
pos = {verbs: res}; pos = {verbs: res};
next(); next();
}); });
} }
function getAdjectives(next){ function getAdjectives(next){
wordpos.getAdjectives(text, function(res){ wordpos.getAdjectives(text, function(res){
pos = {adjectives: res}; pos = {adjectives: res};
next(); next();
}); });
} }
function getAdverbs(next){ function getAdverbs(next){
wordpos.getAdverbs(text, function(res){ wordpos.getAdverbs(text, function(res){
pos = {adverbs: res}; pos = {adverbs: res};
next(); next();
}); });
} }
/* /*
* one word * one word
*/ */
suite.section('--1 word--', function(next){ suite.section('--1 word--', function(next){
text = text1; text = text1;
next(); next();
}); });
suite.bench('getPOS', getPOS); suite.bench('getPOS', getPOS);
suite.bench('getNouns', getNouns); suite.bench('getNouns', getNouns);
@ -99,9 +97,9 @@ suite.bench('getAdverbs', getAdverbs);
* 128 words * 128 words
*/ */
suite.section('--128 words--', function(next){ suite.section('--128 words--', function(next){
suite.options.iterations = 1; suite.options.iterations = 1;
text = text128; text = text128;
next(); next();
}); });
suite.bench('getPOS', getPOS); suite.bench('getPOS', getPOS);
suite.bench('getNouns', getNouns); suite.bench('getNouns', getNouns);

View File

@ -1,71 +1,71 @@
/*! /**
* wordpos * wordpos
* *
* part-of-speech utilities using natural's wordnet module. * Node.js part-of-speech utilities using natural's WordNet module.
* *
* Copyright (c) 2012 mooster@42at.com * Copyright (c) 2012 mooster@42at.com
* Released under MIT license * Released under MIT license
*/ */
var _ = require('underscore')._, var _ = require('underscore')._,
util = require('util'), util = require('util'),
natural = require('./lib/natural'), natural = require('natural'),
WordNet = natural.WordNet, WordNet = natural.WordNet,
tokenizer = new natural.WordTokenizer(), tokenizer = new natural.WordTokenizer(),
stopwords = ' '+ natural.stopwords.join(' ') +' '; stopwords = ' '+ natural.stopwords.join(' ') +' ';
function normalize(word) { function normalize(word) {
return word.toLowerCase().replace(/\s+/g, '_'); return word.toLowerCase().replace(/\s+/g, '_');
} }
function isStopword(word) { function isStopword(word) {
return stopwords.indexOf(' '+word+' ') >= 0; return stopwords.indexOf(' '+word+' ') >= 0;
} }
function prepText(text) { function prepText(text) {
return _.reject(_.uniq(tokenizer.tokenize(text)), isStopword); return _.reject(_.uniq(tokenizer.tokenize(text)), isStopword);
} }
function lookup(pos) { function lookup(pos) {
return function(word, callback) { return function(word, callback) {
word = normalize(word); word = normalize(word);
this.lookupFromFiles([ this.lookupFromFiles([
{index: this.getIndexFile(pos), data: this.getDataFile(pos)} {index: this.getIndexFile(pos), data: this.getDataFile(pos)}
], [], word, callback); ], [], word, callback);
}; };
} }
function is(pos){ function is(pos){
return function(word, callback) { return function(word, callback) {
var index = this.getIndexFile(pos); var index = this.getIndexFile(pos);
word = normalize(word); word = normalize(word);
index.lookup(word, function(record) { index.lookup(word, function(record) {
callback(!!record); callback(!!record);
}); });
}; };
} }
function get(isFn) { function get(isFn) {
return function(text, callback) { return function(text, callback) {
var words = prepText(text), var words = prepText(text),
n = words.length, n = words.length,
i = 0, i = 0,
self = this, self = this,
results = []; results = [];
if (!n) return callback(results); if (!n) return callback(results);
words.forEach(function(word,j){ words.forEach(function(word,j){
self[isFn](word, function(yes){ self[isFn](word, function(yes){
yes && results.push(word); yes && results.push(word);
(++i==n) && callback(results); (++i==n) && callback(results);
}); });
}); });
}; };
} }
var WordPOS = function() { var WordPOS = function() {
WordPOS.super_.apply(this, arguments); WordPOS.super_.apply(this, arguments);
}; };
util.inherits(WordPOS, WordNet); util.inherits(WordPOS, WordNet);
@ -111,19 +111,20 @@ wordposProto.getAdverbs = get('isAdverb');
wordposProto.getNouns = get('isNoun'); wordposProto.getNouns = get('isNoun');
wordposProto.getVerbs = get('isVerb'); wordposProto.getVerbs = get('isVerb');
if (!wordposProto.getIndexFile) if (!wordposProto.getIndexFile) {
wordposProto.getIndexFile = function getIndexFile(pos) { wordposProto.getIndexFile = function getIndexFile(pos) {
switch(pos) { switch(pos) {
case 'n': case 'n':
return this.nounIndex; return this.nounIndex;
case 'v': case 'v':
return this.verbIndex; return this.verbIndex;
case 'a': case 's': case 'a': case 's':
return this.adjIndex; return this.adjIndex;
case 'r': case 'r':
return this.advIndex; return this.advIndex;
} }
}; };
}
/** /**
* getPOS() * getPOS()
@ -136,10 +137,10 @@ if (!wordposProto.getIndexFile)
*/ */
wordposProto.getPOS = function(text, callback) { wordposProto.getPOS = function(text, callback) {
var data = {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}, var data = {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]},
testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '), testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
parts = 'nouns verbs adjectives adverbs'.split(' '), parts = 'nouns verbs adjectives adverbs'.split(' '),
words = prepText(text), words = prepText(text),
nTests = testFns.length, nTests = testFns.length,
nWords = words.length, nWords = words.length,
self = this, self = this,
c = 0; c = 0;
@ -148,31 +149,31 @@ wordposProto.getPOS = function(text, callback) {
words.forEach(lookup); words.forEach(lookup);
function lookup(word){ function lookup(word){
var any = false, var any = false,
t=0; t=0;
word = normalize(word); word = normalize(word);
testFns.forEach(lookupPOS); testFns.forEach(lookupPOS);
function lookupPOS(isFn,i,list){ function lookupPOS(isFn,i,list){
self[isFn](word, function(yes){ self[isFn](word, function(yes){
yes && data[parts[i]].push(word); yes && data[parts[i]].push(word);
any |= yes; any |= yes;
donePOS(); donePOS();
}); });
} }
function donePOS() { function donePOS() {
if (++t == nTests) { if (++t == nTests) {
!any && data['rest'].push(word); !any && data['rest'].push(word);
done(); done();
} }
} }
} }
function done(){ function done(){
if (++c == nWords) { if (++c == nWords) {
callback(data); callback(data);
} }
} }
}; };

View File

@ -1,30 +1,36 @@
// npm install jasmine-node -g
// jasmine-node wordpos_spec.js --verbose
/* Note: 'dict' folder should contain WordNet files.
* Download and unpack manually from http://wordnet.princeton.edu/wordnet/download/current-version/
*/
var WordPOS = require('./wordpos'), var WordPOS = require('./wordpos'),
wordpos = new WordPOS('dict'); wordpos = new WordPOS('dict');
var str = "The angry bear chased the frightened little squirrel", var str = "The angry bear chased the frightened little squirrel",
expected = { expected = {
nouns: [ 'bear', 'squirrel', 'little', 'chased' ], nouns: [ 'bear', 'squirrel', 'little', 'chased' ],
verbs: [ 'bear' ], verbs: [ 'bear' ],
adjectives: [ 'little', 'angry', 'frightened' ], adjectives: [ 'little', 'angry', 'frightened' ],
adverbs: [ 'little' ], adverbs: [ 'little' ],
rest: [ 'the' ] rest: [ 'the' ]
}, },
garble = 'garblegarble'; // expect not to find word garble = 'garblegarble'; // expect not to find word
describe('get POS', function() { describe('get POS', function() {
beforeEach(function() { beforeEach(function() {
this.addMatchers({ this.addMatchers({
// unordered (multiset) comparison -- NOTE: doesn't handle deep! // unordered (multiset) comparison -- NOTE: doesn't handle deep!
toEqualUnordered: function(expected) { toEqualUnordered: function(expected) {
var mismatchKeys=[], var mismatchKeys=[],
mismatchValues=[], mismatchValues=[],
result = this.env.compareObjects_(this.actual, expected, mismatchKeys, mismatchValues); result = this.env.compareObjects_(this.actual, expected, mismatchKeys, mismatchValues);
return result || (mismatchKeys.length == 0 && mismatchValues.length > 0); return result || (mismatchKeys.length == 0 && mismatchValues.length > 0);
} }
}); });
}); });
it('should get all POS', function() { it('should get all POS', function() {
@ -33,41 +39,42 @@ describe('get POS', function() {
expect(result.verbs).toEqualUnordered(expected.verbs); expect(result.verbs).toEqualUnordered(expected.verbs);
expect(result.adjectives).toEqualUnordered(expected.adjectives); expect(result.adjectives).toEqualUnordered(expected.adjectives);
expect(result.adverbs).toEqualUnordered(expected.adverbs); expect(result.adverbs).toEqualUnordered(expected.adverbs);
expect(result.rest).toEqualUnordered(expected.rest);
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
it('should get nouns', function() { it('should get nouns', function() {
wordpos.getNouns(str, function(result) { wordpos.getNouns(str, function(result) {
expect(result).toEqualUnordered(expected.nouns); expect(result).toEqualUnordered(expected.nouns);
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
it('should get verbs', function() { it('should get verbs', function() {
wordpos.getVerbs(str, function(result) { wordpos.getVerbs(str, function(result) {
expect(result).toEqualUnordered(expected.verbs); expect(result).toEqualUnordered(expected.verbs);
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
it('should get adjectives', function() { it('should get adjectives', function() {
wordpos.getAdjectives(str, function(result) { wordpos.getAdjectives(str, function(result) {
expect(result).toEqualUnordered(expected.adjectives); expect(result).toEqualUnordered(expected.adjectives);
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
it('should get adverbs', function() { it('should get adverbs', function() {
wordpos.getAdverbs(str, function(result) { wordpos.getAdverbs(str, function(result) {
expect(result).toEqualUnordered(expected.adverbs); expect(result).toEqualUnordered(expected.adverbs);
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
}); });
@ -80,91 +87,91 @@ describe('is POS', function() {
asyncSpecWait(); asyncSpecWait();
}); });
it('should check if verb', function() { it('should check if verb', function() {
wordpos.isVerb(expected.verbs[0], function(result) { wordpos.isVerb(expected.verbs[0], function(result) {
expect(result).toBeTruthy(); expect(result).toBeTruthy();
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
it('should check if adjective', function() { it('should check if adjective', function() {
wordpos.isAdjective(expected.adjectives[0], function(result) { wordpos.isAdjective(expected.adjectives[0], function(result) {
expect(result).toBeTruthy(); expect(result).toBeTruthy();
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
it('should check if adverb', function() { it('should check if adverb', function() {
wordpos.isAdverb(expected.adverbs[0], function(result) { wordpos.isAdverb(expected.adverbs[0], function(result) {
expect(result).toBeTruthy(); expect(result).toBeTruthy();
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
}); });
describe('is !POS', function() { describe('is !POS', function() {
it('should check if !noun', function() { it('should check if !noun', function() {
wordpos.isNoun(garble, function(result) { wordpos.isNoun(garble, function(result) {
expect(result).not.toBeTruthy(); expect(result).not.toBeTruthy();
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
it('should check if !verb', function() { it('should check if !verb', function() {
wordpos.isVerb(garble, function(result) { wordpos.isVerb(garble, function(result) {
expect(result).not.toBeTruthy(); expect(result).not.toBeTruthy();
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
it('should check if !adjective', function() { it('should check if !adjective', function() {
wordpos.isAdjective(garble, function(result) { wordpos.isAdjective(garble, function(result) {
expect(result).not.toBeTruthy(); expect(result).not.toBeTruthy();
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
it('should check if !adverb', function() { it('should check if !adverb', function() {
wordpos.isAdverb(garble, function(result) { wordpos.isAdverb(garble, function(result) {
expect(result).not.toBeTruthy(); expect(result).not.toBeTruthy();
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
}); });
describe('lookup POS', function() { describe('lookup POS', function() {
it('should lookup noun', function() { it('should lookup noun', function() {
wordpos.lookupNoun('squirrel', function(result) { wordpos.lookupNoun('squirrel', function(result) {
expect(result[0].pos).toBe('n'); expect(result[0].pos).toBe('n');
expect(result[0].lemma).toBe('squirrel'); expect(result[0].lemma).toBe('squirrel');
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
it('should lookup verb', function() { it('should lookup verb', function() {
wordpos.lookupVerb('bear', function(result) { wordpos.lookupVerb('bear', function(result) {
expect(result[0].pos).toBe('v'); expect(result[0].pos).toBe('v');
expect(result[0].lemma).toBe('have_a_bun_in_the_oven'); expect(result[0].lemma).toBe('have_a_bun_in_the_oven');
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
it('should lookup adjective', function() { it('should lookup adjective', function() {
wordpos.lookupAdjective('angry', function(result) { wordpos.lookupAdjective('angry', function(result) {
expect(result[0].pos).toBe('s'); expect(result[0].pos).toBe('s');
expect(result[0].lemma).toBe('angry'); expect(result[0].lemma).toBe('angry');
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
it('should lookup adverb', function() { it('should lookup adverb', function() {
wordpos.lookupAdverb('little', function(result) { wordpos.lookupAdverb('little', function(result) {
expect(result[0].pos).toBe('r'); expect(result[0].pos).toBe('r');
expect(result[0].lemma).toBe('little'); expect(result[0].lemma).toBe('little');
asyncSpecDone(); asyncSpecDone();
}); });
asyncSpecWait(); asyncSpecWait();
}); });
}); });