added package.json and cleanup

This commit is contained in:
moos 2012-05-04 12:23:28 -07:00
parent f084e31994
commit 518725c189
5 changed files with 332 additions and 293 deletions

View File

@ -30,10 +30,10 @@ Installation
Get the script `wordpos.js` and use it. (npm module may be coming.)
You may also want to manually download WordNet files from [here](http://wordnet.princeton.edu/wordnet/download/current-version/). Unpack into folder (say `dict`). [natural](http://github.com/NaturalNode/natural) will auto-download WordNet files --
but I've found this to be unreliable as some of the files get truncated, leading the core program to hang.
You may also want to manually download [WordNet files](http://wordnet.princeton.edu/wordnet/download/current-version/). Unpack them into a folder (say `dict`). [natural](http://github.com/NaturalNode/natural) will auto-download WordNet files --
but I've found this to be unreliable, as some of the files get truncated, leading the program to hang.
Note: `wordpos-bench` requires a customized [uubench](https://github.com/moos/uubench) module (forthcoming).
Note: `wordpos-bench.js` requires a [forked uubench](https://github.com/moos/uubench) module.
API
@ -48,7 +48,7 @@ WordPOS is a subclass of natural's [WordNet class](https://github.com/NaturalNod
Get POS from text.
```js
```
wordpos.getPOS(str, callback) -- callback receives a result object:
{
nouns:[], Array of str words that are nouns
@ -111,7 +111,7 @@ would be considered nouns. (see http://nltk.googlecode.com/svn/trunk/doc/book/c
Determine if a word is a particular POS.
```js
```
wordpos.isNoun(word, callback) -- callback receives result (true/false) if word is a noun.
wordpos.isVerb(word, callback) -- callback receives result (true/false) if word is a verb.
@ -142,7 +142,7 @@ wordpos.isAdverb('fishly', console.log);
These calls are similar to natural's [lookup()](https://github.com/NaturalNode/natural#wordnet) call, except they can be faster if you
already know the POS of the word.
```js
```
wordpos.lookupNoun(word, callback) -- callback receives array of lookup objects for a noun
wordpos.lookupVerb(word, callback) -- callback receives array of lookup objects for a verb
@ -185,12 +185,22 @@ Benchmark
Lookups are generally slow, as they require loading and searching large WordNet index files.
Single word lookup:
```
getPOS : 30 ops/s { iterations: 10, elapsed: 329 }
getNouns : 106 ops/s { iterations: 10, elapsed: 94 }
getVerbs : 111 ops/s { iterations: 10, elapsed: 90 }
getAdjectives : 132 ops/s { iterations: 10, elapsed: 76 }
getAdverbs : 137 ops/s { iterations: 10, elapsed: 73 }
```
getPOS : 22 ops/s { iterations: 10, elapsed: 451 }
getNouns : 66 ops/s { iterations: 10, elapsed: 152 }
getVerbs : 66 ops/s { iterations: 10, elapsed: 152 }
getAdjectives : 67 ops/s { iterations: 10, elapsed: 150 }
getAdverbs : 83 ops/s { iterations: 10, elapsed: 120 }
128-word lookup:
```
getPOS : 0 ops/s { iterations: 1, elapsed: 2210 }
getNouns : 2 ops/s { iterations: 1, elapsed: 666 }
getVerbs : 2 ops/s { iterations: 1, elapsed: 638 }
getAdjectives : 2 ops/s { iterations: 1, elapsed: 489 }
getAdverbs : 2 ops/s { iterations: 1, elapsed: 407 }
```
Benchmarks were run on a Win7/64-bit/dual-core/3GHz machine. getPOS() is slowest as it searches through all four index files.

23
package.json Normal file
View File

@ -0,0 +1,23 @@
{
"name": "wordpos",
"description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.",
"version": "0.1.0",
"homepage": "https://github.com/moos/wordpos",
"engines": {
"node": ">=0.4.10"
},
"dependencies": {
"natural": "latest",
"underscore": ">=1.3.1"
},
"devDependencies": {
"uubench": "git://github.com/moos/uubench.git"
},
"repository" : {
"type" : "git",
"url" : "http://github.com/moos/wordpos.git"
},
"author": "Moos <mooster@42at.com>",
"keywords": ["natural", "language", "wordnet", "pos"],
"main": "./wordpos.js"
}

View File

@ -1,15 +1,14 @@
var uubench = require('uubench'),
fs = require('fs'),
_ = require('underscore')._,
WordPOS = require('./wordpos'),
wordpos = new WordPOS('dict');
var uubench = require('uubench'), // from: https://github.com/moos/uubench
fs = require('fs'),
_ = require('underscore')._,
WordPOS = require('./wordpos'),
wordpos = new WordPOS('dict');
suite = new uubench.Suite({
type: 'fixed',
iterations: 10,
//delay: 750,
sync: true,
sync: true, // important!
start: function(tests){
console.log('starting %d tests', tests.length);
@ -20,7 +19,7 @@ suite = new uubench.Suite({
, ops = .5 + stats.iterations * persec;
console.log(' \033[90m%s : \033[36m%d \033[90mops/s\033[0m', name, ops | 0, stats);
pos && console.log(out(pos));
pos && console.log(out(pos));
},
done: function(time){
@ -28,65 +27,64 @@ suite = new uubench.Suite({
},
section: function(name, stats) {
console.log('\033[35m%s\033[0m',name);
console.log('\033[35m%s\033[0m',name);
}
});
function out(res){
return _(res).keys().map(function(k){ return k + ':' + res[k].length });
return _(res).keys().map(function(k){ return k + ':' + res[k].length });
}
var text1 = 'laksasdf',
text128 = fs.readFileSync('text-128.txt', 'utf8'),
text,
pos,
str = "This is some sample text. This text can contain multiple sentences. It also works with urls like.";
text128 = fs.readFileSync('text-128.txt', 'utf8'),
text,
pos;
function getPOS(next){
wordpos.getPOS(text, function(res){
pos = res;
next();
});
wordpos.getPOS(text, function(res){
pos = res;
next();
});
}
function getNouns(next){
wordpos.getNouns(text, function(res){
pos = {nouns: res};
next();
});
wordpos.getNouns(text, function(res){
pos = {nouns: res};
next();
});
}
function getVerbs(next){
wordpos.getVerbs(text, function(res){
pos = {verbs: res};
next();
});
wordpos.getVerbs(text, function(res){
pos = {verbs: res};
next();
});
}
function getAdjectives(next){
wordpos.getAdjectives(text, function(res){
pos = {adjectives: res};
next();
});
wordpos.getAdjectives(text, function(res){
pos = {adjectives: res};
next();
});
}
function getAdverbs(next){
wordpos.getAdverbs(text, function(res){
pos = {adverbs: res};
next();
});
wordpos.getAdverbs(text, function(res){
pos = {adverbs: res};
next();
});
}
/*
* one word
*/
suite.section('--1 word--', function(next){
text = text1;
next();
text = text1;
next();
});
suite.bench('getPOS', getPOS);
suite.bench('getNouns', getNouns);
@ -99,9 +97,9 @@ suite.bench('getAdverbs', getAdverbs);
* 128 words
*/
suite.section('--128 words--', function(next){
suite.options.iterations = 1;
text = text128;
next();
suite.options.iterations = 1;
text = text128;
next();
});
suite.bench('getPOS', getPOS);
suite.bench('getNouns', getNouns);

View File

@ -1,71 +1,71 @@
/*!
/**
* wordpos
*
* part-of-speech utilities using natural's wordnet module.
* Node.js part-of-speech utilities using natural's WordNet module.
*
* Copyright (c) 2012 mooster@42at.com
* Released under MIT license
*/
var _ = require('underscore')._,
util = require('util'),
natural = require('./lib/natural'),
WordNet = natural.WordNet,
tokenizer = new natural.WordTokenizer(),
stopwords = ' '+ natural.stopwords.join(' ') +' ';
util = require('util'),
natural = require('natural'),
WordNet = natural.WordNet,
tokenizer = new natural.WordTokenizer(),
stopwords = ' '+ natural.stopwords.join(' ') +' ';
/**
 * Put a word (or phrase) into lookup form: lower-cased, with every run of
 * whitespace replaced by a single underscore.
 *
 * @param {string} word - word or phrase to normalize
 * @returns {string} the lower-cased, underscore-joined form
 */
function normalize(word) {
  var lowered = word.toLowerCase();
  // Splitting on whitespace runs and re-joining with '_' yields the same
  // string as a global replace of each run with '_'.
  return lowered.split(/\s+/).join('_');
}
function isStopword(word) {
return stopwords.indexOf(' '+word+' ') >= 0;
return stopwords.indexOf(' '+word+' ') >= 0;
}
function prepText(text) {
return _.reject(_.uniq(tokenizer.tokenize(text)), isStopword);
return _.reject(_.uniq(tokenizer.tokenize(text)), isStopword);
}
function lookup(pos) {
return function(word, callback) {
word = normalize(word);
this.lookupFromFiles([
{index: this.getIndexFile(pos), data: this.getDataFile(pos)}
], [], word, callback);
};
return function(word, callback) {
word = normalize(word);
this.lookupFromFiles([
{index: this.getIndexFile(pos), data: this.getDataFile(pos)}
], [], word, callback);
};
}
function is(pos){
return function(word, callback) {
var index = this.getIndexFile(pos);
word = normalize(word);
index.lookup(word, function(record) {
callback(!!record);
});
};
return function(word, callback) {
var index = this.getIndexFile(pos);
word = normalize(word);
index.lookup(word, function(record) {
callback(!!record);
});
};
}
function get(isFn) {
return function(text, callback) {
var words = prepText(text),
n = words.length,
i = 0,
self = this,
results = [];
return function(text, callback) {
var words = prepText(text),
n = words.length,
i = 0,
self = this,
results = [];
if (!n) return callback(results);
words.forEach(function(word,j){
self[isFn](word, function(yes){
yes && results.push(word);
(++i==n) && callback(results);
});
});
};
if (!n) return callback(results);
words.forEach(function(word,j){
self[isFn](word, function(yes){
yes && results.push(word);
(++i==n) && callback(results);
});
});
};
}
var WordPOS = function() {
WordPOS.super_.apply(this, arguments);
WordPOS.super_.apply(this, arguments);
};
util.inherits(WordPOS, WordNet);
@ -111,19 +111,20 @@ wordposProto.getAdverbs = get('isAdverb');
wordposProto.getNouns = get('isNoun');
wordposProto.getVerbs = get('isVerb');
if (!wordposProto.getIndexFile)
wordposProto.getIndexFile = function getIndexFile(pos) {
switch(pos) {
case 'n':
return this.nounIndex;
case 'v':
return this.verbIndex;
case 'a': case 's':
return this.adjIndex;
case 'r':
return this.advIndex;
}
};
if (!wordposProto.getIndexFile) {
wordposProto.getIndexFile = function getIndexFile(pos) {
switch(pos) {
case 'n':
return this.nounIndex;
case 'v':
return this.verbIndex;
case 'a': case 's':
return this.adjIndex;
case 'r':
return this.advIndex;
}
};
}
/**
* getPOS()
@ -136,10 +137,10 @@ if (!wordposProto.getIndexFile)
*/
wordposProto.getPOS = function(text, callback) {
var data = {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]},
testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
parts = 'nouns verbs adjectives adverbs'.split(' '),
words = prepText(text),
nTests = testFns.length,
nTests = testFns.length,
nWords = words.length,
self = this,
c = 0;
@ -148,31 +149,31 @@ wordposProto.getPOS = function(text, callback) {
words.forEach(lookup);
function lookup(word){
var any = false,
t=0;
word = normalize(word);
testFns.forEach(lookupPOS);
var any = false,
t=0;
word = normalize(word);
testFns.forEach(lookupPOS);
function lookupPOS(isFn,i,list){
self[isFn](word, function(yes){
yes && data[parts[i]].push(word);
any |= yes;
donePOS();
});
}
function lookupPOS(isFn,i,list){
self[isFn](word, function(yes){
yes && data[parts[i]].push(word);
any |= yes;
donePOS();
});
}
function donePOS() {
if (++t == nTests) {
!any && data['rest'].push(word);
done();
}
}
function donePOS() {
if (++t == nTests) {
!any && data['rest'].push(word);
done();
}
}
}
function done(){
if (++c == nWords) {
callback(data);
}
if (++c == nWords) {
callback(data);
}
}
};

View File

@ -1,30 +1,36 @@
// npm install jasmine-node -g
// jasmine-node wordpos_spec.js --verbose
/* Note: 'dict' folder should contain WordNet files.
* Download and unpack manually from http://wordnet.princeton.edu/wordnet/download/current-version/
*/
var WordPOS = require('./wordpos'),
wordpos = new WordPOS('dict');
wordpos = new WordPOS('dict');
var str = "The angry bear chased the frightened little squirrel",
expected = {
nouns: [ 'bear', 'squirrel', 'little', 'chased' ],
verbs: [ 'bear' ],
adjectives: [ 'little', 'angry', 'frightened' ],
adverbs: [ 'little' ],
rest: [ 'the' ]
},
garble = 'garblegarble'; // expect not to find word
expected = {
nouns: [ 'bear', 'squirrel', 'little', 'chased' ],
verbs: [ 'bear' ],
adjectives: [ 'little', 'angry', 'frightened' ],
adverbs: [ 'little' ],
rest: [ 'the' ]
},
garble = 'garblegarble'; // expect not to find word
describe('get POS', function() {
beforeEach(function() {
this.addMatchers({
// unordered (multiset) comparison -- NOTE: doesn't handle deep!
toEqualUnordered: function(expected) {
var mismatchKeys=[],
mismatchValues=[],
result = this.env.compareObjects_(this.actual, expected, mismatchKeys, mismatchValues);
return result || (mismatchKeys.length == 0 && mismatchValues.length > 0);
}
});
this.addMatchers({
// unordered (multiset) comparison -- NOTE: doesn't handle deep!
toEqualUnordered: function(expected) {
var mismatchKeys=[],
mismatchValues=[],
result = this.env.compareObjects_(this.actual, expected, mismatchKeys, mismatchValues);
return result || (mismatchKeys.length == 0 && mismatchValues.length > 0);
}
});
});
it('should get all POS', function() {
@ -33,41 +39,42 @@ describe('get POS', function() {
expect(result.verbs).toEqualUnordered(expected.verbs);
expect(result.adjectives).toEqualUnordered(expected.adjectives);
expect(result.adverbs).toEqualUnordered(expected.adverbs);
expect(result.rest).toEqualUnordered(expected.rest);
asyncSpecDone();
});
asyncSpecWait();
});
it('should get nouns', function() {
wordpos.getNouns(str, function(result) {
expect(result).toEqualUnordered(expected.nouns);
asyncSpecDone();
});
asyncSpecWait();
wordpos.getNouns(str, function(result) {
expect(result).toEqualUnordered(expected.nouns);
asyncSpecDone();
});
asyncSpecWait();
});
it('should get verbs', function() {
wordpos.getVerbs(str, function(result) {
expect(result).toEqualUnordered(expected.verbs);
asyncSpecDone();
});
asyncSpecWait();
wordpos.getVerbs(str, function(result) {
expect(result).toEqualUnordered(expected.verbs);
asyncSpecDone();
});
asyncSpecWait();
});
it('should get adjectives', function() {
wordpos.getAdjectives(str, function(result) {
expect(result).toEqualUnordered(expected.adjectives);
asyncSpecDone();
});
asyncSpecWait();
wordpos.getAdjectives(str, function(result) {
expect(result).toEqualUnordered(expected.adjectives);
asyncSpecDone();
});
asyncSpecWait();
});
it('should get adverbs', function() {
wordpos.getAdverbs(str, function(result) {
expect(result).toEqualUnordered(expected.adverbs);
asyncSpecDone();
});
asyncSpecWait();
wordpos.getAdverbs(str, function(result) {
expect(result).toEqualUnordered(expected.adverbs);
asyncSpecDone();
});
asyncSpecWait();
});
});
@ -80,91 +87,91 @@ describe('is POS', function() {
asyncSpecWait();
});
it('should check if verb', function() {
wordpos.isVerb(expected.verbs[0], function(result) {
expect(result).toBeTruthy();
asyncSpecDone();
});
asyncSpecWait();
wordpos.isVerb(expected.verbs[0], function(result) {
expect(result).toBeTruthy();
asyncSpecDone();
});
asyncSpecWait();
});
it('should check if adjective', function() {
wordpos.isAdjective(expected.adjectives[0], function(result) {
expect(result).toBeTruthy();
asyncSpecDone();
});
asyncSpecWait();
wordpos.isAdjective(expected.adjectives[0], function(result) {
expect(result).toBeTruthy();
asyncSpecDone();
});
asyncSpecWait();
});
it('should check if adverb', function() {
wordpos.isAdverb(expected.adverbs[0], function(result) {
expect(result).toBeTruthy();
asyncSpecDone();
});
asyncSpecWait();
wordpos.isAdverb(expected.adverbs[0], function(result) {
expect(result).toBeTruthy();
asyncSpecDone();
});
asyncSpecWait();
});
});
describe('is !POS', function() {
it('should check if !noun', function() {
wordpos.isNoun(garble, function(result) {
expect(result).not.toBeTruthy();
asyncSpecDone();
});
asyncSpecWait();
});
it('should check if !verb', function() {
wordpos.isVerb(garble, function(result) {
expect(result).not.toBeTruthy();
asyncSpecDone();
});
asyncSpecWait();
});
it('should check if !adjective', function() {
wordpos.isAdjective(garble, function(result) {
expect(result).not.toBeTruthy();
asyncSpecDone();
});
asyncSpecWait();
});
it('should check if !adverb', function() {
wordpos.isAdverb(garble, function(result) {
expect(result).not.toBeTruthy();
asyncSpecDone();
});
asyncSpecWait();
});
it('should check if !noun', function() {
wordpos.isNoun(garble, function(result) {
expect(result).not.toBeTruthy();
asyncSpecDone();
});
asyncSpecWait();
});
it('should check if !verb', function() {
wordpos.isVerb(garble, function(result) {
expect(result).not.toBeTruthy();
asyncSpecDone();
});
asyncSpecWait();
});
it('should check if !adjective', function() {
wordpos.isAdjective(garble, function(result) {
expect(result).not.toBeTruthy();
asyncSpecDone();
});
asyncSpecWait();
});
it('should check if !adverb', function() {
wordpos.isAdverb(garble, function(result) {
expect(result).not.toBeTruthy();
asyncSpecDone();
});
asyncSpecWait();
});
});
describe('lookup POS', function() {
it('should lookup noun', function() {
wordpos.lookupNoun('squirrel', function(result) {
expect(result[0].pos).toBe('n');
expect(result[0].lemma).toBe('squirrel');
asyncSpecDone();
});
asyncSpecWait();
});
it('should lookup verb', function() {
wordpos.lookupVerb('bear', function(result) {
expect(result[0].pos).toBe('v');
expect(result[0].lemma).toBe('have_a_bun_in_the_oven');
asyncSpecDone();
});
asyncSpecWait();
});
it('should lookup adjective', function() {
wordpos.lookupAdjective('angry', function(result) {
expect(result[0].pos).toBe('s');
expect(result[0].lemma).toBe('angry');
asyncSpecDone();
});
asyncSpecWait();
});
it('should lookup adverb', function() {
wordpos.lookupAdverb('little', function(result) {
expect(result[0].pos).toBe('r');
expect(result[0].lemma).toBe('little');
asyncSpecDone();
});
asyncSpecWait();
});
it('should lookup noun', function() {
wordpos.lookupNoun('squirrel', function(result) {
expect(result[0].pos).toBe('n');
expect(result[0].lemma).toBe('squirrel');
asyncSpecDone();
});
asyncSpecWait();
});
it('should lookup verb', function() {
wordpos.lookupVerb('bear', function(result) {
expect(result[0].pos).toBe('v');
expect(result[0].lemma).toBe('have_a_bun_in_the_oven');
asyncSpecDone();
});
asyncSpecWait();
});
it('should lookup adjective', function() {
wordpos.lookupAdjective('angry', function(result) {
expect(result[0].pos).toBe('s');
expect(result[0].lemma).toBe('angry');
asyncSpecDone();
});
asyncSpecWait();
});
it('should lookup adverb', function() {
wordpos.lookupAdverb('little', function(result) {
expect(result[0].pos).toBe('r');
expect(result[0].lemma).toBe('little');
asyncSpecDone();
});
asyncSpecWait();
});
});