initial checkin

This commit is contained in:
moos 2012-05-02 16:18:10 -07:00
commit 1ea3887687
4 changed files with 509 additions and 0 deletions

47
README.md Normal file
View File

@ -0,0 +1,47 @@
wordpos
=======
wordpos is a set of part-of-speech utilities using [natural's](http://github.com/NaturalNode/natural) WordNet module.
Installation
------------
Get the script and use it. (npm module may be coming.)
Usage
----------
var WordPOS = require('./wordpos'),
wordpos = new WordPOS('dict');
wordpos.getAdjectives('The angry bear chased the frightened little squirrel.', function(results){
console.log(results);
});
// [ 'little', 'angry', 'frightened' ]
See wordpos_spec.js for full usage.
License
-------
Copyright (c) 2012, mooster@42at.com
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

113
wordpos-bench.js Normal file
View File

@ -0,0 +1,113 @@
var uubench = require('uubench'),
fs = require('fs'),
_ = require('underscore')._,
WordPOS = require('./wordpos'),
wordpos = new WordPOS('dict');
/**
 * Benchmark suite configuration.
 *
 * Declared with `var`: the preceding `var` chain is already terminated by a
 * semicolon, so a bare `suite = ...` would create an implicit global (and
 * throw a ReferenceError in strict mode).
 *
 * The ANSI colour codes use `\x1b` instead of the legacy octal escape; both
 * denote the same ESC character, but octal escapes are rejected in strict
 * mode and in ES modules.
 */
var suite = new uubench.Suite({
  type: 'fixed',
  iterations: 10,
  //delay: 750,
  sync: true,

  // logs how many benchmarks were registered
  start: function(tests){
    console.log('starting %d tests', tests.length);
  },

  // logs the throughput of one benchmark, then a summary of its last result
  result: function(name, stats){
    var persec = 1000 / stats.elapsed
      , ops = .5 + stats.iterations * persec; // +.5 so that `| 0` below rounds to nearest
    console.log(' \x1b[90m%s : \x1b[36m%d \x1b[90mops/s\x1b[0m', name, ops | 0, stats);
    // `pos` is set by the benchmark functions; print "key:count" for each of its lists
    pos && console.log(out(pos));
  },

  // logs the total run time
  done: function(time){
    console.log('done in %d msecs', time );
  },

  // logs a coloured section heading
  section: function(name, stats) {
    console.log('\x1b[35m%s\x1b[0m',name);
  }
});
/**
 * Summarize a result object for logging.
 *
 * @param {Object} res - object whose values each expose a `length`
 * @return {Array} one "key:length" string per key of `res`
 */
function out(res){
  var summary = [];
  _.keys(res).forEach(function(key){
    summary.push(key + ':' + res[key].length);
  });
  return summary;
}
// input used by the '--1 word--' section
var text1 = 'laksasdf';
// input used by the '--128 words--' section, read synchronously at startup
var text128 = fs.readFileSync('text-128.txt', 'utf8');
// input for the section currently running; assigned by the section callbacks
var text;
// result of the most recent benchmarked call; printed by the suite's `result` reporter
var pos;
// sample sentence (not referenced by the benchmarks in this file)
var str = "This is some sample text. This text can contain multiple sentences. It also works with urls like.";
/**
 * Benchmark body: classify the current `text` into all parts of speech.
 * Stores the result in `pos` and then signals completion through `next`.
 */
function getPOS(next){
  function onTagged(tagged){
    pos = tagged;
    next();
  }
  wordpos.getPOS(text, onTagged);
}
/**
 * Benchmark body: collect the nouns of the current `text`.
 * Stores them in `pos` under `nouns`, then signals completion through `next`.
 */
function getNouns(next){
  function onNouns(found){
    pos = {nouns: found};
    next();
  }
  wordpos.getNouns(text, onNouns);
}
/**
 * Benchmark body: collect the verbs of the current `text`.
 * Stores them in `pos` under `verbs`, then signals completion through `next`.
 */
function getVerbs(next){
  function onVerbs(found){
    pos = {verbs: found};
    next();
  }
  wordpos.getVerbs(text, onVerbs);
}
/**
 * Benchmark body: collect the adjectives of the current `text`.
 * Stores them in `pos` under `adjectives`, then signals completion through `next`.
 */
function getAdjectives(next){
  function onAdjectives(found){
    pos = {adjectives: found};
    next();
  }
  wordpos.getAdjectives(text, onAdjectives);
}
/**
 * Benchmark body: collect the adverbs of the current `text`.
 * Stores them in `pos` under `adverbs`, then signals completion through `next`.
 */
function getAdverbs(next){
  function onAdverbs(found){
    pos = {adverbs: found};
    next();
  }
  wordpos.getAdverbs(text, onAdverbs);
}
/*
 * Section: one word.
 * The section callback points `text` at the single-word input before the
 * benchmarks registered below are run.
 */
suite.section('--1 word--', function(next){
  text = text1;
  next();
});
// register the benchmarks in a fixed order: [display name, benchmark function]
[
  ['getPOS', getPOS],
  ['getNouns', getNouns],
  ['getVerbs', getVerbs],
  ['getAdjectives', getAdjectives],
  ['getAdverbs', getAdverbs]
].forEach(function(entry){
  suite.bench(entry[0], entry[1]);
});
/*
 * Section: 128 words.
 * The section callback drops the iteration count to 1 and points `text` at
 * the contents of text-128.txt before the benchmarks registered below are run.
 */
suite.section('--128 words--', function(next){
  suite.options.iterations = 1;
  text = text128;
  next();
});
// register the benchmarks in a fixed order: [display name, benchmark function]
[
  ['getPOS', getPOS],
  ['getNouns', getNouns],
  ['getVerbs', getVerbs],
  ['getAdjectives', getAdjectives],
  ['getAdverbs', getAdverbs]
].forEach(function(entry){
  suite.bench(entry[0], entry[1]);
});
// start the suite
suite.run();

179
wordpos.js Normal file
View File

@ -0,0 +1,179 @@
/*!
* wordpos
*
* part-of-speech utilities using natural's wordnet module.
*
* Copyright (c) 2012 mooster@42at.com
* Released under MIT license
*/
// Module dependencies and shared module-level state.
var _ = require('underscore')._,
util = require('util'),
natural = require('./lib/natural'),
WordNet = natural.WordNet,
// single tokenizer instance, used by prepText() to split text into words
tokenizer = new natural.WordTokenizer(),
// all stopwords joined into one string, padded with a space on both ends so
// isStopword() can match whole words with indexOf(' ' + word + ' ')
stopwords = ' '+ natural.stopwords.join(' ') +' ';
/**
 * Normalize a word for lookup: lower-case it and turn every run of
 * whitespace into a single underscore.
 *
 * @param {string} word - word or phrase to normalize
 * @return {string} normalized form
 */
function normalize(word) {
  var lowered = word.toLowerCase(),
      pieces = lowered.split(/\s+/);
  return pieces.join('_');
}
/**
 * Test whether a word appears in the stopword list.
 * The match is exact (case-sensitive) against the space-delimited
 * `stopwords` string.
 *
 * @param {string} word - word to test
 * @return {boolean} true if the word is a stopword
 */
function isStopword(word) {
  var needle = ' ' + word + ' ';
  return stopwords.indexOf(needle) !== -1;
}
/**
 * Prepare text for POS testing: tokenize it, drop duplicate tokens,
 * then drop stopwords.
 *
 * @param {string} text - text to split into candidate words
 * @return {Array} unique, non-stopword tokens
 */
function prepText(text) {
  var tokens = tokenizer.tokenize(text),
      distinct = _.uniq(tokens);
  return _.reject(distinct, isStopword);
}
/**
 * Build a lookup method bound to a single part of speech.
 *
 * The returned function is meant to be installed on the prototype: it reads
 * the index and data files for `pos` from `this` and searches only that pair.
 *
 * @param {string} pos - POS code passed to getIndexFile()/getDataFile()
 * @return {function} function(word, callback)
 */
function lookup(pos) {
  return function(word, callback) {
    var lemma = normalize(word),
        files = [{
          index: this.getIndexFile(pos),
          data: this.getDataFile(pos)
        }];
    this.lookupFromFiles(files, [], lemma, callback);
  };
}
/**
 * Build a membership test bound to a single part of speech.
 *
 * The returned function is meant to be installed on the prototype: it looks
 * the normalized word up in the index file for `pos` and reports whether a
 * record was found.
 *
 * @param {string} pos - POS code passed to getIndexFile()
 * @return {function} function(word, callback); callback receives a boolean
 */
function is(pos){
  return function(word, callback) {
    var indexFile = this.getIndexFile(pos),
        key = normalize(word);
    indexFile.lookup(key, function(record) {
      callback(Boolean(record));
    });
  };
}
/**
 * Build a "collect all words of a given POS" method.
 *
 * The returned function is meant to be installed on the prototype: it runs
 * the instance method named `isFn` on every prepared word of the text and
 * hands the matching words to the callback once every test has answered.
 * Matches are appended in the order their tests answer.
 *
 * @param {string} isFn - name of the test method on `this` (e.g. 'isNoun')
 * @return {function} function(text, callback); callback receives an array of words
 */
function get(isFn) {
  return function(text, callback) {
    var candidates = prepText(text),
        total = candidates.length,
        matches = [],
        answered = 0,
        instance = this;

    // nothing to test: answer right away with the empty list
    if (total === 0) return callback(matches);

    candidates.forEach(function(candidate){
      instance[isFn](candidate, function(isMatch){
        if (isMatch) matches.push(candidate);
        answered += 1;
        if (answered === total) callback(matches);
      });
    });
  };
}
/**
 * WordPOS constructor.
 * Forwards all arguments to the parent constructor that util.inherits()
 * records as `super_` (WordNet).
 */
function WordPOS() {
  WordPOS.super_.apply(this, arguments);
}
util.inherits(WordPOS, WordNet);

// shorthand for attaching the POS methods below
var wordposProto = WordPOS.prototype;
// Fast POS lookups: each method searches only the index/data file pair of one POS.
/**
 * lookupX()
 * Look up a word's definitions when its POS is already known.
 * The word is normalized (lower-cased, whitespace runs replaced by '_') before the lookup.
 *
 * @param string word - word to look up in the given POS
 * @param function callback - receives an array of definition objects (empty if none found)
 * @return none
 */
wordposProto.lookupAdjective = lookup('a');
wordposProto.lookupAdverb = lookup('r');
wordposProto.lookupNoun = lookup('n');
wordposProto.lookupVerb = lookup('v');
/**
 * isX()
 * Test whether a word belongs to the given POS.
 * The word is normalized (lower-cased, whitespace runs replaced by '_') and
 * looked up in the index file of that POS only.
 *
 * @param string word - word to test for the given POS
 * @param function callback - receives true if an index record was found, false otherwise
 * @return none
 */
wordposProto.isAdjective = is('a');
wordposProto.isAdverb = is('r');
wordposProto.isNoun = is('n');
wordposProto.isVerb = is('v');
/**
 * getX()
 * Find all words in a string that belong to the given POS.
 * The text is tokenized, and duplicate tokens and stopwords are dropped,
 * before each remaining word is tested with the matching isX() method.
 * Words are appended as their tests answer, so the result order is not guaranteed.
 *
 * @param string text - text whose words are to be searched
 * @param function callback - receives an array of the words that are the given POS
 * @return none
 */
wordposProto.getAdjectives = get('isAdjective');
wordposProto.getAdverbs = get('isAdverb');
wordposProto.getNouns = get('isNoun');
wordposProto.getVerbs = get('isVerb');
// Provide getIndexFile() only when the prototype chain does not already supply one.
if (!wordposProto.getIndexFile) {
  /**
   * Map a POS code to the corresponding index file of this instance.
   *
   * @param {string} pos - 'n', 'v', 'a', 's' or 'r'
   * @return the nounIndex, verbIndex, adjIndex ('a' and 's') or advIndex
   *         property of `this`; undefined for any other code
   */
  wordposProto.getIndexFile = function getIndexFile(pos) {
    if (pos === 'n') return this.nounIndex;
    if (pos === 'v') return this.verbIndex;
    if (pos === 'a' || pos === 's') return this.adjIndex;
    if (pos === 'r') return this.advIndex;
  };
}
/**
 * getPOS()
 * Find all POS for all words in the given string.
 *
 * Every prepared word is tested against all four POS; a word may therefore
 * appear in several lists. Words that match no POS are collected in `rest`.
 * Words are appended as their tests answer, so the order inside each list is
 * not guaranteed.
 *
 * @param string text - words to look up for POS
 * @param function callback - receives an object with the words broken into POS or 'rest':
 *    Object: {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}
 * @return none
 */
wordposProto.getPOS = function(text, callback) {
  var data = {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]},
      testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
      parts = 'nouns verbs adjectives adverbs'.split(' '),
      // prepText() de-duplicates the raw tokens; de-duplicate again after
      // normalizing so that case variants ("Bear bear") are reported once
      words = _.uniq(_.map(prepText(text), normalize)),
      nTests = testFns.length,
      nWords = words.length,
      self = this,
      pendingWords = nWords;

  // nothing to classify: answer right away with the empty lists
  if (!nWords) return callback(data);

  words.forEach(classify);

  // Run every POS test on one word; named `classify` so it does not shadow
  // the module-level lookup() factory.
  function classify(word){
    var any = false,
        pendingTests = nTests;

    testFns.forEach(function(isFn, i){
      self[isFn](word, function(yes){
        if (yes) {
          data[parts[i]].push(word);
          any = true;
        }
        // last test for this word has answered
        if (--pendingTests === 0) {
          if (!any) data.rest.push(word);
          // last word has been fully classified
          if (--pendingWords === 0) callback(data);
        }
      });
    });
  }
};
module.exports = WordPOS;

170
wordpos_spec.js Normal file
View File

@ -0,0 +1,170 @@
var WordPOS = require('./wordpos'),
wordpos = new WordPOS('dict');
// Fixtures shared by the specs below.
// `str` is the sentence under test; `expected` lists, per POS, the words the
// specs expect back for it (compared without regard to order).
var str = "The angry bear chased the frightened little squirrel",
expected = {
nouns: [ 'bear', 'squirrel', 'little', 'chased' ],
verbs: [ 'bear' ],
adjectives: [ 'little', 'angry', 'frightened' ],
adverbs: [ 'little' ],
// NOTE(review): `rest` is declared here but no spec in this file asserts it
rest: [ 'the' ]
},
garble = 'garblegarble'; // expect not to find word
// Specs for getPOS() and the getX() collectors.
// Each spec is asynchronous: asyncSpecWait() is called after starting the
// lookup and asyncSpecDone() from inside its callback.
describe('get POS', function() {
beforeEach(function() {
this.addMatchers({
// unordered (multiset) comparison -- NOTE: doesn't handle deep!
// Passes when compareObjects_ reports the two values equal, or when it
// reports no mismatched keys but at least one mismatched value.
// NOTE(review): relies on the underscore-suffixed env.compareObjects_ helper,
// which looks like a private Jasmine API -- confirm before upgrading Jasmine.
toEqualUnordered: function(expected) {
var mismatchKeys=[],
mismatchValues=[],
result = this.env.compareObjects_(this.actual, expected, mismatchKeys, mismatchValues);
return result || (mismatchKeys.length == 0 && mismatchValues.length > 0);
}
});
});
// checks the four POS lists returned by getPOS(); `rest` is not asserted here
it('should get all POS', function() {
wordpos.getPOS(str, function(result) {
expect(result.nouns).toEqualUnordered(expected.nouns);
expect(result.verbs).toEqualUnordered(expected.verbs);
expect(result.adjectives).toEqualUnordered(expected.adjectives);
expect(result.adverbs).toEqualUnordered(expected.adverbs);
asyncSpecDone();
});
asyncSpecWait();
});
it('should get nouns', function() {
wordpos.getNouns(str, function(result) {
expect(result).toEqualUnordered(expected.nouns);
asyncSpecDone();
});
asyncSpecWait();
});
it('should get verbs', function() {
wordpos.getVerbs(str, function(result) {
expect(result).toEqualUnordered(expected.verbs);
asyncSpecDone();
});
asyncSpecWait();
});
it('should get adjectives', function() {
wordpos.getAdjectives(str, function(result) {
expect(result).toEqualUnordered(expected.adjectives);
asyncSpecDone();
});
asyncSpecWait();
});
it('should get adverbs', function() {
wordpos.getAdverbs(str, function(result) {
expect(result).toEqualUnordered(expected.adverbs);
asyncSpecDone();
});
asyncSpecWait();
});
});
// Positive isX() specs: the first expected word of each POS must test truthy.
describe('is POS', function() {
  // [label used in the spec name, wordpos method, key into `expected`]
  var cases = [
    ['noun', 'isNoun', 'nouns'],
    ['verb', 'isVerb', 'verbs'],
    ['adjective', 'isAdjective', 'adjectives'],
    ['adverb', 'isAdverb', 'adverbs']
  ];

  cases.forEach(function(testCase) {
    var label = testCase[0],
        method = testCase[1],
        key = testCase[2];

    it('should check if ' + label, function() {
      wordpos[method](expected[key][0], function(result) {
        expect(result).toBeTruthy();
        asyncSpecDone();
      });
      asyncSpecWait();
    });
  });
});
// Negative isX() specs: the nonsense word `garble` must not test truthy for any POS.
describe('is !POS', function() {
  // [label used in the spec name, wordpos method]
  var cases = [
    ['noun', 'isNoun'],
    ['verb', 'isVerb'],
    ['adjective', 'isAdjective'],
    ['adverb', 'isAdverb']
  ];

  cases.forEach(function(testCase) {
    var label = testCase[0],
        method = testCase[1];

    it('should check if !' + label, function() {
      wordpos[method](garble, function(result) {
        expect(result).not.toBeTruthy();
        asyncSpecDone();
      });
      asyncSpecWait();
    });
  });
});
// lookupX() specs: the first result of each lookup must carry the expected
// `pos` code and `lemma`.
describe('lookup POS', function() {
  var cases = [
    {label: 'noun', method: 'lookupNoun', word: 'squirrel', pos: 'n', lemma: 'squirrel'},
    {label: 'verb', method: 'lookupVerb', word: 'bear', pos: 'v', lemma: 'have_a_bun_in_the_oven'},
    {label: 'adjective', method: 'lookupAdjective', word: 'angry', pos: 's', lemma: 'angry'},
    {label: 'adverb', method: 'lookupAdverb', word: 'little', pos: 'r', lemma: 'little'}
  ];

  cases.forEach(function(testCase) {
    it('should lookup ' + testCase.label, function() {
      wordpos[testCase.method](testCase.word, function(result) {
        var first = result[0];
        expect(first.pos).toBe(testCase.pos);
        expect(first.lemma).toBe(testCase.lemma);
        asyncSpecDone();
      });
      asyncSpecWait();
    });
  });
});