From 1ea388768704ecf30fd74d414916053d2694fd5f Mon Sep 17 00:00:00 2001 From: moos Date: Wed, 2 May 2012 16:18:10 -0700 Subject: [PATCH] initial checkin --- README.md | 47 +++++++++++++ wordpos-bench.js | 113 ++++++++++++++++++++++++++++++ wordpos.js | 179 +++++++++++++++++++++++++++++++++++++++++++++++ wordpos_spec.js | 170 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 509 insertions(+) create mode 100644 README.md create mode 100644 wordpos-bench.js create mode 100644 wordpos.js create mode 100644 wordpos_spec.js diff --git a/README.md b/README.md new file mode 100644 index 0000000..ce6c53f --- /dev/null +++ b/README.md @@ -0,0 +1,47 @@ +wordpos +======= + +wordpos is a set of part-of-speech utilities using [natural's](http://github.com/NaturalNode/natural) WordNet module. + + +Installation +------------ + +Get the script and use it. (npm module may be coming.) + +Usage +---------- + + var WordPOS = require('./wordpos'), + wordpos = new WordPOS('dict'); + + wordpos.getAdjectives('The angry bear chased the frightened little squirrel.', function(results){ + console.log(results); + }); + // [ 'little', 'angry', 'frightened' ] + + +See wordpos_spec.js for full usage. + +License +------- + +Copyright (c) 2012, mooster@42at.com + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/wordpos-bench.js b/wordpos-bench.js new file mode 100644 index 0000000..f502058 --- /dev/null +++ b/wordpos-bench.js @@ -0,0 +1,113 @@ + +var uubench = require('uubench'), + fs = require('fs'), + _ = require('underscore')._, + WordPOS = require('./wordpos'), + wordpos = new WordPOS('dict'); + +suite = new uubench.Suite({ /* NOTE(review): assigned without var, so suite becomes an implicit global */ + type: 'fixed', + iterations: 10, + //delay: 750, + sync: true, + + start: function(tests){ + console.log('starting %d tests', tests.length); + }, + + result: function(name, stats){ + var persec = 1000 / stats.elapsed + , ops = .5 + stats.iterations * persec; + + console.log(' \033[90m%s : \033[36m%d \033[90mops/s\033[0m', name, ops | 0, stats); + pos && console.log(out(pos)); /* pos is whatever the most recently run wrapper function (getPOS, getNouns, ...) stored */ + }, + + done: function(time){ + console.log('done in %d msecs', time ); + }, + + section: function(name, stats) { + console.log('\033[35m%s\033[0m',name); + } +}); + + +function out(res){ /* maps each key of res to a 'key:length' string */ + return _(res).keys().map(function(k){ return k + ':' + res[k].length }); +} + + + +var text1 = 'laksasdf', /* input used by the '--1 word--' section */ + text128 = fs.readFileSync('text-128.txt', 'utf8'), /* input used by the '--128 words--' section */ + text, /* input for the current section; read by the wrapper functions */ + pos, /* last result, printed by the result hook */ + str = "This is some sample text. This text can contain multiple sentences. 
+ It also works with urls like."; + + +function getPOS(next){ + wordpos.getPOS(text, function(res){ + pos = res; + next(); + }); +} + +function getNouns(next){ + wordpos.getNouns(text, function(res){ + pos = {nouns: res}; + next(); + }); +} + +function getVerbs(next){ + wordpos.getVerbs(text, function(res){ + pos = {verbs: res}; + next(); + }); +} + +function getAdjectives(next){ + wordpos.getAdjectives(text, function(res){ + pos = {adjectives: res}; + next(); + }); +} + +function getAdverbs(next){ + wordpos.getAdverbs(text, function(res){ + pos = {adverbs: res}; + next(); + }); +} + +/* + * one word + */ +suite.section('--1 word--', function(next){ + text = text1; + next(); +}); +suite.bench('getPOS', getPOS); +suite.bench('getNouns', getNouns); +suite.bench('getVerbs', getVerbs); +suite.bench('getAdjectives', getAdjectives); +suite.bench('getAdverbs', getAdverbs); + + +/* + * 128 words + */ +suite.section('--128 words--', function(next){ + suite.options.iterations = 1; + text = text128; + next(); +}); +suite.bench('getPOS', getPOS); +suite.bench('getNouns', getNouns); +suite.bench('getVerbs', getVerbs); +suite.bench('getAdjectives', getAdjectives); +suite.bench('getAdverbs', getAdverbs); + + +suite.run(); diff --git a/wordpos.js b/wordpos.js new file mode 100644 index 0000000..79f9b5d --- /dev/null +++ b/wordpos.js @@ -0,0 +1,179 @@ +/*! +* wordpos +* +* part-of-speech utilities using natural's wordnet module. 
+* +* Copyright (c) 2012 mooster@42at.com +* Released under MIT license +*/ + +var _ = require('underscore')._, + util = require('util'), + natural = require('./lib/natural'), + WordNet = natural.WordNet, + tokenizer = new natural.WordTokenizer(), + stopwords = ' '+ natural.stopwords.join(' ') +' '; /* padded with spaces so isStopword can match whole words via indexOf */ + +function normalize(word) { + return word.toLowerCase().replace(/\s+/g, '_'); +} + +function isStopword(word) { /* case-sensitive: the word is matched exactly as given */ + return stopwords.indexOf(' '+word+' ') >= 0; +} + +function prepText(text) { /* tokenize, drop duplicate tokens, then drop stopwords */ + return _.reject(_.uniq(tokenizer.tokenize(text)), isStopword); +} + +function lookup(pos) { /* builds a method that looks a word up in the index and data files of one POS only */ + return function(word, callback) { + word = normalize(word); + this.lookupFromFiles([ + {index: this.getIndexFile(pos), data: this.getDataFile(pos)} + ], [], word, callback); + }; +} + +function is(pos){ /* builds a method that reports true or false: is the word present in the index of pos */ + return function(word, callback) { + var index = this.getIndexFile(pos); + word = normalize(word); + index.lookup(word, function(record) { + callback(!!record); + }); + }; +} + +function get(isFn) { /* builds a method that filters the words of text through this[isFn] */ + return function(text, callback) { + var words = prepText(text), + n = words.length, + i = 0, + self = this, + results = []; + + if (!n) return callback(results); + words.forEach(function(word,j){ + self[isFn](word, function(yes){ + yes && results.push(word); + (++i==n) && callback(results); /* report once every word has answered */ + }); + }); + }; +} + + +var WordPOS = function() { + WordPOS.super_.apply(this, arguments); +}; +util.inherits(WordPOS, WordNet); + +var wordposProto = WordPOS.prototype; + +// fast POS lookups (only look in specified file) +/** + * lookupX() + * Look up a word's definitions when its POS is already known; only that POS's index and data files are searched + * + * @param string word - word to look up in the given POS (lowercased, whitespace runs replaced by '_', before lookup) + * @param function callback - receives array of definition objects or empty + * @return none + */ +wordposProto.lookupAdjective = lookup('a'); +wordposProto.lookupAdverb = lookup('r'); +wordposProto.lookupNoun = lookup('n'); +wordposProto.lookupVerb = lookup('v'); + +/** + * isX() + * Test if word is given POS + * + * @param string word - word to test for given 
POS + * @param function Callback receives true or false if word is given POS + * @return none + */ +wordposProto.isAdjective = is('a'); +wordposProto.isAdverb = is('r'); +wordposProto.isNoun = is('n'); +wordposProto.isVerb = is('v'); + +/** + * getX() + * Find all words in string that are given POS (the text is tokenized; duplicate tokens and stopwords are dropped before testing) + * + * @param string text - text whose words are tested + * @param function callback - receives array of words that are given POS, in the order the lookups complete (empty array when no words remain) + * @return none + */ +wordposProto.getAdjectives = get('isAdjective'); +wordposProto.getAdverbs = get('isAdverb'); +wordposProto.getNouns = get('isNoun'); +wordposProto.getVerbs = get('isVerb'); + +if (!wordposProto.getIndexFile) /* only defined when the inherited prototype does not already provide getIndexFile */ + wordposProto.getIndexFile = function getIndexFile(pos) { + switch(pos) { + case 'n': + return this.nounIndex; + case 'v': + return this.verbIndex; + case 'a': case 's': + return this.adjIndex; + case 'r': + return this.advIndex; + } + }; + +/** + * getPOS() + * Find all POS for all words in given string + * + * @param string text - words to look up for POS + * @param function callback - receives object with words broken into POS or 'rest' (a word can appear under several POS; 'rest' holds words that matched none): + * Object: {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]} + * @return none + */ +wordposProto.getPOS = function(text, callback) { + var data = {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}, + testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '), + parts = 'nouns verbs adjectives adverbs'.split(' '), + words = prepText(text), + nTests = testFns.length, + nWords = words.length, + self = this, + c = 0; + + if (!nWords) return callback(data); + words.forEach(lookup); + + function lookup(word){ + var any = false, + t=0; + word = normalize(word); /* the normalized form is what gets stored in data */ + testFns.forEach(lookupPOS); + + function lookupPOS(isFn,i,list){ + self[isFn](word, function(yes){ + yes && data[parts[i]].push(word); + any |= yes; /* bitwise OR turns the flag into 0 or 1; only its truthiness is read in donePOS */ + donePOS(); + }); + } + + function donePOS() { + if (++t == nTests) { + !any && data['rest'].push(word); + done(); + } + } + } + + function done(){ /* called once per word */ + if (++c == nWords) { 
callback(data); + } + } +}; + +module.exports = WordPOS; diff --git a/wordpos_spec.js b/wordpos_spec.js new file mode 100644 index 0000000..bbefd98 --- /dev/null +++ b/wordpos_spec.js @@ -0,0 +1,170 @@ + +var WordPOS = require('./wordpos'), + wordpos = new WordPOS('dict'); + +var str = "The angry bear chased the frightened little squirrel", + expected = { + nouns: [ 'bear', 'squirrel', 'little', 'chased' ], + verbs: [ 'bear' ], + adjectives: [ 'little', 'angry', 'frightened' ], + adverbs: [ 'little' ], + rest: [ 'the' ] /* not asserted by any spec in this file */ + }, + garble = 'garblegarble'; // nonsense token: every isX() check on it is expected to be false + + +describe('get POS', function() { + + beforeEach(function() { + this.addMatchers({ + // unordered (multiset) comparison -- NOTE: doesn't handle deep! Passes on an exact match, or when no keys mismatch and only values do + toEqualUnordered: function(expected) { + var mismatchKeys=[], + mismatchValues=[], + result = this.env.compareObjects_(this.actual, expected, mismatchKeys, mismatchValues); + return result || (mismatchKeys.length == 0 && mismatchValues.length > 0); + } + }); + }); + + it('should get all POS', function() { + wordpos.getPOS(str, function(result) { + expect(result.nouns).toEqualUnordered(expected.nouns); + expect(result.verbs).toEqualUnordered(expected.verbs); + expect(result.adjectives).toEqualUnordered(expected.adjectives); + expect(result.adverbs).toEqualUnordered(expected.adverbs); + asyncSpecDone(); + }); + asyncSpecWait(); + }); + + it('should get nouns', function() { + wordpos.getNouns(str, function(result) { + expect(result).toEqualUnordered(expected.nouns); + asyncSpecDone(); + }); + asyncSpecWait(); + }); + + it('should get verbs', function() { + wordpos.getVerbs(str, function(result) { + expect(result).toEqualUnordered(expected.verbs); + asyncSpecDone(); + }); + asyncSpecWait(); + }); + + it('should get adjectives', function() { + wordpos.getAdjectives(str, function(result) { + expect(result).toEqualUnordered(expected.adjectives); + asyncSpecDone(); + }); + asyncSpecWait(); + }); + + it('should get adverbs', function() { 
+ wordpos.getAdverbs(str, function(result) { + expect(result).toEqualUnordered(expected.adverbs); + asyncSpecDone(); + }); + asyncSpecWait(); + }); +}); + +describe('is POS', function() { /* each spec tests the first expected word of that POS */ + it('should check if noun', function() { + wordpos.isNoun(expected.nouns[0], function(result) { + expect(result).toBeTruthy(); + asyncSpecDone(); + }); + asyncSpecWait(); + }); + it('should check if verb', function() { + wordpos.isVerb(expected.verbs[0], function(result) { + expect(result).toBeTruthy(); + asyncSpecDone(); + }); + asyncSpecWait(); + }); + it('should check if adjective', function() { + wordpos.isAdjective(expected.adjectives[0], function(result) { + expect(result).toBeTruthy(); + asyncSpecDone(); + }); + asyncSpecWait(); + }); + it('should check if adverb', function() { + wordpos.isAdverb(expected.adverbs[0], function(result) { + expect(result).toBeTruthy(); + asyncSpecDone(); + }); + asyncSpecWait(); + }); +}); + +describe('is !POS', function() { /* the garble token must fail every POS test */ + it('should check if !noun', function() { + wordpos.isNoun(garble, function(result) { + expect(result).not.toBeTruthy(); + asyncSpecDone(); + }); + asyncSpecWait(); + }); + it('should check if !verb', function() { + wordpos.isVerb(garble, function(result) { + expect(result).not.toBeTruthy(); + asyncSpecDone(); + }); + asyncSpecWait(); + }); + it('should check if !adjective', function() { + wordpos.isAdjective(garble, function(result) { + expect(result).not.toBeTruthy(); + asyncSpecDone(); + }); + asyncSpecWait(); + }); + it('should check if !adverb', function() { + wordpos.isAdverb(garble, function(result) { + expect(result).not.toBeTruthy(); + asyncSpecDone(); + }); + asyncSpecWait(); + }); +}); + +describe('lookup POS', function() { + it('should lookup noun', function() { + wordpos.lookupNoun('squirrel', function(result) { + expect(result[0].pos).toBe('n'); + expect(result[0].lemma).toBe('squirrel'); + asyncSpecDone(); + }); + asyncSpecWait(); + }); + it('should lookup verb', function() { + wordpos.lookupVerb('bear', 
function(result) { + expect(result[0].pos).toBe('v'); + expect(result[0].lemma).toBe('have_a_bun_in_the_oven'); /* presumably the lemma of the first verb sense in the dictionary data -- confirm if the data changes */ + asyncSpecDone(); + }); + asyncSpecWait(); + }); + it('should lookup adjective', function() { + wordpos.lookupAdjective('angry', function(result) { + expect(result[0].pos).toBe('s'); /* not 'a': presumably the WordNet satellite-adjective tag -- confirm */ + expect(result[0].lemma).toBe('angry'); + asyncSpecDone(); + }); + asyncSpecWait(); + }); + it('should lookup adverb', function() { + wordpos.lookupAdverb('little', function(result) { + expect(result[0].pos).toBe('r'); + expect(result[0].lemma).toBe('little'); + asyncSpecDone(); + }); + asyncSpecWait(); + }); +}); +