initial checkin
This commit is contained in:
commit
1ea3887687
|
@ -0,0 +1,47 @@
|
|||
wordpos
|
||||
=======
|
||||
|
||||
wordpos is a set of part-of-speech utilities using [natural's](http://github.com/NaturalNode/natural) WordNet module.
|
||||
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
Get the script and use it. (npm module may be coming.)
|
||||
|
||||
Usage
|
||||
----------
|
||||
|
||||
var WordPOS = require('./wordpos'),
|
||||
wordpos = new WordPOS('dict');
|
||||
|
||||
wordpos.getAdjectives('The angry bear chased the frightened little squirrel.', function(results){
|
||||
console.log(results);
|
||||
});
|
||||
// [ 'little', 'angry', 'frightened' ]
|
||||
|
||||
|
||||
See wordpos_spec.js for full usage.
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
Copyright (c) 2012, mooster@42at.com
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
|
@ -0,0 +1,113 @@
|
|||
|
||||
var uubench = require('uubench'),
|
||||
fs = require('fs'),
|
||||
_ = require('underscore')._,
|
||||
WordPOS = require('./wordpos'),
|
||||
wordpos = new WordPOS('dict');
|
||||
|
||||
// Benchmark suite. Runs every registered bench a fixed number of times and
// reports through the callbacks below.
var suite = new uubench.Suite({
  type: 'fixed',
  iterations: 10,
  //delay: 750,
  sync: true,

  // Invoked with the list of registered tests before anything runs.
  start: function(tests){
    console.log('starting %d tests', tests.length);
  },

  // Invoked after each bench; prints ops/sec followed by the size of each
  // result list captured in the module-level `pos` by the bench helpers.
  result: function(name, stats){
    var persec = 1000 / stats.elapsed,
        // the +0.5 turns the `| 0` truncation below into rounding
        ops = 0.5 + stats.iterations * persec;

    // \x1b is the ESC character (same character the legacy octal escape \033 produced)
    console.log(' \x1b[90m%s : \x1b[36m%d \x1b[90mops/s\x1b[0m', name, ops | 0, stats);
    if (pos) {
      console.log(out(pos));
    }
  },

  // Invoked once when every bench has finished.
  done: function(time){
    console.log('done in %d msecs', time );
  },

  // Invoked when a new section starts; prints the section title.
  section: function(name, stats) {
    console.log('\x1b[35m%s\x1b[0m',name);
  }
});
|
||||
|
||||
|
||||
/**
 * Summarise a result object as "key:count" strings.
 *
 * @param  {Object} res - object whose values all expose a `length` (e.g. arrays)
 * @return {Array}  one "<key>:<length>" string per own enumerable key of `res`
 */
function out(res){
  return Object.keys(res).map(function(k){
    return k + ':' + res[k].length;
  });
}
|
||||
|
||||
|
||||
|
||||
// Benchmark inputs and shared state.
//   text1   - single-token input used by the "1 word" section
//   text128 - contents of text-128.txt, used by the "128 words" section
//   text    - input currently being benchmarked; set by each section
//   pos     - last result stored by a bench helper; printed by the suite's result callback
//   str     - sample sentence (not referenced by the benches in this file)
var text1 = 'laksasdf',
    text128 = fs.readFileSync('text-128.txt', 'utf8'),
    text,
    pos,
    str = "This is some sample text. This text can contain multiple sentences. It also works with urls like.";
|
||||
|
||||
|
||||
/**
 * Bench body: run wordpos.getPOS on the current `text`, keep the result in
 * the module-level `pos`, then signal completion.
 *
 * @param {Function} next - called with no arguments once the result arrives
 */
function getPOS(next){
  wordpos.getPOS(text, function(res){
    pos = res;
    next();
  });
}
|
||||
|
||||
/**
 * Bench body: run wordpos.getNouns on the current `text`, store the result
 * in the module-level `pos` as {nouns: [...]}, then signal completion.
 *
 * @param {Function} next - called with no arguments once the result arrives
 */
function getNouns(next){
  wordpos.getNouns(text, function(res){
    pos = {nouns: res};
    next();
  });
}
|
||||
|
||||
/**
 * Bench body: run wordpos.getVerbs on the current `text`, store the result
 * in the module-level `pos` as {verbs: [...]}, then signal completion.
 *
 * @param {Function} next - called with no arguments once the result arrives
 */
function getVerbs(next){
  wordpos.getVerbs(text, function(res){
    pos = {verbs: res};
    next();
  });
}
|
||||
|
||||
/**
 * Bench body: run wordpos.getAdjectives on the current `text`, store the
 * result in the module-level `pos` as {adjectives: [...]}, then signal
 * completion.
 *
 * @param {Function} next - called with no arguments once the result arrives
 */
function getAdjectives(next){
  wordpos.getAdjectives(text, function(res){
    pos = {adjectives: res};
    next();
  });
}
|
||||
|
||||
/**
 * Bench body: run wordpos.getAdverbs on the current `text`, store the result
 * in the module-level `pos` as {adverbs: [...]}, then signal completion.
 *
 * @param {Function} next - called with no arguments once the result arrives
 */
function getAdverbs(next){
  wordpos.getAdverbs(text, function(res){
    pos = {adverbs: res};
    next();
  });
}
|
||||
|
||||
/*
 * one word
 */
// Section setup points the shared `text` at the single-word input before
// the benches registered below run.
suite.section('--1 word--', function(next){
  text = text1;
  next();
});
suite.bench('getPOS', getPOS);
suite.bench('getNouns', getNouns);
suite.bench('getVerbs', getVerbs);
suite.bench('getAdjectives', getAdjectives);
suite.bench('getAdverbs', getAdverbs);
|
||||
|
||||
|
||||
/*
 * 128 words
 */
// Section setup switches the shared `text` to the 128-word input and lowers
// the suite's iteration count to 1 for the benches registered below.
suite.section('--128 words--', function(next){
  suite.options.iterations = 1;
  text = text128;
  next();
});
suite.bench('getPOS', getPOS);
suite.bench('getNouns', getNouns);
suite.bench('getVerbs', getVerbs);
suite.bench('getAdjectives', getAdjectives);
suite.bench('getAdverbs', getAdverbs);


suite.run();
|
|
@ -0,0 +1,179 @@
|
|||
/*!
|
||||
* wordpos
|
||||
*
|
||||
* part-of-speech utilities using natural's wordnet module.
|
||||
*
|
||||
* Copyright (c) 2012 mooster@42at.com
|
||||
* Released under MIT license
|
||||
*/
|
||||
|
||||
var _ = require('underscore')._,
|
||||
util = require('util'),
|
||||
natural = require('./lib/natural'),
|
||||
WordNet = natural.WordNet,
|
||||
tokenizer = new natural.WordTokenizer(),
|
||||
stopwords = ' '+ natural.stopwords.join(' ') +' ';
|
||||
|
||||
/**
 * Normalize a word before an index lookup: lower-case it and replace every
 * run of whitespace with a single underscore.
 *
 * @param  {string} word - word or multi-word phrase
 * @return {string} normalized form, e.g. 'Hot  Dog' -> 'hot_dog'
 */
function normalize(word) {
  return word.toLowerCase().replace(/\s+/g, '_');
}
|
||||
|
||||
/**
 * Test whether a word is a stopword.
 *
 * `stopwords` is a single space-delimited string with a leading and trailing
 * space, so wrapping the candidate in spaces matches whole entries only.
 * The word is lower-cased first so that capitalised forms (e.g. a
 * sentence-initial "The") are recognised as well.
 *
 * @param  {string} word - token to test
 * @return {boolean} true if the word is in the stopword list
 */
function isStopword(word) {
  return stopwords.indexOf(' ' + word.toLowerCase() + ' ') >= 0;
}
|
||||
|
||||
/**
 * Turn raw text into the list of words to examine: tokenize, drop duplicate
 * tokens, then drop stopwords.
 *
 * @param  {string} text - text to split into words
 * @return {Array}  unique, non-stopword tokens
 */
function prepText(text) {
  var tokens = _.uniq(tokenizer.tokenize(text));
  return _.reject(tokens, isStopword);
}
|
||||
|
||||
/**
 * Build a lookupX() method restricted to one part of speech.
 *
 * The returned function is meant to be installed on the prototype: it is
 * called with `this` providing getIndexFile(), getDataFile() and
 * lookupFromFiles().
 *
 * @param  {string} pos - part-of-speech code ('n', 'v', 'a' or 'r')
 * @return {Function} function(word, callback) that normalizes `word` and
 *                    looks it up in that POS's index/data file pair only
 */
function lookup(pos) {
  return function(word, callback) {
    word = normalize(word);
    this.lookupFromFiles([
      {index: this.getIndexFile(pos), data: this.getDataFile(pos)}
    ], [], word, callback);
  };
}
|
||||
|
||||
/**
 * Build an isX() method for one part of speech.
 *
 * The returned function is meant to be installed on the prototype: it is
 * called with `this` providing getIndexFile().
 *
 * @param  {string} pos - part-of-speech code ('n', 'v', 'a' or 'r')
 * @return {Function} function(word, callback); callback receives true when
 *                    the index lookup yields a record for the normalized
 *                    word, false otherwise
 */
function is(pos){
  return function(word, callback) {
    var index = this.getIndexFile(pos);
    word = normalize(word);
    index.lookup(word, function(record) {
      callback(!!record);   // coerce record / no record to a strict boolean
    });
  };
}
|
||||
|
||||
/**
 * Build a getX() method on top of an isX() method.
 *
 * @param  {string} isFn - name of the isX method to call on `this`
 *                         (e.g. 'isNoun')
 * @return {Function} function(text, callback); callback receives the array
 *                    of words from `text` for which isX reported true.
 *                    Words are pushed as their checks complete, so their
 *                    order follows completion order.
 */
function get(isFn) {
  return function(text, callback) {
    var words = prepText(text),
        n = words.length,
        i = 0,            // number of checks completed so far
        self = this,
        results = [];

    // nothing to check: report the empty result right away
    if (!n) return callback(results);

    words.forEach(function(word){
      self[isFn](word, function(yes){
        if (yes) {
          results.push(word);
        }
        // fire the callback once, after the last outstanding check returns
        if (++i === n) {
          callback(results);
        }
      });
    });
  };
}
|
||||
|
||||
|
||||
/**
 * WordPOS constructor. Extends WordNet and forwards every constructor
 * argument to it unchanged.
 */
var WordPOS = function() {
  // super_ is set by util.inherits() below
  WordPOS.super_.apply(this, arguments);
};
util.inherits(WordPOS, WordNet);

// shorthand used by the prototype assignments that follow
var wordposProto = WordPOS.prototype;
|
||||
|
||||
// fast POS lookups (only look in specified file)
/**
 * lookupX()
 * Look up a word's definitions when its POS is already known. Only the
 * index/data files of that POS are consulted.
 *
 * @param string word - word to look up in the given POS
 * @param function callback - forwarded to lookupFromFiles(); receives array of
 *                            definition objects or empty
 * @return none
 */
wordposProto.lookupAdjective = lookup('a');
wordposProto.lookupAdverb = lookup('r');
wordposProto.lookupNoun = lookup('n');
wordposProto.lookupVerb = lookup('v');
|
||||
|
||||
/**
 * isX()
 * Test if word is given POS
 *
 * @param string word - word to test for given POS
 * @param function callback - receives true if the word has an entry in that
 *                            POS's index, false otherwise
 * @return none
 */
wordposProto.isAdjective = is('a');
wordposProto.isAdverb = is('r');
wordposProto.isNoun = is('n');
wordposProto.isVerb = is('v');
|
||||
|
||||
/**
 * getX()
 * Find all words in string that are given POS
 *
 * @param string text - text whose words are tested
 * @param function callback - receives array of the words that are given POS
 * @return none
 */
wordposProto.getAdjectives = get('isAdjective');
wordposProto.getAdverbs = get('isAdverb');
wordposProto.getNouns = get('isNoun');
wordposProto.getVerbs = get('isVerb');
|
||||
|
||||
// Fallback: only installed when the prototype chain does not already
// provide getIndexFile().
if (!wordposProto.getIndexFile) {
  /**
   * Map a part-of-speech code to the matching index object on `this`.
   *
   * @param  string pos - 'n', 'v', 'a', 's' or 'r'
   * @return the nounIndex / verbIndex / adjIndex / advIndex property of
   *         `this`; undefined for any other code
   */
  wordposProto.getIndexFile = function getIndexFile(pos) {
    switch(pos) {
      case 'n':
        return this.nounIndex;
      case 'v':
        return this.verbIndex;
      case 'a': case 's':   // both codes share the adjective index
        return this.adjIndex;
      case 'r':
        return this.advIndex;
    }
  };
}
|
||||
|
||||
/**
 * getPOS()
 * Find all POS for all words in given string
 *
 * Each word is normalized and tested with isNoun, isVerb, isAdjective and
 * isAdverb. A word is added to every list whose test succeeds, or to `rest`
 * when none does. Words are de-duplicated after normalization, so
 * case variants of the same word ("Bear bear") are reported once.
 *
 * @param string text - words to lookup for POS
 * @param function callback - receives object with words broken into POS or 'rest':
 *    Object: {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}
 * @return none
 */
wordposProto.getPOS = function(text, callback) {
  var data = {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]},
      testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
      parts = 'nouns verbs adjectives adverbs'.split(' '),   // parts[i] is the list filled by testFns[i]
      // normalize first, then de-duplicate, so case variants collapse
      words = _.uniq(prepText(text).map(function(word) {
        return normalize(word);
      })),
      nTests = testFns.length,
      nWords = words.length,
      self = this,
      c = 0;   // number of words fully classified so far

  if (!nWords) return callback(data);
  words.forEach(classify);

  // Run all POS tests for one word.
  function classify(word){
    var any = false,   // true once any POS test succeeded for this word
        t = 0;         // number of POS tests completed for this word

    testFns.forEach(lookupPOS);

    function lookupPOS(isFn, i){
      self[isFn](word, function(yes){
        if (yes) {
          data[parts[i]].push(word);
          any = true;
        }
        donePOS();
      });
    }

    function donePOS() {
      if (++t === nTests) {
        if (!any) {
          data.rest.push(word);
        }
        done();
      }
    }
  }

  // Fire the callback once every word has completed all of its tests.
  function done(){
    if (++c === nWords) {
      callback(data);
    }
  }
};
|
||||
|
||||
module.exports = WordPOS;
|
|
@ -0,0 +1,170 @@
|
|||
|
||||
var WordPOS = require('./wordpos'),
|
||||
wordpos = new WordPOS('dict');
|
||||
|
||||
// Fixtures shared by the specs below.
//   str      - sentence analysed by the getPOS/getX specs
//   expected - word lists the specs compare against; `rest` is listed here
//              but not asserted by the specs in this file
//   garble   - a string the is-not-POS specs expect to match no POS
var str = "The angry bear chased the frightened little squirrel",
    expected = {
      nouns: [ 'bear', 'squirrel', 'little', 'chased' ],
      verbs: [ 'bear' ],
      adjectives: [ 'little', 'angry', 'frightened' ],
      adverbs: [ 'little' ],
      rest: [ 'the' ]
    },
    garble = 'garblegarble';	// expect not to find word
|
||||
|
||||
|
||||
// Specs for getPOS() and the getX() family. Results arrive in completion
// order, so the lists are compared without regard to order.
describe('get POS', function() {

  beforeEach(function() {
    this.addMatchers({
      // unordered (multiset) comparison of two flat arrays of primitives
      // -- NOTE: doesn't handle deep!
      toEqualUnordered: function(expected) {
        var actual = this.actual,
            a, b, i;

        if (!actual || !expected || actual.length !== expected.length) {
          return false;
        }
        // compare sorted copies so neither input array is mutated
        a = actual.slice().sort();
        b = expected.slice().sort();
        for (i = 0; i < a.length; i++) {
          if (a[i] !== b[i]) {
            return false;
          }
        }
        return true;
      }
    });
  });

  it('should get all POS', function() {
    wordpos.getPOS(str, function(result) {
      expect(result.nouns).toEqualUnordered(expected.nouns);
      expect(result.verbs).toEqualUnordered(expected.verbs);
      expect(result.adjectives).toEqualUnordered(expected.adjectives);
      expect(result.adverbs).toEqualUnordered(expected.adverbs);
      asyncSpecDone();
    });
    asyncSpecWait();
  });

  it('should get nouns', function() {
    wordpos.getNouns(str, function(result) {
      expect(result).toEqualUnordered(expected.nouns);
      asyncSpecDone();
    });
    asyncSpecWait();
  });

  it('should get verbs', function() {
    wordpos.getVerbs(str, function(result) {
      expect(result).toEqualUnordered(expected.verbs);
      asyncSpecDone();
    });
    asyncSpecWait();
  });

  it('should get adjectives', function() {
    wordpos.getAdjectives(str, function(result) {
      expect(result).toEqualUnordered(expected.adjectives);
      asyncSpecDone();
    });
    asyncSpecWait();
  });

  it('should get adverbs', function() {
    wordpos.getAdverbs(str, function(result) {
      expect(result).toEqualUnordered(expected.adverbs);
      asyncSpecDone();
    });
    asyncSpecWait();
  });
});
|
||||
|
||||
// Positive isX() checks: the first expected word of each POS list must be
// reported as that POS.
describe('is POS', function() {
  it('should check if noun', function() {
    wordpos.isNoun(expected.nouns[0], function(result) {
      expect(result).toBeTruthy();
      asyncSpecDone();
    });
    asyncSpecWait();
  });
  it('should check if verb', function() {
    wordpos.isVerb(expected.verbs[0], function(result) {
      expect(result).toBeTruthy();
      asyncSpecDone();
    });
    asyncSpecWait();
  });
  it('should check if adjective', function() {
    wordpos.isAdjective(expected.adjectives[0], function(result) {
      expect(result).toBeTruthy();
      asyncSpecDone();
    });
    asyncSpecWait();
  });
  it('should check if adverb', function() {
    wordpos.isAdverb(expected.adverbs[0], function(result) {
      expect(result).toBeTruthy();
      asyncSpecDone();
    });
    asyncSpecWait();
  });
});
|
||||
|
||||
// Negative isX() checks: the garbled word must not be reported as any POS.
describe('is !POS', function() {
  it('should check if !noun', function() {
    wordpos.isNoun(garble, function(result) {
      expect(result).not.toBeTruthy();
      asyncSpecDone();
    });
    asyncSpecWait();
  });
  it('should check if !verb', function() {
    wordpos.isVerb(garble, function(result) {
      expect(result).not.toBeTruthy();
      asyncSpecDone();
    });
    asyncSpecWait();
  });
  it('should check if !adjective', function() {
    wordpos.isAdjective(garble, function(result) {
      expect(result).not.toBeTruthy();
      asyncSpecDone();
    });
    asyncSpecWait();
  });
  it('should check if !adverb', function() {
    wordpos.isAdverb(garble, function(result) {
      expect(result).not.toBeTruthy();
      asyncSpecDone();
    });
    asyncSpecWait();
  });
});
|
||||
|
||||
// lookupX() checks: the first definition returned for each word must carry
// the expected pos code and lemma.
describe('lookup POS', function() {
  it('should lookup noun', function() {
    wordpos.lookupNoun('squirrel', function(result) {
      expect(result[0].pos).toBe('n');
      expect(result[0].lemma).toBe('squirrel');
      asyncSpecDone();
    });
    asyncSpecWait();
  });
  it('should lookup verb', function() {
    wordpos.lookupVerb('bear', function(result) {
      expect(result[0].pos).toBe('v');
      expect(result[0].lemma).toBe('have_a_bun_in_the_oven');
      asyncSpecDone();
    });
    asyncSpecWait();
  });
  it('should lookup adjective', function() {
    wordpos.lookupAdjective('angry', function(result) {
      expect(result[0].pos).toBe('s');
      expect(result[0].lemma).toBe('angry');
      asyncSpecDone();
    });
    asyncSpecWait();
  });
  it('should lookup adverb', function() {
    wordpos.lookupAdverb('little', function(result) {
      expect(result[0].pos).toBe('r');
      expect(result[0].lemma).toBe('little');
      asyncSpecDone();
    });
    asyncSpecWait();
  });
});
|
||||
|
Loading…
Reference in New Issue