// wordpos/wordpos.js
//
// 222 lines
// 5.5 KiB
// JavaScript

/**
* wordpos
*
* Node.js part-of-speech utilities using natural's WordNet module.
*
* Copyright (c) 2012 mooster@42at.com
* Released under MIT license
*/
var _ = require('underscore')._,
util = require('util'),
natural = require('natural'),
WordNet = natural.WordNet,
tokenizer = new natural.WordTokenizer(),
stopwords = ' '+ natural.stopwords.join(' ') +' ',
WNdb = require('WNdb');
/**
 * Convert a word to the form used for index lookups: lower-cased, with every
 * run of whitespace collapsed into a single underscore.
 *
 * @param {string} word - word or phrase to normalize
 * @return {string} normalized key, e.g. "Ice  Cream" -> "ice_cream"
 */
function normalize(word) {
  var lowered = word.toLowerCase();
  // Splitting on whitespace runs and re-joining is equivalent to replacing
  // each run with '_', including leading and trailing runs.
  return lowered.split(/\s+/).join('_');
}
/**
 * Test whether a word is in the stopword list.
 *
 * `stopwords` is one space-delimited string with a leading and trailing
 * space, so padding the word with spaces makes the search match whole
 * entries only.
 *
 * @param {string} word - word to test (compared as-is, case-sensitive)
 * @return {boolean} true if the word is a stopword
 */
function isStopword(word) {
  var needle = ' ' + word + ' ';
  return stopwords.indexOf(needle) !== -1;
}
/**
 * Break text into the list of words to be tested: tokenize, drop duplicate
 * tokens, then drop stopwords.
 *
 * @param {string} text - raw input text
 * @return {Array} unique, non-stopword tokens
 */
function prepText(text) {
  var tokens = tokenizer.tokenize(text);
  var distinct = _.uniq(tokens);
  return _.reject(distinct, isStopword);
}
/**
 * Build a lookupX() method bound to a single part of speech.
 *
 * The returned function must be called with a WordPOS instance as `this`.
 * It normalizes the word, searches only the index/data file pair for `pos`,
 * and passes the results to `callback`. When the instance's `profile` option
 * is set, the elapsed time in msec is passed as a second argument.
 *
 * @param {string} pos - part-of-speech code handed to getIndexFile()/getDataFile()
 * @return {function} function(word, callback)
 */
function lookup(pos) {
  return function (word, callback) {
    var timed = this.options.profile;
    var startedAt = timed && new Date();
    var key = normalize(word);
    var files = [
      { index: this.getIndexFile(pos), data: this.getDataFile(pos) }
    ];

    this.lookupFromFiles(files, [], key, function (results) {
      if (timed) {
        callback(results, new Date() - startedAt);
      } else {
        callback(results);
      }
    });
  };
}
/**
 * Build an isX() method bound to a single part of speech.
 *
 * The returned function must be called with a WordPOS instance as `this`.
 * It normalizes the word, looks it up in the index file for `pos`, and passes
 * true/false to `callback` depending on whether a record was found. When
 * profiling is active, the elapsed time in msec is passed as a second argument.
 *
 * @param {string} pos - part-of-speech code handed to getIndexFile()
 * @return {function} function(word, callback, _noprofile); a truthy
 *    `_noprofile` suppresses timing and is meant for internal callers
 */
function is(pos) {
  return function (word, callback, _noprofile) {
    // Timing is skipped when isX() is used internally by another method.
    var timed = this.options.profile && !_noprofile;
    var startedAt = timed && new Date();
    var indexFile = this.getIndexFile(pos);
    var key = normalize(word);

    indexFile.lookup(key, function (record) {
      var found = !!record;
      if (timed) {
        callback(found, new Date() - startedAt);
      } else {
        callback(found);
      }
    });
  };
}
/**
 * Build a getX() method that collects every word of a text passing the
 * given isX() test.
 *
 * The returned function must be called with a WordPOS instance as `this`.
 * It runs the text through prepText(), tests each word with this[isFn], and
 * calls `callback` with the array of matching words once every test has
 * reported back. When the instance's `profile` option is set, the elapsed
 * time in msec is passed as a second argument.
 *
 * @param {string} isFn - name of the isX method to call on the instance
 * @return {function} function(text, callback) returning the number of words
 *    tested (0 if the text yields no words)
 */
function get(isFn) {
  return function (text, callback) {
    var self = this;
    var timed = this.options.profile;
    var startedAt = timed && new Date();
    var words = prepText(text);
    var total = words.length;
    var pending = total;
    var matches = [];

    function finish() {
      if (timed) {
        callback(matches, new Date() - startedAt);
      } else {
        callback(matches);
      }
    }

    if (!total) {
      // Nothing to test: still report asynchronously, with an empty list.
      process.nextTick(finish);
      return 0;
    }

    words.forEach(function (word) {
      self[isFn](word, function (yes) {
        if (yes) {
          matches.push(word);
        }
        pending -= 1;
        if (pending === 0) {
          finish();
        }
      }, /*_noprofile*/ true);
    });

    return total;
  };
}
/**
 * @class WordPOS
 * @constructor
 *
 * Called with no arguments or with an options object, the parent constructor
 * is invoked with WNdb.path. Called with anything else, all arguments are
 * forwarded unchanged to the parent constructor.
 *
 * @param {Object} [options] - option overrides; missing keys are filled in
 *    from WordPOS.defaults
 */
var WordPOS = function(options) {
  var hasOptions = _.isObject(options);

  if (hasOptions || arguments.length === 0) {
    WordPOS.super_.call(this, WNdb.path);
  } else {
    WordPOS.super_.apply(this, arguments);
  }

  // Always build a fresh object so that WordPOS.defaults is never mutated.
  this.options = _.defaults({}, hasOptions ? options : {}, WordPOS.defaults);
};
// Make WordPOS inherit from natural's WordNet. util.inherits() also sets
// WordPOS.super_, which the constructor uses to invoke the parent constructor.
util.inherits(WordPOS, WordNet);
// Default option values; the constructor fills in any option the caller omits.
WordPOS.defaults = {
/**
* enable profiling, time in msec returned as second argument in callback
*/
profile: false
};
// Shorthand for the prototype, taken after util.inherits() has linked it to
// WordNet; the instance methods below are attached through this alias.
var wordposProto = WordPOS.prototype;
// fast POS lookups (only look in specified file)
/**
 * lookupX()
 * Look up the definitions of a word whose POS is already known.
 *
 * @param string word - word to look up in the given POS
 * @param function callback - receives array of definition objects or empty;
 *    with the `profile` option on, elapsed msec is passed as a second argument
 * @return none
 */
_.each({
  lookupAdjective: 'a',
  lookupAdverb: 'r',
  lookupNoun: 'n',
  lookupVerb: 'v'
}, function(pos, method) {
  wordposProto[method] = lookup(pos);
});
/**
 * isX()
 * Test whether a word belongs to the given POS.
 *
 * @param string word - word to test for the given POS
 * @param function callback - receives true or false; with the `profile`
 *    option on, elapsed msec is passed as a second argument
 * @param boolean _noprofile - (internal) truthy to suppress the timing argument
 * @return none
 */
_.each({
  isAdjective: 'a',
  isAdverb: 'r',
  isNoun: 'n',
  isVerb: 'v'
}, function(pos, method) {
  wordposProto[method] = is(pos);
});
/**
 * getX()
 * Find all words in a string that belong to the given POS.
 *
 * @param string text - words to search
 * @param function callback - receives array of the words that are the given
 *    POS; with the `profile` option on, elapsed msec is passed as a second
 *    argument
 * @return number - count of words tested (0 if the text yields none)
 */
_.each({
  getAdjectives: 'isAdjective',
  getAdverbs: 'isAdverb',
  getNouns: 'isNoun',
  getVerbs: 'isVerb'
}, function(isFn, method) {
  wordposProto[method] = get(isFn);
});
// parse(text) exposes the same word preparation the getX() methods use:
// tokenize, drop duplicates, drop stopwords.
wordposProto.parse = prepText;
// Define getIndexFile() only when the prototype chain does not already
// provide one.
if (!wordposProto.getIndexFile) {
  /**
   * Map a part-of-speech code to the matching index file of this instance.
   *
   * @param string pos - 'n', 'v', 'a', 's' or 'r'
   * @return the nounIndex / verbIndex / adjIndex / advIndex property of the
   *    instance, or undefined for any other code
   */
  wordposProto.getIndexFile = function getIndexFile(pos) {
    if (pos === 'n') {
      return this.nounIndex;
    }
    if (pos === 'v') {
      return this.verbIndex;
    }
    // 'a' and 's' share the adjective index.
    if (pos === 'a' || pos === 's') {
      return this.adjIndex;
    }
    if (pos === 'r') {
      return this.advIndex;
    }
  };
}
/**
 * getPOS()
 * Find all POS for all words in given string
 *
 * Every word produced by prepText() is tested against all four isX()
 * methods. A word is added to each POS list it matches, or to `rest` when it
 * matches none.
 *
 * @param string text - words to look up for POS
 * @param function callback - receives object with words broken into POS or 'rest':
 *    Object: {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}
 *    With the `profile` option on, elapsed msec is passed as a second argument.
 * @return number - count of words tested (0 if the text yields none)
 */
wordposProto.getPOS = function(text, callback) {
  var data = {nouns: [], verbs: [], adjectives: [], adverbs: [], rest: []},
      profile = this.options.profile,
      start = profile && new Date(),
      args = [data],
      testFns = ['isNoun', 'isVerb', 'isAdjective', 'isAdverb'],
      parts = ['nouns', 'verbs', 'adjectives', 'adverbs'],
      words = prepText(text),
      nTests = testFns.length,
      nWords = words.length,
      self = this,
      wordsDone = 0;

  // Report the collected data, plus total elapsed time when profiling.
  function done() {
    if (profile) {
      args[1] = new Date() - start;
    }
    callback.apply(null, args);
  }

  // Run every POS test for one word and file the word under each match.
  function classifyWord(word) {
    var any = false,
        testsDone = 0;

    testFns.forEach(function(isFn, i) {
      self[isFn](word, function(yes) {
        if (yes) {
          data[parts[i]].push(word);
          any = true;
        }
        testsDone += 1;
        if (testsDone === nTests) {
          if (!any) {
            data.rest.push(word);
          }
          wordsDone += 1;
          if (wordsDone === nWords) {
            done();
          }
        }
      // isX() is used internally here, so suppress its per-call profiling,
      // as the getX() methods do; only the overall time is reported.
      }, /*_noprofile*/ true);
    });
  }

  if (!nWords) {
    // Nothing to test: still report asynchronously, with empty lists.
    process.nextTick(done);
    return 0;
  }

  words.forEach(classifyWord);
  return nWords;
};
// Expose the WNdb module on the constructor (WNdb.path is the default
// dictionary location the constructor passes to WordNet).
WordPOS.WNdb = WNdb;
// The module's export is the WordPOS constructor itself.
module.exports = WordPOS;