2012-05-04 19:23:28 +00:00
|
|
|
/**
 * wordpos
 *
 * Node.js part-of-speech utilities using natural's WordNet module.
 *
 * Copyright (c) 2012 mooster@42at.com
 * Released under MIT license
 */
|
|
|
|
|
|
|
|
// Module dependencies and the module-level singletons shared by the helpers below.
var _ = require('underscore')._,
  util = require('util'),
  natural = require('natural'),
  WordNet = natural.WordNet,
  // a single tokenizer instance is reused by every prepText() call
  tokenizer = new natural.WordTokenizer(),
  // stopwords flattened into one space-delimited, space-padded string so that
  // isStopword() can test membership with a whole-word substring search
  stopwords = ' '+ natural.stopwords.join(' ') +' ',
  WNdb = require('WNdb');
|
2012-05-02 23:18:10 +00:00
|
|
|
|
|
|
|
/**
 * Normalize a word before it is handed to an index lookup:
 * lower-case it and replace every run of whitespace with a single underscore.
 *
 * @param string word - word or phrase to normalize (must be a string)
 * @return string the normalized word
 */
function normalize(word) {
  return word.toLowerCase().replace(/\s+/g, '_');
}
|
|
|
|
|
|
|
|
/**
 * Test whether a word appears in the module-level stopword string.
 * The comparison is case-sensitive: the word is searched for exactly as given.
 *
 * @param string word - word to test
 * @return boolean true if the word is a stopword
 */
function isStopword(word) {
  // both the list and the needle are space-padded, so only whole entries match
  return stopwords.indexOf(' '+word+' ') >= 0;
}
|
|
|
|
|
|
|
|
/**
 * Turn raw text into the list of words to be tested:
 * tokenize, drop duplicate tokens, then drop stopwords.
 *
 * @param string text - text to split into words
 * @return array unique, non-stopword tokens
 */
function prepText(text) {
  var tokens = tokenizer.tokenize(text),
    unique = _.uniq(tokens);
  return _.reject(unique, isStopword);
}
|
|
|
|
|
|
|
|
/**
 * Factory for the lookupX() prototype methods.
 * The returned function must be called with `this` bound to a WordPOS instance;
 * it searches only the index/data file pair of the given POS.
 *
 * @param string pos - POS key passed to getIndexFile()/getDataFile()
 * @return function (word, callback) - callback receives the lookup results and,
 *         when the `profile` option is on, the elapsed time in msec as second argument
 */
function lookup(pos) {
  return function(word, callback) {
    var profile = this.options.profile,
      start = profile && new Date(),
      args = [],
      key = normalize(word),
      files = [
        {index: this.getIndexFile(pos), data: this.getDataFile(pos)}
      ];

    // second argument is presumably the initial results accumulator -- confirm against natural's WordNet
    this.lookupFromFiles(files, [], key, function(results) {
      args.push(results);
      if (profile) {
        args.push(new Date() - start);
      }
      callback.apply(null, args);
    });
  };
}
|
|
|
|
|
|
|
|
/**
 * Factory for the isX() prototype methods.
 * The returned function must be called with `this` bound to a WordPOS instance;
 * it looks the normalized word up in the index file of the given POS only.
 *
 * @param string pos - POS key passed to getIndexFile()
 * @return function (word, callback, _noprofile) - callback receives true/false and,
 *         when profiling is active, the elapsed time in msec as second argument
 */
function is(pos){
  return function(word, callback, _noprofile) {
    // disable profiling when isX() used internally
    var profile = this.options.profile && !_noprofile,
      start = profile && new Date(),
      args = [],
      index = this.getIndexFile(pos),
      key = normalize(word);

    index.lookup(key, function(record) {
      // any truthy record counts as a hit
      args.push(!!record);
      if (profile) {
        args.push(new Date() - start);
      }
      callback.apply(null, args);
    });
  };
}
|
|
|
|
|
2012-05-04 19:23:28 +00:00
|
|
|
/**
 * Factory for the getX() prototype methods.
 * The returned function must be called with `this` bound to a WordPOS instance;
 * it runs the named isX() method on every prepared word of the text and
 * collects the words for which it answered true.
 *
 * @param string isFn - name of the isX() method to apply to each word
 * @return function (text, callback) - callback receives the array of matching words
 *         (in the order their lookups complete) and, when the `profile` option is on,
 *         the elapsed time in msec as second argument
 */
function get(isFn) {
  return function(text, callback) {
    var self = this,
      profile = this.options.profile,
      start = profile && new Date(),
      words = prepText(text),
      n = words.length,
      finished = 0,
      results = [],
      args = [results];

    // reserve the timing slot up front; it stays 0 when there is nothing to look up
    if (profile) {
      args.push(0);
    }
    if (!n) return callback.apply(null, args);

    words.forEach(function(word) {
      self[isFn](word, function(yes) {
        if (yes) {
          results.push(word);
        }
        // fire the callback once, after the last outstanding lookup has answered
        if (++finished === n) {
          if (profile) {
            args[1] = new Date() - start;
          }
          callback.apply(null, args);
        }
      }, /*_noprofile*/ true);
    });
  };
}
|
|
|
|
|
2012-05-08 04:41:46 +00:00
|
|
|
/**
 * @class WordPOS
 * @constructor
 *
 * Called with no arguments or with an options object, the base constructor is
 * invoked with WNdb.path; any other arguments are forwarded to the base
 * constructor unchanged.
 *
 * @param object options - optional settings; missing keys are filled in from WordPOS.defaults
 */
var WordPOS = function(options) {
  var hasOptions = _.isObject(options);

  if (arguments.length === 0 || hasOptions) {
    WordPOS.super_.call(this, WNdb.path);
  } else {
    WordPOS.super_.apply(this, arguments);
  }

  // copy into a fresh object so neither the caller's options nor the defaults are mutated
  this.options = _.defaults({}, hasOptions && options || {}, WordPOS.defaults);
};
util.inherits(WordPOS, WordNet);
|
|
|
|
|
2012-05-08 04:41:46 +00:00
|
|
|
/**
 * Default option values, used for every option the caller does not supply.
 */
WordPOS.defaults = {
  /**
   * enable profiling, time in msec returned as second argument in callback
   */
  profile: false
};
|
|
|
|
|
2012-05-02 23:18:10 +00:00
|
|
|
var wordposProto = WordPOS.prototype;

// fast POS lookups (only look in specified file)
/**
 * lookupX()
 * Look up a word's definitions when its POS is already known
 *
 * @param string word - word to lookup in given POS
 * @param function callback receives array of definition objects or empty
 * @return none
 */
wordposProto.lookupAdjective = lookup('a');
wordposProto.lookupAdverb = lookup('r');
wordposProto.lookupNoun = lookup('n');
wordposProto.lookupVerb = lookup('v');
|
2012-05-04 19:23:28 +00:00
|
|
|
|
2012-05-02 23:18:10 +00:00
|
|
|
/**
 * isX()
 * Test if word is given POS
 *
 * @param string word - word to test for given POS
 * @param function callback - receives true if the word is the given POS, false otherwise
 * @return none
 */
wordposProto.isAdjective = is('a');
wordposProto.isAdverb = is('r');
wordposProto.isNoun = is('n');
wordposProto.isVerb = is('v');
|
2012-05-02 23:18:10 +00:00
|
|
|
|
|
|
|
/**
 * getX()
 * Find all words in string that are given POS
 *
 * @param string text - words to search
 * @param function callback - receives array of words that are given POS
 * @return none
 */
wordposProto.getAdjectives = get('isAdjective');
wordposProto.getAdverbs = get('isAdverb');
wordposProto.getNouns = get('isNoun');
wordposProto.getVerbs = get('isVerb');
|
|
|
|
|
|
|
|
// Installed only when the prototype (including anything it inherits) has no
// getIndexFile() of its own.
if (!wordposProto.getIndexFile) {
  /**
   * Map a POS key to the matching index object on this instance.
   *
   * @param string pos - 'n', 'v', 'a' or 's', 'r'
   * @return the index for that POS, or undefined for any other key
   */
  wordposProto.getIndexFile = function getIndexFile(pos) {
    switch(pos) {
      case 'n':
        return this.nounIndex;
      case 'v':
        return this.verbIndex;
      // 's' shares the adjective index with 'a'
      case 'a': case 's':
        return this.adjIndex;
      case 'r':
        return this.advIndex;
    }
  };
}
|
2012-05-02 23:18:10 +00:00
|
|
|
|
|
|
|
/**
 * getPOS()
 * Find all POS for all words in given string
 *
 * Every prepared word is tested against all four isX() methods. A word is
 * added to each POS list that matches, or to 'rest' when none match. Words
 * are reported in normalized form, each at most once per list.
 *
 * @param string text - words to lookup for POS
 * @param function callback - receives object with words broken into POS or 'rest':
 *    Object: {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}
 *    and, when the `profile` option is on, the elapsed time in msec as second argument
 * @return none
 */
wordposProto.getPOS = function(text, callback) {
  var data = {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]},
    profile = this.options.profile,
    start = profile && new Date(),
    args = [data],
    testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
    parts = 'nouns verbs adjectives adverbs'.split(' '),
    // normalize BEFORE de-duplicating: prepText() only removes exact duplicates,
    // so "Bear" and "bear" would otherwise both be reported as "bear"
    words = _.uniq(_.map(prepText(text), normalize)),
    nTests = testFns.length,
    nWords = words.length,
    self = this,
    wordsDone = 0;

  // reserve the timing slot up front; it stays 0 when there is nothing to look up
  if (profile) {
    args.push(0);
  }
  if (!nWords) return callback.apply(null, args);

  words.forEach(classifyWord);

  // Run all POS tests for one word, then report the word as finished.
  function classifyWord(word) {
    var any = false,
      testsDone = 0;

    testFns.forEach(function(isFn, i) {
      // testFns[i] and parts[i] describe the same POS
      self[isFn](word, function(yes) {
        if (yes) {
          data[parts[i]].push(word);
          any = true;
        }
        if (++testsDone === nTests) {
          if (!any) {
            data.rest.push(word);
          }
          wordFinished();
        }
      }, /*_noprofile*/ true);
    });
  }

  // Fire the callback once, after the last word has finished all its tests.
  function wordFinished() {
    if (++wordsDone === nWords) {
      if (profile) {
        args[1] = new Date() - start;
      }
      callback.apply(null, args);
    }
  }
};
|
|
|
|
|
|
|
|
// Public API: the module's export is the WordPOS constructor itself.
module.exports = WordPOS;
|