2012-05-04 19:23:28 +00:00
|
|
|
/**
|
2012-05-02 23:18:10 +00:00
|
|
|
* wordpos
|
2012-05-04 19:23:28 +00:00
|
|
|
*
|
|
|
|
* Node.js part-of-speech utilities using natural's WordNet module.
|
|
|
|
*
|
2012-05-02 23:18:10 +00:00
|
|
|
* Copyright (c) 2012 mooster@42at.com
|
|
|
|
* Released under MIT license
|
|
|
|
*/
|
|
|
|
|
|
|
|
var _ = require('underscore')._,
|
2012-05-04 19:23:28 +00:00
|
|
|
util = require('util'),
|
|
|
|
natural = require('natural'),
|
|
|
|
WordNet = natural.WordNet,
|
|
|
|
tokenizer = new natural.WordTokenizer(),
|
2012-05-06 09:44:21 +00:00
|
|
|
stopwords = ' '+ natural.stopwords.join(' ') +' ',
|
|
|
|
WNdb = require('WNdb');
|
2012-05-02 23:18:10 +00:00
|
|
|
|
|
|
|
/**
 * Normalize a word for lookup: lower-case it and replace every run of
 * whitespace with a single underscore.
 *
 * @param  string word - word or phrase to normalize
 * @return string the normalized form
 */
function normalize(word) {
  var lowered = word.toLowerCase();
  // Splitting on whitespace runs and re-joining with '_' is equivalent to a
  // global replace of /\s+/ with '_' (leading/trailing runs included).
  return lowered.split(/\s+/).join('_');
}
|
|
|
|
|
|
|
|
/**
 * Test whether a word appears in the module-level `stopwords` string.
 *
 * `stopwords` is a single space-delimited string with a leading and trailing
 * space, so padding the word with spaces matches whole entries only.
 * The comparison is case-sensitive.
 *
 * @param  string word - word to test
 * @return boolean true if the word is a stopword
 */
function isStopword(word) {
  var padded = ' ' + word + ' ';
  return stopwords.indexOf(padded) !== -1;
}
|
|
|
|
|
|
|
|
/**
 * Split text into the list of words to be examined: tokenize it, drop
 * duplicate tokens, then drop stopwords.
 *
 * @param  string text - text to prepare
 * @return array unique, non-stopword tokens
 */
function prepText(text) {
  var tokens = tokenizer.tokenize(text);
  var distinct = _.uniq(tokens);
  return _.reject(distinct, isStopword);
}
|
|
|
|
|
|
|
|
/**
 * Build a lookupX() method bound to one part of speech.
 *
 * The returned function must be invoked with `this` set to an object that
 * provides getIndexFile(), getDataFile() and lookupFromFiles(); it normalizes
 * the word and searches only the index/data file pair for `pos`.
 *
 * @param  string pos - POS code passed to getIndexFile()/getDataFile()
 * @return function (word, callback) lookup method
 */
function lookup(pos) {
  return function(word, callback) {
    var key = normalize(word),
      source = {index: this.getIndexFile(pos), data: this.getDataFile(pos)};

    this.lookupFromFiles([source], [], key, callback);
  };
}
|
|
|
|
|
|
|
|
/**
 * Build an isX() method bound to one part of speech.
 *
 * The returned function must be invoked with `this` set to an object that
 * provides getIndexFile(); it looks the normalized word up in the index for
 * `pos` and reports whether a record was found.
 *
 * @param  string pos - POS code passed to getIndexFile()
 * @return function (word, callback) test method; callback receives true/false
 */
function is(pos){
  return function(word, callback) {
    var indexFile = this.getIndexFile(pos),
      key = normalize(word);

    indexFile.lookup(key, function(entry) {
      callback(Boolean(entry));
    });
  };
}
|
|
|
|
|
2012-05-04 19:23:28 +00:00
|
|
|
/**
 * Build a getX() method on top of the named isX() method.
 *
 * The returned function prepares the text with prepText(), runs
 * this[isFn]() on every resulting word, and once every test has called back
 * passes the matching words to `callback`.
 *
 * Matches are reported in the order the words appear in the prepared text,
 * regardless of the order in which the individual tests complete.
 *
 * @param  string isFn - name of the test method to call on `this` (e.g. 'isNoun')
 * @return function (text, callback) getter; callback receives array of words
 */
function get(isFn) {
  return function(text, callback) {
    var words = prepText(text),
      remaining = words.length,
      matched = new Array(words.length),
      self = this;

    if (!remaining) return callback([]);

    words.forEach(function(word, idx) {
      self[isFn](word, function(yes) {
        // Record the answer by position so that completion order of the
        // individual tests cannot reorder the output.
        matched[idx] = !!yes;

        if (--remaining === 0) {
          callback(words.filter(function(w, k) {
            return matched[k];
          }));
        }
      });
    });
  };
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * WordPOS constructor.
 *
 * Delegates to the parent (WordNet) constructor. When called with no
 * arguments the parent receives WNdb.path; otherwise all arguments are
 * forwarded unchanged.
 */
var WordPOS = function() {
  var parentArgs = arguments.length ? arguments : [WNdb.path];
  WordPOS.super_.apply(this, parentArgs);
};

// WordPOS extends WordNet; util.inherits also sets WordPOS.super_.
util.inherits(WordPOS, WordNet);

// Shorthand used below when attaching methods.
var wordposProto = WordPOS.prototype;
|
|
|
|
|
|
|
|
// Fast POS lookups (each one only looks in the file for the specified POS)
/**
 * lookupX()  -- lookupAdjective, lookupAdverb, lookupNoun, lookupVerb
 * Look up a word's definition when its POS is already known
 *
 * @param string word - word to look up in the given POS
 * @param function callback - receives array of definition objects or empty
 * @return none
 */
'Adjective:a Adverb:r Noun:n Verb:v'.split(' ').forEach(function(spec) {
  var pair = spec.split(':');   // [method suffix, POS code]
  wordposProto['lookup' + pair[0]] = lookup(pair[1]);
});
|
2012-05-04 19:23:28 +00:00
|
|
|
|
2012-05-02 23:18:10 +00:00
|
|
|
/**
 * isX()  -- isAdjective, isAdverb, isNoun, isVerb
 * Test whether a word is the given POS
 *
 * @param string word - word to test for the given POS
 * @param function callback - receives true or false
 * @return none
 */
'Adjective:a Adverb:r Noun:n Verb:v'.split(' ').forEach(function(spec) {
  var pair = spec.split(':');   // [method suffix, POS code]
  wordposProto['is' + pair[0]] = is(pair[1]);
});
|
2012-05-02 23:18:10 +00:00
|
|
|
|
|
|
|
/**
 * getX()  -- getAdjectives, getAdverbs, getNouns, getVerbs
 * Find all words in a string that are the given POS
 *
 * @param string text - words to search
 * @param function callback - receives array of words that are the given POS
 * @return none
 */
'Adjective Adverb Noun Verb'.split(' ').forEach(function(pos) {
  // e.g. getNouns is built on top of isNoun
  wordposProto['get' + pos + 's'] = get('is' + pos);
});
|
|
|
|
|
|
|
|
// Provide getIndexFile() only if the prototype chain does not already have one.
if (!wordposProto.getIndexFile) {
  /**
   * Map a POS code to the matching index object stored on the instance.
   *
   * @param  string pos - 'n', 'v', 'a' / 's', or 'r'
   * @return the index for that POS, or undefined for any other code
   */
  wordposProto.getIndexFile = function getIndexFile(pos) {
    if (pos === 'n') return this.nounIndex;
    if (pos === 'v') return this.verbIndex;
    if (pos === 'a' || pos === 's') return this.adjIndex;
    if (pos === 'r') return this.advIndex;
  };
}
|
2012-05-02 23:18:10 +00:00
|
|
|
|
|
|
|
/**
 * getPOS()
 * Find all POS for all words in given string
 *
 * Words are taken from prepText(), normalized, and de-duplicated after
 * normalization, so case variants of the same word ("Bear", "bear") are
 * reported once. Every word is tested with isNoun, isVerb, isAdjective and
 * isAdverb; a word may appear under several POS, and a word matching none of
 * them is placed in 'rest'. Within each array the words follow their order
 * in the text, independent of the order in which the tests complete.
 *
 * @param string text - words to lookup for POS
 * @param function callback - receives object with words broken into POS or 'rest':
 *    Object: {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}
 * @return none
 */
wordposProto.getPOS = function(text, callback) {
  var data = {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]},
    testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
    parts = 'nouns verbs adjectives adverbs'.split(' '),
    seen = Object.create(null),   // no prototype, so any word is a safe key
    words = [],
    self = this,
    matches,
    pending;

  // Normalize first, then de-duplicate: prepText() only removes tokens that
  // are identical before normalization.
  prepText(text).forEach(function(raw) {
    var word = normalize(raw);
    if (!seen[word]) {
      seen[word] = true;
      words.push(word);
    }
  });

  if (!words.length) return callback(data);

  // matches[w][t] holds the answer of test t for word w.
  matches = words.map(function() { return []; });
  pending = words.length * testFns.length;

  // Build the result in text order once every test has reported.
  function finish() {
    words.forEach(function(word, w) {
      var any = false;
      parts.forEach(function(part, t) {
        if (matches[w][t]) {
          data[part].push(word);
          any = true;
        }
      });
      if (!any) data.rest.push(word);
    });
    callback(data);
  }

  words.forEach(function(word, w) {
    testFns.forEach(function(isFn, t) {
      self[isFn](word, function(yes) {
        matches[w][t] = !!yes;
        if (--pending === 0) finish();
      });
    });
  });
};
|
|
|
|
|
|
|
|
module.exports = WordPOS;
|