wordpos/wordpos.js

181 lines
4.2 KiB
JavaScript
Raw Normal View History

2012-05-04 19:23:28 +00:00
/**
 * wordpos
 *
 * Node.js part-of-speech utilities using natural's WordNet module.
 *
 * Copyright (c) 2012 mooster@42at.com
 * Released under MIT license
 */
// Module dependencies plus the shared tokenizer/stopword state used by the
// helper functions in this file.
var _ = require('underscore')._,
  util = require('util'),
  natural = require('natural'),
  WordNet = natural.WordNet,
  // single tokenizer instance, reused by prepText()
  tokenizer = new natural.WordTokenizer(),
  // natural's stopwords joined into one space-delimited string, padded with a
  // space at both ends so isStopword() can match whole words with indexOf()
  stopwords = ' ' + natural.stopwords.join(' ') + ' ';
2012-05-02 23:18:10 +00:00
/**
 * Normalize a word before it is looked up: lower-case it and collapse every
 * run of whitespace into a single underscore.
 *
 * @param {string} word - word (or multi-word phrase) to normalize
 * @return {string} normalized form, e.g. 'Ice  Cream' -> 'ice_cream'
 */
function normalize(word) {
  var lowered = word.toLowerCase();
  // splitting on whitespace runs and re-joining with '_' replaces each run
  // (including leading/trailing ones) with exactly one underscore
  return lowered.split(/\s+/).join('_');
}
/**
 * Test whether a word is in the stopword list.
 *
 * The word is wrapped in single spaces and searched for in the
 * space-delimited `stopwords` string, so only whole entries match.
 * The comparison is case-sensitive: the word is not lower-cased first.
 *
 * @param {string} word - token to test
 * @return {boolean} true if the word is a stopword
 */
function isStopword(word) {
  var needle = ' ' + word + ' ';
  return stopwords.indexOf(needle) >= 0;
}
/**
 * Split text into the list of words to examine.
 *
 * The text is tokenized, duplicate tokens are dropped, and tokens that
 * isStopword() reports as stopwords are removed.
 *
 * @param {string} text - raw input text
 * @return {Array} unique, non-stopword tokens
 */
function prepText(text) {
  var tokens = tokenizer.tokenize(text),
    uniqueTokens = _.uniq(tokens);

  return _.reject(uniqueTokens, isStopword);
}
/**
 * Build a lookupX() method that searches only the files of one POS.
 *
 * The returned function is meant to be installed on the prototype: it uses
 * `this.getIndexFile()`, `this.getDataFile()` and `this.lookupFromFiles()`.
 *
 * @param {string} pos - POS code handed to getIndexFile()/getDataFile()
 * @return {function} method taking (word, callback); the word is normalized
 *   and the callback is passed straight through to lookupFromFiles()
 */
function lookup(pos) {
  return function(word, callback) {
    var key = normalize(word),
      // restrict the search to the single index/data file pair for this POS
      files = [
        {index: this.getIndexFile(pos), data: this.getDataFile(pos)}
      ];

    this.lookupFromFiles(files, [], key, callback);
  };
}
/**
 * Build an isX() method that tests whether a word exists in one POS index.
 *
 * The returned function is meant to be installed on the prototype: it uses
 * `this.getIndexFile()` to obtain the index to query.
 *
 * @param {string} pos - POS code handed to getIndexFile()
 * @return {function} method taking (word, callback); the callback receives
 *   true when the index lookup yields a record and false otherwise
 */
function is(pos) {
  return function(word, callback) {
    var indexFile = this.getIndexFile(pos),
      key = normalize(word);

    indexFile.lookup(key, function(record) {
      // only existence matters here, so coerce the record to a boolean
      callback(Boolean(record));
    });
  };
}
2012-05-04 19:23:28 +00:00
/**
 * Build a getX() method that collects every word of a text passing an
 * isX() test.
 *
 * @param {string} isFn - name of the isX() method on `this` to apply to each word
 * @return {function} method taking (text, callback); the callback receives
 *   the array of matching words once every word has been tested. Words are
 *   appended as their individual tests report back, so the order follows
 *   test completion rather than necessarily the order in the text.
 */
function get(isFn) {
  return function(text, callback) {
    var self = this,
      words = prepText(text),
      remaining = words.length,
      matches = [];

    // nothing to test: report the empty result immediately
    if (remaining === 0) {
      return callback(matches);
    }

    words.forEach(function(word) {
      self[isFn](word, function(yes) {
        if (yes) {
          matches.push(word);
        }
        // fire the callback exactly once, after the last outstanding test
        remaining -= 1;
        if (remaining === 0) {
          callback(matches);
        }
      });
    });
  };
}
/**
 * WordPOS constructor.
 *
 * Inherits from natural's WordNet; every constructor argument is forwarded
 * unchanged to the parent constructor.
 */
var WordPOS = function() {
  // super_ is attached to WordPOS by the util.inherits() call below
  WordPOS.super_.apply(this, arguments);
};
util.inherits(WordPOS, WordNet);

// shorthand used for attaching the POS methods to the prototype
var wordposProto = WordPOS.prototype;
// fast POS lookups (only look in specified file)

/**
 * lookupX()
 * Look up a word's definitions when its POS is already known
 *
 * @param {string} word - word to lookup in given POS
 * @param {function} callback - receives array of definition objects or empty
 * @return none
 */
wordposProto.lookupAdjective = lookup('a');
wordposProto.lookupAdverb = lookup('r');
wordposProto.lookupNoun = lookup('n');
wordposProto.lookupVerb = lookup('v');
2012-05-04 19:23:28 +00:00
2012-05-02 23:18:10 +00:00
/**
 * isX()
 * Test if word is given POS
 *
 * @param {string} word - word to test for given POS
 * @param {function} callback - receives true or false if word is given POS
 * @return none
 */
wordposProto.isAdjective = is('a');
wordposProto.isAdverb = is('r');
wordposProto.isNoun = is('n');
wordposProto.isVerb = is('v');
2012-05-02 23:18:10 +00:00
/**
 * getX()
 * Find all words in string that are given POS
 *
 * @param {string} text - words to search
 * @param {function} callback - receives array of words that are given POS
 * @return none
 */
wordposProto.getAdjectives = get('isAdjective');
wordposProto.getAdverbs = get('isAdverb');
wordposProto.getNouns = get('isNoun');
wordposProto.getVerbs = get('isVerb');
// Fallback: only installed when the prototype chain does not already
// provide a getIndexFile() method.
if (!wordposProto.getIndexFile) {
  /**
   * Map a POS code to the corresponding index file held on the instance.
   *
   * @param {string} pos - 'n', 'v', 'a' (or 's'), or 'r'
   * @return the matching index property of `this`, or undefined for any
   *   other code
   */
  wordposProto.getIndexFile = function getIndexFile(pos) {
    if (pos === 'n') {
      return this.nounIndex;
    }
    if (pos === 'v') {
      return this.verbIndex;
    }
    // 's' shares the adjective index with 'a'
    if (pos === 'a' || pos === 's') {
      return this.adjIndex;
    }
    if (pos === 'r') {
      return this.advIndex;
    }
  };
}
2012-05-02 23:18:10 +00:00
/**
 * getPOS()
 * Find all POS for all words in given string
 *
 * Each word is passed through normalize() before it is tested, and it is the
 * normalized form that is stored in the result. A word may appear under
 * several POS keys; a word matching none of them is stored under 'rest'.
 *
 * @param {string} text - words to lookup for POS
 * @param {function} callback - receives object with words broken into POS or 'rest':
 *    Object: {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]}
 * @return none
 */
wordposProto.getPOS = function(text, callback) {
  var data = {nouns:[], verbs:[], adjectives:[], adverbs:[], rest:[]},
    // testFns[i] decides membership of data[parts[i]]; keep the two lists aligned
    testFns = 'isNoun isVerb isAdjective isAdverb'.split(' '),
    parts = 'nouns verbs adjectives adverbs'.split(' '),
    words = prepText(text),
    nTests = testFns.length,
    nWords = words.length,
    self = this,
    wordsDone = 0;

  // nothing to classify: report the empty result immediately
  if (!nWords) return callback(data);
  words.forEach(classifyWord);

  // Run every isX() test on one word and file the word under each POS it matches.
  function classifyWord(word) {
    var matched = false,
      testsDone = 0;

    word = normalize(word);
    testFns.forEach(function(isFn, i) {
      self[isFn](word, function(yes) {
        if (yes) {
          data[parts[i]].push(word);
          matched = true;
        }
        testsDone += 1;
        // all tests for this word have reported back
        if (testsDone === nTests) {
          if (!matched) {
            data.rest.push(word);
          }
          wordDone();
        }
      });
    });
  }

  // Count finished words and fire the callback once, after the last one.
  function wordDone() {
    wordsDone += 1;
    if (wordsDone === nWords) {
      callback(data);
    }
  }
};
// Expose the WordPOS constructor as this module's export.
module.exports = WordPOS;