refactor node version

This commit is contained in:
Moos 2018-10-14 22:31:45 -07:00
parent 359ada5e8a
commit 31422eafcf
5 changed files with 50 additions and 325 deletions

View File

@ -1,7 +1,7 @@
/*!
* dataFile.js
*
* Copyright (c) 2012-2018 mooster@42at.com
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Portions: Copyright (c) 2011, Chris Umbel
@ -11,7 +11,11 @@
var fs = require('fs'),
path = require('path'),
_ = require('underscore');
_ = require('underscore'),
{
lineDataToJSON,
LEX_NAMES
} = require('../common');
/**
* sanity check read data - line must start with zero-padded location
@ -25,64 +29,6 @@ function dataCheck(line, location) {
return line.indexOf(padded) === 0;
}
/**
 * Parse a single WordNet data-file line into a plain synset record.
 *
 * The text before the first '| ' is split on whitespace into fields; the text
 * after it is the gloss, which is further split on '; ' into a definition
 * (first segment) and examples (remaining segments, with double quotes and
 * runs of two or more whitespace characters removed).
 *
 * @param line {string} - a single line from WordNet data file
 * @param location {number} - location the line was read from; passed to
 *    dataCheck() to validate the line before parsing
 * @returns {object|Error} - the parsed record, or an Error instance (returned,
 *    not thrown) when dataCheck() rejects the line
 *
 * Credit for this routine to https://github.com/NaturalNode/natural
 */
function lineDataToJSON(line, location) {
  if (!dataCheck(line, location)) return new Error('Bad data at location ' + location);

  const data = line.split('| ');
  const tokens = data[0].split(/\s+/);
  // A line without the '| ' separator has no gloss; use an empty gloss
  // instead of failing on an undefined value further down.
  const gloss = data[1] === undefined ? '' : data[1];

  // The word count field is read as hexadecimal.
  const wCnt = parseInt(tokens[3], 16);
  const synonyms = [];
  for (let i = 0; i < wCnt; i++) {
    // words occupy every second token starting at index 4
    synonyms.push(tokens[4 + i * 2]);
  }

  // The pointer count follows the word list; parse it once, not per iteration.
  const ptrOffset = (wCnt - 1) * 2 + 6;
  const ptrCount = parseInt(tokens[ptrOffset], 10);
  const ptrs = [];
  for (let i = 0; i < ptrCount; i++) {
    const base = ptrOffset + i * 4; // each pointer spans four tokens
    ptrs.push({
      pointerSymbol: tokens[base + 1],
      synsetOffset: parseInt(tokens[base + 2], 10),
      pos: tokens[base + 3],
      sourceTarget: tokens[base + 4]
    });
  }

  // break "gloss" into definition vs. examples
  const glossArray = gloss.split("; ");
  const definition = glossArray[0];
  const examples = glossArray.slice(1).map(function (example) {
    return example.replace(/\"/g, '').replace(/\s\s+/g, '');
  });
  const lexFilenum = parseInt(tokens[1], 10);

  return {
    synsetOffset: parseInt(tokens[0], 10),
    lexFilenum: lexFilenum,
    lexName: DataFile.LEX_NAMES[ lexFilenum ],
    pos: tokens[2],
    wCnt: wCnt,
    lemma: tokens[4],
    synonyms: synonyms,
    lexId: tokens[5],
    ptrs: ptrs,
    gloss: gloss,
    def: definition,
    exp: examples
  };
}
/**
* read data file at location (bound to a data file).
* Reads nominal length and checks for EOL. Continue reading until EOL.
@ -98,6 +44,7 @@ function readLocation(location, callback) {
len = file.nominalLineLength,
buffer = new Buffer.alloc(len);
location = Number(location);
readChunk(location, function(err, count) {
if (err) {
//console.log(err);
@ -105,7 +52,11 @@ function readLocation(location, callback) {
return;
}
//console.log(' read %d bytes at <%d>', count, location);
callback(null, lineDataToJSON(str, location));
callback(null, function() {
if (!dataCheck(str, location)) return new Error('Bad data at location ' + location);
lineDataToJSON(str, location)
});
});
function readChunk(pos, cb) {
@ -213,7 +164,6 @@ function promisifyInto(collect) {
}
}
/**
* DataFile class
*
@ -258,55 +208,8 @@ DataFile.MAX_LINE_LENGTH = {
/**
* map of lexFilenum to lex names
*
* The array index is the numeric lexFilenum; the value is the lex name stored
* for that number. The table holds 45 entries (indexes 0 through 44).
*
* @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html
* @type {string[]}
*/
DataFile.LEX_NAMES = [
// 0-2: adjective and adverb entries
'adj.all',
'adj.pert',
'adv.all',
// 3-28: noun entries
'noun.Tops',
'noun.act',
'noun.animal',
'noun.artifact',
'noun.attribute',
'noun.body',
'noun.cognition',
'noun.communication',
'noun.event',
'noun.feeling',
'noun.food',
'noun.group',
'noun.location',
'noun.motive',
'noun.object',
'noun.person',
'noun.phenomenon',
'noun.plant',
'noun.possession',
'noun.process',
'noun.quantity',
'noun.relation',
'noun.shape',
'noun.state',
'noun.substance',
'noun.time',
// 29-43: verb entries
'verb.body',
'verb.change',
'verb.cognition',
'verb.communication',
'verb.competition',
'verb.consumption',
'verb.contact',
'verb.creation',
'verb.emotion',
'verb.motion',
'verb.perception',
'verb.possession',
'verb.social',
'verb.stative',
'verb.weather',
// 44: one further adjective entry at the end of the table
'adj.ppl'
];
DataFile.LEX_NAMES = LEX_NAMES;
module.exports = DataFile;

View File

@ -1,162 +1,37 @@
/*!
* wordpos.js
* node/index.js
*
* Node.js part-of-speech utilities using WordNet database.
*
* Copyright (c) 2012-2016 mooster@42at.com
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
var _ = require('underscore')._,
var
_ = require('underscore')._,
util = require('util'),
stopwords = require('../lib/natural/util/stopwords').words,
stopwordsStr = makeStopwordString(stopwords),
stopwordsStr,
WNdb = require('wordnet-db'),
DataFile = require('./dataFile'),
IndexFile = require('./indexFile');
/**
 * Lower-case a word and collapse each run of whitespace into one underscore.
 *
 * @param word {string} - word or phrase to normalize
 * @returns {string}
 */
function normalize(word) {
  const lowered = word.toLowerCase();
  return lowered.replace(/\s+/g, '_');
}
/**
 * Join a list of stopwords into one space-delimited string with a leading and
 * a trailing space, so every word is surrounded by spaces.
 *
 * @param stopwords {string[]} - list of stopwords
 * @returns {string}
 */
function makeStopwordString(stopwords) {
  const joined = stopwords.join(' ');
  return ` ${joined} `;
}
/**
 * Test whether a word occurs, surrounded by single spaces, in the stopword
 * string.
 *
 * @param stopwords {string} - space-delimited stopword string
 * @param word {string} - word to test
 * @returns {boolean}
 */
function isStopword(stopwords, word) {
  const padded = ' ' + word + ' ';
  return stopwords.indexOf(padded) !== -1;
}
/**
 * Split a string on runs of non-word characters.
 *
 * Empty strings are not filtered out, so text that begins or ends with a
 * separator yields an empty token at that end.
 *
 * @param str {string} - text to split
 * @returns {string[]}
 */
function tokenizer(str) {
  const nonWordRun = /\W+/;
  return str.split(nonWordRun);
}
/**
 * Turn input text into the list of words to process.
 *
 * An array is returned untouched. A string is tokenized and de-duplicated;
 * when this.options.stopwords is truthy, stopwords are then removed, using
 * the option's own value if it is a string and the module default otherwise.
 *
 * Must be invoked with `this` set to an object exposing `options`.
 *
 * @param text {string|Array} - text to prepare, or an already prepared list
 * @returns {Array}
 */
function prepText(text) {
  if (_.isArray(text)) return text;

  const uniqueTokens = _.uniq(tokenizer(text));
  const configured = this.options.stopwords;
  if (!configured) return uniqueTokens;

  const stopwordList = _.isString(configured) ? configured : stopwordsStr;
  return _.reject(uniqueTokens, (token) => isStopword(stopwordList, token));
}
/**
 * factory for main lookup function
 *
 * The returned function must be called with `this` set to an object that
 * exposes `options` and `getFilesFor(pos)`. It normalizes the word, looks it
 * up in the index file and, when an index record is found, looks up its
 * synset offsets in the data file.
 *
 * The optional callback is invoked exactly once, through nextTick(), with
 * (results, word) plus the elapsed milliseconds when options.profile is set.
 * When a lookup rejects, the callback receives an empty array as results and
 * the returned promise resolves with the rejection value. An exception thrown
 * by the callback itself rejects the returned promise; it is not routed back
 * into the callback.
 *
 * @param pos {string} - n/v/a/r
 * @returns {Function} - lookup function bound to POS
 */
function lookup(pos) {
  return function(word, callback) {
    const profile = this.options.profile;
    const start = profile && new Date();
    const files = this.getFilesFor(pos);
    const normalized = normalize(word);

    // lookup index, then data; an empty list stands for "not found in index"
    return files.index.lookup(normalized)
      .then(function (result) {
        return result ? files.data.lookup(result.synsetOffset) : [];
      })
      // Turn lookup failures into a value *before* done() runs, so that an
      // exception thrown by the user callback can never re-enter done().
      .catch(function (err) {
        return err;
      })
      .then(done);

    function done(results) {
      // args is built per call so earlier state cannot leak into it
      const args = [results instanceof Error ? [] : results, normalized];
      profile && args.push(new Date() - start);
      nextTick(callback, args);
      return results;
    }
  };
}
/**
 * isX() factory function
 *
 * The returned function must be called with `this` set to an object that
 * exposes `options` and `getFilesFor(pos)`. It normalizes the word, looks it
 * up in the index file for the given POS and reports whether a record exists.
 * The optional callback receives (result, word) plus the elapsed milliseconds
 * when profiling is active; the returned promise resolves with the boolean.
 *
 * @param pos {string} - n/v/a/r
 * @returns {Function}
 */
function is(pos) {
  return function (word, callback, _noprofile) {
    // disable profiling when isX() used internally
    const profiling = this.options.profile && !_noprofile;
    const startedAt = profiling && new Date();
    const index = this.getFilesFor(pos).index;
    const normalized = normalize(word);

    return index.lookup(normalized).then((record) => {
      const found = !!record;
      const args = [found, normalized];
      if (profiling) {
        args.push(new Date() - startedAt);
      }
      nextTick(callback, args);
      return found;
    });
  };
}
/**
 * getX() factory function
 *
 * The returned function must be called with `this` set to an object that
 * exposes `options`, `parse(text)` and the method named by isFn. Every parsed
 * word is checked through that method (with its profiling suppressed); words
 * for which the check is truthy are collected in the order the checks settle.
 * The optional callback receives (results) plus the elapsed milliseconds when
 * profiling is active; the returned promise resolves with the results array.
 *
 * @param isFn {function} - an isX() function
 * @returns {Function}
 */
function get(isFn) {
  return function (text, callback, _noprofile) {
    const profiling = this.options.profile && !_noprofile;
    const startedAt = profiling && new Date();
    const words = this.parse(text);
    const matches = [];

    // run the isX() check for one word and keep the word on a truthy answer
    const check = (word) =>
      this[isFn]
        .call(this, word, null, /*_noprofile*/ true)
        .then((hit) => {
          if (hit) {
            matches.push(word);
          }
        });

    // report the collected words once every check has settled
    const finish = () => {
      const args = [matches];
      if (profiling) {
        args.push(new Date() - startedAt);
      }
      nextTick(callback, args);
      return matches;
    };

    return Promise.all(words.map((word) => check(word))).then(finish);
  };
}
// Invoke an optional callback with the given argument list.
// Despite the name, nothing is deferred: the callback runs synchronously, so
// an exception it throws propagates to the caller of nextTick (which may be a
// Promise handler). A falsy fn is ignored.
function nextTick(fn, args) {
  if (!fn) {
    return;
  }
  fn.apply(null, args);
}
IndexFile = require('./indexFile'),
{
nextTick,
normalize,
tokenizer,
prepText,
makeStopwordString,
stopwords
} = require('../util'),
{
is,
get,
seek,
lookup
} = require('../common');
stopwordsStr = makeStopwordString(stopwords);
/**
* @class WordPOS
@ -183,7 +58,7 @@ var WordPOS = function(options) {
this.advData = new DataFile(dictPath, 'adv');
// define randX() functions
require('./rand').init(this);
require('../rand').init(this); // FIXME
if (_.isArray(this.options.stopwords)) {
this.options.stopwords = makeStopwordString(this.options.stopwords);
@ -361,7 +236,6 @@ wordposProto.getVerbs = get('isVerb');
*/
wordposProto.parse = prepText;
/**
* seek - get record at offset for pos
*
@ -370,22 +244,7 @@ wordposProto.parse = prepText;
* @param callback {function} - optional callback
* @returns Promise
*/
wordposProto.seek = function(offset, pos, callback){
  // report a validation failure through both the callback and a rejected promise
  const fail = (msg) => {
    const err = new Error(msg);
    callback && callback(err, {});
    return Promise.reject(err);
  };

  const position = Number(offset);
  if (_.isNaN(position) || position <= 0) {
    return fail('offset must be valid positive number.');
  }

  // the data file for the requested POS; a falsy value means the POS is not known
  const dataFile = this.getFilesFor(pos).data;
  if (!dataFile) {
    return fail('Incorrect POS - 2nd argument must be a, r, n or v.');
  }

  return dataFile.lookup(position, callback);
};
wordposProto.seek = seek;
/**
* access to WordNet DB

View File

@ -1,9 +1,9 @@
/*!
* indexFile.js
* node/indexFile.js
*
* implements fast index lookup of WordNet's index files
*
* Copyright (c) 2012-2018 mooster@42at.com
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Portions: Copyright (c) 2011, Chris Umbel
@ -16,6 +16,7 @@ var _ = require('underscore')._,
path = require('path'),
fs = require('fs'),
piper = require('./piper'),
{ indexLookup } = require('../common'),
KEY_LENGTH = 3;
@ -133,49 +134,6 @@ function find(search, callback) {
}
}
/**
 * find a word and prepare its lexical record
 *
 * Must be invoked with `this` set to an object exposing find(word, cb).
 * The index line tokens are read as:
 *   lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt synset_offset...
 *
 * @param word {string} - search word
 * @param callback {function} - optional; receives the index record, or null
 *    when find() does not report a 'hit'
 * @returns {Promise} - resolves with the same value passed to the callback
 *
 * Credit for this routine to https://github.com/NaturalNode/natural
 */
function lookup(word, callback) {
  var self = this;

  return new Promise(function (resolve) {
    self.find(word, function (record) {
      var indexRecord = null;

      if (record.status === 'hit') {
        var tokens = record.tokens,
          synsetCnt = parseInt(tokens[2], 10),
          ptrCnt = parseInt(tokens[3], 10),
          ptrs = [],
          offsets = [],
          i;

        // pointer symbols start right after the four fixed leading fields
        for (i = 0; i < ptrCnt; i++) {
          ptrs.push(tokens[4 + i]);
        }

        // the remaining fields are located relative to the pointer list
        for (i = 0; i < synsetCnt; i++) {
          offsets.push(parseInt(tokens[ptrs.length + 6 + i], 10));
        }

        indexRecord = {
          lemma       : tokens[0],
          pos         : tokens[1],
          ptrSymbol   : ptrs,
          senseCnt    : parseInt(tokens[ptrs.length + 4], 10),
          tagsenseCnt : parseInt(tokens[ptrs.length + 5], 10),
          synsetOffset: offsets
        };
      }

      callback && callback(indexRecord);
      resolve(indexRecord);
    });
  });
}
/**
* loads fast index data and return fast index find function
*
@ -216,7 +174,7 @@ var IndexFile = function(dictPath, name) {
initIndex(this);
};
IndexFile.prototype.lookup = lookup;
IndexFile.prototype.lookup = indexLookup;
IndexFile.prototype.find = find;
/**

View File

@ -4,7 +4,7 @@
* executes multiple async i/o tasks and pools similar callbacks,
* calling i/o open/close when all incoming tasks are done.
*
* Copyright (c) 2012-2016 mooster@42at.com
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
@ -79,4 +79,3 @@ piper.wrapper = function(self, task /*, result...*/){
module.exports = piper;

View File

@ -3,7 +3,7 @@
*
* define rand() and randX() functions on wordpos
*
* Copyright (c) 2012-2016 mooster@42at.com
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
@ -12,7 +12,14 @@
var _ = require('underscore')._,
util = require('util'),
Trie = require('../lib/natural/trie/trie'),
IndexFile = require('./indexFile'),
// FIXME
IndexFile = require('./node/indexFile'),
KEY_LENGTH = 3;
@ -264,4 +271,3 @@ module.exports = {
wordposProto.randVerb = makeRandX('v');
}
};