refactor node version

This commit is contained in:
Moos 2018-10-14 22:31:45 -07:00
parent 359ada5e8a
commit 31422eafcf
5 changed files with 50 additions and 325 deletions

View File

@ -1,7 +1,7 @@
/*! /*!
* dataFile.js * dataFile.js
* *
* Copyright (c) 2012-2018 mooster@42at.com * Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos * https://github.com/moos/wordpos
* *
* Portions: Copyright (c) 2011, Chris Umbel * Portions: Copyright (c) 2011, Chris Umbel
@ -11,7 +11,11 @@
var fs = require('fs'), var fs = require('fs'),
path = require('path'), path = require('path'),
_ = require('underscore'); _ = require('underscore'),
{
lineDataToJSON,
LEX_NAMES
} = require('../common');
/** /**
* sanity check read data - line must start with zero-padded location * sanity check read data - line must start with zero-padded location
@ -25,64 +29,6 @@ function dataCheck(line, location) {
return line.indexOf(padded) === 0; return line.indexOf(padded) === 0;
} }
/**
 * parse a single data file line, returning data object
 *
 * @param line {string} - a single line from WordNet data file
 * @param location {number} - offset the line was read from; passed to dataCheck()
 * @returns {object|Error} - parsed record, or an Error (returned, not thrown)
 *                           when the line fails the dataCheck() sanity check
 *
 * Credit for this routine to https://github.com/NaturalNode/natural
 */
function lineDataToJSON(line, location) {
  if (!dataCheck(line, location)) return new Error('Bad data at location ' + location);

  var data = line.split('| '),
    tokens = data[0].split(/\s+/),
    // a line without the "| " separator carries no gloss; fall back to an
    // empty string instead of crashing on undefined.split()
    gloss = data.length > 1 ? data[1] : '',
    lexFilenum = parseInt(tokens[1], 10),
    wCnt = parseInt(tokens[3], 16),     // word count is parsed as base 16
    synonyms = [],
    ptrs = [],
    i;

  // words occupy every other token starting at index 4 (word, lex_id pairs)
  for (i = 0; i < wCnt; i++) {
    synonyms.push(tokens[4 + i * 2]);
  }

  // pointer count follows the word list; parse it once, not on every iteration
  var ptrOffset = (wCnt - 1) * 2 + 6,
    ptrCnt = parseInt(tokens[ptrOffset], 10);

  for (i = 0; i < ptrCnt; i++) {
    var base = ptrOffset + i * 4;       // each pointer spans four tokens
    ptrs.push({
      pointerSymbol: tokens[base + 1],
      synsetOffset: parseInt(tokens[base + 2], 10),
      pos: tokens[base + 3],
      sourceTarget: tokens[base + 4]
    });
  }

  // break "gloss" into definition vs. examples
  var glossArray = gloss.split("; "),
    definition = glossArray[0],
    examples = glossArray.slice(1).map(function(example) {
      return example.replace(/\"/g,'').replace(/\s\s+/g,'');
    });

  return {
    synsetOffset: parseInt(tokens[0], 10),
    lexFilenum: lexFilenum,
    lexName: DataFile.LEX_NAMES[ lexFilenum ],
    pos: tokens[2],
    wCnt: wCnt,
    lemma: tokens[4],
    synonyms: synonyms,
    lexId: tokens[5],
    ptrs: ptrs,
    gloss: gloss,
    def: definition,
    exp: examples
  };
}
/** /**
* read data file at location (bound to a data file). * read data file at location (bound to a data file).
* Reads nominal length and checks for EOL. Continue reading until EOL. * Reads nominal length and checks for EOL. Continue reading until EOL.
@ -98,6 +44,7 @@ function readLocation(location, callback) {
len = file.nominalLineLength, len = file.nominalLineLength,
buffer = new Buffer.alloc(len); buffer = new Buffer.alloc(len);
location = Number(location);
readChunk(location, function(err, count) { readChunk(location, function(err, count) {
if (err) { if (err) {
//console.log(err); //console.log(err);
@ -105,7 +52,11 @@ function readLocation(location, callback) {
return; return;
} }
//console.log(' read %d bytes at <%d>', count, location); //console.log(' read %d bytes at <%d>', count, location);
callback(null, lineDataToJSON(str, location));
// Sanity-check the line, then hand the parsed record to the caller.
// The result must be a value, not an un-invoked function; as before the
// refactor, bad data is reported by passing an Error as the result.
callback(null, dataCheck(str, location)
  ? lineDataToJSON(str, location)
  : new Error('Bad data at location ' + location));
}); });
function readChunk(pos, cb) { function readChunk(pos, cb) {
@ -213,7 +164,6 @@ function promisifyInto(collect) {
} }
} }
/** /**
* DataFile class * DataFile class
* *
@ -258,55 +208,8 @@ DataFile.MAX_LINE_LENGTH = {
/** /**
* map of lexFilenum to lex names * map of lexFilenum to lex names
* *
* @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html
* @type {string[]} * @type {string[]}
*/ */
DataFile.LEX_NAMES = [ DataFile.LEX_NAMES = LEX_NAMES;
'adj.all',
'adj.pert',
'adv.all',
'noun.Tops',
'noun.act',
'noun.animal',
'noun.artifact',
'noun.attribute',
'noun.body',
'noun.cognition',
'noun.communication',
'noun.event',
'noun.feeling',
'noun.food',
'noun.group',
'noun.location',
'noun.motive',
'noun.object',
'noun.person',
'noun.phenomenon',
'noun.plant',
'noun.possession',
'noun.process',
'noun.quantity',
'noun.relation',
'noun.shape',
'noun.state',
'noun.substance',
'noun.time',
'verb.body',
'verb.change',
'verb.cognition',
'verb.communication',
'verb.competition',
'verb.consumption',
'verb.contact',
'verb.creation',
'verb.emotion',
'verb.motion',
'verb.perception',
'verb.possession',
'verb.social',
'verb.stative',
'verb.weather',
'adj.ppl'
];
module.exports = DataFile; module.exports = DataFile;

View File

@ -1,162 +1,37 @@
/*! /*!
* wordpos.js * node/index.js
* *
* Node.js part-of-speech utilities using WordNet database. * Node.js part-of-speech utilities using WordNet database.
* *
* Copyright (c) 2012-2016 mooster@42at.com * Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos * https://github.com/moos/wordpos
* *
* Released under MIT license * Released under MIT license
*/ */
var _ = require('underscore')._, var
_ = require('underscore')._,
util = require('util'), util = require('util'),
stopwords = require('../lib/natural/util/stopwords').words, stopwordsStr,
stopwordsStr = makeStopwordString(stopwords),
WNdb = require('wordnet-db'), WNdb = require('wordnet-db'),
DataFile = require('./dataFile'), DataFile = require('./dataFile'),
IndexFile = require('./indexFile'); IndexFile = require('./indexFile'),
{
nextTick,
function normalize(word) { normalize,
return word.toLowerCase().replace(/\s+/g, '_'); tokenizer,
} prepText,
makeStopwordString,
function makeStopwordString(stopwords) { stopwords
return ' '+ stopwords.join(' ') +' '; } = require('../util'),
} {
is,
function isStopword(stopwords, word) { get,
return stopwords.indexOf(' '+word+' ') >= 0; seek,
} lookup
} = require('../common');
/**
 * split a string into tokens on runs of non-word characters
 *
 * @param str {string} - text to split
 * @returns {string[]} - tokens; may include empty strings at either end
 */
function tokenizer(str) {
  var nonWordRun = /\W+/;
  return str.split(nonWordRun);
}
/**
 * prepare text for lookup: tokenize, de-duplicate and optionally drop stopwords
 *
 * Called with `this` bound to an object carrying `options.stopwords`.
 *
 * @param text {string|array} - raw text, or an array which is returned untouched
 * @returns {array} - unique tokens, minus stopwords when enabled
 */
function prepText(text) {
  if (_.isArray(text)) return text;

  var words = _.uniq(tokenizer(text)),
    configured = this.options.stopwords;

  // stopword filtering disabled
  if (!configured) return words;

  // a string option is a custom stopword list; anything else truthy selects the default list
  var stopStr = _.isString(configured) ? configured : stopwordsStr;

  return _.reject(words, function(word) {
    return isStopword(stopStr, word);
  });
}
/**
 * factory for main lookup function
 *
 * The returned function normalizes the word, looks it up in the index file
 * for the POS and, when found, fetches the data records at the indexed offsets.
 * The callback receives (results, word[, elapsed ms when profiling]); a failure
 * yields an empty result array for the callback.
 *
 * @param pos {string} - n/v/a/r
 * @returns {Function} - lookup function bound to POS
 */
function lookup(pos) {
  return function(word, callback) {
    var profile = this.options.profile,
      start = profile && new Date(),
      files = this.getFilesFor(pos),
      cbArgs = [];

    // shared completion handler for hit, miss and error paths
    function finish(results) {
      cbArgs.push(results instanceof Error ? [] : results, word);
      if (profile) cbArgs.push(new Date() - start);
      nextTick(callback, cbArgs);
      return results;
    }

    word = normalize(word);

    return files.index.lookup(word)
      .then(function(indexRecord) {
        // not found in index
        if (!indexRecord) return finish([]);
        // found: fetch the data records
        return files.data.lookup(indexRecord.synsetOffset).then(finish);
      })
      .catch(finish);
  };
}
/**
 * isX() factory function
 *
 * The returned function resolves to true when the normalized word has an
 * entry in the index file for the POS. The callback receives
 * (result, word[, elapsed ms when profiling]).
 *
 * @param pos {string} - n/v/a/r
 * @returns {Function}
 */
function is(pos){
  return function(word, callback, _noprofile) {
    // profiling is switched off when isX() is used internally
    var timed = this.options.profile && !_noprofile,
      began = timed && new Date(),
      indexFile = this.getFilesFor(pos).index,
      key = normalize(word);

    return indexFile.lookup(key).then(function(record) {
      var found = !!record,
        cbArgs = [found, key];
      if (timed) cbArgs.push(new Date() - began);
      nextTick(callback, cbArgs);
      return found;
    });
  };
}
/**
 * getX() factory function
 *
 * The returned function parses the text into words, runs the named isX()
 * method on each and collects the words for which it resolves truthy.
 * The callback receives (results[, elapsed ms when profiling]).
 *
 * @param isFn {function} - an isX() function
 * @returns {Function}
 */
function get(isFn) {
  return function(text, callback, _noprofile) {
    var self = this,
      timed = this.options.profile && !_noprofile,
      began = timed && new Date(),
      words = this.parse(text),
      matched = [];

    // one isX() check per word, with its own profiling disabled
    var pending = words.map(function(word) {
      return self[isFn](word, null, /*_noprofile*/ true)
        .then(function(hit) {
          if (hit) matched.push(word);
        });
    });

    return Promise.all(pending).then(function() {
      var cbArgs = [matched];
      if (timed) cbArgs.push(new Date() - began);
      nextTick(callback, cbArgs);
      return matched;
    });
  };
}
// Invoke an optional callback with an argument array.
// The call is made synchronously; a missing callback is silently ignored.
function nextTick(fn, args) {
  if (!fn) return;
  fn.apply(null, args);
}
stopwordsStr = makeStopwordString(stopwords);
/** /**
* @class WordPOS * @class WordPOS
@ -183,7 +58,7 @@ var WordPOS = function(options) {
this.advData = new DataFile(dictPath, 'adv'); this.advData = new DataFile(dictPath, 'adv');
// define randX() functions // define randX() functions
require('./rand').init(this); require('../rand').init(this); // FIXME
if (_.isArray(this.options.stopwords)) { if (_.isArray(this.options.stopwords)) {
this.options.stopwords = makeStopwordString(this.options.stopwords); this.options.stopwords = makeStopwordString(this.options.stopwords);
@ -361,7 +236,6 @@ wordposProto.getVerbs = get('isVerb');
*/ */
wordposProto.parse = prepText; wordposProto.parse = prepText;
/** /**
* seek - get record at offset for pos * seek - get record at offset for pos
* *
@ -370,22 +244,7 @@ wordposProto.parse = prepText;
* @param callback {function} - optional callback * @param callback {function} - optional callback
* @returns Promise * @returns Promise
*/ */
wordposProto.seek = function(offset, pos, callback){ wordposProto.seek = seek;
offset = Number(offset);
if (_.isNaN(offset) || offset <= 0) return error('offset must be valid positive number.');
var data = this.getFilesFor(pos).data;
if (!data) return error('Incorrect POS - 2nd argument must be a, r, n or v.');
return data.lookup(offset, callback);
function error(msg) {
var err = new Error(msg);
callback && callback(err, {});
return Promise.reject(err);
}
};
/** /**
* access to WordNet DB * access to WordNet DB

View File

@ -1,9 +1,9 @@
/*! /*!
* indexFile.js * node/indexFile.js
* *
* implements fast index lookup of WordNet's index files * implements fast index lookup of WordNet's index files
* *
* Copyright (c) 2012-2018 mooster@42at.com * Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos * https://github.com/moos/wordpos
* *
* Portions: Copyright (c) 2011, Chris Umbel * Portions: Copyright (c) 2011, Chris Umbel
@ -16,6 +16,7 @@ var _ = require('underscore')._,
path = require('path'), path = require('path'),
fs = require('fs'), fs = require('fs'),
piper = require('./piper'), piper = require('./piper'),
{ indexLookup } = require('../common'),
KEY_LENGTH = 3; KEY_LENGTH = 3;
@ -133,49 +134,6 @@ function find(search, callback) {
} }
} }
/**
 * find a word and prepare its lexical record
 *
 * Index line layout (whitespace separated):
 *   lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt synset_offset...
 *
 * @param word {string} - search word
 * @param callback {function} - optional callback, receives the index record or null
 * @returns {Promise} - resolves to the index record, or null when the word is not found
 *
 * Credit for this routine to https://github.com/NaturalNode/natural
 */
function lookup(word, callback) {
  var self = this;

  return new Promise(function(resolve) {
    self.find(word, function(record) {
      var indexRecord = null;

      if (record.status === 'hit') {
        var tokens = record.tokens,
          synsetCnt = parseInt(tokens[2], 10),
          ptrCnt = parseInt(tokens[3], 10),
          ptrs = [],
          offsets = [],
          i;

        // pointer symbols start right after the p_cnt field (token 4),
        // not at token 0 which holds the lemma
        for (i = 0; i < ptrCnt; i++) {
          ptrs.push(tokens[4 + i]);
        }

        // synset offsets follow sense_cnt and tagsense_cnt
        for (i = 0; i < synsetCnt; i++) {
          offsets.push(parseInt(tokens[ptrs.length + 6 + i], 10));
        }

        indexRecord = {
          lemma       : tokens[0],
          pos         : tokens[1],
          ptrSymbol   : ptrs,
          senseCnt    : parseInt(tokens[ptrs.length + 4], 10),
          tagsenseCnt : parseInt(tokens[ptrs.length + 5], 10),
          synsetOffset: offsets
        };
      }

      callback && callback(indexRecord);
      resolve(indexRecord);
    });
  });
}
/** /**
* loads fast index data and return fast index find function * loads fast index data and return fast index find function
* *
@ -216,7 +174,7 @@ var IndexFile = function(dictPath, name) {
initIndex(this); initIndex(this);
}; };
IndexFile.prototype.lookup = lookup; IndexFile.prototype.lookup = indexLookup;
IndexFile.prototype.find = find; IndexFile.prototype.find = find;
/** /**

View File

@ -4,7 +4,7 @@
* executes multiple async i/o tasks and pools similar callbacks, * executes multiple async i/o tasks and pools similar callbacks,
* calling i/o open/close when all incoming tasks are done. * calling i/o open/close when all incoming tasks are done.
* *
* Copyright (c) 2012-2016 mooster@42at.com * Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos * https://github.com/moos/wordpos
* *
* Released under MIT license * Released under MIT license
@ -79,4 +79,3 @@ piper.wrapper = function(self, task /*, result...*/){
module.exports = piper; module.exports = piper;

View File

@ -3,7 +3,7 @@
* *
* define rand() and randX() functions on wordpos * define rand() and randX() functions on wordpos
* *
* Copyright (c) 2012-2016 mooster@42at.com * Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos * https://github.com/moos/wordpos
* *
* Released under MIT license * Released under MIT license
@ -12,7 +12,14 @@
var _ = require('underscore')._, var _ = require('underscore')._,
util = require('util'), util = require('util'),
Trie = require('../lib/natural/trie/trie'), Trie = require('../lib/natural/trie/trie'),
IndexFile = require('./indexFile'),
// FIXME
IndexFile = require('./node/indexFile'),
KEY_LENGTH = 3; KEY_LENGTH = 3;
@ -264,4 +271,3 @@ module.exports = {
wordposProto.randVerb = makeRandX('v'); wordposProto.randVerb = makeRandX('v');
} }
}; };