refactor node version
This commit is contained in:
parent
359ada5e8a
commit
31422eafcf
|
@ -1,7 +1,7 @@
|
|||
/*!
|
||||
* dataFile.js
|
||||
*
|
||||
* Copyright (c) 2012-2018 mooster@42at.com
|
||||
* Copyright (c) 2012-2019 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
*
|
||||
* Portions: Copyright (c) 2011, Chris Umbel
|
||||
|
@ -11,7 +11,11 @@
|
|||
|
||||
var fs = require('fs'),
|
||||
path = require('path'),
|
||||
_ = require('underscore');
|
||||
_ = require('underscore'),
|
||||
{
|
||||
lineDataToJSON,
|
||||
LEX_NAMES
|
||||
} = require('../common');
|
||||
|
||||
/**
|
||||
* sanity check read data - line must start with zero-padded location
|
||||
|
@ -25,64 +29,6 @@ function dataCheck(line, location) {
|
|||
return line.indexOf(padded) === 0;
|
||||
}
|
||||
|
||||
/**
 * parse a single data file line, returning data object
 *
 * Tokens before the "| " separator are whitespace-delimited and consumed as:
 *   [0] synset offset, [1] lex file number, [2] pos, [3] word count (hex),
 *   then (word, lexId) pairs, then the pointer count followed by
 *   4 tokens per pointer. Everything after "| " is the gloss.
 *
 * @param line {string} - a single line from WordNet data file
 * @param location {number} - location the line was read from; checked by dataCheck()
 * @returns {object|Error} - parsed record; an Error is RETURNED (not thrown)
 *                           when dataCheck() rejects the line
 *
 * Credit for this routine to https://github.com/NaturalNode/natural
 */
function lineDataToJSON(line, location) {
  if (!dataCheck(line, location)) return new Error('Bad data at location ' + location);

  var data = line.split('| '),
    tokens = data[0].split(/\s+/),
    // a line without a "| " separator has no gloss; treat it as empty
    // instead of crashing on undefined.split()
    gloss = data[1] || '',
    wCnt = parseInt(tokens[3], 16),
    lexFilenum = parseInt(tokens[1], 10),
    synonyms = [],
    ptrs = [],
    i;

  // words occupy every other token starting at index 4 (word, lexId, word, lexId, ...)
  for (i = 0; i < wCnt; i++) {
    synonyms.push(tokens[4 + i * 2]);
  }

  // the pointer count sits right after the last (word, lexId) pair;
  // parse it once rather than on every loop iteration
  var ptrOffset = (wCnt - 1) * 2 + 6,
    ptrCount = parseInt(tokens[ptrOffset], 10);

  for (i = 0; i < ptrCount; i++) {
    var base = ptrOffset + i * 4;
    ptrs.push({
      pointerSymbol: tokens[base + 1],
      synsetOffset: parseInt(tokens[base + 2], 10),
      pos: tokens[base + 3],
      sourceTarget: tokens[base + 4]
    });
  }

  // break "gloss" into definition vs. examples
  var glossArray = gloss.split("; "),
    definition = glossArray[0],
    examples = glossArray.slice(1).map(function (example) {
      // strip double quotes and runs of 2+ whitespace characters
      return example.replace(/\"/g, '').replace(/\s\s+/g, '');
    });

  return {
    synsetOffset: parseInt(tokens[0], 10),
    lexFilenum: lexFilenum,
    lexName: DataFile.LEX_NAMES[ lexFilenum ],
    pos: tokens[2],
    wCnt: wCnt,
    lemma: tokens[4],
    synonyms: synonyms,
    lexId: tokens[5],
    ptrs: ptrs,
    gloss: gloss,
    def: definition,
    exp: examples
  };
}
|
||||
|
||||
/**
|
||||
* read data file at location (bound to a data file).
|
||||
* Reads nominal length and checks for EOL. Continue reading until EOL.
|
||||
|
@ -98,6 +44,7 @@ function readLocation(location, callback) {
|
|||
len = file.nominalLineLength,
|
||||
buffer = new Buffer.alloc(len);
|
||||
|
||||
location = Number(location);
|
||||
readChunk(location, function(err, count) {
|
||||
if (err) {
|
||||
//console.log(err);
|
||||
|
@ -105,7 +52,11 @@ function readLocation(location, callback) {
|
|||
return;
|
||||
}
|
||||
//console.log(' read %d bytes at <%d>', count, location);
|
||||
callback(null, lineDataToJSON(str, location));
|
||||
|
||||
callback(null, function() {
|
||||
if (!dataCheck(str, location)) return new Error('Bad data at location ' + location);
|
||||
lineDataToJSON(str, location)
|
||||
});
|
||||
});
|
||||
|
||||
function readChunk(pos, cb) {
|
||||
|
@ -213,7 +164,6 @@ function promisifyInto(collect) {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* DataFile class
|
||||
*
|
||||
|
@ -258,55 +208,8 @@ DataFile.MAX_LINE_LENGTH = {
|
|||
/**
 * map of lexFilenum to lex names
 *
 * The array index is the lex file number; entries are grouped below by
 * category, with the index range each group covers.
 *
 * @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html
 * @type {string[]}
 */
DataFile.LEX_NAMES = [
  // 0-2
  'adj.all', 'adj.pert', 'adv.all',

  // 3-28
  'noun.Tops', 'noun.act', 'noun.animal', 'noun.artifact', 'noun.attribute',
  'noun.body', 'noun.cognition', 'noun.communication', 'noun.event',
  'noun.feeling', 'noun.food', 'noun.group', 'noun.location', 'noun.motive',
  'noun.object', 'noun.person', 'noun.phenomenon', 'noun.plant',
  'noun.possession', 'noun.process', 'noun.quantity', 'noun.relation',
  'noun.shape', 'noun.state', 'noun.substance', 'noun.time',

  // 29-43
  'verb.body', 'verb.change', 'verb.cognition', 'verb.communication',
  'verb.competition', 'verb.consumption', 'verb.contact', 'verb.creation',
  'verb.emotion', 'verb.motion', 'verb.perception', 'verb.possession',
  'verb.social', 'verb.stative', 'verb.weather',

  // 44
  'adj.ppl'
];
|
||||
DataFile.LEX_NAMES = LEX_NAMES;
|
||||
|
||||
module.exports = DataFile;
|
|
@ -1,162 +1,37 @@
|
|||
/*!
|
||||
* wordpos.js
|
||||
* node/index.js
|
||||
*
|
||||
* Node.js part-of-speech utilities using WordNet database.
|
||||
*
|
||||
* Copyright (c) 2012-2016 mooster@42at.com
|
||||
* Copyright (c) 2012-2019 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
*
|
||||
* Released under MIT license
|
||||
*/
|
||||
|
||||
var _ = require('underscore')._,
|
||||
var
|
||||
_ = require('underscore')._,
|
||||
util = require('util'),
|
||||
stopwords = require('../lib/natural/util/stopwords').words,
|
||||
stopwordsStr = makeStopwordString(stopwords),
|
||||
stopwordsStr,
|
||||
WNdb = require('wordnet-db'),
|
||||
DataFile = require('./dataFile'),
|
||||
IndexFile = require('./indexFile');
|
||||
|
||||
|
||||
/**
 * Lower-case a word and replace every run of whitespace with one underscore.
 *
 * @param word {string}
 * @returns {string}
 */
function normalize(word) {
  var lowered = word.toLowerCase();
  return lowered.split(/\s+/).join('_');
}
|
||||
|
||||
/**
 * Join stopwords into one space-delimited string with a leading and a
 * trailing space, so each word can be matched as ' word '.
 *
 * @param stopwords {string[]}
 * @returns {string}
 */
function makeStopwordString(stopwords) {
  var joined = stopwords.join(' ');
  return ' ' + joined + ' ';
}
|
||||
|
||||
/**
 * Test whether ' word ' (the word surrounded by single spaces) occurs in stopwords.
 *
 * @param stopwords {string} - space-delimited list, as built by makeStopwordString()
 * @param word {string}
 * @returns {boolean}
 */
function isStopword(stopwords, word) {
  var needle = ' ' + word + ' ';
  return stopwords.includes(needle);
}
|
||||
|
||||
/**
 * Split a string on runs of non-word characters.
 * Empty strings at either end are NOT removed from the result.
 *
 * @param str {string}
 * @returns {string[]}
 */
function tokenizer(str) {
  var nonWordRun = /\W+/;
  return str.split(nonWordRun);
}
|
||||
|
||||
/**
 * Turn input text into a de-duplicated word list, optionally without stopwords.
 * Must be called with `this` bound to an object exposing `options.stopwords`.
 *
 * - An array is returned untouched.
 * - If options.stopwords is falsy, no stopword filtering is done.
 * - If options.stopwords is a string it is used as the stopword list;
 *   any other truthy value selects the module-level stopwordsStr.
 *
 * @param text {string|Array}
 * @returns {Array}
 */
function prepText(text) {
  if (_.isArray(text)) return text;

  var words = _.uniq(tokenizer(text)),
    stopwordOption = this.options.stopwords;

  if (!stopwordOption) return words;

  var stopwordList = _.isString(stopwordOption) ? stopwordOption : stopwordsStr;

  return _.reject(words, function (word) {
    return isStopword(stopwordList, word);
  });
}
|
||||
|
||||
/**
 * factory for main lookup function
 *
 * The returned function must be called with `this` exposing `options.profile`
 * and `getFilesFor(pos)`. It normalizes the word, looks it up in the index
 * file and, on a hit, fetches the data records for its synset offsets.
 * The optional callback receives (results, word[, elapsedMs]); elapsedMs is
 * only passed when options.profile is truthy. If the chain rejects with an
 * Error, the callback gets an empty array and the promise resolves with the
 * Error object.
 *
 * @param pos {string} - n/v/a/r
 * @returns {Function} - lookup function bound to POS
 */
function lookup(pos) {
  return function (word, callback) {
    const profile = this.options.profile;
    const start = profile && new Date();
    const files = this.getFilesFor(pos);
    const args = [];
    const key = normalize(word);

    // shared by the success path and by .catch()
    const done = (results) => {
      args.push(results instanceof Error ? [] : results, key);
      if (profile) args.push(new Date() - start);
      nextTick(callback, args);
      return results;
    };

    return files.index.lookup(key)
      .then((result) => {
        // not found in index -> report an empty result set
        if (!result) return done([]);
        return files.data.lookup(result.synsetOffset).then(done);
      })
      .catch(done);
  };
}
|
||||
|
||||
/**
 * isX() factory function
 *
 * The returned function must be called with `this` exposing `options.profile`
 * and `getFilesFor(pos)`. It resolves to true when the normalized word has an
 * index record for the given POS. The optional callback receives
 * (result, word[, elapsedMs]); elapsedMs is only passed when profiling is on
 * and _noprofile is falsy.
 *
 * @param pos {string} - n/v/a/r
 * @returns {Function}
 */
function is(pos) {
  return function (word, callback, _noprofile) {
    // disable profiling when isX() used internally
    const profile = this.options.profile && !_noprofile;
    const start = profile && new Date();
    const args = [];
    const index = this.getFilesFor(pos).index;
    const key = normalize(word);

    return index.lookup(key).then((record) => {
      const found = Boolean(record);
      args.push(found, key);
      if (profile) args.push(new Date() - start);
      nextTick(callback, args);
      return found;
    });
  };
}
|
||||
|
||||
|
||||
/**
 * getX() factory function
 *
 * The returned function must be called with `this` exposing `options.profile`,
 * `parse(text)` and the method named by isFn. Every parsed word is checked
 * with that method (profiling suppressed); words that pass are collected in
 * the order their checks resolve. The optional callback receives
 * (results[, elapsedMs]).
 *
 * @param isFn {function} - an isX() function
 * @returns {Function}
 */
function get(isFn) {
  return function (text, callback, _noprofile) {
    const profile = this.options.profile && !_noprofile;
    const start = profile && new Date();
    const words = this.parse(text);
    const results = [];

    const checks = words.map((word) =>
      this[isFn](word, null, /*_noprofile*/ true).then((matched) => {
        if (matched) results.push(word);
      })
    );

    return Promise.all(checks).then(() => {
      const args = [results];
      if (profile) args.push(new Date() - start);
      nextTick(callback, args);
      return results;
    });
  };
}
|
||||
|
||||
// Invoke the optional callback with the given argument list.
// NOTE: despite its name, this calls fn synchronously -- nothing is deferred
// via setImmediate or process.nextTick. When invoked from inside a promise
// handler, an exception thrown by fn is therefore caught by that promise.
function nextTick(fn, args) {
  if (!fn) return;
  fn.apply(null, args);
}
|
||||
IndexFile = require('./indexFile'),
|
||||
{
|
||||
nextTick,
|
||||
normalize,
|
||||
tokenizer,
|
||||
prepText,
|
||||
makeStopwordString,
|
||||
stopwords
|
||||
} = require('../util'),
|
||||
{
|
||||
is,
|
||||
get,
|
||||
seek,
|
||||
lookup
|
||||
} = require('../common');
|
||||
|
||||
stopwordsStr = makeStopwordString(stopwords);
|
||||
|
||||
/**
|
||||
* @class WordPOS
|
||||
|
@ -183,7 +58,7 @@ var WordPOS = function(options) {
|
|||
this.advData = new DataFile(dictPath, 'adv');
|
||||
|
||||
// define randX() functions
|
||||
require('./rand').init(this);
|
||||
require('../rand').init(this); // FIXME
|
||||
|
||||
if (_.isArray(this.options.stopwords)) {
|
||||
this.options.stopwords = makeStopwordString(this.options.stopwords);
|
||||
|
@ -361,7 +236,6 @@ wordposProto.getVerbs = get('isVerb');
|
|||
*/
|
||||
wordposProto.parse = prepText;
|
||||
|
||||
|
||||
/**
|
||||
* seek - get record at offset for pos
|
||||
*
|
||||
|
@ -370,22 +244,7 @@ wordposProto.parse = prepText;
|
|||
* @param callback {function} - optional callback
|
||||
* @returns Promise
|
||||
*/
|
||||
// Fetch the data record at `offset` for the given POS.
// Must be called with `this` exposing getFilesFor(pos). On invalid input the
// optional callback is invoked with (err, {}) and a rejected promise is
// returned; otherwise the call is delegated to the data file's lookup().
wordposProto.seek = function (offset, pos, callback) {
  const fail = (msg) => {
    const err = new Error(msg);
    if (callback) callback(err, {});
    return Promise.reject(err);
  };

  const position = Number(offset);
  if (Number.isNaN(position) || position <= 0) {
    return fail('offset must be valid positive number.');
  }

  const data = this.getFilesFor(pos).data;
  if (!data) {
    return fail('Incorrect POS - 2nd argument must be a, r, n or v.');
  }

  return data.lookup(position, callback);
};
|
||||
|
||||
wordposProto.seek = seek;
|
||||
|
||||
/**
|
||||
* access to WordNet DB
|
|
@ -1,9 +1,9 @@
|
|||
/*!
|
||||
* indexFile.js
|
||||
* node/indexFile.js
|
||||
*
|
||||
* implements fast index lookup of WordNet's index files
|
||||
*
|
||||
* Copyright (c) 2012-2018 mooster@42at.com
|
||||
* Copyright (c) 2012-2019 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
*
|
||||
* Portions: Copyright (c) 2011, Chris Umbel
|
||||
|
@ -16,6 +16,7 @@ var _ = require('underscore')._,
|
|||
path = require('path'),
|
||||
fs = require('fs'),
|
||||
piper = require('./piper'),
|
||||
{ indexLookup } = require('../common'),
|
||||
KEY_LENGTH = 3;
|
||||
|
||||
|
||||
|
@ -133,49 +134,6 @@ function find(search, callback) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * find a word and prepare its lexical record
 *
 * Index record tokens are consumed as:
 *   [0] lemma, [1] pos, [2] synset count, [3] pointer-symbol count,
 *   [4 ...] pointer symbols, then sense count, tag-sense count and the
 *   synset offsets.
 *
 * @param word {string} - search word
 * @param callback {function} - optional callback, receives the index record
 *                              (or null when the word is not found)
 * @returns {Promise} - resolves with the same index record (or null)
 *
 * Credit for this routine to https://github.com/NaturalNode/natural
 */
function lookup(word, callback) {
  var self = this;

  return new Promise(function (resolve) {
    self.find(word, function (record) {
      var indexRecord = null;

      if (record.status === 'hit') {
        var tokens = record.tokens,
          // parse both counts once, with an explicit radix
          synsetCnt = parseInt(tokens[2], 10),
          ptrCnt = parseInt(tokens[3], 10),
          ptrs = [],
          offsets = [],
          i;

        // pointer symbols start at token 4 (previously tokens 0..ptrCnt-1,
        // i.e. lemma/pos/counts, were collected here by mistake)
        for (i = 0; i < ptrCnt; i++) {
          ptrs.push(tokens[4 + i]);
        }

        // skip the pointer symbols plus sense count and tag-sense count
        for (i = 0; i < synsetCnt; i++) {
          offsets.push(parseInt(tokens[ptrs.length + 6 + i], 10));
        }

        indexRecord = {
          lemma       : tokens[0],
          pos         : tokens[1],
          ptrSymbol   : ptrs,
          senseCnt    : parseInt(tokens[ptrs.length + 4], 10),
          tagsenseCnt : parseInt(tokens[ptrs.length + 5], 10),
          synsetOffset: offsets
        };
      }

      callback && callback(indexRecord);
      resolve(indexRecord);
    });
  });
}
|
||||
|
||||
|
||||
/**
|
||||
* loads fast index data and return fast index find function
|
||||
*
|
||||
|
@ -216,7 +174,7 @@ var IndexFile = function(dictPath, name) {
|
|||
initIndex(this);
|
||||
};
|
||||
|
||||
IndexFile.prototype.lookup = lookup;
|
||||
IndexFile.prototype.lookup = indexLookup;
|
||||
IndexFile.prototype.find = find;
|
||||
|
||||
/**
|
|
@ -4,7 +4,7 @@
|
|||
* executes multiple async i/o tasks and pools similar callbacks,
|
||||
* calling i/o open/close when all incoming tasks are done.
|
||||
*
|
||||
* Copyright (c) 2012-2016 mooster@42at.com
|
||||
* Copyright (c) 2012-2019 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
*
|
||||
* Released under MIT license
|
||||
|
@ -79,4 +79,3 @@ piper.wrapper = function(self, task /*, result...*/){
|
|||
|
||||
|
||||
module.exports = piper;
|
||||
|
12
src/rand.js
12
src/rand.js
|
@ -3,7 +3,7 @@
|
|||
*
|
||||
* define rand() and randX() functions on wordpos
|
||||
*
|
||||
* Copyright (c) 2012-2016 mooster@42at.com
|
||||
* Copyright (c) 2012-2019 mooster@42at.com
|
||||
* https://github.com/moos/wordpos
|
||||
*
|
||||
* Released under MIT license
|
||||
|
@ -12,7 +12,14 @@
|
|||
var _ = require('underscore')._,
|
||||
util = require('util'),
|
||||
Trie = require('../lib/natural/trie/trie'),
|
||||
IndexFile = require('./indexFile'),
|
||||
|
||||
|
||||
// FIXME
|
||||
IndexFile = require('./node/indexFile'),
|
||||
|
||||
|
||||
|
||||
|
||||
KEY_LENGTH = 3;
|
||||
|
||||
|
||||
|
@ -264,4 +271,3 @@ module.exports = {
|
|||
wordposProto.randVerb = makeRandX('v');
|
||||
}
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue