From 31422eafcfcac1d781f78ff0c4a676de774c6740 Mon Sep 17 00:00:00 2001 From: Moos Date: Sun, 14 Oct 2018 22:31:45 -0700 Subject: [PATCH] refactor node version --- src/{ => node}/dataFile.js | 123 +++----------------- src/{wordpos.js => node/index.js} | 187 ++++-------------------------- src/{ => node}/indexFile.js | 50 +------- src/{ => node}/piper.js | 3 +- src/rand.js | 12 +- 5 files changed, 50 insertions(+), 325 deletions(-) rename src/{ => node}/dataFile.js (64%) rename src/{wordpos.js => node/index.js} (58%) rename src/{ => node}/indexFile.js (80%) rename src/{ => node}/piper.js (98%) diff --git a/src/dataFile.js b/src/node/dataFile.js similarity index 64% rename from src/dataFile.js rename to src/node/dataFile.js index 3d115c2..df3d863 100644 --- a/src/dataFile.js +++ b/src/node/dataFile.js @@ -1,7 +1,7 @@ /*! * dataFile.js * - * Copyright (c) 2012-2018 mooster@42at.com + * Copyright (c) 2012-2019 mooster@42at.com * https://github.com/moos/wordpos * * Portions: Copyright (c) 2011, Chris Umbel @@ -11,7 +11,11 @@ var fs = require('fs'), path = require('path'), - _ = require('underscore'); + _ = require('underscore'), + { + lineDataToJSON, + LEX_NAMES + } = require('../common'); /** * sanity check read data - line must start with zero-padded location @@ -25,64 +29,6 @@ function dataCheck(line, location) { return line.indexOf(padded) === 0; } -/** - * parse a single data file line, returning data object - * - * @param line {string} - a single line from WordNet data file - * @returns {object} - * - * Credit for this routine to https://github.com/NaturalNode/natural - */ -function lineDataToJSON(line, location) { - if (!dataCheck(line, location)) return new Error('Bad data at location ' + location); - - var data = line.split('| '), - tokens = data[0].split(/\s+/), - ptrs = [], - wCnt = parseInt(tokens[3], 16), - synonyms = [], - i; - - for(i = 0; i < wCnt; i++) { - synonyms.push(tokens[4 + i * 2]); - } - - var ptrOffset = (wCnt - 1) * 2 + 6; - for(i = 0; i < parseInt(tokens[ptrOffset], 10); i++) { - ptrs.push({ - pointerSymbol: tokens[ptrOffset + 1 + i * 4], - synsetOffset: parseInt(tokens[ptrOffset + 2 + i * 4], 10), - pos: tokens[ptrOffset + 3 + i * 4], - sourceTarget: tokens[ptrOffset + 4 + i * 4] - }); - } - - // break "gloss" into definition vs. examples - var glossArray = data[1].split("; "); - var definition = glossArray[0]; - var examples = glossArray.slice(1); - var lexFilenum = parseInt(tokens[1], 10); - - for (var k = 0; k < examples.length; k++) { - examples[k] = examples[k].replace(/\"/g,'').replace(/\s\s+/g,''); - } - - return { - synsetOffset: parseInt(tokens[0], 10), - lexFilenum: lexFilenum, - lexName: DataFile.LEX_NAMES[ lexFilenum ], - pos: tokens[2], - wCnt: wCnt, - lemma: tokens[4], - synonyms: synonyms, - lexId: tokens[5], - ptrs: ptrs, - gloss: data[1], - def: definition, - exp: examples - }; -} - /** * read data file at location (bound to a data file). * Reads nominal length and checks for EOL. Continue reading until EOL. @@ -98,6 +44,7 @@ function readLocation(location, callback) { len = file.nominalLineLength, buffer = new Buffer.alloc(len); + location = Number(location); readChunk(location, function(err, count) { if (err) { //console.log(err); @@ -105,7 +52,11 @@ function readLocation(location, callback) { return; } //console.log(' read %d bytes at <%d>', count, location); - callback(null, lineDataToJSON(str, location)); + + callback(null, function() { + if (!dataCheck(str, location)) return new Error('Bad data at location ' + location); + lineDataToJSON(str, location) + }); }); function readChunk(pos, cb) { @@ -213,7 +164,6 @@ function promisifyInto(collect) { } } - /** * DataFile class * @@ -258,55 +208,8 @@ DataFile.MAX_LINE_LENGTH = { /** * map of lexFilenum to lex names * - * @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html * @type {string[]} */ -DataFile.LEX_NAMES = [ - 'adj.all', - 'adj.pert', - 'adv.all', - 'noun.Tops', - 'noun.act', - 'noun.animal', - 'noun.artifact', - 'noun.attribute', - 'noun.body', - 'noun.cognition', - 'noun.communication', - 'noun.event', - 'noun.feeling', - 'noun.food', - 'noun.group', - 'noun.location', - 'noun.motive', - 'noun.object', - 'noun.person', - 'noun.phenomenon', - 'noun.plant', - 'noun.possession', - 'noun.process', - 'noun.quantity', - 'noun.relation', - 'noun.shape', - 'noun.state', - 'noun.substance', - 'noun.time', - 'verb.body', - 'verb.change', - 'verb.cognition', - 'verb.communication', - 'verb.competition', - 'verb.consumption', - 'verb.contact', - 'verb.creation', - 'verb.emotion', - 'verb.motion', - 'verb.perception', - 'verb.possession', - 'verb.social', - 'verb.stative', - 'verb.weather', - 'adj.ppl' -]; +DataFile.LEX_NAMES = LEX_NAMES; module.exports = DataFile; diff --git a/src/wordpos.js b/src/node/index.js similarity index 58% rename from src/wordpos.js rename to src/node/index.js index b5bb23a..ceea30b 100644 --- a/src/wordpos.js +++ b/src/node/index.js @@ -1,162 +1,37 @@ /*! -* wordpos.js +* node/index.js * * Node.js part-of-speech utilities using WordNet database. * -* Copyright (c) 2012-2016 mooster@42at.com +* Copyright (c) 2012-2019 mooster@42at.com * https://github.com/moos/wordpos * * Released under MIT license */ -var _ = require('underscore')._, +var + _ = require('underscore')._, util = require('util'), - stopwords = require('../lib/natural/util/stopwords').words, - stopwordsStr = makeStopwordString(stopwords), + stopwordsStr, WNdb = require('wordnet-db'), DataFile = require('./dataFile'), - IndexFile = require('./indexFile'); - - -function normalize(word) { - return word.toLowerCase().replace(/\s+/g, '_'); -} - -function makeStopwordString(stopwords) { - return ' '+ stopwords.join(' ') +' '; -} - -function isStopword(stopwords, word) { - return stopwords.indexOf(' '+word+' ') >= 0; -} - -function tokenizer(str) { - return str.split(/\W+/); //_.without(results,'',' ') -} - -function prepText(text) { - if (_.isArray(text)) return text; - var deduped = _.uniq(tokenizer(text)); - if (!this.options.stopwords) return deduped; - return _.reject(deduped, _.bind(isStopword, null, - _.isString(this.options.stopwords) ? this.options.stopwords : stopwordsStr - )); -} - -/** - * factory for main lookup function - * - * @param pos {string} - n/v/a/r - * @returns {Function} - lookup function bound to POS - */ -function lookup(pos) { - return function(word, callback) { - var profile = this.options.profile, - start = profile && new Date(), - files = this.getFilesFor(pos), - args = []; - - word = normalize(word); - - // lookup index - return files.index.lookup(word) - .then(function(result) { - if (result) { - // lookup data - return files.data.lookup(result.synsetOffset).then(done); - } else { - // not found in index - return done([]); - } - }) - .catch(done); - - function done(results) { - if (results instanceof Error) { - args.push([], word); - } else { - args.push(results, word); - } - //console.log(3333, args) - profile && args.push(new Date() - start); - nextTick(callback, args); - return results; - } - }; -} - -/** - * isX() factory function - * - * @param pos {string} - n/v/a/r - * @returns {Function} - */ -function is(pos){ - return function(word, callback, _noprofile) { - // disable profiling when isX() used internally - var profile = this.options.profile && !_noprofile, - start = profile && new Date(), - args = [], - index = this.getFilesFor(pos).index; - word = normalize(word); - - return index - .lookup(word) - .then(function(record) { - var result = !!record; - args.push(result, word); - profile && args.push(new Date() - start); - nextTick(callback, args); - return result; - }); - }; -} - - -/** - * getX() factory function - * - * @param isFn {function} - an isX() function - * @returns {Function} - */ -function get(isFn) { - return function(text, callback, _noprofile) { - var profile = this.options.profile && !_noprofile, - start = profile && new Date(), - words = this.parse(text), - results = [], - self = this; - - //if (!n) return (process.nextTick(done),0); - return Promise - .all(words.map(exec)) - .then(done); - - function exec(word) { - return self[isFn] - .call(self, word, null, /*_noprofile*/ true) - .then(function collect(result) { - result && results.push(word); - }); - } - - function done(){ - var args = [results]; - profile && args.push(new Date() - start); - nextTick(callback, args); - return results; - } - }; -} - -// setImmediate executes callback AFTER promise handlers. -// Without it, exceptions in callback may be caught by Promise. -function nextTick(fn, args) { - if (fn) { - fn.apply(null, args); - } -} + IndexFile = require('./indexFile'), + { + nextTick, + normalize, + tokenizer, + prepText, + makeStopwordString, + stopwords + } = require('../util'), + { + is, + get, + seek, + lookup + } = require('../common'); +stopwordsStr = makeStopwordString(stopwords); /** * @class WordPOS @@ -183,7 +58,7 @@ var WordPOS = function(options) { this.advData = new DataFile(dictPath, 'adv'); // define randX() functions - require('./rand').init(this); + require('../rand').init(this); // FIXME if (_.isArray(this.options.stopwords)) { this.options.stopwords = makeStopwordString(this.options.stopwords); @@ -361,7 +236,6 @@ wordposProto.getVerbs = get('isVerb'); */ wordposProto.parse = prepText; - /** * seek - get record at offset for pos * @@ -370,22 +244,7 @@ wordposProto.parse = prepText; * @param callback {function} - optional callback * @returns Promise */ -wordposProto.seek = function(offset, pos, callback){ - offset = Number(offset); - if (_.isNaN(offset) || offset <= 0) return error('offset must be valid positive number.'); - - var data = this.getFilesFor(pos).data; - if (!data) return error('Incorrect POS - 2nd argument must be a, r, n or v.'); - - return data.lookup(offset, callback); - - function error(msg) { - var err = new Error(msg); - callback && callback(err, {}); - return Promise.reject(err); - } -}; - +wordposProto.seek = seek; /** * access to WordNet DB diff --git a/src/indexFile.js b/src/node/indexFile.js similarity index 80% rename from src/indexFile.js rename to src/node/indexFile.js index acc4a46..74a4c3e 100644 --- a/src/indexFile.js +++ b/src/node/indexFile.js @@ -1,9 +1,9 @@ /*! - * indexFile.js + * node/indexFile.js * * implements fast index lookup of WordNet's index files * - * Copyright (c) 2012-2018 mooster@42at.com + * Copyright (c) 2012-2019 mooster@42at.com * https://github.com/moos/wordpos * * Portions: Copyright (c) 2011, Chris Umbel @@ -16,6 +16,7 @@ var _ = require('underscore')._, path = require('path'), fs = require('fs'), piper = require('./piper'), + { indexLookup } = require('../common'), KEY_LENGTH = 3; @@ -133,49 +134,6 @@ function find(search, callback) { } } -/** - * find a word and prepare its lexical record - * - * @param word {string} - search word - * @param callback {function} - callback function receives result - * @returns none - * - * Credit for this routine to https://github.com/NaturalNode/natural - */ -function lookup(word, callback) { - var self = this; - - return new Promise(function(resolve, reject){ - self.find(word, function (record) { - var indexRecord = null, - i; - - if (record.status == 'hit') { - var ptrs = [], offsets = []; - - for (i = 0; i < parseInt(record.tokens[3]); i++) - ptrs.push(record.tokens[i]); - - for (i = 0; i < parseInt(record.tokens[2]); i++) - offsets.push(parseInt(record.tokens[ptrs.length + 6 + i], 10)); - - indexRecord = { - lemma : record.tokens[0], - pos : record.tokens[1], - ptrSymbol : ptrs, - senseCnt : parseInt(record.tokens[ptrs.length + 4], 10), - tagsenseCnt : parseInt(record.tokens[ptrs.length + 5], 10), - synsetOffset: offsets - }; - } - - callback && callback(indexRecord); - resolve(indexRecord); - }); - }); -} - - /** * loads fast index data and return fast index find function * @@ -216,7 +174,7 @@ var IndexFile = function(dictPath, name) { initIndex(this); }; -IndexFile.prototype.lookup = lookup; +IndexFile.prototype.lookup = indexLookup; IndexFile.prototype.find = find; /** diff --git a/src/piper.js b/src/node/piper.js similarity index 98% rename from src/piper.js rename to src/node/piper.js index c0985de..fc13be9 100644 --- a/src/piper.js +++ b/src/node/piper.js @@ -4,7 +4,7 @@ * executes multiple async i/o tasks and pools similar callbacks, * calling i/o open/close when all incoming tasks are done. * - * Copyright (c) 2012-2016 mooster@42at.com + * Copyright (c) 2012-2019 mooster@42at.com * https://github.com/moos/wordpos * * Released under MIT license @@ -79,4 +79,3 @@ piper.wrapper = function(self, task /*, result...*/){ module.exports = piper; - diff --git a/src/rand.js b/src/rand.js index 17808c8..81d57dd 100644 --- a/src/rand.js +++ b/src/rand.js @@ -3,7 +3,7 @@ * * define rand() and randX() functions on wordpos * - * Copyright (c) 2012-2016 mooster@42at.com + * Copyright (c) 2012-2019 mooster@42at.com * https://github.com/moos/wordpos * * Released under MIT license @@ -12,7 +12,14 @@ var _ = require('underscore')._, util = require('util'), Trie = require('../lib/natural/trie/trie'), - IndexFile = require('./indexFile'), + + +// FIXME + IndexFile = require('./node/indexFile'), + + + + KEY_LENGTH = 3; @@ -264,4 +271,3 @@ module.exports = { wordposProto.randVerb = makeRandX('v'); } }; -