From 57c09957e8f559cd81161f98089b58c5c134d618 Mon Sep 17 00:00:00 2001 From: Moos Date: Fri, 2 Nov 2018 18:48:52 -0700 Subject: [PATCH] refactor rand() for both browser and node --- scripts/makeJsonDict.js | 7 ++- src/browser/baseFile.js | 15 ++++- src/browser/index.js | 29 ++++++--- src/browser/indexFile.js | 80 +++++++++++++++++++++++-- src/common.js | 32 +--------- src/node/index.js | 2 +- src/node/rand.js | 116 +++++++----------------------------- src/rand.js | 125 +++++++++++++++++++++++++++++++++++++++ src/util.js | 21 ++++++- test/validate_test.js | 2 +- test/wordpos_test.js | 74 ++++++++--------------- 11 files changed, 304 insertions(+), 199 deletions(-) create mode 100644 src/rand.js diff --git a/scripts/makeJsonDict.js b/scripts/makeJsonDict.js index f81ebd0..66d840a 100644 --- a/scripts/makeJsonDict.js +++ b/scripts/makeJsonDict.js @@ -8,10 +8,11 @@ let fs = require('fs'); let path = require('path'); -let outPath = './dict'; -let testPath = './test/dict'; +let outPath = './dict'; // browser-use files +let testPath = './test/dict'; // mocha files in CJS format + let posExt = ['adj', 'adv', 'noun', 'verb']; -let dictRoot = './node_modules/wordnet-db/dict/'; +let dictRoot = './node_modules/wordnet-db/dict/'; // source files const fileTypes = { data: true, index: true diff --git a/src/browser/baseFile.js b/src/browser/baseFile.js index 64d4b49..156d570 100644 --- a/src/browser/baseFile.js +++ b/src/browser/baseFile.js @@ -15,15 +15,28 @@ class BaseFile { */ file = {}; - constructor(type, dictPath, posName) { + /** + * constructor + * @param {type} type - 'index' or 'data' + * @param {string} dictPath - path to dict db + * @param {string} posName - one of 'noun', 'verb', 'adj', 'adv' + * @param {object} [options] - @see WordPOS options + */ + + constructor(type, dictPath, posName, options) { this.type = type; this.filePath = `${dictPath}/${type}.${posName}.js`; + this.posName = posName; this.loadError = null; + this.options = Object.assign({}, options); } load() { if (this.loadError) return Promise.reject(this.loadError); + + this.options.debug && console.time('index load ' + this.posName); let promise = Promise.resolve(require(this.filePath)); + this.options.debug && console.timeEnd('index load ' + this.posName) return promise .then(exports => { diff --git a/src/browser/index.js b/src/browser/index.js index f0edce0..9eaa59f 100644 --- a/src/browser/index.js +++ b/src/browser/index.js @@ -9,6 +9,7 @@ const { stopwords, prepText, makeStopwordString } = require('../util'); const { is, get, getPOS, lookup, seek, lookupPOS } = require('../common'); +const { randX, rand } = require('../rand'); const IndexFile = require('./indexFile'); const DataFile = require('./dataFile'); @@ -22,7 +23,6 @@ const POS = { class WordPOS { options = {}; - loaded = Promise.resolve(); constructor(config) { this.options = Object.assign({}, WordPOS.defaults, config); @@ -31,17 +31,11 @@ class WordPOS { if (Array.isArray(this.options.stopwords)) { this.options.stopwords = makeStopwordString(this.options.stopwords); } - - // TODO rand() - } - - ready() { - return this.loaded; } initFiles() { const keys = Object.keys(POS); - const loadOne = (Comp, pos) => new Comp(this.options.dictPath, POS[pos]); + const loadOne = (Comp, pos) => new Comp(this.options.dictPath, POS[pos], this.options); const loader = (Comp) => keys.map(loadOne.bind(null, Comp)); const reducer = (arr) => arr.reduce((coll, item, i) => (coll[keys[i]] = item, coll), {}); @@ -118,6 +112,17 @@ class WordPOS { lookupAdverb = lookup('r'); lookupNoun = lookup('n'); lookupVerb = lookup('v'); + + /** + * define randX() + * @see makeRandX + */ + rand = rand; + randAdjective = randX('a'); + randAdverb = randX('r'); + randNoun = randX('n'); + randVerb = randX('v'); + } WordPOS.defaults = { @@ -155,7 +160,13 @@ WordPOS.defaults = { * include data files in preload * @type {boolean} */ - includeData: false + includeData: false, + + /** + * set to true to enable debug logging + * @type {boolean} + */ + debug: false }; diff --git a/src/browser/indexFile.js b/src/browser/indexFile.js index 898b732..f36e0ad 100644 --- a/src/browser/indexFile.js +++ b/src/browser/indexFile.js @@ -8,7 +8,9 @@ */ const { indexLookup } = require('../common'); +const { sample } = require('../util'); const BaseFile = require('./baseFile'); +const Trie = require('../../lib/natural/trie/trie'); /** * find a search term in an index file (using fast index) @@ -43,16 +45,78 @@ function find(search, callback) { } /** - * IndexFile class + * Select words at random for POS * - * @param dictPath {string} - WordNet db dict path - * @param posName {string} - name of index: noun, verb, adj, adv - * @constructor + * @param {string} startsWith - string that results should start with + * @param {integer} count - number of results to return + * @param {Function} callback - receives (results, startsWith) + * @return {Promise} receives results + * @this IndexFile + */ +function rand(startsWith, count, callback) { + const done = (res) => { + callback(res, startsWith || ''); + return Promise.resolve(res); + }; + + const doSample = (values) => { + let res = sample(values, count); + // console.timeEnd('getkeys') + return done(res); + }; + + const time = (label) => { + this.options.debug && console.time(label + ' ' + this.posName); + }; + + const timeEnd = (label) => { + this.options.debug && console.timeEnd(label + ' ' + this.posName); + }; + + if (!startsWith) { + // console.time('getkeys') + return doSample(this.getKeys()); + } + + // calc trie if haven't done so yet + if (!this.trie) { + time('Trie'); + this.trie = new Trie(); + this.trie.addStrings(this.getKeys()); + timeEnd('Trie'); + } + + let keys = []; + time('trie-withprefix'); + keys = this.trie.keysWithPrefix(startsWith); + timeEnd('trie-withprefix'); + + // TODO cache results? + + return keys.length ? doSample(keys) : done([]); +} + +/** + * IndexFile class */ class IndexFile extends BaseFile { - constructor(dictPath, posName) { - super('index', dictPath, posName); + keys = null; + + /** + * @param dictPath {string} - WordNet db dict path + * @param posName {string} - name of index: noun, verb, adj, adv + * @param {object} [options] - @see WordPOS options + * @constructor + */ + constructor(dictPath, posName, options) { + super('index', dictPath, posName, options); + this.options = Object.assign({}, options); + this.posName = posName; + } + + getKeys() { + return this.keys || (this.keys = Object.keys(this.file)); } lookup() { @@ -62,6 +126,10 @@ class IndexFile extends BaseFile { find() { return this.ready(find, arguments); } + + rand() { + return this.ready(rand, arguments); + } } module.exports = IndexFile; diff --git a/src/common.js b/src/common.js index 30bd273..35b45d7 100644 --- a/src/common.js +++ b/src/common.js @@ -9,7 +9,7 @@ * Released under MIT license */ -var { normalize, nextTick, isString, uniq, diff, flat } = require('./util'); +var { normalize, nextTick, isString, uniq, sample, diff, flat } = require('./util'); function error(err, callback) { if (isString(err)) err = new RangeError(err); @@ -341,35 +341,6 @@ function seek(offset, pos, callback){ return data.lookup(offset, callback); } -/** - * factory function for randX() - * - * @param pos {string} - a,r,n,v - * @returns {Function} - rand function bound to an index file - * @this WordPOS - */ -function makeRandX(pos){ - return function(opts, callback, _noprofile) { - // disable profiling when isX() used internally - var profile = this.options.profile && !_noprofile, - start = profile && new Date(), - args = [], - index = this.getFilesFor(pos).index, - startsWith = opts && opts.startsWith || '', - count = opts && opts.count || 1; - - if (typeof opts === 'function') { - callback = opts; - } - - return index.rand(startsWith, count, function (record) { - args.push(record, startsWith); - profile && args.push(new Date() - start); - callback && callback.apply(null, args); - }); - }; -} - const LEX_NAMES = [ 'adj.all', 'adj.pert', @@ -424,7 +395,6 @@ module.exports= { get, seek, getPOS, - makeRandX, lineDataToJSON, LEX_NAMES, diff --git a/src/node/index.js b/src/node/index.js index a15182a..eb724bd 100644 --- a/src/node/index.js +++ b/src/node/index.js @@ -60,7 +60,7 @@ var WordPOS = function(options) { this.advData = new DataFile(dictPath, 'adv'); // define randX() functions - require('../rand').init(this); + require('./rand').init(this); if (_.isArray(this.options.stopwords)) { this.options.stopwords = makeStopwordString(this.options.stopwords); diff --git a/src/node/rand.js b/src/node/rand.js index 9c7b2fe..62f9ca0 100644 --- a/src/node/rand.js +++ b/src/node/rand.js @@ -1,5 +1,5 @@ /*! - * rand.js + * node/rand.js * * define rand() and randX() functions on wordpos * @@ -10,13 +10,11 @@ */ var _ = require('underscore')._, - util = require('util'), - Trie = require('../lib/natural/trie/trie'), - indexPath = process.browser ? 'browser' : 'node', - IndexFile = require(`./${indexPath}/indexFile`), + { randX, rand } = require('../rand'), + Trie = require('../../lib/natural/trie/trie'), + IndexFile = require(`./indexFile`), KEY_LENGTH = 3; - /** * rand function (bound to index) * @@ -26,15 +24,14 @@ var _ = require('underscore')._, * @returns Promise * @this IndexFile */ -function rand(startsWith, num, callback){ +function randomizer(startsWith, num, callback){ var self = this, nextKey = null, trie = this.fastIndex.trie, key, keys; return new Promise(function(resolve, reject) { - - //console.log('-- ', startsWith, num, self.fastIndex.indexKeys.length); + // console.log('-- ', startsWith, num, self.fastIndex.indexKeys.length); if (startsWith) { key = startsWith.slice(0, KEY_LENGTH); @@ -45,10 +42,12 @@ function rand(startsWith, num, callback){ // calc trie if haven't done so yet if (!trie) { + // console.time('trie'); trie = new Trie(); trie.addStrings(self.fastIndex.indexKeys); self.fastIndex.trie = trie; //console.log(' +++ Trie calc '); + // console.timeEnd('trie') } try { @@ -129,80 +128,6 @@ function rand(startsWith, num, callback){ }); // Promise } -// relative weight of each POS word count (DB 3.1 numbers) -var POS_factor = { - Noun: 26, - Verb: 3, - Adjective: 5, - Adverb: 1, - Total: 37 -}; - -/** - * rand() - for all Index files - * @returns Promise - */ -function randAll(opts, callback) { - - if (typeof opts === 'function') { - callback = opts; - opts = {}; - } else { - opts = _.clone(opts || {}); - } - - var - profile = this.options.profile, - start = profile && new Date(), - results = [], - startsWith = opts && opts.startsWith || '', - count = opts && opts.count || 1, - args = [null, startsWith], - parts = 'Noun Verb Adjective Adverb'.split(' '), - self = this; - - - - return new Promise(function(resolve, reject) { - // select at random a POS to look at - var doParts = _.sample(parts, parts.length); - tryPart(); - - function tryPart() { - var part = doParts.pop(), - rand = 'rand' + part, - factor = POS_factor[part], - weight = factor / POS_factor.Total; - - // pick count according to relative weight - opts.count = Math.ceil(count * weight * 1.1); // guard against dupes - self[rand](opts, partCallback); - } - - function partCallback(result) { - if (result) { - results = _.uniq(results.concat(result)); // make sure it's unique! - } - - if (results.length < count && doParts.length) { - return tryPart(); - } - - // final random and trim excess - results = _.sample(results, count); - done(); - } - - function done() { - profile && (args.push(new Date() - start)); - args[0] = results; - callback && callback.apply(null, args); - resolve(results); - } - - }); // Promise -} - /** * bind rand() to index * @@ -210,31 +135,30 @@ function randAll(opts, callback) { * @returns {function} - bound rand function for index */ function randomify(index){ - if (!index.fastIndex) throw 'rand requires fastIndex'; - return _.bind(rand, index); + if (!index.fastIndex) throw new Error('rand requires fastIndex'); + index.rand = _.bind(randomizer, index); } - module.exports = { init: function(wordposProto) { - wordposProto.nounIndex.rand = randomify(wordposProto.nounIndex); - wordposProto.verbIndex.rand = randomify(wordposProto.verbIndex); - wordposProto.adjIndex.rand = randomify(wordposProto.adjIndex); - wordposProto.advIndex.rand = randomify(wordposProto.advIndex); + randomify(wordposProto.nounIndex); + randomify(wordposProto.verbIndex); + randomify(wordposProto.adjIndex); + randomify(wordposProto.advIndex); /** - * define rand() + * define rand() (all POS) */ - wordposProto.rand = randAll; + wordposProto.rand = rand; /** * define randX() */ - wordposProto.randAdjective = makeRandX('a'); - wordposProto.randAdverb = makeRandX('r'); - wordposProto.randNoun = makeRandX('n'); - wordposProto.randVerb = makeRandX('v'); + wordposProto.randAdjective = randX('a'); + wordposProto.randAdverb = randX('r'); + wordposProto.randNoun = randX('n'); + wordposProto.randVerb = randX('v'); } }; diff --git a/src/rand.js b/src/rand.js new file mode 100644 index 0000000..c39ee03 --- /dev/null +++ b/src/rand.js @@ -0,0 +1,125 @@ +/** +* rand.js +* +* Copyright (c) 2012-2019 mooster@42at.com +* https://github.com/moos/wordpos +* +* Released under MIT license +*/ + +var { uniq, sample } = require('./util'); + +/** + * factory function for randX() + * + * @param pos {string} - a,r,n,v + * @returns {Function} - rand function bound to an index file + * @this WordPOS + */ +function randX(pos){ + return function(opts, callback, _noprofile) { + // disable profiling when isX() used internally + var profile = this.options.profile && !_noprofile, + start = profile && new Date(), + args = [], + index = this.getFilesFor(pos).index, + startsWith = opts && opts.startsWith || '', + count = opts && opts.count || 1; + + if (typeof opts === 'function') { + callback = opts; + } + + return index.rand(startsWith, count, function (record) { + args.push(record, startsWith); + profile && args.push(new Date() - start); + callback && callback.apply(null, args); + }); + }; +} + + +/** + * rand() - for all Index files + * + * @param [opts] {object} options + * @param opts.startsWith {string} string random words should start with + * @param opts.count {integer} number of random words to return + * @param callback {function} - callback receives (results, startsWith, profile) + * @returns {Promise} receives results + * @this WordPOS + */ +function rand(opts, callback) { + if (typeof opts === 'function') { + callback = opts; + opts = {}; + } else { + opts = Object.assign({ + startsWith: '', + count: 1 + }, opts); + } + + var + profile = this.options.profile, + start = profile && new Date(), + results = [], + count = opts.count, + args = [null, opts.startsWith], + parts = 'Noun Verb Adjective Adverb'.split(' '), + self = this; + + return new Promise(function(resolve, reject) { + // select at random a POS to look at + var doParts = sample(parts, parts.length); + tryPart(); + + function tryPart() { + var part = doParts.pop(), + rand = 'rand' + part, + factor = POS_factor[part], + weight = factor / POS_factor.Total; + + // pick count according to relative weight + opts.count = Math.ceil(count * weight * 1.1); // guard against dupes + self[rand](opts, partCallback); + } + + function partCallback(result) { + if (result) { + results = uniq(results.concat(result)); // make sure it's unique! + } + + if (results.length < count && doParts.length) { + return tryPart(); + } + + // final random and trim excess + results = sample(results, count); + done(); + } + + function done() { + profile && (args.push(new Date() - start)); + args[0] = results; + callback && callback.apply(null, args); + resolve(results); + } + + }); // Promise +} + + +// relative weight of each POS word count (DB 3.1 numbers) +const POS_factor = { + Noun: 26, + Verb: 3, + Adjective: 5, + Adverb: 1, + Total: 37 +}; + +module.exports = { + randX, + rand +}; diff --git a/src/util.js b/src/util.js index b75d76a..143a250 100644 --- a/src/util.js +++ b/src/util.js @@ -53,6 +53,24 @@ function flat(arr) { return [].concat.apply([], arr); } +// get random sample from array (note: count << array.length) +// https://stackoverflow.com/a/37834217 +function sample(array, count) { + var indices = []; + var result = new Array(count); + for (let i = 0; i < count; i++ ) { + let j = Math.floor(Math.random() * (array.length - i) + i); + let val = array[indices[j] === undefined ? j : indices[j]]; + if (val === undefined) { + result.length = i; + break; + } + result[i] = val; + indices[j] = indices[i] === undefined ? i : indices[i]; + } + return result; +} + function isString(s) { return typeof s === 'string'; } @@ -81,5 +99,6 @@ module.exports = { makeStopwordString, uniq, diff, - flat + flat, + sample }; diff --git a/test/validate_test.js b/test/validate_test.js index 52a6065..5bccb3d 100644 --- a/test/validate_test.js +++ b/test/validate_test.js @@ -50,4 +50,4 @@ function callback(error, stdout, stderr) { console.log(stdout); console.error(stderr); gDone(); -} \ No newline at end of file +} diff --git a/test/wordpos_test.js b/test/wordpos_test.js index 49b1679..2efd10f 100644 --- a/test/wordpos_test.js +++ b/test/wordpos_test.js @@ -1,7 +1,7 @@ /** * wordpos_test.js * - * test file for main wordpos functionality + * test file for main wordpos functionality (both node and browser) * * Usage: * npm install mocha -g @@ -11,14 +11,12 @@ * * npm test * - * Copyright (c) 2012-2016 mooster@42at.com + * Copyright (c) 2012-2019 mooster@42at.com * https://github.com/moos/wordpos * * Released under MIT license */ -//import {describe, it} from 'mocha/lib/mocha.js'; - var chai = require('chai'), _ = require('underscore'), @@ -29,7 +27,8 @@ var dictPath = browser ? path.resolve('./test/dict') : undefined, wordpos = new WordPOS({ profile: false, - dictPath: dictPath + dictPath: dictPath, + // debug: true }); const assertNoData = (err) => { @@ -58,7 +57,6 @@ var str = "The angry bear chased the frightened little squirrel", offset = 1285602; - describe('lookup', function() { it('with callback', function () { @@ -301,73 +299,49 @@ describe('nested callbacks on same index key', function() { describe('rand()...', function() { - it('should get random word', function(done) { - wordpos.rand(function(result) { + it('should get random word', function() { + return wordpos.rand(function(result) { assert.equal(result.length, 1); - done(); }); }); - it('should get N random words', function(done) { - wordpos.rand({count: 3}, function(result) { + it('should get N random words', function() { + return wordpos.rand({count: 3}, function(result) { assert.equal(result.length, 3); - done(); }); }); - it('should get random word starting with', function(done) { - wordpos.rand({startsWith: 'foo'}, function(result, startsWith) { + it('should get random word starting with', function() { + return wordpos.rand({startsWith: 'foo'}, function(result, startsWith) { assert.equal(result[0].indexOf('foo'), 0); assert.equal(startsWith, 'foo'); - done(); }); }); - it('should get nothing starting with not found', function(done) { - wordpos.rand({startsWith: 'zzzz'}, function(result) { + it('should get nothing starting with not found', function() { + return wordpos.rand({startsWith: 'zzzz'}, function(result) { assert.equal(result.length, 0); - done(); }); }); }); describe('randX()...', function() { - it('should get random noun', function(done) { - wordpos.randNoun(function(result) { - assert.equal(result.length, 1); - done(); - }); - }); + let assertOneResult = (res) => { + assert.equal(res.length, 1); + }; - it('should get random verb', function(done) { - wordpos.randVerb(function(result) { - assert.equal(result.length, 1); - done(); - }); - }); - - it('should get random adjective', function(done) { - wordpos.randAdjective(function(result) { - assert.equal(result.length, 1); - done(); - }); - }); - - it('should get random adverb', function(done) { - wordpos.randAdverb(function(result) { - assert.equal(result.length, 1); - done(); - }); - }); + it('should get random noun', () => wordpos.randNoun(assertOneResult)); + it('should get random verb', () => wordpos.randVerb(assertOneResult)); + it('should get random adjective', () => wordpos.randAdjective(assertOneResult)); + it('should get random adverb', () => wordpos.randAdverb(assertOneResult)); // not found - it('should NOT get random noun starting with', function(done) { - wordpos.randNoun({startsWith: 'zzzz'},function(result, startsWith) { - assert.equal(result.length, 0); - done(); - }); - }); + it('should NOT get random noun starting with', () => + wordpos.randNoun({startsWith: 'zzzz'}, (result, startsWith) => + assert.equal(result.length, 0) + ) + ); });