From 364b2648f7a3ffb1e4af38812c43ac025f6f41bf Mon Sep 17 00:00:00 2001 From: Moos Date: Fri, 12 Oct 2018 20:35:11 -0700 Subject: [PATCH] first checkin for browser rework --- .babelrc | 4 + .gitignore | 4 +- package.json | 22 ++- samples/self-hosted/index.html | 51 ++++++ samples/self-hosted/main.js | 19 +++ scripts/makeJsonDict.js | 85 ++++++++++ src/browser/baseFile.js | 30 ++++ src/browser/dataFile.js | 92 +++++++++++ src/browser/index.js | 165 ++++++++++++++++++++ src/browser/indexFile.js | 71 +++++++++ src/browser/piper.js | 82 ++++++++++ src/browser/rand.js | 267 +++++++++++++++++++++++++++++++ src/common.js | 277 +++++++++++++++++++++++++++++++++ src/util.js | 56 +++++++ 14 files changed, 1221 insertions(+), 4 deletions(-) create mode 100644 .babelrc create mode 100644 samples/self-hosted/index.html create mode 100644 samples/self-hosted/main.js create mode 100644 scripts/makeJsonDict.js create mode 100644 src/browser/baseFile.js create mode 100644 src/browser/dataFile.js create mode 100644 src/browser/index.js create mode 100644 src/browser/indexFile.js create mode 100644 src/browser/piper.js create mode 100644 src/browser/rand.js create mode 100644 src/common.js create mode 100644 src/util.js diff --git a/.babelrc b/.babelrc new file mode 100644 index 0000000..1067c95 --- /dev/null +++ b/.babelrc @@ -0,0 +1,4 @@ +{ + "presets": ["env", "stage-2"], + "plugins": ["transform-class-properties"] +} diff --git a/.gitignore b/.gitignore index 0db3560..bbecf68 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ dict node_modules .idea -*.iml \ No newline at end of file +*.iml +.cache +dist diff --git a/package.json b/package.json index 9edf78e..fd19e0e 100755 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "wordpos", - "version": "1.2.0", + "version": "2.0.0-alpha", "description": "wordpos is a set of part-of-speech utilities for Node.js using the WordNet database.", "author": "Moos ", "keywords": [ @@ -16,14 +16,26 @@ "engines": { "node": ">=4" }, - "files": ["bench","bin","lib","src","test","tools"], + "files": [ + "bench", + "bin", + "lib", + "src", + "test", + "tools" + ], "bin": "./bin/wordpos-cli.js", "dependencies": { "commander": "^2.0.0", + "dict": "^1.4.0", "underscore": ">=1.3.1", "wordnet-db": "^3.1.6" }, "devDependencies": { + "babel-core": "^6.26.3", + "babel-plugin-transform-class-properties": "^6.24.1", + "babel-preset-env": "^1.7.0", + "babel-preset-stage-2": "^6.24.1", "chai": "^4.0.2", "mini-bench": "^1.0.0", "mocha": "^5.2.0" @@ -35,7 +47,11 @@ "main": "./src/wordpos.js", "scripts": { "postinstall": "node tools/stat.js --no-stats index.adv index.adj index.verb index.noun", - "test": "mocha test" + "postinstall-web": "node scripts/makeJsonDict.js index data", + "test": "mocha test", + "start": "npm run start-self", + "start-self": "parcel samples/self-hosted/index.html", + "start-cdn": "parcel samples/cdn/index.html" }, "license": "MIT" } diff --git a/samples/self-hosted/index.html b/samples/self-hosted/index.html new file mode 100644 index 0000000..05fee23 --- /dev/null +++ b/samples/self-hosted/index.html @@ -0,0 +1,51 @@ + + + + + + + + +

Self-hosted WordPOS sample

+ + + + diff --git a/samples/self-hosted/main.js b/samples/self-hosted/main.js new file mode 100644 index 0000000..4e5db59 --- /dev/null +++ b/samples/self-hosted/main.js @@ -0,0 +1,19 @@ +import WordPOS from '../../src/browser'; + +console.log(__dirname, WordPOS.defaults) + +let wordpos = window.wordpos = new WordPOS({ + // preload: true, + dictPath: './dict', + profile: true, + // stopwords: false +}); + +wordpos.isAdverb('likely').then(res => console.log('likely is adverb:', res)); +// wordpos.isAdverb('likely', (res, ...profile) => console.log('likely callback', res, profile)); +wordpos.getAdverbs('this is is likely a likely tricky business this is').then( + res => console.log('getAdverb', res) +); + +wordpos.lookupAdverb('likely').then(res => console.log('lookup ===', res)) +// wordpos.lookup('likely').then(res, console.log('lookup ===', res)) diff --git a/scripts/makeJsonDict.js b/scripts/makeJsonDict.js new file mode 100644 index 0000000..cabbedb --- /dev/null +++ b/scripts/makeJsonDict.js @@ -0,0 +1,85 @@ +#!/usr/bin/env node + +/** + * takes original WordNet index & data files and converts to + * exported JSON format with lemma as the key. + */ + +let fs = require('fs'); +let path = require('path'); + +let outPath = './dict'; +let posExt = ['adj', 'adv', 'noun', 'verb']; +let dictRoot = './node_modules/wordnet-db/dict/'; +const fileTypes = { + data: true, + index: true +}; +const [,, ...args] = process.argv; + +if (!args.length || args.filter(p => !(p in fileTypes)).length) { + console.log('Converts wordnet-db index & data files to JSON format for use in the browser.'); + console.log('\nUsage: makeJsonDict.js index|data'); + process.exit(1); +} + +function uniq(arr) { + return arr.filter((v, i) => arr.indexOf(v) === i); +} + +console.time('Done'); + +// create out directory +try { + fs.statSync(outPath); +} catch (e) { + fs.mkdirSync(outPath); +} + +function processFile(name) { + + // read the file as text + function loadFile(pos) { + console.time(' load'); + let inPath = path.resolve(dictRoot, name + '.' + pos); + let text = fs.readFileSync(inPath, 'utf8'); + console.timeEnd(' load'); + return text; + } + + // convert raw text to JSON and write to file + function processText(pos, text) { + let obj = {}; + let sp = ' '; + console.time(' process'); + text.split('\n').forEach(line => { + if (!line || line[0] === sp) return; + let spi = line.indexOf(sp); + let key = line.substr(0, spi); + line = line.substring(1 + spi, line.lastIndexOf(sp + sp)) + obj[key] = line; + }); + console.timeEnd(' process'); + return obj; + } + + function writeFile(pos, obj) { + console.time(' write'); + let text = JSON.stringify(obj); + text = 'export default ' + text; + fs.writeFileSync(path.resolve(outPath, name + '.' + pos + '.js'), text); + console.timeEnd(' write'); + } + + posExt.forEach(pos => { + console.log('\n', name, pos, ':'); + let text = loadFile(pos); + let obj = processText(pos, text); + writeFile(pos, obj); + }); +} + +uniq(args).forEach(processFile); + +console.log('\nWritten to', path.resolve(outPath)); +console.timeEnd('Done'); diff --git a/src/browser/baseFile.js b/src/browser/baseFile.js new file mode 100644 index 0000000..68af93b --- /dev/null +++ b/src/browser/baseFile.js @@ -0,0 +1,30 @@ + + +class BaseFile { + + /** + * file contents + * @type {Object} + */ + file = {}; + + constructor(type, dictPath, posName) { + this.filePath = `${dictPath}/${type}.${posName}.js`; + this.type = type; + } + + load() { + return import(this.filePath) + .then(exports => this.file = exports.default) + .catch(err => { + console.error(`Error loading ${this.type} file for ${this.filePath}.`, err); + throw err; + }); + } + + ready(fn, args) { + return this.load().then(() => fn.apply(this, args)); + } +} + +export default BaseFile; diff --git a/src/browser/dataFile.js b/src/browser/dataFile.js new file mode 100644 index 0000000..6c72327 --- /dev/null +++ b/src/browser/dataFile.js @@ -0,0 +1,92 @@ +/*! + * dataFile.js + * + * Copyright (c) 2012-2019 mooster@42at.com + * https://github.com/moos/wordpos + * + * Portions: Copyright (c) 2011, Chris Umbel + * + * Released under MIT license + */ + +import { lineDataToJSON, LEX_NAMES } from '../common'; +import BaseFile from './BaseFile'; + +/** + * get parsed line from data file + * + * @param {string} offset The offset key + * @return {object} Data record object + * @this DataFile + */ +function seek(offset) { + let str = this.file[offset]; + if (!str) return {}; + // offset was extracted for the key - add it back to line data + return lineDataToJSON(offset + ' ' + str); +} + +/** + * lookup offsets in data file + * + * @param offsets {array} - array of offsets to lookup (obtained from index.find()) + * @param callback{function} (optional) - callback function + * @returns {Promise.[]} array of or single data record + * @this DataFile + */ +function lookup(offsets, callback) { + var results = [], + self = this, + readLine = seek.bind(this), + valid = (item => item.pos), + single = !Array.isArray(offsets); + + if (single) offsets = [offsets]; + return new Promise(function(resolve, reject) { + results = offsets.map(readLine).filter(valid); + + if (!results.length) { + let err = new RangeError(`No data at offsets ${offsets.join()} in ${self.filePath}.`); + callback && callback(err, single ? {} :[]); + reject(err); + } else { + if (single) results = results[0]; + callback && callback(null, results); + resolve(results); + } + }); +} + +/** + * DataFile class + * + * @param dictPath {string} - path to dict folder + * @param name {string} - POS name + * @constructor + */ +class DataFile extends BaseFile { + + constructor(dictPath, posName) { + super('data', dictPath, posName); + } + + lookup() { + return this.ready(lookup, arguments); + } + + seek() { + // return this.ready(find, arguments); + } + +} + + +/** + * map of lexFilenum to lex names + * + * @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html + * @type {string[]} + */ +DataFile.LEX_NAMES = LEX_NAMES; + +export default DataFile; diff --git a/src/browser/index.js b/src/browser/index.js new file mode 100644 index 0000000..344b549 --- /dev/null +++ b/src/browser/index.js @@ -0,0 +1,165 @@ +import { stopwords, prepText, makeStopwordString } from '../util'; +import { is, get, lookup } from '../common'; +import IndexFile from './indexFile'; +import DataFile from './dataFile'; + +const POS = { + n: 'noun', + v: 'verb', + a: 'adj', + r: 'adv' +}; + + +class WordPOS { + + options = {}; + loaded = Promise.resolve(); + + constructor(config) { + this.options = Object.assign({}, WordPOS.defaults, config); + console.log('wpos ctor -- ', this.options) + + this.initFiles(); + if (Array.isArray(this.options.stopwords)) { + this.options.stopwords = makeStopwordString(this.options.stopwords); + } + + // TODO rand() + } + + ready() { + return this.loaded; + } + + initFiles() { + const keys = Object.keys(POS); + const loadOne = (Comp, pos) => new Comp(this.options.dictPath, POS[pos]); + const loader = (Comp) => keys.map(loadOne.bind(null, Comp)); + const reducer = (arr) => arr.reduce((coll, item, i) => (coll[keys[i]] = item, coll), {}); + + this.indexFiles = reducer(loader(IndexFile)); + this.dataFiles = reducer(loader(DataFile)); + + if (this.options.preload) { + this.loaded = this.preloadIndexes(this.options.preload); + } + } + + getFilesFor(pos) { + return { + index: this.indexFiles[pos], + data: this.dataFiles[pos] + }; + } + + /** + * loads index files + * + * @param {string|Array} [pos] POS to load (default: all) + * @return {Promise.} + */ + preloadIndexes(pos) { + let file = this.indexFile[pos]; + let load = p => file.load(); + let promise; + + if (!pos || pos === true) { // preload all + promise = Promise.all(Object.keys(POS).map(load)); + } + else if (typeof pos === 'string' && file) { + promise = load(pos); + } + else if (pos instanceof Array) { + promise = pos.forEach(pos => file && load(pos)); + } + + // TODO includeData + + return promise || Promise.reject(new RangeError(`Unknown POS "${pos}" for preload.`)); + } + + parse = prepText; + + /** + * isX() - Test if word is given POS + * @see is + */ + isAdjective = is('a'); + isAdverb = is('r'); + isNoun = is('n'); + isVerb = is('v'); + + /** + * getX() - Find all words in string that are given POS + * @see get + */ + getAdjectives = get('isAdjective'); + getAdverbs = get('isAdverb'); + getNouns = get('isNoun'); + getVerbs = get('isVerb'); + + /** + * lookupX() - Lookup word definition if already know POS + * @see lookup + */ + lookupAdjective = lookup('a'); + lookupAdverb = lookup('r'); + lookupNoun = lookup('n'); + lookupVerb = lookup('v'); +} + +WordPOS.defaults = { + /** + * path to WordNet data (override only if not using wordnet-db) + * @type {string} + */ + dictPath: '', + + /** + * enable profiling, time in msec returned as second argument in callback + * @type {boolean} + */ + profile: false, + + /** + * if true, exclude standard stopwords. + * if array, stopwords to exclude, eg, ['all','of','this',...] + * if false, do not filter any stopwords. + * @type {boolean} + */ + stopwords: true, + + /** + * preload files. + * true - preload all POS + * false - do not preload any POS + * 'a' - preload adj + * ['a','v'] - preload adj & verb + * @type {boolean|string|Array} + */ + preload: false, + + /** + * include data files in preload + * @type {boolean} + */ + + includeData: false + +}; + + +/** + * access to WordNet DB + * @type {object} + */ +// WordPOS.WNdb = WNdb; + +/** + * access to stopwords + * @type {Array} + */ +WordPOS.stopwords = stopwords; + +export default WordPOS; diff --git a/src/browser/indexFile.js b/src/browser/indexFile.js new file mode 100644 index 0000000..88d0166 --- /dev/null +++ b/src/browser/indexFile.js @@ -0,0 +1,71 @@ +/*! + * indexFile.js + * + * implements fast index lookup of WordNet's index files + * + * Copyright (c) 2012-2019 mooster@42at.com + * https://github.com/moos/wordpos + * + * Portions: Copyright (c) 2011, Chris Umbel + * + * Released under MIT license + */ + +import { indexLookup } from '../common'; +import BaseFile from './BaseFile'; + +/** + * find a search term in an index file (using fast index) + * + * Calls to same bucket are queued for callback using the piper. + * + * @param search {string} - word to search for + * @param callback {function} - callback receives found line and tokens + * @returns none + * @this IndexFile + */ +function find(search, callback) { + var miss = {status: 'miss'}; + + if (!(search in this.file)) { + callback(miss); + return; + } + + var + line = this.file[search], + tokens = line.split(/\s+/), + result = { + status: 'hit', + key: search, + line: line, + tokens: tokens + }; + + result.tokens.unshift(search); + callback(result); +} + +/** + * IndexFile class + * + * @param dictPath {string} - WordNet db dict path + * @param name {string} - name of index: noun, verb, adj, adv + * @constructor + */ +class IndexFile extends BaseFile { + + constructor(dictPath, posName) { + super('index', dictPath, posName); + } + + lookup() { + return this.ready(indexLookup, arguments); + } + + find() { + return this.ready(find, arguments); + } +} + +export default IndexFile; diff --git a/src/browser/piper.js b/src/browser/piper.js new file mode 100644 index 0000000..c0985de --- /dev/null +++ b/src/browser/piper.js @@ -0,0 +1,82 @@ +/*! + * piper.js + * + * executes multiple async i/o tasks and pools similar callbacks, + * calling i/o open/close when all incoming tasks are done. + * + * Copyright (c) 2012-2016 mooster@42at.com + * https://github.com/moos/wordpos + * + * Released under MIT license + */ + +var _ = require('underscore')._, + util = require('util'), + fs = require('fs'); + +/** + * run single 'task' method sharing callbacks. Method MUST take callback as LAST arg. + * piper is bound to an IndexFile. + * + * @param task {string} - task name unique to method! + * @param method {function} - method to execute, gets (args, ... , callback) + * @param args {Array} - args to pass to method + * @param context {object} - other params to remember and sent to callback + * @param callback {function} - result callback + */ +function piper(task, method, args, context, callback){ + var readCallbacks = this.callbackQueue, + memoArgs = _.rest(arguments, 2), + wrappedCallback; + + //console.log('piper', task, [method]); + + // queue up if already reading file for this task + if (task in readCallbacks){ + readCallbacks[task].push(memoArgs); + return; + } + readCallbacks[task] = [memoArgs]; + + if (!this.fd) { + //console.log(' ... opening', this.filePath); + this.fd = fs.openSync(this.filePath, 'r'); + } + + // ref count so we know when to close the main index file + ++this.refcount; + + wrappedCallback = _.partial(piper.wrapper, this, task); + + // call method -- replace original callback (last arg) with wrapped one + method.apply(null, [].concat( args, wrappedCallback )); +} + +// result is the *same* for same task +piper.wrapper = function(self, task /*, result...*/){ + var readCallbacks = self.callbackQueue, + result = _.rest(arguments, 2), + callback, args; + + // live access callbacks cache in case nested cb's + // add to the array. + while (args = readCallbacks[task].shift()) { + callback = args.pop(); // last arg MUST be callback + +// console.log('>>>> pper wrapper', self.fastIndex.name, task, result.toString()) + callback.apply(null, [].concat(_.flatten(args, /*shallow*/true), result)); + } + + // now done - delete cb cache + delete readCallbacks[task]; + + if (--self.refcount === 0) { + //console.log(' ... closing', self.filePath); + fs.closeSync(self.fd); + self.fd = null; + } +}; + + +module.exports = piper; + diff --git a/src/browser/rand.js b/src/browser/rand.js new file mode 100644 index 0000000..17808c8 --- /dev/null +++ b/src/browser/rand.js @@ -0,0 +1,267 @@ +/*! + * rand.js + * + * define rand() and randX() functions on wordpos + * + * Copyright (c) 2012-2016 mooster@42at.com + * https://github.com/moos/wordpos + * + * Released under MIT license + */ + +var _ = require('underscore')._, + util = require('util'), + Trie = require('../lib/natural/trie/trie'), + IndexFile = require('./indexFile'), + KEY_LENGTH = 3; + + +/** + * factory function for randX() + * + * @param pos {string} - a,r,n,v + * @returns {Function} - rand function bound to an index file + */ +function makeRandX(pos){ + return function(opts, callback, _noprofile) { + // disable profiling when isX() used internally + var profile = this.options.profile && !_noprofile, + start = profile && new Date(), + args = [], + index = this.getFilesFor(pos).index, + startsWith = opts && opts.startsWith || '', + count = opts && opts.count || 1; + + if (typeof opts === 'function') { + callback = opts; + } + + return index.rand(startsWith, count, function (record) { + args.push(record, startsWith); + profile && args.push(new Date() - start); + callback && callback.apply(null, args); + }); + }; +} + +/** + * rand function (bound to index) + * + * @param startsWith {string} - get random word(s) that start with this, or '' + * @param num {number} - number of words to return + * @param callback {function} - callback function, receives words array and startsWith + * @returns Promise + */ +function rand(startsWith, num, callback){ + var self = this, + nextKey = null, + trie = this.fastIndex.trie, + key, keys; + + return new Promise(function(resolve, reject) { + + //console.log('-- ', startsWith, num, self.fastIndex.indexKeys.length); + if (startsWith) { + key = startsWith.slice(0, KEY_LENGTH); + + /** + * if key is 'a' or 'ab' (<3 chars), search for ALL keys starting with that. + */ + if (key.length < KEY_LENGTH) { + + // calc trie if haven't done so yet + if (!trie) { + trie = new Trie(); + trie.addStrings(self.fastIndex.indexKeys); + self.fastIndex.trie = trie; + //console.log(' +++ Trie calc '); + } + + try { + // trie throws if not found!!!!! + keys = trie.keysWithPrefix(startsWith); + } catch (e) { + keys = []; + } + + // read all keys then select random word. + // May be large disk read! + key = keys[0]; + nextKey = _.last(keys); + } + + if (!key || !(key in self.fastIndex.offsets)) { + callback && callback([], startsWith); + resolve([]); + } + + } else { + // no startWith given - random select among keys + keys = _.sample(self.fastIndex.indexKeys, num); + + // if num > 1, run each key independently and collect results + if (num > 1) { + var results = [], ii = 0; + _(keys).each(function (startsWith) { + self.rand(startsWith, 1, function (result) { + results.push(result[0]); + if (++ii == num) { + callback && callback(results, ''); + resolve(results); + } + }); + }); + return; + } + key = keys; + } + + // prepare the piper + var args = [key, nextKey, self], + task = 'rand:' + key + nextKey, + context = [startsWith, num, callback]; // last arg MUST be callback + + // pay the piper + self.piper(task, IndexFile.readIndexBetweenKeys, args, context, collector); + + function collector(key, nextKey, index, startsWith, num, callback, buffer) { + var lines = buffer.toString().split('\n'), + matches = lines.map(function (line) { + return line.substring(0, line.indexOf(' ')); + }); + //console.log(' got lines for key ', key, lines.length); + + // we got bunch of matches for key - now search within for startsWith + if (startsWith !== key) { + // binary search for startsWith within set of matches + var ind = _.sortedIndex(matches, startsWith); + if (ind >= lines.length || matches[ind].indexOf(startsWith) === -1) { + callback && callback([], startsWith); + resolve([]); + return; + } + + var trie = new Trie(); + trie.addStrings(matches); + //console.log('Trie > ', trie.matchesWithPrefix( startsWith )); + matches = trie.keysWithPrefix(startsWith); + } + + var words = _.sample(matches, num); + callback && callback(words, startsWith); + resolve(words); + } + + }); // Promise +} + +// relative weight of each POS word count (DB 3.1 numbers) +var POS_factor = { + Noun: 26, + Verb: 3, + Adjective: 5, + Adverb: 1, + Total: 37 +}; + +/** + * rand() - for all Index files + * @returns Promise + */ +function randAll(opts, callback) { + + if (typeof opts === 'function') { + callback = opts; + opts = {}; + } else { + opts = _.clone(opts || {}); + } + + var + profile = this.options.profile, + start = profile && new Date(), + results = [], + startsWith = opts && opts.startsWith || '', + count = opts && opts.count || 1, + args = [null, startsWith], + parts = 'Noun Verb Adjective Adverb'.split(' '), + self = this; + + + + return new Promise(function(resolve, reject) { + // select at random a POS to look at + var doParts = _.sample(parts, parts.length); + tryPart(); + + function tryPart() { + var part = doParts.pop(), + rand = 'rand' + part, + factor = POS_factor[part], + weight = factor / POS_factor.Total; + + // pick count according to relative weight + opts.count = Math.ceil(count * weight * 1.1); // guard against dupes + self[rand](opts, partCallback); + } + + function partCallback(result) { + if (result) { + results = _.uniq(results.concat(result)); // make sure it's unique! + } + + if (results.length < count && doParts.length) { + return tryPart(); + } + + // final random and trim excess + results = _.sample(results, count); + done(); + } + + function done() { + profile && (args.push(new Date() - start)); + args[0] = results; + callback && callback.apply(null, args); + resolve(results); + } + + }); // Promise +} + +/** + * bind rand() to index + * + * @param index {object} - the IndexFile instance + * @returns {function} - bound rand function for index + */ +function randomify(index){ + if (!index.fastIndex) throw 'rand requires fastIndex'; + return _.bind(rand, index); +} + + + +module.exports = { + + init: function(wordposProto) { + wordposProto.nounIndex.rand = randomify(wordposProto.nounIndex); + wordposProto.verbIndex.rand = randomify(wordposProto.verbIndex); + wordposProto.adjIndex.rand = randomify(wordposProto.adjIndex); + wordposProto.advIndex.rand = randomify(wordposProto.advIndex); + + /** + * define rand() + */ + wordposProto.rand = randAll; + + /** + * define randX() + */ + wordposProto.randAdjective = makeRandX('a'); + wordposProto.randAdverb = makeRandX('r'); + wordposProto.randNoun = makeRandX('n'); + wordposProto.randVerb = makeRandX('v'); + } +}; + diff --git a/src/common.js b/src/common.js new file mode 100644 index 0000000..a405af2 --- /dev/null +++ b/src/common.js @@ -0,0 +1,277 @@ +import { normalize, nextTick } from './util'; + + + +/** + * factory for main lookup function + * + * @param pos {string} - n/v/a/r + * @returns {Function} - lookup function bound to POS + * @this WordPOS + */ +function lookup(pos) { + return function(word, callback) { + var profile = this.options.profile, + start = profile && new Date(), + files = this.getFilesFor(pos), + args = []; + + word = normalize(word); + + // lookup index + return files.index.lookup(word) + .then(function(result) { + if (result) { + // lookup data + return files.data.lookup(result.synsetOffset).then(done); + } else { + // not found in index + return done([]); + } + }) + .catch(done); + + function done(results) { + if (results instanceof Error) { + args.push([], word); + } else { + args.push(results, word); + } + //console.log(3333, args) + profile && args.push(new Date() - start); + nextTick(callback, args); + return results; + } + }; +} + +/** + * find a word and prepare its lexical record + * + * @param word {string} - search word + * @param callback {function} - callback function receives result + * @returns {Promise.} + * @this IndexFile + * + * Credit for this routine to https://github.com/NaturalNode/natural + */ +function indexLookup(word, callback) { + var self = this; + + return new Promise(function(resolve, reject){ + self.find(word, function (record) { + var indexRecord = null, + i; + + if (record.status == 'hit') { + var ptrs = [], offsets = []; + let n = parseInt(record.tokens[3]); + + for (i = 0; i < n; i++) { + ptrs.push(record.tokens[i]); + } + + n = parseInt(record.tokens[2]); + for (i = 0; i < n; i++) { + offsets.push(record.tokens[ptrs.length + 6 + i]); + } + + indexRecord = { + lemma : record.tokens[0], + pos : record.tokens[1], + ptrSymbol : ptrs, + senseCnt : parseInt(record.tokens[ptrs.length + 4], 10), + tagsenseCnt : parseInt(record.tokens[ptrs.length + 5], 10), + synsetOffset: offsets + }; + } + callback && callback(indexRecord); + resolve(indexRecord); + }); + }); +} + + + +/** + * getX() factory function + * + * @param isFn {function} - an isX() function + * @returns {Function} + * @this IndexFile + */ +function get(isFn) { + return function(text, callback, _noprofile) { + var profile = this.options.profile && !_noprofile, + start = profile && new Date(), + words = this.parse(text), + results = [], + self = this; + + return Promise + .all(words.map(exec)) + .then(done); + + function exec(word) { + return self[isFn] + .call(self, word, null, /*_noprofile*/ true) + .then(function collect(result) { + result && results.push(word); + }); + } + + function done(){ + var args = [results]; + profile && args.push(new Date() - start); + nextTick(callback, args); + return results; + } + }; +} + + +/** + * isX() factory function + * + * @param pos {string} - n/v/a/r + * @returns {Function} + * @this WordPOS + */ +function is(pos){ + return function(word, callback, _noprofile) { + // disable profiling when isX() used internally + var profile = this.options.profile && !_noprofile, + start = profile && new Date(), + args = [], + index = this.getFilesFor(pos).index; + word = normalize(word); + + return index + .lookup(word) + .then(function(record) { + var result = !!record; + args.push(result, word); + profile && args.push(new Date() - start); + nextTick(callback, args); + return result; + }); + }; +} + + +/** + * parse a single data file line, returning data object + * + * @param line {string} - a single line from WordNet data file + * @returns {object} + * + * Credit for this routine to https://github.com/NaturalNode/natural + */ +function lineDataToJSON(line, location) { + // if (!dataCheck(line, location)) return new Error('Bad data at location ' + location); + + var data = line.split('| '), + tokens = data[0].split(/\s+/), + ptrs = [], + wCnt = parseInt(tokens[3], 16), + synonyms = [], + i; + + for(i = 0; i < wCnt; i++) { + synonyms.push(tokens[4 + i * 2]); + } + + var ptrOffset = (wCnt - 1) * 2 + 6; + let n = parseInt(tokens[ptrOffset], 10); + for(i = 0; i < n; i++) { + ptrs.push({ + pointerSymbol: tokens[ptrOffset + 1 + i * 4], + synsetOffset: tokens[ptrOffset + 2 + i * 4], + pos: tokens[ptrOffset + 3 + i * 4], + sourceTarget: tokens[ptrOffset + 4 + i * 4] + }); + } + + // break "gloss" into definition vs. examples + var glossArray = data[1].split('; '); + var definition = glossArray[0]; + var examples = glossArray.slice(1); + var lexFilenum = parseInt(tokens[1], 10); + + for (var k = 0; k < examples.length; k++) { + examples[k] = examples[k].replace(/\"/g,'').replace(/\s\s+/g,''); + } + + return { + synsetOffset: tokens[0], + lexFilenum: lexFilenum, + lexName: LEX_NAMES[ lexFilenum ], + pos: tokens[2], + wCnt: wCnt, + lemma: tokens[4], + synonyms: synonyms, + lexId: tokens[5], + ptrs: ptrs, + gloss: data[1], + def: definition, + exp: examples + }; +} + +const LEX_NAMES = [ + 'adj.all', + 'adj.pert', + 'adv.all', + 'noun.Tops', + 'noun.act', + 'noun.animal', + 'noun.artifact', + 'noun.attribute', + 'noun.body', + 'noun.cognition', + 'noun.communication', + 'noun.event', + 'noun.feeling', + 'noun.food', + 'noun.group', + 'noun.location', + 'noun.motive', + 'noun.object', + 'noun.person', + 'noun.phenomenon', + 'noun.plant', + 'noun.possession', + 'noun.process', + 'noun.quantity', + 'noun.relation', + 'noun.shape', + 'noun.state', + 'noun.substance', + 'noun.time', + 'verb.body', + 'verb.change', + 'verb.cognition', + 'verb.communication', + 'verb.competition', + 'verb.consumption', + 'verb.contact', + 'verb.creation', + 'verb.emotion', + 'verb.motion', + 'verb.perception', + 'verb.possession', + 'verb.social', + 'verb.stative', + 'verb.weather', + 'adj.ppl' +]; + +export { + indexLookup, + is, + get, + + lineDataToJSON, + LEX_NAMES, + lookup +} diff --git a/src/util.js b/src/util.js new file mode 100644 index 0000000..0b2d7ba --- /dev/null +++ b/src/util.js @@ -0,0 +1,56 @@ +let stopwords = require('../lib/natural/util/stopwords').words; +let stopwordsStr = makeStopwordString(stopwords); + + +function makeStopwordString(stopwords) { + return ' ' + stopwords.join(' ') + ' '; +} + +// setImmediate executes callback AFTER promise handlers. +// Without it, exceptions in callback may be caught by Promise. +function nextTick(fn, args) { + if (fn) { + fn.apply(null, args); + } +} + +function normalize(word) { + return word.toLowerCase().replace(/\s+/g, '_'); +} + +function isStopword(stopwords, word) { + return stopwords.indexOf(' '+word+' ') >= 0; +} + +function tokenizer(str) { + return str.split(/\W+/); +} + +function uniq(arr) { + return arr.filter((v, i) => arr.indexOf(v) === i); +} + +function isString(s) { + return typeof s === 'string'; +} + +function reject(arr, predicate) { + return arr.filter(item => !predicate(item)) +} + +function prepText(text) { + if (Array.isArray(text)) return text; + var deduped = uniq(tokenizer(text)); + if (!this.options.stopwords) return deduped; + return reject(deduped, isStopword.bind(null, + isString(this.options.stopwords) ? this.options.stopwords : stopwordsStr + )); +} + +export { + nextTick, + normalize, + tokenizer, + prepText, + makeStopwordString +}