From 364b2648f7a3ffb1e4af38812c43ac025f6f41bf Mon Sep 17 00:00:00 2001 From: Moos Date: Fri, 12 Oct 2018 20:35:11 -0700 Subject: [PATCH 01/20] first checkin for browser rework --- .babelrc | 4 + .gitignore | 4 +- package.json | 22 ++- samples/self-hosted/index.html | 51 ++++++ samples/self-hosted/main.js | 19 +++ scripts/makeJsonDict.js | 85 ++++++++++ src/browser/baseFile.js | 30 ++++ src/browser/dataFile.js | 92 +++++++++++ src/browser/index.js | 165 ++++++++++++++++++++ src/browser/indexFile.js | 71 +++++++++ src/browser/piper.js | 82 ++++++++++ src/browser/rand.js | 267 +++++++++++++++++++++++++++++++ src/common.js | 277 +++++++++++++++++++++++++++++++++ src/util.js | 56 +++++++ 14 files changed, 1221 insertions(+), 4 deletions(-) create mode 100644 .babelrc create mode 100644 samples/self-hosted/index.html create mode 100644 samples/self-hosted/main.js create mode 100644 scripts/makeJsonDict.js create mode 100644 src/browser/baseFile.js create mode 100644 src/browser/dataFile.js create mode 100644 src/browser/index.js create mode 100644 src/browser/indexFile.js create mode 100644 src/browser/piper.js create mode 100644 src/browser/rand.js create mode 100644 src/common.js create mode 100644 src/util.js diff --git a/.babelrc b/.babelrc new file mode 100644 index 0000000..1067c95 --- /dev/null +++ b/.babelrc @@ -0,0 +1,4 @@ +{ + "presets": ["env", "stage-2"], + "plugins": ["transform-class-properties"] +} diff --git a/.gitignore b/.gitignore index 0db3560..bbecf68 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ dict node_modules .idea -*.iml \ No newline at end of file +*.iml +.cache +dist diff --git a/package.json b/package.json index 9edf78e..fd19e0e 100755 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "wordpos", - "version": "1.2.0", + "version": "2.0.0-alpha", "description": "wordpos is a set of part-of-speech utilities for Node.js using the WordNet database.", "author": "Moos ", "keywords": [ @@ -16,14 +16,26 @@ "engines": { "node": ">=4" }, - "files": ["bench","bin","lib","src","test","tools"], + "files": [ + "bench", + "bin", + "lib", + "src", + "test", + "tools" + ], "bin": "./bin/wordpos-cli.js", "dependencies": { "commander": "^2.0.0", + "dict": "^1.4.0", "underscore": ">=1.3.1", "wordnet-db": "^3.1.6" }, "devDependencies": { + "babel-core": "^6.26.3", + "babel-plugin-transform-class-properties": "^6.24.1", + "babel-preset-env": "^1.7.0", + "babel-preset-stage-2": "^6.24.1", "chai": "^4.0.2", "mini-bench": "^1.0.0", "mocha": "^5.2.0" @@ -35,7 +47,11 @@ "main": "./src/wordpos.js", "scripts": { "postinstall": "node tools/stat.js --no-stats index.adv index.adj index.verb index.noun", - "test": "mocha test" + "postinstall-web": "node scripts/makeJsonDict.js index data", + "test": "mocha test", + "start": "npm run start-self", + "start-self": "parcel samples/self-hosted/index.html", + "start-cdn": "parcel samples/cdn/index.html" }, "license": "MIT" } diff --git a/samples/self-hosted/index.html b/samples/self-hosted/index.html new file mode 100644 index 0000000..05fee23 --- /dev/null +++ b/samples/self-hosted/index.html @@ -0,0 +1,51 @@ + + + + + + + + +

Self-hosted WordPOS sample

+ + + + diff --git a/samples/self-hosted/main.js b/samples/self-hosted/main.js new file mode 100644 index 0000000..4e5db59 --- /dev/null +++ b/samples/self-hosted/main.js @@ -0,0 +1,19 @@ +import WordPOS from '../../src/browser'; + +console.log(__dirname, WordPOS.defaults) + +let wordpos = window.wordpos = new WordPOS({ + // preload: true, + dictPath: './dict', + profile: true, + // stopwords: false +}); + +wordpos.isAdverb('likely').then(res => console.log('likely is adverb:', res)); +// wordpos.isAdverb('likely', (res, ...profile) => console.log('likely callback', res, profile)); +wordpos.getAdverbs('this is is likely a likely tricky business this is').then( + res => console.log('getAdverb', res) +); + +wordpos.lookupAdverb('likely').then(res => console.log('lookup ===', res)) +// wordpos.lookup('likely').then(res, console.log('lookup ===', res)) diff --git a/scripts/makeJsonDict.js b/scripts/makeJsonDict.js new file mode 100644 index 0000000..cabbedb --- /dev/null +++ b/scripts/makeJsonDict.js @@ -0,0 +1,85 @@ +#!/usr/bin/env node + +/** + * takes original WordNet index & data files and converts to + * exported JSON format with lemma as the key. + */ + +let fs = require('fs'); +let path = require('path'); + +let outPath = './dict'; +let posExt = ['adj', 'adv', 'noun', 'verb']; +let dictRoot = './node_modules/wordnet-db/dict/'; +const fileTypes = { + data: true, + index: true +}; +const [,, ...args] = process.argv; + +if (!args.length || args.filter(p => !(p in fileTypes)).length) { + console.log('Converts wordnet-db index & data files to JSON format for use in the browser.'); + console.log('\nUsage: makeJsonDict.js index|data'); + process.exit(1); +} + +function uniq(arr) { + return arr.filter((v, i) => arr.indexOf(v) === i); +} + +console.time('Done'); + +// create out directory +try { + fs.statSync(outPath); +} catch (e) { + fs.mkdirSync(outPath); +} + +function processFile(name) { + + // read the file as text + function loadFile(pos) { + console.time(' load'); + let inPath = path.resolve(dictRoot, name + '.' + pos); + let text = fs.readFileSync(inPath, 'utf8'); + console.timeEnd(' load'); + return text; + } + + // convert raw text to JSON and write to file + function processText(pos, text) { + let obj = {}; + let sp = ' '; + console.time(' process'); + text.split('\n').forEach(line => { + if (!line || line[0] === sp) return; + let spi = line.indexOf(sp); + let key = line.substr(0, spi); + line = line.substring(1 + spi, line.lastIndexOf(sp + sp)) + obj[key] = line; + }); + console.timeEnd(' process'); + return obj; + } + + function writeFile(pos, obj) { + console.time(' write'); + let text = JSON.stringify(obj); + text = 'export default ' + text; + fs.writeFileSync(path.resolve(outPath, name + '.' + pos + '.js'), text); + console.timeEnd(' write'); + } + + posExt.forEach(pos => { + console.log('\n', name, pos, ':'); + let text = loadFile(pos); + let obj = processText(pos, text); + writeFile(pos, obj); + }); +} + +uniq(args).forEach(processFile); + +console.log('\nWritten to', path.resolve(outPath)); +console.timeEnd('Done'); diff --git a/src/browser/baseFile.js b/src/browser/baseFile.js new file mode 100644 index 0000000..68af93b --- /dev/null +++ b/src/browser/baseFile.js @@ -0,0 +1,30 @@ + + +class BaseFile { + + /** + * file contents + * @type {Object} + */ + file = {}; + + constructor(type, dictPath, posName) { + this.filePath = `${dictPath}/${type}.${posName}.js`; + this.type = type; + } + + load() { + return import(this.filePath) + .then(exports => this.file = exports.default) + .catch(err => { + console.error(`Error loading ${this.type} file for ${this.filePath}.`, err); + throw err; + }); + } + + ready(fn, args) { + return this.load().then(() => fn.apply(this, args)); + } +} + +export default BaseFile; diff --git a/src/browser/dataFile.js b/src/browser/dataFile.js new file mode 100644 index 0000000..6c72327 --- /dev/null +++ b/src/browser/dataFile.js @@ -0,0 +1,92 @@ +/*! + * dataFile.js + * + * Copyright (c) 2012-2019 mooster@42at.com + * https://github.com/moos/wordpos + * + * Portions: Copyright (c) 2011, Chris Umbel + * + * Released under MIT license + */ + +import { lineDataToJSON, LEX_NAMES } from '../common'; +import BaseFile from './BaseFile'; + +/** + * get parsed line from data file + * + * @param {string} offset The offset key + * @return {object} Data record object + * @this DataFile + */ +function seek(offset) { + let str = this.file[offset]; + if (!str) return {}; + // offset was extracted for the key - add it back to line data + return lineDataToJSON(offset + ' ' + str); +} + +/** + * lookup offsets in data file + * + * @param offsets {array} - array of offsets to lookup (obtained from index.find()) + * @param callback{function} (optional) - callback function + * @returns {Promise.[]} array of or single data record + * @this DataFile + */ +function lookup(offsets, callback) { + var results = [], + self = this, + readLine = seek.bind(this), + valid = (item => item.pos), + single = !Array.isArray(offsets); + + if (single) offsets = [offsets]; + return new Promise(function(resolve, reject) { + results = offsets.map(readLine).filter(valid); + + if (!results.length) { + let err = new RangeError(`No data at offsets ${offsets.join()} in ${self.filePath}.`); + callback && callback(err, single ? {} :[]); + reject(err); + } else { + if (single) results = results[0]; + callback && callback(null, results); + resolve(results); + } + }); +} + +/** + * DataFile class + * + * @param dictPath {string} - path to dict folder + * @param name {string} - POS name + * @constructor + */ +class DataFile extends BaseFile { + + constructor(dictPath, posName) { + super('data', dictPath, posName); + } + + lookup() { + return this.ready(lookup, arguments); + } + + seek() { + // return this.ready(find, arguments); + } + +} + + +/** + * map of lexFilenum to lex names + * + * @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html + * @type {string[]} + */ +DataFile.LEX_NAMES = LEX_NAMES; + +export default DataFile; diff --git a/src/browser/index.js b/src/browser/index.js new file mode 100644 index 0000000..344b549 --- /dev/null +++ b/src/browser/index.js @@ -0,0 +1,165 @@ +import { stopwords, prepText, makeStopwordString } from '../util'; +import { is, get, lookup } from '../common'; +import IndexFile from './indexFile'; +import DataFile from './dataFile'; + +const POS = { + n: 'noun', + v: 'verb', + a: 'adj', + r: 'adv' +}; + + +class WordPOS { + + options = {}; + loaded = Promise.resolve(); + + constructor(config) { + this.options = Object.assign({}, WordPOS.defaults, config); + console.log('wpos ctor -- ', this.options) + + this.initFiles(); + if (Array.isArray(this.options.stopwords)) { + this.options.stopwords = makeStopwordString(this.options.stopwords); + } + + // TODO rand() + } + + ready() { + return this.loaded; + } + + initFiles() { + const keys = Object.keys(POS); + const loadOne = (Comp, pos) => new Comp(this.options.dictPath, POS[pos]); + const loader = (Comp) => keys.map(loadOne.bind(null, Comp)); + const reducer = (arr) => arr.reduce((coll, item, i) => (coll[keys[i]] = item, coll), {}); + + this.indexFiles = reducer(loader(IndexFile)); + this.dataFiles = reducer(loader(DataFile)); + + if (this.options.preload) { + this.loaded = this.preloadIndexes(this.options.preload); + } + } + + getFilesFor(pos) { + return { + index: this.indexFiles[pos], + data: this.dataFiles[pos] + }; + } + + /** + * loads index files + * + * @param {string|Array} [pos] POS to load (default: all) + * @return {Promise.} + */ + preloadIndexes(pos) { + let file = this.indexFile[pos]; + let load = p => file.load(); + let promise; + + if (!pos || pos === true) { // preload all + promise = Promise.all(Object.keys(POS).map(load)); + } + else if (typeof pos === 'string' && file) { + promise = load(pos); + } + else if (pos instanceof Array) { + promise = pos.forEach(pos => file && load(pos)); + } + + // TODO includeData + + return promise || Promise.reject(new RangeError(`Unknown POS "${pos}" for preload.`)); + } + + parse = prepText; + + /** + * isX() - Test if word is given POS + * @see is + */ + isAdjective = is('a'); + isAdverb = is('r'); + isNoun = is('n'); + isVerb = is('v'); + + /** + * getX() - Find all words in string that are given POS + * @see get + */ + getAdjectives = get('isAdjective'); + getAdverbs = get('isAdverb'); + getNouns = get('isNoun'); + getVerbs = get('isVerb'); + + /** + * lookupX() - Lookup word definition if already know POS + * @see lookup + */ + lookupAdjective = lookup('a'); + lookupAdverb = lookup('r'); + lookupNoun = lookup('n'); + lookupVerb = lookup('v'); +} + +WordPOS.defaults = { + /** + * path to WordNet data (override only if not using wordnet-db) + * @type {string} + */ + dictPath: '', + + /** + * enable profiling, time in msec returned as second argument in callback + * @type {boolean} + */ + profile: false, + + /** + * if true, exclude standard stopwords. + * if array, stopwords to exclude, eg, ['all','of','this',...] + * if false, do not filter any stopwords. + * @type {boolean} + */ + stopwords: true, + + /** + * preload files. + * true - preload all POS + * false - do not preload any POS + * 'a' - preload adj + * ['a','v'] - preload adj & verb + * @type {boolean|string|Array} + */ + preload: false, + + /** + * include data files in preload + * @type {boolean} + */ + + includeData: false + +}; + + +/** + * access to WordNet DB + * @type {object} + */ +// WordPOS.WNdb = WNdb; + +/** + * access to stopwords + * @type {Array} + */ +WordPOS.stopwords = stopwords; + +export default WordPOS; diff --git a/src/browser/indexFile.js b/src/browser/indexFile.js new file mode 100644 index 0000000..88d0166 --- /dev/null +++ b/src/browser/indexFile.js @@ -0,0 +1,71 @@ +/*! + * indexFile.js + * + * implements fast index lookup of WordNet's index files + * + * Copyright (c) 2012-2019 mooster@42at.com + * https://github.com/moos/wordpos + * + * Portions: Copyright (c) 2011, Chris Umbel + * + * Released under MIT license + */ + +import { indexLookup } from '../common'; +import BaseFile from './BaseFile'; + +/** + * find a search term in an index file (using fast index) + * + * Calls to same bucket are queued for callback using the piper. + * + * @param search {string} - word to search for + * @param callback {function} - callback receives found line and tokens + * @returns none + * @this IndexFile + */ +function find(search, callback) { + var miss = {status: 'miss'}; + + if (!(search in this.file)) { + callback(miss); + return; + } + + var + line = this.file[search], + tokens = line.split(/\s+/), + result = { + status: 'hit', + key: search, + line: line, + tokens: tokens + }; + + result.tokens.unshift(search); + callback(result); +} + +/** + * IndexFile class + * + * @param dictPath {string} - WordNet db dict path + * @param name {string} - name of index: noun, verb, adj, adv + * @constructor + */ +class IndexFile extends BaseFile { + + constructor(dictPath, posName) { + super('index', dictPath, posName); + } + + lookup() { + return this.ready(indexLookup, arguments); + } + + find() { + return this.ready(find, arguments); + } +} + +export default IndexFile; diff --git a/src/browser/piper.js b/src/browser/piper.js new file mode 100644 index 0000000..c0985de --- /dev/null +++ b/src/browser/piper.js @@ -0,0 +1,82 @@ +/*! + * piper.js + * + * executes multiple async i/o tasks and pools similar callbacks, + * calling i/o open/close when all incoming tasks are done. + * + * Copyright (c) 2012-2016 mooster@42at.com + * https://github.com/moos/wordpos + * + * Released under MIT license + */ + +var _ = require('underscore')._, + util = require('util'), + fs = require('fs'); + +/** + * run single 'task' method sharing callbacks. Method MUST take callback as LAST arg. + * piper is bound to an IndexFile. + * + * @param task {string} - task name unique to method! + * @param method {function} - method to execute, gets (args, ... , callback) + * @param args {Array} - args to pass to method + * @param context {object} - other params to remember and sent to callback + * @param callback {function} - result callback + */ +function piper(task, method, args, context, callback){ + var readCallbacks = this.callbackQueue, + memoArgs = _.rest(arguments, 2), + wrappedCallback; + + //console.log('piper', task, [method]); + + // queue up if already reading file for this task + if (task in readCallbacks){ + readCallbacks[task].push(memoArgs); + return; + } + readCallbacks[task] = [memoArgs]; + + if (!this.fd) { + //console.log(' ... opening', this.filePath); + this.fd = fs.openSync(this.filePath, 'r'); + } + + // ref count so we know when to close the main index file + ++this.refcount; + + wrappedCallback = _.partial(piper.wrapper, this, task); + + // call method -- replace original callback (last arg) with wrapped one + method.apply(null, [].concat( args, wrappedCallback )); +} + +// result is the *same* for same task +piper.wrapper = function(self, task /*, result...*/){ + var readCallbacks = self.callbackQueue, + result = _.rest(arguments, 2), + callback, args; + + // live access callbacks cache in case nested cb's + // add to the array. + while (args = readCallbacks[task].shift()) { + callback = args.pop(); // last arg MUST be callback + +// console.log('>>>> pper wrapper', self.fastIndex.name, task, result.toString()) + callback.apply(null, [].concat(_.flatten(args, /*shallow*/true), result)); + } + + // now done - delete cb cache + delete readCallbacks[task]; + + if (--self.refcount === 0) { + //console.log(' ... closing', self.filePath); + fs.closeSync(self.fd); + self.fd = null; + } +}; + + +module.exports = piper; + diff --git a/src/browser/rand.js b/src/browser/rand.js new file mode 100644 index 0000000..17808c8 --- /dev/null +++ b/src/browser/rand.js @@ -0,0 +1,267 @@ +/*! + * rand.js + * + * define rand() and randX() functions on wordpos + * + * Copyright (c) 2012-2016 mooster@42at.com + * https://github.com/moos/wordpos + * + * Released under MIT license + */ + +var _ = require('underscore')._, + util = require('util'), + Trie = require('../lib/natural/trie/trie'), + IndexFile = require('./indexFile'), + KEY_LENGTH = 3; + + +/** + * factory function for randX() + * + * @param pos {string} - a,r,n,v + * @returns {Function} - rand function bound to an index file + */ +function makeRandX(pos){ + return function(opts, callback, _noprofile) { + // disable profiling when isX() used internally + var profile = this.options.profile && !_noprofile, + start = profile && new Date(), + args = [], + index = this.getFilesFor(pos).index, + startsWith = opts && opts.startsWith || '', + count = opts && opts.count || 1; + + if (typeof opts === 'function') { + callback = opts; + } + + return index.rand(startsWith, count, function (record) { + args.push(record, startsWith); + profile && args.push(new Date() - start); + callback && callback.apply(null, args); + }); + }; +} + +/** + * rand function (bound to index) + * + * @param startsWith {string} - get random word(s) that start with this, or '' + * @param num {number} - number of words to return + * @param callback {function} - callback function, receives words array and startsWith + * @returns Promise + */ +function rand(startsWith, num, callback){ + var self = this, + nextKey = null, + trie = this.fastIndex.trie, + key, keys; + + return new Promise(function(resolve, reject) { + + //console.log('-- ', startsWith, num, self.fastIndex.indexKeys.length); + if (startsWith) { + key = startsWith.slice(0, KEY_LENGTH); + + /** + * if key is 'a' or 'ab' (<3 chars), search for ALL keys starting with that. + */ + if (key.length < KEY_LENGTH) { + + // calc trie if haven't done so yet + if (!trie) { + trie = new Trie(); + trie.addStrings(self.fastIndex.indexKeys); + self.fastIndex.trie = trie; + //console.log(' +++ Trie calc '); + } + + try { + // trie throws if not found!!!!! + keys = trie.keysWithPrefix(startsWith); + } catch (e) { + keys = []; + } + + // read all keys then select random word. + // May be large disk read! + key = keys[0]; + nextKey = _.last(keys); + } + + if (!key || !(key in self.fastIndex.offsets)) { + callback && callback([], startsWith); + resolve([]); + } + + } else { + // no startWith given - random select among keys + keys = _.sample(self.fastIndex.indexKeys, num); + + // if num > 1, run each key independently and collect results + if (num > 1) { + var results = [], ii = 0; + _(keys).each(function (startsWith) { + self.rand(startsWith, 1, function (result) { + results.push(result[0]); + if (++ii == num) { + callback && callback(results, ''); + resolve(results); + } + }); + }); + return; + } + key = keys; + } + + // prepare the piper + var args = [key, nextKey, self], + task = 'rand:' + key + nextKey, + context = [startsWith, num, callback]; // last arg MUST be callback + + // pay the piper + self.piper(task, IndexFile.readIndexBetweenKeys, args, context, collector); + + function collector(key, nextKey, index, startsWith, num, callback, buffer) { + var lines = buffer.toString().split('\n'), + matches = lines.map(function (line) { + return line.substring(0, line.indexOf(' ')); + }); + //console.log(' got lines for key ', key, lines.length); + + // we got bunch of matches for key - now search within for startsWith + if (startsWith !== key) { + // binary search for startsWith within set of matches + var ind = _.sortedIndex(matches, startsWith); + if (ind >= lines.length || matches[ind].indexOf(startsWith) === -1) { + callback && callback([], startsWith); + resolve([]); + return; + } + + var trie = new Trie(); + trie.addStrings(matches); + //console.log('Trie > ', trie.matchesWithPrefix( startsWith )); + matches = trie.keysWithPrefix(startsWith); + } + + var words = _.sample(matches, num); + callback && callback(words, startsWith); + resolve(words); + } + + }); // Promise +} + +// relative weight of each POS word count (DB 3.1 numbers) +var POS_factor = { + Noun: 26, + Verb: 3, + Adjective: 5, + Adverb: 1, + Total: 37 +}; + +/** + * rand() - for all Index files + * @returns Promise + */ +function randAll(opts, callback) { + + if (typeof opts === 'function') { + callback = opts; + opts = {}; + } else { + opts = _.clone(opts || {}); + } + + var + profile = this.options.profile, + start = profile && new Date(), + results = [], + startsWith = opts && opts.startsWith || '', + count = opts && opts.count || 1, + args = [null, startsWith], + parts = 'Noun Verb Adjective Adverb'.split(' '), + self = this; + + + + return new Promise(function(resolve, reject) { + // select at random a POS to look at + var doParts = _.sample(parts, parts.length); + tryPart(); + + function tryPart() { + var part = doParts.pop(), + rand = 'rand' + part, + factor = POS_factor[part], + weight = factor / POS_factor.Total; + + // pick count according to relative weight + opts.count = Math.ceil(count * weight * 1.1); // guard against dupes + self[rand](opts, partCallback); + } + + function partCallback(result) { + if (result) { + results = _.uniq(results.concat(result)); // make sure it's unique! + } + + if (results.length < count && doParts.length) { + return tryPart(); + } + + // final random and trim excess + results = _.sample(results, count); + done(); + } + + function done() { + profile && (args.push(new Date() - start)); + args[0] = results; + callback && callback.apply(null, args); + resolve(results); + } + + }); // Promise +} + +/** + * bind rand() to index + * + * @param index {object} - the IndexFile instance + * @returns {function} - bound rand function for index + */ +function randomify(index){ + if (!index.fastIndex) throw 'rand requires fastIndex'; + return _.bind(rand, index); +} + + + +module.exports = { + + init: function(wordposProto) { + wordposProto.nounIndex.rand = randomify(wordposProto.nounIndex); + wordposProto.verbIndex.rand = randomify(wordposProto.verbIndex); + wordposProto.adjIndex.rand = randomify(wordposProto.adjIndex); + wordposProto.advIndex.rand = randomify(wordposProto.advIndex); + + /** + * define rand() + */ + wordposProto.rand = randAll; + + /** + * define randX() + */ + wordposProto.randAdjective = makeRandX('a'); + wordposProto.randAdverb = makeRandX('r'); + wordposProto.randNoun = makeRandX('n'); + wordposProto.randVerb = makeRandX('v'); + } +}; + diff --git a/src/common.js b/src/common.js new file mode 100644 index 0000000..a405af2 --- /dev/null +++ b/src/common.js @@ -0,0 +1,277 @@ +import { normalize, nextTick } from './util'; + + + +/** + * factory for main lookup function + * + * @param pos {string} - n/v/a/r + * @returns {Function} - lookup function bound to POS + * @this WordPOS + */ +function lookup(pos) { + return function(word, callback) { + var profile = this.options.profile, + start = profile && new Date(), + files = this.getFilesFor(pos), + args = []; + + word = normalize(word); + + // lookup index + return files.index.lookup(word) + .then(function(result) { + if (result) { + // lookup data + return files.data.lookup(result.synsetOffset).then(done); + } else { + // not found in index + return done([]); + } + }) + .catch(done); + + function done(results) { + if (results instanceof Error) { + args.push([], word); + } else { + args.push(results, word); + } + //console.log(3333, args) + profile && args.push(new Date() - start); + nextTick(callback, args); + return results; + } + }; +} + +/** + * find a word and prepare its lexical record + * + * @param word {string} - search word + * @param callback {function} - callback function receives result + * @returns {Promise.} + * @this IndexFile + * + * Credit for this routine to https://github.com/NaturalNode/natural + */ +function indexLookup(word, callback) { + var self = this; + + return new Promise(function(resolve, reject){ + self.find(word, function (record) { + var indexRecord = null, + i; + + if (record.status == 'hit') { + var ptrs = [], offsets = []; + let n = parseInt(record.tokens[3]); + + for (i = 0; i < n; i++) { + ptrs.push(record.tokens[i]); + } + + n = parseInt(record.tokens[2]); + for (i = 0; i < n; i++) { + offsets.push(record.tokens[ptrs.length + 6 + i]); + } + + indexRecord = { + lemma : record.tokens[0], + pos : record.tokens[1], + ptrSymbol : ptrs, + senseCnt : parseInt(record.tokens[ptrs.length + 4], 10), + tagsenseCnt : parseInt(record.tokens[ptrs.length + 5], 10), + synsetOffset: offsets + }; + } + callback && callback(indexRecord); + resolve(indexRecord); + }); + }); +} + + + +/** + * getX() factory function + * + * @param isFn {function} - an isX() function + * @returns {Function} + * @this IndexFile + */ +function get(isFn) { + return function(text, callback, _noprofile) { + var profile = this.options.profile && !_noprofile, + start = profile && new Date(), + words = this.parse(text), + results = [], + self = this; + + return Promise + .all(words.map(exec)) + .then(done); + + function exec(word) { + return self[isFn] + .call(self, word, null, /*_noprofile*/ true) + .then(function collect(result) { + result && results.push(word); + }); + } + + function done(){ + var args = [results]; + profile && args.push(new Date() - start); + nextTick(callback, args); + return results; + } + }; +} + + +/** + * isX() factory function + * + * @param pos {string} - n/v/a/r + * @returns {Function} + * @this WordPOS + */ +function is(pos){ + return function(word, callback, _noprofile) { + // disable profiling when isX() used internally + var profile = this.options.profile && !_noprofile, + start = profile && new Date(), + args = [], + index = this.getFilesFor(pos).index; + word = normalize(word); + + return index + .lookup(word) + .then(function(record) { + var result = !!record; + args.push(result, word); + profile && args.push(new Date() - start); + nextTick(callback, args); + return result; + }); + }; +} + + +/** + * parse a single data file line, returning data object + * + * @param line {string} - a single line from WordNet data file + * @returns {object} + * + * Credit for this routine to https://github.com/NaturalNode/natural + */ +function lineDataToJSON(line, location) { + // if (!dataCheck(line, location)) return new Error('Bad data at location ' + location); + + var data = line.split('| '), + tokens = data[0].split(/\s+/), + ptrs = [], + wCnt = parseInt(tokens[3], 16), + synonyms = [], + i; + + for(i = 0; i < wCnt; i++) { + synonyms.push(tokens[4 + i * 2]); + } + + var ptrOffset = (wCnt - 1) * 2 + 6; + let n = parseInt(tokens[ptrOffset], 10); + for(i = 0; i < n; i++) { + ptrs.push({ + pointerSymbol: tokens[ptrOffset + 1 + i * 4], + synsetOffset: tokens[ptrOffset + 2 + i * 4], + pos: tokens[ptrOffset + 3 + i * 4], + sourceTarget: tokens[ptrOffset + 4 + i * 4] + }); + } + + // break "gloss" into definition vs. examples + var glossArray = data[1].split('; '); + var definition = glossArray[0]; + var examples = glossArray.slice(1); + var lexFilenum = parseInt(tokens[1], 10); + + for (var k = 0; k < examples.length; k++) { + examples[k] = examples[k].replace(/\"/g,'').replace(/\s\s+/g,''); + } + + return { + synsetOffset: tokens[0], + lexFilenum: lexFilenum, + lexName: LEX_NAMES[ lexFilenum ], + pos: tokens[2], + wCnt: wCnt, + lemma: tokens[4], + synonyms: synonyms, + lexId: tokens[5], + ptrs: ptrs, + gloss: data[1], + def: definition, + exp: examples + }; +} + +const LEX_NAMES = [ + 'adj.all', + 'adj.pert', + 'adv.all', + 'noun.Tops', + 'noun.act', + 'noun.animal', + 'noun.artifact', + 'noun.attribute', + 'noun.body', + 'noun.cognition', + 'noun.communication', + 'noun.event', + 'noun.feeling', + 'noun.food', + 'noun.group', + 'noun.location', + 'noun.motive', + 'noun.object', + 'noun.person', + 'noun.phenomenon', + 'noun.plant', + 'noun.possession', + 'noun.process', + 'noun.quantity', + 'noun.relation', + 'noun.shape', + 'noun.state', + 'noun.substance', + 'noun.time', + 'verb.body', + 'verb.change', + 'verb.cognition', + 'verb.communication', + 'verb.competition', + 'verb.consumption', + 'verb.contact', + 'verb.creation', + 'verb.emotion', + 'verb.motion', + 'verb.perception', + 'verb.possession', + 'verb.social', + 'verb.stative', + 'verb.weather', + 'adj.ppl' +]; + +export { + indexLookup, + is, + get, + + lineDataToJSON, + LEX_NAMES, + lookup +} diff --git a/src/util.js b/src/util.js new file mode 100644 index 0000000..0b2d7ba --- /dev/null +++ b/src/util.js @@ -0,0 +1,56 @@ +let stopwords = require('../lib/natural/util/stopwords').words; +let stopwordsStr = makeStopwordString(stopwords); + + +function makeStopwordString(stopwords) { + return ' ' + stopwords.join(' ') + ' '; +} + +// setImmediate executes callback AFTER promise handlers. +// Without it, exceptions in callback may be caught by Promise. +function nextTick(fn, args) { + if (fn) { + fn.apply(null, args); + } +} + +function normalize(word) { + return word.toLowerCase().replace(/\s+/g, '_'); +} + +function isStopword(stopwords, word) { + return stopwords.indexOf(' '+word+' ') >= 0; +} + +function tokenizer(str) { + return str.split(/\W+/); +} + +function uniq(arr) { + return arr.filter((v, i) => arr.indexOf(v) === i); +} + +function isString(s) { + return typeof s === 'string'; +} + +function reject(arr, predicate) { + return arr.filter(item => !predicate(item)) +} + +function prepText(text) { + if (Array.isArray(text)) return text; + var deduped = uniq(tokenizer(text)); + if (!this.options.stopwords) return deduped; + return reject(deduped, isStopword.bind(null, + isString(this.options.stopwords) ? this.options.stopwords : stopwordsStr + )); +} + +export { + nextTick, + normalize, + tokenizer, + prepText, + makeStopwordString +} From e56463f94da59a5ac9559cfcb8454f253e008e3f Mon Sep 17 00:00:00 2001 From: Moos Date: Sun, 14 Oct 2018 22:20:56 -0700 Subject: [PATCH 02/20] browser complete --- samples/self-hosted/index.html | 69 ++++----- samples/self-hosted/main.js | 38 ++++- samples/self-hosted/main.txt | 1 + src/browser/baseFile.js | 13 +- src/browser/dataFile.js | 12 +- src/browser/index.js | 18 ++- src/browser/indexFile.js | 10 +- src/browser/piper.js | 82 ---------- src/browser/rand.js | 267 --------------------------------- src/common.js | 48 +++++- src/util.js | 17 ++- 11 files changed, 144 insertions(+), 431 deletions(-) create mode 120000 samples/self-hosted/main.txt delete mode 100644 src/browser/piper.js delete mode 100644 src/browser/rand.js diff --git a/samples/self-hosted/index.html b/samples/self-hosted/index.html index 05fee23..e71dc4d 100644 --- a/samples/self-hosted/index.html +++ b/samples/self-hosted/index.html @@ -1,51 +1,42 @@ + + + + - +

Self-hosted WordPOS sample

- + Open console to see results. + +

+   var a = "foo"
+ 
+ + + diff --git a/samples/self-hosted/main.js b/samples/self-hosted/main.js index 4e5db59..a57c33e 100644 --- a/samples/self-hosted/main.js +++ b/samples/self-hosted/main.js @@ -1,4 +1,4 @@ -import WordPOS from '../../src/browser'; +import WordPOS from '../../src/wordpos'; console.log(__dirname, WordPOS.defaults) @@ -9,11 +9,35 @@ let wordpos = window.wordpos = new WordPOS({ // stopwords: false }); -wordpos.isAdverb('likely').then(res => console.log('likely is adverb:', res)); -// wordpos.isAdverb('likely', (res, ...profile) => console.log('likely callback', res, profile)); -wordpos.getAdverbs('this is is likely a likely tricky business this is').then( - res => console.log('getAdverb', res) -); +let assertLikely = (r) => { + console.assert(r.def === 'with considerable certainty'); + console.assert(r.pos === 'r'); + console.assert(r.synsetOffset === '00139421'); +}; -wordpos.lookupAdverb('likely').then(res => console.log('lookup ===', res)) +console.group('Likely'); +wordpos.isAdverb('likely').then(res => console.assert(res)); +wordpos.isAdverb('likely', (res, ...profile) => console.log('callback with profile', res, profile)); + +wordpos.getAdverbs('this is is lately a likely tricky business this is') + .then(res => { + console.log('getAdverbs:', res); + console.assert(res[0] === 'lately'); + console.assert(res[1] === 'likely'); + }); + +wordpos.lookupAdverb('likely') + .then(res => { + console.log('lookupAdverb:', res); + assertLikely(res[0]); + + }); // wordpos.lookup('likely').then(res, console.log('lookup ===', res)) + +wordpos.seek('00139421', 'r') + .then(res => { + console.log('seek:', res); + assertLikely(res); + }); + +// console.groupEnd('Likely'); diff --git a/samples/self-hosted/main.txt b/samples/self-hosted/main.txt new file mode 120000 index 0000000..82df346 --- /dev/null +++ b/samples/self-hosted/main.txt @@ -0,0 +1 @@ +main.js \ No newline at end of file diff --git a/src/browser/baseFile.js b/src/browser/baseFile.js index 68af93b..acb56d1 100644 --- a/src/browser/baseFile.js +++ b/src/browser/baseFile.js @@ -1,16 +1,23 @@ - +/** + * browser/baseFile.js + * + * Copyright (c) 2012-2019 mooster@42at.com + * https://github.com/moos/wordpos + * + * Released under MIT license + */ class BaseFile { /** - * file contents + * file contents - in browser it's just a string & not a file! * @type {Object} */ file = {}; constructor(type, dictPath, posName) { - this.filePath = `${dictPath}/${type}.${posName}.js`; this.type = type; + this.filePath = `${dictPath}/${type}.${posName}.js`; } load() { diff --git a/src/browser/dataFile.js b/src/browser/dataFile.js index 6c72327..1238423 100644 --- a/src/browser/dataFile.js +++ b/src/browser/dataFile.js @@ -1,5 +1,5 @@ -/*! - * dataFile.js +/** + * browser/dataFile.js * * Copyright (c) 2012-2019 mooster@42at.com * https://github.com/moos/wordpos @@ -61,7 +61,7 @@ function lookup(offsets, callback) { * DataFile class * * @param dictPath {string} - path to dict folder - * @param name {string} - POS name + * @param posName {string} - POS name * @constructor */ class DataFile extends BaseFile { @@ -73,14 +73,8 @@ class DataFile extends BaseFile { lookup() { return this.ready(lookup, arguments); } - - seek() { - // return this.ready(find, arguments); - } - } - /** * map of lexFilenum to lex names * diff --git a/src/browser/index.js b/src/browser/index.js index 344b549..4bd8d21 100644 --- a/src/browser/index.js +++ b/src/browser/index.js @@ -1,5 +1,14 @@ +/** +* browser/index.js +* +* Copyright (c) 2012-2019 mooster@42at.com +* https://github.com/moos/wordpos +* +* Released under MIT license +*/ + import { stopwords, prepText, makeStopwordString } from '../util'; -import { is, get, lookup } from '../common'; +import { is, get, lookup, seek } from '../common'; import IndexFile from './indexFile'; import DataFile from './dataFile'; @@ -10,7 +19,6 @@ const POS = { r: 'adv' }; - class WordPOS { options = {}; @@ -18,7 +26,6 @@ class WordPOS { constructor(config) { this.options = Object.assign({}, WordPOS.defaults, config); - console.log('wpos ctor -- ', this.options) this.initFiles(); if (Array.isArray(this.options.stopwords)) { @@ -81,6 +88,8 @@ class WordPOS { parse = prepText; + seek = seek; + /** * isX() - Test if word is given POS * @see is @@ -144,7 +153,6 @@ WordPOS.defaults = { * include data files in preload * @type {boolean} */ - includeData: false }; @@ -154,7 +162,7 @@ WordPOS.defaults = { * access to WordNet DB * @type {object} */ -// WordPOS.WNdb = WNdb; +// WordPOS.WNdb = WNdb; // FIXME /** * access to stopwords diff --git a/src/browser/indexFile.js b/src/browser/indexFile.js index 88d0166..2c0308a 100644 --- a/src/browser/indexFile.js +++ b/src/browser/indexFile.js @@ -1,13 +1,9 @@ -/*! - * indexFile.js - * - * implements fast index lookup of WordNet's index files +/** + * browser/indexFile.js * * Copyright (c) 2012-2019 mooster@42at.com * https://github.com/moos/wordpos * - * Portions: Copyright (c) 2011, Chris Umbel - * * Released under MIT license */ @@ -50,7 +46,7 @@ function find(search, callback) { * IndexFile class * * @param dictPath {string} - WordNet db dict path - * @param name {string} - name of index: noun, verb, adj, adv + * @param posName {string} - name of index: noun, verb, adj, adv * @constructor */ class IndexFile extends BaseFile { diff --git a/src/browser/piper.js b/src/browser/piper.js deleted file mode 100644 index c0985de..0000000 --- a/src/browser/piper.js +++ /dev/null @@ -1,82 +0,0 @@ -/*! - * piper.js - * - * executes multiple async i/o tasks and pools similar callbacks, - * calling i/o open/close when all incoming tasks are done. - * - * Copyright (c) 2012-2016 mooster@42at.com - * https://github.com/moos/wordpos - * - * Released under MIT license - */ - -var _ = require('underscore')._, - util = require('util'), - fs = require('fs'); - -/** - * run single 'task' method sharing callbacks. Method MUST take callback as LAST arg. - * piper is bound to an IndexFile. - * - * @param task {string} - task name unique to method! - * @param method {function} - method to execute, gets (args, ... , callback) - * @param args {Array} - args to pass to method - * @param context {object} - other params to remember and sent to callback - * @param callback {function} - result callback - */ -function piper(task, method, args, context, callback){ - var readCallbacks = this.callbackQueue, - memoArgs = _.rest(arguments, 2), - wrappedCallback; - - //console.log('piper', task, [method]); - - // queue up if already reading file for this task - if (task in readCallbacks){ - readCallbacks[task].push(memoArgs); - return; - } - readCallbacks[task] = [memoArgs]; - - if (!this.fd) { - //console.log(' ... opening', this.filePath); - this.fd = fs.openSync(this.filePath, 'r'); - } - - // ref count so we know when to close the main index file - ++this.refcount; - - wrappedCallback = _.partial(piper.wrapper, this, task); - - // call method -- replace original callback (last arg) with wrapped one - method.apply(null, [].concat( args, wrappedCallback )); -} - -// result is the *same* for same task -piper.wrapper = function(self, task /*, result...*/){ - var readCallbacks = self.callbackQueue, - result = _.rest(arguments, 2), - callback, args; - - // live access callbacks cache in case nested cb's - // add to the array. - while (args = readCallbacks[task].shift()) { - callback = args.pop(); // last arg MUST be callback - -// console.log('>>>> pper wrapper', self.fastIndex.name, task, result.toString()) - callback.apply(null, [].concat(_.flatten(args, /*shallow*/true), result)); - } - - // now done - delete cb cache - delete readCallbacks[task]; - - if (--self.refcount === 0) { - //console.log(' ... closing', self.filePath); - fs.closeSync(self.fd); - self.fd = null; - } -}; - - -module.exports = piper; - diff --git a/src/browser/rand.js b/src/browser/rand.js deleted file mode 100644 index 17808c8..0000000 --- a/src/browser/rand.js +++ /dev/null @@ -1,267 +0,0 @@ -/*! - * rand.js - * - * define rand() and randX() functions on wordpos - * - * Copyright (c) 2012-2016 mooster@42at.com - * https://github.com/moos/wordpos - * - * Released under MIT license - */ - -var _ = require('underscore')._, - util = require('util'), - Trie = require('../lib/natural/trie/trie'), - IndexFile = require('./indexFile'), - KEY_LENGTH = 3; - - -/** - * factory function for randX() - * - * @param pos {string} - a,r,n,v - * @returns {Function} - rand function bound to an index file - */ -function makeRandX(pos){ - return function(opts, callback, _noprofile) { - // disable profiling when isX() used internally - var profile = this.options.profile && !_noprofile, - start = profile && new Date(), - args = [], - index = this.getFilesFor(pos).index, - startsWith = opts && opts.startsWith || '', - count = opts && opts.count || 1; - - if (typeof opts === 'function') { - callback = opts; - } - - return index.rand(startsWith, count, function (record) { - args.push(record, startsWith); - profile && args.push(new Date() - start); - callback && callback.apply(null, args); - }); - }; -} - -/** - * rand function (bound to index) - * - * @param startsWith {string} - get random word(s) that start with this, or '' - * @param num {number} - number of words to return - * @param callback {function} - callback function, receives words array and startsWith - * @returns Promise - */ -function rand(startsWith, num, callback){ - var self = this, - nextKey = null, - trie = this.fastIndex.trie, - key, keys; - - return new Promise(function(resolve, reject) { - - //console.log('-- ', startsWith, num, self.fastIndex.indexKeys.length); - if (startsWith) { - key = startsWith.slice(0, KEY_LENGTH); - - /** - * if key is 'a' or 'ab' (<3 chars), search for ALL keys starting with that. - */ - if (key.length < KEY_LENGTH) { - - // calc trie if haven't done so yet - if (!trie) { - trie = new Trie(); - trie.addStrings(self.fastIndex.indexKeys); - self.fastIndex.trie = trie; - //console.log(' +++ Trie calc '); - } - - try { - // trie throws if not found!!!!! - keys = trie.keysWithPrefix(startsWith); - } catch (e) { - keys = []; - } - - // read all keys then select random word. - // May be large disk read! - key = keys[0]; - nextKey = _.last(keys); - } - - if (!key || !(key in self.fastIndex.offsets)) { - callback && callback([], startsWith); - resolve([]); - } - - } else { - // no startWith given - random select among keys - keys = _.sample(self.fastIndex.indexKeys, num); - - // if num > 1, run each key independently and collect results - if (num > 1) { - var results = [], ii = 0; - _(keys).each(function (startsWith) { - self.rand(startsWith, 1, function (result) { - results.push(result[0]); - if (++ii == num) { - callback && callback(results, ''); - resolve(results); - } - }); - }); - return; - } - key = keys; - } - - // prepare the piper - var args = [key, nextKey, self], - task = 'rand:' + key + nextKey, - context = [startsWith, num, callback]; // last arg MUST be callback - - // pay the piper - self.piper(task, IndexFile.readIndexBetweenKeys, args, context, collector); - - function collector(key, nextKey, index, startsWith, num, callback, buffer) { - var lines = buffer.toString().split('\n'), - matches = lines.map(function (line) { - return line.substring(0, line.indexOf(' ')); - }); - //console.log(' got lines for key ', key, lines.length); - - // we got bunch of matches for key - now search within for startsWith - if (startsWith !== key) { - // binary search for startsWith within set of matches - var ind = _.sortedIndex(matches, startsWith); - if (ind >= lines.length || matches[ind].indexOf(startsWith) === -1) { - callback && callback([], startsWith); - resolve([]); - return; - } - - var trie = new Trie(); - trie.addStrings(matches); - //console.log('Trie > ', trie.matchesWithPrefix( startsWith )); - matches = trie.keysWithPrefix(startsWith); - } - - var words = _.sample(matches, num); - callback && callback(words, startsWith); - resolve(words); - } - - }); // Promise -} - -// relative weight of each POS word count (DB 3.1 numbers) -var POS_factor = { - Noun: 26, - Verb: 3, - Adjective: 5, - Adverb: 1, - Total: 37 -}; - -/** - * rand() - for all Index files - * @returns Promise - */ -function randAll(opts, callback) { - - if (typeof opts === 'function') { - callback = opts; - opts = {}; - } else { - opts = _.clone(opts || {}); - } - - var - profile = this.options.profile, - start = profile && new Date(), - results = [], - startsWith = opts && opts.startsWith || '', - count = opts && opts.count || 1, - args = [null, startsWith], - parts = 'Noun Verb Adjective Adverb'.split(' '), - self = this; - - - - return new Promise(function(resolve, reject) { - // select at random a POS to look at - var doParts = _.sample(parts, parts.length); - tryPart(); - - function tryPart() { - var part = doParts.pop(), - rand = 'rand' + part, - factor = POS_factor[part], - weight = factor / POS_factor.Total; - - // pick count according to relative weight - opts.count = Math.ceil(count * weight * 1.1); // guard against dupes - self[rand](opts, partCallback); - } - - function partCallback(result) { - if (result) { - results = _.uniq(results.concat(result)); // make sure it's unique! - } - - if (results.length < count && doParts.length) { - return tryPart(); - } - - // final random and trim excess - results = _.sample(results, count); - done(); - } - - function done() { - profile && (args.push(new Date() - start)); - args[0] = results; - callback && callback.apply(null, args); - resolve(results); - } - - }); // Promise -} - -/** - * bind rand() to index - * - * @param index {object} - the IndexFile instance - * @returns {function} - bound rand function for index - */ -function randomify(index){ - if (!index.fastIndex) throw 'rand requires fastIndex'; - return _.bind(rand, index); -} - - - -module.exports = { - - init: function(wordposProto) { - wordposProto.nounIndex.rand = randomify(wordposProto.nounIndex); - wordposProto.verbIndex.rand = randomify(wordposProto.verbIndex); - wordposProto.adjIndex.rand = randomify(wordposProto.adjIndex); - wordposProto.advIndex.rand = randomify(wordposProto.advIndex); - - /** - * define rand() - */ - wordposProto.rand = randAll; - - /** - * define randX() - */ - wordposProto.randAdjective = makeRandX('a'); - wordposProto.randAdverb = makeRandX('r'); - wordposProto.randNoun = makeRandX('n'); - wordposProto.randVerb = makeRandX('v'); - } -}; - diff --git a/src/common.js b/src/common.js index a405af2..057df38 100644 --- a/src/common.js +++ b/src/common.js @@ -1,6 +1,15 @@ -import { normalize, nextTick } from './util'; - +/** +* common.js +* +* Copyright (c) 2012-2019 mooster@42at.com +* https://github.com/moos/wordpos +* +* Portions: Copyright (c) 2011, Chris Umbel +* +* Released under MIT license +*/ +var { normalize, nextTick } = require('./util'); /** * factory for main lookup function @@ -57,7 +66,6 @@ function lookup(pos) { */ function indexLookup(word, callback) { var self = this; - return new Promise(function(resolve, reject){ self.find(word, function (record) { var indexRecord = null, @@ -91,8 +99,6 @@ function indexLookup(word, callback) { }); } - - /** * getX() factory function * @@ -129,7 +135,6 @@ function get(isFn) { }; } - /** * isX() factory function * @@ -158,7 +163,6 @@ function is(pos){ }; } - /** * parse a single data file line, returning data object * @@ -218,6 +222,32 @@ function lineDataToJSON(line, location) { }; } + +/** + * seek - get record at offset for pos + * + * @param offset {number} - synset offset + * @param pos {string} - POS a/r/n/v + * @param callback {function} - optional callback + * @returns Promise + * @this WordPOS + */ +function seek(offset, pos, callback){ + var offsetTmp = Number(offset); + if (isNaN(offsetTmp) || offsetTmp <= 0) return error('Offset must be valid positive number: ' + offset); + + var data = this.getFilesFor(pos).data; + if (!data) return error('Incorrect POS - 2nd argument must be a, r, n or v.'); + + return data.lookup(offset, callback); + + function error(msg) { + var err = new Error(msg); + callback && callback(err, {}); + return Promise.reject(err); + } +} + const LEX_NAMES = [ 'adj.all', 'adj.pert', @@ -266,10 +296,12 @@ const LEX_NAMES = [ 'adj.ppl' ]; -export { +// console.log(333, typeof export) +module.exports= { indexLookup, is, get, + seek, lineDataToJSON, LEX_NAMES, diff --git a/src/util.js b/src/util.js index 0b2d7ba..28e6718 100644 --- a/src/util.js +++ b/src/util.js @@ -1,7 +1,15 @@ +/** +* util.js +* +* Copyright (c) 2012-2019 mooster@42at.com +* https://github.com/moos/wordpos +* +* Released under MIT license +*/ + let stopwords = require('../lib/natural/util/stopwords').words; let stopwordsStr = makeStopwordString(stopwords); - function makeStopwordString(stopwords) { return ' ' + stopwords.join(' ') + ' '; } @@ -18,8 +26,8 @@ function normalize(word) { return word.toLowerCase().replace(/\s+/g, '_'); } -function isStopword(stopwords, word) { - return stopwords.indexOf(' '+word+' ') >= 0; +function isStopword(stopwordsStr, word) { + return stopwordsStr.indexOf(' '+word+' ') >= 0; } function tokenizer(str) { @@ -47,7 +55,8 @@ function prepText(text) { )); } -export { +module.exports = { + stopwords, nextTick, normalize, tokenizer, From 359ada5e8a6acbd7c62bc532bce8d36c490382af Mon Sep 17 00:00:00 2001 From: Moos Date: Sun, 14 Oct 2018 22:21:50 -0700 Subject: [PATCH 03/20] browser complete --- samples/self-hosted/index.html | 5 +---- src/common.js | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/samples/self-hosted/index.html b/samples/self-hosted/index.html index e71dc4d..f20cb0b 100644 --- a/samples/self-hosted/index.html +++ b/samples/self-hosted/index.html @@ -18,9 +18,7 @@

Self-hosted WordPOS sample

Open console to see results. -

-   var a = "foo"
- 
+
 
- + + - + + + + + +

CDN WordPOS sample

+ Open console to see results. + +

Coming soon...

+ +
 
+ + + + + + diff --git a/samples/self-hosted/index.html b/samples/self-hosted/index.html index d35bea8..1c1b8d3 100644 --- a/samples/self-hosted/index.html +++ b/samples/self-hosted/index.html @@ -1,6 +1,9 @@ + + Wordpos in the browser + @@ -28,7 +31,7 @@ .then(res => res.text()) .then(txt => { el.innerText = txt; - hljs.initHighlightingOnLoad(); + window.hljs && hljs.initHighlightingOnLoad(); }); } else { el.innerHTML = 'Open main.js.'; diff --git a/src/browser/baseFile.js b/src/browser/baseFile.js index 3903f32..abce7c6 100644 --- a/src/browser/baseFile.js +++ b/src/browser/baseFile.js @@ -39,7 +39,7 @@ class BaseFile { let promise = isTest ? Promise.resolve(require(this.filePath)) - : eval(`import('${this.filePath}')`); // prevent parcel from clobbering dynamic import + : ES6_IMPORT(`${this.filePath}`); // prevent parcel from clobbering dynamic import this.options.debug && console.timeEnd('index load ' + this.posName) return promise diff --git a/src/browser/index.js b/src/browser/index.js index 9eaa59f..131a539 100644 --- a/src/browser/index.js +++ b/src/browser/index.js @@ -20,6 +20,7 @@ const POS = { r: 'adv' }; + class WordPOS { options = {}; From 74bd2a75d07fe422ff9d9f5ea058022c14e1d893 Mon Sep 17 00:00:00 2001 From: moos Date: Fri, 24 May 2019 14:18:55 -0700 Subject: [PATCH 14/20] add cdn browser sample --- samples/cdn/index.html | 20 +++++++++++++++----- samples/{self-hosted => }/main.js | 7 ------- samples/self-hosted/index.html | 17 +++++++++++++---- 3 files changed, 28 insertions(+), 16 deletions(-) rename samples/{self-hosted => }/main.js (86%) diff --git a/samples/cdn/index.html b/samples/cdn/index.html index a3ce54b..ce1f425 100644 --- a/samples/cdn/index.html +++ b/samples/cdn/index.html @@ -7,7 +7,19 @@ - + + + + + + + + + +

CDN WordPOS sample

+ Open console to see results. + +
 
+ + + + + + diff --git a/docs/main.js b/docs/main.js new file mode 100644 index 0000000..5c3349d --- /dev/null +++ b/docs/main.js @@ -0,0 +1,32 @@ +let assertLikely = (r) => { + console.assert(r.def === 'with considerable certainty'); + console.assert(r.pos === 'r'); + console.assert(r.synsetOffset === '00139421'); +}; + +console.group('Likely'); +wordpos.isAdverb('likely').then(res => console.assert(res)); +wordpos.isAdverb('likely', (res, ...profile) => console.log('callback with profile', res, profile)); + +wordpos.getAdverbs('this is is lately a likely tricky business this is') + .then(res => { + let expect = {lately: 1, likely: 1}; + console.log('getAdverbs:', res); + console.assert(res[0] in expect); // NOTE: order is NOT gauranteed! + console.assert(res[1] in expect); + }); + +wordpos.lookupAdverb('likely') + .then(res => { + console.log('lookupAdverb:', res[0]); + assertLikely(res[0]); + }); +// wordpos.lookup('likely').then(res, console.log('lookup ===', res)) + +wordpos.seek('00139421', 'r') + .then(res => { + console.log('seek:', res); + assertLikely(res); + }); + +setTimeout(() => console.groupEnd('Likely'), 1000); diff --git a/docs/self-hosted/index.html b/docs/self-hosted/index.html new file mode 100644 index 0000000..dcdbc03 --- /dev/null +++ b/docs/self-hosted/index.html @@ -0,0 +1,52 @@ + + + + + Wordpos in the browser + + + + + + + + + + + + + +

Self-hosted WordPOS sample

+ Open console to see results. + +
 
+ + + + + + From 04b1cd285feba0890336f9a96b14efeeee1ed0e2 Mon Sep 17 00:00:00 2001 From: Moos Date: Fri, 31 May 2019 05:52:48 -0700 Subject: [PATCH 20/20] update readme --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d6f033f..5785d1e 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,9 @@ wordpos is a set of *fast* part-of-speech (POS) utilities for Node.js **and** br Version 1.x is a major update with no direct dependence on [natural's](https://github.com/NaturalNode/natural#wordnet) WordNet module, with support for [Promises](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise), and roughly 5x speed improvement over previous version. -~~**CAUTION** The WordNet database [wordnet-db](https://github.com/moos/wordnet-db) comprises [155,287 words](https://wordnet.princeton.edu/documentation/wnstats7wn) (3.0 numbers) which uncompress to over **30 MB** of data in several *un*[browserify](https://github.com/substack/node-browserify)-able files. It is *not* meant for the browser environment.~~ +> ~~**CAUTION** The WordNet database [wordnet-db](https://github.com/moos/wordnet-db) comprises [155,287 words](https://wordnet.princeton.edu/documentation/wnstats7wn) (3.0 numbers) which uncompress to over **30 MB** of data in several *un*[browserify](https://github.com/substack/node-browserify)-able files. It is *not* meant for the browser environment.~~ -:zap: v2.x can work in browsers -- see below for example. +:zap: v2.x can work in browsers -- to try it out `npm i wordpos@beta` or [see it in action](https://moos.github.io/wordpos). See below for usage. ## Installation @@ -298,7 +298,7 @@ Note that callback receives full arguments (including profile, if enabled), whil v2.0 introduces the capability of running wordpos in the browser. The dictionary files are optimized for fast access (lookup by lemma), but they must be fetched, parsed and loaded into browser memory. The files are loaded on-demand (unless the option `preload: true` is given). -The dict files can be served locally or from CDN (see [samples/cdn](samples/cdn/).). Include the following scripts in your `index.html`: +The dict files can be served locally or from CDN (see [samples/cdn](samples/cdn/) for code, or [see it in action](https://moos.github.io/wordpos)). Include the following scripts in your `index.html`: ```html