first checkin for browser rework

This commit is contained in:
Moos 2018-10-12 20:35:11 -07:00
parent b972581640
commit 364b2648f7
14 changed files with 1221 additions and 4 deletions

4
.babelrc Normal file
View File

@ -0,0 +1,4 @@
{
"presets": ["env", "stage-2"],
"plugins": ["transform-class-properties"]
}

2
.gitignore vendored
View File

@ -2,3 +2,5 @@ dict
node_modules
.idea
*.iml
.cache
dist

View File

@ -1,6 +1,6 @@
{
"name": "wordpos",
"version": "1.2.0",
"version": "2.0.0-alpha",
"description": "wordpos is a set of part-of-speech utilities for Node.js using the WordNet database.",
"author": "Moos <mooster@42at.com>",
"keywords": [
@ -16,14 +16,26 @@
"engines": {
"node": ">=4"
},
"files": ["bench","bin","lib","src","test","tools"],
"files": [
"bench",
"bin",
"lib",
"src",
"test",
"tools"
],
"bin": "./bin/wordpos-cli.js",
"dependencies": {
"commander": "^2.0.0",
"dict": "^1.4.0",
"underscore": ">=1.3.1",
"wordnet-db": "^3.1.6"
},
"devDependencies": {
"babel-core": "^6.26.3",
"babel-plugin-transform-class-properties": "^6.24.1",
"babel-preset-env": "^1.7.0",
"babel-preset-stage-2": "^6.24.1",
"chai": "^4.0.2",
"mini-bench": "^1.0.0",
"mocha": "^5.2.0"
@ -35,7 +47,11 @@
"main": "./src/wordpos.js",
"scripts": {
"postinstall": "node tools/stat.js --no-stats index.adv index.adj index.verb index.noun",
"test": "mocha test"
"postinstall-web": "node scripts/makeJsonDict.js index data",
"test": "mocha test",
"start": "npm run start-self",
"start-self": "parcel samples/self-hosted/index.html",
"start-cdn": "parcel samples/cdn/index.html"
},
"license": "MIT"
}

View File

@ -0,0 +1,51 @@
<!doctype html>
<html>
<head>
<script src="./main.js"></script>
<script type="ignore-me">
// import IndexFile from "../../src/browser/indexFile.js";
console.log(333, IndexFile)
let posExt = ['adj', 'adv', 'noun', 'verb'];
let dictRoot = '../../dict/';
let files = {};
function loadPos(pos) {
return import(dictRoot + 'index.' + pos + '.js');
console.time('load-' + pos);
let get = (name) => {
let path = dictRoot + name + '.' + pos + '.json';
return fetch(path).then(res => res.json()).then(obj => {
// console.log(`got ${path}: `, text);
files[pos] = files[pos] || {};
files[pos][name] = obj;
console.timeEnd('load-' + pos);
});
};
// get('data');
return get('index');
}
let pos = 'adv';
loadPos(pos).then(result => {
console.log('got', pos ,result);
window.res = result.default;
});
</script>
</head>
<body>
<h1>Self-hosted WordPOS sample</h1>
<script>
</script>
</body>
</html>

View File

@ -0,0 +1,19 @@
// Self-hosted WordPOS browser sample (bundled by parcel, see package.json start-self).
import WordPOS from '../../src/browser';

// NOTE(review): __dirname is not defined in ES modules -- presumably the
// bundler shims it; confirm under parcel.
console.log(__dirname, WordPOS.defaults)

// expose the instance on window for console experimentation
let wordpos = window.wordpos = new WordPOS({
  // preload: true,
  dictPath: './dict',
  profile: true,
  // stopwords: false
});

// exercise the three API families: isX(), getX(), lookupX()
wordpos.isAdverb('likely').then(res => console.log('likely is adverb:', res));
// wordpos.isAdverb('likely', (res, ...profile) => console.log('likely callback', res, profile));

wordpos.getAdverbs('this is is likely a likely tricky business this is').then(
  res => console.log('getAdverb', res)
);

wordpos.lookupAdverb('likely').then(res => console.log('lookup ===', res))
// wordpos.lookup('likely').then(res, console.log('lookup ===', res))

85
scripts/makeJsonDict.js Normal file
View File

@ -0,0 +1,85 @@
#!/usr/bin/env node
/**
* takes original WordNet index & data files and converts to
* exported JSON format with lemma as the key.
*/
let fs = require('fs');
let path = require('path');
let outPath = './dict';
let posExt = ['adj', 'adv', 'noun', 'verb'];
let dictRoot = './node_modules/wordnet-db/dict/';
const fileTypes = {
data: true,
index: true
};
const [,, ...args] = process.argv;
if (!args.length || args.filter(p => !(p in fileTypes)).length) {
console.log('Converts wordnet-db index & data files to JSON format for use in the browser.');
console.log('\nUsage: makeJsonDict.js index|data');
process.exit(1);
}
/**
 * Returns a copy of arr with duplicates removed (first occurrence wins).
 * @param {Array} arr
 * @returns {Array}
 */
function uniq(arr) {
  return [...new Set(arr)];
}
console.time('Done');
// create out directory
try {
fs.statSync(outPath);
} catch (e) {
fs.mkdirSync(outPath);
}
/**
 * Converts one WordNet file type ('index' or 'data') for every POS:
 * reads the raw file from dictRoot (wordnet-db), keys each line by its
 * first token, and writes an ES-module JSON file into outPath.
 *
 * @param {string} name - file type: 'index' or 'data'
 */
function processFile(name) {
  // read the file as text
  function loadFile(pos) {
    console.time(' load');
    let inPath = path.resolve(dictRoot, name + '.' + pos);
    let text = fs.readFileSync(inPath, 'utf8');
    console.timeEnd(' load');
    return text;
  }

  // convert raw text to JSON and write to file
  function processText(pos, text) {
    let obj = {};
    let sp = ' ';
    console.time(' process');
    text.split('\n').forEach(line => {
      // skip blank lines and lines beginning with a space
      // (presumably the WordNet license header -- TODO confirm)
      if (!line || line[0] === sp) return;
      let spi = line.indexOf(sp);
      // first token (lemma/offset) becomes the JSON key...
      let key = line.substr(0, spi);
      // ...and is stripped from the stored value; the tail is cut at the
      // last double-space (assumes WordNet lines end with "  "-padded
      // trailer -- TODO confirm against the file layout)
      line = line.substring(1 + spi, line.lastIndexOf(sp + sp))
      obj[key] = line;
    });
    console.timeEnd(' process');
    return obj;
  }

  // serialize obj as an importable module: "export default {...}"
  function writeFile(pos, obj) {
    console.time(' write');
    let text = JSON.stringify(obj);
    text = 'export default ' + text;
    fs.writeFileSync(path.resolve(outPath, name + '.' + pos + '.js'), text);
    console.timeEnd(' write');
  }

  // run the load -> convert -> write pipeline for each POS
  posExt.forEach(pos => {
    console.log('\n', name, pos, ':');
    let text = loadFile(pos);
    let obj = processText(pos, text);
    writeFile(pos, obj);
  });
}
uniq(args).forEach(processFile);
console.log('\nWritten to', path.resolve(outPath));
console.timeEnd('Done');

30
src/browser/baseFile.js Normal file
View File

@ -0,0 +1,30 @@
/**
 * BaseFile -- common loader for the JSON-converted WordNet files
 * (index.<pos>.js / data.<pos>.js produced by scripts/makeJsonDict.js).
 */
class BaseFile {
  /**
   * file contents (populated by load())
   * @type {Object}
   */
  file = {};

  /**
   * @param {string} type - file type: 'index' or 'data'
   * @param {string} dictPath - path to the dict folder
   * @param {string} posName - POS name: noun, verb, adj, adv
   */
  constructor(type, dictPath, posName) {
    this.type = type;
    this.filePath = `${dictPath}/${type}.${posName}.js`;
  }

  /**
   * dynamically imports the file and caches its default export
   * @returns {Promise<Object>} resolves with the file contents
   */
  load() {
    return import(this.filePath)
      .then(mod => this.file = mod.default)
      .catch(err => {
        console.error(`Error loading ${this.type} file for ${this.filePath}.`, err);
        throw err;
      });
  }

  /**
   * runs fn(...args) bound to this instance once the file is loaded
   */
  ready(fn, args) {
    return this.load().then(() => fn.apply(this, args));
  }
}

export default BaseFile;

92
src/browser/dataFile.js Normal file
View File

@ -0,0 +1,92 @@
/*!
* dataFile.js
*
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Portions: Copyright (c) 2011, Chris Umbel
*
* Released under MIT license
*/
import { lineDataToJSON, LEX_NAMES } from '../common';
import BaseFile from './baseFile';
/**
 * get parsed line from data file
 *
 * @param {string} offset The offset key
 * @return {object} Data record object (empty object when offset not found)
 * @this DataFile
 */
function seek(offset) {
  const line = this.file[offset];
  if (!line) return {};
  // the offset was stripped off as the JSON key -- restore it before parsing
  return lineDataToJSON(`${offset} ${line}`);
}
/**
 * lookup offsets in data file
 *
 * @param offsets {array|string} - offset(s) to look up (obtained from index.find())
 * @param callback {function} (optional) - callback, receives (err, results)
 * @returns {Promise.[<Object>]} array of or single data record
 * @this DataFile
 */
function lookup(offsets, callback) {
  const single = !Array.isArray(offsets);
  const list = single ? [offsets] : offsets;
  const readLine = seek.bind(this);
  const filePath = this.filePath;

  return new Promise((resolve, reject) => {
    // parse each offset; records without a POS are misses
    const results = list.map(readLine).filter(record => record.pos);

    if (!results.length) {
      const err = new RangeError(`No data at offsets ${list.join()} in ${filePath}.`);
      callback && callback(err, single ? {} :[]);
      return reject(err);
    }

    const payload = single ? results[0] : results;
    callback && callback(null, payload);
    resolve(payload);
  });
}
/**
 * DataFile class
 *
 * Lookup into a JSON-converted WordNet data.<pos> file, loaded on
 * demand via BaseFile.
 *
 * @param dictPath {string} - path to dict folder
 * @param name {string} - POS name
 * @constructor
 */
class DataFile extends BaseFile {
  constructor(dictPath, posName) {
    super('data', dictPath, posName);
  }

  // resolves with data record(s) for the given offset(s) -- see lookup()
  lookup() {
    return this.ready(lookup, arguments);
  }

  // not implemented for the browser build yet
  seek() {
    // return this.ready(find, arguments);
  }
}
/**
* map of lexFilenum to lex names
*
* @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html
* @type {string[]}
*/
DataFile.LEX_NAMES = LEX_NAMES;
export default DataFile;

165
src/browser/index.js Normal file
View File

@ -0,0 +1,165 @@
import { stopwords, prepText, makeStopwordString } from '../util';
import { is, get, lookup } from '../common';
import IndexFile from './indexFile';
import DataFile from './dataFile';
const POS = {
n: 'noun',
v: 'verb',
a: 'adj',
r: 'adv'
};
/**
 * WordPOS -- browser implementation backed by JSON-converted WordNet files.
 */
class WordPOS {
  // effective options: WordPOS.defaults merged with the user config
  options = {};

  // resolves once preloading (if requested) has finished
  loaded = Promise.resolve();

  /**
   * @param {object} [config] - overrides for WordPOS.defaults
   */
  constructor(config) {
    this.options = Object.assign({}, WordPOS.defaults, config);
    this.initFiles();
    // normalize a user-supplied stopword array into the lookup string form
    if (Array.isArray(this.options.stopwords)) {
      this.options.stopwords = makeStopwordString(this.options.stopwords);
    }
    // TODO rand()
  }

  /**
   * @returns {Promise} resolves when requested preloading is complete
   */
  ready() {
    return this.loaded;
  }

  /**
   * creates an IndexFile & DataFile instance per POS and kicks off
   * preloading when options.preload is set
   */
  initFiles() {
    const keys = Object.keys(POS);
    const loadOne = (Comp, pos) => new Comp(this.options.dictPath, POS[pos]);
    const loader = (Comp) => keys.map(loadOne.bind(null, Comp));
    const reducer = (arr) => arr.reduce((coll, item, i) => (coll[keys[i]] = item, coll), {});

    this.indexFiles = reducer(loader(IndexFile));
    this.dataFiles = reducer(loader(DataFile));

    if (this.options.preload) {
      this.loaded = this.preloadIndexes(this.options.preload);
    }
  }

  /**
   * @param {string} pos - single-letter POS (n/v/a/r)
   * @returns {{index: IndexFile, data: DataFile}}
   */
  getFilesFor(pos) {
    return {
      index: this.indexFiles[pos],
      data: this.dataFiles[pos]
    };
  }

  /**
   * loads index files
   *
   * @param {string|Array|boolean} [pos] POS to load (true/undefined: all)
   * @return {Promise.<index data>}
   */
  preloadIndexes(pos) {
    // FIX: previously read `this.indexFile[pos]` (non-existent property --
    // it's `indexFiles`) and reused that single file for every POS, so
    // "preload all" loaded nothing useful.
    const known = p => p in this.indexFiles;
    const load = p => this.indexFiles[p].load();

    if (!pos || pos === true) { // preload all
      return Promise.all(Object.keys(POS).map(load));
    }
    if (typeof pos === 'string' && known(pos)) {
      return load(pos);
    }
    if (Array.isArray(pos) && pos.every(known)) {
      // FIX: was pos.forEach(...), whose undefined return caused a
      // spurious rejection even for valid POS arrays
      return Promise.all(pos.map(load));
    }
    // TODO includeData
    return Promise.reject(new RangeError(`Unknown POS "${pos}" for preload.`));
  }

  // tokenizes text, removing duplicates & (optionally) stopwords
  parse = prepText;

  /**
   * isX() - Test if word is given POS
   * @see is
   */
  isAdjective = is('a');
  isAdverb = is('r');
  isNoun = is('n');
  isVerb = is('v');

  /**
   * getX() - Find all words in string that are given POS
   * @see get
   */
  getAdjectives = get('isAdjective');
  getAdverbs = get('isAdverb');
  getNouns = get('isNoun');
  getVerbs = get('isVerb');

  /**
   * lookupX() - Lookup word definition if already know POS
   * @see lookup
   */
  lookupAdjective = lookup('a');
  lookupAdverb = lookup('r');
  lookupNoun = lookup('n');
  lookupVerb = lookup('v');
}
WordPOS.defaults = {
/**
* path to WordNet data (override only if not using wordnet-db)
* @type {string}
*/
dictPath: '',
/**
* enable profiling, time in msec returned as second argument in callback
* @type {boolean}
*/
profile: false,
/**
* if true, exclude standard stopwords.
* if array, stopwords to exclude, eg, ['all','of','this',...]
* if false, do not filter any stopwords.
* @type {boolean}
*/
stopwords: true,
/**
* preload files.
* true - preload all POS
* false - do not preload any POS
* 'a' - preload adj
* ['a','v'] - preload adj & verb
* @type {boolean|string|Array}
*/
preload: false,
/**
* include data files in preload
* @type {boolean}
*/
includeData: false
};
/**
* access to WordNet DB
* @type {object}
*/
// WordPOS.WNdb = WNdb;
/**
* access to stopwords
* @type {Array}
*/
WordPOS.stopwords = stopwords;
export default WordPOS;

71
src/browser/indexFile.js Normal file
View File

@ -0,0 +1,71 @@
/*!
* indexFile.js
*
* implements fast index lookup of WordNet's index files
*
* Copyright (c) 2012-2019 mooster@42at.com
* https://github.com/moos/wordpos
*
* Portions: Copyright (c) 2011, Chris Umbel
*
* Released under MIT license
*/
import { indexLookup } from '../common';
import BaseFile from './baseFile';
/**
 * find a search term in a loaded index file
 *
 * @param search {string} - word to search for
 * @param callback {function} - receives {status: 'miss'} on a miss, or
 *   {status: 'hit', key, line, tokens} on a hit
 * @returns none
 * @this IndexFile
 */
function find(search, callback) {
  if (!(search in this.file)) {
    callback({ status: 'miss' });
    return;
  }

  const line = this.file[search];
  // the key was stripped when the index was converted to JSON --
  // put it back at the front of the token list
  const tokens = line.split(/\s+/);
  tokens.unshift(search);

  callback({
    status: 'hit',
    key: search,
    line: line,
    tokens: tokens
  });
}
/**
* IndexFile class
*
* @param dictPath {string} - WordNet db dict path
* @param name {string} - name of index: noun, verb, adj, adv
* @constructor
*/
class IndexFile extends BaseFile {
constructor(dictPath, posName) {
super('index', dictPath, posName);
}
lookup() {
return this.ready(indexLookup, arguments);
}
find() {
return this.ready(find, arguments);
}
}
export default IndexFile;

82
src/browser/piper.js Normal file
View File

@ -0,0 +1,82 @@
/*!
* piper.js
*
* executes multiple async i/o tasks and pools similar callbacks,
* calling i/o open/close when all incoming tasks are done.
*
* Copyright (c) 2012-2016 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
var _ = require('underscore')._,
util = require('util'),
fs = require('fs');
/**
 * run single 'task' method sharing callbacks. Method MUST take callback as LAST arg.
 * piper is bound to an IndexFile.
 *
 * NOTE(review): this module still uses fs via require -- it appears carried
 * over unchanged from the Node implementation and is not browser-ready yet.
 *
 * @param task {string} - task name unique to method!
 * @param method {function} - method to execute, gets (args, ... , callback)
 * @param args {Array} - args to pass to method
 * @param context {object} - other params to remember and sent to callback
 * @param callback {function} - result callback
 */
function piper(task, method, args, context, callback){
  var readCallbacks = this.callbackQueue,
    // everything after (task, method) is memoized for replay in the wrapper
    memoArgs = _.rest(arguments, 2),
    wrappedCallback;

  //console.log('piper', task, [method]);

  // queue up if already reading file for this task
  if (task in readCallbacks){
    readCallbacks[task].push(memoArgs);
    return;
  }
  readCallbacks[task] = [memoArgs];

  // lazily open the file on first use
  if (!this.fd) {
    //console.log(' ... opening', this.filePath);
    this.fd = fs.openSync(this.filePath, 'r');
  }

  // ref count so we know when to close the main index file
  ++this.refcount;

  wrappedCallback = _.partial(piper.wrapper, this, task);

  // call method -- replace original callback (last arg) with wrapped one
  method.apply(null, [].concat( args, wrappedCallback ));
}
// result is the *same* for same task
// Fans the single i/o result out to every callback queued under `task`,
// then closes the file descriptor once all tasks have drained.
piper.wrapper = function(self, task /*, result...*/){
  var readCallbacks = self.callbackQueue,
    result = _.rest(arguments, 2),
    callback, args;

  // live access callbacks cache in case nested cb's
  // add to the array.
  while (args = readCallbacks[task].shift()) {
    callback = args.pop(); // last arg MUST be callback
    // console.log('>>>> pper wrapper', self.fastIndex.name, task, result.toString())
    callback.apply(null, [].concat(_.flatten(args, /*shallow*/true), result));
  }

  // now done - delete cb cache
  delete readCallbacks[task];

  // last outstanding task: release the shared fd
  if (--self.refcount === 0) {
    //console.log(' ... closing', self.filePath);
    fs.closeSync(self.fd);
    self.fd = null;
  }
};
module.exports = piper;

267
src/browser/rand.js Normal file
View File

@ -0,0 +1,267 @@
/*!
* rand.js
*
* define rand() and randX() functions on wordpos
*
* Copyright (c) 2012-2016 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
var _ = require('underscore')._,
util = require('util'),
Trie = require('../lib/natural/trie/trie'),
IndexFile = require('./indexFile'),
KEY_LENGTH = 3;
/**
 * factory function for randX()
 *
 * @param pos {string} - a,r,n,v
 * @returns {Function} - rand function bound to an index file
 */
function makeRandX(pos){
  return function(opts, callback, _noprofile) {
    // profiling is disabled when called internally (e.g. by rand())
    const profile = this.options.profile && !_noprofile;
    const started = profile && new Date();
    const cbArgs = [];
    const index = this.getFilesFor(pos).index;
    const startsWith = opts && opts.startsWith || '';
    const count = opts && opts.count || 1;

    // randX(callback) form: opts was really the callback
    if (typeof opts === 'function') {
      callback = opts;
    }

    return index.rand(startsWith, count, function (record) {
      cbArgs.push(record, startsWith);
      profile && cbArgs.push(new Date() - started);
      callback && callback.apply(null, cbArgs);
    });
  };
}
/**
 * rand function (bound to index)
 *
 * NOTE(review): depends on this.fastIndex, this.piper and
 * IndexFile.readIndexBetweenKeys -- all Node-only facilities not present in
 * the browser BaseFile/IndexFile above. Carried over from the Node build;
 * needs porting before rand() works in the browser.
 *
 * @param startsWith {string} - get random word(s) that start with this, or ''
 * @param num {number} - number of words to return
 * @param callback {function} - callback function, receives words array and startsWith
 * @returns Promise
 */
function rand(startsWith, num, callback){
  var self = this,
    nextKey = null,
    trie = this.fastIndex.trie,
    key, keys;

  return new Promise(function(resolve, reject) {
    //console.log('-- ', startsWith, num, self.fastIndex.indexKeys.length);
    if (startsWith) {
      // bucket key is the first KEY_LENGTH chars of the prefix
      key = startsWith.slice(0, KEY_LENGTH);

      /**
       * if key is 'a' or 'ab' (<3 chars), search for ALL keys starting with that.
       */
      if (key.length < KEY_LENGTH) {
        // calc trie if haven't done so yet
        if (!trie) {
          trie = new Trie();
          trie.addStrings(self.fastIndex.indexKeys);
          self.fastIndex.trie = trie;
          //console.log(' +++ Trie calc ');
        }

        try {
          // trie throws if not found!!!!!
          keys = trie.keysWithPrefix(startsWith);
        } catch (e) {
          keys = [];
        }

        // read all keys then select random word.
        // May be large disk read!
        key = keys[0];
        nextKey = _.last(keys);
      }

      // prefix maps to no known bucket: resolve empty
      if (!key || !(key in self.fastIndex.offsets)) {
        callback && callback([], startsWith);
        resolve([]);
      }
    } else {
      // no startWith given - random select among keys
      keys = _.sample(self.fastIndex.indexKeys, num);

      // if num > 1, run each key independently and collect results
      if (num > 1) {
        var results = [], ii = 0;
        _(keys).each(function (startsWith) {
          self.rand(startsWith, 1, function (result) {
            results.push(result[0]);
            if (++ii == num) {
              callback && callback(results, '');
              resolve(results);
            }
          });
        });
        return;
      }
      key = keys;
    }

    // prepare the piper
    var args = [key, nextKey, self],
      task = 'rand:' + key + nextKey,
      context = [startsWith, num, callback]; // last arg MUST be callback

    // pay the piper
    self.piper(task, IndexFile.readIndexBetweenKeys, args, context, collector);

    // receives the raw buffer of index lines between key & nextKey
    function collector(key, nextKey, index, startsWith, num, callback, buffer) {
      var lines = buffer.toString().split('\n'),
        matches = lines.map(function (line) {
          return line.substring(0, line.indexOf(' '));
        });
      //console.log(' got lines for key ', key, lines.length);

      // we got bunch of matches for key - now search within for startsWith
      if (startsWith !== key) {
        // binary search for startsWith within set of matches
        var ind = _.sortedIndex(matches, startsWith);
        if (ind >= lines.length || matches[ind].indexOf(startsWith) === -1) {
          callback && callback([], startsWith);
          resolve([]);
          return;
        }

        var trie = new Trie();
        trie.addStrings(matches);
        //console.log('Trie > ', trie.matchesWithPrefix( startsWith ));
        matches = trie.keysWithPrefix(startsWith);
      }

      var words = _.sample(matches, num);
      callback && callback(words, startsWith);
      resolve(words);
    }
  }); // Promise
}
// relative weight of each POS word count (DB 3.1 numbers)
var POS_factor = {
Noun: 26,
Verb: 3,
Adjective: 5,
Adverb: 1,
Total: 37
};
/**
 * rand() - for all Index files
 *
 * Draws candidate batches from randomly-ordered POS files (batch size
 * weighted by each POS's relative word count) until `count` unique words
 * are collected.
 *
 * @param {object|function} [opts] - {startsWith, count} options, or callback
 * @param {function} [callback] - receives (words, startsWith [, profile msec])
 * @returns Promise
 */
function randAll(opts, callback) {
  if (typeof opts === 'function') {
    callback = opts;
    opts = {};
  } else {
    // clone: opts.count is mutated per-POS below
    opts = _.clone(opts || {});
  }

  var
    profile = this.options.profile,
    start = profile && new Date(),
    results = [],
    startsWith = opts && opts.startsWith || '',
    count = opts && opts.count || 1,
    args = [null, startsWith],
    parts = 'Noun Verb Adjective Adverb'.split(' '),
    self = this;

  return new Promise(function(resolve, reject) {
    // select at random a POS to look at
    var doParts = _.sample(parts, parts.length);
    tryPart();

    // draw a weighted batch from the next POS via its randX() method
    function tryPart() {
      var part = doParts.pop(),
        rand = 'rand' + part,
        factor = POS_factor[part],
        weight = factor / POS_factor.Total;

      // pick count according to relative weight
      opts.count = Math.ceil(count * weight * 1.1); // guard against dupes
      self[rand](opts, partCallback);
    }

    // accumulate unique words; try remaining POS while short of count
    function partCallback(result) {
      if (result) {
        results = _.uniq(results.concat(result)); // make sure it's unique!
      }
      if (results.length < count && doParts.length) {
        return tryPart();
      }
      // final random and trim excess
      results = _.sample(results, count);
      done();
    }

    function done() {
      profile && (args.push(new Date() - start));
      args[0] = results;
      callback && callback.apply(null, args);
      resolve(results);
    }
  }); // Promise
}
/**
 * bind rand() to index
 *
 * @param index {object} - the IndexFile instance
 * @returns {function} - bound rand function for index
 * @throws {string} when the index has no fastIndex (required by rand)
 */
function randomify(index){
  if (!index.fastIndex) throw 'rand requires fastIndex';
  return _.bind(rand, index);
}

module.exports = {
  // mixes rand()/randX() methods into the WordPOS prototype;
  // expects the prototype's per-POS index files to already exist
  init: function(wordposProto) {
    wordposProto.nounIndex.rand = randomify(wordposProto.nounIndex);
    wordposProto.verbIndex.rand = randomify(wordposProto.verbIndex);
    wordposProto.adjIndex.rand = randomify(wordposProto.adjIndex);
    wordposProto.advIndex.rand = randomify(wordposProto.advIndex);

    /**
     * define rand()
     */
    wordposProto.rand = randAll;

    /**
     * define randX()
     */
    wordposProto.randAdjective = makeRandX('a');
    wordposProto.randAdverb = makeRandX('r');
    wordposProto.randNoun = makeRandX('n');
    wordposProto.randVerb = makeRandX('v');
  }
};

277
src/common.js Normal file
View File

@ -0,0 +1,277 @@
import { normalize, nextTick } from './util';
/**
 * factory for main lookup function
 *
 * Looks the word up in the POS index; on a hit, fetches the full data
 * record(s) at the index record's synset offsets.
 *
 * @param pos {string} - n/v/a/r
 * @returns {Function} - lookup function bound to POS
 * @this WordPOS
 */
function lookup(pos) {
  return function(word, callback) {
    var profile = this.options.profile,
      start = profile && new Date(),
      files = this.getFilesFor(pos),
      args = [];

    word = normalize(word);

    // lookup index
    return files.index.lookup(word)
      .then(function(result) {
        if (result) {
          // lookup data
          return files.data.lookup(result.synsetOffset).then(done);
        } else {
          // not found in index
          return done([]);
        }
      })
      .catch(done);

    // done() doubles as success and error handler: an Error yields an
    // empty result set for the callback but is passed through unchanged
    // to the promise chain.
    function done(results) {
      if (results instanceof Error) {
        args.push([], word);
      } else {
        args.push(results, word);
      }
      //console.log(3333, args)
      profile && args.push(new Date() - start);
      nextTick(callback, args);
      return results;
    }
  };
}
/**
 * find a word and prepare its lexical record
 *
 * @param word {string} - search word
 * @param callback {function} - callback function receives result
 * @returns {Promise.<IndexRecord>} resolves with the record, or null on miss
 * @this IndexFile
 *
 * Credit for this routine to https://github.com/NaturalNode/natural
 */
function indexLookup(word, callback) {
  var self = this;
  return new Promise(function(resolve, reject){
    self.find(word, function (record) {
      var indexRecord = null,
        i;
      if (record.status == 'hit') {
        var ptrs = [], offsets = [];

        // index line layout (with the search key restored at [0]):
        // lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt [synset_offset...]
        let n = parseInt(record.tokens[3], 10);
        for (i = 0; i < n; i++) {
          // FIX: was record.tokens[i], which collected lemma/pos/counts
          // instead of the pointer symbols starting at token 4
          ptrs.push(record.tokens[i + 4]);
        }

        n = parseInt(record.tokens[2], 10);
        for (i = 0; i < n; i++) {
          offsets.push(record.tokens[ptrs.length + 6 + i]);
        }

        indexRecord = {
          lemma : record.tokens[0],
          pos : record.tokens[1],
          ptrSymbol : ptrs,
          senseCnt : parseInt(record.tokens[ptrs.length + 4], 10),
          tagsenseCnt : parseInt(record.tokens[ptrs.length + 5], 10),
          synsetOffset: offsets
        };
      }
      callback && callback(indexRecord);
      resolve(indexRecord);
    });
  });
}
/**
 * getX() factory function
 *
 * Builds a function that runs the named isX() predicate over every word
 * in the text and collects the matches.
 *
 * @param isFn {string} - name of an isX() method on WordPOS
 * @returns {Function}
 * @this WordPOS
 */
function get(isFn) {
  return function(text, callback, _noprofile) {
    const profile = this.options.profile && !_noprofile;
    const started = profile && new Date();
    const words = this.parse(text);
    const matches = [];
    const self = this;

    // test one word (profiling suppressed for the internal call)
    const testWord = word =>
      self[isFn].call(self, word, null, /*_noprofile*/ true)
        .then(hit => { hit && matches.push(word); });

    return Promise.all(words.map(testWord)).then(() => {
      const args = [matches];
      profile && args.push(new Date() - started);
      nextTick(callback, args);
      return matches;
    });
  };
}
/**
 * isX() factory function
 *
 * Builds a predicate resolving true when the word appears in the index
 * file for the given POS.
 *
 * @param pos {string} - n/v/a/r
 * @returns {Function}
 * @this WordPOS
 */
function is(pos){
  return function(word, callback, _noprofile) {
    // disable profiling when isX() used internally
    const profile = this.options.profile && !_noprofile;
    const started = profile && new Date();
    const index = this.getFilesFor(pos).index;
    const normalized = normalize(word);

    return index
      .lookup(normalized)
      .then(record => {
        const found = !!record;
        const args = [found, normalized];
        profile && args.push(new Date() - started);
        nextTick(callback, args);
        return found;
      });
  };
}
/**
 * parse a single data file line, returning data object
 *
 * @param line {string} - a single line from WordNet data file
 * @param location {number} - (unused) line offset, kept for dataCheck
 * @returns {object}
 *
 * Credit for this routine to https://github.com/NaturalNode/natural
 */
function lineDataToJSON(line, location) {
  // if (!dataCheck(line, location)) return new Error('Bad data at location ' + location);

  const parts = line.split('| ');
  const tokens = parts[0].split(/\s+/);

  // word count is hexadecimal per the WordNet data-file format
  const wCnt = parseInt(tokens[3], 16);
  const synonyms = [];
  for (let i = 0; i < wCnt; i++) {
    synonyms.push(tokens[4 + i * 2]);
  }

  // pointer section starts right after the word/lex_id pairs
  const ptrOffset = (wCnt - 1) * 2 + 6;
  const ptrCount = parseInt(tokens[ptrOffset], 10);
  const ptrs = [];
  for (let i = 0; i < ptrCount; i++) {
    const base = ptrOffset + i * 4;
    ptrs.push({
      pointerSymbol: tokens[base + 1],
      synsetOffset: tokens[base + 2],
      pos: tokens[base + 3],
      sourceTarget: tokens[base + 4]
    });
  }

  // break "gloss" into definition vs. examples
  const gloss = parts[1];
  const [definition, ...examples] = gloss.split('; ');
  const cleanedExamples = examples.map(
    ex => ex.replace(/\"/g, '').replace(/\s\s+/g, '')
  );

  const lexFilenum = parseInt(tokens[1], 10);

  return {
    synsetOffset: tokens[0],
    lexFilenum: lexFilenum,
    lexName: LEX_NAMES[ lexFilenum ],
    pos: tokens[2],
    wCnt: wCnt,
    lemma: tokens[4],
    synonyms: synonyms,
    lexId: tokens[5],
    ptrs: ptrs,
    gloss: gloss,
    def: definition,
    exp: cleanedExamples
  };
}

/**
 * map of lexFilenum to lex names
 *
 * @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html
 * @type {string[]}
 */
const LEX_NAMES = [
  'adj.all',
  'adj.pert',
  'adv.all',
  'noun.Tops',
  'noun.act',
  'noun.animal',
  'noun.artifact',
  'noun.attribute',
  'noun.body',
  'noun.cognition',
  'noun.communication',
  'noun.event',
  'noun.feeling',
  'noun.food',
  'noun.group',
  'noun.location',
  'noun.motive',
  'noun.object',
  'noun.person',
  'noun.phenomenon',
  'noun.plant',
  'noun.possession',
  'noun.process',
  'noun.quantity',
  'noun.relation',
  'noun.shape',
  'noun.state',
  'noun.substance',
  'noun.time',
  'verb.body',
  'verb.change',
  'verb.cognition',
  'verb.communication',
  'verb.competition',
  'verb.consumption',
  'verb.contact',
  'verb.creation',
  'verb.emotion',
  'verb.motion',
  'verb.perception',
  'verb.possession',
  'verb.social',
  'verb.stative',
  'verb.weather',
  'adj.ppl'
];
export {
indexLookup,
is,
get,
lineDataToJSON,
LEX_NAMES,
lookup
}

56
src/util.js Normal file
View File

@ -0,0 +1,56 @@
let stopwords = require('../lib/natural/util/stopwords').words;
let stopwordsStr = makeStopwordString(stopwords);
/**
 * Builds a space-delimited stopword string (" a about ... ") suitable for
 * fast indexOf/includes membership tests (see isStopword).
 * @param {string[]} stopwords
 * @returns {string}
 */
function makeStopwordString(stopwords) {
  return ` ${stopwords.join(' ')} `;
}
// setImmediate executes callback AFTER promise handlers.
// Without it, exceptions in callback may be caught by Promise.
/**
 * Invokes fn(...args) deferred outside the current promise chain so an
 * exception thrown by the callback is not swallowed as a rejection.
 * Falls back to setTimeout(0) where setImmediate is unavailable (browsers).
 */
function nextTick(fn, args) {
  if (!fn) return;
  // FIX: was a synchronous fn.apply(), contradicting the comment above --
  // a callback throwing inside a .then() handler was caught by the Promise.
  const defer = typeof setImmediate === 'function'
    ? setImmediate
    : (f) => setTimeout(f, 0);
  defer(() => fn.apply(null, args));
}
/**
 * Normalizes a word for index lookup: lowercase, whitespace runs -> "_".
 * @param {string} word
 * @returns {string}
 */
function normalize(word) {
  const lowered = word.toLowerCase();
  return lowered.replace(/\s+/g, '_');
}
/**
 * True when word occurs in a space-delimited stopword string
 * (see makeStopwordString).
 * @param {string} stopwords - " w1 w2 ... " string
 * @param {string} word
 * @returns {boolean}
 */
function isStopword(stopwords, word) {
  return stopwords.includes(` ${word} `);
}
/**
 * Splits a string into word tokens on runs of non-word characters.
 * Note: leading/trailing non-word chars yield empty-string tokens.
 * @param {string} str
 * @returns {string[]}
 */
function tokenizer(str) {
  return str.split(/\W+/);
}
/**
 * Returns a copy of arr with duplicates removed (first occurrence wins).
 * @param {Array} arr
 * @returns {Array}
 */
function uniq(arr) {
  return [...new Set(arr)];
}
/**
 * @param {*} s
 * @returns {boolean} true when s is a primitive string
 */
function isString(s) {
  return typeof s === 'string';
}
/**
 * Inverse of filter: keeps the items for which predicate is falsy.
 * @param {Array} arr
 * @param {Function} predicate
 * @returns {Array}
 */
function reject(arr, predicate) {
  const keep = item => !predicate(item);
  return arr.filter(keep);
}
/**
 * Tokenizes text into unique words, removing stopwords per this.options.
 * An array argument is returned as-is (no dedupe or stopword filtering).
 * @param {string|Array} text
 * @returns {Array} word tokens
 * @this WordPOS
 */
function prepText(text) {
  if (Array.isArray(text)) return text;
  const words = uniq(tokenizer(text));
  const stops = this.options.stopwords;
  if (!stops) return words;
  const stopString = isString(stops) ? stops : stopwordsStr;
  return reject(words, isStopword.bind(null, stopString));
}
export {
nextTick,
normalize,
tokenizer,
prepText,
makeStopwordString
}