From 0abe5a90104dbb449f8155b65439f390892a1c76 Mon Sep 17 00:00:00 2001 From: Moos Date: Sat, 9 Apr 2016 13:46:36 -0700 Subject: [PATCH] Added seek() method and lexName property. Bump to 1.1. --- README.md | 29 ++++++++++++--- bin/README.md | 45 ++++++++++++++++++++++ bin/wordpos-cli.js | 51 +++++++++++++++++-------- package.json | 2 +- src/dataFile.js | 89 +++++++++++++++++++++++++++++++++++++++----- src/wordpos.js | 27 +++++++++++++- test/wordpos_test.js | 85 +++++++++++++++++++++++++++++++++++++++++- 7 files changed, 296 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 79bfc8c..7a8eda7 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ wordpos.isAdjective('awesome', function(result){ // true 'awesome' ``` -Command-line: (see [CLI](bin)) +Command-line: (see [CLI](bin) for full command list) ```bash $ wordpos def git git @@ -71,7 +71,7 @@ WordPOS.defaults = { stopwords: true }; ``` -To override, pass an options hash to the constructor. With the `profile` option, all callbacks receive a last argument that is the execution time in msec of the call. +To override, pass an options hash to the constructor. With the `profile` option, most callbacks receive a last argument that is the execution time in msec of the call. ```js wordpos = new WordPOS({profile: true}); @@ -165,19 +165,33 @@ Example: ```js wordpos.lookupAdjective('awesome', console.log); // output: -[ { synsetOffset: 1282510, +[ { synsetOffset: 1285602, lexFilenum: 0, + lexName: 'adj.all', pos: 's', wCnt: 5, lemma: 'amazing', synonyms: [ 'amazing', 'awe-inspiring', 'awesome', 'awful', 'awing' ], lexId: '0', ptrs: [], - gloss: 'inspiring awe or admiration or wonder; awing majesty, so vast, so high, so silent" ' + gloss: 'inspiring awe or admiration or wonder; [...] awing majesty, so vast, so high, so silent" ' + def: 'inspiring awe or admiration or wonder', + ... } ], 'awesome' ``` -In this case only one lookup was found, but there could be several. 
+In this case only one lookup was found, but there could be several. +Version 1.1 adds the `lexName` property, which maps the lexFilenum to one of [45 lexicographer domains](https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html). + + +#### seek(offset, pos, callback) +Version 1.1 introduces the seek method to look up a record directly from the synsetOffset for a given POS. Unlike other methods, callback (if provided) receives `(err, result)` arguments. + +Example: +```js +wordpos.seek(1285602, 'a').then(console.log) +// same record as the one found by wordpos.lookupAdjective('awesome', console.log); +``` #### rand(options, callback) #### randNoun(options, callback) @@ -214,6 +228,7 @@ wordpos.rand({starsWith: 'zzz'}, console.log) Getting random POS (`randNoun()`, etc.) is generally faster than `rand()`, which may look at multiple POS files until `count` requirement is met. + #### parse(text) Returns tokenized array of words in `text`, less duplicates and stopwords. This method is called on all getX() calls internally. @@ -274,6 +289,10 @@ See [bench/README](bench). ## Changes +1.1 - + - added seek() method + - added lexName property + 1.0.1 - Removed npm dependency on Natural. Certain modules are included in /lib. - Add support for ES6 Promises. diff --git a/bin/README.md b/bin/README.md index a7c5ea9..894aec2 100644 --- a/bin/README.md +++ b/bin/README.md @@ -19,6 +19,8 @@ $ wordpos syn lookup synonyms exp lookup examples + + seek get record at synset offset. Must include one of POS -n, -a, -v, -r rand get random words (starting with [word]). If first arg is a number, returns that many random words. Valid options are -b, -f, -j, -s, -i. 
@@ -222,6 +224,49 @@ $ wordpos rand --adj foot foot-shaped ``` +#### Seek a synset offset +Seek offset as adjective: +```sh +$ wordpos seek 1285602 -a +{ '1285602': + { synsetOffset: 1285602, + lexFilenum: 0, + lexName: 'adj.all', + pos: 's', + wCnt: 5, + lemma: 'amazing', + synonyms: [ 'amazing', 'awe-inspiring', 'awesome', 'awful', 'awing' ], + lexId: '0', + ptrs: + [ { pointerSymbol: '&', + synsetOffset: 1285124, + pos: 'a', +... +``` + +Same as verb (not found!): +```sh +$ wordpos seek 1285602 -v +{ '1285602': {} } +``` + +Multiple offsets from same POS: +```sh +$ wordpos seek 1285602 1285124 -a +{ '1285124': + { synsetOffset: 1285124, + lexFilenum: 0, + ... + }, + '1285602': + { synsetOffset: 1285602, + lexFilenum: 0, + ... + } +``` +Note that results are always returned as `--full` format. To get compact JSON format, add the `-j` option. + + #### Stopwords List stopwords (brief): ```bash diff --git a/bin/wordpos-cli.js b/bin/wordpos-cli.js index 18f983a..6efaff1 100644 --- a/bin/wordpos-cli.js +++ b/bin/wordpos-cli.js @@ -5,9 +5,9 @@ * command-line interface to wordpos * * Usage: - * wordpos [options] + * wordpos [options] * - * Copyright (c) 2012 mooster@42at.com + * Copyright (c) 2012, 2016 mooster@42at.com * https://github.com/moos/wordpos * * Released under MIT license @@ -17,6 +17,7 @@ var program = require('commander'), _ = require('underscore')._, fs = require('fs'), POS = {noun:'Noun', adj:'Adjective', verb:'Verb', adv:'Adverb'}, + POS_abbr = {noun:'n', adj:'a', verb:'v', adv:'r'}, version = JSON.parse(fs.readFileSync(__dirname + '/../package.json', 'utf8')).version, rawCmd = '', RAND_PLACEHOLDER = '__', @@ -67,6 +68,19 @@ program.command('exp') exec.apply(this, arguments); }); +program.command('seek') + .description('get record at synset offset. 
Must include one of POS -n, -a, -v, -r') + .action(function(){ + var one = _.chain(program).pick('noun adj adv verb'.split(' ')).countBy().value().true; + if (!one || one > 1) { + console.error('Must include one and only one of -n, -a, -v, -r'); + process.exit(-1); + } + // force full output mode + program.full = 1; + exec.apply(this, arguments); + }); + program.command('rand') .description('get random words (starting with [word]). If first arg is a number, returns ' + 'that many random words. Valid options are -b, -f, -j, -s, -i.') @@ -80,12 +94,10 @@ program.command('rand') args.shift(); program.num = num; } - // no startsWith given, add a placeholder if (args.length === 1){ args.unshift(RAND_PLACEHOLDER); } - exec.apply(this, args); }); @@ -150,23 +162,24 @@ function read_stdin(callback) { } function optToFn() { - var fns = _.reject(POS, function(fn, opt) { return !program[opt] }); + var + map = cmd === 'seek' ? POS_abbr : POS, + fns = _.reject(map, function(fn, opt) { return !program[opt] }); if (!fns.length && cmd === 'rand') return fns = ['']; // run rand() - if (!fns.length) fns = _.values(POS); //default to all if no POS given + if (!fns.length) fns = _.values(map); //default to all if no POS given return fns; } - function run(data) { var opts = {stopwords: !program.withStopwords}, wordpos = new WordPos(opts), - words = wordpos.parse(data), + seek = cmd === 'seek', + words = seek ? data.split(' ') : wordpos.parse(data), fns = optToFn(), - plural = (cmd=='get' ? 's':''), + plural = (cmd === 'get' ? 's':''), results = {}, - finale = _.after( - plural ? fns.length : words.length * fns.length, + finale = _.after(plural ? 
fns.length : words.length * fns.length, _.bind(output, null, results)), collect = function(what, result, word){ if (word) { // lookup @@ -184,13 +197,20 @@ function run(data) { _(fns).each(function(fn){ var method = cmd + fn + plural, cb = _.bind(collect, null, fn); - if (cmd == 'get') { + if (cmd === 'get') { wordpos[method](words, cb); - } else if (cmd == 'rand') { + } else if (cmd === 'rand') { if (words[0] === RAND_PLACEHOLDER) words[0] = ''; words.forEach(function(word){ wordpos[method]({startsWith: word, count: program.num || 1}, cb); }); + } else if (seek) { + words.forEach(function(offset){ + wordpos.seek(offset, fn, function(err, result){ + results[offset.trim()] = result; + finale(); + }); + }); } else { words.forEach(function(word){ wordpos[method](word, cb); @@ -227,8 +247,9 @@ function sprint(results) { }, ''); default: return _.reduce(results, function(memo, v, k){ - var pre = program.brief ? '' : util.format('# %s %d:%s', k, v.length, sep); - return memo + (v.length && util.format('%s%s%s\n', pre, v.join(sep), sep) || ''); + var pre = program.brief ? '' : util.format('# %s %d:%s', k, v.length, sep), + res = v.length ? 
v.join(sep) : ''; + return memo + (v.length && util.format('%s%s%s\n', pre, res, sep) || ''); }, ''); } diff --git a/package.json b/package.json index d5d4edd..adb71f2 100644 --- a/package.json +++ b/package.json @@ -11,7 +11,7 @@ "verbs" ], "description": "wordpos is a set of part-of-speech utilities for Node.js using the WordNet database.", - "version": "1.0.1", + "version": "1.1", "homepage": "https://github.com/moos/wordpos", "engines": { "node": ">=0.12" diff --git a/src/dataFile.js b/src/dataFile.js index fa01208..bbf4f23 100644 --- a/src/dataFile.js +++ b/src/dataFile.js @@ -13,6 +13,17 @@ var fs = require('fs'), path = require('path'), _ = require('underscore'); +/** + * sanity check read data - line must start with zero-padded location + * + * @param line {string} - line data read + * @return {boolean} true if line data is good + */ +function dataCheck(line, location) { + var pad = '00000000', // 8 zeros + padded = String(pad + location).slice( - pad.length); + return line.indexOf(padded) === 0; +} /** * parse a single data file line, returning data object @@ -22,7 +33,9 @@ var fs = require('fs'), * * Credit for this routine to https://github.com/NaturalNode/natural */ -function lineDataToJSON(line) { +function lineDataToJSON(line, location) { + if (!dataCheck(line, location)) return new Error('Bad data at location ' + location); + var data = line.split('| '), tokens = data[0].split(/\s+/), ptrs = [], @@ -48,6 +61,7 @@ function lineDataToJSON(line) { var glossArray = data[1].split("; "); var definition = glossArray[0]; var examples = glossArray.slice(1); + var lexFilenum = parseInt(tokens[1], 10); for (var k = 0; k < examples.length; k++) { examples[k] = examples[k].replace(/\"/g,'').replace(/\s\s+/g,''); @@ -55,7 +69,8 @@ function lineDataToJSON(line) { return { synsetOffset: parseInt(tokens[0], 10), - lexFilenum: parseInt(tokens[1], 10), + lexFilenum: lexFilenum, + lexName: DataFile.LEX_NAMES[ lexFilenum ], pos: tokens[2], wCnt: wCnt, lemma: tokens[4], 
@@ -85,12 +100,12 @@ function readLocation(location, callback) { readChunk(location, function(err, count) { if (err) { - console.log(err); + //console.log(err); callback(err); return; } //console.log(' read %d bytes at <%d>', count, location); - callback(null, lineDataToJSON(str)); + callback(null, lineDataToJSON(str, location)); }); function readChunk(pos, cb) { @@ -98,12 +113,13 @@ function readLocation(location, callback) { str += buffer.toString('ascii'); var eol = str.indexOf('\n'); //console.log(' -- read %d bytes at <%d>', count, pos, eol); - if (eol === -1 && len < file.maxLineLength) { + if (count && eol === -1 && len < file.maxLineLength) { // continue reading return readChunk(pos + count, cb); } str = str.substr(0, eol); + if (str === '' && !err) err = new Error('no data at offset ' + pos); cb(err, count); }); } @@ -112,15 +128,16 @@ function readLocation(location, callback) { /** * main lookup function * - * @param record {object} - record to lookup, obtained from index.find() + * @param offsets {array} - array of offsets to lookup (obtained from index.find()) * @param callback{function} (optional) - callback function * @returns {Promise} */ -function lookup(record, callback) { +function lookup(offsets, callback) { var results = [], self = this, - offsets = record.synsetOffset; + single = !_.isArray(offsets); + if (single) offsets = [offsets]; return new Promise(function(resolve, reject) { offsets .map(function (offset) { @@ -134,9 +151,10 @@ function lookup(record, callback) { function done(lastResult) { closeFile(); if (lastResult instanceof Error) { - callback && callback(lastResult, []); + callback && callback(lastResult, single ? 
{} :[]); reject(lastResult); } else { + if (single) results = results[0]; callback && callback(null, results); resolve(results); } @@ -233,5 +251,58 @@ DataFile.MAX_LINE_LENGTH = { adv: 638 }; +/** + * map of lexFilenum to lex names + * + * @see https://wordnet.princeton.edu/wordnet/man/lexnames.5WN.html + * @type {string[]} + */ +DataFile.LEX_NAMES = [ + 'adj.all', + 'adj.pert', + 'adv.all', + 'noun.Tops', + 'noun.act', + 'noun.animal', + 'noun.artifact', + 'noun.attribute', + 'noun.body', + 'noun.cognition', + 'noun.communication', + 'noun.event', + 'noun.feeling', + 'noun.food', + 'noun.group', + 'noun.location', + 'noun.motive', + 'noun.object', + 'noun.person', + 'noun.phenomenon', + 'noun.plant', + 'noun.possession', + 'noun.process', + 'noun.quantity', + 'noun.relation', + 'noun.shape', + 'noun.state', + 'noun.substance', + 'noun.time', + 'verb.body', + 'verb.change', + 'verb.cognition', + 'verb.communication', + 'verb.competition', + 'verb.consumption', + 'verb.contact', + 'verb.creation', + 'verb.emotion', + 'verb.motion', + 'verb.perception', + 'verb.possession', + 'verb.social', + 'verb.stative', + 'verb.weather', + 'adj.ppl' +]; module.exports = DataFile; diff --git a/src/wordpos.js b/src/wordpos.js index 29a286f..b5bb23a 100644 --- a/src/wordpos.js +++ b/src/wordpos.js @@ -63,7 +63,7 @@ function lookup(pos) { .then(function(result) { if (result) { // lookup data - return files.data.lookup(result).then(done); + return files.data.lookup(result.synsetOffset).then(done); } else { // not found in index return done([]); @@ -362,6 +362,31 @@ wordposProto.getVerbs = get('isVerb'); wordposProto.parse = prepText; +/** + * seek - get record at offset for pos + * + * @param offset {number} - synset offset + * @param pos {string} - POS a/r/n/v + * @param callback {function} - optional callback + * @returns Promise + */ +wordposProto.seek = function(offset, pos, callback){ + offset = Number(offset); + if (_.isNaN(offset) || offset <= 0) return error('offset must be 
valid positive number.'); + + var data = this.getFilesFor(pos).data; + if (!data) return error('Incorrect POS - 2nd argument must be a, r, n or v.'); + + return data.lookup(offset, callback); + + function error(msg) { + var err = new Error(msg); + callback && callback(err, {}); + return Promise.reject(err); + } +}; + + /** * access to WordNet DB * @type {object} diff --git a/test/wordpos_test.js b/test/wordpos_test.js index 9eb1fce..fa5e8f3 100644 --- a/test/wordpos_test.js +++ b/test/wordpos_test.js @@ -21,6 +21,7 @@ var chai = require('chai'), + _ = require('underscore'), assert = chai.assert, WordPOS = require('../src/wordpos'), wordpos = new WordPOS({profile: false}); @@ -35,7 +36,9 @@ var str = "The angry bear chased the frightened little squirrel", adverbs: [ 'little' ], rest: [ 'The' ] }, - garble = 'garblegarble'; // expect not to find word + garble = 'garblegarble', // expect not to find word + offset = 1285602, + offset_pos ='a'; @@ -356,6 +359,62 @@ describe('randX()...', function() { }); +describe('seek()...', function() { + + it('should handle bad offset', function(done) { + wordpos.seek('foobar', 'a', function(err, result){ + assert(err instanceof Error); + assert.equal(err.message, 'offset must be valid positive number.'); + done(); + }); + }); + + it('should handle wrong offset', function(done) { + var bad_offset = offset + 1; + wordpos.seek(bad_offset, offset_pos, function(err, result) { + assert(err instanceof Error); + assert.equal(err.message, 'Bad data at location ' + bad_offset); + assert.deepEqual(result, {}); + done(); + }); + }); + + it('should handle very large offset', function(done) { + var bad_offset = offset + 100000000; + wordpos.seek(bad_offset, offset_pos, function(err, result) { + assert(err instanceof Error); + assert.equal(err.message, 'no data at offset ' + bad_offset); + assert.deepEqual(result, {}); + done(); + }); + }); + + it('should handle bad pos', function(done) { + wordpos.seek(offset, 'g', function(err, result) { + 
assert(err instanceof Error); + assert(/Incorrect POS/.test(err.message)); + done(); + }); + }); + + it('should handle wrong pos', function(done) { + wordpos.seek(offset, 'v', function(err, result){ + assert.equal(err.message, 'Bad data at location ' + offset); + done(); // complete only after the assertion above has actually run + }); + }); + + it('should seek offset', function(done) { + wordpos.seek(offset, offset_pos, function(err, result) { + assert.equal(result.synsetOffset, offset); + assert.equal(result.pos, 's'); + assert.equal(result.lemma, 'amazing'); + done(); + }); + }); +}); + + describe('Promise pattern', function() { @@ -413,4 +472,28 @@ describe('Promise pattern', function() { assert.equal(result[0].indexOf('foo'), 0); }); }); + + it('seek()', function () { + return wordpos.seek(offset, offset_pos).then(function (result) { + assert.equal(result.synsetOffset, offset); + assert.equal(result.pos, 's'); + assert.equal(result.lemma, 'amazing'); + + }); + }); + + it('seek() - wrong offset', function () { + return wordpos.seek(offset + 1, offset_pos).catch(function (err) { + assert(err instanceof Error); + assert.equal(err.message, 'Bad data at location ' + (offset+1)); + }); + }); + + it('seek() - bad offset', function () { + return wordpos.seek('foobar', offset_pos).catch(function (err) { + assert(err instanceof Error); + assert.equal(err.message, 'offset must be valid positive number.'); + }); + }); + }); \ No newline at end of file