From 2548161bf6b43d2bb14054f00ae54c0161cfbc5a Mon Sep 17 00:00:00 2001 From: moos Date: Thu, 24 May 2012 01:11:55 -0700 Subject: [PATCH] v0.1.5: added validate spec, new dir structure validate_spec.js runs isX() on ALL index words. isX() callback now receives lookup word as second argument. wordpos-bench uses 512 word corpus. --- README.md | 18 ++-- text-128.txt => bench/text-128.txt | 0 bench/text-512.txt | 50 +++++++++++ wordpos-bench.js => bench/wordpos-bench.js | 20 ++--- package.json | 4 +- spec/validate.js | 98 ++++++++++++++++++++++ spec/validate_spec.js | 47 +++++++++++ wordpos_spec.js => spec/wordpos_spec.js | 32 ++++--- {tools => src}/fastIndex.js | 11 ++- wordpos.js => src/wordpos.js | 6 +- tools/stat.js | 22 +++-- 11 files changed, 260 insertions(+), 48 deletions(-) rename text-128.txt => bench/text-128.txt (100%) create mode 100644 bench/text-512.txt rename wordpos-bench.js => bench/wordpos-bench.js (85%) create mode 100644 spec/validate.js create mode 100644 spec/validate_spec.js rename wordpos_spec.js => spec/wordpos_spec.js (91%) rename {tools => src}/fastIndex.js (92%) rename wordpos.js => src/wordpos.js (98%) diff --git a/README.md b/README.md index 73f7f64..a03f3c6 100644 --- a/README.md +++ b/README.md @@ -124,20 +124,22 @@ wordpos.isAdjective(word, callback) -- callback receives result (true/false) if wordpos.isAdverb(word, callback) -- callback receives result (true/false) if word is an adverb. ``` +isX() methods return the looked-up word as the second argument to the callback. + Examples: ```js wordpos.isVerb('fish', console.log); -// true +// true 'fish' wordpos.isNoun('fish', console.log); -// true +// true 'fish' wordpos.isAdjective('fishy', console.log); -// true +// true 'fishy' wordpos.isAdverb('fishly', console.log); -// false +// false 'fishly' ``` ### lookupX()... @@ -182,7 +184,7 @@ wordpos.lookup('great', console.log); // ... ``` -### Other methods +### Other methods/properties ``` WordPOS.WNdb -- access to the WNdb object @@ -195,7 +197,7 @@ wordpos.parse(str) -- returns tokenized array of words, less duplicates and stop ```js WordPOS.defaults = { /** - * enable profiling, time in msec returned as second argument in callback + * enable profiling, time in msec returned as last argument in callback */ profile: false, @@ -210,10 +212,10 @@ To override, pass an options hash to the constructor. With the `profile` option, ```js wordpos = new WordPOS({profile: true}); wordpos.isAdjective('fast', console.log); - // true 29 + // true 'fast' 29 ``` -Version 0.1.4 introduces `fastIndex` option. This uses a secondary index on the index files and is much faster. It is on by default. Secondary index files are generated at install time and placed in the same directory as WNdb.path. Details can be found in tool/stat.js. +Version 0.1.4 introduces `fastIndex` option. This uses a secondary index on the index files and is much faster. It is on by default. Secondary index files are generated at install time and placed in the same directory as WNdb.path. Details can be found in tools/stat.js. Benchmark diff --git a/text-128.txt b/bench/text-128.txt similarity index 100% rename from text-128.txt rename to bench/text-128.txt diff --git a/bench/text-512.txt b/bench/text-512.txt new file mode 100644 index 0000000..9fe8f1b --- /dev/null +++ b/bench/text-512.txt @@ -0,0 +1,50 @@ +That's why, working with our military leaders, I have proposed a new +defense strategy that ensures we maintain the finest military in the +world, while saving nearly half a trillion dollars in our budget. To +stay one step ahead of our adversaries, I have already sent this +Congress legislation that will secure our country from the growing +danger of cyber-threats. + +Above all, our freedom endures because of the men and women in uniform +who defend it. As they come home, we must serve them as well as they +served us. That includes giving them the care and benefits they have +earned – which is why we've increased annual VA spending every year +I've been President. And it means enlisting our veterans in the work +of rebuilding our Nation. + +With the bipartisan support of this Congress, we are providing new tax +credits to companies that hire vets. Michelle and Jill Biden have worked +with American businesses to secure a pledge of 135,000 jobs for veterans +and their families. And tonight, I'm proposing a Veterans Job Corps +that will help our communities hire veterans as cops and firefighters, +so that America is as strong as those who defend her. + +Which brings me back to where I began. Those of us who've been sent +here to serve can learn from the service of our troops. When you put on +that uniform, it doesn't matter if you're black or white; Asian or +Latino; conservative or liberal; rich or poor; gay or straight. When +you're marching into battle, you look out for the person next to you, +or the mission fails. When you're in the thick of the fight, you rise +or fall as one unit, serving one Nation, leaving no one behind. + +One of my proudest possessions is the flag that the SEAL Team took with +them on the mission to get bin Laden. On it are each of their names. +Some may be Democrats. Some may be Republicans. But that doesn't +matter. Just like it didn't matter that day in the Situation Room, +when I sat next to Bob Gates – a man who was George Bush's defense +secretary; and Hillary Clinton, a woman who ran against me for +president. + +All that mattered that day was the mission. No one thought about +politics. No one thought about themselves. One of the young men involved +in the raid later told me that he didn't deserve credit for the mission. +It only succeeded, he said, because every single member of that unit did +their job – the pilot who landed the helicopter that spun out of +control; the translator who kept others from entering the compound; the +troops who separated the women and children from the fight; the SEALs +who charged up the stairs. More than that, the mission only succeeded +because every member of that unit trusted each other – because you +can't charge up those stairs, into darkness and danger, unless you know +that there's someone behind you, watching your back. + +So it is with America. Each time I look at that flag, I'm reminded diff --git a/wordpos-bench.js b/bench/wordpos-bench.js similarity index 85% rename from wordpos-bench.js rename to bench/wordpos-bench.js index 451129d..2821c74 100644 --- a/wordpos-bench.js +++ b/bench/wordpos-bench.js @@ -2,7 +2,7 @@ var uubench = require('uubench'), // from: https://github.com/moos/uubench fs = require('fs'), _ = require('underscore')._, - WordPOS = require('./wordpos'), + WordPOS = require('../src/wordpos'), wordpos = new WordPOS(); suite = new uubench.Suite({ @@ -23,6 +23,7 @@ suite = new uubench.Suite({ }, done: function(time){ + console.log('looked up %d words', nwords); console.log('done in %d msecs', time ); }, @@ -39,13 +40,14 @@ function out(res){ var text1 = 'laksasdf', - text128 = fs.readFileSync('text-128.txt', 'utf8'), - text, +// text128 = fs.readFileSync('text-128.txt', 'utf8'), + text512 = fs.readFileSync('text-512.txt', 'utf8'), + text, nwords, pos; function getPOS(next){ - wordpos.getPOS(text, function(res){ + nwords = wordpos.getPOS(text, function(res){ pos = res; next(); }); @@ -79,9 +81,6 @@ function getAdverbs(next){ }); } -/* - * one word - */ suite.section('--1 word--', function(next){ text = text1; next(); @@ -93,12 +92,9 @@ suite.bench('getAdjectives', getAdjectives); suite.bench('getAdverbs', getAdverbs); -/* - * 128 words - */ -suite.section('--128 words--', function(next){ +suite.section('--512 words--', function(next){ suite.options.iterations = 1; - text = text128; + text = text512; next(); }); suite.bench('getPOS', getPOS); diff --git a/package.json b/package.json index 9c7c5c6..2009050 100644 --- a/package.json +++ b/package.json @@ -3,7 +3,7 @@ "author": "Moos ", "keywords": ["natural", "language", "wordnet", "pos"], "description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.", - "version": "0.1.4", + "version": "0.1.5", "homepage": "https://github.com/moos/wordpos", "engines": { "node": ">=0.4.10" @@ -20,7 +20,7 @@ "type" : "git", "url" : "git://github.com/moos/wordpos.git" }, - "main": "./wordpos.js", + "main": "./src/wordpos.js", "scripts": { "postinstall": "node tools/stat.js --no-stats index.adv index.adj index.verb index.noun" } diff --git a/spec/validate.js b/spec/validate.js new file mode 100644 index 0000000..585e91b --- /dev/null +++ b/spec/validate.js @@ -0,0 +1,98 @@ +/** + * validate.js + * + * read each index. file, and look up using wordpos and confirm find all words + * + * Usage: + * node validate index.adv + * + * Copyright (c) 2012 mooster@42at.com + * https://github.com/moos/wordpos + * + * Released under MIT license + */ +var + WordPos = require('../src/wordpos'), + WNdb = WordPos.WNdb, + util = require('util'), + BufferedReader = require ("../tools/buffered-reader"), + _ = require('underscore')._, + path = require('path'), + results = {}, + puts = _.compose(function(a){ process.stdout.write(a)}, util.format); + +if (process.argv.length < 3) return usage(); + +var basename = process.argv.slice(2).shift(), + indexFile = path.join(WNdb.path, basename); + +if (!path.existsSync(indexFile)) { + console.error('Error: no such file %s', indexFile); + process.exit(10); +} + +function usage() { + console.log('#Usage:\nnode stat index.adv'); + process.exit(1); +} + +function pos(basename) { + return basename.match(/index\.(.*)/)[1]; +} + +function isX(basename) { + return {noun:'isNoun', verb:'isVerb', adj:'isAdjective', adv:'isAdverb'}[pos(basename)]; +} + +var + wordpos = new WordPos(), + bin = results[basename] = {total:0, notfound:0, notlist:[]}, + isFn = wordpos[isX(basename)], + words = [], + count = 0; + +puts('\nReading %s:\n', indexFile); +new BufferedReader (indexFile, {encoding: "utf8", _bufferSize: 170 * 1024 }) + /* + * reads 16 KB chunks by default... there's an inherent nextTick() between chunks in the underlying streaming fns. + */ + .on ("error", function (error){ + console.error("error: %s", indexFile, error); + }) + .on ("line", function (line, offset){ + // skip license info + if (line[0] == ' ') return; + + //if (count > 50) return this.interrupt(); + var word = line.substring(0, line.indexOf(' ')); + ++count; + words.push(word); + }) + .on ("end", function (){ + puts('%d words, processing...', count); + words.forEach(function(word, i) { + isFn.call(wordpos, word, callback); + }); + }) + .read(); + + +function callback(result, word) { + ++bin.total; + !result && (++bin.notfound, bin.notlist.push(word)); + if (bin.total == count) done(); +} + +function done() { + if (bin.notfound == 0) { + console.log('OK!'); + process.exit(0); + } + else { + var n = 25; + console.log('%d not found\n%s', bin.notfound, bin.notlist.slice(0,n).join('\n')); + (bin.notlist.length > n) && console.log(' +%d more', bin.notlist.length - n); + process.nextTick(function(){ process.exit(1) }); + } +} + diff --git a/spec/validate_spec.js b/spec/validate_spec.js new file mode 100644 index 0000000..e492e43 --- /dev/null +++ b/spec/validate_spec.js @@ -0,0 +1,47 @@ +/** + * validate_spec.js + * + * Run validate on all four main index files + * + * Usage: + * npm install jasmine-node -g + * jasmine-node validate_spec.js --verbose + * + * Copyright (c) 2012 mooster@42at.com + * https://github.com/moos/wordpos + * + * Released under MIT license + */ +var + exec = require('child_process').exec; + +describe('validate isX() using fastIndex', function() { + + it('should validate index.noun', function() { + exec('node validate index.noun', callback); + asyncSpecWait(); + }); + + it('should validate index.verb', function() { + exec('node validate index.verb', callback); + asyncSpecWait(); + }); + + it('should validate index.adv', function() { + exec('node validate index.adv', callback); + asyncSpecWait(); + }); + + it('should validate index.adj', function() { + exec('node validate index.adj', callback); + asyncSpecWait(); + }); + +}); + +function callback(error, stdout, stderr) { + expect(error).toBe(null); + console.log(stdout); + console.error(stderr); + asyncSpecDone(); +} \ No newline at end of file diff --git a/wordpos_spec.js b/spec/wordpos_spec.js similarity index 91% rename from wordpos_spec.js rename to spec/wordpos_spec.js index c28347e..b0b4281 100644 --- a/wordpos_spec.js +++ b/spec/wordpos_spec.js @@ -1,11 +1,18 @@ -// npm install jasmine-node -g -// jasmine-node wordpos_spec.js --verbose - -/* Note: 'dict' folder should contain WordNet files. - * Download and unpack manually from http://wordnet.princeton.edu/wordnet/download/current-version/ +/** + * wordpos_spec.js + * + * spec file for main wordpos functionality + * + * Usage: + * npm install jasmine-node -g + * jasmine-node wordpos_spec.js --verbose + * + * Copyright (c) 2012 mooster@42at.com + * https://github.com/moos/wordpos + * + * Released under MIT license */ - -var WordPOS = require('./wordpos'), +var WordPOS = require('../src/wordpos'), wordpos = new WordPOS(); var str = "The angry bear chased the frightened little squirrel", @@ -19,7 +26,7 @@ var str = "The angry bear chased the frightened little squirrel", garble = 'garblegarble'; // expect not to find word -describe('get POS', function() { +describe('getX()...', function() { beforeEach(function() { this.addMatchers({ @@ -78,7 +85,7 @@ describe('get POS', function() { }); }); -describe('is POS', function() { +describe('isX()...', function() { it('should check if noun', function() { wordpos.isNoun(expected.nouns[0], function(result) { expect(result).toBeTruthy(); @@ -109,7 +116,7 @@ describe('is POS', function() { }); }); -describe('is !POS', function() { +describe('!isX()...', function() { it('should check if !noun', function() { wordpos.isNoun(garble, function(result) { expect(result).not.toBeTruthy(); @@ -140,7 +147,7 @@ describe('is !POS', function() { }); }); -describe('lookup POS', function() { +describe('lookupX()...', function() { it('should lookup noun', function() { wordpos.lookupNoun('squirrel', function(result) { expect(result[0].pos).toBe('n'); @@ -196,7 +203,8 @@ describe('profile option', function() { var wp = new WordPOS({profile : true}); it('should return time argument for isX()', function(){ - wp.isNoun(garble, function(result, time) { + wp.isNoun(garble, function(result, word, time) { + expect(word).toEqual(garble); expect(time).toBeDefined(); asyncSpecDone(); }); diff --git a/tools/fastIndex.js b/src/fastIndex.js similarity index 92% rename from tools/fastIndex.js rename to src/fastIndex.js index 1e245e7..31397ae 100644 --- a/tools/fastIndex.js +++ b/src/fastIndex.js @@ -6,7 +6,7 @@ * Copyright (c) 2012 mooster@42at.com * https://github.com/moos/wordpos * - * Released under MIT license * + * Released under MIT license */ var _ = require('underscore')._, @@ -19,6 +19,7 @@ var _ = require('underscore')._, * load fast index bucket data * @param dir - dir path of index files * @param name - name of index file, eg, 'index.verb' + * @returns Object - fast index data object */ function loadFastIndex(dir, name) { var jsonFile = path.join(dir, 'fast-' + name + '.json'), @@ -37,6 +38,7 @@ function loadFastIndex(dir, name) { * @param key - 3-char key into fast index * @param index - index file name (eg, 'index.verb') * @param callback - function receives buffer of data read + * @returns none */ function readIndexForKey(key, index, callback) { var data = index.fastIndex, @@ -55,6 +57,9 @@ function readIndexForKey(key, index, callback) { /** * function that overrides WordNet's IndexFile.find() + * + * calls to same bucket are queued for callback. + * * @param search - word to search for * @param callback - callback receives found line and tokens * @returns none @@ -67,7 +72,7 @@ function find(search, callback) { args = [search, callback]; var key = search.slice(0, KEY_LENGTH); - if (!(key in data.offsets)) return callback(miss); + if (!(key in data.offsets)) return process.nextTick(function(){ callback(miss) }); // queue up if already reading file for this key if (key in readCallbacks){ @@ -122,7 +127,7 @@ module.exports = { * loads fast index data and return fast index find function * * @param index is the IndexFile instance - * @return function - fast index find or origin find if errors + * @returns function - fast index find or origin find if errors */ find: function(index){ diff --git a/wordpos.js b/src/wordpos.js similarity index 98% rename from wordpos.js rename to src/wordpos.js index 073a426..1b41888 100644 --- a/wordpos.js +++ b/src/wordpos.js @@ -1,5 +1,5 @@ /** -* wordpos +* wordpos.js * * Node.js part-of-speech utilities using natural's WordNet module. * @@ -19,7 +19,7 @@ var _ = require('underscore')._, fastIndex = null; try { - fastIndex = require('./tools/fastIndex'); + fastIndex = require('./fastIndex'); } catch(e) {} function normalize(word) { @@ -59,7 +59,7 @@ function is(pos){ index = this.getIndexFile(pos); word = normalize(word); index.lookup(word, function(record) { - args.push(!!record); + args.push(!!record, word); profile && args.push(new Date() - start); callback.apply(null, args); }); diff --git a/tools/stat.js b/tools/stat.js index 6d429f3..77c0922 100644 --- a/tools/stat.js +++ b/tools/stat.js @@ -1,14 +1,17 @@ /** + * stat.js + * * generate fast index for WordNet index files * * Usage: * node stat [--no-stats] index.adv ... * - * --no-stats prevents writing stat data to file - * Fast index is based on buckets keyed off first THREE characters in the index word, - * eg, 'awesome' goes into bucket 'awe' + * --no-stats prevents writing bucket size statistics to file. * - * Format of the fast index: + * Fast index is based on buckets keyed off first THREE characters in the index word, + * eg, 'awesome' goes into bucket 'awe'. + * + * Format of the fast index JSON object: * { * "firstKey":".22", // first key value * "keyLength":3, // #characters in key @@ -36,9 +39,14 @@ * get offset of key and offset of next key * read index file between the two offsets * binary search read data O(log avg) + * + * Copyright (c) 2012 mooster@42at.com + * https://github.com/moos/wordpos + * + * Released under MIT license */ var - WNdb = require('../wordpos').WNdb, + WNdb = require('../src/wordpos').WNdb, util = require('util'), BufferedReader = require ("./buffered-reader"), _ = require('underscore')._, @@ -109,9 +117,6 @@ _(process.argv.slice(2)).filter(function(arg){ avg = (sum/size).toFixed(2), info = util.format('buckets %d, max %d at %s, sum %d, avg %d, median %d', size, max, maxkey, sum, avg, median); -// console.log(sorted); -// return; - console.log(basename, info); if (stats) { @@ -144,6 +149,7 @@ _(process.argv.slice(2)).filter(function(arg){ }; fs.writeFileSync(jsonFile, JSON.stringify(data), 'utf8'); + console.log(' wrote %s\n', jsonFile); }) .read(); });