v0.1.5: added validate spec, new dir structure

validate_spec.js runs isX() on ALL index words.
isX() callback now receives lookup word as second argument.
wordpos-bench uses 512 word corpus.
This commit is contained in:
moos 2012-05-24 01:11:55 -07:00
parent 2230300dc3
commit 2548161bf6
11 changed files with 260 additions and 48 deletions

View File

@ -124,20 +124,22 @@ wordpos.isAdjective(word, callback) -- callback receives result (true/false) if
wordpos.isAdverb(word, callback) -- callback receives result (true/false) if word is an adverb. wordpos.isAdverb(word, callback) -- callback receives result (true/false) if word is an adverb.
``` ```
isX() methods return the looked-up word as the second argument to the callback.
Examples: Examples:
```js ```js
wordpos.isVerb('fish', console.log); wordpos.isVerb('fish', console.log);
// true // true 'fish'
wordpos.isNoun('fish', console.log); wordpos.isNoun('fish', console.log);
// true // true 'fish'
wordpos.isAdjective('fishy', console.log); wordpos.isAdjective('fishy', console.log);
// true // true 'fishy'
wordpos.isAdverb('fishly', console.log); wordpos.isAdverb('fishly', console.log);
// false // false 'fishly'
``` ```
### lookupX()... ### lookupX()...
@ -182,7 +184,7 @@ wordpos.lookup('great', console.log);
// ... // ...
``` ```
### Other methods ### Other methods/properties
``` ```
WordPOS.WNdb -- access to the WNdb object WordPOS.WNdb -- access to the WNdb object
@ -195,7 +197,7 @@ wordpos.parse(str) -- returns tokenized array of words, less duplicates and stop
```js ```js
WordPOS.defaults = { WordPOS.defaults = {
/** /**
* enable profiling, time in msec returned as second argument in callback * enable profiling, time in msec returned as last argument in callback
*/ */
profile: false, profile: false,
@ -210,10 +212,10 @@ To override, pass an options hash to the constructor. With the `profile` option,
```js ```js
wordpos = new WordPOS({profile: true}); wordpos = new WordPOS({profile: true});
wordpos.isAdjective('fast', console.log); wordpos.isAdjective('fast', console.log);
// true 29 // true 'fast' 29
``` ```
Version 0.1.4 introduces `fastIndex` option. This uses a secondary index on the index files and is much faster. It is on by default. Secondary index files are generated at install time and placed in the same directory as WNdb.path. Details can be found in tool/stat.js. Version 0.1.4 introduces `fastIndex` option. This uses a secondary index on the index files and is much faster. It is on by default. Secondary index files are generated at install time and placed in the same directory as WNdb.path. Details can be found in tools/stat.js.
Benchmark Benchmark

50
bench/text-512.txt Normal file
View File

@ -0,0 +1,50 @@
That's why, working with our military leaders, I have proposed a new
defense strategy that ensures we maintain the finest military in the
world, while saving nearly half a trillion dollars in our budget. To
stay one step ahead of our adversaries, I have already sent this
Congress legislation that will secure our country from the growing
danger of cyber-threats.
Above all, our freedom endures because of the men and women in uniform
who defend it. As they come home, we must serve them as well as they
served us. That includes giving them the care and benefits they have
earned which is why we've increased annual VA spending every year
I've been President. And it means enlisting our veterans in the work
of rebuilding our Nation.
With the bipartisan support of this Congress, we are providing new tax
credits to companies that hire vets. Michelle and Jill Biden have worked
with American businesses to secure a pledge of 135,000 jobs for veterans
and their families. And tonight, I'm proposing a Veterans Job Corps
that will help our communities hire veterans as cops and firefighters,
so that America is as strong as those who defend her.
Which brings me back to where I began. Those of us who've been sent
here to serve can learn from the service of our troops. When you put on
that uniform, it doesn't matter if you're black or white; Asian or
Latino; conservative or liberal; rich or poor; gay or straight. When
you're marching into battle, you look out for the person next to you,
or the mission fails. When you're in the thick of the fight, you rise
or fall as one unit, serving one Nation, leaving no one behind.
One of my proudest possessions is the flag that the SEAL Team took with
them on the mission to get bin Laden. On it are each of their names.
Some may be Democrats. Some may be Republicans. But that doesn't
matter. Just like it didn't matter that day in the Situation Room,
when I sat next to Bob Gates a man who was George Bush's defense
secretary; and Hillary Clinton, a woman who ran against me for
president.
All that mattered that day was the mission. No one thought about
politics. No one thought about themselves. One of the young men involved
in the raid later told me that he didn't deserve credit for the mission.
It only succeeded, he said, because every single member of that unit did
their job the pilot who landed the helicopter that spun out of
control; the translator who kept others from entering the compound; the
troops who separated the women and children from the fight; the SEALs
who charged up the stairs. More than that, the mission only succeeded
because every member of that unit trusted each other because you
can't charge up those stairs, into darkness and danger, unless you know
that there's someone behind you, watching your back.
So it is with America. Each time I look at that flag, I'm reminded

View File

@ -2,7 +2,7 @@
var uubench = require('uubench'), // from: https://github.com/moos/uubench var uubench = require('uubench'), // from: https://github.com/moos/uubench
fs = require('fs'), fs = require('fs'),
_ = require('underscore')._, _ = require('underscore')._,
WordPOS = require('./wordpos'), WordPOS = require('../src/wordpos'),
wordpos = new WordPOS(); wordpos = new WordPOS();
suite = new uubench.Suite({ suite = new uubench.Suite({
@ -23,6 +23,7 @@ suite = new uubench.Suite({
}, },
done: function(time){ done: function(time){
console.log('looked up %d words', nwords);
console.log('done in %d msecs', time ); console.log('done in %d msecs', time );
}, },
@ -39,13 +40,14 @@ function out(res){
var text1 = 'laksasdf', var text1 = 'laksasdf',
text128 = fs.readFileSync('text-128.txt', 'utf8'), // text128 = fs.readFileSync('text-128.txt', 'utf8'),
text, text512 = fs.readFileSync('text-512.txt', 'utf8'),
text, nwords,
pos; pos;
function getPOS(next){ function getPOS(next){
wordpos.getPOS(text, function(res){ nwords = wordpos.getPOS(text, function(res){
pos = res; pos = res;
next(); next();
}); });
@ -79,9 +81,6 @@ function getAdverbs(next){
}); });
} }
/*
* one word
*/
suite.section('--1 word--', function(next){ suite.section('--1 word--', function(next){
text = text1; text = text1;
next(); next();
@ -93,12 +92,9 @@ suite.bench('getAdjectives', getAdjectives);
suite.bench('getAdverbs', getAdverbs); suite.bench('getAdverbs', getAdverbs);
/* suite.section('--512 words--', function(next){
* 128 words
*/
suite.section('--128 words--', function(next){
suite.options.iterations = 1; suite.options.iterations = 1;
text = text128; text = text512;
next(); next();
}); });
suite.bench('getPOS', getPOS); suite.bench('getPOS', getPOS);

View File

@ -3,7 +3,7 @@
"author": "Moos <mooster@42at.com>", "author": "Moos <mooster@42at.com>",
"keywords": ["natural", "language", "wordnet", "pos"], "keywords": ["natural", "language", "wordnet", "pos"],
"description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.", "description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.",
"version": "0.1.4", "version": "0.1.5",
"homepage": "https://github.com/moos/wordpos", "homepage": "https://github.com/moos/wordpos",
"engines": { "engines": {
"node": ">=0.4.10" "node": ">=0.4.10"
@ -20,7 +20,7 @@
"type" : "git", "type" : "git",
"url" : "git://github.com/moos/wordpos.git" "url" : "git://github.com/moos/wordpos.git"
}, },
"main": "./wordpos.js", "main": "./src/wordpos.js",
"scripts": { "scripts": {
"postinstall": "node tools/stat.js --no-stats index.adv index.adj index.verb index.noun" "postinstall": "node tools/stat.js --no-stats index.adv index.adj index.verb index.noun"
} }

98
spec/validate.js Normal file
View File

@ -0,0 +1,98 @@
/**
* validate.js
*
* read each index.<pos> file, look up every word using wordpos, and confirm that all words are found
*
* Usage:
* node validate index.adv
*
* Copyright (c) 2012 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
// Module-level dependencies and shared state for the validation run.
var
WordPos = require('../src/wordpos'),
// WNdb object exposed by wordpos; WNdb.path is used below to locate the index files
WNdb = WordPos.WNdb,
util = require('util'),
BufferedReader = require ("../tools/buffered-reader"),
_ = require('underscore')._,
path = require('path'),
// tallies per index file; populated further down as results[basename]
results = {},
// printf-style output: arguments are formatted by util.format and the result
// is written to stdout as-is (no newline is appended)
puts = _.compose(function(a){ process.stdout.write(a)}, util.format);
if (process.argv.length < 3) return usage();
var basename = process.argv.slice(2).shift(),
indexFile = path.join(WNdb.path, basename);
if (!path.existsSync(indexFile)) {
console.error('Error: no such file %s', indexFile);
process.exit(10);
}
/**
 * Print command-line usage for this script and terminate the process
 * with exit code 1.
 *
 * @returns none
 */
function usage() {
  // this script is validate.js (the text previously named stat.js)
  console.log('#Usage:\nnode validate index.adv');
  process.exit(1);
}
/**
 * Extract the part-of-speech suffix from an index file name.
 *
 * @param basename - index file name, eg, 'index.verb'
 * @returns String - the text following 'index.', eg, 'verb'
 *
 * Throws a TypeError when basename does not contain 'index.'.
 */
function pos(basename) {
  var matched = basename.match(/index\.(.*)/);
  return matched[1];
}
/**
 * Map an index file name to the name of the wordpos isX() method that
 * checks words of that part of speech.
 *
 * @param basename - index file name, eg, 'index.adj'
 * @returns String - method name, eg, 'isAdjective'; undefined when the
 *          suffix is not one of noun, verb, adj, adv
 */
function isX(basename) {
  var methodBySuffix = {
    noun: 'isNoun',
    verb: 'isVerb',
    adj: 'isAdjective',
    adv: 'isAdverb'
  };
  var suffix = pos(basename);
  return methodBySuffix[suffix];
}
// wordpos instance whose isX() method is exercised
var wordpos = new WordPos();

// tally for this index file; the same object is stored in results[basename]
var bin = results[basename] = {total: 0, notfound: 0, notlist: []};

// the isX() method matching this index file's part of speech
var isFn = wordpos[isX(basename)];

// index words collected from the file, and how many were collected
var words = [];
var count = 0;

puts('\nReading %s:\n', indexFile);
/*
 * Read the index file line by line, collecting each index word; once the
 * file has been read, pass every collected word to the isX() method.
 *
 * Note kept from the original: the reader uses 16 KB chunks by default and
 * there's an inherent nextTick() between chunks in the underlying streaming
 * fns. A 170 KB _bufferSize is requested here (presumably overriding that
 * default -- confirm against tools/buffered-reader).
 */
new BufferedReader(indexFile, {encoding: "utf8", _bufferSize: 170 * 1024})
  .on("error", function (err) {
    console.error("error: %s", indexFile, err);
  })
  .on("line", function (text, offset) {
    // license info lines begin with a space; skip them
    if (text[0] == ' ') {
      return;
    }
    // debugging aid: if (count > 50) return this.interrupt();

    // the index word is everything before the first space on the line
    var firstSpace = text.indexOf(' ');
    words.push(text.substring(0, firstSpace));
    count += 1;
  })
  .on("end", function () {
    puts('%d words, processing...', count);
    for (var i = 0, total = words.length; i < total; i++) {
      isFn.call(wordpos, words[i], callback);
    }
  })
  .read();
/**
 * Receives the outcome of one isX() lookup and updates the tally.
 * Calls done() once a result has arrived for every collected word.
 *
 * @param result - true/false, whether the word was found
 * @param word - the word that was looked up
 * @returns none
 */
function callback(result, word) {
  bin.total += 1;

  if (!result) {
    bin.notfound += 1;
    bin.notlist.push(word);
  }

  if (bin.total == count) {
    done();
  }
}
/**
 * Report the final tally and exit: code 0 when every word was found,
 * code 1 otherwise (after listing at most the first 25 missing words).
 *
 * @returns none
 */
function done() {
  var missing = bin.notlist,
      limit = 25;

  if (bin.notfound == 0) {
    console.log('OK!');
    process.exit(0);
    return;
  }

  console.log('%d not found\n%s', bin.notfound, missing.slice(0, limit).join('\n'));
  if (missing.length > limit) {
    console.log(' +%d more', missing.length - limit);
  }

  // exit is deferred by one tick (presumably so the report above gets
  // written before the process terminates -- confirm)
  process.nextTick(function () {
    process.exit(1);
  });
}

47
spec/validate_spec.js Normal file
View File

@ -0,0 +1,47 @@
/**
* validate_spec.js
*
* Run validate on all four main index files
*
* Usage:
* npm install jasmine-node -g
* jasmine-node validate_spec.js --verbose
*
* Copyright (c) 2012 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
var
exec = require('child_process').exec;
// One spec per index file: each runs `node validate <index file>` as a child
// process and waits for the shared callback to signal completion.
describe('validate isX() using fastIndex', function() {
  var indexFiles = ['index.noun', 'index.verb', 'index.adv', 'index.adj'];

  indexFiles.forEach(function(indexName) {
    it('should validate ' + indexName, function() {
      exec('node validate ' + indexName, callback);
      asyncSpecWait();
    });
  });
});
/**
 * Completion handler for each `node validate ...` child process.
 * Expects the child to have finished without error, echoes its output,
 * then marks the asynchronous spec as done.
 *
 * @param err - error passed by exec(); expected to be null
 * @param out - child's stdout
 * @param errOut - child's stderr
 * @returns none
 */
function callback(err, out, errOut) {
  expect(err).toBe(null);

  console.log(out);
  console.error(errOut);

  asyncSpecDone();
}

View File

@ -1,11 +1,18 @@
// npm install jasmine-node -g /**
// jasmine-node wordpos_spec.js --verbose * wordpos_spec.js
*
/* Note: 'dict' folder should contain WordNet files. * spec file for main wordpos functionality
* Download and unpack manually from http://wordnet.princeton.edu/wordnet/download/current-version/ *
* Usage:
* npm install jasmine-node -g
* jasmine-node wordpos_spec.js --verbose
*
* Copyright (c) 2012 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/ */
var WordPOS = require('../src/wordpos'),
var WordPOS = require('./wordpos'),
wordpos = new WordPOS(); wordpos = new WordPOS();
var str = "The angry bear chased the frightened little squirrel", var str = "The angry bear chased the frightened little squirrel",
@ -19,7 +26,7 @@ var str = "The angry bear chased the frightened little squirrel",
garble = 'garblegarble'; // expect not to find word garble = 'garblegarble'; // expect not to find word
describe('get POS', function() { describe('getX()...', function() {
beforeEach(function() { beforeEach(function() {
this.addMatchers({ this.addMatchers({
@ -78,7 +85,7 @@ describe('get POS', function() {
}); });
}); });
describe('is POS', function() { describe('isX()...', function() {
it('should check if noun', function() { it('should check if noun', function() {
wordpos.isNoun(expected.nouns[0], function(result) { wordpos.isNoun(expected.nouns[0], function(result) {
expect(result).toBeTruthy(); expect(result).toBeTruthy();
@ -109,7 +116,7 @@ describe('is POS', function() {
}); });
}); });
describe('is !POS', function() { describe('!isX()...', function() {
it('should check if !noun', function() { it('should check if !noun', function() {
wordpos.isNoun(garble, function(result) { wordpos.isNoun(garble, function(result) {
expect(result).not.toBeTruthy(); expect(result).not.toBeTruthy();
@ -140,7 +147,7 @@ describe('is !POS', function() {
}); });
}); });
describe('lookup POS', function() { describe('lookupX()...', function() {
it('should lookup noun', function() { it('should lookup noun', function() {
wordpos.lookupNoun('squirrel', function(result) { wordpos.lookupNoun('squirrel', function(result) {
expect(result[0].pos).toBe('n'); expect(result[0].pos).toBe('n');
@ -196,7 +203,8 @@ describe('profile option', function() {
var wp = new WordPOS({profile : true}); var wp = new WordPOS({profile : true});
it('should return time argument for isX()', function(){ it('should return time argument for isX()', function(){
wp.isNoun(garble, function(result, time) { wp.isNoun(garble, function(result, word, time) {
expect(word).toEqual(garble);
expect(time).toBeDefined(); expect(time).toBeDefined();
asyncSpecDone(); asyncSpecDone();
}); });

View File

@ -6,7 +6,7 @@
* Copyright (c) 2012 mooster@42at.com * Copyright (c) 2012 mooster@42at.com
* https://github.com/moos/wordpos * https://github.com/moos/wordpos
* *
* Released under MIT license * * Released under MIT license
*/ */
var _ = require('underscore')._, var _ = require('underscore')._,
@ -19,6 +19,7 @@ var _ = require('underscore')._,
* load fast index bucket data * load fast index bucket data
* @param dir - dir path of index files * @param dir - dir path of index files
* @param name - name of index file, eg, 'index.verb' * @param name - name of index file, eg, 'index.verb'
* @returns Object - fast index data object
*/ */
function loadFastIndex(dir, name) { function loadFastIndex(dir, name) {
var jsonFile = path.join(dir, 'fast-' + name + '.json'), var jsonFile = path.join(dir, 'fast-' + name + '.json'),
@ -37,6 +38,7 @@ function loadFastIndex(dir, name) {
* @param key - 3-char key into fast index * @param key - 3-char key into fast index
* @param index - index file name (eg, 'index.verb') * @param index - index file name (eg, 'index.verb')
* @param callback - function receives buffer of data read * @param callback - function receives buffer of data read
* @returns none
*/ */
function readIndexForKey(key, index, callback) { function readIndexForKey(key, index, callback) {
var data = index.fastIndex, var data = index.fastIndex,
@ -55,6 +57,9 @@ function readIndexForKey(key, index, callback) {
/** /**
* function that overrides WordNet's IndexFile.find() * function that overrides WordNet's IndexFile.find()
*
* calls to same bucket are queued for callback.
*
* @param search - word to search for * @param search - word to search for
* @param callback - callback receives found line and tokens * @param callback - callback receives found line and tokens
* @returns none * @returns none
@ -67,7 +72,7 @@ function find(search, callback) {
args = [search, callback]; args = [search, callback];
var key = search.slice(0, KEY_LENGTH); var key = search.slice(0, KEY_LENGTH);
if (!(key in data.offsets)) return callback(miss); if (!(key in data.offsets)) return process.nextTick(function(){ callback(miss) });
// queue up if already reading file for this key // queue up if already reading file for this key
if (key in readCallbacks){ if (key in readCallbacks){
@ -122,7 +127,7 @@ module.exports = {
* loads fast index data and return fast index find function * loads fast index data and return fast index find function
* *
* @param index is the IndexFile instance * @param index is the IndexFile instance
* @return function - fast index find or origin find if errors * @returns function - fast index find or origin find if errors
*/ */
find: function(index){ find: function(index){

View File

@ -1,5 +1,5 @@
/** /**
* wordpos * wordpos.js
* *
* Node.js part-of-speech utilities using natural's WordNet module. * Node.js part-of-speech utilities using natural's WordNet module.
* *
@ -19,7 +19,7 @@ var _ = require('underscore')._,
fastIndex = null; fastIndex = null;
try { try {
fastIndex = require('./tools/fastIndex'); fastIndex = require('./fastIndex');
} catch(e) {} } catch(e) {}
function normalize(word) { function normalize(word) {
@ -59,7 +59,7 @@ function is(pos){
index = this.getIndexFile(pos); index = this.getIndexFile(pos);
word = normalize(word); word = normalize(word);
index.lookup(word, function(record) { index.lookup(word, function(record) {
args.push(!!record); args.push(!!record, word);
profile && args.push(new Date() - start); profile && args.push(new Date() - start);
callback.apply(null, args); callback.apply(null, args);
}); });

View File

@ -1,14 +1,17 @@
/** /**
* stat.js
*
* generate fast index for WordNet index files * generate fast index for WordNet index files
* *
* Usage: * Usage:
* node stat [--no-stats] index.adv ... * node stat [--no-stats] index.adv ...
* *
* --no-stats prevents writing stat data to file * --no-stats prevents writing bucket size statistics to file.
* Fast index is based on buckets keyed off first THREE characters in the index word,
* eg, 'awesome' goes into bucket 'awe'
* *
* Format of the fast index: * Fast index is based on buckets keyed off first THREE characters in the index word,
* eg, 'awesome' goes into bucket 'awe'.
*
* Format of the fast index JSON object:
* { * {
* "firstKey":".22", // first key value * "firstKey":".22", // first key value
* "keyLength":3, // #characters in key * "keyLength":3, // #characters in key
@ -36,9 +39,14 @@
* get offset of key and offset of next key * get offset of key and offset of next key
* read index file between the two offsets * read index file between the two offsets
* binary search read data O(log avg) * binary search read data O(log avg)
*
* Copyright (c) 2012 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/ */
var var
WNdb = require('../wordpos').WNdb, WNdb = require('../src/wordpos').WNdb,
util = require('util'), util = require('util'),
BufferedReader = require ("./buffered-reader"), BufferedReader = require ("./buffered-reader"),
_ = require('underscore')._, _ = require('underscore')._,
@ -109,9 +117,6 @@ _(process.argv.slice(2)).filter(function(arg){
avg = (sum/size).toFixed(2), avg = (sum/size).toFixed(2),
info = util.format('buckets %d, max %d at %s, sum %d, avg %d, median %d', size, max, maxkey, sum, avg, median); info = util.format('buckets %d, max %d at %s, sum %d, avg %d, median %d', size, max, maxkey, sum, avg, median);
// console.log(sorted);
// return;
console.log(basename, info); console.log(basename, info);
if (stats) { if (stats) {
@ -144,6 +149,7 @@ _(process.argv.slice(2)).filter(function(arg){
}; };
fs.writeFileSync(jsonFile, JSON.stringify(data), 'utf8'); fs.writeFileSync(jsonFile, JSON.stringify(data), 'utf8');
console.log(' wrote %s\n', jsonFile);
}) })
.read(); .read();
}); });