v0.1.5: added validate spec, new dir structure

validate_spec.js runs isX() on ALL index words.
isX() callback now receives lookup word as second argument.
wordpos-bench uses 512 word corpus.
This commit is contained in:
moos 2012-05-24 01:11:55 -07:00
parent 2230300dc3
commit 2548161bf6
11 changed files with 260 additions and 48 deletions

View File

@ -124,20 +124,22 @@ wordpos.isAdjective(word, callback) -- callback receives result (true/false) if
wordpos.isAdverb(word, callback) -- callback receives result (true/false) if word is an adverb.
```
isX() methods return the looked-up word as the second argument to the callback.
Examples:
```js
wordpos.isVerb('fish', console.log);
// true
// true 'fish'
wordpos.isNoun('fish', console.log);
// true
// true 'fish'
wordpos.isAdjective('fishy', console.log);
// true
// true 'fishy'
wordpos.isAdverb('fishly', console.log);
// false
// false 'fishly'
```
### lookupX()...
@ -182,7 +184,7 @@ wordpos.lookup('great', console.log);
// ...
```
### Other methods
### Other methods/properties
```
WordPOS.WNdb -- access to the WNdb object
@ -195,7 +197,7 @@ wordpos.parse(str) -- returns tokenized array of words, less duplicates and stop
```js
WordPOS.defaults = {
/**
* enable profiling, time in msec returned as second argument in callback
* enable profiling, time in msec returned as last argument in callback
*/
profile: false,
@ -210,10 +212,10 @@ To override, pass an options hash to the constructor. With the `profile` option,
```js
wordpos = new WordPOS({profile: true});
wordpos.isAdjective('fast', console.log);
// true 29
// true 'fast' 29
```
Version 0.1.4 introduces `fastIndex` option. This uses a secondary index on the index files and is much faster. It is on by default. Secondary index files are generated at install time and placed in the same directory as WNdb.path. Details can be found in tool/stat.js.
Version 0.1.4 introduces the `fastIndex` option. This uses a secondary index on the index files and is much faster. It is on by default. Secondary index files are generated at install time and placed in the same directory as WNdb.path. Details can be found in tools/stat.js.
Benchmark

50
bench/text-512.txt Normal file
View File

@ -0,0 +1,50 @@
That's why, working with our military leaders, I have proposed a new
defense strategy that ensures we maintain the finest military in the
world, while saving nearly half a trillion dollars in our budget. To
stay one step ahead of our adversaries, I have already sent this
Congress legislation that will secure our country from the growing
danger of cyber-threats.
Above all, our freedom endures because of the men and women in uniform
who defend it. As they come home, we must serve them as well as they
served us. That includes giving them the care and benefits they have
earned which is why we've increased annual VA spending every year
I've been President. And it means enlisting our veterans in the work
of rebuilding our Nation.
With the bipartisan support of this Congress, we are providing new tax
credits to companies that hire vets. Michelle and Jill Biden have worked
with American businesses to secure a pledge of 135,000 jobs for veterans
and their families. And tonight, I'm proposing a Veterans Job Corps
that will help our communities hire veterans as cops and firefighters,
so that America is as strong as those who defend her.
Which brings me back to where I began. Those of us who've been sent
here to serve can learn from the service of our troops. When you put on
that uniform, it doesn't matter if you're black or white; Asian or
Latino; conservative or liberal; rich or poor; gay or straight. When
you're marching into battle, you look out for the person next to you,
or the mission fails. When you're in the thick of the fight, you rise
or fall as one unit, serving one Nation, leaving no one behind.
One of my proudest possessions is the flag that the SEAL Team took with
them on the mission to get bin Laden. On it are each of their names.
Some may be Democrats. Some may be Republicans. But that doesn't
matter. Just like it didn't matter that day in the Situation Room,
when I sat next to Bob Gates a man who was George Bush's defense
secretary; and Hillary Clinton, a woman who ran against me for
president.
All that mattered that day was the mission. No one thought about
politics. No one thought about themselves. One of the young men involved
in the raid later told me that he didn't deserve credit for the mission.
It only succeeded, he said, because every single member of that unit did
their job the pilot who landed the helicopter that spun out of
control; the translator who kept others from entering the compound; the
troops who separated the women and children from the fight; the SEALs
who charged up the stairs. More than that, the mission only succeeded
because every member of that unit trusted each other because you
can't charge up those stairs, into darkness and danger, unless you know
that there's someone behind you, watching your back.
So it is with America. Each time I look at that flag, I'm reminded

View File

@ -2,7 +2,7 @@
var uubench = require('uubench'), // from: https://github.com/moos/uubench
fs = require('fs'),
_ = require('underscore')._,
WordPOS = require('./wordpos'),
WordPOS = require('../src/wordpos'),
wordpos = new WordPOS();
suite = new uubench.Suite({
@ -23,6 +23,7 @@ suite = new uubench.Suite({
},
done: function(time){
console.log('looked up %d words', nwords);
console.log('done in %d msecs', time );
},
@ -39,13 +40,14 @@ function out(res){
var text1 = 'laksasdf',
text128 = fs.readFileSync('text-128.txt', 'utf8'),
text,
// text128 = fs.readFileSync('text-128.txt', 'utf8'),
text512 = fs.readFileSync('text-512.txt', 'utf8'),
text, nwords,
pos;
function getPOS(next){
wordpos.getPOS(text, function(res){
nwords = wordpos.getPOS(text, function(res){
pos = res;
next();
});
@ -79,9 +81,6 @@ function getAdverbs(next){
});
}
/*
* one word
*/
suite.section('--1 word--', function(next){
text = text1;
next();
@ -93,12 +92,9 @@ suite.bench('getAdjectives', getAdjectives);
suite.bench('getAdverbs', getAdverbs);
/*
* 128 words
*/
suite.section('--128 words--', function(next){
suite.section('--512 words--', function(next){
suite.options.iterations = 1;
text = text128;
text = text512;
next();
});
suite.bench('getPOS', getPOS);

View File

@ -3,7 +3,7 @@
"author": "Moos <mooster@42at.com>",
"keywords": ["natural", "language", "wordnet", "pos"],
"description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.",
"version": "0.1.4",
"version": "0.1.5",
"homepage": "https://github.com/moos/wordpos",
"engines": {
"node": ">=0.4.10"
@ -20,7 +20,7 @@
"type" : "git",
"url" : "git://github.com/moos/wordpos.git"
},
"main": "./wordpos.js",
"main": "./src/wordpos.js",
"scripts": {
"postinstall": "node tools/stat.js --no-stats index.adv index.adj index.verb index.noun"
}

98
spec/validate.js Normal file
View File

@ -0,0 +1,98 @@
/**
* validate.js
*
* Read an index.<pos> file, look up every word in it using wordpos, and confirm that all words are found.
*
* Usage:
* node validate index.adv
*
* Copyright (c) 2012 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
var
WordPos = require('../src/wordpos'),
WNdb = WordPos.WNdb,
util = require('util'),
BufferedReader = require ("../tools/buffered-reader"),
_ = require('underscore')._,
path = require('path'),
results = {},
puts = _.compose(function(a){ process.stdout.write(a)}, util.format);
// An index file name is required as the first command-line argument;
// without it, print usage (usage() exits the process with status 1).
if (process.argv.length < 3) return usage();
// basename is the first argument after the script name (eg, 'index.adv');
// indexFile is that name joined onto WNdb.path.
var basename = process.argv.slice(2).shift(),
indexFile = path.join(WNdb.path, basename);
// Stop with exit status 10 when the requested index file does not exist.
if (!path.existsSync(indexFile)) {
console.error('Error: no such file %s', indexFile);
process.exit(10);
}
/**
 * Print command-line usage for this script and terminate the process.
 *
 * Exits with status 1, so a run without arguments is distinguishable from a
 * successful validation (status 0).
 *
 * @returns none - does not return normally; the process exits
 */
function usage() {
  // The script to invoke is validate (this file), matching the usage line
  // in the file header.
  console.log('#Usage:\nnode validate index.adv');
  process.exit(1);
}
/**
 * Extract the part-of-speech suffix from an index file name.
 *
 * @param basename - index file name, eg, 'index.adv'
 * @returns string - the text following 'index.', eg, 'adv'
 *
 * Throws a TypeError when basename does not contain 'index.'.
 */
function pos(basename) {
  var matched = basename.match(/index\.(.*)/);
  return matched[1];
}
/**
 * Map an index file name to the name of the matching wordpos isX() method.
 *
 * @param basename - index file name, eg, 'index.adv'
 * @returns string - 'isNoun', 'isVerb', 'isAdjective' or 'isAdverb';
 *                   undefined when the suffix is not noun/verb/adj/adv
 */
function isX(basename) {
  var methodByPos = {
    noun: 'isNoun',
    verb: 'isVerb',
    adj: 'isAdjective',
    adv: 'isAdverb'
  };
  return methodByPos[pos(basename)];
}
// State for this validation run:
//   wordpos - WordPos instance the lookups are made on
//   bin     - counters for this index file (also stored in results[basename]):
//             total results received, number not found, and the words not found
//   isFn    - the wordpos method selected by isX() for this index file
//   words   - index words collected while reading the file
//   count   - number of words collected
var
wordpos = new WordPos(),
bin = results[basename] = {total:0, notfound:0, notlist:[]},
isFn = wordpos[isX(basename)],
words = [],
count = 0;
puts('\nReading %s:\n', indexFile);
// Read the index file line by line, collecting words first and looking them
// up only once the whole file has been read.
// NOTE(review): _bufferSize is passed as 170 KB while the comment below
// mentions a 16 KB default -- confirm how buffered-reader uses this option.
new BufferedReader (indexFile, {encoding: "utf8", _bufferSize: 170 * 1024 })
/*
* reads 16 KB chunks by default... there's an inherent nextTick() between chunks in the underlying streaming fns.
*/
.on ("error", function (error){
// read errors are only logged here; no exit status is set
console.error("error: %s", indexFile, error);
})
.on ("line", function (line, offset){
// skip license info
if (line[0] == ' ') return;
//if (count > 50) return this.interrupt();
// the index word is the text before the first space on the line
var word = line.substring(0, line.indexOf(' '));
++count;
words.push(word);
})
.on ("end", function (){
puts('%d words, processing...', count);
// Issue one isX() lookup per collected word; callback() tallies the results
// and calls done() once it has received `count` of them.
// NOTE(review): when no words were collected, callback() is never invoked,
// so done() does not run.
words.forEach(function(word, i) {
isFn.call(wordpos, word, callback);
});
})
.read();
/**
 * Tally the outcome of a single isX() lookup.
 *
 * Every call counts towards bin.total; a falsy result also bumps
 * bin.notfound and records the word in bin.notlist. Once the number of
 * results received equals the number of words read (count), done() is called.
 *
 * @param result - lookup result (true/false)
 * @param word - the word that was looked up
 * @returns none
 */
function callback(result, word) {
  bin.total++;

  if (!result) {
    bin.notfound++;
    bin.notlist.push(word);
  }

  if (bin.total == count) {
    done();
  }
}
/**
 * Report the final result of the validation run and exit.
 *
 * When every word was found, prints 'OK!' and exits with status 0.
 * Otherwise prints the number of words not found followed by at most the
 * first 25 of them (plus a '+N more' line when the list is longer), and
 * exits with status 1 on the next tick.
 *
 * @returns none
 */
function done() {
  if (bin.notfound == 0) {
    console.log('OK!');
    process.exit(0);
    return;
  }

  var limit = 25,
      misses = bin.notlist,
      shown = misses.slice(0, limit).join('\n');

  console.log('%d not found\n%s', bin.notfound, shown);
  if (misses.length > limit) {
    console.log(' +%d more', misses.length - limit);
  }
  process.nextTick(function(){ process.exit(1) });
}

47
spec/validate_spec.js Normal file
View File

@ -0,0 +1,47 @@
/**
* validate_spec.js
*
* Run validate on all four main index files
*
* Usage:
* npm install jasmine-node -g
* jasmine-node validate_spec.js --verbose
*
* Copyright (c) 2012 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
var
exec = require('child_process').exec;
/*
 * One spec per main index file, in the order noun, verb, adv, adj.
 * Each spec launches `node validate <index file>` as a child process and
 * waits for the shared callback to signal completion.
 */
describe('validate isX() using fastIndex', function() {
  var indexFiles = ['index.noun', 'index.verb', 'index.adv', 'index.adj'];

  indexFiles.forEach(function(indexFile) {
    it('should validate ' + indexFile, function() {
      exec('node validate ' + indexFile, callback);
      asyncSpecWait();
    });
  });
});
/**
 * Completion handler passed to exec() by every spec.
 *
 * Expects the child process to finish without an error, echoes what it wrote
 * to stdout and stderr, then marks the asynchronous spec as done.
 *
 * @param error - error passed by exec(); expected to be null
 * @param stdout - captured standard output of the child process
 * @param stderr - captured standard error of the child process
 * @returns none
 */
function callback(error, stdout, stderr) {
  expect(error).toBe(null);

  // echo the child's output so it shows up in the spec run
  console.log(stdout);
  console.error(stderr);

  asyncSpecDone();
}

View File

@ -1,11 +1,18 @@
// npm install jasmine-node -g
// jasmine-node wordpos_spec.js --verbose
/* Note: 'dict' folder should contain WordNet files.
* Download and unpack manually from http://wordnet.princeton.edu/wordnet/download/current-version/
/**
* wordpos_spec.js
*
* spec file for main wordpos functionality
*
* Usage:
* npm install jasmine-node -g
* jasmine-node wordpos_spec.js --verbose
*
* Copyright (c) 2012 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
var WordPOS = require('./wordpos'),
var WordPOS = require('../src/wordpos'),
wordpos = new WordPOS();
var str = "The angry bear chased the frightened little squirrel",
@ -19,7 +26,7 @@ var str = "The angry bear chased the frightened little squirrel",
garble = 'garblegarble'; // expect not to find word
describe('get POS', function() {
describe('getX()...', function() {
beforeEach(function() {
this.addMatchers({
@ -78,7 +85,7 @@ describe('get POS', function() {
});
});
describe('is POS', function() {
describe('isX()...', function() {
it('should check if noun', function() {
wordpos.isNoun(expected.nouns[0], function(result) {
expect(result).toBeTruthy();
@ -109,7 +116,7 @@ describe('is POS', function() {
});
});
describe('is !POS', function() {
describe('!isX()...', function() {
it('should check if !noun', function() {
wordpos.isNoun(garble, function(result) {
expect(result).not.toBeTruthy();
@ -140,7 +147,7 @@ describe('is !POS', function() {
});
});
describe('lookup POS', function() {
describe('lookupX()...', function() {
it('should lookup noun', function() {
wordpos.lookupNoun('squirrel', function(result) {
expect(result[0].pos).toBe('n');
@ -196,7 +203,8 @@ describe('profile option', function() {
var wp = new WordPOS({profile : true});
it('should return time argument for isX()', function(){
wp.isNoun(garble, function(result, time) {
wp.isNoun(garble, function(result, word, time) {
expect(word).toEqual(garble);
expect(time).toBeDefined();
asyncSpecDone();
});

View File

@ -6,7 +6,7 @@
* Copyright (c) 2012 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license *
* Released under MIT license
*/
var _ = require('underscore')._,
@ -19,6 +19,7 @@ var _ = require('underscore')._,
* load fast index bucket data
* @param dir - dir path of index files
* @param name - name of index file, eg, 'index.verb'
* @returns Object - fast index data object
*/
function loadFastIndex(dir, name) {
var jsonFile = path.join(dir, 'fast-' + name + '.json'),
@ -37,6 +38,7 @@ function loadFastIndex(dir, name) {
* @param key - 3-char key into fast index
* @param index - index file name (eg, 'index.verb')
* @param callback - function receives buffer of data read
* @returns none
*/
function readIndexForKey(key, index, callback) {
var data = index.fastIndex,
@ -55,6 +57,9 @@ function readIndexForKey(key, index, callback) {
/**
* function that overrides WordNet's IndexFile.find()
*
* calls to same bucket are queued for callback.
*
* @param search - word to search for
* @param callback - callback receives found line and tokens
* @returns none
@ -67,7 +72,7 @@ function find(search, callback) {
args = [search, callback];
var key = search.slice(0, KEY_LENGTH);
if (!(key in data.offsets)) return callback(miss);
if (!(key in data.offsets)) return process.nextTick(function(){ callback(miss) });
// queue up if already reading file for this key
if (key in readCallbacks){
@ -122,7 +127,7 @@ module.exports = {
* loads fast index data and return fast index find function
*
* @param index is the IndexFile instance
* @return function - fast index find or origin find if errors
* @returns function - fast index find or origin find if errors
*/
find: function(index){

View File

@ -1,5 +1,5 @@
/**
* wordpos
* wordpos.js
*
* Node.js part-of-speech utilities using natural's WordNet module.
*
@ -19,7 +19,7 @@ var _ = require('underscore')._,
fastIndex = null;
try {
fastIndex = require('./tools/fastIndex');
fastIndex = require('./fastIndex');
} catch(e) {}
function normalize(word) {
@ -59,7 +59,7 @@ function is(pos){
index = this.getIndexFile(pos);
word = normalize(word);
index.lookup(word, function(record) {
args.push(!!record);
args.push(!!record, word);
profile && args.push(new Date() - start);
callback.apply(null, args);
});

View File

@ -1,14 +1,17 @@
/**
* stat.js
*
* generate fast index for WordNet index files
*
* Usage:
* node stat [--no-stats] index.adv ...
*
* --no-stats prevents writing stat data to file
* Fast index is based on buckets keyed off first THREE characters in the index word,
* eg, 'awesome' goes into bucket 'awe'
* --no-stats prevents writing bucket size statistics to file.
*
* Format of the fast index:
* Fast index is based on buckets keyed off first THREE characters in the index word,
* eg, 'awesome' goes into bucket 'awe'.
*
* Format of the fast index JSON object:
* {
* "firstKey":".22", // first key value
* "keyLength":3, // #characters in key
@ -36,9 +39,14 @@
* get offset of key and offset of next key
* read index file between the two offsets
* binary search read data O(log avg)
*
* Copyright (c) 2012 mooster@42at.com
* https://github.com/moos/wordpos
*
* Released under MIT license
*/
var
WNdb = require('../wordpos').WNdb,
WNdb = require('../src/wordpos').WNdb,
util = require('util'),
BufferedReader = require ("./buffered-reader"),
_ = require('underscore')._,
@ -109,9 +117,6 @@ _(process.argv.slice(2)).filter(function(arg){
avg = (sum/size).toFixed(2),
info = util.format('buckets %d, max %d at %s, sum %d, avg %d, median %d', size, max, maxkey, sum, avg, median);
// console.log(sorted);
// return;
console.log(basename, info);
if (stats) {
@ -144,6 +149,7 @@ _(process.argv.slice(2)).filter(function(arg){
};
fs.writeFileSync(jsonFile, JSON.stringify(data), 'utf8');
console.log(' wrote %s\n', jsonFile);
})
.read();
});