wordpos/tools/stat.js

150 lines
4.4 KiB
JavaScript

/**
* generate fast index for WordNet index files
*
* Usage:
* node stat [--no-stats] index.adv ...
*
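* --no-stats prevents writing the bucket-size distribution file (fast-<name>.tsv);
* the summary "stats" object is still stored in the fast index JSON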
* Fast index is based on buckets keyed off first THREE characters in the index word,
* eg, 'awesome' goes into bucket 'awe'
*
* Format of the fast index:
* {
* "firstKey":".22", // first key value
* "keyLength":3, // #characters in key
* "version":"3.0", // WNdb version
* "name":"index.adj", // index file name
* "stats":{
* "buckets":2326, // # of buckets
* "words":21479, // total # words
* "biggest":310, // #words in biggest bucket
* "avg":"9.23", // average #words per bucket
* "median":3 // median #words per bucket
* },
* "offsets":{
* "100":[2271,"101"], // "100" is the key,
* // value=[byte offset in index file, next key]
* ...
* }
* }
*
* To lookup a word:
*
* find key (first <keyLength> chars of word)
* look it up in <offsets> O(1)
* if it exists
* get offset of key and offset of next key
* read index file between the two offsets
* binary search read data O(log avg)
*/
// --- dependencies -----------------------------------------------------------
// WNdb supplies the database folder (WNdb.path) and the version string (WNdb.version)
var WNdb = require('../wordpos').WNdb;
var util = require('util');
// line-oriented file reader bundled next to this script
var BufferedReader = require ("./buffered-reader");
var _ = require('underscore')._;
var fs = require('fs');
var path = require('path');

// --- settings ---------------------------------------------------------------
// number of leading characters of an index word that form its bucket key
var KEY_LENGTH = 3;
// when true, a bucket-size distribution file is written; '--no-stats' turns it off
var stats = true;
// pseudo key marking the end of the index file; should be unique
var eofKey = '_EOF_';
// Show which WordNet database folder the index files are resolved against.
console.log('DB folder: ', WNdb.path);

// At least one index file must be named on the command line. The '--no-stats'
// option does not count as a file: without this check `node stat --no-stats`
// would exit silently without producing anything.
if (process.argv.slice(2).filter(function (arg) { return arg !== '--no-stats'; }).length === 0) {
  console.log('#Usage:\nnode stat [--no-stats] index.adv ...');
  process.exit(1);
}
/**
 * Build a fast index for every index file named on the command line.
 *
 * For each file name (resolved against WNdb.path) the index file is read line
 * by line, lines are grouped into buckets keyed by the first KEY_LENGTH
 * characters of the index word, and two files are written:
 *   - fast-<name>.json in WNdb.path: key -> [offset, next key] plus summary stats
 *   - fast-<name>.tsv in the current working directory: bucket-size distribution
 *     (skipped when '--no-stats' was given)
 */
_(process.argv.slice(2)).filter(function (arg) {
  // '--no-stats' is an option, not a file name: record it and drop it from the list
  if (arg === '--no-stats') {
    stats = false;
    return false;
  }
  return true;
}).forEach(function (basename) {
  var indexFile = path.join(WNdb.path, basename),
      jsonFile = path.join(WNdb.path, 'fast-' + basename + '.json'),
      countFile = 'fast-' + basename + '.tsv',   // relative: lands in the current working directory
      endOffset = fs.statSync(indexFile).size,   // file size, used as the offset of the EOF pseudo key
      buckets = {},     // key -> number of index lines sharing that key
      offsets = {},     // key -> [offset of the first line with that key, next key]
      lastKey = null,   // most recently created key, still waiting for its 'next key'
      firstKey = null;  // key of the first index line

  new BufferedReader(indexFile, {encoding: "utf8"})
    .on("error", function (error) {
      console.log("error: %s", indexFile, error);
    })
    .on("line", function (line, offset) {
      // skip blank lines and license info (license lines start with a space);
      // a blank line would otherwise create a bogus bucket with an empty key
      if (!line || line[0] === ' ') return;

      // The index word ends at the first space. If there is no space, treat the
      // whole line as the word instead of letting indexOf's -1 yield an empty key.
      var wordEnd = line.indexOf(' ');
      if (wordEnd < 0) wordEnd = line.length;
      var key = line.substring(0, Math.min(wordEnd, KEY_LENGTH));

      if (firstKey === null) firstKey = key;

      // own-property check so names inherited from Object.prototype never count as buckets
      if (Object.prototype.hasOwnProperty.call(buckets, key)) {
        ++buckets[key];
        return;
      }

      // first line of a new bucket: remember where it starts
      buckets[key] = 1;
      offsets[key] = [offset];
      // current key is the 'next key' for the previous key
      if (lastKey !== null) offsets[lastKey].push(key);
      lastKey = key;
    })
    .on("end", function () {
      // No index line was seen (empty or license-only file): there is nothing to
      // index, and offsets[lastKey] below would throw on the null key.
      if (lastKey === null) {
        console.log(basename, 'no index entries found, nothing written');
        return;
      }

      // add EOF offset so the last real bucket also has an end boundary
      offsets[lastKey].push(eofKey);
      offsets[eofKey] = [endOffset, null];

      var size = _.size(buckets),
          sum = _.reduce(buckets, function (memo, num) { return memo + num; }, 0),
          sorted = _.sortBy(buckets, function (val) { return val; }),   // bucket sizes, ascending
          median = sorted[Math.floor(size / 2)],   // upper middle element when size is even
          max = sorted[sorted.length - 1],
          // every key whose bucket has the maximum size, comma-separated when tied
          maxkey = _.filter(_.keys(buckets), function (key) { return buckets[key] === max; }).join(','),
          avg = (sum / size).toFixed(2),
          info = util.format('buckets %d, max %d at %s, sum %d, avg %d, median %d', size, max, maxkey, sum, avg, median);

      console.log(basename, info);

      if (stats) {
        // distribution in groups of 10: sizes 1-10 fall under '1', 11-20 under '11', etc.
        var grouped = _.groupBy(buckets, function (num) { return 1 + 10 * (Math.floor((num - 1) / 10)); });
        // one "<group start>\t<number of buckets>" row per group
        var rows = _.map(grouped, function (members, groupStart) {
          return groupStart + "\t" + members.length + "\n";
        }).join('');

        fs.writeFileSync(countFile, '#' + info + '\n'
          + '#bucket_size (1-10, 11-20, etc.) \t #buckets\n'
          + rows, 'utf8');
      }

      // offset data
      var data = {
        firstKey: firstKey,
        keyLength: KEY_LENGTH,
        version: WNdb.version,
        name: basename,
        stats: {
          buckets: size,
          words: sum,
          biggest: max,
          avg: avg,
          median: median
        },
        offsets: offsets
      };

      fs.writeFileSync(jsonFile, JSON.stringify(data), 'utf8');
    })
    .read();
});