// wordpos/tools/stat.js

/**
 * generate fast index for WordNet index files
 *
 * Usage:
 *    node stat [--no-stats] index.adv ...
 *
 * --no-stats prevents writing stat data to file
 * Fast index is based on buckets keyed off first THREE characters in the index word,
 * eg, 'awesome' goes into bucket 'awe'
 *
 * Format of the fast index:
 *  {
 *   "firstKey":".22",				// first key value
 *   "keyLength":3,					// #characters in key
 *   "version":"3.0",				// WNdb version
 *   "name":"index.adj",			// index file name
 *   "stats":{
 *   	"buckets":2326,				// # of buckets
 *   	"words":21479,				// total # words
 *   	"biggest":310,				// #words in biggest bucket
 *   	"avg":"9.23",				// average #words per bucket
 *   	"median":3					// median #words per bucket
 *     },
 *   "offsets":{
 *     "100":[2271,"101"],			// "100" is the key,
 *     								// value=[byte offset in index file, next key]
 *      ...
 *    }
 *  }
 *
 *  To lookup a word:
 *
 *  find key (first <keyLength> chars of word)
 *  look it up in <offsets> O(1)
 *  if it exists
 *  	get offset of key and offset of next key
 *      read index file between the two offsets
 *  	binary search read data O(log avg)
 */
// Modules used by this script (loaded in the same order as before).
var WNdb = require('../wordpos').WNdb;
var util = require('util');
var BufferedReader = require ("./buffered-reader");
var _ = require('underscore')._;
var fs = require('fs');
var path = require('path');

// Number of leading characters of an index word that form its bucket key.
var KEY_LENGTH = 3;

// Whether the bucket-size stats file is written; cleared by --no-stats.
var stats = true;

// Key under which the end-of-file offset is stored; should be unique.
var eofKey = '_EOF_';

// Report which WordNet database folder the index files are resolved against.
console.log('DB folder: ', WNdb.path);

// At least one index file name is required. The --no-stats flag on its own
// names no file, so it must not satisfy the check.
if (!process.argv.slice(2).some(function(arg){ return arg !== '--no-stats'; })) {
  console.log('#Usage:\nnode stat [--no-stats] index.adv ...');
  process.exit(1);
}

/**
 * Build the fast index for every index file named on the command line.
 *
 * For each file this groups the index lines into buckets keyed by the first
 * KEY_LENGTH characters of the index word, records where each bucket starts,
 * prints summary statistics, optionally writes a bucket-size distribution
 * (fast-<name>.tsv) and writes the fast index itself (fast-<name>.json).
 */
_(process.argv.slice(2)).filter(function(arg){
  // --no-stats is a flag, not a file name: it disables writing the stats file
  if (arg === '--no-stats') {
    stats = false;
    return false;
  }
  return true;
}).forEach(function(basename){

  var indexFile = path.join(WNdb.path, basename),
    jsonFile = path.join(WNdb.path, 'fast-' + basename + '.json'),
    countFile = 'fast-' + basename + '.tsv',  // relative path: lands in the current working directory
    endOffset = fs.statSync(indexFile).size,  // file size, used as the offset of the EOF marker
    buckets = {},     // key -> number of index lines in that bucket
    lastKey = null,   // key of the most recently started bucket
    offsets = {},     // key -> [offset of the bucket's first line, next key]
    firstKey = null;  // key of the first bucket encountered

  new BufferedReader (indexFile, {encoding: "utf8"})
    .on ("error", function (error){
      console.log ("error: %s", indexFile, error);
    })
    .on ("line", function (line, offset){
      // skip blank lines and license info (license lines start with a space)
      if (!line || line[0] === ' ') return;

      // The index word is the text before the first space; a line without
      // any space is treated as consisting of the word only.
      var wordEnd = line.indexOf(' ');
      if (wordEnd < 0) wordEnd = line.length;

      var key = line.substring(0, Math.min(wordEnd, KEY_LENGTH));
      if (firstKey === null) firstKey = key;

      // own-property check so inherited Object.prototype names never count as buckets
      if (Object.prototype.hasOwnProperty.call(buckets, key)) {
        ++buckets[key];
        return;
      }

      // first line of a new bucket: remember the offset reported by the reader
      buckets[key] = 1;
      offsets[key] = [offset];
      // current key is the 'next key' for the previous key
      if (lastKey !== null) offsets[lastKey].push(key);
      lastKey = key;
    })
    .on ("end", function (){

      // No index line was seen (empty file or license text only): there is
      // nothing to index, and offsets[lastKey] below would not exist.
      if (lastKey === null) {
        console.log(basename, 'no index entries found; nothing written');
        return;
      }

      // add EOF offset so the last bucket also has a 'next key' to read up to
      offsets[lastKey].push(eofKey);
      offsets[eofKey] = [endOffset, null];

      var size = _.size(buckets),
        sum = _.reduce(buckets, function(memo, num){ return memo + num; }, 0),
        sorted = _.sortBy(buckets, function(val){ return val; }),
        median = sorted[Math.floor(size/2)],
        max = sorted[sorted.length-1],
        // concatenation of every key whose bucket has the maximum size
        maxkey = _.reduce(buckets, function(memo, val, key){ return memo + (val === max ? key : ''); }, ''),
        avg = (sum/size).toFixed(2),
        info = util.format('buckets %d, max %d at %s, sum %d, avg %d, median %d', size, max, maxkey, sum, avg, median);

      console.log(basename, info);

      if (stats) {
        // distribution in groups of 10: bucket sizes 1-10 map to 1, 11-20 to 11, etc.
        var grouped = _.groupBy(buckets, function(num){ return 1 + 10*(Math.floor((num-1)/10) ); });
        var str = '';
        _.each(grouped, function(sizes, group){ str += group+"\t"+sizes.length+"\n"; });
        fs.writeFileSync(countFile, '#'+info+'\n'
            + '#bucket_size (1-10, 11-20, etc.) \t #buckets\n'
            + str, 'utf8');
      }

      // offset data: the fast index itself
      var data = {
          firstKey: firstKey,
          keyLength: KEY_LENGTH,
          version: WNdb.version,
          name: basename,
          stats: {
            buckets: size,
            words: sum,
            biggest: max,
            avg: avg,
            median: median
          },
          offsets: offsets
      };

      fs.writeFileSync(jsonFile, JSON.stringify(data), 'utf8');
    })
    .read();
});