fix stopwords with CLI

This commit is contained in:
Moos 2014-09-25 04:37:33 -07:00
parent 946147d05f
commit cb884d2dfa
4 changed files with 186 additions and 112 deletions

124
README.md
View File

@ -3,7 +3,7 @@ wordpos
wordpos is a set of part-of-speech (POS) utilities for Node.js using [natural's](http://github.com/NaturalNode/natural) WordNet module. wordpos is a set of part-of-speech (POS) utilities for Node.js using [natural's](http://github.com/NaturalNode/natural) WordNet module.
*Update*: New version 0.1.10 - get random word(s). *Update*: get random word(s).
## Quick usage ## Quick usage
Command-line: Command-line:
@ -50,6 +50,7 @@ Note: `wordpos-bench.js` requires a [forked uubench](https://github.com/moos/uub
To run spec: To run spec:
npm install jasmine-node -g npm install jasmine-node -g
cd spec
jasmine-node wordpos_spec.js --verbose jasmine-node wordpos_spec.js --verbose
jasmine-node validate_spec.js --verbose jasmine-node validate_spec.js --verbose
@ -116,7 +117,7 @@ If you're only interested in a certain POS (say, adjectives), using the particul
than getPOS() which looks up the word in all index files. [stopwords](https://github.com/NaturalNode/natural/blob/master/lib/natural/util/stopwords.js) than getPOS() which looks up the word in all index files. [stopwords](https://github.com/NaturalNode/natural/blob/master/lib/natural/util/stopwords.js)
are stripped out from text before lookup. are stripped out from text before lookup.
If text is an array, all words are looked-up -- no deduplication, stopword filter or tokenization is applied. If text is an *array*, all words are looked-up -- no deduplication, stopword filter or tokenization is applied.
getX() functions return the number of parsed words that will be looked up (less duplicates and stopwords). getX() functions return the number of parsed words that will be looked up (less duplicates and stopwords).
@ -298,105 +299,11 @@ See blog article [Optimizing WordPos](http://blog.42at.com/optimizing-wordpos).
## Command-line: CLI ## Command-line: CLI
Version 0.1.6 introduces the command-line interface (./bin/wordpos-cli.js), available as 'wordpos' if installed globally
"npm install wordpos -g", otherwise as 'node_modules/.bin/wordpos' if installed without the -g.
```bash
$ wordpos get The angry bear chased the frightened little squirrel
# Noun 4:
bear
chased
little
squirrel
# Adjective 3:
angry
frightened
little
# Verb 1:
bear
# Adverb 1:
little
```
Just the nouns, brief output:
```bash
$ wordpos get --noun -b The angry bear chased the frightened little squirrel
bear chased little squirrel
```
Just the counts: (nouns, adjectives, verbs, adverbs, total parsed words)
```bash
$ wordpos get -c The angry bear chased the frightened little squirrel
4 3 1 1 7
```
Just the adjective count: (0, adjectives, 0, 0, total parsed words)
```bash
$ wordpos get --adj -c The angry bear chased the frightened little squirrel
0 3 0 0 7
```
Get definitions:
```bash
$ wordpos def git
git
n: a person who is deemed to be despicable or contemptible; "only a rotter would do that"; "kill the rat"; "throw the bum out"; "you cowardly little pukes!"; "the British call a contemptible person a `git'"
```
Get full result object:
```bash
$ wordpos def git -f
{ git:
[ { synsetOffset: 10539715,
lexFilenum: 18,
pos: 'n',
wCnt: 0,
lemma: 'rotter',
synonyms: [],
lexId: '0',
ptrs: [],
gloss: 'a person who is deemed to be despicable or contemptible; "only a rotter would do that
"; "kill the rat"; "throw the bum out"; "you cowardly little pukes!"; "the British call a contemptib
le person a `git\'" ' } ] }
```
As JSON:
```bash
$ wordpos def git -j
{"git":[{"synsetOffset":10539715,"lexFilenum":18,"pos":"n","wCnt":0,"lemma":"rotter","synonyms":[],"
lexId":"0","ptrs":[],"gloss":"a person who is deemed to be despicable or contemptible; \"only a rotter
would do that\"; \"kill the rat\"; \"throw the bum out\"; \"you cowardly little pukes!\"; \"the British
call a contemptible person a `git'\" "}]}
```
Get random words:
```bash
$ wordpos rand
# 1:
hopelessly
$ wordpos rand -N 2 foot
# foot 2:
footprint
footlights
$ wordpos rand -N 2 foot hand
# foot 2:
footlocker
footmark
# hand 2:
hand-hewn
handstitched
$ wordpos rand --adj foot
# foot 1:
foot-shaped
```
Usage: Usage:
```bash ```bash
$ wordpos $ wordpos
Usage: wordpos-cli.js [options] <command> [word ... | -i <file> | <stdin>] Usage: wordpos [options] <command> [word ... | -i <file> | <stdin>]
Commands: Commands:
@ -404,9 +311,11 @@ $ wordpos
def lookup definitions def lookup definitions
rand get random words (optionally starting with 'word' ...)
parse show parsed words, deduped and less stopwords parse show parsed words, deduped and less stopwords
rand get random words (optionally starting with 'word' ...) stopwords show list of stopwords (valid options are -b and -j)
Options: Options:
@ -421,10 +330,12 @@ $ wordpos
-f, --full full results object -f, --full full results object
-j, --json full results object as JSON -j, --json full results object as JSON
-i, --file <file> input file -i, --file <file> input file
-s, --stopwords include stopwords -s, --withStopwords include stopwords (default: stopwords are excluded)
-N, --num <num> number of random words to get -N, --num <num> number of random words to get
``` ```
For CLI examples, see [bin/README](bin/README.md).
## Benchmark ## Benchmark
node wordpos-bench.js node wordpos-bench.js
@ -452,6 +363,21 @@ done in 1375 msecs
220 words are looked-up (less stopwords and duplicates) on a win7/64-bit/dual-core/3GHz. getPOS() is slowest as it searches through all four index files. 220 words are looked-up (less stopwords and duplicates) on a win7/64-bit/dual-core/3GHz. getPOS() is slowest as it searches through all four index files.
## Changes
v0.1.11
- fix stopwords not getting excluded when running with CLI
- added 'stopwords' CLI *command* to show list of stopwords
- CLI *option* --stopwords now renamed to --withStopwords
v0.1.10
- rand functionality added
v0.1.6
- added command line tool
v0.1.4
- added fast index
License License
------- -------

136
bin/README.md Normal file
View File

@ -0,0 +1,136 @@
wordpos
=======
## Command-line: CLI
Version 0.1.6 introduces the command-line interface (./bin/wordpos-cli.js), available as 'wordpos' if installed globally
with "npm install wordpos -g", otherwise as 'node_modules/.bin/wordpos' if installed without the -g.
```bash
$ wordpos get The angry bear chased the frightened little squirrel
# Noun 4:
bear
chased
little
squirrel
# Adjective 3:
angry
frightened
little
# Verb 1:
bear
# Adverb 1:
little
```
Just the nouns, brief output:
```bash
$ wordpos get --noun -b The angry bear chased the frightened little squirrel
bear chased little squirrel
```
Just the counts: (nouns, adjectives, verbs, adverbs, total parsed words)
```bash
$ wordpos get -c The angry bear chased the frightened little squirrel
4 3 1 1 7
```
Just the adjective count: (0, adjectives, 0, 0, total parsed words)
```bash
$ wordpos get --adj -c The angry bear chased the frightened little squirrel
0 3 0 0 7
```
Get definitions:
```bash
$ wordpos def git
git
n: a person who is deemed to be despicable or contemptible; "only a rotter would do that"; "kill the rat"; "throw the bum out"; "you cowardly little pukes!"; "the British call a contemptible person a `git'"
```
Get full result object:
```bash
$ wordpos def git -f
{ git:
[ { synsetOffset: 10539715,
lexFilenum: 18,
pos: 'n',
wCnt: 0,
lemma: 'rotter',
synonyms: [],
lexId: '0',
ptrs: [],
gloss: 'a person who is deemed to be despicable or contemptible; "only a rotter would do that
"; "kill the rat"; "throw the bum out"; "you cowardly little pukes!"; "the British call a contemptib
le person a `git\'" ' } ] }
```
As JSON:
```bash
$ wordpos def git -j
{"git":[{"synsetOffset":10539715,"lexFilenum":18,"pos":"n","wCnt":0,"lemma":"rotter","synonyms":[],"
lexId":"0","ptrs":[],"gloss":"a person who is deemed to be despicable or contemptible; \"only a rotter
would do that\"; \"kill the rat\"; \"throw the bum out\"; \"you cowardly little pukes!\"; \"the British
call a contemptible person a `git'\" "}]}
```
Get random words:
```bash
$ wordpos rand
# 1:
hopelessly
$ wordpos rand -N 2 foot
# foot 2:
footprint
footlights
$ wordpos rand -N 2 foot hand
# foot 2:
footlocker
footmark
# hand 2:
hand-hewn
handstitched
$ wordpos rand --adj foot
# foot 1:
foot-shaped
$ wordpos stopwords -b
about after all also am an and another any are as at be because ...
```
## Usage:
```bash
$ wordpos
Usage: wordpos [options] <command> [word ... | -i <file> | <stdin>]
Commands:
get get list of words for particular POS
def lookup definitions
rand get random words (optionally starting with 'word' ...)
parse show parsed words, deduped and less stopwords
stopwords show list of stopwords (valid options are -b and -j)
Options:
-h, --help output usage information
-V, --version output the version number
-n, --noun Get nouns
-a, --adj Get adjectives
-v, --verb Get verbs
-r, --adv Get adverbs
-c, --count get counts only (noun, adj, verb, adv, total parsed words)
-b, --brief brief output (all on one line, no headers)
-f, --full full results object
-j, --json full results object as JSON
-i, --file <file> input file
-s, --withStopwords include stopwords (default: stopwords are excluded)
-N, --num <num> number of random words to get
```

View File

@ -34,7 +34,7 @@ program
.option('-f, --full', 'full results object') .option('-f, --full', 'full results object')
.option('-j, --json', 'full results object as JSON') .option('-j, --json', 'full results object as JSON')
.option('-i, --file <file>', 'input file') .option('-i, --file <file>', 'input file')
.option('-s, --stopwords', 'include stopwords') .option('-s, --withStopwords', 'include stopwords (default: stopwords are excluded)')
.option('-N, --num <num>', 'number of random words to return') .option('-N, --num <num>', 'number of random words to return')
; ;
@ -49,14 +49,26 @@ program.command('def')
exec.apply(this, arguments); exec.apply(this, arguments);
}); });
program.command('parse')
.description('show parsed words, deduped and less stopwords')
.action(exec);
program.command('rand') program.command('rand')
.description('get random words (starting with word, optionally)') .description('get random words (starting with <word>, optionally)')
.action(exec); .action(exec);
program.command('parse')
.description('show parsed words, deduped and less stopwords')
.action(exec);
program.command('stopwords')
.description('show list of stopwords (valid options are -b and -j)')
.action(function(){
cmd = _.last(arguments)._name;
var stopwords = WordPos.natural.stopwords;
if (program.json)
output(stopwords);
else
console.log(stopwords.join(program.brief ? ' ' : '\n'))
});
var var
WordPos = require('../src/wordpos'), WordPos = require('../src/wordpos'),
util = require('util'), util = require('util'),
@ -109,9 +121,9 @@ function optToFn() {
function run(data) { function run(data) {
var var
opts = {stopwords: !program.stopwords}, opts = {stopwords: !program.withStopwords},
wordpos = new WordPos(opts), wordpos = new WordPos(opts),
words = wordpos.parse(data.split(' ')), // make array words = wordpos.parse(data),
fns = optToFn(), fns = optToFn(),
plural = (cmd=='get' ? 's':''), plural = (cmd=='get' ? 's':''),
results = {}, results = {},
@ -143,7 +155,7 @@ function run(data) {
}); });
} else { } else {
words.forEach(function(word){ words.forEach(function(word){
wordpos[method](word, cb); wordpos [method](word, cb);
}); });
} }
}); });

View File

@ -3,7 +3,7 @@
"author": "Moos <mooster@42at.com>", "author": "Moos <mooster@42at.com>",
"keywords": ["natural", "language", "wordnet", "adjectives", "nouns", "adverbs", "verbs"], "keywords": ["natural", "language", "wordnet", "adjectives", "nouns", "adverbs", "verbs"],
"description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.", "description": "wordpos is a set of part-of-speech utilities for Node.js using natural's WordNet module.",
"version": "0.1.10", "version": "0.1.11",
"homepage": "https://github.com/moos/wordpos", "homepage": "https://github.com/moos/wordpos",
"engines": { "engines": {
"node": ">=0.6" "node": ">=0.6"