2018-10-15 05:20:56 +00:00
|
|
|
/**
 * util.js
 *
 * Copyright (c) 2012-2019 mooster@42at.com
 * https://github.com/moos/wordpos
 *
 * Released under MIT license
 */
|
|
|
|
|
2018-10-13 03:35:11 +00:00
|
|
|
let stopwords = require('../lib/natural/util/stopwords').words;
|
|
|
|
// Default stopword list in the space-delimited string form built by
// makeStopwordString(); prepText() falls back to it when options.stopwords
// is truthy but not itself a string.
let stopwordsStr = makeStopwordString(stopwords);
|
|
|
|
|
|
|
|
/**
 * Build the searchable string form of a stopword list.
 *
 * The words are joined with single spaces and the result is padded with one
 * leading and one trailing space, so that every word -- including the first
 * and last -- is surrounded by spaces and can be matched as ' word '.
 *
 * @param {string[]} stopwords - words to join
 * @returns {string} space-delimited, space-padded stopword string
 */
function makeStopwordString(stopwords) {
  return ` ${stopwords.join(' ')} `;
}
|
|
|
|
|
|
|
|
/**
 * Invoke an optional callback with the given argument list.
 *
 * Despite the name, nothing is deferred: when `fn` is truthy it is called
 * synchronously, with `this` set to null and `args` spread as its arguments.
 * A falsy `fn` is silently ignored.
 *
 * NOTE(review): an earlier comment here described setImmediate-based deferral
 * (running the callback after promise handlers so its exceptions are not
 * caught by a Promise). This function does not call setImmediate, so an
 * exception thrown by `fn` propagates directly to the caller of nextTick --
 * confirm that is what callers expect.
 *
 * @param {Function} [fn] - callback to invoke
 * @param {Array} [args] - arguments passed to the callback
 */
function nextTick(fn, args) {
  if (fn) {
    fn.apply(null, args);
  }
}
|
|
|
|
|
|
|
|
/**
 * Normalize a word or phrase: lower-case it and replace every run of
 * whitespace with a single underscore.
 *
 * @param {string} word - word or phrase to normalize
 * @returns {string} lower-cased text with whitespace runs replaced by '_'
 */
function normalize(word) {
  const lowered = word.toLowerCase();
  return lowered.replace(/\s+/g, '_');
}
|
|
|
|
|
2018-10-15 05:20:56 +00:00
|
|
|
/**
 * Test whether a word occurs in a stopword string.
 *
 * The word is wrapped in single spaces before searching, so it only matches
 * whole space-delimited entries (the format makeStopwordString produces),
 * never a fragment of a longer entry.
 *
 * @param {string} stopwordsStr - space-delimited, space-padded stopword string
 * @param {string} word - word to look for
 * @returns {boolean} true when ' word ' is found in stopwordsStr
 */
function isStopword(stopwordsStr, word) {
  return stopwordsStr.includes(` ${word} `);
}
|
|
|
|
|
|
|
|
/**
 * Split text into word tokens on runs of non-word characters (\W, i.e.
 * anything other than ASCII letters, digits and underscore).
 *
 * Splitting alone leaves an empty string at either end whenever the text
 * starts or ends with a separator (e.g. a trailing period), and a single
 * empty string for empty input. Those empty tokens are not words, so they
 * are dropped.
 *
 * @param {string} str - text to tokenize
 * @returns {string[]} non-empty tokens in order of appearance
 */
function tokenizer(str) {
  return str.split(/\W+/).filter((token) => token !== '');
}
|
|
|
|
|
|
|
|
/**
 * Remove duplicate values from an array, keeping the first occurrence of
 * each value in its original relative order.
 *
 * Uses a Set, so the whole pass is linear in the array length rather than
 * scanning the array again for every element.
 *
 * @param {Array} arr - values to de-duplicate
 * @returns {Array} new array of distinct values
 */
function uniq(arr) {
  return [...new Set(arr)];
}
|
|
|
|
|
|
|
|
/**
 * Report whether a value is a primitive string.
 *
 * String wrapper objects (`new String(...)`) have typeof 'object' and are
 * therefore not considered strings here.
 *
 * @param {*} s - value to test
 * @returns {boolean} true when typeof s is 'string'
 */
function isString(s) {
  return typeof s === 'string';
}
|
|
|
|
|
|
|
|
/**
 * Inverse of Array.prototype.filter: return the items for which the
 * predicate is falsy.
 *
 * The predicate is called with the item only (no index or array argument).
 *
 * @param {Array} arr - items to examine
 * @param {function(*): boolean} predicate - items it accepts are removed
 * @returns {Array} new array without the items matched by predicate
 */
function reject(arr, predicate) {
  return arr.filter((item) => !predicate(item));
}
|
|
|
|
|
|
|
|
/**
 * Prepare input text for lookup: tokenize, de-duplicate and optionally strip
 * stopwords.
 *
 * Must be called with `this` bound to an object exposing `options.stopwords`:
 *  - falsy:          stopwords are not removed;
 *  - a string:       used as the stopword string to search (isStopword looks
 *                    for each word surrounded by single spaces);
 *  - any other truthy value: the module's default stopword string is used.
 *
 * @param {string|string[]} text - raw text, or an array which is returned
 *   unchanged (no tokenizing, de-duplication or stopword filtering)
 * @returns {string[]} words to look up
 */
function prepText(text) {
  // Arrays are returned as-is, without any processing.
  if (Array.isArray(text)) return text;

  const deduped = uniq(tokenizer(text));

  const stopwordsOption = this.options.stopwords;
  if (!stopwordsOption) return deduped;

  // A string option overrides the built-in stopword list.
  const activeStopwords = isString(stopwordsOption) ? stopwordsOption : stopwordsStr;
  return reject(deduped, (word) => isStopword(activeStopwords, word));
}
|
|
|
|
|
2018-10-15 05:20:56 +00:00
|
|
|
module.exports = {
|
2018-10-21 03:51:37 +00:00
|
|
|
isString,
|
2018-10-15 05:20:56 +00:00
|
|
|
stopwords,
|
2018-10-13 03:35:11 +00:00
|
|
|
nextTick,
|
|
|
|
normalize,
|
|
|
|
tokenizer,
|
|
|
|
prepText,
|
|
|
|
makeStopwordString
|
2018-10-21 03:51:37 +00:00
|
|
|
};
|