中文停用词提取器的 Node.js 类。它使用了中文停用词库,并且允许你自行训练和保存停用词。
const fs = require('fs');
const path = require('path');
const readline = require('readline');
class ChineseStopwordsExtractor {
  /**
   * Extracts content words from Chinese text by filtering out stopwords.
   * Stopwords are loaded synchronously at construction time so the instance
   * is ready to use immediately (the previous async load raced with the
   * first `extract()` call).
   *
   * @param {string} stopwordsFilePath - Path to a newline-separated stopword
   *   file. Defaults to `chinese_stopwords.txt` next to this module.
   */
  constructor(stopwordsFilePath = path.join(__dirname, 'chinese_stopwords.txt')) {
    this.stopwordsFilePath = stopwordsFilePath;
    this.stopwords = new Set();
    this.loadStopwords();
  }

  /**
   * Split text on whitespace plus common ASCII and full-width CJK punctuation.
   * Note: this is delimiter-based tokenization, not true Chinese word
   * segmentation — unpunctuated runs of characters stay as one token.
   *
   * @param {string} text
   * @returns {string[]} raw tokens (may include empty strings)
   */
  splitWords(text) {
    return text.trim().split(/[\s,,.。!!??;;::'‘’"“”()()]+/);
  }

  /**
   * Load stopwords from `this.stopwordsFilePath`, one word per line.
   * Synchronous on purpose: callers rely on stopwords being available as
   * soon as the constructor returns. A missing/unreadable file is logged
   * and leaves the current set unchanged (best-effort, as before).
   */
  loadStopwords() {
    try {
      const content = fs.readFileSync(this.stopwordsFilePath, 'utf8');
      for (const line of content.split(/\r?\n/)) {
        const word = line.trim();
        if (word.length > 0) {
          this.stopwords.add(word);
        }
      }
    } catch (err) {
      console.error(`Failed to load stopwords from ${this.stopwordsFilePath}: ${err}`);
    }
  }

  /**
   * Learn a stopword list from a corpus: count token frequencies line by
   * line, keep the `topN` most frequent multi-character tokens as the new
   * stopword set, and persist it via `saveStopwords()`.
   *
   * Backward compatible with the old fire-and-forget call style, but now
   * returns a Promise so callers can await completion. Stream errors (e.g.
   * a missing file) are logged and the promise resolves with the current
   * set — matching the original best-effort error handling, whose
   * try/catch could never see async stream errors.
   *
   * @param {string} textFilePath - Corpus file to learn from.
   * @param {number} [topN=100] - How many most-frequent words to keep.
   * @returns {Promise<Set<string>>} the resulting stopword set.
   */
  train(textFilePath, topN = 100) {
    return new Promise((resolve) => {
      const fileStream = fs.createReadStream(textFilePath);
      const rl = readline.createInterface({
        input: fileStream,
        crlfDelay: Infinity
      });
      const wordCount = new Map();

      // Stream errors arrive asynchronously; a try/catch around this
      // method body would never observe them.
      fileStream.on('error', (err) => {
        console.error(`Failed to train stopwords from ${textFilePath}: ${err}`);
        rl.close();
        resolve(this.stopwords);
      });

      rl.on('line', (line) => {
        for (const word of this.splitWords(line)) {
          // Single characters are never counted, mirroring extract().
          if (word.length > 1) {
            wordCount.set(word, (wordCount.get(word) || 0) + 1);
          }
        }
      });

      rl.on('close', () => {
        const sortedWords = [...wordCount.entries()].sort((a, b) => b[1] - a[1]);
        this.stopwords = new Set(sortedWords.slice(0, topN).map(([word]) => word));
        this.saveStopwords();
        resolve(this.stopwords);
      });
    });
  }

  /**
   * Persist the current stopword set to `this.stopwordsFilePath`,
   * one word per line. Failures are logged, not thrown (best-effort).
   */
  saveStopwords() {
    try {
      fs.writeFileSync(this.stopwordsFilePath, [...this.stopwords].join('\n'), 'utf8');
    } catch (err) {
      console.error(`Failed to save stopwords to ${this.stopwordsFilePath}: ${err}`);
    }
  }

  /**
   * Return the tokens of `text` that are longer than one character and are
   * not stopwords.
   *
   * @param {string} text
   * @returns {string[]}
   */
  extract(text) {
    return this.splitWords(text).filter(
      (word) => word.length > 1 && !this.stopwords.has(word)
    );
  }
}
77
module.exports = ChineseStopwordsExtractor;
这个类有三个主要方法:

- loadStopwords():从指定的文件路径加载中文停用词。
- train(textFilePath):用指定文件中的文本训练停用词,并将出现频率最高的前 100 个词保存为新的停用词表。
- extract(text):从给定的文本中提取不包含停用词的词语。
你可以使用以下代码来使用这个类:

const ChineseStopwordsExtractor = require('./ChineseStopwordsExtractor');

const extractor = new ChineseStopwordsExtractor();
const text = '这是一段测试文本,包含了一些常见的中文停用词,例如“的”、“了”、“是”等等。';
console.log(extractor.extract(text)); // 输出不在停用词表中、且长度大于 1 的词语
extractor.train('./text.txt'); // 从文本文件中训练停用词
console.log(extractor.extract(text)); // 训练后,输出结果取决于新生成的停用词表
在这个例子中,我们首先创建了一个新的 ChineseStopwordsExtractor 实例,并用一段测试文本调用了 extract() 方法来提取不包含停用词的词语。然后我们调用了 train() 方法从文本文件中训练新的停用词,再次调用 extract() 方法时,提取结果将基于新训练出的停用词表。