中文停用词提取器的 Node.js 类。它使用了中文停用词库,并且允许你自行训练和保存停用词。
const fs = require('fs');
const path = require('path');
const readline = require('readline');
class ChineseStopwordsExtractor {
  /**
   * Extracts content words from Chinese text by filtering out stopwords.
   * Stopwords are loaded synchronously at construction time so the instance
   * is ready to use immediately (the previous async load raced with the
   * first `extract()` call).
   *
   * @param {string} stopwordsFilePath - Path to a newline-separated stopword
   *   file. Defaults to `chinese_stopwords.txt` next to this module.
   */
  constructor(stopwordsFilePath = path.join(__dirname, 'chinese_stopwords.txt')) {
    this.stopwordsFilePath = stopwordsFilePath;
    this.stopwords = new Set();
    this.loadStopwords();
  }

  /**
   * Split text on whitespace plus common ASCII and full-width CJK punctuation.
   * Note: this is delimiter-based tokenization, not true Chinese word
   * segmentation — unpunctuated runs of characters stay as one token.
   *
   * @param {string} text
   * @returns {string[]} raw tokens (may include empty strings)
   */
  splitWords(text) {
    return text.trim().split(/[\s,,.。!!??;;::'‘’"“”()()]+/);
  }

  /**
   * Load stopwords from `this.stopwordsFilePath`, one word per line.
   * Synchronous on purpose: callers rely on stopwords being available as
   * soon as the constructor returns. A missing/unreadable file is logged
   * and leaves the current set unchanged (best-effort, as before).
   */
  loadStopwords() {
    try {
      const content = fs.readFileSync(this.stopwordsFilePath, 'utf8');
      for (const line of content.split(/\r?\n/)) {
        const word = line.trim();
        if (word.length > 0) {
          this.stopwords.add(word);
        }
      }
    } catch (err) {
      console.error(`Failed to load stopwords from ${this.stopwordsFilePath}: ${err}`);
    }
  }

  /**
   * Learn a stopword list from a corpus: count token frequencies line by
   * line, keep the `topN` most frequent multi-character tokens as the new
   * stopword set, and persist it via `saveStopwords()`.
   *
   * Backward compatible with the old fire-and-forget call style, but now
   * returns a Promise so callers can await completion. Stream errors (e.g.
   * a missing file) are logged and the promise resolves with the current
   * set — matching the original best-effort error handling, whose
   * try/catch could never see async stream errors.
   *
   * @param {string} textFilePath - Corpus file to learn from.
   * @param {number} [topN=100] - How many most-frequent words to keep.
   * @returns {Promise<Set<string>>} the resulting stopword set.
   */
  train(textFilePath, topN = 100) {
    return new Promise((resolve) => {
      const fileStream = fs.createReadStream(textFilePath);
      const rl = readline.createInterface({
        input: fileStream,
        crlfDelay: Infinity
      });
      const wordCount = new Map();

      // Stream errors arrive asynchronously; a try/catch around this
      // method body would never observe them.
      fileStream.on('error', (err) => {
        console.error(`Failed to train stopwords from ${textFilePath}: ${err}`);
        rl.close();
        resolve(this.stopwords);
      });

      rl.on('line', (line) => {
        for (const word of this.splitWords(line)) {
          // Single characters are never counted, mirroring extract().
          if (word.length > 1) {
            wordCount.set(word, (wordCount.get(word) || 0) + 1);
          }
        }
      });

      rl.on('close', () => {
        const sortedWords = [...wordCount.entries()].sort((a, b) => b[1] - a[1]);
        this.stopwords = new Set(sortedWords.slice(0, topN).map(([word]) => word));
        this.saveStopwords();
        resolve(this.stopwords);
      });
    });
  }

  /**
   * Persist the current stopword set to `this.stopwordsFilePath`,
   * one word per line. Failures are logged, not thrown (best-effort).
   */
  saveStopwords() {
    try {
      fs.writeFileSync(this.stopwordsFilePath, [...this.stopwords].join('\n'), 'utf8');
    } catch (err) {
      console.error(`Failed to save stopwords to ${this.stopwordsFilePath}: ${err}`);
    }
  }

  /**
   * Return the tokens of `text` that are longer than one character and are
   * not stopwords.
   *
   * @param {string} text
   * @returns {string[]}
   */
  extract(text) {
    return this.splitWords(text).filter(
      (word) => word.length > 1 && !this.stopwords.has(word)
    );
  }
}
77
module.exports = ChineseStopwordsExtractor;
这个类有三个主要方法:

- loadStopwords():从指定的文件路径加载中文停用词。
- train(textFilePath):用指定文件中的文本训练停用词,并将出现频率最高的前 100 个词保存为新的停用词表。
- extract(text):从给定的文本中提取不包含停用词的词语。
你可以使用以下代码来使用这个类:

const ChineseStopwordsExtractor = require('./ChineseStopwordsExtractor');

const extractor = new ChineseStopwordsExtractor();
const text = '这是一段测试文本,包含了一些常见的中文停用词,例如“的”、“了”、“是”等等。';
console.log(extractor.extract(text)); // 输出不在停用词表中、且长度大于 1 的词语
extractor.train('./text.txt'); // 从文本文件中训练停用词
console.log(extractor.extract(text)); // 训练后,输出结果取决于新生成的停用词表
在这个例子中,我们首先创建了一个新的 ChineseStopwordsExtractor 实例,并用一段测试文本调用了 extract() 方法来提取不包含停用词的词语。然后我们调用了 train() 方法从文本文件中训练新的停用词,再次调用 extract() 方法时,提取结果将基于新训练出的停用词表。