from collections import Counter
import jieba
# jieba.load_userdict('userdict.txt')
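# If you enable the user dictionary above, jieba expects one entry per line
# in the form "word [freq] [pos]", e.g. "云计算 5 n" (freq and pos are optional).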
# Build the stop-word set
def stopwordslist(filepath):
    # A set gives O(1) membership tests during filtering
    with open(filepath, 'r', encoding='utf-8') as f:
        return set(line.strip() for line in f)
# Segment a sentence and filter out stop words
def seg_sentence(sentence):
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('stop_words.txt')  # path to the stop-word file
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr
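
# A quick sanity check (hypothetical sentence; the exact output depends on
# what stop_words.txt contains, so this is only a sketch):
# print(seg_sentence('我来到北京清华大学'))  # e.g. "来到 北京 清华大学 "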
# Segment the input file line by line and write the result
with open('wordsbag2.txt', 'r', encoding='utf-8') as inputs, \
        open('result2.txt', 'w', encoding='utf-8') as outputs:
    for line in inputs:
        line_seg = seg_sentence(line)  # seg_sentence returns a string
        outputs.write(line_seg)
# WordCount
with open('result2.txt', 'r', encoding='utf-8') as fr:  # the stop-word-filtered file
    # The file is already segmented and space-delimited, so splitting on
    # whitespace is enough; re-running jieba.cut here would also count the spaces
    data = Counter(fr.read().split())
with open('wordcount2.txt', 'w', encoding='utf-8') as fw:  # where the word counts go
    for k, v in data.items():
        fw.write('%s,%d\n' % (k, v))
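
# Optional: Counter.most_common() yields (word, count) pairs sorted by
# descending frequency; replacing the items() loop above with the lines
# below writes the most frequent words first:
# for k, v in data.most_common():
#     fw.write('%s,%d\n' % (k, v))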