My current project needs to process some data with jieba word segmentation, removing stopwords along the way. The flow is: walk the data directory, load a stopword list, cut each line with jieba, drop the stopword tokens, and write the '/'-joined result to one output file per input file. Here is the code:
import jieba
import os
def fun(filepath):  # walk all files under the folder and return them as a list of paths
    arr = []
    for root, dirs, files in os.walk(filepath):
        for fn in files:
            arr.append(os.path.join(root, fn))  # join with the right separator instead of hard-coding "\\"
    return arr
# Build the stopword list
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:  # close the file when done
        stopwords = [line.strip() for line in f]
    return stopwords
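For reference, stop_words.txt is assumed here to hold one stopword per line (that is all `line.strip()` per line implies); the entries below are just examples:

的
了
是
、
。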
# Remove stopwords from a segmented sentence (expects an iterable of tokens)
def movestopwords(sentence):
    stopwords = stopwordslist('D:/2181729/stop_words.txt')  # path to the stopword file
    santi_words = [x for x in sentence if len(x) > 1 and x not in stopwords]  # also drops single-character tokens
    return santi_words
def segmentor(text):
    words = jieba.cut(text, cut_all=False)  # precise mode; returns a generator
    return words
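As a quick smoke test of the helpers above (the sample sentence is made up, and movestopwords still reads the stop_words.txt path hard-coded inside it, so that file must exist):

words = list(segmentor('今天天气不错'))
print(words)                 # raw jieba tokens
print(movestopwords(words))  # tokens minus stopwords and single characters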
#stopwords = {}.fromkeys(['的', '包括', '等', '是', '《', '》', '(', ')', '.', '、', '。'])
stopwords = stopwordslist('D:/2181729/stop_words.txt')
filepath = r'D:/2181729/data'
filelist = fun(filepath)  # get the list of data files
count = 0
print(len(filelist))
#f1 = open('D:/2181729/nerfcdata/1.txt', 'a+')
for file in filelist:
    text = ""  # reset per file; otherwise every output would accumulate all previous files
    with open(file, encoding='UTF-8') as f:
        for line in f:
            segs = jieba.cut(line, cut_all=False)
            for seg in segs:
                if seg not in stopwords and seg.strip():  # drop stopwords and whitespace tokens
                    text += seg
    words = segmentor(text)  # segment the filtered text again
    #print('/'.join(words))
    count += 1
    output = '/'.join(words)
    outpath = 'D:/2181729/nerfcdata/' + os.path.basename(file)  # keep the original file name; 'dir' would shadow the built-in
    with open(outpath, 'w', encoding='UTF-8') as f1:
        print(output)
        f1.write(output)
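If you just want to verify the cut-and-filter step in isolation, here is a minimal sketch with no file dependencies; the sample sentence and the tiny inline stopword set are made up, standing in for stop_words.txt:

import jieba

stopwords = {'的', '了', '是', '、', '。'}  # hypothetical stand-in for stop_words.txt
sentence = '结巴分词是一个好用的中文分词工具。'
kept = [w for w in jieba.cut(sentence, cut_all=False) if w.strip() and w not in stopwords]
print('/'.join(kept))  # the surviving tokens joined by '/'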