Jieba Segmentation: Removing Stopwords

My current project needs jieba to segment some data, and stopwords have to be removed in the process. Here is the code:

import jieba
import os


def fun(filepath):  # walk the folder and return a list of all file paths
    arr = []
    for root, dirs, files in os.walk(filepath):
        for fn in files:
            arr.append(os.path.join(root, fn))
    return arr

# Build the stopword list
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = set(line.strip() for line in f)  # a set makes the membership tests below O(1)
    return stopwords

# Remove stopwords from a segmented sentence
def movestopwords(sentence):
    stopwords = stopwordslist('D:/2181729/stop_words.txt')  # path of the stopword file
    santi_words = [x for x in sentence if len(x) > 1 and x not in stopwords]
    return santi_words

def segmentor(text):
    words = jieba.cut(text, cut_all=False)  # precise mode; returns a generator of tokens
    return words



stopwords = stopwordslist('D:/2181729/stop_words.txt')

filepath = r'D:/2181729/data'
filelist = fun(filepath)  # get the list of input files
count = 0
print(len(filelist))
os.makedirs('D:/2181729/nerfcdata', exist_ok=True)  # make sure the output folder exists
for file in filelist:
    words = []  # filtered tokens for the current file only
    with open(file, encoding='UTF-8') as f:
        for line in f:
            segs = jieba.cut(line, cut_all=False)
            for seg in segs:
                if seg not in stopwords:
                    words.append(seg)
    count += 1
    output = '/'.join(words)
    outpath = 'D:/2181729/nerfcdata/' + os.path.basename(file)  # keep the original file name
    with open(outpath, 'w', encoding='UTF-8') as f1:
        print(output)
        f1.write(output)
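
As a quick sanity check, the helper functions defined above can be exercised on a single sentence (the main loop inlines the same logic, so segmentor and movestopwords are otherwise unused). This is a minimal sketch: the sample sentence is arbitrary, and it assumes the same stop_words.txt path as the script:

# Minimal sketch: segment one sentence, then filter it.
sentence = '结巴分词是一个优秀的中文分词工具'  # arbitrary sample sentence
tokens = segmentor(sentence)        # generator of tokens from jieba.cut
filtered = movestopwords(tokens)    # drops stopwords and single-character tokens
print('/'.join(filtered))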



