[Python] jieba word segmentation: adding a custom dictionary, removing stopwords and single characters (Python, 2020.2.10)

The source code is as follows:

import jieba
import io
import re

#jieba.load_userdict("E:/xinxi2.txt")
patton=re.compile(r'..')  # matches any two characters; findall() is non-empty only for tokens of 2+ characters, used to drop single characters

# Load the custom dictionary
def add_dict():
    f=open("E:/xinxi2.txt","r",encoding="utf-8")  # dictionary crawled from Baidu
    for line in f:
        jieba.suggest_freq(line.rstrip("\n"), True)
    f.close()

# Segment each line of the input file
def cut():
    number=0
    f=open("E:/luntan.txt","r",encoding="utf-8")   # content to process: crawled CSDN forum titles
    for line in f:
        line=seg_sentence(line.rstrip("\n"))       # remove stopwords first
        seg_list=jieba.cut(line)
        for i in seg_list:
            print(i)  # print each token
            m=patton.findall(i)
            #print(len(m))  # print token length
            if len(m)!=0:                          # keep only tokens with two or more characters
                write(i.strip()+" ")
        line=line.rstrip().lstrip()
        print(len(line))  # print sentence length
        if len(line)>1:
            write("\n")
        number+=1
        print("Processed",number,"lines")
    f.close()

# Write segmented tokens to the output file
def write(contents):
    f=open("E://luntan_cut2.txt","a+",encoding="utf-8")  # output file
    f.write(contents)
    #print("Write succeeded!")
    f.close()

# Load the stopword list (one word per line)
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords

# Remove stopwords from a sentence
def seg_sentence(sentence):
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('E://stop.txt')  # path to the stopword file
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                #outstr += " "
    return outstr

# Remove stopwords line by line from an already segmented file (not called in main)
def cut_all():
    inputs = open('E://luntan_cut.txt', 'r', encoding='utf-8')
    outputs = open('E://luntan_stop.txt', 'a', encoding='utf-8')
    for line in inputs:
        line_seg = seg_sentence(line)  # returns a string
        outputs.write(line_seg + '\n')
    outputs.close()
    inputs.close()

if __name__=="__main__":
    add_dict()
    cut()
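
For reference, here is a minimal sketch (not part of the original script) that runs the same three steps on a single in-memory sentence; the sentence, the custom phrase, and the tiny stopword set are invented examples rather than the contents of the files above:

import re
import jieba

pattern = re.compile(r'..')                  # same two-character filter as patton above
stopwords = {"的", "了", "吗"}               # tiny example stopword set (normally loaded from stop.txt)

jieba.suggest_freq("jieba分词", True)        # boost a custom phrase, as add_dict() does for each dictionary line

kept = []
for word in jieba.cut("python的jieba分词好用吗"):
    if word in stopwords or word == '\t':    # drop stopwords, as seg_sentence() does
        continue
    if pattern.findall(word):                # keep only tokens of two or more characters, as cut() does
        kept.append(word)
print(" ".join(kept))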

Source of luntan.txt: https://www.cnblogs.com/zlc364624/p/12285055.html

The stopword list can be downloaded by searching Baidu, or you can create a txt file yourself and add the words one per line, as sketched below.
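
If you build the file yourself, the format stopwordslist() expects is simply one word per line in UTF-8; a minimal sketch (the word list is just an example):

# Minimal sketch: create a small stop.txt in the format stopwordslist() expects,
# one word per line, UTF-8 encoded. The words below are just common examples.
example_stopwords = ["的", "了", "是", "在", "吗", "请问"]
with open("E://stop.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(example_stopwords))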

The dictionary crawled from Baidu can be found in an earlier post: https://www.cnblogs.com/zlc364624/p/12289008.html
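
That dictionary file is assumed to hold one phrase per line; the effect of each suggest_freq call made by add_dict() can be checked with a quick sketch like this (the phrase here is an invented example):

# Minimal sketch (invented example phrase): add_dict() calls jieba.suggest_freq(phrase, True)
# for each line of xinxi2.txt, which raises the phrase's frequency so jieba keeps it whole.
import jieba

text = "自然语言处理很有趣"
print(list(jieba.cut(text)))            # without the entry, the phrase may be split into smaller words
jieba.suggest_freq("自然语言处理", True)
print(list(jieba.cut(text)))            # the phrase should now come out as one token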

After running, the segmented result (stopwords and single characters removed) is written to E://luntan_cut2.txt, one line per title.

 

