Data preprocessing with jieba: tokenization, filtering stopwords and punctuation, and extracting word frequencies and keywords

First, clean up the stopword list: drop empty lines and strip the whitespace from both ends of each entry.

#encoding=utf-8
filename = "stop_words.txt"

# Read the raw stopword file, skipping blank lines and
# stripping whitespace from both ends of each entry.
result = []
with open(filename, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        result.append(line)

# Write the cleaned stopwords back out, one per line.
with open("stop_words2.txt", "w", encoding="utf-8") as fw:
    for word in result:
        fw.write(word + "\n")
print("end")

Next, tokenize the corpus and filter out stopwords (including punctuation).

#encoding=utf-8
import jieba

filename = "../data/1000页洗好2.txt"
stopwords_file = "../data/stop_words2.txt"

# Load the cleaned stopword list; a set makes the
# membership test below O(1) instead of O(n).
stop_words = set()
with open(stopwords_file, "r", encoding="utf-8") as stop_f:
    for line in stop_f:
        line = line.strip()
        if not line:
            continue
        stop_words.add(line)

print(len(stop_words))

# Segment each line with jieba in accurate mode, then drop
# stopwords (the list also covers punctuation) and tabs.
result = []
with open(filename, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        outstr = ""
        for word in jieba.cut(line, cut_all=False):
            if word not in stop_words and word != "\t":
                outstr += word + " "
        result.append(outstr.strip())

with open("../data/test2.txt","w",encoding='utf-8') as fw:
    for sentence in result:
        sentence.encode('utf-8'
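
With the segmented text on disk, word frequencies fall out of a simple counter. Below is a minimal sketch using the standard library's collections.Counter, assuming the test2.txt file written in the previous step.

#encoding=utf-8
from collections import Counter

# Count word frequencies over the segmented output from the
# previous step; each line is already space-separated words.
counter = Counter()
with open("../data/test2.txt", "r", encoding="utf-8") as f:
    for line in f:
        counter.update(line.split())

# Show the 20 most frequent words with their counts.
for word, freq in counter.most_common(20):
    print(word, freq)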
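
For keyword extraction, jieba ships its own extractors in jieba.analyse. The following is a sketch, not the only approach, reusing the corpus and cleaned stopword paths from above; extract_tags ranks terms by TF-IDF, and textrank is a bundled alternative.

#encoding=utf-8
import jieba.analyse

# Point jieba.analyse at the same cleaned stopword list so the
# extractors ignore those words too.
jieba.analyse.set_stop_words("../data/stop_words2.txt")

with open("../data/1000页洗好2.txt", "r", encoding="utf-8") as f:
    text = f.read()

# TF-IDF keywords: the top 20 terms and their weights.
for word, weight in jieba.analyse.extract_tags(text, topK=20, withWeight=True):
    print(word, weight)

# TextRank is an alternative extractor bundled with jieba.
print(jieba.analyse.textrank(text, topK=20))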