# Text segmentation with stopword removal: a concrete implementation of
# Chinese tokenization. The stopword list is a merged, de-duplicated union
# of common lists (Baidu, Harbin Institute of Technology, etc.).
import csv
import datetime
import re

import jieba
import jieba.posseg as psg
import numpy as np
import pandas as pd
# Stopword list loader
def stopwordslist(filepath):
    """Load a stopword file (one word per line, UTF-8) and return the
    unique entries.

    Args:
        filepath: path to the stopword list file.

    Returns:
        De-duplicated list of stripped stopwords; order is unspecified,
        matching the original ``list(set(...))`` behavior.
    """
    # "with" guarantees the file handle is closed (original left it open)
    with open(filepath, 'r', encoding='utf-8') as f:
        return list({line.strip() for line in f})
# Tokenize and remove stopwords
def seg_sentence(sentence,
                 stopwords_path=r"C:\Users\lenovo\Desktop\fin_data\NLP_code\wordtoremove.txt"):
    """Tokenize *sentence* with jieba and drop stopwords.

    Args:
        sentence: raw text to segment.
        stopwords_path: path to the stopword file, one word per line
            (defaults to the previously hard-coded location, so existing
            callers are unaffected).

    Returns:
        Kept tokens separated by single spaces, with a trailing space
        when at least one token survives (byte-compatible with the
        original output format).
    """
    # Cache loaded stopword sets on the function object: the original
    # re-read and re-parsed the file on every call.  A set also gives
    # O(1) membership tests instead of O(n) list scans.
    cache = seg_sentence.__dict__.setdefault('_stopword_cache', {})
    if stopwords_path not in cache:
        cache[stopwords_path] = set(stopwordslist(stopwords_path))
    stopwords = cache[stopwords_path]

    tokens = jieba.cut(sentence.strip())
    kept = [w for w in tokens if w not in stopwords and w != '\t']
    # join instead of repeated += (avoids quadratic string building)
    return ''.join(w + ' ' for w in kept)
# Tokenize, POS-tag, and keep only nouns and verbs
def seg_sentence_return_vn(sentence):
    """POS-tag *sentence* with jieba.posseg and keep only tokens whose
    flag is exactly ``'n'`` (noun) or ``'v'`` (verb).

    NOTE(review): compound flags such as ``'ns'`` or ``'vn'`` are NOT
    matched — this preserves the original exact-match behavior.

    Args:
        sentence: raw text to segment and tag.

    Returns:
        Kept words separated by single spaces, with a trailing space
        when at least one word survives (matches the original format).
    """
    # ``psg`` was referenced here but never imported (NameError at
    # runtime); ``import jieba.posseg as psg`` is now in the import block.
    kept = [pair.word for pair in psg.cut(sentence.strip())
            if pair.flag in ('n', 'v')]
    # join instead of repeated += (avoids quadratic string building)
    return ''.join(w + ' ' for w in kept)
if __name__ == '__main__':
    # Read the sentiment-annotated Zhihu export and register the custom
    # user dictionary before segmenting.
    source_csv = 'zhihu_data_setiment.csv'
    df = pd.read_csv(source_csv, header=0, index_col=False,
                     engine='python', encoding='utf-8-sig')
    jieba.load_userdict('personal_dic.txt')

    for row in range(len(df)):
        # Column 14 holds the answer text; columns 20/21 receive the
        # plain segmentation and the noun/verb-only segmentation.
        text = str(df.iloc[row, 14])
        df.iloc[row, 20] = seg_sentence(text)
        df.iloc[row, 21] = seg_sentence_return_vn(text)
        # Progress indicator: print the row index every 500 rows.
        if row % 500 == 0:
            print(row, end=' ')

    # Write the segmented result back out.
    df.to_csv("zhihu_data_jieba_seg.csv", header=True, index=False,
              encoding='utf-8-sig')