1、导入包(jieba 为第三方库,需预先安装:pip install jieba)
import jieba
import re
2、数据预处理
数据格式:
["晚上想吃五花肉土豆盖浇饭",
"今晚吃鸡嘿咻嘿",
"绿皮环保小火车进站",
"一首《梦醒时分》送给大家"]
具体流程如下:
2.1 数据清洗
目的:清洗文本中的特殊符号
# Sample corpus for the walkthrough: one raw sentence per list entry.
sentence = [
    "晚上想吃五花肉土豆盖浇饭",
    "今晚吃鸡嘿咻嘿",
    "绿皮环保小火车进站",
    "一首《梦醒时分》送给大家",
]
# Compiled once at import time instead of on every call.
# NOTE: inside the character class, ``%-.`` is a *range* (U+0025..U+002E),
# so besides % - and . it also matches & ' ( ) * + and ,
_SPECIAL_CHARS_RE = re.compile(r"[0-9__~()《》___()、/,...,!。::;%-. 【】]")


def subReplace(lines):
    """Strip digits, punctuation and other special symbols from each line.

    Args:
        lines: Iterable of items to clean. Each item is converted with
            ``str()`` first, so non-string values are accepted.

    Returns:
        A new list with one cleaned string per input item, in input order.
    """
    return [_SPECIAL_CHARS_RE.sub('', str(line)) for line in lines]
# Demo: clean the sample sentences and show the result.
cleaned_sentences = subReplace(sentence)
print(cleaned_sentences)
输出:
['晚上想吃五花肉土豆盖浇饭', '今晚吃鸡嘿咻嘿', '绿皮环保小火车进站', '一首梦醒时分送给大家']
2.2 分词
说明:自定义停用词表与自定义分词词库可用 Notepad++ 编辑,保存时注意编码格式为 UTF-8;文件可放在如下目录:D:/PYTHON/PYTHON/Lib/site-packages/jieba/
def Cut_word(sentences_list, user_dict="my_dict_1.txt"):
    """Segment every sentence with jieba and upper-case each token.

    Args:
        sentences_list: Iterable of sentence strings to segment.
        user_dict: Path of a custom jieba dictionary (e.g. containing words
            such as 梦醒时分 or 吃鸡) that is loaded before segmenting.
            Defaults to the file name the original code hard-coded; pass
            ``None`` to skip loading a custom dictionary.

    Returns:
        A list with one token list per input sentence, in input order.
    """
    if user_dict is not None:
        jieba.load_userdict(user_dict)
    return [
        [word.upper() for word in jieba.cut(sentence)]
        for sentence in sentences_list
    ]
输出:
[['晚上', '想', '吃', '五花肉', '土豆', '盖浇饭'],
['今晚', '吃鸡', '嘿咻嘿'],
['绿皮', '环保', '小', '火车', '进站'],
['一首', '梦醒时分', '送给', '大家']]
2.3 去除停用词
def stop_words_list():
    """Load the custom stop words from ``my_stopword.txt``.

    The file name is relative, so it is resolved against the current working
    directory. One stop word is expected per line; surrounding whitespace is
    stripped and blank lines are ignored.

    Returns:
        A list of stop-word strings in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark.
    # Some editors write one when saving as "UTF-8"; with plain "utf-8" it
    # would stay attached to the first stop word, which then never matches.
    with open("my_stopword.txt", encoding="utf-8-sig") as file_obj:
        stripped_lines = (line.strip() for line in file_obj)
        return [word for word in stripped_lines if word]
def del_stop_words(word_list):
    """Remove whitespace-only tokens and stop words from segmented sentences.

    Args:
        word_list: Iterable of token lists (one list per sentence).

    Returns:
        A new list of token lists in the same order, where every token that
        is whitespace-only or appears in the stop-word list has been dropped.
    """
    # Build the set once: membership tests are then O(1) per token instead
    # of scanning the whole stop-word list for every token.
    stop_words = set(stop_words_list())
    return [
        [
            word
            for word in sentence
            if not word.isspace() and word not in stop_words
        ]
        for sentence in word_list
    ]
输出:
[['晚上', '吃', '五花肉', '土豆', '盖浇饭'],
['今晚', '吃鸡'],
['绿皮', '环保', '火车', '进站'],
['一首', '梦醒时分', '送给', '大家']]
2.4 同义词替换
说明:同义词词库可用 Notepad++ 编辑,保存时注意编码格式为 UTF-8;每一行为一组同义词,词与词之间用 TAB 键隔开,每行第一个词为替换词(该行其余词都会被替换成它)。文件可放在如下目录:D:/PYTHON/PYTHON/Lib/site-packages/jieba/
def Replace_syn(word_list):
    """Replace every token that has a known synonym by its canonical word.

    The synonym table is read from ``my_synonym.txt`` (a relative path,
    resolved against the current working directory). Each line holds one
    synonym group separated by TAB characters; the first word of the line is
    the replacement for all other words on that line.

    Args:
        word_list: Iterable of token lists (one list per sentence).

    Returns:
        A new list of token lists in the same order, with synonyms replaced
        and all other tokens left unchanged.

    Raises:
        OSError: If the synonym file cannot be opened.
    """
    # Step 1: build the lookup {synonym: canonical word}.
    synonym_dict = {}
    # "utf-8-sig" also accepts plain UTF-8 but drops a leading byte-order
    # mark; with plain "utf-8" a BOM would be glued to the first canonical
    # word and end up in every replacement made from the first line.
    with open("my_synonym.txt", encoding="utf-8-sig") as file_obj:
        for line in file_obj:
            # split() always yields at least one item, so this never fails;
            # a blank or single-word line simply contributes no mapping.
            canonical, *aliases = line.strip().split("\t")
            for alias in aliases:
                synonym_dict[alias] = canonical

    # Step 2: map each token through the table, keeping unknown tokens.
    return [
        [synonym_dict.get(word, word) for word in sentence]
        for sentence in word_list
    ]
输出:
[['今晚', '吃', '五花肉', '土豆', '盖浇饭'],
['今晚', '吃鸡'],
['绿皮', '环保', '火车', '进站'],
['一首', '梦醒时分', '送给', '大家']]