文章目录
一 Seq2Seq实现闲聊机器人
1. 准备训练数据
单轮次
的聊天数据非常不好获取,所以从github上使用一些开放的数据集来训练闲聊机器人模型
数据地址:https://github.com/codemayq/chaotbot_corpus_Chinese
主要的数据有两个:
-
小黄鸡的聊天语料:噪声很大
-
微博的标题和评论:质量相对较高
2. 数据的处理和保存
由于数据中存到大量的噪声,可以对其进行基础的处理,然后分别把input和target使用两个文件保存,即input中的第N行为问,target的第N行为答
后续可能会把单个字作为特征(存放在input_word.txt),也可能会把词语作为特征(input.txt)
2.1 小黄鸡的语料的处理
def format_xiaohuangji_corpus(word=False):
"""处理小黄鸡的语料"""
if word:
corpus_path = "./chatbot/corpus/xiaohuangji50w_nofenci.conv"
input_path = "./chatbot/corpus/input_word.txt"
output_path = "./chatbot/corpus/output_word.txt"
else:
corpus_path = "./chatbot/corpus/xiaohuangji50w_nofenci.conv"
input_path = "./chatbot/corpus/input.txt"
output_path = "./chatbot/corpus/output.txt"
f_input = open(input_path,"a")
f_output = open(output_path,"a")
pair = []
for line in tqdm(open(corpus_path),ascii=True):
if line.strip() == "E":
if not pair:
continue
else:
assert len(pair) == 2,"长度必须是2"
if len(pair[0].strip())>=1 and len(pair[1].strip())>=1:
f_input.write(pair[0]+"\n")
f_output.write(pair[1]+"\n")
pair = []
elif line.startswith("M"):
line = line[1:]
if word:
pair.append(" ".join(list(line.strip())))
else:
pair.append(" ".join(jieba_cut(line.strip())))
2.2 微博语料的处理
def format_weibo(word=False):
"""
微博数据存在一些噪声,未处理
:return:
"""
if word:
origin_input = "./chatbot/corpus/stc_weibo_train_post"
input_path = "./chatbot/corpus/input_word.txt"
origin_output = "./chatbot/corpus/stc_weibo_train_response"
output_path = "./chatbot/corpus/output_word.txt"
else:
origin_input = "./chatbot/corpus/stc_weibo_train_post"
input_path = "./chatbot/corpus/input.txt"
origin_output = "./chatbot/corpus/stc_weibo_train_response"
output_path = "./chatbot/corpus/output.txt"
f_input = open(input_path,"a")
f_output = open(output_path, "a")
with open(origin_input) as in_o,open(origin_output) as out_o:
for _in,_out in tqdm(zip(in_o,out_o),ascii=True):
_in = _in.strip()
_out = _out.strip()
if _in.endswith(")") or _in.endswith("」") or _in.endswith(")"):
_in = re.sub("(.*)|「.*?」|\(.*?\)"," ",_in)
_in = re.sub("我在.*?alink|alink|(.*?\d+x\d+.*?)|#|】|【|-+|_+|via.*?:*.*"," ",_in)
_in = re.sub("\s+"," ",_in)
if len(_in)<1 or len(_out)<1:
continue
if word:
_in = re.sub("\s+","",_in) #转化为一整行,不含空格
_out = re.sub("\s+","",_out)
if len(_in)>=1 and len(_out)>=1:
f_input.write(" ".join(list(_in)) + "\n")
f_output.write(" ".join(list(_out)) + "\n")
else:
if len(_in) >= 1 and len(_out) >=