一、环境搭建
sys (python系统库)
pickle(将特有的类型与python的数据类型进行转换的库)
re (正则表达式库)
tqdm(可扩展的进度条)
activate nlp,pip install tqdm --upgrade
二、聊天机器人语料处理流程介绍
1、语料收集:聊天记录、电影对话、台词片段
2、语料清洗:
清洗的内容:多余的空格,不正规的符号,多余的字符、英文
清洗的方法:正则化,切分,好坏语句判断
3、句子向量的编码化:
原始的文本不能直接训练
将句子转换成向量
将向量转换成句子
4、语料问答对的构建
5、语料模型的保存
三、数据处理
数据集 dgk_shooter_min.conv
创建extract_conv.py文件
# -*- coding:utf-8 -*-
import re
import pickle
import sys
from tqdm import tqdm
#########句子的构造和判断#############
def make_split(line):
    """Return the separator tokens to insert between two joined sentences.

    If the sentence already ends with a punctuation mark (e.g.
    '你好?我是recky...'), no separator is needed and an empty list is
    returned; otherwise a single ', ' token is used as the splitter.
    """
    text = ''.join(line)
    already_punctuated = re.match(r'.*([,···?!\.,!?])$', text)
    if already_punctuated:
        return []
    return [', ']
# 判断句子是不是有用的句子
def good_line(line):
    """Judge whether a sentence is usable training data.

    A line containing more than two ASCII letters/digits is considered
    noise (mostly-English or code-like text) and rejected.
    """
    ascii_chars = re.findall(r'[a-zA-Z0-9]', ''.join(line))
    return len(ascii_chars) <= 2
##########正则表达式###########
# 英文字符替换为中文字符 如 你好? 转换为 你好?
def regular(sen):
    """Normalise punctuation in a sentence.

    Collapses repeated marks and converts ASCII punctuation to its
    full-width Chinese equivalent (e.g. '你好?' -> '你好?').

    :param sen: sentence string to clean
    :return: cleaned sentence string
    """
    # 3-100 consecutive ASCII dots become a single ellipsis '···'
    sen = re.sub(r'\.{3,100}', '···', sen)
    # Collapse 2+ consecutive ellipses into one.
    # BUG FIX: the original pattern '···{2,100}' applied the quantifier to
    # the last '·' only; '(···){2,100}' repeats the whole ellipsis, matching
    # the stated intent ("'···' appearing 2-100 times").
    sen = re.sub(r'(···){2,100}', '···', sen)
    # Squeeze runs of each mark and map to the full-width form
    sen = re.sub(r'[,]{1,100}', ',', sen)
    sen = re.sub(r'[\.]{1,100}', '。', sen)
    sen = re.sub(r'[\?]{1,100}', '?', sen)
    sen = re.sub(r'[!]{1,100}', '!', sen)
    return sen
# 主函数,句子最长20
def main(limit=20, x_limit=3, y_limit=6):
    """Extract Q/A pairs from dgk_shooter_min.conv and pickle them.

    :param limit: exclusive upper bound on sentence length (in chars)
    :param x_limit: minimum length of a question sentence
    :param y_limit: minimum length of an answer sentence

    Side effects: writes 'chatbot.pkl' (the (x, y) pairs) and 'ws.pkl'
    (the fitted WordSequence vocabulary) to the working directory.
    """
    # Sentence <-> vector codec; see word_sequence.py
    from word_sequence import WordSequence

    print('extract lines')
    groups = []  # all dialogues; each dialogue is a list of sentences
    group = []   # sentences of the dialogue currently being read
    # FIX: use a context manager so the input file handle is always closed
    # (the original opened it and never closed it).
    with open('dgk_shooter_min.conv', 'r', errors='ignore', encoding='utf-8') as fp:
        for line in tqdm(fp):  # tqdm shows a progress bar
            if line.startswith('M '):  # 'M ' lines carry a sentence
                line = line.replace('\n', '')  # strip the newline
                if '/' in line:  # characters are '/'-separated
                    line = line[2:].split('/')
                else:
                    line = list(line[2:])
                line = line[:-1]  # drop the trailing empty piece / last char
                group.append(list(regular(''.join(line))))
            else:
                # any non-'M' line ends the current dialogue
                if group:
                    groups.append(group)
                    group = []
        if group:  # flush the final dialogue
            groups.append(group)
            group = []
    print('extract group')

    # ---------- build question (x) / answer (y) pairs ----------
    x_data = []
    y_data = []
    for group in tqdm(groups):
        for i, line in enumerate(group):
            last_line = None  # previous sentence, if usable
            if i > 0:
                last_line = group[i - 1]
                if not good_line(last_line):  # discard noisy sentences
                    last_line = None
            next_line = None  # following sentence
            if i < len(group) - 1:
                next_line = group[i + 1]
                if not good_line(next_line):
                    next_line = None
            next_next_line = None  # sentence after the next one
            if i < len(group) - 2:
                next_next_line = group[i + 2]
                if not good_line(next_next_line):
                    next_next_line = None

            # Emit up to three pairs around position i:
            if next_line:  # (current, next)
                x_data.append(line)
                y_data.append(next_line)
            if last_line and next_line:  # (previous + current, next)
                x_data.append(last_line + make_split(last_line) + line)
                y_data.append(next_line)
            if next_line and next_next_line:  # (current, next + next-next)
                x_data.append(line)
                y_data.append(next_line + make_split(next_line) + next_next_line)

    print(len(x_data), len(y_data))
    # Preview the first 20 pairs
    for ask, answer in zip(x_data[:20], y_data[:20]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)

    # ---------- length-filter and dump the model data ----------
    data = list(zip(x_data, y_data))
    data = [
        (x, y) for x, y in data
        if limit > len(x) >= x_limit and limit > len(y) >= y_limit
    ]
    # NOTE: zip(*data) raises if every pair was filtered out — the corpus is
    # assumed to contain at least one pair within the length limits.
    x_data, y_data = zip(*data)

    ws_input = WordSequence()  # fit the vocabulary on both sides
    ws_input.fit(x_data + y_data)

    print('dump')
    # FIX: context managers so the pickle files are flushed and closed
    with open('chatbot.pkl', 'wb') as f:
        pickle.dump((x_data, y_data), f)
    with open('ws.pkl', 'wb') as f:
        pickle.dump(ws_input, f)
    print('done')


if __name__ == '__main__':
    main()
创建word_sequence.py
# -*- coding:utf-8 -*-
import numpy as np
############## 句子编码化处理(字典定义及转换)#############
#训练就是把句子转换为向量
class WordSequence(object):
    """Bidirectional word <-> index codec for encoding sentences as vectors.

    Fit once on a corpus to build the vocabulary, then use transform()
    to turn sentences into index arrays and inverse_transform() to turn
    index sequences back into words.
    """

    # Special tokens: padding filler, unknown word, start / end of sentence
    PAD_TAG = '<pad>'
    UNK_TAG = '<unk>'
    START_TAG = '<s>'
    END_TAG = '</S>'
    PAD = 0
    UNK = 1
    START = 2
    END = 3

    def __init__(self):
        # fited becomes True after fit(); most methods require it
        self.fited = False
        self.dict = {WordSequence.PAD_TAG: WordSequence.PAD, WordSequence.UNK_TAG: WordSequence.UNK,
                     WordSequence.START_TAG: WordSequence.START, WordSequence.END_TAG: WordSequence.END}

    def to_index(self, word):
        """Return the index of `word`, or UNK for out-of-vocabulary words."""
        assert self.fited, 'WordSequence尚未进行fit操作'
        if word in self.dict:
            return self.dict[word]
        return WordSequence.UNK

    def to_word(self, index):
        """Return the word stored at `index`, or UNK_TAG if not found.

        Linear scan over the dict — acceptable for the small
        vocabularies this tutorial uses.
        """
        assert self.fited, 'WordSequence尚未进行fit操作'
        for k, v in self.dict.items():
            if v == index:
                return k
        return WordSequence.UNK_TAG

    def size(self):
        """Vocabulary size (+1 reserves an extra slot for padding)."""
        assert self.fited, 'WordSequence尚未进行fit操作'
        return len(self.dict) + 1

    def __len__(self):
        # Delegate so len(ws) matches ws.size()
        return self.size()

    def fit(self, sentences, min_count=5, max_count=None, max_features=None):
        """Build the vocabulary from an iterable of sentences.

        :param sentences: iterable of sentences (each a sequence of words)
        :param min_count: drop words seen fewer than this many times
        :param max_count: drop words seen more than this many times
        :param max_features: when an int, keep only the most frequent words
        :raises AssertionError: if fit() was already called
        """
        assert not self.fited, 'WordSequence只能fit一次'
        # Count word frequencies across the whole corpus
        count = {}
        for sentence in sentences:
            arr = list(sentence)
            for a in arr:
                if a not in count:
                    count[a] = 0
                count[a] += 1
        # Frequency filtering
        if min_count is not None:
            count = {k: v for k, v in count.items() if v >= min_count}
        if max_count is not None:
            count = {k: v for k, v in count.items() if v <= max_count}
        # Reset the dict to the four special tokens before filling it
        self.dict = {WordSequence.PAD_TAG: WordSequence.PAD, WordSequence.UNK_TAG: WordSequence.UNK,
                     WordSequence.START_TAG: WordSequence.START, WordSequence.END_TAG: WordSequence.END}
        if isinstance(max_features, int):
            # Keep the max_features most frequent words (sorted ascending,
            # take the tail), then assign indices in that order
            count = sorted(list(count.items()), key=lambda x: x[1])
            if max_features is not None and len(count) > max_features:
                count = count[-int(max_features):]
            for w, _ in count:
                self.dict[w] = len(self.dict)
        else:
            # No feature cap: assign indices in sorted-key order
            for w in sorted(count.keys()):
                self.dict[w] = len(self.dict)
        self.fited = True

    def transform(self, sentence, max_len=None):
        """Encode a sentence into a numpy array of indices.

        :param sentence: sequence of words
        :param max_len: if given, pad/truncate the output to this length
        :return: numpy int array of word indices
        """
        assert self.fited, 'WordSequence尚未进行fit操作'
        if max_len is not None:
            r = [self.PAD] * max_len   # pre-fill with padding
        else:
            r = [self.PAD] * len(sentence)
        for index, a in enumerate(sentence):
            if max_len is not None and index >= len(r):
                break  # truncate sentences longer than max_len
            r[index] = self.to_index(a)
        return np.array(r)

    def inverse_transform(self, indices, ignore_pad=False, ignore_unk=False, ignore_start=False, ignore_end=False):
        """Decode a sequence of indices back into a list of words.

        The ignore_* flags drop the corresponding special tokens from
        the output.
        """
        ret = []
        for i in indices:
            word = self.to_word(i)
            if word == WordSequence.PAD_TAG and ignore_pad:
                continue
            if word == WordSequence.UNK_TAG and ignore_unk:
                continue
            if word == WordSequence.START_TAG and ignore_start:
                continue
            if word == WordSequence.END_TAG and ignore_end:
                continue
            ret.append(word)
        return ret
def test():
    """Smoke test: fit a tiny vocabulary, encode a sentence, decode it back."""
    codec = WordSequence()
    codec.fit([['你', '好', '啊'], ['你', '好', '哦'], ])
    encoded = codec.transform(['我', '们', '好'])
    print(encoded)
    print(codec.inverse_transform(encoded))


if __name__ == '__main__':
    test()