1. Introduction
A chatbot is essentially a computer program that simulates human conversation: it takes natural language as input and returns an appropriate reply.
By task type, chatbots can be divided into chitchat bots, question-answering (FAQ) bots, and task-oriented dialogue bots.
By how the answer is produced, they can be divided into retrieval-based bots and generative bots. This article implements a retrieval-based FAQ bot: a dual (bi-encoder) model recalls candidate questions from a FAQ library, and a cross-encoder then re-ranks the candidates to pick the final answer.
2. Hands-on implementation
2.1 Load the FAQ data
import pandas as pd
data = pd.read_csv('./law_faq.csv')
data.head()
data = data[:200]   # keep only the first 200 rows for a quick demo
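The later steps assume the CSV has a question column named title and the answer text in the second column (it is read back later as topk_result[:, 1]); a quick look at the layout:
print(data.columns.tolist())   # expected to include 'title' plus an answer column
print(data.shape)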
2.2 Load the dual (bi-encoder) model
from dual_model import DualModel
dual_model = DualModel.from_pretrained("../Model/chinese-macbert-base")
The DualModel class is defined as follows:
import torch
from typing import Optional
from transformers import BertPreTrainedModel, BertModel
from transformers.configuration_utils import PretrainedConfig
from torch.nn import CosineSimilarity, CosineEmbeddingLoss


class DualModel(BertPreTrainedModel):

    def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.bert = BertModel(config)
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Step 1: split the inputs of sentence A and sentence B
        # (input_ids etc. are expected to have shape [batch, 2, seq_len])
        senA_input_ids, senB_input_ids = input_ids[:, 0], input_ids[:, 1]
        senA_attention_mask, senB_attention_mask = attention_mask[:, 0], attention_mask[:, 1]
        senA_token_type_ids, senB_token_type_ids = token_type_ids[:, 0], token_type_ids[:, 1]

        # Step 2: encode sentence A and sentence B separately
        senA_outputs = self.bert(
            senA_input_ids,
            attention_mask=senA_attention_mask,
            token_type_ids=senA_token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        senA_pooled_output = senA_outputs[1]  # [batch, hidden]

        senB_outputs = self.bert(
            senB_input_ids,
            attention_mask=senB_attention_mask,
            token_type_ids=senB_token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        senB_pooled_output = senB_outputs[1]  # [batch, hidden]

        # Step 3: cosine similarity between the two sentence vectors
        cos = CosineSimilarity()(senA_pooled_output, senB_pooled_output)  # [batch, ]

        # Step 4: compute the loss (labels are 1 / -1, as expected by CosineEmbeddingLoss)
        loss = None
        if labels is not None:
            loss_fct = CosineEmbeddingLoss(0.3)
            loss = loss_fct(senA_pooled_output, senB_pooled_output, labels)

        output = (cos,)
        return ((loss,) + output) if loss is not None else output
dual_model = dual_model.cuda()
dual_model.eval()
print('Dual (matching) model loaded successfully')
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("../Model/chinese-macbert-base")
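Note that DualModel.forward slices input_ids[:, 0] and input_ids[:, 1], i.e. it expects every tensor to have shape [batch, 2, seq_len] with sentence A and sentence B stacked per example. The sketch below illustrates this layout with a forward pass; the example sentences and the stacking code are my own illustration, not part of the original tutorial:
import torch

sens_a = ["寻衅滋事罪怎么处罚?", "机动车交通事故如何赔偿?"]
sens_b = ["寻衅滋事罪的量刑标准是什么?", "离婚时财产如何分割?"]
labels = torch.tensor([1.0, -1.0])  # CosineEmbeddingLoss uses 1 / -1 labels

enc = tokenizer(sens_a + sens_b, return_tensors='pt',
                padding=True, max_length=64, truncation=True)
# reshape every field to [batch, 2, seq_len]: index 0 = sentence A, index 1 = sentence B
batch = {k: torch.stack([v[:len(sens_a)], v[len(sens_a):]], dim=1)
         for k, v in enc.items()}
batch = {k: v.cuda() for k, v in batch.items()}

with torch.inference_mode():
    loss, cos = dual_model(**batch, labels=labels.cuda())
print(cos)  # cosine similarity of each (A, B) pair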
2.3 Encode the FAQ questions into vectors
import torch
from tqdm import tqdm

questions = data['title'].to_list()
vectors = []
with torch.inference_mode():
    # encode the FAQ questions in batches of 32
    for i in tqdm(range(0, len(questions), 32)):
        batch_sens = questions[i:i + 32]
        inputs = tokenizer(
            batch_sens,
            return_tensors='pt',
            padding=True,
            max_length=64,
            truncation=True)
        inputs = {k: v.to('cuda') for k, v in inputs.items()}
        # use the pooler output of the dual model's BERT encoder as the sentence vector
        vector = dual_model.bert(**inputs)[1]
        vectors.append(vector)
vectors = torch.cat(vectors, dim=0).cpu().numpy()
vectors.shape
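At this point there should be one 768-dimensional vector per FAQ question (768 is the hidden size of chinese-macbert-base); a quick sanity check:
assert vectors.shape == (len(questions), 768)   # one pooler vector per FAQ question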
2.4 Build the faiss index
import faiss

# inner-product index; with L2-normalized vectors, inner product equals cosine similarity
index = faiss.IndexFlatIP(768)
faiss.normalize_L2(vectors)
index.add(vectors)
index
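Optionally, the index can be written to disk and reloaded later instead of being rebuilt on every start; the file name below is only an example:
faiss.write_index(index, 'law_faq.index')     # persist the index
# index = faiss.read_index('law_faq.index')   # reload it later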
2.5 Encode the user question
question = "寻衅滋事"
with torch.inference_mode():
    inputs = tokenizer(
        question,
        return_tensors='pt',
        padding=True,
        max_length=64,
        truncation=True)
    inputs = {k: v.to('cuda') for k, v in inputs.items()}
    vector = dual_model.bert(**inputs)[1]
    q_vector = vector.cpu().numpy()
q_vector.shape
2.6 Vector recall
faiss.normalize_L2(q_vector)
scores, indexes = index.search(q_vector, 10)
topk_result = data.values[indexes[0].tolist()]
topk_result[:, 0]
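To inspect the recall quality, the similarity score of each recalled FAQ question can be printed next to its title:
for score, title in zip(scores[0], topk_result[:, 0]):
    print(f'{score:.4f}\t{title}')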
2.7 Load the cross (re-ranking) model
from transformers import BertForSequenceClassification

# num_labels=1 so the cross model outputs a single matching score per question pair,
# which the argmax over candidates below relies on; in practice this checkpoint
# should be a cross-encoder fine-tuned on sentence-pair matching data
cross_model = BertForSequenceClassification.from_pretrained('../Model/chinese-macbert-base', num_labels=1)
cross_model.eval()
print('Cross (ranking) model loaded successfully')
2.8 Final prediction
candidates = topk_result[:, 0].tolist()
ques = [question] * len(candidates)
inputs = tokenizer(ques, candidates,
                   return_tensors='pt',
                   padding=True,
                   max_length=64,
                   truncation=True)
inputs = {k: v.to('cpu') for k, v in inputs.items()}
with torch.inference_mode():
    # one matching score per (question, candidate) pair
    logits = cross_model(**inputs).logits.squeeze()
    result = torch.argmax(logits, dim=-1)
candidate_answers = topk_result[:, 1].tolist()
match_question = candidates[result.item()]
final_answer = candidate_answers[result.item()]
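Putting both stages together, the whole recall-then-rerank flow can be wrapped into one helper function. This is only a sketch that reuses the objects built above (tokenizer, dual_model, index, data, cross_model); it is not code from the original notebook:
def answer_question(question, topk=10):
    # Stage 1: encode the query with the dual model and recall top-k similar FAQ questions
    with torch.inference_mode():
        inputs = tokenizer(question, return_tensors='pt', padding=True,
                           max_length=64, truncation=True)
        inputs = {k: v.to('cuda') for k, v in inputs.items()}
        q_vector = dual_model.bert(**inputs)[1].cpu().numpy()
    faiss.normalize_L2(q_vector)
    _, indexes = index.search(q_vector, topk)
    topk_result = data.values[indexes[0].tolist()]
    candidates = topk_result[:, 0].tolist()
    answers = topk_result[:, 1].tolist()

    # Stage 2: re-rank the candidates with the cross model and pick the best match
    pairs = tokenizer([question] * len(candidates), candidates,
                      return_tensors='pt', padding=True,
                      max_length=64, truncation=True)
    with torch.inference_mode():
        logits = cross_model(**pairs).logits.squeeze()
    best = torch.argmax(logits, dim=-1).item()
    return candidates[best], answers[best]

print(answer_question('寻衅滋事'))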