【Advanced】(六)、transformers实战检索机器人

1、介绍

 对话机器人本质上是一个用来模拟人类对话或聊天的计算机程序,接受人类自然语言作为输入,并给出合适的回复。

 按照任务类型划分,对话机器人可以分为闲聊机器人,问答机器人,任务对话机器人
 按照答案产生的逻辑划分,对话机器人可以划分为检索式对话机器人和生成式对话机器人

2、代码实战

2.1、读取faq数据

import pandas as pd

# Load the law FAQ corpus; keep only the first 200 rows to keep the demo fast.
data = pd.read_csv('./law_faq.csv')
data.head()
data = data.iloc[:200]

2.2、加载Dual模型

from dual_model import DualModel

# Load the dual-encoder (bi-encoder) used for vector retrieval.
# NOTE(review): this path points at the base MacBERT checkpoint; a
# fine-tuned dual-encoder checkpoint is presumably intended — confirm.
dual_model = DualModel.from_pretrained("../Model/chinese-macbert-base")

dual_model定义如下

import torch
from transformers import BertForSequenceClassification, BertPreTrainedModel, BertModel
from typing import Optional
from transformers.configuration_utils import PretrainedConfig
from torch.nn import CosineSimilarity, CosineEmbeddingLoss

class DualModel(BertPreTrainedModel):
    """Dual-encoder (bi-encoder) for sentence-pair matching.

    Both sentences of a pair are encoded independently by a shared
    ``BertModel``; the pair is scored with the cosine similarity of the
    two pooled outputs and trained with ``CosineEmbeddingLoss``.
    """

    def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # One BERT encoder shared by both sentences of the pair.
        self.bert = BertModel(config)
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Score a batch of sentence pairs.

        ``input_ids`` / ``attention_mask`` / ``token_type_ids`` pack the
        two sentences along dim 1 (indexed ``[:, 0]`` / ``[:, 1]`` below),
        i.e. shape [batch, 2, seq_len].

        Returns ``(cos,)`` — per-pair cosine similarity, shape [batch] —
        or ``(loss, cos)`` when ``labels`` is given.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Step 1: split the packed inputs into sentence A and sentence B.
        senA_input_ids, senB_input_ids = input_ids[:, 0], input_ids[:, 1]
        senA_attention_mask, senB_attention_mask = attention_mask[:, 0], attention_mask[:, 1]
        senA_token_type_ids, senB_token_type_ids = token_type_ids[:, 0], token_type_ids[:, 1]

        # Step 2: encode each sentence independently with the shared BERT.
        senA_outputs = self.bert(
            senA_input_ids,
            attention_mask=senA_attention_mask,
            token_type_ids=senA_token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        senA_pooled_output = senA_outputs[1]    # pooled output, [batch, hidden]

        senB_outputs = self.bert(
            senB_input_ids,
            attention_mask=senB_attention_mask,
            token_type_ids=senB_token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        senB_pooled_output = senB_outputs[1]    # [batch, hidden]

        # Step 3: cosine similarity between the two sentence embeddings.

        cos = CosineSimilarity()(senA_pooled_output, senB_pooled_output)    # [batch, ]

        # Step 4: training loss with margin 0.3.
        # NOTE(review): CosineEmbeddingLoss expects labels in {1, -1};
        # confirm the training data follows that convention (0/1 labels
        # would silently change the loss semantics).

        loss = None
        if labels is not None:
            loss_fct = CosineEmbeddingLoss(0.3)
            loss = loss_fct(senA_pooled_output, senB_pooled_output, labels)

        output = (cos,)
        return ((loss,) + output) if loss is not None else output
# Move the dual encoder to GPU and switch to eval mode (disables dropout).
dual_model = dual_model.cuda()
dual_model.eval()
print('匹配模型加载成功')
from transformers import AutoTokenizer

# Tokenizer matching the MacBERT encoder used above.
tokenizer = AutoTokenizer.from_pretrained("../Model/chinese-macbert-base")

2.3、将问题转化为向量

import torch
from tqdm import tqdm

# Encode every FAQ title into a dense vector with the dual encoder,
# processing the corpus in mini-batches of 32.
questions = data['title'].to_list()
vectors = []
with torch.inference_mode():
    for start in tqdm(range(0, len(questions), 32)):
        batch_sens = questions[start:start + 32]
        inputs = tokenizer(batch_sens,
                           return_tensors='pt',
                           padding=True,
                           max_length=64,
                           truncation=True)
        inputs = {name: tensor.to('cuda') for name, tensor in inputs.items()}
        # Pooled output of the shared BERT encoder: [batch, hidden].
        vectors.append(dual_model.bert(**inputs)[1])
# Stack all batches into one [num_questions, hidden] numpy matrix.
vectors = torch.cat(vectors, dim=0).cpu().numpy()
vectors.shape

2.4、创建索引

import faiss

# Inner-product index over the 768-d MacBERT vectors; after L2
# normalisation, inner product equals cosine similarity.
index = faiss.IndexFlatIP(768)
faiss.normalize_L2(vectors)   # in-place normalisation
index.add(vectors)
index

2.5、对问题进行编码

# Encode the user's query with the same dual encoder.
# BUG FIX: the original tokenized `batch_sens` — the leftover last batch
# from section 2.3 — instead of the query, so the search vector had
# nothing to do with `question`. Tokenize the query itself.
question = "寻衅滋事"
with torch.inference_mode():
    inputs = tokenizer(
                question,
                return_tensors='pt',
                padding=True,
                max_length=64,
                truncation=True)
    inputs = {k: v.to('cuda') for k, v in inputs.items()}
    # Pooled output: [1, hidden].
    vector = dual_model.bert(**inputs)[1]
    q_vector = vector.cpu().numpy()
q_vector.shape

2.6、向量匹配

# Normalise the query vector too, so inner product = cosine similarity,
# then retrieve the 10 most similar FAQ entries.
faiss.normalize_L2(q_vector)
scores, indexes = index.search(q_vector, 10)
# Rows of the dataframe for the top-10 hits.
topk_result = data.values[indexes[0].tolist()]
topk_result[:, 0]   # matched question titles

2.7、加载分类模型

from transformers import BertForSequenceClassification
# Cross-encoder used to re-rank the retrieved candidates.
# NOTE(review): this loads the *base* MacBERT checkpoint; a fine-tuned
# cross-encoder checkpoint is presumably intended — confirm the path.
cross_model = BertForSequenceClassification.from_pretrained('../Model/chinese-macbert-base')
cross_model.eval()
print("succ")

2.8、最终预测

# Re-rank the top-k retrieved questions with the cross-encoder and pick
# the best-matching question plus its answer.
# BUG FIX: the original called `tokenizer(qes, ...)` — `qes` was never
# defined (the list is named `ques`), a guaranteed NameError.
candidate = topk_result[:, 0].tolist()      # candidate question titles
ques = [question] * len(candidate)          # pair the query with every candidate
inputs = tokenizer(ques, candidate,
                   return_tensors='pt',
                   padding=True,
                   max_length=64,
                   truncation=True)
inputs = {k: v.to('cpu') for k, v in inputs.items()}   # cross_model stayed on CPU
with torch.inference_mode():
    # NOTE(review): `result.item()` below assumes the cross model emits a
    # single score per pair (num_labels=1) so logits squeeze to [k] —
    # confirm the fine-tuned checkpoint's head configuration.
    logits = cross_model(**inputs).logits.squeeze()
    result = torch.argmax(logits, dim=-1)

candidate_answer = topk_result[:, 1].tolist()   # answers aligned with candidates
match_question = candidate[result.item()]

final_answer = candidate_answer[result.item()]
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

鲸可落

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值