seq2sqe与attenton实现聊天机器人

最新推荐文章于 2024-04-19 10:03:46 发布
Happy丶lazy
最新推荐文章于 2024-04-19 10:03:46 发布
阅读量226
点赞数
分类专栏： pytorch 文章标签：自然语言处理 pytorch 聊天机器人
本文链接：https://blog.csdn.net/qq_39309652/article/details/117137402
版权
pytorch 专栏收录该内容
9 篇文章 3 订阅
订阅专栏
import pandas as pd
import jieba
from torch.utils import data
import warnings
import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
import math
import time
SEED=1222
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
warnings.filterwarnings('ignore')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def cal_clear_word(test):
    stoplist = [' ', '\n', '，']

    def function(a):
        word_list = [w for w in jieba.cut(a) if w not in list(stoplist)]
        return word_list

    test['quest'] = test.apply(lambda x: function(x['quest']), axis=1)
    test['anwer'] = test.apply(lambda x: function(x['anwer']), axis=1)
    return test

def cal_update_date(test, sequence_length):
    def prepare_sequence(seq):
        idxs = [w for w in seq]
        if len(idxs) >= sequence_length:
            idxs = idxs[:sequence_length]
        else:
            pad_num = sequence_length - len(idxs)
            for i in range(pad_num):
                idxs.append('UNK')
        return idxs

    test['quest'] = test.apply(lambda x: prepare_sequence(x['quest']), axis=1)
    test['anwer'] = test.apply(lambda x: prepare_sequence(x['anwer']), axis=1)
    return test

def cal_add_status(test):
    test['enc_input'] = test['quest']
    test['dec_input'] = test['anwer']
    test=test[['enc_input','dec_input']]
    for enc_input, dec_input in test.values:
        enc_input.append('E')
        dec_input.append('E')
        enc_input.insert(0, "S")
        dec_input.insert(0, "S")
    return test
def cal_word_to_ix(test):
    word_to_ix = {}  # 单词的索引字典
    for enc_input, dec_input in test.values:
        for word in dec_input:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)

    for enc_input, dec_input in test.values:
        for word in enc_input:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)

    def prepare_sequence(seq, to_ix):
        idxs = [to_ix[w] for w in seq]
        return idxs
    test['enc_input'] = test.apply(lambda x: prepare_sequence(x['enc_input'], word_to_ix), axis=1)
    test['dec_input'] = test.apply(lambda x: prepare_sequence(x['dec_input'], word_to_ix), axis=1)
    return test, len(word_to_ix), word_to_ix

class TestDataset(data.Dataset):#继承Dataset
    def __init__(self,test):
        self.enc_input=test['enc_input']
        self.dec_input=test['dec_input']

    def __getitem__(self, index):
        #把numpy转换为Tensor
        enc_input=torch.from_numpy(np.array(self.enc_input[index]))
        dec_input=torch.from_numpy(np.array(self.dec_input[index]))
        return enc_input,dec_input

    def __len__(self):
        return len(self.enc_input)


class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        '''
        src = [src_len, batch_size]
        '''
        src = src.transpose(0, 1)  # src = [batch_size, src_len]
        embedded = self.dropout(self.embedding(src)).transpose(0, 1)  # embedded = [src_len, batch_size, emb_dim]

        # enc_output = [src_len, batch_size, hid_dim * num_directions]
        # enc_hidden = [n_layers * num_directions, batch_size, hid_dim]
        enc_output, enc_hidden = self.rnn(embedded)  # if h_0 is not give, it will be set 0 acquiescently

        # enc_hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        # enc_output are always from the last layer

        # enc_hidden [-2, :, : ] is the last of the forwards RNN
        # enc_hidden [-1, :, : ] is the last of the backwards RNN

        # initial decoder hidden is final hidden state of the forwards and backwards
        # encoder RNNs fed through a linear layer
        # s = [batch_size, dec_hid_dim]
        s = torch.tanh(self.fc(torch.cat((enc_hidden[-2, :, :], enc_hidden[-1, :, :]), dim=1)))

        return enc_output, s


class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim, bias=False)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, s, enc_output):
        # s = [batch_size, dec_hid_dim]
        # enc_output = [src_len, batch_size, enc_hid_dim * 2]

        batch_size = enc_output.shape[1]
        src_len = enc_output.shape[0]

        # repeat decoder hidden state src_len times
        # s = [batch_size, src_len, dec_hid_dim]
        # enc_output = [batch_size, src_len, enc_hid_dim * 2]
        s = s.unsqueeze(1).repeat(1, src_len, 1)
        enc_output = enc_output.transpose(0, 1)

        # energy = [batch_size, src_len, dec_hid_dim]
        energy = torch.tanh(self.attn(torch.cat((s, enc_output), dim=2)))

        # attention = [batch_size, src_len]
        attention = self.v(energy).squeeze(2)

        return F.softmax(attention, dim=1)


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, dec_input, s, enc_output):
        # dec_input = [batch_size]
        # s = [batch_size, dec_hid_dim]
        # enc_output = [src_len, batch_size, enc_hid_dim * 2]

        dec_input = dec_input.unsqueeze(1)  # dec_input = [batch_size, 1]

        embedded = self.dropout(self.embedding(dec_input)).transpose(0, 1)  # embedded = [1, batch_size, emb_dim]

        # a = [batch_size, 1, src_len]
        a = self.attention(s, enc_output).unsqueeze(1)

        # enc_output = [batch_size, src_len, enc_hid_dim * 2]
        enc_output = enc_output.transpose(0, 1)

        # c = [1, batch_size, enc_hid_dim * 2]
        c = torch.bmm(a, enc_output).transpose(0, 1)

        # rnn_input = [1, batch_size, (enc_hid_dim * 2) + emb_dim]
        rnn_input = torch.cat((embedded, c), dim=2)

        # dec_output = [src_len(=1), batch_size, dec_hid_dim]
        # dec_hidden = [n_layers * num_directions, batch_size, dec_hid_dim]
        dec_output, dec_hidden = self.rnn(rnn_input, s.unsqueeze(0))

        # embedded = [batch_size, emb_dim]
        # dec_output = [batch_size, dec_hid_dim]
        # c = [batch_size, enc_hid_dim * 2]
        embedded = embedded.squeeze(0)
        dec_output = dec_output.squeeze(0)
        c = c.squeeze(0)

        # pred = [batch_size, output_dim]
        pred = self.fc_out(torch.cat((dec_output, c, embedded), dim=1))

        return pred, dec_hidden.squeeze(0)


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src = [src_len, batch_size]
        # trg = [trg_len, batch_size]
        # teacher_forcing_ratio is probability to use teacher forcing

        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        # enc_output is all hidden states of the input sequence, back and forwards
        # s is the final forward and backward hidden states, passed through a linear layer
        enc_output, s = self.encoder(src)

        # first input to the decoder is the <sos> tokens
        dec_input = trg[0, :]

        for t in range(1, trg_len):
            # insert dec_input token embedding, previous hidden state and all encoder hidden states
            # receive output tensor (predictions) and new hidden state
            dec_output, s = self.decoder(dec_input, s, enc_output)

            # place predictions in a tensor holding predictions for each token
            outputs[t] = dec_output

            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio

            # get the highest predicted token from our predictions
            top1 = dec_output.argmax(1)

            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            dec_input = trg[t] if teacher_force else top1

        return outputs


data_dict={'quest':['好好写博客','我想去大厂','今天打王者嘛','明天要加班'],
      'anwer':['加油噢','肯定可以的','打呀，放假为啥不打','五一加屁班']}
train_df = pd.DataFrame(data_dict)
# 结巴分类以及去掉停用词
return_df = cal_clear_word(train_df)
n_step = max([max(len(i), len(j)) for i, j in return_df.values])
return_df = cal_update_date(return_df, n_step)
# 给enc_input输入值添加一个结束状态，给dec输入状态添加一个开始状态，dec输出状态结束状态
return_df = cal_add_status(return_df)
# 将文字转化为数字
return_df, vocab_size, letter2idx = cal_word_to_ix(return_df)
# 将数据转化为pytorch专用数据类型，方便批量化处理
result_df =TestDataset(return_df)
batch_size = 2
test_loader = data.DataLoader(result_df,batch_size,shuffle=False)
print(vocab_size)
INPUT_DIM = vocab_size
OUTPUT_DIM = vocab_size
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)
TRG_PAD_IDX = int(letter2idx['UNK'])
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(100):
    model.train()
    epoch_loss = 0
    for enc_input_batch, dec_input_batch in test_loader:
        # make hidden shape [num_layers * num_directions, batch_size, n_hidden]
        (enc_input_batch, dec_intput_batch) = (enc_input_batch.to(device).long(), dec_input_batch.to(device).long())
        src = enc_input_batch
        trg = dec_intput_batch  # trg = [trg_len, batch_size]
        # pred = [trg_len, batch_size, pred_dim]
        pred = model(src, trg)
        pred_dim = pred.shape[-1]
        # trg = [(trg len - 1) * batch size]
        # pred = [(trg len - 1) * batch size, pred_dim]
        trg = trg[1:].view(-1)
        pred = pred[1:].view(-1, pred_dim)
        loss = criterion(pred, trg)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        if (epoch + 1) % 10 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

def make_data(word, n_step, to_ix):
    stoplist = [' ', '\n', '，']
    ord_list = [w for w in jieba.cut(word) if w not in list(stoplist)]
    idxs = [w for w in ord_list]
    if len(idxs) >= n_step:
        idxs = idxs[:n_step]
    else:
        pad_num = n_step - len(idxs)
        for i in range(pad_num):
            idxs.append('UNK')
    enc_input = []
    for i in idxs:
        enc_input.append(i)
    enc_input.insert(0, "S")
    enc_input.append('E')
    enc_input = [to_ix[n] for n in enc_input]  # ['m', 'a', 'n', '?', '?', 'E']
    dec_input = []
    for i in range(n_step):
        dec_input.append('UNK')
    dec_input.insert(0, "S")
    dec_input.append('E')
    dec_input = [to_ix[n] for n in dec_input]  # ['m', 'a', 'n', '?', '?', 'E']
    enc_input = torch.Tensor(enc_input)
    dec_input = torch.Tensor(dec_input)
    enc_input = torch.unsqueeze(enc_input, 0)
    dec_input = torch.unsqueeze(dec_input, 0)
    return enc_input, dec_input
# Test
letter = {value:key for key, value in letter2idx.items()}
def translate(word):
    model.eval()
    enc_input, dec_input = make_data(word, n_step, letter2idx)
    enc_input, dec_input = enc_input.to(device).long(), dec_input.to(device).long()
    # make hidden shape [num_layers * num_directions, batch_size, n_hidden]
    output = model(enc_input, dec_input)
    # output : [n_step+1, batch_size, n_class]
    predict = output.data.max(2, keepdim=True)[1]  # select n_class dimension
    predict = predict.view(8)
    predict = predict.numpy()
    decoded = [letter[i] for i in predict]
    translated = ''.join(decoded)
    translated = translated.replace('UNK', ' ')
    translated = translated.replace('S', ' ')
    return translated
print('test')
print('今天打王者嘛 ->', translate('今天打王者嘛'))
print('好好写博客 ->', translate('好好写博客'))
print('我想去大厂 ->', translate('我想去大厂'))
print('明天要加班 ->', translate('明天要加班'))