This is a simple sequence-to-sequence (Seq2Seq) model for a Chinese question-answering task.

It covers data preprocessing, model definition, training, prediction, and the creation of a graphical interface; there is still plenty of room to refine the code further.

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import random
import tkinter as tk
import jieba
import matplotlib.pyplot as plt
import os
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Chinese vocabulary and index mappings; indices 0-3 are reserved for the special tokens
# <PAD> (padding), <SOS> (start of sequence), <EOS> (end of sequence) and <UNK> (unknown)
word2index = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
index2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}

# Tokenize Chinese text with jieba
def tokenize_chinese(sentence):
    """
    Tokenize an input Chinese sentence.
    Args:
    sentence (str): input Chinese sentence
    Returns:
    list: list of tokens
    """
    tokens = jieba.lcut(sentence)
    return tokens
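
A quick illustration of what jieba returns (the exact split depends on jieba's dictionary, so the output shown in the comment is only indicative):

print(tokenize_chinese("今天天气怎么样"))  # e.g. ['今天', '天气', '怎么样']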

# Build the vocabulary
def build_vocab(sentences):
    """
    Build the vocabulary from a list of sentences.
    Args:
    sentences (list): list of sentences
    Returns:
    int: vocabulary size
    """
    global word2index, index2word
    vocab_size = len(word2index)
    for sentence in sentences:
        for token in tokenize_chinese(sentence):
            if token not in word2index:
                word2index[token] = vocab_size
                index2word[vocab_size] = token
                vocab_size += 1
    return vocab_size

# Convert a sentence into a tensor of token indices
def sentence_to_tensor(sentence, max_length=50):
    """
    Convert a sentence into a fixed-length index tensor.
    Args:
    sentence (str): input sentence
    max_length (int): maximum length (longer sentences are truncated)
    Returns:
    torch.Tensor: index tensor padded to max_length
    int: actual (unpadded) length of the sentence
    """
    tokens = tokenize_chinese(sentence)[:max_length - 1]
    indices = [word2index.get(token, word2index["<UNK>"]) for token in tokens]
    indices.append(word2index["<EOS>"])  # end marker so the decoder can learn when to stop
    length = len(indices)  # record the real length before padding
    indices += [word2index["<PAD>"]] * (max_length - len(indices))
    return torch.tensor(indices, dtype=torch.long), length
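
For illustration, the conversion looks like this (the exact indices depend on the order in which words were added to the vocabulary, so only the shape and length matter):

example_tensor, example_length = sentence_to_tensor("今天天气怎么样")
print(example_tensor.shape, example_length)  # torch.Size([50]) and the real token count (incl. <EOS>)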

# Read the question and answer files
def load_data(file_path):
    """
    Read the lines of a data file.
    Args:
    file_path (str): path to the file
    Returns:
    list: list of lines in the file
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()
    return lines
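
Both files are expected to be parallel plain-text files, one sample per line, with line i of answers.txt answering line i of questions.txt. A hypothetical bootstrap (the sentences are made-up placeholders) so the script can run end to end when the files are missing:

if not os.path.exists('questions.txt'):
    with open('questions.txt', 'w', encoding='utf-8') as f:
        f.write("今天天气怎么样\n你叫什么名字\n")
    with open('answers.txt', 'w', encoding='utf-8') as f:
        f.write("今天天气晴朗。\n我叫羲和。\n")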

# Data augmentation: random insertion / deletion / swap
def data_augmentation(sentence):
    tokens = tokenize_chinese(sentence)
    # Random insertion
    if random.random() < 0.1:
        insert_token = random.choice(list(word2index.keys()))
        insert_index = random.randint(0, len(tokens))
        tokens.insert(insert_index, insert_token)
    # Random deletion (only when more than one token remains)
    if len(tokens) > 1 and random.random() < 0.1:
        delete_index = random.randint(0, len(tokens) - 1)
        del tokens[delete_index]
    # Random swap
    if len(tokens) > 1 and random.random() < 0.1:
        index1, index2 = random.sample(range(len(tokens)), 2)
        tokens[index1], tokens[index2] = tokens[index2], tokens[index1]
    return ''.join(tokens)

# Back-translation data augmentation
def back_translation(sentence, model, tokenizer):
    input_ids = tokenizer.encode(sentence, return_tensors='pt')
    output = model.generate(input_ids)
    translated_sentence = tokenizer.decode(output[0], skip_special_tokens=True)
    return translated_sentence
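
back_translation needs an external translation model and tokenizer; the script imports AutoTokenizer and AutoModelForSeq2SeqLM but never instantiates them. A minimal sketch of a zh→en→zh round trip, assuming the publicly available Helsinki-NLP MarianMT checkpoints (the checkpoint names and the helper itself are an assumption, not part of the original code):

def back_translate_zh(sentence):
    # Hypothetical helper: translate zh -> en, then en -> zh; checkpoint names are assumptions.
    zh_en_tok = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
    zh_en_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
    en_zh_tok = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-zh")
    en_zh_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-zh")
    english = back_translation(sentence, zh_en_model, zh_en_tok)
    return back_translation(english, en_zh_model, en_zh_tok)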

# Random replacement data augmentation: occasionally regenerate a token with the external model
def random_replace(sentence, model, tokenizer):
    tokens = tokenize_chinese(sentence)
    for i in range(len(tokens)):
        if random.random() < 0.1:
            input_ids = tokenizer.encode(tokens[i], return_tensors='pt')
            output = model.generate(input_ids)
            tokens[i] = tokenizer.decode(output[0], skip_special_tokens=True)
    return ''.join(tokens)

# Synonym replacement data augmentation (as written this is identical to random_replace;
# a real implementation would look up synonyms rather than model generations)
def synonym_replace(sentence, model, tokenizer):
    tokens = tokenize_chinese(sentence)
    for i in range(len(tokens)):
        if random.random() < 0.1:
            input_ids = tokenizer.encode(tokens[i], return_tensors='pt')
            output = model.generate(input_ids)
            tokens[i] = tokenizer.decode(output[0], skip_special_tokens=True)
    return ''.join(tokens)
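
None of the augmentation helpers above are actually called anywhere in the pipeline. One simple way to use data_augmentation is to expand the training pairs before building the dataset; augment_pairs below is a hypothetical helper, not part of the original code:

def augment_pairs(questions, answers, copies=1):
    # Keep the original pairs and add `copies` noisy variants of each question;
    # the paired answer is left unchanged.
    aug_q, aug_a = list(questions), list(answers)
    for _ in range(copies):
        for q, a in zip(questions, answers):
            aug_q.append(data_augmentation(q))
            aug_a.append(a)
    return aug_q, aug_a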

# Dataset definition
class ChatDataset(Dataset):
    def __init__(self, questions, answers):
        self.questions = questions
        self.answers = answers

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        input_tensor, input_length = sentence_to_tensor(self.questions[idx])
        target_tensor, target_length = sentence_to_tensor(self.answers[idx])
        return input_tensor, target_tensor, input_length, target_length

# Custom collate function
def collate_fn(batch):
    inputs, targets, input_lengths, target_lengths = zip(*batch)
    inputs = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=word2index["<PAD>"])
    targets = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=word2index["<PAD>"])
    return inputs, targets, torch.tensor(input_lengths), torch.tensor(target_lengths)

# Create the dataset and dataloader
def create_dataset_and_dataloader(questions_file, answers_file, batch_size=60, shuffle=True):
    questions = load_data(questions_file)
    answers = load_data(answers_file)
    vocab_size = build_vocab(questions + answers)
    dataset = ChatDataset(questions, answers)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)
    return dataset, dataloader, vocab_size

# Model definition
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers, batch_first=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        embedded = self.embedding(input_seq)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths, batch_first=True, enforce_sorted=False)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
        return outputs, hidden

class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size, num_layers=1):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_step, hidden, encoder_outputs):
        # encoder_outputs is accepted for interface compatibility but not used (no attention)
        embedded = self.embedding(input_step)
        gru_output, hidden = self.gru(embedded, hidden)
        output = self.softmax(self.out(gru_output.squeeze(1)))
        return output, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, tokenizer):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.tokenizer = tokenizer

    def forward(self, input_tensor, target_tensor, input_lengths, target_lengths, teacher_forcing_ratio=0.5):
        batch_size = input_tensor.size(0)
        max_target_len = max(target_lengths)
        vocab_size = self.decoder.out.out_features
        outputs = torch.zeros(batch_size, max_target_len, vocab_size).to(self.device)
        encoder_outputs, encoder_hidden = self.encoder(input_tensor, input_lengths)
        # Every decoded sequence starts with the <SOS> token
        decoder_input = torch.tensor([[word2index["<SOS>"]] * batch_size], device=self.device).transpose(0, 1)
        decoder_hidden = encoder_hidden
        for t in range(max_target_len):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            outputs[:, t, :] = decoder_output
            top1 = decoder_output.argmax(1)
            # Teacher forcing: feed the ground-truth token with probability teacher_forcing_ratio,
            # otherwise feed the model's own prediction
            decoder_input = target_tensor[:, t].unsqueeze(1) if random.random() < teacher_forcing_ratio else top1.unsqueeze(1)
        return outputs

# Instantiate the model (the optimizer is created inside train())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset, dataloader, vocab_size = create_dataset_and_dataloader('questions.txt', 'answers.txt')
encoder = Encoder(vocab_size, hidden_size=200).to(device)
decoder = Decoder(vocab_size, hidden_size=200).to(device)
model = Seq2Seq(encoder, decoder, device, tokenizer={'word2index': word2index, 'index2word': index2word}).to(device)

# Load an existing model and tokenizer if available
model_path = './models/model.pth'
tokenizer_path = './models/tokenizer.pth'
if os.path.exists(model_path) and os.path.exists(tokenizer_path):
    print("Loading existing model and tokenizer...")
    model = torch.load(model_path)
    tokenizer = torch.load(tokenizer_path)
    word2index = tokenizer['word2index']
    index2word = tokenizer['index2word']
else:
    print("Creating new model and tokenizer...")

# Train the model
def train(model, dataloader, num_epochs, learning_rate=0.001, save_path='model.pth'):
    """
    Train the model.
    Args:
    model (nn.Module): model to train
    dataloader (DataLoader): data loader
    num_epochs (int): number of training epochs
    learning_rate (float): learning rate
    save_path (str): path where the model is saved
    """
    # The decoder already applies LogSoftmax, so NLLLoss (not CrossEntropyLoss) is the
    # matching criterion; padding positions are ignored
    criterion = nn.NLLLoss(ignore_index=word2index["<PAD>"])
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_values = []
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for inputs, targets, input_lengths, target_lengths in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            input_lengths = input_lengths.cpu()
            target_lengths = target_lengths.cpu()
            optimizer.zero_grad()
            outputs = model(inputs, targets, input_lengths, target_lengths)
            # The model only generates max(target_lengths) steps, so trim the targets to match
            targets = targets[:, :outputs.size(1)]
            loss = criterion(outputs.reshape(-1, vocab_size), targets.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(dataloader)
        loss_values.append(avg_loss)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")
    # Validation (this reuses the training data, so it only measures fit, not generalization)
    model.eval()
    with torch.no_grad():
        val_loss = 0
        correct_predictions = 0
        total_samples = 0
        for inputs, targets, input_lengths, target_lengths in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            input_lengths = input_lengths.cpu()
            target_lengths = target_lengths.cpu()
            outputs = model(inputs, targets, input_lengths, target_lengths, teacher_forcing_ratio=0)
            targets = targets[:, :outputs.size(1)]
            loss = criterion(outputs.reshape(-1, vocab_size), targets.reshape(-1))
            val_loss += loss.item()
            # Sequence-level accuracy: a sample counts as correct only if every token matches
            predicted_indices = outputs.argmax(dim=2)
            for pred, target, target_len in zip(predicted_indices, targets, target_lengths):
                pred = pred[:target_len]
                target = target[:target_len]
                if (pred == target).all().item():
                    correct_predictions += 1
                total_samples += 1
        val_accuracy = correct_predictions / total_samples if total_samples > 0 else 0
        print(f"Validation Loss: {val_loss / len(dataloader):.4f}, Validation Accuracy: {val_accuracy:.4f}")
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
    torch.save(model, save_path)
    plt.plot(range(1, num_epochs + 1), loss_values)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss Curve')
    plt.show()

# Build the tokenizer dictionary
tokenizer = {'word2index': word2index, 'index2word': index2word}

# Save the tokenizer
def save_tokenizer(tokenizer, save_path='tokenizer.pth'):
    torch.save(tokenizer, save_path)

# Train the model and save it (the dataset and dataloader were already created above)
train(model, dataloader, num_epochs=10, save_path='./models/model.pth')

# Save the tokenizer
save_tokenizer(tokenizer, save_path='./models/tokenizer.pth')

# Prediction
def predict(question):
    model.eval()
    with torch.no_grad():
        input_tensor, input_length = sentence_to_tensor(question)
        input_tensor = input_tensor.unsqueeze(0).to(device)
        input_length = [input_length]
        encoder_outputs, encoder_hidden = model.encoder(input_tensor, input_length)
        decoder_input = torch.tensor([[word2index["<SOS>"]]], device=device)
        decoder_hidden = encoder_hidden
        decoded_words = []
        for _ in range(50):  # cap the output length to avoid endless generation
            decoder_output, decoder_hidden = model.decoder(decoder_input, decoder_hidden, encoder_outputs)
            top1 = decoder_output.argmax(1).item()
            if top1 == word2index["<EOS>"]:
                break
            decoded_words.append(index2word[top1])
            decoder_input = torch.tensor([[top1]], device=device)
        return ''.join(decoded_words)
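
A quick smoke test; the quality of the answer depends entirely on the training data:

print(predict("你好"))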


# Build the GUI
def on_predict():
    question = question_entry.get()
    if question.strip() == "":
        result_label.config(text="请输入有效的问题。")
        return
    answer = predict(question)
    # Simple post-processing: collapse extra whitespace in the answer
    answer = " ".join(answer.split())
    result_label.config(text=f'Answer: {answer}')

def on_clear():
    question_entry.delete(0, 'end')

# Main window
root = tk.Tk()
root.title("羲和")

# Input field
question_label = tk.Label(root, text="请输入你的问题:")
question_label.pack()
question_entry = tk.Entry(root, width=50)
question_entry.pack()

# Generate button
generate_button = tk.Button(root, text="生成答案", command=on_predict)
generate_button.pack(side=tk.LEFT, padx=10)

# Clear button
clear_button = tk.Button(root, text="清除", command=on_clear)
clear_button.pack(side=tk.LEFT)

# Result label
result_label = tk.Label(root, text="")
result_label.pack(pady=10)

# Hint label
tip_label = tk.Label(root, text="提示:本模型可能存在一定的局限性,答案仅供参考。")
tip_label.pack()

question_entry.focus_set()  # give the input field focus when the window opens

# Main event loop
root.mainloop()

# Release GPU memory when the program exits
if torch.cuda.is_available():
    torch.cuda.empty_cache()
