Code example:
This is a fairly simple example; for the rest of the GPT-architecture background and code, see the other articles in this column.
from torchtext.datasets import WikiText2  # Import WikiText2
from torchtext.data.utils import get_tokenizer  # Import the tokenizer utility
from torchtext.vocab import build_vocab_from_iterator  # Import the vocabulary-building utility
tokenizer = get_tokenizer("basic_english")  # Define the tokenizer used for preprocessing
train_iter = WikiText2(split='train')  # Load the training split of WikiText2
# Define a generator that turns each text item in the dataset into tokens
def yield_tokens(data_iter):
    for item in data_iter:
        yield tokenizer(item)
# Build the vocabulary, including the special tokens "<pad>", "<sos>", "<eos>"
vocab = build_vocab_from_iterator(yield_tokens(train_iter),
                                  specials=["<pad>", "<sos>", "<eos>"])
vocab.set_default_index(vocab["<pad>"])
# Print vocabulary information
print("Vocabulary size:", len(vocab))
print("Sample words (word to index):",
      {word: vocab[word] for word in ["<pad>", "<sos>", "<eos>", "the", "apple"]})
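Because of set_default_index, any token that is not in the vocabulary falls back to the <pad> index. A quick sanity check; "qwertyzxcv" is just a made-up out-of-vocabulary string for illustration:

# Out-of-vocabulary tokens map to the default index set above, i.e. <pad>
print(vocab["qwertyzxcv"] == vocab["<pad>"])  # True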
import torch  # Import torch
from torch.utils.data import Dataset  # Import Dataset
class ChatDataset(Dataset):
    def __init__(self, file_path, tokenizer, vocab):
        self.tokenizer = tokenizer  # Tokenizer
        self.vocab = vocab  # Vocabulary
        self.input_data, self.target_data = self.load_and_process_data(file_path)
    def load_and_process_data(self, file_path):
        with open(file_path, "r") as f:
            lines = f.readlines()  # Open the file and read every line
        input_data, target_data = [], []
        for i, line in enumerate(lines):
            if line.startswith("User:"):  # Strip the "User: " prefix and build the input sequence
                tokens = self.tokenizer(line.strip()[6:])
                tokens = ["<sos>"] + tokens + ["<eos>"]
                indices = [self.vocab[token] for token in tokens]
                input_data.append(torch.tensor(indices, dtype=torch.long))
            elif line.startswith("AI:"):  # Strip the "AI: " prefix and build the target sequence
                tokens = self.tokenizer(line.strip()[4:])
                tokens = ["<sos>"] + tokens + ["<eos>"]
                indices = [self.vocab[token] for token in tokens]
                target_data.append(torch.tensor(indices, dtype=torch.long))
        return input_data, target_data
    def __len__(self):  # Dataset length
        return len(self.input_data)
    def __getitem__(self, idx):  # Fetch a sample by index
        return self.input_data[idx], self.target_data[idx]
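The loader above expects chat.txt to hold alternating "User:" and "AI:" lines. The real dataset is not shown here; purely as an illustration of the assumed layout, a small sample file could be created like this (the file name chat_sample.txt and the dialog lines are made up):

# Hypothetical example of the expected chat.txt layout (not the real dataset)
sample_dialog = (
    "User: hi , how are you ?\n"
    "AI: i am fine , thank you .\n"
    "User: what is the weather like today ?\n"
    "AI: it is sunny today .\n"
)
with open("chat_sample.txt", "w") as f:
    f.write(sample_dialog)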
file_path = "chat.txt"  # Load the chat.txt dataset
chat_dataset = ChatDataset(file_path, tokenizer, vocab)
for i in range(3):  # Print a few samples
    input_sample, target_sample = chat_dataset[i]
    print(f"Sample {i + 1}:")
    print("Input Data: ", input_sample)
    print("Target Data: ", target_sample)
    print("-" * 50)
from torch.utils.data import DataLoader  # Import DataLoader
# Define pad_sequence, which pads a batch of sequences to the same length
def pad_sequence(sequences, padding_value=0, length=None):
    # Determine the maximum sequence length; if length is not given, use the longest input sequence
    max_length = max(len(seq) for seq in sequences) if length is None else length
    # Create a tensor of the right shape, filled with the padding value, to hold the padded sequences
    result = torch.full((len(sequences), max_length), padding_value, dtype=torch.long)
    # Copy each sequence into the result tensor
    for i, seq in enumerate(sequences):
        end = len(seq)
        result[i, :end] = seq[:end]
    return result
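A quick check of what pad_sequence does, using two toy tensors made up for illustration (0 plays the role of the padding value here):

demo = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]
print(pad_sequence(demo, padding_value=0))
# tensor([[5, 6, 7],
#         [8, 9, 0]])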
# Define collate_fn, which shapes one batch of data
def collate_fn(batch):
    # Split the batch into source and target sequences
    sources, targets = zip(*batch)
    # Compute the maximum sequence length in the batch
    max_length = max(max(len(s) for s in sources), max(len(t) for t in targets))
    # Pad source and target sequences with pad_sequence
    sources = pad_sequence(sources, padding_value=vocab["<pad>"], length=max_length)
    targets = pad_sequence(targets, padding_value=vocab["<pad>"], length=max_length)
    # Return the padded source and target sequences
    return sources, targets
# Create the DataLoader
batch_size = 2
chat_dataloader = DataLoader(chat_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# Check the DataLoader output
for input_batch, target_batch in chat_dataloader:
    print("Input batch tensor size:", input_batch.size())
    print("Target batch tensor size:", target_batch.size())
    break
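The GPT class imported next comes from this column's own GPT_Model module, which is covered in the other articles. If you only want to run this script without that module, the following minimal decoder-only stand-in uses the same constructor arguments and output shape; it is an assumption for convenience, not the column's implementation, and the pretrained checkpoint loaded below will not match its weights.

import torch.nn as nn

class GPT(nn.Module):
    """Minimal stand-in: token + position embeddings, a stack of
    Transformer layers with a causal mask, and a projection to the vocabulary."""
    def __init__(self, vocab_size, max_seq_len=256, n_layers=6, d_model=512, n_heads=8):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(max_seq_len, d_model)
        layer = nn.TransformerEncoderLayer(d_model, n_heads,
                                           dim_feedforward=4 * d_model,
                                           batch_first=True)
        self.blocks = nn.TransformerEncoder(layer, n_layers)
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, x):  # x: [batch, seq_len]
        seq_len = x.size(1)
        pos = torch.arange(seq_len, device=x.device)
        h = self.token_emb(x) + self.pos_emb(pos)  # [batch, seq_len, d_model]
        causal_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(x.device)
        h = self.blocks(h, mask=causal_mask)       # causal self-attention
        return self.lm_head(h)                     # [batch, seq_len, vocab_size]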
from GPT_Model import GPT  # Import the GPT model class (our own implementation from this column)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = GPT(len(vocab), max_seq_len=256, n_layers=6).to(device)  # Create a model instance
model.load_state_dict(torch.load('trained_model_2023-05-05_14-08-24.pt'))  # Load the pretrained weights
# model.eval()
import torch.nn as nn  # Import nn
import torch.optim as optim  # Import the optimizers
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<pad>"])  # Loss function
optimizer = optim.Adam(model.parameters(), lr=0.0001)  # Optimizer
for epoch in range(100):  # Start training
    for batch_idx, (input_batch, target_batch) in enumerate(chat_dataloader):
        optimizer.zero_grad()  # Zero the gradients
        input_batch, target_batch = input_batch.to(device), target_batch.to(device)  # Move to the device
        outputs = model(input_batch)  # Forward pass: compute the model output
        loss = criterion(outputs.view(-1, len(vocab)), target_batch.view(-1))  # Compute the loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update the parameters
    if (epoch + 1) % 20 == 0:  # Print the loss every 20 epochs
        print(f"Epoch: {epoch + 1:04d}, cost = {loss:.6f}")
def generate_text_beam_search(model, input_str, max_len=50, beam_width=5):
    model.eval()  # Switch to evaluation mode, disabling dropout, batch normalization and other training-only behavior
    # Convert each token of the input string into its index in the vocabulary
    input_tokens = [vocab[token] for token in input_str]
    # Candidate sequences kept by the beam, as (token indices, score) pairs
    candidates = [(input_tokens, 0.0)]
    # Completed sequences that have already produced <eos>
    final_results = []
    with torch.no_grad():  # Disable gradient tracking to save memory and speed up inference
        for _ in range(max_len):  # Generate at most max_len tokens
            new_candidates = []
            for candidate, candidate_score in candidates:
                inputs = torch.LongTensor(candidate).unsqueeze(0).to(device)
                outputs = model(inputs)  # Output logits of shape [1, len(candidate), vocab_size]
                logits = outputs[:, -1, :]  # Only the last time step (the newest token) matters
                # Set the score of <pad> to a large negative number so it is never chosen;
                # note that unknown words also map to the <pad> index because of set_default_index
                logits[0, vocab["<pad>"]] = -1e9
                # Take the beam_width tokens with the highest scores
                scores, next_tokens = torch.topk(logits, beam_width, dim=-1)
                for score, next_token in zip(scores.squeeze(), next_tokens.squeeze()):
                    new_candidate = candidate + [next_token.item()]
                    new_score = candidate_score - score.item()  # Negate, so sorting in ascending order puts the best first
                    if next_token.item() == vocab["<eos>"]:
                        # The candidate ended with EOS, so move it to the final results
                        final_results.append((new_candidate, new_score))
                    else:
                        # Otherwise keep it in the list of active candidates
                        new_candidates.append((new_candidate, new_score))
            # Keep the beam_width best-scoring sequences among the new candidates
            candidates = sorted(new_candidates, key=lambda x: x[1])[:beam_width]
            if not candidates:  # Every beam has finished
                break
    # Pick the best-scoring sequence among finished and still-active candidates
    best_candidate, _ = sorted(final_results + candidates, key=lambda x: x[1])[0]
    # Convert the output token indices back into a text string
    output_str = " ".join([vocab.get_itos()[token] for token in best_candidate])
    return output_str
input_str = "what is the weather like today ?"
input_str = "hi , how are you ?"
# input_str = "hi , what is you name ?"
generated_text = generate_text_beam_search(model, input_str.split())
print("Generated text:", generated_text)