A quick write-up of a simple example (the example itself is not very realistic; it is only meant to walk through the model training process, and once you understand it you can swap in another tokenizer library). Keep the demo corpus small, around 10 KB.
Basic steps: tokenize -> build a vocabulary index -> build the model -> train -> use
import jieba
import torch
import torch.nn as nn
from collections import Counter
from tqdm import tqdm
# Data preprocessing
with open('sanguoyanyi.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Tokenize
words = jieba.lcut(text)
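# Illustrative only (the exact segmentation depends on jieba's dictionary):
#   jieba.lcut("滚滚长江东逝水") -> ['滚滚', '长江', '东', '逝水']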
# Count word frequencies (this also deduplicates the words)
word_counts = Counter(words)
# Build the vocabulary index table; indices start at 2 so that 0 and 1
# stay free for the two special tokens below
vocab = {word: idx for idx, (word, _) in enumerate(word_counts.items(), 2)}
vocab['<PAD>'] = 0  # padding token
vocab['<UNK>'] = 1  # unknown-word token
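# After this step the mapping looks roughly like this (illustrative values;
# the actual words and their order depend on the corpus):
#   {'<PAD>': 0, '<UNK>': 1, '话说': 2, '天下': 3, '大势': 4, ...}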
# Map the word sequence to indices (fall back to <UNK> for words not in the vocabulary)
words_encoded = [vocab.get(word, vocab['<UNK>']) for word in words]
# Swap keys and values with a dict comprehension (index -> word, used to decode predictions)
swapped_dict = {value: key for key, value in vocab.items()}
# Create the input and target sequences (the target is the input shifted by one word)
input_sequence = words_encoded[:-1]
target_sequence = words_encoded[1:]
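# A tiny illustration of that shift (made-up indices): if words_encoded were
# [5, 9, 2, 7], then input_sequence = [5, 9, 2] and target_sequence = [9, 2, 7],
# so at every position the model is trained to predict the following word.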
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, input_size, hidden_size):
        super(LanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, input_size)           # word index -> dense vector
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)  # sequence model
        self.linear = nn.Linear(hidden_size, vocab_size)                # hidden state -> vocabulary logits

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.linear(out)
        return out, hidden
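# Tensor shapes through the model, assuming a batched input of word indices
# (sizes are the ones used below, i.e. input_size=8, hidden_size=32):
#   x: (batch, seq_len) --embedding--> (batch, seq_len, 8)
#      --lstm--> (batch, seq_len, 32) --linear--> (batch, seq_len, vocab_size)
# The last dimension holds one unnormalized score (logit) per vocabulary word.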
# Instantiate the model (tiny sizes on purpose: embedding dim 8, hidden dim 32)
model = LanguageModel(len(vocab), 8, 32)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())  # Adam with its default learning rate
# Training loop
losses = []
for epoch in tqdm(range(200), desc="Epoch Progress"):
    hidden = None
    # walk through the corpus in chunks of 4 tokens
    for i in range(0, len(input_sequence) - 1, 4):
        inputs = torch.tensor(input_sequence[i:i+4]).unsqueeze(0)  # add a batch dimension
        targets = torch.tensor(target_sequence[i:i+4])
        optimizer.zero_grad()                    # clear gradients from the previous step
        outputs, hidden = model(inputs, hidden)  # forward pass
        # Detach the hidden state so gradients do not flow across chunk
        # boundaries (truncated backpropagation through time)
        if isinstance(hidden, tuple):
            hidden = tuple(h.detach() for h in hidden)
        else:
            hidden = hidden.detach()
        loss = criterion(outputs.view(-1, outputs.shape[-1]), targets.view(-1))  # compute the loss
        losses.append(loss.item())
        loss.backward()   # backward pass
        optimizer.step()  # update parameters
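# Optional sanity check (not part of the original script; assumes matplotlib
# is installed): plot the recorded losses to confirm that training converges.
#   import matplotlib.pyplot as plt
#   plt.plot(losses); plt.xlabel('step'); plt.ylabel('loss'); plt.show()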
# Text generation function
def generate_text(model, vocab, input_text, max_length=30):
    # Tokenize the input text and convert it to indices
    input_words = jieba.lcut(input_text)
    input_encoded = [vocab.get(word, vocab['<UNK>']) for word in input_words]
    input_tensor = torch.tensor(input_encoded).unsqueeze(0)  # add a batch dimension
    hidden = None
    output_text = input_text
    with torch.no_grad():  # no gradients needed during generation
        for _ in range(max_length):
            outputs, hidden = model(input_tensor, hidden)
            # Take the output of the last time step
            last_output = outputs[:, -1, :]
            '''
            # Predict the next word's index directly (output is fairly deterministic)
            _, predicted_idx = torch.max(last_output, dim=1)
            predicted_idx = predicted_idx.item()
            '''
            # Apply softmax to get a probability distribution (output is more random)
            probabilities = torch.softmax(last_output, dim=1)
            # Sample the next word's index according to the probabilities
            predicted_idx = torch.multinomial(probabilities, num_samples=1).item()
            # Convert the predicted index back to a word and append it to the output
            output_text += swapped_dict.get(predicted_idx, '<UNK>')
            # Feed only the newly predicted token on the next step; the hidden
            # state already carries the context of everything seen so far
            input_tensor = torch.tensor([[predicted_idx]])
    return output_text
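# A common refinement (not used above): divide the logits by a "temperature"
# before softmax, e.g. torch.softmax(last_output / 0.8, dim=1).
# Values below 1 make sampling greedier and more repetitive; values above 1
# make it more random.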
# Usage example
while True:
    user_input = input("Please enter a command: ")
    if user_input == "exit":
        print("Exit command received, quitting...")
        break
    else:
        print("Generated text:", generate_text(model, vocab, user_input))