Enough preamble, straight to the code. We fine-tune Hugging Face's uer/gpt2-chinese-cluecorpussmall model; the training dataset can be anything you like:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM  # AutoModelWithLMHead is deprecated
import torch
from torch.optim import AdamW  # transformers.AdamW was removed in recent versions
from transformers.optimization import get_scheduler

# Load the tokenizer and the pretrained model
tokenizer = AutoTokenizer.from_pretrained('uer/gpt2-chinese-cluecorpussmall')
model = AutoModelForCausalLM.from_pretrained('uer/gpt2-chinese-cluecorpussmall')
class Dataset(torch.utils.data.Dataset):
    def __init__(self, txt_path):
        with open(txt_path, 'r', encoding='utf-8') as f:
            data = f.readlines()
        data = [i.strip() for i in data]
        self.data = data

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)
dataset = Dataset('label.txt')
print('len(dataset):',len(dataset))
print(dataset[0])
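# label.txt is assumed to contain one plain-text training sample per line,
# for example (made-up lines, purely illustrative):
#   今天天气真好，适合出去走走。
#   这部电影的结局出乎意料。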
def collate_fn(data):
    # Tokenize and pad a list of raw strings into a batch of tensors
    data = tokenizer.batch_encode_plus(
        data,
        max_length=50,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    # For causal LM training the labels are the input ids themselves;
    # the model shifts them internally to compute the next-token loss
    data['labels'] = data['input_ids'].clone()
    return data
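# Rough sanity check (hypothetical shapes; this model uses a BERT-style tokenizer
# that adds [CLS]/[SEP] and pads to the longest sequence in the batch):
# batch = collate_fn(['你好', '今天天气不错'])
# batch['input_ids'].shape  -> torch.Size([2, 8])
# batch['labels'].shape     -> torch.Size([2, 8])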
loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn,
    drop_last=True,
)
# Peek at a batch:
# for i, data in enumerate(loader):
#     if i == 2:
#         print(data)
#         break
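# Note for the training loop below: the scheduler's num_training_steps is
# epochs * len(loader), and with drop_last=True, len(loader) == len(dataset) // 64.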
def train():
    epochs = 5
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # Create the optimizer and LR scheduler once, outside the epoch loop;
    # re-creating them each epoch (as the original code did) would reset the
    # optimizer state and restart the LR schedule every epoch
    optimizer = AdamW(model.parameters(), lr=5e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=epochs * len(loader),
                              optimizer=optimizer)
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for i, data in enumerate(loader):
            for k in data.keys():
                data[k] = data[k].to(device)
            out = model(**data)
            loss = out['loss']
            epoch_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            if i % 100 == 0:
                # Token-level accuracy: the logits at position t predict token t+1,
                # so compare shifted labels against shifted predictions
                labels = data['labels'][:, 1:]
                out = out['logits'].argmax(dim=2)[:, :-1]
                select = labels != 0  # 0 is the [PAD] id for this BERT-style tokenizer
                labels = labels[select]
                out = out[select]
                acc = (labels == out).sum().item() / labels.numel()
                lr = optimizer.state_dict()['param_groups'][0]['lr']
                print(f'epoch {epoch + 1}, step {i}: loss={loss.item():.4f}, lr={lr:.2e}, acc={acc:.4f}')
        print('mean loss for this epoch:', epoch_loss / len(loader))
    model.to('cpu')
    model.save_pretrained('model_1')

train()
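After training, the checkpoint saved in model_1 can be loaded back for generation. Below is a minimal sketch using the standard transformers TextGenerationPipeline (the prompt string is just an example; tune max_length and the sampling settings to taste):

from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline

tokenizer = AutoTokenizer.from_pretrained('uer/gpt2-chinese-cluecorpussmall')
model = AutoModelForCausalLM.from_pretrained('model_1')

generator = TextGenerationPipeline(model, tokenizer)
# do_sample=True draws varied continuations; max_length includes the prompt tokens
print(generator('今天天气', max_length=50, do_sample=True))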