The complete code for training and validation is as follows:
'''
Project: use Hugging Face's transformers and datasets libraries to load BERT's
tokenizer and pretrained model for Chinese text classification, e.g. sentiment analysis.
Date: June 23, 2024
'''
import torch
from datasets import load_from_disk
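# A thin torch.utils.data.Dataset wrapper around the on-disk Hugging Face dataset.
# It returns raw (text, label) pairs; tokenization is deferred to the DataLoader's
# collate_fn below, so each batch is tokenized in a single call.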
class Dataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']
        return text, label
train_dataset = load_from_disk('./data/ChnSentiCorp/train')
valid_dataset = load_from_disk('./data/ChnSentiCorp/validation')
test_dataset = load_from_disk('./data/ChnSentiCorp/test')
train_dataset = Dataset(train_dataset)
valid_dataset = Dataset(valid_dataset)
test_dataset = Dataset(test_dataset)
print("train_dataset:", len(train_dataset))
print("valid_dataset:", len(valid_dataset))
print("test_dataset:", len(valid_dataset))
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('D:/Codes/Models/bert-base-chinese')
'''
# Quick tokenizer test (ran successfully; pretrained_model is loaded further below):
text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt")
outputs = pretrained_model(**inputs)
print(outputs)
'''
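# collate_fn receives a list of (text, label) pairs from the Dataset and turns it
# into model-ready tensors. Every sentence is padded/truncated to exactly 500
# tokens, so each batch yields input_ids, attention_mask and token_type_ids of
# shape [batch_size, 500], plus a LongTensor of labels of shape [batch_size].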
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                       truncation=True,
                                       padding='max_length',
                                       max_length=500,
                                       return_tensors='pt',
                                       return_length=True)
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)
    return input_ids, attention_mask, token_type_ids, labels
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=16,
                                           collate_fn=collate_fn,
                                           shuffle=True,
                                           drop_last=True)
# Pull one batch from the loader to sanity-check the tensor shapes.
for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader):
    break
print('loader:', len(train_loader))
print('input_ids.shape:', input_ids.shape)
print('attention_mask.shape:', attention_mask.shape)
print('token_type_ids.shape:', token_type_ids.shape)
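# Load the pretrained Chinese BERT and freeze all of its weights. With
# requires_grad_(False) the backbone acts as a fixed feature extractor:
# only the small linear head defined below is trained, which keeps memory
# use and training time low at some cost in accuracy versus full fine-tuning.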
pretrained_model = BertModel.from_pretrained('D:/Codes/Models/bert-base-chinese')
for param in pretrained_model.parameters():
    param.requires_grad_(False)
out = pretrained_model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
print('original_out:', out.last_hidden_state.shape)
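# The downstream classifier: BERT's last_hidden_state has shape
# [batch_size, 500, 768], and the vector at position 0 is the [CLS] token,
# which BERT uses as a whole-sequence summary. A single Linear(768, 2)
# layer maps it to the two sentiment classes.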
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained_model(input_ids=input_ids,
                                   attention_mask=attention_mask,
                                   token_type_ids=token_type_ids)
        # Classify from the [CLS] vector (position 0) of the last hidden layer.
        out = self.fc(out.last_hidden_state[:, 0])
        # Return raw logits: CrossEntropyLoss applies log-softmax internally,
        # so an explicit softmax here would only shrink the gradients.
        return out
model = Model()
output_size = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).shape
print('our_output_size:', output_size)
from torch.optim import AdamW  # transformers' own AdamW is deprecated in recent releases
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()
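# AdamW with lr=5e-4 updates only the linear head (the frozen BERT parameters
# receive no gradients). CrossEntropyLoss combines log-softmax and negative
# log-likelihood, which is why forward() returns raw logits.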
def train():
    model.train()
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader):
        out = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Report the loss and batch accuracy every 5 steps.
        if i % 5 == 0:
            out = out.argmax(dim=1)
            accuracy = (out == labels).sum().item() / len(labels)
            print(i, loss.item(), accuracy)
        # Stop after 300 batches.
        if i == 300:
            break
    print('Training complete!')
train()
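# Validation: run the model in eval mode on the validation split. Only the
# first 5 batches of 32 are checked (160 samples), so the printed accuracy
# is a quick estimate rather than a full-set evaluation.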
def validate():
    model.eval()
    correct = 0
    total = 0
    loader_validation = torch.utils.data.DataLoader(dataset=valid_dataset,
                                                    batch_size=32,
                                                    collate_fn=collate_fn,
                                                    shuffle=True,
                                                    drop_last=True)
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_validation):
        # Only check the first 5 batches to keep the evaluation quick.
        if i == 5:
            break
        print(i)
        with torch.no_grad():
            out = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)
    print('validation accuracy:', correct / total)
validate()
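# A minimal inference sketch (not part of the original script): assuming the
# model trained above is still in memory, a new sentence could be classified
# as below. The helper name, the example sentence, and the mapping
# "label 1 = positive" (ChnSentiCorp's usual convention) are illustrative
# assumptions.
#
# def predict(text):
#     enc = tokenizer(text, truncation=True, padding='max_length',
#                     max_length=500, return_tensors='pt')
#     with torch.no_grad():
#         logits = model(input_ids=enc['input_ids'],
#                        attention_mask=enc['attention_mask'],
#                        token_type_ids=enc['token_type_ids'])
#     return 'positive' if logits.argmax(dim=1).item() == 1 else 'negative'
#
# print(predict('酒店环境很好,服务也很周到。'))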
The results are as follows: