0 序言
简单使用bert-base-cased模型,和glue/mrpc 数据集,进行自定义的训练。
1 主要包的版本
torch 2.2.0
transformers 4.38.2
pandas 2.0.3
tensorflow 2.10.0
2 数据集和模型
https://pan.baidu.com/s/1RMvSyrtjIeXUyB4fOyjDOQ?pwd=0w63
提取码:0w63
3 pytorch版本代码
import torch
from transformers import AutoModelForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
# Custom Dataset pairing tokenizer encodings with integer class labels.
class CustomDataset(Dataset):
    """Wraps a dict of tokenizer encodings and a label sequence for DataLoader use.

    encodings: mapping of field name -> indexable sequence (e.g. the tensors
        returned by a tokenizer called with return_tensors="pt").
    labels: sequence of integer class labels, same length as each encoding field.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # BUG FIX: torch.tensor(x) on an existing tensor raises a UserWarning
        # and makes an unnecessary copy. torch.as_tensor is a no-op for
        # tensors and still converts plain lists/arrays, so both input kinds
        # keep working.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.as_tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
# Load the local tokenizer and model (binary classification head, 2 labels).
checkpoint = "F:/transformer/hugging-face-models/bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# Paths of the local GLUE/MRPC Parquet files.
train_path = "F:/transformer/data_set/glue/mrpc/train/0000.parquet"
val_path = "F:/transformer/data_set/glue/mrpc/validation/0000.parquet"
# Read the train and validation Parquet files.
train_df = pd.read_parquet(train_path)
val_df = pd.read_parquet(val_path)
# Tokenize sentence PAIRS — MRPC is a paraphrase task, so sentence1 and
# sentence2 are encoded together (producing token_type_ids for BERT).
train_encodings = tokenizer(train_df['sentence1'].tolist(),
                            train_df['sentence2'].tolist(),
                            truncation=True, padding=True,
                            max_length=128, return_tensors="pt")
val_encodings = tokenizer(val_df['sentence1'].tolist(),
                          val_df['sentence2'].tolist(),
                          truncation=True, padding=True,
                          max_length=128, return_tensors="pt")
train_labels = train_df['label'].tolist()
val_labels = val_df['label'].tolist()
# Build datasets.
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)
# Build DataLoaders.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
# Optimizer. NOTE: the original also created an unused CrossEntropyLoss();
# it is removed here because the HF model computes the loss internally
# whenever `labels` is passed to forward().
optimizer = AdamW(model.parameters(), lr=5e-5)
# Training loop.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(5):
    # tqdm progress bar over the batches of this epoch.
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch + 1}")
    for batch_idx, batch in progress_bar:
        optimizer.zero_grad()
        # BUG FIX: the original forwarded only input_ids/attention_mask and
        # silently dropped token_type_ids, which BERT needs to tell the two
        # sentences of an MRPC pair apart. Move every encoded field to the
        # device and forward the whole batch; `labels` being in the batch
        # makes the model return outputs.loss.
        batch = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Show current batch progress and loss on the progress bar.
        progress_bar.set_description(f"Epoch {epoch + 1}, Batch {batch_idx + 1}/{len(train_loader)}")
        progress_bar.set_postfix(Loss=loss.item())
    # Optional: print a summary at the end of each epoch.
    #print(f"Epoch {epoch + 1} Completed, Last Batch Loss: {loss.item()}")
# Finally evaluate on the test split.
# 1. Read the test set.
test_path = "F:/transformer/data_set/glue/mrpc/test/0000.parquet"
test_df = pd.read_parquet(test_path)
# 2. Preprocess — regenerate test_encodings so every encoding field
#    (input_ids, attention_mask, token_type_ids) is present.
test_encodings = tokenizer(test_df['sentence1'].tolist(),
                           test_df['sentence2'].tolist(),
                           truncation=True, padding='max_length',
                           max_length=128, return_tensors="pt")
test_labels = test_df['label'].tolist()
# NOTE(review): the GLUE MRPC *test* split normally ships with label = -1
# (unlabeled); if that is the case for these files, the loss/accuracy below
# are meaningless — evaluate on the validation split instead. TODO confirm.
test_dataset = CustomDataset(test_encodings, test_labels)
# 3. Build the DataLoader.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# 4. Evaluate.
model.eval()  # switch to evaluation mode (disables dropout etc.)
total_loss, total_accuracy, total_samples = 0, 0, 0
with torch.no_grad():  # no gradients needed during evaluation
    for batch in test_loader:
        # BUG FIX: forward the whole batch (incl. token_type_ids) instead of
        # only input_ids/attention_mask — BERT needs segment ids for pairs.
        batch = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**batch)
        total_loss += outputs.loss.item()
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_accuracy += (predictions == batch['labels']).sum().item()
        total_samples += batch['labels'].size(0)
# Average loss and accuracy over the whole test set.
avg_loss = total_loss / len(test_loader)
avg_accuracy = total_accuracy / total_samples
print(f"Test Loss: {avg_loss:.4f}, Test Accuracy: {avg_accuracy:.4f}")
感觉模型、训练方法和数据并没有完全匹配:每次训练 5 轮的 loss 波动都很大,有时严重过拟合,有时又欠拟合。总之,这份代码仅是一个思路框架,后续可以根据需求调整模型、数据和超参数。
4 tensorflow版本代码 有问题
# 我的电脑是win11,tensorflow版本是2.10(windows最多只能安装到2.10)
# 最后会报错 似乎是train_dataset的格式和tf的model要求的类型不匹配
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, BertTokenizer
from datasets import load_dataset
import pandas as pd
# Load the local tokenizer and TF model (binary classification head).
checkpoint = "F:/transformer/hugging-face-models/bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Paths of the local GLUE/MRPC Parquet files.
train_path = "F:/transformer/data_set/glue/mrpc/train/0000.parquet"
val_path = "F:/transformer/data_set/glue/mrpc/validation/0000.parquet"
# Read the training set.
train_df = pd.read_parquet(train_path)
print(train_df.columns)  # Index(['sentence1', 'sentence2', 'label', 'idx'], dtype='object')
# Read the validation set.
val_df = pd.read_parquet(val_path)
# Tokenize sentence pairs for the training set.
train_encodings = tokenizer(train_df['sentence1'].tolist(), train_df['sentence2'].tolist(),
                            truncation=True, padding=True, max_length=128, return_tensors="tf")
train_labels = train_df['label'].tolist()
# Tokenize sentence pairs for the validation set.
val_encodings = tokenizer(val_df['sentence1'].tolist(), val_df['sentence2'].tolist(),
                          truncation=True, padding=True, max_length=128, return_tensors="tf")
val_labels = val_df['label'].tolist()
# Convert to tf.data.Dataset.
# BUG FIX: the original hand-built the feature dict with only input_ids and
# attention_mask, dropping token_type_ids — BERT needs segment ids to
# distinguish the two sentences of an MRPC pair. dict(encodings) keeps
# every field the tokenizer produced.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).batch(32)
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
)).batch(32)
# Train the model.
# NOTE(review): the AttributeError quoted below ("keras.utils has no
# attribute 'unpack_x_y_sample_weight'") looks like a version mismatch
# between transformers 4.38 and TF/Keras 2.10, not a dataset-format problem
# — try a transformers release tested against TF 2.10 (or a newer TF).
# TODO confirm against the transformers compatibility matrix.
model.compile(optimizer='adam', loss=loss)
model.fit(train_dataset, validation_data=val_dataset, epochs=3)
最后会报错:
"""
All PyTorch model weights were used when initializing TFBertForSequenceClassification.
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model
and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Index(['sentence1', 'sentence2', 'label', 'idx'], dtype='object')
Epoch 1/3
Traceback (most recent call last):
File C:\ProgramData\anaconda3\envs\Transformer\lib\site-packages\spyder_kernels\py3compat.py:356 in compat_exec
exec(code, globals, locals)
File f:\transformer\code\3\find_tune.py:54
model.fit(train_dataset, validation_data=val_dataset, epochs=3)
File C:\ProgramData\anaconda3\envs\Transformer\lib\site-packages\transformers\modeling_tf_utils.py:1161 in fit
return super().fit(*args, **kwargs)
File C:\ProgramData\anaconda3\envs\Transformer\lib\site-packages\keras\utils\traceback_utils.py:70 in error_handler
raise e.with_traceback(filtered_tb) from None
File ~\AppData\Local\Temp\__autograph_generated_filev7qdorbq.py:15 in tf__train_function
retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
File C:\ProgramData\anaconda3\envs\Transformer\lib\site-packages\transformers\modeling_tf_utils.py:1562 in train_step
x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data)
AttributeError: in user code:
File "C:\ProgramData\anaconda3\envs\Transformer\lib\site-packages\keras\engine\training.py", line 1160, in train_function *
return step_function(self, iterator)
File "C:\ProgramData\anaconda3\envs\Transformer\lib\site-packages\keras\engine\training.py", line 1146, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "C:\ProgramData\anaconda3\envs\Transformer\lib\site-packages\keras\engine\training.py", line 1135, in run_step **
outputs = model.train_step(data)
File "C:\ProgramData\anaconda3\envs\Transformer\lib\site-packages\transformers\modeling_tf_utils.py", line 1562, in train_step
x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data)
AttributeError: module 'keras.utils' has no attribute 'unpack_x_y_sample_weight'
"""
# 不知道如何解决,windows最新版本的tf就是2.10,没办法用更新的了。
# 注:从报错看,问题大概率出在 transformers 4.38 与 TF/Keras 2.10 的版本不匹配
# (transformers 调用了 keras.utils.unpack_x_y_sample_weight),可以尝试
# 降低 transformers 到官方标注兼容 TF 2.10 的版本,待验证。