大家好,最近我在用CLIP模型去对图片进行分类并输出每个类大的概率值。但是,在训练时出现了每一个epoch的loss值都是nan的情况,也尝试着去降低学习率,对数据集进行统一等等方法去解决。但是目前还是每一个epoch都是nan,有大佬知道还有什么方法去解决嘛?代码我放在下面啦,数据集里面我对图像进行了简单的4分类,每一类里面只有10张图片。之前运行的时候是成功的,而且准确率也特别高,但是后来这个代码也没动,再次去运行的时候就会出现nan。
数据集:
全部代码:
import os
from PIL import Image
from torch.utils.data import Dataset
import torch
import clip
from PIL import Image
from torch.utils.data import DataLoader
class DefectDataset(Dataset):
def __init__(self, root_dir, preprocess):
self.root_dir = root_dir
self.preprocess = preprocess
self.image_paths = []
self.labels = []
self.classes = ["Irregular defects1", "One or several lines1", "Small punctate defects1","Square block defects1"]
for label in self.classes:
class_dir = os.path.join(root_dir, label)
for filename in os.listdir(class_dir):
if filename.endswith(".png"): # 假设图像格式为PNG
self.image_paths.append(os.path.join(class_dir, filename))
self.labels.append(label)
def __len__(self):
return len(self.image_paths)
def __getitem__(self, idx):
image_path = self.image_paths[idx]
label = self.labels[idx]
image = Image.open(image_path)
image = self.preprocess(image)
return image, label
# 使用示例
root_dir = "C:/Users/zhanjie0470/Desktop/CLIP_WeiTiao/CLIP_WeiTiao/dateset_similar size" # 替换为你的数据集路径
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# 创建数据集
dataset = DefectDataset(root_dir, preprocess)
# 使用DataLoader加载数据
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
# 微调CLIP模型的代码与之前相同
# 读取数据路径和标签
root_dir = "C:/Users/zhanjie0470/Desktop/CLIP_WeiTiao/CLIP_WeiTiao/dateset_similar size" # 替换为你的数据集路径
# 加载CLIP模型和预处理器
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# 创建数据集和数据加载器
dataset = DefectDataset(root_dir, preprocess)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
# 定义优化器 1e-8 eps=1e-8-----不行
optimizer = torch.optim.Adam(model.parameters(), lr=1e-8)
criterion = torch.nn.CrossEntropyLoss()
# 微调模型
model.train()
for epoch in range(5): # 训练5个epoch
for images, labels in dataloader:
images = images.to(device)
text = clip.tokenize(labels).to(device)
# 前向传播
logits_per_image, logits_per_text = model(images, text)
ground_truth = torch.arange(len(images), device=device)
# 计算损失 criterion函数计算的是两个不同模态之间的匹配程度的损失
loss = (criterion(logits_per_image, ground_truth) + criterion(logits_per_text, ground_truth)) / 2
# 反向传播和优化
optimizer.zero_grad()
loss.backward()
optimizer.step()
print(f"Epoch {epoch+1}, Loss: {loss.item()}")
# 保存微调后的模型
torch.save(model.state_dict(), "fine_tuned_clip.pth")
# 进行推理
model.eval()
image = preprocess(Image.open("image.png")).unsqueeze(0).to(device)
text = clip.tokenize(["Irregular defects1", "One or several lines1", "Small punctate defects1","Square block defects1"]).to(device)
with torch.no_grad():
image_features = model.encode_image(image)
text_features = model.encode_text(text)
logits_per_image, logits_per_text = model(image, text)
probs = logits_per_image.softmax(dim=-1).cpu().numpy()
print("Label probs:", dict(zip(["Irregular defects1", "One or several lines1", "Small punctate defects1","Square block defects1"], probs[0])))
真诚期待大家的回复!