自定义损失函数的数值稳定性
在完成"请基于 pytorch 实现 ModernNCA,在 abalone 数据集上测试性能,并简单分析两种实现的差异及其性能差距的原因(ModernNCA 的参考实现可以在 LAMDA-TALENT 中获取)"这一作业时,我需要自己定义损失函数来刻画降维后的输入样本的距离性质:同类样本的距离要近,异类样本间距离要远。
class NCA_Loss(nn.Module):
    """Distance-based loss on embedded samples: pull same-class pairs
    together and push different-class pairs apart.

    Args:
        variant: 1 -> negative log-ratio loss  -mean(log(P_same / P_diff));
                 2 -> difference loss  P_diff.sum() - P_same.sum()
                 (default; numerically safer, see discussion below).
    """

    def __init__(self, variant=2):
        super(NCA_Loss, self).__init__()
        self.variant = variant

    def forward(self, embedded, targets):
        batch_size = embedded.size(0)
        # Pairwise Euclidean distances between all embedded samples.
        pairwise_distances = torch.cdist(embedded, embedded)
        # True where two samples share a label.
        same_class_mask = targets.unsqueeze(1) == targets.unsqueeze(0)
        diff_class_mask = 1 - same_class_mask.float()
        # Zero the diagonal so a sample is not its own neighbour.
        same_class_mask = same_class_mask.float() - torch.eye(batch_size, device=embedded.device)
        # Soft neighbour weights: closer pairs get exponentially larger weight.
        P = torch.exp(-pairwise_distances)
        P_same = torch.sum(P * same_class_mask, dim=1)
        P_diff = torch.sum(P * diff_class_mask, dim=1)
        if self.variant == 1:
            # Log-ratio form: can yield nan when P_same or P_diff underflows to 0.
            loss = -torch.mean(torch.log(P_same / P_diff))
        else:
            # Difference form: no log/division, hence numerically safe.
            loss = P_diff.sum() - P_same.sum()
        return loss
我发现,第一种loss在训练的时候会nan,而第二种不会
在分析这两种损失函数时,我们需要关注它们在计算过程中可能遇到的数值问题和稳定性问题。以下是详细的分析:
第一种损失函数分析
class NCA_Loss(nn.Module):
    """Negative log-ratio NCA loss: -mean(log(P_same / P_diff))."""

    def __init__(self):
        super(NCA_Loss, self).__init__()

    def forward(self, embedded, targets):
        n = embedded.size(0)
        # Soft neighbour weight for every ordered pair of samples.
        weights = torch.exp(-torch.cdist(embedded, embedded))
        labels_match = targets.unsqueeze(1) == targets.unsqueeze(0)
        not_self = ~torch.eye(n, dtype=torch.bool, device=embedded.device)
        # Same-class weight sum excludes each sample itself (diagonal).
        p_same = (weights * (labels_match & not_self).float()).sum(dim=1)
        p_diff = (weights * (~labels_match).float()).sum(dim=1)
        return -torch.log(p_same / p_diff).mean()
(可以调整lr,但最多让epoch20的时候不是nan,后面一定是nan)
问题分析
- 数值稳定性:
P = torch.exp(-pairwise_distances)
:在计算负对数似然损失时,torch.exp(-pairwise_distances)
可能会导致数值不稳定。特别是当pairwise_distances
很大时,torch.exp(-pairwise_distances)
会非常接近于 0,这会导致除法时的数值问题。P_same / P_diff
:如果P_diff
中有为 0 的值,会导致除法结果为无穷大。后续的torch.log
操作会将无穷大变成nan
。
- 对数操作:
torch.log(P_same / P_diff)
:如果P_same / P_diff
为负数或 0,torch.log
会返回nan
或inf
,这会导致最终的损失值为nan
。
- 对角线处理:
same_class_mask = same_class_mask.float() - torch.eye(batch_size, device=embedded.device)
:这一步是为了排除每个样本自身的距离(对角线元素),但可能会导致P_same
为 0 或非常接近 0 的情况,尤其是在样本数量较少时。
第二种损失函数分析
class NCA_Loss(nn.Module):
    """Difference-form NCA loss: total cross-class neighbour weight minus
    total within-class neighbour weight (self-pairs excluded)."""

    def __init__(self):
        super(NCA_Loss, self).__init__()

    def forward(self, embedded, targets):
        n = embedded.size(0)
        # exp(-distance): nearby pairs dominate.
        weights = torch.exp(-torch.cdist(embedded, embedded))
        labels_match = targets.unsqueeze(1) == targets.unsqueeze(0)
        off_diag = ~torch.eye(n, dtype=torch.bool, device=embedded.device)
        within = weights[labels_match & off_diag].sum()
        across = weights[~labels_match].sum()
        # No log/division anywhere, so the result is always finite.
        return across - within
(lr=1e-2)
问题分析
- 数值稳定性:
P = torch.exp(-pairwise_distances)
:这一部分仍然可能导致数值不稳定,但对最终的损失计算影响较小。
- 损失计算:
loss = P_diff.sum() - P_same.sum()
:这种计算方式不会涉及对数或除法操作,因此不会产生nan
或inf
问题。`P_diff.sum()` 和 `P_same.sum()` 都是有限的非负数(因为 `P` 是 `exp` 函数的输出,取值在 (0, 1] 之间)。两者之差虽然可能为负,但始终是有限值,不会出现 nan 或 inf,因此这种形式在数值上是稳定的。
解决方法
为了改进第一种损失函数,使其在训练过程中不产生 nan
,可以采取以下措施:
- 数值稳定性:
  - 使用 `torch.log_softmax` 或 `torch.logsumexp` 来处理指数和对数运算,以提高数值稳定性。
  - 添加一个小的常数 `epsilon`,避免除法和取对数时出现 0。
- 对角线处理:
  - 在排除对角线(样本自身)之后,确保 `P_same` 和 `P_diff` 中的值不会变成 0。
改进后的第一种损失函数
import torch
import torch.nn as nn
class NCA_Loss(nn.Module):
    """Numerically stabilised log-ratio NCA loss.

    A small epsilon is added to both weight sums so neither can be exactly 0,
    then the loss is computed as log(P_same) - log(P_diff) instead of
    log(P_same / P_diff): mathematically equivalent, but it avoids forming the
    ratio and drops the original version's redundant second epsilon on P_diff.

    Args:
        epsilon: positive floor added to both weight sums (default 1e-10).
    """

    def __init__(self, epsilon=1e-10):
        super(NCA_Loss, self).__init__()
        self.epsilon = epsilon

    def forward(self, embedded, targets):
        batch_size = embedded.size(0)
        pairwise_distances = torch.cdist(embedded, embedded)
        same_class_mask = targets.unsqueeze(1) == targets.unsqueeze(0)
        diff_class_mask = 1 - same_class_mask.float()
        # Zero the diagonal: a sample is not its own neighbour.
        same_class_mask = same_class_mask.float() - torch.eye(batch_size, device=embedded.device)
        # Soft neighbour weights from negative distances.
        P = torch.exp(-pairwise_distances)
        # epsilon keeps both sums strictly positive, so log never sees 0.
        P_same = torch.sum(P * same_class_mask, dim=1) + self.epsilon
        P_diff = torch.sum(P * diff_class_mask, dim=1) + self.epsilon
        # log difference instead of log of a ratio: same value, more stable.
        loss = -torch.mean(torch.log(P_same) - torch.log(P_diff))
        return loss
详细说明
epsilon
:添加一个小的常数epsilon
(如1e-10
)来避免P_same
和P_diff
为 0 的情况,从而防止对数操作中的nan
问题。P_diff + self.epsilon
:在除法操作中也添加epsilon
来进一步提高数值稳定性。
补充一个小问题
深度学习自定义损失函数的时候,需不需要保证损失函数一定是正数?
损失函数的值不一定要严格保证是正数,但通常情况下,损失函数设计为非负值是有益的。
- 稳定性:非负的损失函数更容易在训练过程中保持数值稳定,尤其是在使用梯度下降等优化算法时。
- 方向性:损失函数的值为负并不会改变梯度的方向。优化算法仍然会朝着使损失进一步减小(无论其当前符号为正还是为负)的方向更新参数。
- 可解释性:非负的损失函数更容易解释,例如 0 可以作为"最优拟合"的自然下界。
最后放上完整代码:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
# 1. Data loading & preprocessing
class AbaloneDataset(Dataset):
    """Minimal map-style dataset over pre-split abalone arrays."""

    def __init__(self, X, y):
        # Features as float32, class labels as int64 for the loss/knn stages.
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
# Load the pre-split abalone arrays from disk.
X_train = np.load('abalone/N_train.npy', allow_pickle=True)
X_val = np.load('abalone/N_val.npy', allow_pickle=True)
X_test = np.load('abalone/N_test.npy', allow_pickle=True)
y_train = np.load('abalone/y_train.npy', allow_pickle=True)
y_val = np.load('abalone/y_val.npy', allow_pickle=True)
y_test = np.load('abalone/y_test.npy', allow_pickle=True)

# String labels -> integer class ids (encoder fitted on training labels only).
if isinstance(y_train[0], str):
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_val = label_encoder.transform(y_val)
    y_test = label_encoder.transform(y_test)

# Standardise features using statistics of the training split only.
scaler = StandardScaler()
X_train, X_val, X_test = (
    scaler.fit_transform(X_train),
    scaler.transform(X_val),
    scaler.transform(X_test),
)

# Wrap each split in a Dataset/DataLoader pair; only training data is shuffled.
train_dataset, val_dataset, test_dataset = (
    AbaloneDataset(X_train, y_train),
    AbaloneDataset(X_val, y_val),
    AbaloneDataset(X_test, y_test),
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# 2. Model definition
class ModernNCA(nn.Module):
    """Small MLP that embeds input features into a learned metric space."""

    def __init__(self, input_dim, embed_dim):
        super(ModernNCA, self).__init__()
        stages = [
            nn.Linear(input_dim, embed_dim),
            nn.BatchNorm1d(embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, embed_dim),
        ]
        self.embed = nn.Sequential(*stages)

    def forward(self, x):
        return self.embed(x)
# 3. Loss function used for training.
class NCA_Loss(nn.Module):
    """Difference-form NCA loss: sum of cross-class neighbour weights minus
    sum of within-class neighbour weights (self-pairs excluded)."""

    def __init__(self):
        super(NCA_Loss, self).__init__()

    def forward(self, embedded, targets):
        n = embedded.size(0)
        # exp(-distance): nearby pairs dominate.
        weights = torch.exp(-torch.cdist(embedded, embedded))
        labels_match = targets.unsqueeze(1) == targets.unsqueeze(0)
        off_diag = ~torch.eye(n, dtype=torch.bool, device=embedded.device)
        p_same = weights[labels_match & off_diag].sum()
        p_diff = weights[~labels_match].sum()
        # The log-ratio variant (-mean(log(P_same / P_diff))) produced nan
        # during training; this additive form avoids log/division entirely.
        return p_diff - p_same
# Initialise the model, the custom loss and the optimiser.
input_dim = X_train.shape[1]
embed_dim = 3  # 3-D embedding, matching the sklearn NCA baseline (n_components=3)
model = ModernNCA(input_dim, embed_dim)
criterion = NCA_Loss()
optimizer = optim.Adam(model.parameters(), lr=1e-2)
# Training loop.
def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=10):
    """Train `model` with `criterion`; print train/val loss every 10 epochs."""
    for epoch in range(num_epochs):
        # --- optimisation pass over the training split ---
        model.train()
        running_total = 0.0
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_x), batch_y)
            batch_loss.backward()
            optimizer.step()
            running_total += batch_loss.item()
        train_loss = running_total / len(train_loader)
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}')
        # --- evaluation pass over the validation split (no gradients) ---
        model.eval()
        val_total = 0.0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                val_total += criterion(model(batch_x), batch_y).item()
        val_loss = val_total / len(val_loader)
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch + 1}/{num_epochs}, Val Loss: {val_loss:.4f}')
# Train the model (default: 10 epochs).
train_model(model, criterion, optimizer, train_loader, val_loader)
# 4. Performance evaluation
def evaluate_model(model, train_loader, x_test, y_test):
    """Fit a 3-NN classifier on the training embeddings and print a
    classification report for the test split.

    Fixes over the original version:
    - test embeddings are converted to a NumPy array before being handed to
      sklearn (passing a torch.Tensor only works incidentally on CPU);
    - the redundant second `model.eval()` call is removed.
    """
    model.eval()
    with torch.no_grad():
        # Embed the whole training split to build the k-NN reference set.
        all_embeddings = []
        all_labels = []
        for inputs, targets in train_loader:
            all_embeddings.append(model(inputs))
            all_labels.append(targets)
        all_embeddings = torch.cat(all_embeddings).cpu().numpy()
        all_labels = torch.cat(all_labels).cpu().numpy()
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(all_embeddings, all_labels)
        # Embed the test split with the same model.
        x_test = torch.tensor(x_test, dtype=torch.float32)
        test_embeddings = model(x_test).cpu().numpy()
    y_pred = knn.predict(test_embeddings)
    print(classification_report(y_test, y_pred))
# Evaluate: k-NN accuracy in the learned embedding space on the test split.
evaluate_model(model, train_loader,X_test, y_test)
# 5. Classic (linear) NCA baseline for comparison.
from sklearn.neighbors import NeighborhoodComponentsAnalysis

# Fit linear NCA on the training split, projecting to 3 components.
nca = NeighborhoodComponentsAnalysis(n_components=3, random_state=42)
nca.fit(X_train, y_train)

# Project every split with the learned linear map.
X_train_nca, X_val_nca, X_test_nca = (
    nca.transform(X_train),
    nca.transform(X_val),
    nca.transform(X_test),
)

# 3-NN classifier in the NCA space, evaluated on the test split.
knn_nca = KNeighborsClassifier(n_neighbors=3)
knn_nca.fit(X_train_nca, y_train)
y_pred_nca = knn_nca.predict(X_test_nca)
print("传统 NCA 的分类报告:\n", classification_report(y_test, y_pred_nca))