Without further ado, here is the code.
1. Numerical feature data
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
# Generate a synthetic multilabel classification dataset
X, y = make_multilabel_classification(n_samples=1000, n_classes=5, n_labels=2, random_state=42)
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Count the number of positive samples per class
class_sample_counts = y_train.sum(axis=0)
# Cap on the number of samples per class
max_samples_per_class = 200
# Undersample the training set; replace=False draws without replacement,
# which is what undersampling requires (note: this draws 200 rows from the
# whole training set in one shot, not 200 per class)
X_resampled, y_resampled = resample(X_train, y_train, n_samples=max_samples_per_class, replace=False, random_state=42)
# Print the sample count per class (the second number just echoes the cap)
for i in range(y_train.shape[1]):
    print(f"Class {i}: Original samples: {class_sample_counts[i]}, Resampled samples: {max_samples_per_class}")
# X_resampled and y_resampled can now be used to train a model
# Output:
#Class 0: Original samples: 233, Resampled samples: 200
#Class 1: Original samples: 432, Resampled samples: 200
#Class 2: Original samples: 370, Resampled samples: 200
#Class 3: Original samples: 327, Resampled samples: 200
#Class 4: Original samples: 131, Resampled samples: 200
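As noted above, resample draws max_samples_per_class rows from the whole training set, not per class. If you want a true per-class cap in the multilabel setting, a minimal sketch follows (cap_per_class is a hypothetical helper; because labels are shared across rows, a row kept for one class can still push another class over the cap):
import numpy as np

def cap_per_class(X, y, max_per_class, seed=42):
    """Keep at most max_per_class positive rows per class (union of kept rows)."""
    rng = np.random.default_rng(seed)
    keep = np.zeros(len(X), dtype=bool)
    for c in range(y.shape[1]):
        pos = np.where(y[:, c] == 1)[0]  # rows positive for class c
        if len(pos) > max_per_class:
            pos = rng.choice(pos, size=max_per_class, replace=False)
        keep[pos] = True  # kept rows may carry other labels too
    return X[keep], y[keep]

X_capped, y_capped = cap_per_class(X_train, y_train, max_samples_per_class)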
2. Image feature data (the torch.utils.data.sampler.WeightedRandomSampler() approach)
######## Sample randomly according to per-class weights
import numpy as np
import torch
from torch.utils.data import DataLoader

def some_custom_weight_function(class_sample_counts):
    """
    Compute a weight for each class based on its sample count.
    Args:
        class_sample_counts (numpy.ndarray): number of samples per class.
    Returns:
        class_weights (numpy.ndarray): weight per class.
    """
    total_samples = class_sample_counts.sum()
    class_weights = total_samples / (class_sample_counts + 1e-6)  # small epsilon avoids division by zero
    # Optionally normalize so the weights sum to 1
    class_weights /= class_weights.sum()
    return class_weights

# Count the number of positive samples per class
class_sample_counts = np.sum(train_data.labels, axis=0)
# Compute per-class weights (any custom weighting scheme works here)
class_weights = some_custom_weight_function(class_sample_counts)
# WeightedRandomSampler expects one weight PER SAMPLE, not per class:
# give each sample the summed weight of the classes it is labeled with
# (a sample with no positive labels would get weight 0 and never be drawn)
sample_weights = (train_data.labels * class_weights).sum(axis=1)
sample_weights = torch.as_tensor(sample_weights, dtype=torch.float)
# Build the sampler; replacement=True lets minority samples be drawn repeatedly
sampler = torch.utils.data.sampler.WeightedRandomSampler(sample_weights, len(train_data), replacement=True)
# Create a data loader that iterates the dataset with this sampler
data_loader = DataLoader(train_data, batch_size=your_bs, sampler=sampler)  # set your batch size
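The snippet above assumes train_data exposes a multi-hot labels array. A minimal sketch of such a Dataset (MultilabelImageDataset is a hypothetical name, purely for illustration):
import torch
from torch.utils.data import Dataset

class MultilabelImageDataset(Dataset):
    """Toy dataset exposing the multi-hot `labels` array assumed above."""
    def __init__(self, images, labels):
        self.images = images  # e.g. a float tensor of shape (N, C, H, W)
        self.labels = labels  # multi-hot numpy array of shape (N, n_classes)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        x = self.images[idx]
        y = torch.as_tensor(self.labels[idx], dtype=torch.float)
        return x, y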
3. Resampling (a balanced multilabel sampler)
import random
import numpy as np
from torch.utils.data.sampler import Sampler

class MultilabelBalancedRandomSampler(Sampler):
    """
    MultilabelBalancedRandomSampler: Given a multilabel dataset of length n_samples and
    number of classes n_classes, samples from the data with equal probability per class,
    effectively oversampling minority classes and undersampling majority classes at the
    same time. Note that using this sampler does not guarantee that the distribution of
    classes in the output samples will be uniform, since the dataset is multilabel and
    sampling is based on a single class. It does, however, guarantee that all classes
    will have at least batch_size / n_classes samples as batch_size approaches infinity.
    """

    def __init__(self, labels, indices=None, class_choice="least_sampled"):
        """
        Parameters:
        -----------
        labels: a multi-hot encoding numpy array of shape (n_samples, n_classes)
        indices: an arbitrary-length 1-dimensional numpy array representing a list
            of indices to sample only from
        class_choice: a string indicating how a class will be selected for every
            sample:
            "least_sampled": class with the least number of sampled labels so far
            "random": class is chosen uniformly at random
            "cycle": the sampler cycles through the classes sequentially
        """
        self.labels = labels
        self.indices = indices
        if self.indices is None:
            self.indices = range(len(labels))
        self.num_classes = self.labels.shape[1]

        # List of lists of example indices per class
        self.class_indices = []
        for class_ in range(self.num_classes):
            lst = np.where(self.labels[:, class_] == 1)[0]
            lst = lst[np.isin(lst, self.indices)]
            self.class_indices.append(lst)

        self.counts = [0] * self.num_classes

        assert class_choice in ["least_sampled", "random", "cycle"]
        self.class_choice = class_choice
        self.current_class = 0

    def __iter__(self):
        self.count = 0
        return self

    def __next__(self):
        if self.count >= len(self.indices):
            raise StopIteration
        self.count += 1
        return self.sample()

    def sample(self):
        class_ = self.get_class()
        class_indices = self.class_indices[class_]
        chosen_index = np.random.choice(class_indices)
        if self.class_choice == "least_sampled":
            # Update the running count for every class the chosen example carries
            for class_, indicator in enumerate(self.labels[chosen_index]):
                if indicator == 1:
                    self.counts[class_] += 1
        return chosen_index

    def get_class(self):
        if self.class_choice == "random":
            class_ = random.randint(0, self.labels.shape[1] - 1)
        elif self.class_choice == "cycle":
            class_ = self.current_class
            self.current_class = (self.current_class + 1) % self.labels.shape[1]
        elif self.class_choice == "least_sampled":
            # Collect all classes tied for the minimum count, then pick one at
            # random (elif prevents the freshly reset class from being added twice)
            min_count = self.counts[0]
            min_classes = [0]
            for class_ in range(1, self.num_classes):
                if self.counts[class_] < min_count:
                    min_count = self.counts[class_]
                    min_classes = [class_]
                elif self.counts[class_] == min_count:
                    min_classes.append(class_)
            class_ = np.random.choice(min_classes)
        return class_

    def __len__(self):
        return len(self.indices)
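A quick usage sketch (train_data and your_bs are the same assumed names as in section 2):
from torch.utils.data import DataLoader

sampler = MultilabelBalancedRandomSampler(train_data.labels, class_choice="least_sampled")
loader = DataLoader(train_data, batch_size=your_bs, sampler=sampler)
for images, targets in loader:
    ...  # one training step per balanced batch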
4. Weighted loss
The idea: weight each class's positive cross-entropy term by the class's negative frequency and the negative term by its positive frequency, so both terms contribute equally no matter how imbalanced the class is.
def compute_class_freqs(labels):
    """
    Compute positive and negative frequencies for each class.
    Args:
        labels (np.array): matrix of labels, size (num_examples, num_classes)
    Returns:
        positive_frequencies (np.array): array of positive frequencies for each
            class, size (num_classes)
        negative_frequencies (np.array): array of negative frequencies for each
            class, size (num_classes)
    """
    # Total number of examples (rows)
    N = labels.shape[0]
    positive_frequencies = labels.sum(axis=0) / N
    negative_frequencies = 1.0 - positive_frequencies
    return positive_frequencies, negative_frequencies

freq_pos, freq_neg = compute_class_freqs(train_dataset.labels)
# Cross the weights: positives get the negative frequency and vice versa
pos_weights = freq_neg
neg_weights = freq_pos
pos_contribution = freq_pos * pos_weights  # == freq_pos * freq_neg
neg_contribution = freq_neg * neg_weights  # == freq_neg * freq_pos, i.e. balanced
import torch
import torch.nn as nn

class WeightedLoss(nn.Module):
    def __init__(self, pos_weights, neg_weights):
        super(WeightedLoss, self).__init__()
        self.pos_weights = pos_weights
        self.neg_weights = neg_weights

    def forward(self, y_pred, y_true):
        # y_pred is expected to hold probabilities (e.g. after a sigmoid)
        loss = 0.0
        epsilon = 1e-9  # avoids log(0)
        for i in range(len(self.pos_weights)):
            # For each class, add the average weighted loss for that class
            loss_pos = -torch.mean(
                self.pos_weights[i] * y_true[:, i] * torch.log(y_pred[:, i] + epsilon)
            )
            loss_neg = -torch.mean(
                self.neg_weights[i] * (1 - y_true[:, i]) * torch.log(1 - y_pred[:, i] + epsilon)
            )
            loss += loss_pos + loss_neg
        return loss
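A minimal usage sketch (model, images, and targets are assumed names; the model is assumed to output one logit per class, which the sigmoid turns into the probabilities the loss expects):
import torch

criterion = WeightedLoss(torch.as_tensor(pos_weights, dtype=torch.float),
                         torch.as_tensor(neg_weights, dtype=torch.float))
logits = model(images)         # shape (batch_size, num_classes)
probs = torch.sigmoid(logits)  # the loss expects probabilities, not logits
loss = criterion(probs, targets.float())
loss.backward()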