解决多标签多分类数据中类不均衡的问题

废话不多说,直接上代码

1.数值型特征数据

from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Generate a synthetic multi-label, multi-class dataset.
X, y = make_multilabel_classification(n_samples=1000, n_classes=5, n_labels=2, random_state=42)

# Split the data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Number of positive examples per class in the training set.
class_sample_counts = y_train.sum(axis=0)

# Target number of rows to keep after downsampling.
max_samples_per_class = 200

# Downsample the training set to `max_samples_per_class` rows.
# NOTE: sklearn's resample draws whole rows jointly, so in a multi-label
# setting this does NOT guarantee exactly 200 positives per class.
X_resampled, y_resampled = resample(X_train, y_train, n_samples=max_samples_per_class, random_state=42)

# BUG FIX: report the ACTUAL per-class counts after resampling instead of
# unconditionally printing `max_samples_per_class` for every class.
resampled_counts = y_resampled.sum(axis=0)
for i in range(y_train.shape[1]):
    print(f"Class {i}: Original samples: {class_sample_counts[i]}, Resampled samples: {resampled_counts[i]}")

# X_resampled and y_resampled can now be used to train a model.


#返回

#Class 0: Original samples: 233, Resampled samples: 200
#Class 1: Original samples: 432, Resampled samples: 200
#Class 2: Original samples: 370, Resampled samples: 200
#Class 3: Original samples: 327, Resampled samples: 200
#Class 4: Original samples: 131, Resampled samples: 200

2.图像型特征数据(torch.utils.data.sampler.WeightedRandomSampler()方法)

########根据每个类别的权重进行随机采样
########根据每个类别的权重进行随机采样
def some_custom_weight_function(class_sample_counts):
    """Derive a per-class weight inversely proportional to class frequency.

    Parameters:
        class_sample_counts (numpy.ndarray): number of positive samples for
            each class, shape (n_classes,).

    Returns:
        numpy.ndarray: per-class weights, normalised so they sum to 1.
    """
    # Inverse-frequency weighting; the epsilon guards against division by zero.
    eps = 1e-6
    raw_weights = class_sample_counts.sum() / (class_sample_counts + eps)
    # Normalise the weights into a probability-like vector (sum == 1).
    return raw_weights / raw_weights.sum()

# 计算每个类别的样本数量
# Number of positive examples per class in the training labels.
class_sample_counts = np.sum(train_data.labels, axis=0)

# Per-class weights (inverse-frequency based) from the helper above.
class_weights = some_custom_weight_function(class_sample_counts)

# BUG FIX: WeightedRandomSampler expects one weight PER SAMPLE, not per
# class. Project the per-class weights onto every example through the
# multi-hot label matrix: each sample's weight is the sum of the weights
# of the classes it carries, so samples with rare labels are drawn more
# often.
sample_weights = np.asarray(train_data.labels, dtype=np.float64) @ np.asarray(class_weights, dtype=np.float64)

# Convert to a PyTorch tensor (double precision, as the sampler prefers).
sample_weights = torch.DoubleTensor(sample_weights)

# Create the sampler: draws len(train_data) samples with replacement,
# proportionally to each sample's weight.
sampler = torch.utils.data.sampler.WeightedRandomSampler(sample_weights, len(train_data), replacement=True)

# Data loader that iterates the dataset through the weighted sampler.
Data_Loader = DataLoader(train_data, batch_size=your_bs, sampler=sampler)  # set your batch size

3. 重采样

import random

from torch.utils.data.sampler import Sampler


class MultilabelBalancedRandomSampler(Sampler):
    """
    MultilabelBalancedRandomSampler: Given a multilabel dataset of length n_samples and
    number of classes n_classes, samples from the data with equal probability per class
    effectively oversampling minority classes and undersampling majority classes at the
    same time. Note that using this sampler does not guarantee that the distribution of
    classes in the output samples will be uniform, since the dataset is multilabel and
    sampling is based on a single class. This does however guarantee that all classes
    will have at least batch_size / n_classes samples as batch_size approaches infinity
    """

    def __init__(self, labels, indices=None, class_choice="least_sampled"):
        """
        Parameters:
        -----------
            labels: a multi-hot encoding numpy array of shape (n_samples, n_classes)
            indices: an arbitrary-length 1-dimensional numpy array representing a list
            of indices to sample only from
            class_choice: a string indicating how class will be selected for every
            sample:
                "least_sampled": class with the least number of sampled labels so far
                "random": class is chosen uniformly at random
                "cycle": the sampler cycles through the classes sequentially
        """
        self.labels = labels
        self.indices = indices
        if self.indices is None:
            self.indices = range(len(labels))

        self.num_classes = self.labels.shape[1]

        # List of lists of example indices per class, restricted to `indices`.
        self.class_indices = []
        for class_ in range(self.num_classes):
            lst = np.where(self.labels[:, class_] == 1)[0]
            lst = lst[np.isin(lst, self.indices)]
            self.class_indices.append(lst)

        # Running count of how often each class label has been sampled;
        # only consulted by the "least_sampled" strategy.
        self.counts = [0] * self.num_classes
        assert class_choice in ["least_sampled", "random", "cycle"]
        self.class_choice = class_choice
        self.current_class = 0

    def __iter__(self):
        # Reset the epoch counter; the sampler itself is the iterator.
        self.count = 0
        return self

    def __next__(self):
        if self.count >= len(self.indices):
            raise StopIteration
        self.count += 1
        return self.sample()

    def sample(self):
        """Pick a class, then a uniformly random example carrying that class."""
        class_ = self.get_class()
        class_indices = self.class_indices[class_]
        chosen_index = np.random.choice(class_indices)
        if self.class_choice == "least_sampled":
            # Credit every label the chosen example carries (renamed loop
            # variable to avoid shadowing `class_` above).
            for label_idx, indicator in enumerate(self.labels[chosen_index]):
                if indicator == 1:
                    self.counts[label_idx] += 1
        return chosen_index

    def get_class(self):
        if self.class_choice == "random":
            class_ = random.randint(0, self.labels.shape[1] - 1)
        elif self.class_choice == "cycle":
            class_ = self.current_class
            self.current_class = (self.current_class + 1) % self.labels.shape[1]
        elif self.class_choice == "least_sampled":
            min_count = self.counts[0]
            min_classes = [0]
            for class_ in range(1, self.num_classes):
                if self.counts[class_] < min_count:
                    min_count = self.counts[class_]
                    min_classes = [class_]
                # BUG FIX: must be `elif`, not `if` — with a bare `if`, a
                # class that just set the new minimum was appended a second
                # time, doubling its probability in np.random.choice below.
                elif self.counts[class_] == min_count:
                    min_classes.append(class_)
            class_ = np.random.choice(min_classes)
        return class_

    def __len__(self):
        return len(self.indices)

4. 加权损失

def compute_class_freqs(labels):
    """
    Compute positive and negative frequencies for each class.

    Args:
        labels (np.array): matrix of labels, size (num_examples, num_classes)
    Returns:
        positive_frequencies (np.array): array of positive frequencies for each
                                         class, size (num_classes)
        negative_frequencies (np.array): array of negative frequencies for each
                                         class, size (num_classes)
    """
    # Fraction of examples (rows) in which each class is present.
    positive_frequencies = np.mean(labels, axis=0)
    # Complement: fraction of examples in which each class is absent.
    negative_frequencies = 1.0 - positive_frequencies
    return positive_frequencies, negative_frequencies


# Per-class positive/negative frequencies over the training labels.
freq_pos, freq_neg = compute_class_freqs(train_dataset.labels)

# Weight each loss term by the OPPOSITE frequency, so rare positives
# (and rare negatives) contribute more to the loss.
pos_weights, neg_weights = freq_neg, freq_pos

# Expected weighted contribution of each term (diagnostic quantities;
# by construction pos_contribution == neg_contribution per class).
pos_contribution = pos_weights * freq_pos
neg_contribution = neg_weights * freq_neg


class WeightedLoss(nn.Module):
    """Frequency-weighted binary cross-entropy for multi-label classification.

    Relies on the module-level ``pos_weights`` / ``neg_weights`` arrays
    (one entry per class) computed from the training label frequencies.
    ``y_pred`` is expected to contain probabilities in [0, 1] (i.e. after a
    sigmoid); ``y_true`` is the multi-hot target matrix of the same shape.
    """

    def __init__(self):
        super(WeightedLoss, self).__init__()

    def forward(self, y_pred, y_true):
        loss = 0.0
        epsilon = 1e-9  # guards torch.log against log(0)
        # BUG FIX: the original source had a broken backslash line
        # continuation ("1\ " followed by a space) inside the negative
        # term, which is a SyntaxError; the expression is reconstructed
        # here with ordinary parenthesised continuation.
        for i in range(len(pos_weights)):
            # Average weighted loss of the positive term for class i.
            loss_pos = -1 * torch.mean(
                pos_weights[i] * y_true[:, i] * torch.log(y_pred[:, i] + epsilon)
            )
            # Average weighted loss of the negative term for class i.
            loss_neg = -1 * torch.mean(
                neg_weights[i] * (1 - y_true[:, i]) * torch.log(1 - y_pred[:, i] + epsilon)
            )
            loss += loss_pos + loss_neg
        return loss

  • 2
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Python_GNN-DL

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值