【代码复现】ELDB

1、算法

本文是论文《Multi-Instance Ensemble Learning With Discriminative Bags》中算法代码的复现。

2、代码

2.1、MIL.py

这一部分为多示例学习的原型文件,用于获取数据集名称、包空间、包大小等。

import warnings
import numpy as np
import os as os
from Function import load_file
warnings.filterwarnings("ignore")


class MIL:
    """
    Prototype container for a multi-instance learning (MIL) data set.

    It loads (or receives) a bag space and derives the bag sizes, bag labels,
    the flattened instance space and a few summary statistics from it.
    """

    def __init__(self, data_path, save_home="../Data/Distance/", bag_space=None):
        """
        :param data_path: path of the data set file; also used to derive ``data_name``
        :param save_home: directory used to store distance matrices (created if missing)
        :param bag_space: optional pre-loaded bag space in the same layout as the
                          .mat file; when given, ``data_path`` is not read
        """
        self.data_path = data_path
        self.save_home = save_home
        self.bag_space = bag_space
        self.__init_mil()

    def __init_mil(self):
        """
        Derive all data set attributes from the bag space.

        Attributes set here:
            N:           number of bags
            bag_size:    int vector of length N with the size of every bag
            bag_lab:     bag label vector, remapped to 0 .. C - 1
            bag_idx:     bag index vector
            n:           total number of instances
            d:           instance dimensionality (last column of a bag is the label)
            C:           number of distinct bag labels
            ins_space:   all instances stacked row-wise
            ins_idx:     ins_idx[i]:ins_idx[i + 1] is the row range of bag i in ins_space
            ins_lab:     instance labels (last column of every bag)
            ins_bag_idx: index of the owning bag for every instance
            data_name:   data set name derived from data_path
            zero_ratio:  fraction of zero entries in ins_space
        """
        if self.bag_space is None:
            self.bag_space = load_file(self.data_path)
        self.N = len(self.bag_space)

        self.bag_size = np.zeros(self.N, dtype=int)
        self.bag_lab = np.zeros_like(self.bag_size, dtype=int)
        self.bag_idx = np.arange(self.N)
        for i in range(self.N):
            self.bag_size[i] = len(self.bag_space[i][0])
            # .item() accepts both plain scalars and size-1 arrays of any rank
            self.bag_lab[i] = np.asarray(self.bag_space[i][1]).item()
        # Remap the bag labels into the range [0, C - 1]
        self.__bag_lab_map()

        self.n = int(np.sum(self.bag_size))
        # The last column of every bag holds the instance label
        self.d = len(self.bag_space[0][0][0]) - 1
        self.C = len(np.unique(self.bag_lab))

        self.ins_space = np.zeros((self.n, self.d))
        self.ins_idx = np.zeros(self.N + 1, dtype=int)
        self.ins_lab = np.zeros(self.n)
        self.ins_bag_idx = np.zeros(self.n, dtype=int)
        for i in range(self.N):
            start = self.ins_idx[i]
            stop = start + self.bag_size[i]
            self.ins_idx[i + 1] = stop
            bag = self.bag_space[i][0]
            self.ins_space[start:stop] = bag[:, :self.d]
            self.ins_lab[start:stop] = bag[:, -1]
            self.ins_bag_idx[start:stop] = i

        self.data_name = self.data_path.strip().split("/")[-1].split(".")[0]
        self.zero_ratio = np.count_nonzero(self.ins_space == 0) / (self.n * self.d)
        self.__generate_save_home()

    def __generate_save_home(self):
        """
        Create the save directory if it does not exist yet.

        :return: None
        """
        os.makedirs(self.save_home, exist_ok=True)

    def __bag_lab_map(self):
        """
        Map the bag labels onto class indices 0, 1, 2, ...

        The mapping follows the sorted order of the original labels, so the
        smallest label always becomes 0 (e.g. -1 -> 0 and 1 -> 1). Iterating a
        ``set`` instead gives no ordering guarantee and can invert the classes.

        :return: None
        """
        _, class_idx = np.unique(self.bag_lab, return_inverse=True)
        self.bag_lab = class_idx.astype(int)

    def get_data_info(self):
        """
        Print a summary of the data set.

        Printed fields: data set name, number of bags and classes, the first
        few bag sizes and bag labels, maximum/minimum bag size, zero ratio,
        number of instances, instance dimensionality, and the first few
        entries of ins_idx, ins_lab and ins_bag_idx.
        """
        temp_idx = min(5, self.N)
        print("The {}'s information is:".format(self.data_name), "\n"
              "Number bags:", self.N, "\n"
              "Number classes:", self.C, "\n"
              "Bag size:", self.bag_size[:temp_idx], "...\n"
              "Bag label", self.bag_lab[:temp_idx], "...\n"
              "Maximum bag's size:", np.max(self.bag_size), "\n"
              "Minimum bag's size:", np.min(self.bag_size), "\n"
              "Zero ratio:", self.zero_ratio, "\n"
              "Number instances:", self.n, "\n"
              "Instance dimensions:", self.d, "\n"
              "Instance index:", self.ins_idx[: temp_idx], "...\n"
              "Instance label:", self.ins_lab[: temp_idx], "...\n"
              "Instance label corresponding bag'S index:", self.ins_bag_idx[:temp_idx], "...\n")

    def get_sub_ins_space(self, bag_idx):
        """
        Return the instances of the selected bags.

        :param bag_idx: iterable of bag indices
        :return: (instances, labels, bag indices); every instance carries the
                 (remapped) label and the index of the bag it belongs to
        """
        n = int(np.sum(self.bag_size[bag_idx]))
        ret_ins_space = np.zeros((n, self.d))
        ret_ins_label = np.zeros(n)
        ret_ins_bag_idx = np.zeros(n, dtype=int)
        count = 0
        for i in bag_idx:
            bag_size = self.bag_size[i]
            ret_ins_space[count: count + bag_size] = self.bag_space[i][0][:, :-1]
            ret_ins_label[count: count + bag_size] = self.bag_lab[i]
            ret_ins_bag_idx[count: count + bag_size] = i
            count += bag_size
        return ret_ins_space, ret_ins_label, ret_ins_bag_idx


if __name__ == '__main__':
    # Demo entry point: load one data set and print its summary.
    temp_file_name = r"ucsb_breast.mat"
    mil = MIL(temp_file_name)
    # The summary method defined on MIL is get_data_info(); get_info() does not exist.
    mil.get_data_info()
2.2 Distance.py

距离度量文件,提供了一些距离度量函数,包括:欧氏距离、平均豪斯多夫距离。用来测量包与包之间的相似度。

import os
import numpy as np
from Function import print_progress_bar
# Limit NumPy's printed precision to 6 digits; this only affects how arrays are displayed, not the values np.savez stores.
np.set_printoptions(precision=6)


def i2i_euclidean(ins1, ins2):
    """
    Euclidean distance between two vectors.

    :param ins1: first vector
    :param ins2: second vector
    :return: square root of the sum of squared element-wise differences
    """
    delta = ins1 - ins2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)


def ave_hausdorff(bag1, bag2):
    """
    Average Hausdorff distance between two bags.

    For every instance of each bag the Euclidean distance to its nearest
    instance in the other bag is taken; the result is the sum of all those
    nearest distances divided by the total number of instances.

    :param bag1: first bag, one instance per row
    :param bag2: second bag, one instance per row
    :return: the average Hausdorff distance (float)
    :raises ValueError: if either bag contains no instance
    """
    bag1 = np.atleast_2d(np.asarray(bag1, dtype=float))
    bag2 = np.atleast_2d(np.asarray(bag2, dtype=float))
    if bag1.size == 0 or bag2.size == 0:
        raise ValueError("ave_hausdorff requires two non-empty bags")
    # pairwise[i, j] is the Euclidean distance between bag1[i] and bag2[j]
    diff = bag1[:, np.newaxis, :] - bag2[np.newaxis, :, :]
    pairwise = np.sqrt(np.sum(diff ** 2, axis=2))
    # Row minima: nearest bag2 instance for every bag1 instance; column minima: the reverse
    sum_dis = pairwise.min(axis=1).sum() + pairwise.min(axis=0).sum()
    return sum_dis / (len(bag1) + len(bag2))


def simple_dis(bag1, bag2):
    """
    Distance between two bags measured between their mean instances.

    :param bag1: first bag, one instance per row
    :param bag2: second bag, one instance per row
    :return: Euclidean distance between the column-wise averages of the bags
    """
    centroid1 = np.average(bag1, 0)
    centroid2 = np.average(bag2, 0)
    return i2i_euclidean(centroid1, centroid2)


class B2B:
    """
    Bag-to-bag distance matrix of a data set, cached on disk as a .npz file.
    """

    def __init__(self, data_name, bags, b2b_type="ave", b2b_save_home="Data/Distance/"):
        """
        :param data_name: data set name, used in the cache file name
        :param bags: bag space; ``bags[i][0]`` holds the instances of bag i with
                     the label in the last column
        :param b2b_type: distance type, "ave" (average Hausdorff) or "sim" (simple)
        :param b2b_save_home: directory holding the cached distance matrices
        """
        self._data_name = data_name
        self._bags = bags
        self._b2b_type = b2b_type
        self._b2b_save_home = b2b_save_home
        self.__initialize__b2b()

    def __initialize__b2b(self):
        """
        Resolve the cache path and load or compute the distance matrix.

        :return: None
        """
        # Holds the distance matrix once computed or loaded
        self._dis = []
        # os.path.join keeps the path valid whether or not the directory
        # was given with a trailing separator
        file_name = "b2b_" + self._data_name + '_' + self._b2b_type + ".npz"
        self._save_b2b_path = os.path.join(self._b2b_save_home, file_name)
        self._b2b_name = {"ave": "ave_hausdorff",
                          "sim": "simple_dis"}
        self.__compute_dis()

    def __compute_dis(self):
        """
        Compute the distance matrix if no cache file exists, then load it.

        :return: None
        """
        if not os.path.exists(self._save_b2b_path):
            # Number of bags
            N = len(self._bags)
            dis = np.zeros((N, N))
            print("使用%s距离计算距离矩阵..." % self._b2b_name[self._b2b_type])
            dis_func = ave_hausdorff if self._b2b_type == "ave" else simple_dis
            # Drop the label column once per bag instead of once per pair
            instances = [bag[0][:, : -1] for bag in self._bags]
            for i in range(N):
                # Progress bar
                print_progress_bar(i, N)
                for j in range(i, N):
                    # The matrix is symmetric, so fill both halves at once
                    dis[i, j] = dis[j, i] = dis_func(instances[i], instances[j])
            print()
            # np.savez does not create missing directories
            save_dir = os.path.dirname(self._save_b2b_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            np.savez(self._save_b2b_path, dis=dis)
        # The context manager closes the archive after the array is read
        with np.load(self._save_b2b_path) as cached:
            self._dis = cached['dis']

    def get_dis(self):
        """
        Return the distance matrix.
        """
        return self._dis
2.3 Function.py

本文件提供一些工具函数,诸如:交叉验证、打印进度条、获取分类性能度量等。

import numpy as np
from scipy.io import loadmat


def load_file(data_path):
    """
    Load a multi-instance data set stored in a .mat file.

    :param data_path: path of the .mat file
    :return: the value stored under the 'data' key of the file
    """
    mat_contents = loadmat(data_path)
    return mat_contents['data']


def get_iter(tr, tr_lab, te, te_lab):
    """
    Wrap one train/test split in a generator.

    :param tr: training set
    :param tr_lab: training labels
    :param te: test set
    :param te_lab: test labels
    :return: generator that yields the single tuple (tr, tr_lab, te, te_lab)
    """
    split = (tr, tr_lab, te, te_lab)
    yield split


def get_k_cv_idx(num_x, k=10):
    """
    Build the index lists of a shuffled k-fold cross validation.

    The shuffled indices are split into k folds whose sizes differ by at most
    one, so every sample appears in exactly one test fold even when ``num_x``
    is not a multiple of ``k``.

    :param num_x: size of the data set
    :param k: number of folds (default 10)
    :return: (training index lists, test index lists), each a list of k lists
             of Python ints
    """
    # Shuffle the data set indices
    rand_idx = np.random.permutation(num_x)
    # array_split spreads the remainder over the first folds instead of dropping it
    folds = np.array_split(rand_idx, k)
    ret_tr_idx = []
    ret_te_idx = []
    for i, te_fold in enumerate(folds):
        # Training indices are all folds except the current one
        tr_folds = folds[:i] + folds[i + 1:]
        tr_idx = np.concatenate(tr_folds).tolist() if tr_folds else []
        ret_tr_idx.append(tr_idx)
        ret_te_idx.append(te_fold.tolist())
    return ret_tr_idx, ret_te_idx


def get_performance(type_performance):
    """
    Map performance measure names to scikit-learn metric functions.

    :param type_performance: iterable of metric names; "acc" and "accuracy"
                             select accuracy_score, every other name falls
                             back to f1_score
    :return: dict mapping every requested name to its metric function
    """
    ret_per = {}
    for type_per in type_performance:
        # "accuracy" is accepted as an alias so that it is not silently
        # scored with f1_score by the fallback branch
        if type_per in ("acc", "accuracy"):
            from sklearn.metrics import accuracy_score
            metric = accuracy_score
        else:
            # Any other name is treated as f1_score
            from sklearn.metrics import f1_score
            metric = f1_score
        ret_per[type_per] = metric

    return ret_per


def print_progress_bar(idx, size):
    """
    Redraw a one-line progress bar on standard output.

    :param idx: current position (0-based)
    :param size: total number of steps
    :return: None
    """
    # The bar is scaled to a width of 50 blocks
    filled_blocks = int(idx // (size / 50))
    percent_done = np.ceil((idx + 1) * 100 / size)
    line = '\r' + '▇' * filled_blocks + str(percent_done) + '%'
    print(line, end='')
2.4 ClassifyTool.py

本文件调用分类器,对多示例向量进行分类。

import warnings
warnings.filterwarnings("ignore")


class Classify:
    """
    Thin wrapper that sets up scikit-learn classifiers and metric functions
    for classifying the mapped multi-instance vectors.
    """

    def __init__(self, classifier_type=None, performance_type=None):
        """
        :param classifier_type: list of classifier names; "knn", "svm" and "j48"
                                are recognised, None selects ["knn"]
        :param performance_type: list of metric names; "f1_score", "acc"
                                 (alias "accuracy") and "roc" are recognised,
                                 None selects ["f1_score"]
        """
        self.__classifier_type = classifier_type
        self.__performance_type = performance_type
        self.tr_true_label_arr = {}
        self.tr_predict_arr = {}
        self.te_true_label_arr = {}
        self.te_predict_arr = {}
        self.tr_per = {}
        self.te_per = {}
        self.__init_classify()

    def __init_classify(self):
        """
        Instantiate the requested classifiers and collect the metric functions.

        Names that are not recognised are skipped.

        :return: None
        """
        self.__classifier = []
        self.__performance_er = []
        if self.__classifier_type is None:
            # Fall back to k-nearest neighbours
            self.__classifier_type = ["knn"]
        for classifier_type in self.__classifier_type:
            if classifier_type == "knn":
                from sklearn.neighbors import KNeighborsClassifier
                # Store an instance, like the other branches, not the class itself
                self.__classifier.append(KNeighborsClassifier())
            elif classifier_type == "svm":
                from sklearn.svm import SVC
                self.__classifier.append(SVC(max_iter=10000))
            elif classifier_type == "j48":
                from sklearn.tree import DecisionTreeClassifier
                self.__classifier.append(DecisionTreeClassifier())

        if self.__performance_type is None:
            self.__performance_type = ["f1_score"]
        for performance_type in self.__performance_type:
            # Store the metric functions themselves; calling them here without
            # y_true / y_pred raises TypeError
            if performance_type == "f1_score":
                from sklearn.metrics import f1_score
                self.__performance_er.append(f1_score)
            elif performance_type in ("acc", "accuracy"):
                from sklearn.metrics import accuracy_score
                self.__performance_er.append(accuracy_score)
            elif performance_type == "roc":
                from sklearn.metrics import roc_auc_score
                self.__performance_er.append(roc_auc_score)

    def __reset_record(self):
        """
        Reset the per-classifier prediction, label and performance records.

        :return: None
        """
        for classifier_type in self.__classifier_type:
            self.tr_predict_arr[classifier_type], self.tr_true_label_arr[classifier_type] = [], []
            self.tr_per[classifier_type] = []
            self.te_predict_arr[classifier_type], self.te_true_label_arr[classifier_type] = [], []
            self.te_per[classifier_type] = []
2.5 ELDB.py

ELDB算法主类,主要包括ELDB算法初始化、映射、记录得分等。

import numpy as np
import warnings
from ClassifyTool import Classify
from Distance import B2B
from Function import get_k_cv_idx, get_iter, get_performance
from MIL import MIL
from sklearn.metrics import euclidean_distances as eucl
warnings.filterwarnings('ignore')


def compute_discer(vectors, labels, eps=1e-3):
    """
    Score how well a set of vectors separates the two classes.

    The score is the distance between the class mean vectors divided by the
    sum of the mean pairwise distances inside each class.

    :param vectors: instance vectors, one per row
    :param labels: label of every vector; 1 marks positive, 0 marks negative,
                   any other value is ignored
    :param eps: lower bound applied to the denominator so that classes with
                no internal spread (e.g. one vector each) do not divide by zero
    :return: the separation score (float)
    :raises ValueError: if no vector is labelled 1 or no vector is labelled 0
    """
    positive_vectors = np.array([vec for vec, lab in zip(vectors, labels) if lab == 1])
    negative_vectors = np.array([vec for vec, lab in zip(vectors, labels) if lab == 0])
    if len(positive_vectors) == 0 or len(negative_vectors) == 0:
        raise ValueError("compute_discer needs at least one vector labelled 1 and one labelled 0")
    # Class mean vectors
    positive_mean = np.mean(positive_vectors, axis=0)
    negative_mean = np.mean(negative_vectors, axis=0)
    # Mean pairwise distance inside each class
    positive_dis = np.mean(eucl(positive_vectors), axis=None)
    negative_dis = np.mean(eucl(negative_vectors), axis=None)
    # Clamp the denominator (the guard was previously left as a comment)
    fenmu = max(positive_dis + negative_dis, eps)
    return eucl([positive_mean], [negative_mean])[0][0] / fenmu


class ELDB(MIL):
    """
    ELDB算法主类
    """

    def __init__(self, data_path, psi=0.9, alpha=0.75, batch=None, psi_max=200,
                 type_b2b="ave", mode_bag_init="g", mode_action="a", k=10,
                 type_classifier=None, type_performance=None, print_loop=False,
                 save_home="../Data/Distance/", bag_space=None):
        """
        Store the ELDB hyper-parameters after initialising the MIL base class.

        :param data_path: path of the data set
        :param psi: ratio used when sizing the initial dBagSet
        :param alpha: ratio used when splitting the training set into the base
                      part and the update part
        :param batch: batch size; None lets get_mapping derive it
        :param psi_max: upper bound used when sizing the initial dBagSet
        :param type_b2b: bag-to-bag distance type passed to B2B
        :param mode_bag_init: initialisation mode of the dBagSet ("g", "p" or "n")
        :param mode_action: action mode
        :param k: number of cross-validation folds
        :param type_classifier: classifier names; None selects the defaults
                                chosen in the ELDB initialisation routine
        :param type_performance: metric names; None selects the defaults
                                 chosen in the ELDB initialisation routine
        :param print_loop: whether to print the index of every fold
        :param save_home: directory for cached distance matrices
        :param bag_space: optional pre-loaded bag space
        """
        super().__init__(data_path, save_home=save_home, bag_space=bag_space)
        # dBagSet sizing
        self._psi, self._psi_max = psi, psi_max
        # Training-set split and update batches
        self._alpha, self._batch = alpha, batch
        # Behaviour switches
        self._type_b2b = type_b2b
        self._mode_bag_init, self._mode_action = mode_bag_init, mode_action
        # Evaluation set-up
        self._k = k
        self._type_classifier = type_classifier
        self._type_performance = type_performance
        self._print_loop = print_loop
        self.__init_eldb()

    def __init_eldb(self):
        """
        Fill in default settings and prepare the state used by ELDB.

        Sets the default classifier and metric names when none were given,
        obtains the bag-to-bag distance matrix from B2B and creates empty
        result records.

        :return: None
        """
        if self._type_classifier is None:
            self._type_classifier = ["knn", "svm", "j48"]
        if self._type_performance is None:
            # "acc" is the accuracy key the classification helpers test for;
            # "accuracy" is not matched by them
            self._type_performance = ["acc", "f1_score"]
        # Distance matrix
        self.dis = B2B(self.data_name, self.bag_space, self._type_b2b, self.save_home).get_dis()
        # Predictions per classifier / performance measure
        self.lab_predict = {}
        # True labels in cross-validation order
        self.lab_true = []
        # Classification performance values
        self.val_performance = {}

    def __reset_record(self):
        """
        重设记录相关的变量
        :return:
        """
        self.lab_predict = {}
        self.lab_true = []
        self.val_performance = {}

    def __get_classifier(self, type_classifier=None, type_performance=None):
        """
        Build the single-instance classifier wrapper.

        Both parameters are optional so that the method can be called with or
        without explicit names; omitted values fall back to the configured
        ``_type_classifier`` / ``_type_performance``.

        :param type_classifier: classifier names, or None for the configured ones
        :param type_performance: metric names, or None for the configured ones
        :return: a Classify instance
        """
        if type_classifier is None:
            type_classifier = self._type_classifier
        if type_performance is None:
            type_performance = self._type_performance
        return Classify(type_classifier, type_performance)

    def get_state(self):
        """
        获取使用的分类器以及度量性能
        :return:
        """
        return self._type_classifier, self._type_performance

    def get_mapping(self):
        """
        获取映射结果
        :return:
        """
        def __dBagSet_update_r(para_idx_dBagSet, para_score_dBagSet, idx_cur, score_cur):
            """
            Try to place one bag into the dBagSet, keeping the set size fixed.

            The scores are scanned from the front. Positions whose score is
            lower than ``score_cur`` are skipped; at the first position whose
            score is not lower, the new entry is written one slot further on
            (clamped to the last slot), the entries from that slot onwards
            move one place towards the end and the last entry is dropped.
            If ``score_cur`` is greater than every stored score, nothing changes.

            NOTE(review): the scan direction assumes a particular ordering of
            ``para_score_dBagSet`` -- confirm against the caller.

            :param para_idx_dBagSet: bag indices of the dBagSet (modified in place)
            :param para_score_dBagSet: scores of the dBagSet (modified in place)
            :param idx_cur: index of the candidate bag
            :param score_cur: score of the candidate bag
            :return: the (possibly updated) index and score containers
            """
            for idx_find in np.arange(len(para_idx_dBagSet)):
                if score_cur > para_score_dBagSet[idx_find]:  # candidate beats this entry: keep scanning
                    continue
                else:
                    idx_find += 1  # target the slot after the matched entry
                    idx_find = len(para_idx_dBagSet) - 1 if idx_find == len(para_idx_dBagSet) else idx_find  # clamp to the last slot
                    # Shift the tail by one place; the previous last entry falls off
                    para_idx_dBagSet[idx_find + 1:] = para_idx_dBagSet[idx_find: -1]
                    para_score_dBagSet[idx_find + 1:] = para_score_dBagSet[idx_find: -1]
                    para_idx_dBagSet[idx_find], para_score_dBagSet[idx_find] = idx_cur, score_cur
                    break
            return para_idx_dBagSet, para_score_dBagSet

        # 获取训练集和测试集的索引
        idxes_tr, idxes_te = get_k_cv_idx(self.N, self._k)

        # 正负包标签
        lab_positive, lab_negative = np.max(self.bag_lab), np.min(self.bag_lab)

        # 获取单实例分类器
        classifier = self.__get_classifier(self._type_classifier)

        # 性能度量器
        performance = get_performance(self._type_performance)

        # 记录参数重设
        self.__reset_record(self)

        #主循环
        discer_list = []

        for loop, (idx_tr, idx_te) in enumerate(zip(idxes_tr, idxes_te)):
            """步骤0:初始化操作"""
            # 进度条
            if self._print_loop:
                print("第{}折交叉验证".format(loop))

            # 计算训练集、基准数据集和更新数据集的大小
            N_T = len(idx_tr)
            N_Ts = int(N_T * (1 - self._alpha))

            # 计算批次大小
            batch = N_Ts // 2 if self._batch is None else self._batch

            # 计算最大更新次数
            n_l = N_Ts // batch
            N_Td = N_T - (n_l * batch)

            # 获取T_d和T_s的索引
            idx_td, idx_ts = np.array(idx_tr[:N_Td]), np.array(idx_tr[N_Td:])

            """步骤1:模型和参数初始化"""

            # 计算Delta矩阵
            matrix_Delta = np.zeros((N_Td, N_Td), dtype=int)
            for i in range(N_Td):
                for j in range(N_Td):
                    # 若标签相同,Delta设置为1,否则为0
                    if self.bag_lab[idx_td[i]] == self.bag_lab[idx_td[j]]:
                        # 对角矩阵, 对应位置设置为-1
                        matrix_Delta[i, j] = -1
                    else:
                        matrix_Delta[i, j] = 1

            # 计算Gamma矩阵
            matrix_Gamma = np.diag(np.sum(matrix_Delta, 1))

            # 计算L矩阵
            matrix_L = matrix_Gamma - matrix_Delta

            # 只保留L矩阵
            del matrix_Delta, matrix_Gamma

            # 基于整个T_d进行映射,先获取所有训练包的映射,后面再分配
            mapping_bag = self.dis[idx_tr, :][:, idx_td]

            # 使用矩阵乘法
            score_t = np.dot(np.dot(mapping_bag, matrix_L))

            # 对角元素是包的得分
            score_t = np.diag(score_t)

            # 获取T_d和T_s中每一包的得分
            score_td, score_ts = score_t[: N_Td], score_t[N_Td:]

            # 获取初始dBagSet的大小
            psi = int(min(self._psi_max, N_Td) * self._psi)

            # 由大到小的得分索引排序
            arg_score_td = np.argsort(score_td)[::-1]

            # 获取dBagSet在训练集中的真实索引和相应得分
            if self._mode_bag_init == 'g':
                idx_dBagSet = arg_score_td[:psi].tolist()
            else:
                idx_dBagSet = []
                count = 0
                for i in arg_score_td:
                    if count >= psi:
                        break
                    if (self._mode_bag_init == 'p' and self.bag_lab[idx_td[i]] == lab_positive) or \
                            (self._mode_bag_init == 'n' and self.bag_lab[idx_td[i]] != lab_positive):
                        idx_dBagSet.append(i)
                    count += 1
            score_dBagSet, idx_dBagSet = score_td[idx_dBagSet], [idx_td[idx_dBagSet].tolist()]
            del score_t, arg_score_td

            # 记录最小得分的索引和得分
            tau, p = len(idx_dBagSet[-1])
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值