1、算法
本文是论文《Multi-Instance Ensemble Learning With Discriminative Bags》中算法代码的复现。
2、代码
2.1、MIL.py
这一部分为多示例学习的原型文件,用于获取数据集名称、包空间、包大小等。
import warnings
import numpy as np
import os as os
from Function import load_file
warnings.filterwarnings("ignore")
class MIL:
    """
    Prototype class for multi-instance learning (MIL) data sets.

    It takes a bag space (either passed in directly or loaded from
    ``data_path``), and derives from it the bag sizes, bag labels, the
    flattened instance space and a few summary statistics.

    Each bag is expected to be a pair ``(instances, label)`` where
    ``instances`` is a 2-D matrix whose last column is the instance label.
    """

    def __init__(self, data_path, save_home="../Data/Distance/", bag_space=None):
        """
        :param data_path: Path of the data set file.
        :param save_home: Home directory used to store distance matrices.
        :param bag_space: Optional bag space, same layout as the .mat file;
                          when given, nothing is loaded from ``data_path``.
        """
        self.data_path = data_path
        self.save_home = save_home
        self.bag_space = bag_space
        self.__init_mil()

    def __init_mil(self):
        """
        Build every derived attribute from the bag space.

        Attributes set here:
            N:            number of bags
            bag_size:     int vector of length N with the size of each bag
            bag_lab:      bag label vector, remapped to 0 .. C - 1
            bag_idx:      bag index vector (0 .. N - 1)
            n:            total number of instances
            d:            instance dimensionality (label column excluded)
            C:            number of distinct bag labels
            ins_space:    all instances stacked into an (n, d) matrix
            ins_idx:      vector of length N + 1; bag i owns the rows
                          ins_idx[i] .. ins_idx[i + 1] - 1 of ins_space
            ins_lab:      instance labels (last column of every bag)
            ins_bag_idx:  for every instance, the index of its bag
            data_name:    data set name derived from data_path
            zero_ratio:   fraction of zero entries in ins_space
        """
        if self.bag_space is None:
            self.bag_space = load_file(self.data_path)
        self.N = len(self.bag_space)

        self.bag_size = np.zeros(self.N, dtype=int)
        self.bag_lab = np.zeros_like(self.bag_size, dtype=int)
        self.bag_idx = np.arange(self.N)
        for i in range(self.N):
            self.bag_size[i] = len(self.bag_space[i][0])
            # Labels loaded from .mat files arrive as 1x1 arrays; take the
            # scalar explicitly instead of relying on implicit conversion.
            self.bag_lab[i] = np.ravel(self.bag_space[i][1])[0]
        # Move all bag labels into the range [0, C - 1].
        self.__bag_lab_map()

        self.n = int(np.sum(self.bag_size))
        # The last column of a bag holds the instance label, hence the - 1.
        self.d = len(self.bag_space[0][0][0]) - 1
        self.C = len(set(self.bag_lab.tolist()))

        self.ins_space = np.zeros((self.n, self.d))
        self.ins_idx = np.zeros(self.N + 1, dtype=int)
        self.ins_lab = np.zeros(self.n)
        self.ins_bag_idx = np.zeros(self.n, dtype=int)
        for i in range(self.N):
            self.ins_idx[i + 1] = self.bag_size[i] + self.ins_idx[i]
            start, stop = self.ins_idx[i], self.ins_idx[i + 1]
            bag = np.asarray(self.bag_space[i][0])
            self.ins_space[start:stop] = bag[:, :self.d]
            self.ins_lab[start:stop] = bag[:, -1]
            self.ins_bag_idx[start:stop] = i

        self.data_name = self.data_path.strip().split("/")[-1].split(".")[0]
        total_entries = self.n * self.d
        # Guard against an empty instance space to avoid a division by zero.
        self.zero_ratio = (np.count_nonzero(self.ins_space == 0) / total_entries
                           if total_entries else 0.0)
        self.__generate_save_home()

    def __generate_save_home(self):
        """
        Create the save directory if it does not exist yet.
        """
        if not os.path.exists(self.save_home):
            os.makedirs(self.save_home)

    def __bag_lab_map(self):
        """
        Map the bag labels onto the class indices 0, 1, 2, ...

        The mapping follows the sorted order of the original labels, so the
        smallest label always becomes 0 and the largest becomes C - 1.
        """
        _, mapped = np.unique(self.bag_lab, return_inverse=True)
        self.bag_lab = np.asarray(mapped, dtype=int).reshape(-1)

    def get_data_info(self):
        """
        Print a summary of the data set.

        Vector-valued attributes are truncated to their first five entries.
        """
        temp_idx = min(5, self.N)
        print("The {}'s information is:".format(self.data_name), "\n"
              "Number bags:", self.N, "\n"
              "Number classes:", self.C, "\n"
              "Bag size:", self.bag_size[:temp_idx], "...\n"
              "Bag label", self.bag_lab[:temp_idx], "...\n"
              "Maximum bag's size:", np.max(self.bag_size), "\n"
              "Minimum bag's size:", np.min(self.bag_size), "\n"
              "Zero ratio:", self.zero_ratio, "\n"
              "Number instances:", self.n, "\n"
              "Instance dimensions:", self.d, "\n"
              "Instance index:", self.ins_idx[: temp_idx], "...\n"
              "Instance label:", self.ins_lab[: temp_idx], "...\n"
              "Instance label corresponding bag'S index:", self.ins_bag_idx[:temp_idx], "...\n")

    def get_sub_ins_space(self, bag_idx):
        """
        Return the part of the instance space covered by the given bags.

        :param bag_idx: Iterable of bag indices.
        :return: Tuple ``(instances, labels, bag_indices)``; every instance
                 is labelled with the (remapped) label of its bag.
        """
        n = int(np.sum(self.bag_size[bag_idx]))
        ret_ins_space = np.zeros((n, self.d))
        ret_ins_label = np.zeros(n)
        ret_ins_bag_idx = np.zeros(n, dtype=int)
        count = 0
        for i in bag_idx:
            bag_size = self.bag_size[i]
            ret_ins_space[count: count + bag_size] = np.asarray(self.bag_space[i][0])[:, :-1]
            ret_ins_label[count: count + bag_size] = self.bag_lab[i]
            ret_ins_bag_idx[count: count + bag_size] = i
            count += bag_size
        return ret_ins_space, ret_ins_label, ret_ins_bag_idx
if __name__ == '__main__':
    # Small manual check: load one data set and print its summary.
    temp_file_name = r"ucsb_breast.mat"
    mil = MIL(temp_file_name)
    # The summary method of MIL is get_data_info (there is no get_info).
    mil.get_data_info()
2.2 Distance.py
距离度量文件,提供了一些距离度量函数,包括:欧氏距离、平均豪斯多夫距离。用来测量包与包之间的相似度。
import os
import numpy as np
from Function import print_progress_bar
# 由于需要进行文件读取,所以这里进行了存储精度的控制
np.set_printoptions(precision=6)
def i2i_euclidean(ins1, ins2):
    """
    Euclidean distance between two instances.

    :param ins1: First vector.
    :param ins2: Second vector.
    :return: The Euclidean distance between the two vectors.
    """
    difference = ins1 - ins2
    squared_total = np.sum(difference ** 2)
    return np.sqrt(squared_total)
def ave_hausdorff(bag1, bag2):
    """
    Average Hausdorff distance, used to measure how similar two bags are.

    For every instance of one bag the Euclidean distance to its nearest
    instance in the other bag is taken; these nearest-neighbour distances
    are summed in both directions and divided by the total number of
    instances, so that ``ave_hausdorff(a, b) == ave_hausdorff(b, a)``.

    :param bag1: First bag, a 2-D array with one instance per row.
    :param bag2: Second bag, a 2-D array with one instance per row.
    :return: The average Hausdorff distance as a float.
    """
    bag1 = np.asarray(bag1, dtype=float)
    bag2 = np.asarray(bag2, dtype=float)
    # pairwise[i, j] is the Euclidean distance between bag1[i] and bag2[j].
    difference = bag1[:, np.newaxis, :] - bag2[np.newaxis, :, :]
    pairwise = np.sqrt(np.sum(difference ** 2, axis=2))
    # Nearest-neighbour distances in both directions.
    sum_dis = np.sum(np.min(pairwise, axis=1)) + np.sum(np.min(pairwise, axis=0))
    return float(sum_dis / (len(bag1) + len(bag2)))
def simple_dis(bag1, bag2):
    """
    Distance between two bags measured between their mean instances.

    :param bag1: First bag, one instance per row.
    :param bag2: Second bag, one instance per row.
    :return: Euclidean distance between the column-wise averages.
    """
    centre1 = np.average(bag1, 0)
    centre2 = np.average(bag2, 0)
    return i2i_euclidean(centre1, centre2)
class B2B:
    """
    Bag-to-bag distance matrix of a data set.

    The matrix is computed once, stored as an .npz file and loaded from
    that file on every later use.
    """

    def __init__(self, data_name, bags, b2b_type="ave", b2b_save_home="Data/Distance/"):
        """
        :param data_name: Name of the data set (part of the cache file name).
        :param bags: Bag space; bag i is ``bags[i][0]`` with the instance
                     label stored in the last column.
        :param b2b_type: Distance measure, "ave" (average Hausdorff) or
                         "sim" (distance between bag means).
        :param b2b_save_home: Directory in which distance matrices are stored.
        """
        self._data_name = data_name
        self._bags = bags
        self._b2b_type = b2b_type
        self._b2b_save_home = b2b_save_home
        self.__initialize__b2b()

    def __initialize__b2b(self):
        """
        Set up the cache path and load or compute the distance matrix.
        """
        # Holds the distance matrix once it is available.
        self._dis = []
        # Location of the cached distance matrix.
        self._save_b2b_path = self._b2b_save_home + "b2b_" + self._data_name + '_' + self._b2b_type + ".npz"
        self._b2b_name = {"ave": "ave_hausdorff",
                          "sim": "simple_dis"}
        self.__compute_dis()

    def __compute_dis(self):
        """
        Compute and cache the distance matrix unless a cache file exists,
        then load the matrix from the cache file.
        """
        if not os.path.exists(self._save_b2b_path):
            # Number of bags.
            N = len(self._bags)
            dis = np.zeros((N, N))
            print("使用%s距离计算距离矩阵..." % self._b2b_name[self._b2b_type])
            if self._b2b_type == "ave":
                dis_func = ave_hausdorff
            else:
                dis_func = simple_dis
            # Strip the instance-label column once instead of once per pair.
            features = [self._bags[i][0][:, : -1] for i in range(N)]
            for i in range(N):
                # Show the progress bar.
                print_progress_bar(i, N)
                for j in range(i, N):
                    dis[i, j] = dis[j, i] = dis_func(features[i], features[j])
            print()
            # np.savez does not create missing directories, so do it here.
            save_dir = os.path.dirname(self._save_b2b_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            np.savez(self._save_b2b_path, dis=dis)
        self._dis = np.load(self._save_b2b_path)['dis']

    def get_dis(self):
        """
        Return the bag-to-bag distance matrix.
        """
        return self._dis
2.3 Function.py
本文件提供一些工具函数,诸如:交叉验证、打印进度条、获取分类性能度量等。
import numpy as np
from scipy.io import loadmat
def load_file(data_path):
    """
    Load a multi-instance data set stored in a .mat file.

    :param data_path: Path of the .mat file.
    :return: The content stored under the key 'data'.
    """
    mat_contents = loadmat(data_path)
    return mat_contents['data']
def get_iter(tr, tr_lab, te, te_lab):
    """
    Wrap one train/test split into a generator.

    :param tr: Training set.
    :param tr_lab: Training labels.
    :param te: Test set.
    :param te_lab: Test labels.
    :return: Generator that yields the 4-tuple
             ``(tr, tr_lab, te, te_lab)`` exactly once.
    """
    split = (tr, tr_lab, te, te_lab)
    yield split
def get_k_cv_idx(num_x, k=10):
    """
    Build the index lists of a k-fold cross validation.

    The indices 0 .. num_x - 1 are shuffled and cut into k consecutive
    folds. When num_x is not a multiple of k, the first ``num_x % k`` folds
    receive one extra sample, so every sample lands in exactly one test fold.

    :param num_x: Size of the data set.
    :param k: Number of folds.
    :return: ``(train_indices, test_indices)``, two lists with k lists of
             plain Python ints each.
    """
    # Shuffle the data set indices.
    rand_idx = np.random.permutation(num_x)
    # Base fold size and number of folds that take one extra sample.
    fold, extra = divmod(num_x, k)
    ret_tr_idx = []
    ret_te_idx = []
    start = 0
    for i in range(k):
        stop = start + fold + (1 if i < extra else 0)
        # Everything outside the current fold is used for training.
        ret_tr_idx.append(np.concatenate((rand_idx[:start], rand_idx[stop:])).tolist())
        # The current fold is used for testing.
        ret_te_idx.append(rand_idx[start:stop].tolist())
        start = stop
    return ret_tr_idx, ret_te_idx
def get_performance(type_performance):
    """
    Look up the metric functions for the requested performance measures.

    :param type_performance: Iterable of metric names. "acc" and
                             "accuracy" select accuracy_score; any other
                             name selects f1_score.
    :return: Dict mapping every requested name to its sklearn metric function.
    """
    ret_per = {}
    for type_per in type_performance:
        # "accuracy" is accepted as an alias so that it is not silently
        # scored with f1_score.
        if type_per in ("acc", "accuracy"):
            from sklearn.metrics import accuracy_score
            metric = accuracy_score
        else:
            # Everything else is scored with f1_score.
            from sklearn.metrics import f1_score
            metric = f1_score
        ret_per[type_per] = metric
    return ret_per
def print_progress_bar(idx, size):
    """
    Print a progress bar on the current line.

    The bar is drawn with up to 50 block characters, followed by the
    rounded-up percentage of completed steps. The line starts with a
    carriage return and has no trailing newline, so successive calls
    overwrite each other.

    :param idx: Current position (0-based).
    :param size: Total number of steps.
    """
    steps_per_block = size / 50
    filled_blocks = int(idx // steps_per_block)
    percentage = np.ceil((idx + 1) * 100 / size)
    line = '\r' + '▇' * filled_blocks + str(percentage) + '%'
    print(line, end='')
2.4 ClassifyTool.py
本文件调用分类器,对多示例向量进行分类。
import warnings
warnings.filterwarnings("ignore")
class Classify:
    """
    Wrapper around sklearn classifiers, used to classify the vectors that
    multi-instance bags were mapped to.
    """

    def __init__(self, classifier_type=None, performance_type=None):
        """
        :param classifier_type: List of classifier names; supported names
                                are "knn", "svm" and "j48". None selects ["knn"].
        :param performance_type: List of metric names; supported names are
                                 "f1_score", "acc" (alias "accuracy") and
                                 "roc". None selects ["f1_score"].
        """
        self.__classifier_type = classifier_type
        self.__performance_type = performance_type
        # Records are keyed by classifier name, see __reset_record.
        self.tr_true_label_arr = {}
        self.tr_predict_arr = {}
        self.te_true_label_arr = {}
        self.te_predict_arr = {}
        self.tr_per = {}
        self.te_per = {}
        self.__init_classify()

    def __init_classify(self):
        """
        Create the classifier instances and collect the metric functions.

        Unknown classifier or metric names are skipped.
        """
        self.__classifier = []
        self.__performance_er = []
        if self.__classifier_type is None:
            # Fall back to k-nearest neighbours.
            self.__classifier_type = ["knn"]
        for classifier_type in self.__classifier_type:
            if classifier_type == "knn":
                from sklearn.neighbors import KNeighborsClassifier
                # Store an instance, like the other classifiers, not the class.
                self.__classifier.append(KNeighborsClassifier())
            elif classifier_type == "svm":
                from sklearn.svm import SVC
                self.__classifier.append(SVC(max_iter=10000))
            elif classifier_type == "j48":
                from sklearn.tree import DecisionTreeClassifier
                self.__classifier.append(DecisionTreeClassifier())
        if self.__performance_type is None:
            self.__performance_type = ["f1_score"]
        for performance_type in self.__performance_type:
            # The metric functions themselves are stored; calling them here
            # without y_true / y_pred would raise a TypeError.
            if performance_type == "f1_score":
                from sklearn.metrics import f1_score
                self.__performance_er.append(f1_score)
            elif performance_type in ("acc", "accuracy"):
                from sklearn.metrics import accuracy_score
                self.__performance_er.append(accuracy_score)
            elif performance_type == "roc":
                from sklearn.metrics import roc_auc_score
                self.__performance_er.append(roc_auc_score)

    def __reset_record(self):
        """
        Reset the prediction, true-label and performance records of every
        configured classifier to empty lists.
        """
        for classifier_type in self.__classifier_type:
            self.tr_predict_arr[classifier_type], self.tr_true_label_arr[classifier_type] = [], []
            self.tr_per[classifier_type] = []
            self.te_predict_arr[classifier_type], self.te_true_label_arr[classifier_type] = [], []
            self.te_per[classifier_type] = []
2.5 ELDB.py
ELDB算法主类,主要包括ELDB算法初始化、映射、记录得分等。
import numpy as np
import warnings
from ClassifyTool import Classify
from Distance import B2B
from Function import get_k_cv_idx, get_iter, get_performance
from MIL import MIL
from sklearn.metrics import euclidean_distances as eucl
warnings.filterwarnings('ignore')
def compute_discer(vectors, labels):
    """
    Ratio of between-class distance to within-class spread.

    Vectors labelled 1 form the positive group and vectors labelled 0 the
    negative group; vectors with any other label are ignored. The result is
    the distance between the two group means divided by the sum of the mean
    pairwise distances inside each group.

    :param vectors: Instance vectors.
    :param labels: Label of every vector.
    :return: The ratio described above.
    """
    pos_group = np.array([vec for i, vec in enumerate(vectors) if labels[i] == 1])
    neg_group = np.array([vec for i, vec in enumerate(vectors) if labels[i] != 1 and labels[i] == 0])

    # Mean vector of each group.
    pos_centre = np.mean(pos_group, axis=0)
    neg_centre = np.mean(neg_group, axis=0)

    # Mean of all pairwise distances inside each group.
    pos_spread = np.mean(eucl(pos_group), axis=None)
    neg_spread = np.mean(eucl(neg_group), axis=None)

    # NOTE: a lower bound of 1e-3 on the denominator was tried in the
    # original code but is disabled.
    denominator = pos_spread + neg_spread
    centre_gap = eucl([pos_centre], [neg_centre])[0][0]
    return centre_gap / denominator
class ELDB(MIL):
"""
ELDB算法主类
"""
def __init__(self, data_path, psi=0.9, alpha=0.75, batch=None, psi_max=200,
             type_b2b="ave", mode_bag_init="g", mode_action="a", k=10,
             type_classifier=None, type_performance=None, print_loop=False,
             save_home="../Data/Distance/", bag_space=None):
    """
    Store the ELDB hyper-parameters and run both initialisers.

    :param data_path: Path of the data set.
    :param psi: Fraction applied to min(psi_max, size of T_d) to obtain
                the size of the initial dBagSet.
    :param alpha: Split ratio of the training set; a share of
                  (1 - alpha) of the training bags is reserved for T_s.
    :param batch: Batch size; None means half of T_s.
    :param psi_max: Upper bound used when sizing the initial dBagSet.
    :param type_b2b: Type of the bag-to-bag distance function.
    :param mode_bag_init: Initialisation mode of the dBagSet: 'g' takes
                          the best-scoring bags, 'p' only those carrying
                          the largest label, 'n' only the remaining ones.
    :param mode_action: Action mode (not used in the visible part of the
                        class; TODO confirm its accepted values).
    :param k: Number of cross-validation folds.
    :param type_classifier: Classifiers applied to the mapped vectors;
                            None selects the defaults set in __init_eldb.
    :param type_performance: Performance measures; None selects the
                             defaults set in __init_eldb.
    :param print_loop: Whether to print the number of each fold.
    :param save_home: Directory in which distance matrices are stored.
    :param bag_space: Optional bag space passed on to the MIL base class.
    """
    super(ELDB, self).__init__(data_path, save_home=save_home, bag_space=bag_space)
    # dBagSet sizing and training-set split.
    self._psi, self._psi_max = psi, psi_max
    self._alpha, self._batch = alpha, batch
    # Algorithm variants.
    self._type_b2b = type_b2b
    self._mode_bag_init, self._mode_action = mode_bag_init, mode_action
    # Evaluation set-up.
    self._k = k
    self._type_classifier, self._type_performance = type_classifier, type_performance
    self._print_loop = print_loop
    self.__init_eldb()
def __init_eldb(self):
    """
    Initialise the ELDB-specific state.

    Fills in the default classifier and metric lists, obtains the
    bag-to-bag distance matrix and creates empty result records.
    """
    if self._type_classifier is None:
        self._type_classifier = ["knn", "svm", "j48"]
    if self._type_performance is None:
        # "acc" is the key understood by get_performance and Classify.
        self._type_performance = ["acc", "f1_score"]
    # Distance matrix between all bags.
    self.dis = B2B(self.data_name, self.bag_space, self._type_b2b, self.save_home).get_dis()
    # Predictions per classifier and performance measure.
    self.lab_predict = {}
    # True labels in cross-validation order.
    self.lab_true = []
    # Classification performance values.
    self.val_performance = {}
def __reset_record(self):
    """
    Clear the recorded predictions, true labels and performance values.
    """
    self.lab_predict, self.lab_true, self.val_performance = {}, [], {}
def __get_classifier(self, type_classifier=None):
    """
    Create the classifier wrapper.

    :param type_classifier: Optional list of classifier names; when
                            omitted, the list configured on this object
                            is used. Accepting it keeps callers that pass
                            the list explicitly working.
    :return: A Classify object built from the classifier list and the
             configured performance measures.
    """
    if type_classifier is None:
        type_classifier = self._type_classifier
    return Classify(type_classifier, self._type_performance)
def get_state(self):
    """
    Report the configured classifiers and performance measures.

    :return: Tuple ``(type_classifier, type_performance)``.
    """
    state = (self._type_classifier, self._type_performance)
    return state
def get_mapping(self):
"""
获取映射结果
:return:
"""
def __dBagSet_update_r(para_idx_dBagSet, para_score_dBagSet, idx_cur, score_cur):
"""
:param para_idx_dBagSet:包集合的索引
:param para_score_dBagSet: 包集合所得分数拷贝
:param idx_cur: 第i个包
:param score_cur: 包的得分
:return:
"""
for idx_find in np.arange(len(para_idx_dBagSet)):
if score_cur > para_score_dBagSet[idx_find]:#若有包的分数大于包集合中某个包的分数
continue
else:
idx_find += 1#向前推进
idx_find = len(para_idx_dBagSet) - 1 if idx_find == len(para_idx_dBagSet) else idx_find#如果超出界限
para_idx_dBagSet[idx_find + 1:] = para_idx_dBagSet[idx_find: -1]
para_score_dBagSet[idx_find + 1:] = para_score_dBagSet[idx_find: -1]
para_idx_dBagSet[idx_find], para_score_dBagSet[idx_find] = idx_cur, score_cur
break
return para_idx_dBagSet, para_score_dBagSet
# 获取训练集和测试集的索引
idxes_tr, idxes_te = get_k_cv_idx(self.N, self._k)
# 正负包标签
lab_positive, lab_negative = np.max(self.bag_lab), np.min(self.bag_lab)
# 获取单实例分类器
classifier = self.__get_classifier(self._type_classifier)
# 性能度量器
performance = get_performance(self._type_performance)
# 记录参数重设
self.__reset_record(self)
#主循环
discer_list = []
for loop, (idx_tr, idx_te) in enumerate(zip(idxes_tr, idxes_te)):
"""步骤0:初始化操作"""
# 进度条
if self._print_loop:
print("第{}折交叉验证".format(loop))
# 计算训练集、基准数据集和更新数据集的大小
N_T = len(idx_tr)
N_Ts = int(N_T * (1 - self._alpha))
# 计算批次大小
batch = N_Ts // 2 if self._batch is None else self._batch
# 计算最大更新次数
n_l = N_Ts // batch
N_Td = N_T - (n_l * batch)
# 获取T_d和T_s的索引
idx_td, idx_ts = np.array(idx_tr[:N_Td]), np.array(idx_tr[N_Td:])
"""步骤1:模型和参数初始化"""
# 计算Delta矩阵
matrix_Delta = np.zeros((N_Td, N_Td), dtype=int)
for i in range(N_Td):
for j in range(N_Td):
# 若标签相同,Delta设置为1,否则为0
if self.bag_lab[idx_td[i]] == self.bag_lab[idx_td[j]]:
# 对角矩阵, 对应位置设置为-1
matrix_Delta[i, j] = -1
else:
matrix_Delta[i, j] = 1
# 计算Gamma矩阵
matrix_Gamma = np.diag(np.sum(matrix_Delta, 1))
# 计算L矩阵
matrix_L = matrix_Gamma - matrix_Delta
# 只保留L矩阵
del matrix_Delta, matrix_Gamma
# 基于整个T_d进行映射,先获取所有训练包的映射,后面再分配
mapping_bag = self.dis[idx_tr, :][:, idx_td]
# 使用矩阵乘法
score_t = np.dot(np.dot(mapping_bag, matrix_L))
# 对角元素是包的得分
score_t = np.diag(score_t)
# 获取T_d和T_s中每一包的得分
score_td, score_ts = score_t[: N_Td], score_t[N_Td:]
# 获取初始dBagSet的大小
psi = int(min(self._psi_max, N_Td) * self._psi)
# 由大到小的得分索引排序
arg_score_td = np.argsort(score_td)[::-1]
# 获取dBagSet在训练集中的真实索引和相应得分
if self._mode_bag_init == 'g':
idx_dBagSet = arg_score_td[:psi].tolist()
else:
idx_dBagSet = []
count = 0
for i in arg_score_td:
if count >= psi:
break
if (self._mode_bag_init == 'p' and self.bag_lab[idx_td[i]] == lab_positive) or \
(self._mode_bag_init == 'n' and self.bag_lab[idx_td[i]] != lab_positive):
idx_dBagSet.append(i)
count += 1
score_dBagSet, idx_dBagSet = score_td[idx_dBagSet], [idx_td[idx_dBagSet].tolist()]
del score_t, arg_score_td
# 记录最小得分的索引和得分
tau, p = len(idx_dBagSet[-1])