【算法实现】:ELDB算法实现
1.ELDB主要代码
"""
-*- coding:utf-8 -*-
@FileName: ELDB.py
@Author: zjy
@DateTime: 2022/8/13 17:38
@Description:ELDB算法实现
@IDE:PyCharm
"""
import numpy as np
import warnings
from MILFrame.MIL import MIL
from MILFrame.Classifier import Classifier
from MILFrame import Utils, B2B
from sklearn.metrics import euclidean_distances
from sklearn.cluster import MiniBatchKMeans
warnings.filterwarnings('ignore')
class ELDB(MIL):
    """ELDB algorithm implementation.

    Builds a discriminative bag set (dBagSet) from a basic training subset,
    incrementally updates it batch by batch, maps bags onto each dBagSet
    snapshot via a bag-to-bag distance (or an RBF kernel for the "msk" mode),
    and ensembles the per-snapshot classifier predictions with performance
    weights under k-fold cross validation.
    """

    def __init__(self, data_path, psi=0.9, alpha=0.75, batch=None, psi_max=200,
                 type_b2b="ave", mode_bag_init="g", mode_action="a", k=10,
                 type_classifier=None, type_performance=None, print_loop=False,
                 save_home="../data/Distance/", bag_space=None):
        """
        Constructor.
        :param data_path: path of the stored data set
        :param psi: proportion of the basic dBagSet
        :param alpha: split ratio between basic and update data
        :param psi_max: maximum size of the basic dBagSet
        :param batch: batch size for the incremental update
                      (default: half of the training set)
        :param type_b2b: bag-to-bag distance type
        :param mode_bag_init: dBagSet initialisation mode — "g" (global),
                              "p" (positive only) or "n" (negative only)
        :param mode_action: dBagSet update mode — "a" (add) or "r" (replace)
        :param k: number of cross-validation folds
        :param type_classifier: base classifier types
        :param type_performance: performance metric names
        :param print_loop: whether to print progress for each CV fold
        :param save_home: directory where distance matrices are cached
        :param bag_space: pre-loaded bag space
        """
        super(ELDB, self).__init__(data_path, distance_save_home=save_home, bag_space=bag_space)
        self._psi = psi
        self._psi_max = psi_max
        self._alpha = alpha
        self._batch = batch
        self._type_b2b = type_b2b
        self._mode_bag_init = mode_bag_init
        self._mode_action = mode_action
        self._k = k
        self._type_classifier = type_classifier
        self._type_performance = type_performance
        self._print_loop = print_loop
        self.__init_ELDB()

    def __init_ELDB(self):
        """Initialise default classifiers, metrics and the result records."""
        self._type_classifier = ["knn", "svm", "j48"] if self._type_classifier is None else self._type_classifier
        self._type_performance = ["accuracy", "f1_score"] if self._type_performance is None else self._type_performance
        self.label_predict = {}    # predicted labels per "<classifier> <metric>" key
        self.label_true = []       # ground-truth labels accumulated over the folds
        self.val_performance = {}  # final performance per "<classifier> <metric>" key

    def __reset_record(self):
        """Clear the prediction/performance records before a new run."""
        self.label_predict = {}
        self.label_true = []
        self.val_performance = {}

    def __get_classifier(self):
        """Return a Classifier wrapping the configured types and metrics."""
        return Classifier(self._type_classifier, self._type_performance)

    def __kernel(self, VECTOR):
        """Build the symmetric kernel matrix over all bags.

        Each entry averages the RBF kernels of the positive-part and the
        negative-part mapping vectors of the two bags.
        """
        KERNEL = np.zeros((self.num_bag, self.num_bag))
        for i in range(self.num_bag):
            # symmetric matrix: compute each pair once, fill both triangles
            for j in range(i, self.num_bag):
                KERNEL[i][j] = KERNEL[j][i] = (0.5 * Utils.kernel_rbf(VECTOR[i][0], VECTOR[j][0], gamma=1)
                                               + 0.5 * Utils.kernel_rbf(VECTOR[i][1], VECTOR[j][1], gamma=1))
        return KERNEL

    def __mapping(self, positive_instance, negative_instance):
        """Map each bag to its mean distances to the representative instances.

        :param positive_instance: positive representative instances
        :param negative_instance: negative representative instances
        :return: per-bag list of [positive-part, negative-part] row vectors
        """
        VECTOR = []
        positive_shape, negative_shape = positive_instance.shape[0], negative_instance.shape[0]
        for i in range(self.num_bag):
            # drop the trailing per-instance label column
            bag = self.bag_space[i][0][:, :-1]
            VECTOR.append([euclidean_distances(bag, positive_instance).mean(0).reshape(1, positive_shape),
                           euclidean_distances(bag, negative_instance).mean(0).reshape(1, negative_shape)])
        return VECTOR

    def get_state(self):
        """Return the configured classifier types and performance metrics."""
        return self._type_classifier, self._type_performance

    def get_mapping(self):
        """Main ELDB procedure under k-fold cross validation.

        :return: dict of performance values keyed by "<classifier> <metric>"
        """
        def __dBagSet_update_r(para_idx_dBagSet, para_score_dBagSet, idx_cur, score_cur):
            """Replace-mode ("r") update of the dBagSet.

            Inserts the candidate bag keeping scores in descending order and
            the dBagSet size fixed: the worst bag is dropped.
            :param para_idx_dBagSet: current dBagSet indices (descending score)
            :param para_score_dBagSet: matching scores
            :param idx_cur: candidate bag index
            :param score_cur: candidate bag score
            :return: the updated (indices, scores) pair
            """
            # FIX: the original never inserted a candidate that outranked every
            # stored bag (its loop ended without a break); default pos = 0 covers it.
            pos = 0
            for idx_find in range(len(para_idx_dBagSet) - 1, -1, -1):
                if score_cur > para_score_dBagSet[idx_find]:
                    continue
                # insert just after the first element (from the right) that
                # outranks the candidate, clamped inside the fixed-size list
                pos = min(idx_find + 1, len(para_idx_dBagSet) - 1)
                break
            # shift the tail one slot right (dropping the last element) and insert
            para_idx_dBagSet[pos + 1:] = para_idx_dBagSet[pos:-1]
            para_score_dBagSet[pos + 1:] = para_score_dBagSet[pos:-1]
            para_idx_dBagSet[pos], para_score_dBagSet[pos] = idx_cur, score_cur
            return para_idx_dBagSet, para_score_dBagSet

        # k-fold CV train/test indices
        idxes_train, idxes_test = Utils.get_k_cv_idx(self.num_bag, self._k)
        # positive / negative bag label values
        label_positive, label_negative = np.max(self.bag_labels), np.min(self.bag_labels)
        classifier = self.__get_classifier()
        performance = Utils.get_performance(self._type_performance)
        self.__reset_record()
        # bag-to-bag distance matrix; the "msk" mode builds a kernel lazily below
        if self._type_b2b != "msk":
            dis = B2B.B2B(self.data_name, self.bag_space, self._type_b2b,
                          b2b_save_home=self.distance_save_home, min_max_vector=self.get_min_max()).get_dis()
        else:
            dis = None
        # main cross-validation loop
        for loop, (temp_idx_train, temp_idx_test) in enumerate(zip(idxes_train, idxes_test)):
            '''step 0. initialisation'''
            if self._print_loop:
                print("The {}-th CV ... ".format(loop))
            if dis is None:
                # "msk" mode: build the RBF kernel from representative instances.
                # NOTE(review): the kernel is built from the FIRST fold's training
                # bags only and reused for every later fold — confirm intended.
                train_label = self.bag_labels[temp_idx_train]
                positive_idx = np.where(train_label == label_positive)[0]
                negative_idx = np.where(train_label == label_negative)[0]
                '''most negative representative instances'''
                # all instances of the negative training bags, summarised by
                # MiniBatchKMeans cluster centres
                negative_instances, _, _ = self.get_sub_instance_space(negative_idx)
                k_means = MiniBatchKMeans(n_clusters=10)
                k_means.fit(negative_instances)
                negative_instances = k_means.cluster_centers_
                '''most positive representative instances'''
                positive_instances = []
                for j in positive_idx:
                    bag = self.bag_space[j][0][:, :-1]
                    # per positive bag: the instance farthest from the negatives
                    idx = euclidean_distances(bag, negative_instances).sum(1).argmax()
                    positive_instances.append(bag[idx].tolist())
                positive_instances = np.array(positive_instances)
                # keep the 10 mutually most distant positive representatives
                # (FIX: the original clobbered `dis` with this temporary)
                inner_dis = euclidean_distances(positive_instances).sum(0)
                max_distance_idx = np.argsort(inner_dis)[::-1]
                positive_instances = positive_instances[max_distance_idx[:10]]
                # map every bag onto the representatives, then kernelise
                VECTOR = self.__mapping(positive_instances, negative_instances)
                dis = self.__kernel(VECTOR)
            # sizes of the basic set and the update batches
            num_trainset = len(temp_idx_train)
            # NOTE(review): computed but never used in the original; the
            # basic/update split below ignores alpha — confirm intended.
            num_update_set = int(num_trainset * (1 - self._alpha))
            batch = num_trainset // 2 if self._batch is None else self._batch
            num_max_update_times = num_trainset // batch
            num_basic_set = num_trainset - (num_max_update_times * batch)
            # basic-set and update-set bag indices
            idx_basic_set, idx_update_set = np.array(temp_idx_train[:num_basic_set]), np.array(
                temp_idx_train[num_basic_set:])
            """step 1. model parameters"""
            # \Delta matrix: +1 for same-label pairs, -1 otherwise
            matrix_Delta = np.zeros((num_basic_set, num_basic_set), dtype=int)
            for i in range(num_basic_set):
                for j in range(num_basic_set):
                    if self.bag_labels[idx_basic_set[i]] == self.bag_labels[idx_basic_set[j]]:
                        matrix_Delta[i][j] = 1
                    else:
                        matrix_Delta[i][j] = -1
            # \Gamma matrix and discriminative (Laplacian-style) matrix L
            matrix_Gamma = np.diag(np.sum(matrix_Delta, 1))
            matrix_L = matrix_Gamma - matrix_Delta
            del matrix_Gamma, matrix_Delta
            # mapping of every training bag onto the basic set
            mapping_bag = dis[temp_idx_train, :][:, idx_basic_set]
            # score (priority) of each training bag: diag(M L M^T)
            score_train = np.diag(np.dot(np.dot(mapping_bag, matrix_L), np.transpose(mapping_bag)))
            score_basic_set, score_update_set = score_train[:num_basic_set], score_train[num_basic_set:]
            # initial dBagSet size.
            # FIX: the original computed int(min(psi, n) * psi) == int(psi^2) == 0
            # for any fractional psi; the intended size is psi * n capped at psi_max.
            psi = int(min(num_basic_set * self._psi, self._psi_max))
            # basic-set indices in descending score order
            arg_score_basic = np.argsort(score_basic_set)[::-1]
            # initialise the dBagSet
            if self._mode_bag_init == 'g':
                # global: top-psi bags regardless of label
                idx_dBagSet = arg_score_basic[:psi].tolist()
            else:
                # label-filtered: top-psi positive ("p") or negative ("n") bags
                # (FIX: the original had a misplaced bracket in the "p" branch)
                idx_dBagSet = []
                count = 0
                for i in arg_score_basic:
                    if count >= psi:
                        break
                    if (self._mode_bag_init == 'p' and self.bag_labels[idx_basic_set[i]] == label_positive) or \
                            (self._mode_bag_init == 'n' and self.bag_labels[idx_basic_set[i]] != label_positive):
                        idx_dBagSet.append(i)
                        count += 1
            score_dBagSet, idx_dBagSet = score_basic_set[idx_dBagSet], [idx_basic_set[idx_dBagSet].tolist()]
            del score_basic_set, arg_score_basic
            # admission threshold: the lowest score currently in the dBagSet
            p = score_dBagSet[-1]
            # incremental dBagSet update, one batch at a time
            for i in range(num_max_update_times):
                # FIX: `.copy` was missing its call parentheses in the original
                idx_dBagSet_update = idx_dBagSet[-1].copy()
                score_dBagSet_update = score_dBagSet.copy()
                for j in range(batch):
                    idx_temp = i * batch + j
                    if score_update_set[idx_temp] <= p:
                        continue  # candidate no better than the current worst
                    if self._mode_action == 'a':
                        # add mode: grow the dBagSet (FIX: ".appand" typo)
                        idx_dBagSet_update.append(idx_update_set[idx_temp])
                    else:
                        # replace mode (FIX: result was assigned to score_dBagSet,
                        # clobbering the base scores, in the original)
                        idx_dBagSet_update, score_dBagSet_update = __dBagSet_update_r(
                            idx_dBagSet_update, score_dBagSet_update,
                            idx_update_set[idx_temp], score_update_set[idx_temp])
                # keep only snapshots that actually changed
                if idx_dBagSet_update != idx_dBagSet[-1]:
                    idx_dBagSet.append(idx_dBagSet_update)
            # FIX: the original del'd score_dBagSet inside this loop, raising
            # NameError on the second iteration; release only after the loop.
            del score_dBagSet, mapping_bag
            """step 2. weighted ensemble"""
            label_basic_set, label_update_set = self.bag_labels[idx_basic_set], self.bag_labels[idx_update_set]
            label_train, label_test = self.bag_labels[temp_idx_train], self.bag_labels[temp_idx_test]
            Predict, Weight = [], []
            for dBagSet in idx_dBagSet:
                # mappings of the basic / update bags onto this dBagSet snapshot
                # (FIX: the original sliced `dis[idx_update_set:][:dBagSet]`
                # instead of fancy-indexing rows and columns)
                mapping_basic = dis[idx_basic_set, :][:, dBagSet]
                mapping_update = dis[idx_update_set, :][:, dBagSet]
                data_iter = Utils.get_iter(mapping_basic, label_basic_set, mapping_update, label_update_set)
                # snapshot weight = validation performance of its classifier
                Weight.append(classifier.test(data_iter))
                # retrain on the full training set and predict the test set.
                # FIX: the original unpacked one vstack result into two names and
                # never built the test mapping.
                mapping_train = np.vstack([mapping_basic, mapping_update])
                mapping_test = dis[temp_idx_test, :][:, dBagSet]
                del mapping_basic, mapping_update
                data_iter = Utils.get_iter(mapping_train, label_train, mapping_test, label_test)
                classifier.test(data_iter)
                Predict.append(classifier.test_predicts)
                # FIX: label_train / label_test were del'd here in the original,
                # breaking the next snapshot iteration
                del data_iter, mapping_test, mapping_train
            '''ensemble: weighted vote over all snapshots'''
            test_predict_all = {}
            for predict, weight in zip(Predict, Weight):
                for classifier_name in self._type_classifier:
                    weight_classifier = weight[classifier_name]
                    for j, metric in enumerate(weight_classifier):
                        # NOTE(review): weak learners (< 0.5) get FULL weight 1
                        # here — confirm this is intended rather than weight 0
                        metric = 1 if metric < 0.5 else metric
                        weight_predict = metric * np.array(predict[classifier_name])
                        key = classifier_name + ' ' + self._type_performance[j]
                        test_predict_all[key] = test_predict_all.get(
                            key, np.zeros_like(weight_predict)) + weight_predict
            # voting threshold: half of the number of snapshots
            tau = len(Weight) / 2

            def sign(arr):
                # threshold the weighted votes into the two label values
                arr[arr >= tau] = label_positive
                arr[arr != label_positive] = label_negative
                return arr

            # FIX: the original stored predictions only for fold 0, extended the
            # true labels twice and broke after the first key; accumulate both
            # for every fold instead.
            for key, val in test_predict_all.items():
                if loop == 0:
                    self.label_predict[key] = sign(val).tolist()
                else:
                    self.label_predict[key].extend(sign(val).tolist())
            self.label_true.extend(self.bag_labels[temp_idx_test])
        # final performance per "<classifier> <metric>" key
        for key, val in self.label_predict.items():
            key_temp = key.split()
            val = np.array(val, dtype=int).tolist()
            # NOTE(review): argument order (predicted, true) — confirm against
            # the metric signatures returned by Utils.get_performance
            self.val_performance[key] = performance[key_temp[-1]](val, self.label_true)
        return self.val_performance
if __name__ == "__main__":
    # Entry-point placeholder: the algorithm test is provided separately.
    pass
2.算法测试
略