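# Drawback: outliers are very likely to be selected first.
# Improvement:
# First exclude the low-density points and treat them as virtual, already-labeled sample points.
# Rough (unoptimized) version of the code: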
'''
author:Daniel
date:2021-04-03
organization: CQUPT
Reference: D. Wu, C. Lin, J. Huang. Active learning for regression using greedy sampling. Information Sciences, 2019, 474: 90-105.
'''
import xlwt
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.metrics import accuracy_score, mean_absolute_error
from collections import OrderedDict
from mord import LogisticAT
from sklearn.model_selection import StratifiedKFold
from pathlib import Path
from sklearn import datasets
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist,squareform
class GSx():
    """Greedy farthest-point sampling over a pool, with low-density points excluded.

    The ``n_unavailable`` lowest-density pool points are never selected.
    They are treated like already-labeled points when scoring candidates,
    so a candidate's score is its distance to the nearest labeled or
    excluded point, and the candidate with the largest score is selected.

    Parameters
    ----------
    X_pool : ndarray
        Pool feature matrix; ``X_pool.shape[0]`` is the number of samples.
    y_pool : ndarray
        Pool labels (used for ``len`` and for colouring the plots).
    labeled : iterable of int
        Pool indices that are labeled from the start (copied, not aliased).
    budget : int
        Number of additional samples ``select`` may pick.
    X_test, y_test :
        Stored on the instance; not used by the selection itself.
    n_unavailable : int, optional
        How many of the lowest-density points to exclude. Defaults to 80
        and is clamped to the range ``[0, number of pool samples]``.
    """

    def __init__(self, X_pool, y_pool, labeled, budget, X_test, y_test,
                 n_unavailable=80):
        self.X_pool = X_pool
        self.y_pool = y_pool
        self.X_test = X_test
        self.y_test = y_test
        self.nSample = self.X_pool.shape[0]
        # Must be set before get_unavailable() is called below.
        self.n_unavailable = n_unavailable
        self.labeled = list(deepcopy(labeled))
        self.unlabeled = self.initialization()
        self.budgetLeft = deepcopy(budget)
        self.budget = deepcopy(budget)
        # Condensed and square pairwise Euclidean distances of the pool.
        self.pdist = pdist(self.X_pool, metric="euclidean")
        self.matrix = squareform(self.pdist)
        self.meanDist = np.mean(self.pdist)
        self.rho = self.cal_rho()
        # NOTE: the attribute name keeps its historical spelling because it
        # is part of the public surface of this class.
        self.unavaiable = self.get_unavailable()
        # Evaluation bookkeeping. select() does not fit or evaluate the
        # model in this version, so these keep their initial values.
        self.model = LogisticAT()
        self.AccList = []
        self.MAEList = []
        self.ALC_Acc = 0.0
        self.ALC_MAE = 0.0
        self.ALC_Acc_10k = 0.0
        self.ALC_MAE_10k = 0.0

    def cal_rho(self):
        """Return the density of every pool point as a 1-D array.

        The density of point i is the sum over all other points j of
        ``exp(-(d_ij / 2) ** 2)``. The bandwidth is the fixed constant 2;
        ``meanDist`` is only printed, it does not enter the formula.
        """
        print("meanDist", self.meanDist)
        kernel = np.exp(-(self.matrix / 2) ** 2)
        # A point does not contribute to its own density.
        np.fill_diagonal(kernel, 0.0)
        return kernel.sum(axis=1)

    def get_unavailable(self):
        """Return the indices of the ``n_unavailable`` lowest-density points.

        The count is clamped to the pool size so that small pools do not
        raise ``IndexError``. Indices are ordered by increasing density.
        """
        print(self.rho)
        count = max(0, min(int(self.n_unavailable), self.nSample))
        return np.argsort(self.rho)[:count].tolist()

    def D(self, a, b):
        """Return the Euclidean distance between arrays ``a`` and ``b``.

        The squared differences are summed along the first axis.
        """
        return np.sqrt(np.sum((a - b) ** 2, axis=0))

    def initialization(self):
        """Return the pool indices that are not initially labeled, ascending."""
        already_labeled = set(self.labeled)
        return [i for i in range(len(self.y_pool)) if i not in already_labeled]

    def select(self, plot=True):
        """Greedily label samples until the budget or the candidates run out.

        Each step picks the candidate that is farthest from its nearest
        labeled or excluded point, appends it to ``labeled``, removes it
        from ``unlabeled`` and decrements ``budgetLeft``.

        Parameters
        ----------
        plot : bool, optional
            When true (the default), show a matplotlib scatter plot of the
            pool before every pick; ``plt.show()`` is called once per step.
            Pass ``False`` to run the selection without plotting.
        """
        while self.budgetLeft > 0:
            tar_idx = self._next_target()
            if tar_idx is None:
                # Every pool point is labeled or excluded: nothing to pick.
                break
            if plot:
                self._plot_selection(tar_idx)
            self.labeled.append(tar_idx)
            self.unlabeled.remove(tar_idx)
            self.budgetLeft -= 1
            # NOTE: model evaluation is disabled in this version. The
            # previous code fit self.model on the labeled samples and
            # accumulated accuracy and MAE into AccList/MAEList and ALC_*.

    def _next_target(self):
        """Return the next index to label, or ``None`` if no candidate remains."""
        excluded = set(self.labeled)
        excluded.update(self.unavaiable)
        available = [i for i in range(self.nSample) if i not in excluded]
        if not available:
            return None
        # Reference set: labeled points plus the excluded low-density points.
        contrast = list(self.labeled) + list(self.unavaiable)
        if not contrast:
            # With no reference point every candidate scores infinity, and
            # the first candidate wins the tie.
            return available[0]
        nearest = self.matrix[np.ix_(available, contrast)].min(axis=1)
        # argmax returns the first maximum, so ties go to the lowest index.
        return available[int(np.argmax(nearest))]

    def _plot_selection(self, tar_idx):
        """Plot the pool, the labeled and excluded points, and the next pick.

        Only the first two feature columns are plotted. Labeled points get
        a red edge, excluded points a blue edge, and the pick is a red star.
        """
        # Colour by the instance's own labels, not by a module-level global.
        plt.scatter(self.X_pool[:, 0], self.X_pool[:, 1], c=self.y_pool)
        plt.scatter(self.X_pool[self.labeled][:, 0],
                    self.X_pool[self.labeled][:, 1],
                    c=self.y_pool[self.labeled],
                    edgecolors='r', linewidths=1)
        plt.scatter(self.X_pool[self.unavaiable][:, 0],
                    self.X_pool[self.unavaiable][:, 1],
                    c=self.y_pool[self.unavaiable],
                    edgecolors='b', linewidths=1.5)
        plt.scatter(self.X_pool[tar_idx][0], self.X_pool[tar_idx][1],
                    marker="*", c="r", s=120)
        plt.show()
if __name__ == '__main__':
    # Build a 2-D toy pool with four classes: the same seeded blob is
    # generated four times and shifted by a different offset each time.
    offsets = [0, 6, 12, 19]
    clusters = []
    for shift in offsets:
        blob, _ = datasets.make_blobs(n_samples=200, n_features=2, center_box=(0, 0), cluster_std=3, random_state=45)
        clusters.append(blob + shift)
    X = np.vstack(clusters)
    # Label k for the k-th run of 200 consecutive samples (0., 1., 2., 3.).
    y = np.repeat(np.arange(4, dtype=float), 200)
    # One initially labeled sample per class.
    labeled = [6, 206, 406, 606]
    budget = 40
    # Run the selection; the pool doubles as the test set here.
    model = GSx(X_pool=X, y_pool=y, labeled=labeled, budget=budget, X_test=X, y_test=y)
    model.select()