'''
author:Daniel
date:2020-12-11
organization: CQUPT
Reference: D. Wu, C. Lin, J. Huang. Active learning for regression using greedy sampling. Information Sciences, 2019, 474: 90-105.
'''
import xlwt
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.metrics import accuracy_score, mean_absolute_error
from collections import OrderedDict
from mord import LogisticAT
from sklearn.model_selection import StratifiedKFold
from pathlib import Path
from sklearn import datasets
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist,squareform
class Max:
    '''Greedy pool-based selector that queries the sample farthest from the labeled set.

    At every step the unlabeled pool sample whose summed Euclidean distance to
    all currently labeled samples is largest is moved to the labeled set.

    Attributes set by the constructor:
        X_pool, y_pool: pool features and targets, stored as given.
        X_test, y_test: stored as given; not read by any method of this class.
        nSample: number of rows in ``X_pool``.
        labeled / unlabeled: lists of pool indices.
        budget / budgetLeft: total and remaining number of queries.
        pdist / matrix: condensed and square pairwise Euclidean distances.
        meanDist: mean of the condensed pairwise distances.
        rho: per-sample kernel density computed by ``cal_rho``.
        model: a ``LogisticAT`` instance; not used by any method of this class.
    '''

    def __init__(self, X_pool, y_pool, labeled, budget, X_test, y_test):
        '''Store the data, precompute pairwise distances and split the pool indices.

        Args:
            X_pool: 2-D array of pool samples (``shape[0]`` is the pool size).
            y_pool: targets of the pool samples; indexed with lists of pool
                indices when plotting.
            labeled: iterable of pool indices that are labeled initially.
            budget: number of samples ``select`` is allowed to query.
            X_test: test features (stored only).
            y_test: test targets (stored only).
        '''
        self.X_pool = X_pool
        self.y_pool = y_pool
        self.X_test = X_test
        self.y_test = y_test
        self.nSample = self.X_pool.shape[0]
        # Copy so that the caller's index collection is never mutated.
        self.labeled = list(deepcopy(labeled))
        self.unlabeled = self.initialization()
        self.budgetLeft = deepcopy(budget)
        self.budget = deepcopy(budget)
        self.pdist = pdist(self.X_pool, metric="euclidean")
        self.matrix = squareform(self.pdist)
        self.meanDist = np.mean(self.pdist)
        self.rho = self.cal_rho()
        self.model = LogisticAT()

    def cal_rho(self):
        '''Return, for every pool sample, the sum of ``exp(-d**2)`` over all other samples.

        ``d`` is the Euclidean distance taken from ``self.matrix``; the kernel
        bandwidth is fixed to 1.  The mean pairwise distance is printed as a
        diagnostic.

        Returns:
            1-D float array of length ``self.nSample``.
        '''
        print("meanDist",self.meanDist)
        if self.nSample == 0:
            return np.zeros(0)
        kernel = np.exp(-np.square(np.asarray(self.matrix, dtype=float)))
        # A sample does not contribute to its own density.
        np.fill_diagonal(kernel, 0.0)
        return kernel.sum(axis=1)

    def initialization(self):
        '''Return the pool indices that are not in ``self.labeled``, in ascending order.'''
        already_labeled = set(self.labeled)
        return [idx for idx in range(self.nSample) if idx not in already_labeled]

    def _plot_selection(self, tar_idx):
        '''Scatter the pool, outline the labeled samples and star the new query.

        Only the first two feature columns of ``X_pool`` are drawn.
        '''
        # Colour by the stored pool targets rather than a module-level global.
        plt.scatter(self.X_pool[:, 0], self.X_pool[:, 1], c=self.y_pool)
        known = self.X_pool[self.labeled]
        plt.scatter(known[:, 0], known[:, 1], c=self.y_pool[self.labeled],
                    edgecolors='r', linewidths=1)
        plt.scatter(self.X_pool[tar_idx][0], self.X_pool[tar_idx][1],
                    marker="*", c="r", s=120)

    def select(self):
        '''Query samples until the budget is spent or the unlabeled pool is empty.

        Each iteration picks the unlabeled index with the largest summed
        distance to the labeled set (the first such index on ties), plots the
        current state, moves the index from ``unlabeled`` to ``labeled``,
        decrements ``budgetLeft`` and shows the figure.
        '''
        # Running sum of distances from every pool sample to the labeled set.
        # Columns are added one at a time, in labeling order, so each entry is
        # accumulated exactly as a per-candidate loop over ``labeled`` would.
        dist_to_labeled = np.zeros(self.nSample)
        for jdx in self.labeled:
            dist_to_labeled += self.matrix[:, jdx]

        while self.budgetLeft > 0 and self.unlabeled:
            scores = dist_to_labeled[self.unlabeled]
            # argmax returns the first maximum, i.e. the lowest-positioned
            # candidate in ``unlabeled`` among ties.
            position = int(np.argmax(scores))
            tar_idx = self.unlabeled[position]

            self._plot_selection(tar_idx)

            self.labeled.append(tar_idx)
            self.unlabeled.pop(position)
            dist_to_labeled += self.matrix[:, tar_idx]
            self.budgetLeft -= 1
            plt.show()
if __name__ == '__main__':
    # Build a toy pool with four classes: the same 200-point blob (fixed
    # random_state) is generated four times and shifted by a per-class offset.
    offsets = [0, 6, 12, 19]
    shifted_blobs = []
    for offset in offsets:
        points, _ = datasets.make_blobs(n_samples=200, n_features=2, center_box=(0, 0), cluster_std=3, random_state=45)
        shifted_blobs.append(points + offset)
    X = np.vstack(shifted_blobs)

    # Class label k for rows 200*k .. 200*k + 199, stored as floats.
    y = np.repeat(np.arange(4, dtype=float), 200)

    # One initially labeled sample per class and a budget of 40 queries.
    labeled = [6, 206, 406, 606]
    budget = 40

    # Run the selection; the pool doubles as the test set here.
    model = Max(X_pool=X, y_pool=y, labeled=labeled, budget=budget, X_test=X, y_test=y)
    model.select()