"""Two strategies: BD-- and BD++.

Evaluated with 5 times 10-fold cross validation.
"""
import numpy as np
import pandas as pd
from pathlib import Path
from copy import deepcopy
from collections import OrderedDict
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error
class BDOR():
    """Binary-decomposition ordinal regression with four prediction strategies.

    An ordinal problem with K distinct labels is decomposed into K-1 binary
    subproblems "is y <= t?", one logistic-regression model per threshold t.
    The ``predict_*`` methods aggregate the binary models' probability
    outputs into an ordinal prediction in different ways and store accuracy,
    macro-F1 and MAE on the held-out test set in ``Acc_i`` / ``F1_i`` /
    ``MAE_i`` (i = 1..4).

    NOTE(review): the label arithmetic throughout (``tar + 1``,
    ``proDict[ele - 1]``, labels used as matrix column indices in
    ``predict_third``) assumes the class labels are consecutive numeric
    values — confirm this holds for every data set fed in.
    """
    def __init__(self,X_train,y_train,labeled,X_test,y_test):
        # X_* are 2-D feature matrices; y_* are 1-D numeric label vectors.
        self.X = X_train
        self.y = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.N = len(y_train)  # number of training instances
        # Indices (into the training arrays) of the initially labeled pool,
        # and the complementary unlabeled pool.
        self.absLabeled = list(deepcopy(labeled))
        self.unLabeled = self.init_unlabeled()
        # Sorted distinct labels; np.arange stops before labels[-1], giving
        # one binary threshold per label except the last (K-1 thresholds).
        self.labels = np.unique(y_train)
        self.target = np.array([_ for _ in np.arange(self.labels[0], self.labels[-1], 1)])
        self.labNum = len(self.labels)
        # One binary classifier per threshold, keyed by threshold value.
        self.ocModel = self.init_learning_model()
        self.trainIndex = OrderedDict()   # per-threshold training indices
        self.trainTarget = OrderedDict()  # per-threshold binarized targets
        # Evaluation results, filled by predict_one/second/third/four.
        self.Acc_1 = None
        self.MAE_1 = None
        self.F1_1 = None
        self.Acc_2 = None
        self.MAE_2 = None
        self.F1_2 = None
        self.Acc_3 = None
        self.MAE_3 = None
        self.F1_3 = None
        self.Acc_4 = None
        self.MAE_4 = None
        self.F1_4 = None

    def init_unlabeled(self):
        """Return the training indices not present in the labeled pool."""
        unlabeled = [_ for _ in range(self.N)]
        for ele in self.absLabeled:
            unlabeled.remove(ele)
        return unlabeled

    def init_learning_model(self):
        """Create one binary logistic-regression model per threshold."""
        model_dict = OrderedDict()
        for tar in self.target:
            model_dict[tar] = LogisticRegression(solver='newton-cg', penalty='l2')
        return model_dict

    def reconstruct_and_train(self):
        """Rebuild the K-1 binary training sets from the labeled pool and fit.

        For threshold ``tar`` each labeled instance gets binary target
        ``tar`` if its true label is <= tar, else ``tar + 1``.  With
        sklearn's ascending class ordering this puts P(y <= tar) in
        ``predict_proba`` column 0 and P(y > tar) in column 1.
        """
        self.trainIndex = OrderedDict()
        self.trainTarget = OrderedDict()
        for tar in self.target:
            self.trainIndex[tar] = deepcopy(self.absLabeled)
            self.trainTarget[tar] = deepcopy(self.y[self.absLabeled])
            for j in range(len(self.absLabeled)):
                if self.trainTarget[tar][j] <= tar:
                    self.trainTarget[tar][j] = tar
                else:
                    self.trainTarget[tar][j] = tar + 1
        """add the interval labeled instances into the training set"""
        # Sanity check: index and target lists must have matching lengths.
        for tar in self.target:
            if len(self.trainIndex[tar]) != len(self.trainTarget[tar]):
                print("位置:bdocReConstruct。原因:索引和标记常数不相同")
        for tar, model in self.ocModel.items():
            model.fit(self.X[self.trainIndex[tar]], self.trainTarget[tar])

    def predict_one(self):
        """Strategy 1: adjacent-probability differences.

        Reconstructs P(y = r) as P(y > r-1) - P(y > r); the two boundary
        labels use a single model's column directly.  The argmax label wins.
        Stores Acc_1 / F1_1 / MAE_1.
        """
        self.reconstruct_and_train()
        proDict = OrderedDict()
        for tar, model in self.ocModel.items():
            # proDict[tar][j] = [P(y <= tar), P(y > tar)] for test instance j.
            proDict[tar] = model.predict_proba(self.X_test)
        y_pred = np.zeros(len(self.y_test))
        for j in range(len(self.y_test)):
            prob = OrderedDict()
            for ele in self.labels:
                if ele == self.labels[0]:
                    # Lowest label: P(y <= first threshold).
                    prob[ele] = proDict[ele][j][0]
                elif ele == self.labels[-1]:
                    # Highest label: P(y > last threshold).
                    prob[ele] = proDict[ele - 1][j][1]
                else:
                    # Interior label: P(y > ele-1) - P(y > ele).
                    prob[ele] = proDict[ele - 1][j][1] - proDict[ele][j][1]
            y_pred[j] = max(prob, key=prob.get)
        self.Acc_1 = accuracy_score(y_true=self.y_test,y_pred=y_pred)
        self.F1_1 = f1_score(y_true=self.y_test, y_pred=y_pred,average='macro')
        self.MAE_1 = mean_absolute_error(y_true=self.y_test,y_pred=y_pred)

    def predict_second(self):
        """Strategy 2: summed-confidence voting across all K-1 models.

        Each threshold model adds P(y <= tar) to every label <= tar and
        P(y > tar) to every label > tar; the label with the largest total
        wins.  Stores Acc_2 / F1_2 / MAE_2.
        """
        self.reconstruct_and_train()
        testNum = len(self.y_test)
        proDict = OrderedDict()
        for tar, model in self.ocModel.items():
            proDict[tar] = model.predict_proba(self.X_test)
        # Template accumulator: one confidence counter per label.
        conf_dict = OrderedDict()
        for lab in self.labels:
            conf_dict[lab] = 0
        y_pred = np.zeros(testNum)
        for j in range(testNum):
            conf_Dict = deepcopy(conf_dict)
            for tar, prob in proDict.items():
                for ele in self.labels:
                    if ele <= tar:
                        conf_Dict[ele] += prob[j][0]
                    else:
                        conf_Dict[ele] += prob[j][1]
            y_pred[j] = max(conf_Dict,key=conf_Dict.get)
        self.Acc_2 = accuracy_score(y_true=self.y_test,y_pred=y_pred)
        self.F1_2 = f1_score(y_true=self.y_test, y_pred=y_pred,average='macro')
        self.MAE_2 = mean_absolute_error(y_true=self.y_test,y_pred=y_pred)

    def predict_third(self):
        """Strategy 3: per-instance support matrix, averaged then argmaxed.

        Builds a (threshold x label) probability matrix per test instance,
        column-sums it and predicts the argmax column.

        NOTE(review): ``np.argmax`` returns a column INDEX, so this is only
        correct when labels are integers coinciding with their column
        positions — confirm for the data used.  Stores Acc_3 / F1_3 / MAE_3.
        """
        self.reconstruct_and_train()
        testNum = len(self.y_test)
        y_pred = np.zeros(testNum)
        for i in range(testNum):
            prob_dict = OrderedDict()
            for tar, model in self.ocModel.items():
                prob_dict[tar] = model.predict_proba(self.X_test[[i]])[0]
            # Rows: threshold models; columns: label values.  The extra
            # column allows label values up to labNum when labels start at 1.
            prob_matrix = np.zeros((self.labNum,self.labNum+1))
            for tar, prob in prob_dict.items():
                for lab in self.labels:
                    # Cast float labels/thresholds to usable array indices.
                    tar = int(tar)
                    lab = int(lab)
                    if lab <= tar:
                        prob_matrix[tar, lab] = prob[0]
                    else:
                        prob_matrix[tar, lab] = prob[1]
            prolist = np.sum(prob_matrix, axis=0)
            # Dividing by K-1 normalizes but does not change the argmax.
            prolist = prolist / (self.labNum - 1)
            y_pred[i] = np.argmax(prolist)
        self.Acc_3 = accuracy_score(y_true=self.y_test,y_pred=y_pred)
        self.F1_3 = f1_score(y_true=self.y_test, y_pred=y_pred,average='macro')
        self.MAE_3 = mean_absolute_error(y_true=self.y_test,y_pred=y_pred)

    def predict_four(self):
        """Strategy 4 (baseline): one-vs-rest multiclass logistic regression
        trained directly on the labeled pool.  Stores Acc_4 / F1_4 / MAE_4."""
        model = LogisticRegression(solver='newton-cg', penalty='l2',multi_class='ovr')
        model.fit(X=self.X[self.absLabeled],y=self.y[self.absLabeled])
        y_pred = model.predict(self.X_test)
        self.Acc_4 = accuracy_score(y_true=self.y_test,y_pred=y_pred)
        self.F1_4 = f1_score(y_true=self.y_test, y_pred=y_pred,average='macro')
        self.MAE_4 = mean_absolute_error(y_true=self.y_test,y_pred=y_pred)
if __name__ == '__main__':
    # BUGFIX: raw string — "D:\OCdata" contains the invalid escape "\O"
    # (SyntaxWarning today, a syntax error in future Python versions).
    p = Path(r"D:\OCdata")
    names = ["housing-5bin"]
    for name in names:
        path = p.joinpath(name + ".csv")
        print("#####################################################{}".format(path))
        # CSV with no header: all columns but the last are features, the
        # last column is the ordinal label.
        data = np.array(pd.read_csv(path, header=None))
        X = data[:, :-1]
        y = data[:, -1]
        Rounds = 5
        labNum = len(np.unique(y))
        print("数据集信息{}".format(set(y)))
        # Budget schedule for the active-learning experiments (currently
        # unused by the evaluation loop below).
        budgetlist = np.array([labNum * i for i in range(1, 21)])
        Budget = labNum * 20
        # Metric accumulators across all Rounds x 10 folds, one per strategy.
        Acc_1_list = []
        Acc_2_list = []
        Acc_3_list = []
        Acc_4_list = []
        MAE_1_list = []
        MAE_2_list = []
        MAE_3_list = []
        MAE_4_list = []
        F1_1_list = []
        F1_2_list = []
        F1_3_list = []
        F1_4_list = []
        for r in range(Rounds):
            SKF = StratifiedKFold(n_splits=10, shuffle=True)
            for train_idx, test_idx in SKF.split(X, y):
                train_X = X[train_idx]
                train_y = y[train_idx]
                test_X = X[test_idx]
                test_y = y[test_idx]
                # Seed the labeled pool with one randomly chosen instance per
                # class so every binary subproblem sees both of its classes.
                labeled = []
                label_dict = OrderedDict()
                for lab in np.unique(train_y):
                    label_dict[lab] = []
                for idx in range(len(train_y)):
                    label_dict[train_y[idx]].append(idx)
                for idxlist in label_dict.values():
                    for jdx in np.random.choice(idxlist, size=1, replace=False):
                        labeled.append(jdx)
                # BUGFIX: removed the debug leftover
                # ``labeled = range(len(train_idx))`` which silently marked
                # EVERY training instance as labeled, making the per-class
                # sampling above dead code.
                model = BDOR(X_train=train_X, y_train=train_y, labeled=labeled, X_test=test_X, y_test=test_y)
                model.predict_one()
                model.predict_second()
                model.predict_third()
                model.predict_four()
                Acc_1_list.append(model.Acc_1)
                Acc_2_list.append(model.Acc_2)
                Acc_3_list.append(model.Acc_3)
                Acc_4_list.append(model.Acc_4)
                MAE_1_list.append(model.MAE_1)
                MAE_2_list.append(model.MAE_2)
                MAE_3_list.append(model.MAE_3)
                MAE_4_list.append(model.MAE_4)
                F1_1_list.append(model.F1_1)
                F1_2_list.append(model.F1_2)
                F1_3_list.append(model.F1_3)
                F1_4_list.append(model.F1_4)
        print("精度对比=",np.mean(Acc_1_list),"|",np.mean(Acc_2_list),"|",np.mean(Acc_3_list), "|", np.mean(Acc_4_list))
        print("绝对误差=", np.mean(MAE_1_list), "|", np.mean(MAE_2_list), "|", np.mean(MAE_3_list), "|", np.mean(MAE_4_list))
        print("F1指标=", np.mean(F1_1_list), "|", np.mean(F1_2_list), "|", np.mean(F1_3_list), "|", np.mean(F1_4_list))
        break