The Linear Case (the easy part)
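For reference, this is the problem the code below solves, as I read it off the code itself (notation mine): with class means m_k ordered by label, pooled within-class scatter Sw, and M the (K-1) x d matrix whose rows are the consecutive differences m_{k+1} - m_k, the dual QP is

    min_a  a^T (M Sw^{-1} M^T) a    s.t.  a >= 0,  1^T a = C

and the projection direction is recovered as w = 0.5 * Sw^{-1} M^T a. A sample x then gets label 1 + #{k : w^T x > B_k}, where the B_k are per-pair thresholds.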
'''
The Linear Cases
Date:20210418
'''
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import cvxpy as cp
class KDLAOR():
def __init__(self,X_train,y_train):
self.X_train = X_train
self.y_train = y_train
self.nSample, self.nAtt = self.X_train.shape
self.labels = np.sort(np.unique(self.y_train))
self.nClass = len(self.labels)
# print("cClass",self.nClass)
self.nEachClass = []
self.Mean = self.get_Mean()
self.totalMean = np.mean(self.X_train,axis=0)
self.Sw = self.get_Sw()
self.Sb = self.get_Sb()
self.subMean = self.get_subMean()
self.invSw = np.linalg.inv(self.Sw)
self.C = 0.003
self.w = self.Optimal()
self.Bk = self.get_Bk()
def get_Mean(self):
Mean = np.zeros((self.nClass,self.nAtt))
for i, lab in enumerate(self.labels):
            idx_list = np.where(self.y_train == lab)[0]  # [0]: np.where returns a tuple
            Mean[i] = np.mean(self.X_train[idx_list], axis=0)
            self.nEachClass.append(len(idx_list))
return Mean
def get_Sw(self):
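        # Pooled within-class scatter, each class weighted by its empirical prior n_i / n.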
Sw = np.zeros((self.nAtt,self.nAtt))
for i, lab in enumerate(self.labels):
X_i = self.X_train[np.where(self.y_train == lab)] - self.Mean[i]
Sw += (self.nEachClass[i] / self.nSample) * X_i.T @ X_i
return Sw
def get_Sb(self):
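        # Between-class scatter; computed here but not used by the optimizer below.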
Sb = np.zeros((self.nAtt,self.nAtt))
for i, lab in enumerate(self.labels):
A = self.Mean[i] - self.totalMean
Sb += (self.nEachClass[i] / self.nSample) * np.outer(A,A)
return Sb
def get_subMean(self):
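        # Row k is the difference of consecutive (ordered) class means: m_{k+1} - m_k.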
subMean = np.ones((self.nClass-1,self.nAtt))
for k in range(self.nClass-1):
subMean[k] = self.Mean[k+1] - self.Mean[k]
return subMean
    def Optimal(self):
        # Solve the dual QP: min x^T (M Sw^{-1} M^T) x  s.t.  x >= 0, 1^T x = C.
        Mid = self.subMean @ self.invSw @ self.subMean.T
        ################### approach that did not work
        # alpha = cp.Variable((self.nClass - 1, 1))
        # obj = cp.Minimize(alpha.T @ Mid @ alpha)
        # constraint = [alpha >= 0, cp.sum(alpha) == 10]
        # cp.Problem(obj, constraint).solve()
        # print(Mid)
        ##################
P = Mid * 2
x = cp.Variable(self.nClass-1)
q = np.zeros(self.nClass-1)
A = np.ones(self.nClass-1)
        b = self.C  # equality-constraint constant set in __init__
G = -1 * np.eye(self.nClass-1)
h = np.zeros(self.nClass-1)
prob = cp.Problem(cp.Minimize((1 / 2) * cp.quad_form(x, P) + q.T @ x),[G @ x <= h, A @ x ==b])
prob.solve()
# print(prob.value)
# print(x.value)
w = 0.5 * self.invSw @ self.subMean.T @ x.value
# print("w=",w)
return w
def get_Bk(self):
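        # B_k projects the sample-size-weighted midpoint between classes k and k+1 onto w.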
sumMean = np.zeros((self.nClass - 1, self.nAtt))
for k in range(self.nClass - 1):
sumMean[k] = 1 / (self.nEachClass[k] + self.nEachClass[k + 1]) * (
self.nEachClass[k] * self.Mean[k] + self.nEachClass[k + 1] * self.Mean[k + 1])
Bk = sumMean @ self.w
return Bk
def predict(self,X_test,y_test):
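        # Ordinal decision rule: label = 1 + number of thresholds B_k that w^T x exceeds.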
tmp = X_test @ self.w - self.Bk[:,None]
        pred = np.sum(tmp > 0, axis=0).astype(int) + 1  # np.int was removed in NumPy 1.24
Acc = accuracy_score(y_true=y_test,y_pred=pred)
print(Acc)
if __name__ == '__main__':
path0 = r"D:\OCdata\balance-scale.csv"
path1 = r"D:\OCdata\car.csv"
path2 = r"D:\OCdata\ERA.csv"
path3 = r"D:\OCdata\ESL.csv"
path4 = r"D:\OCdata\eucalyptus.csv"
path5 = r"D:\OCdata\LEV.csv"
path6 = r"D:\OCdata\newthyroid.csv"
path7 = r"D:\OCdata\SWD.csv"
path8 = r"D:\OCdata\toy.csv"
path9 = r"D:\OCdata\winequality-red.csv"
path10 = r"D:\OCdata\regression\abalone15-5bin.csv"
path11 = r"D:\OCdata\regression\bank15-5bin.csv"
path12 = r"D:\OCdata\regression\census15-5bin.csv"
path13 = r"D:\OCdata\regression\computer15-5bin.csv"
path14 = r"D:\OCdata\regression\housing-5bin.csv"
path15 = r"D:\OCdata\regression\machine-5bin.csv"
data = np.array(pd.read_csv(path15, header=None))
X = data[:, :-1]
    y = data[:, -1].astype(int)
    print("class labels:", np.unique(y))
    print("number of classes:", len(np.unique(y)))
    print("number of thresholds:", len(np.unique(y)) - 1)
# X,y = load_iris(return_X_y=True)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=10)
clf = KDLAOR(X_train=X_train,y_train=y_train)
clf.predict(X_test=X_test,y_test=y_test)
The linear KDLOR model performs very poorly; it is no match for LogisticAT, SVMIM and SVMEX.
'''
The Linear Cases
Date:20210627
'''
import numpy as np
import pandas as pd
from collections import OrderedDict
from mord import LogisticAT
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import cvxpy as cp
class KDLAOR():
def __init__(self,X,y):
self.X = X
self.y = y
self.nSample, self.nDim = self.X.shape
self.labels = np.sort(np.unique(self.y))
self.nClass = len(self.labels)
self.nEachClass = []
self.ClassIndex = OrderedDict()
self.Mean = self.get_Mean()
self.subMean = self.get_subMean()
self.Sw = self.get_Sw()
self.invSw = np.linalg.inv(self.Sw)
self.w = self.optimal_w()
self.boundary = self.get_boundary()
def get_Mean(self):
Mean = np.zeros((self.nClass,self.nDim))
for i, lab in enumerate(self.labels):
idx_list = np.where(self.y == lab)[0]
Mean[i] = np.mean(self.X[idx_list],axis=0)
self.nEachClass.append(np.size(idx_list))
self.ClassIndex[i] = idx_list
return Mean
def get_Sw(self):
Sw = np.zeros((self.nDim, self.nDim))
for i, lab in enumerate(self.labels):
Xi = self.X[self.ClassIndex[i]] - self.Mean[i]
Sw += (self.nEachClass[i]/self.nSample) * Xi.T @ Xi
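        # A small ridge term keeps Sw invertible when features are (nearly) collinear.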
Sw = Sw + 0.0001 * np.identity(self.nDim)
return Sw
def get_subMean(self):
subMean = np.ones((self.nClass-1, self.nDim))
for k in range(self.nClass-1):
subMean[k] = self.Mean[k+1] - self.Mean[k]
return subMean
def optimal_w(self):
        alpha = cp.Variable(self.nClass - 1)            # dual variable to solve for
        P = self.subMean @ self.invSw @ self.subMean.T  # QP matrix: M Sw^{-1} M^T
        C = 1
        constraint = [alpha >= 0, cp.sum(alpha) == C]
prob = cp.Problem(cp.Minimize(cp.quad_form(alpha, P)), constraint)
prob.solve()
# print(alpha.value)
w = 0.5 * self.invSw @ self.subMean.T @ alpha.value
print("w===",w)
return w
def get_boundary(self):
sumMean = np.zeros((self.nClass-1, self.nDim))
for k in range(self.nClass-1):
sumMean[k] = 1 / (self.nEachClass[k] + self.nEachClass[k+1]) * (self.nEachClass[k] * self.Mean[k] + self.nEachClass[k+1] * self.Mean[k+1])
boundary = sumMean @ self.w
return boundary
def predict(self,X):
tmp = X @ self.w - self.boundary[:,None]
        y_pred = np.sum(tmp > 0, axis=0).astype(int) + 1
return y_pred
if __name__ == '__main__':
path0 = r"D:\OCdata\balance-scale.csv"
path1 = r"D:\OCdata\car.csv"
path2 = r"D:\OCdata\ERA.csv"
path3 = r"D:\OCdata\ESL.csv"
path4 = r"D:\OCdata\eucalyptus.csv"
path5 = r"D:\OCdata\LEV.csv"
path6 = r"D:\OCdata\newthyroid.csv"
path7 = r"D:\OCdata\SWD.csv"
path8 = r"D:\OCdata\toy.csv"
path9 = r"D:\OCdata\winequality-red.csv"
path10 = r"D:\OCdata\regression\abalone15-5bin.csv"
path11 = r"D:\OCdata\regression\bank15-5bin.csv"
path12 = r"D:\OCdata\regression\census15-5bin.csv"
path13 = r"D:\OCdata\regression\computer15-5bin.csv"
path14 = r"D:\OCdata\regression\housing-5bin.csv"
path15 = r"D:\OCdata\regression\machine-5bin.csv"
path16 = r"D:\OCdata\HDI2.csv"
data = np.array(pd.read_csv(path16, header=None))
X = data[:, :-1]
    y = data[:, -1].astype(int)
    print("class labels:", np.unique(y))
    print("number of classes:", len(np.unique(y)))
    print("number of thresholds:", len(np.unique(y)) - 1)
# X,y = load_iris(return_X_y=True)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=10)
clf = KDLAOR(X=X_train,y=y_train)
y_pred = clf.predict(X=X_test)
Acc_1 = accuracy_score(y_test,y_pred)
print("KDLOR的精度:",Acc_1)
model = LogisticAT()
model.fit(X=X_train,y=y_train)
y_pred = model.predict(X=X_test)
acc = accuracy_score(y_true=y_test,y_pred=y_pred)
print("LAT的精度:",acc)
The first version had a dimension bug (the per-class counts were taken along the wrong axis). I have now rewritten it, and the results are the same: the linear KDLOR model really is not very good.
Also, setting the parameter C to any positive real number has no effect on the result. That is expected: scaling alpha by t > 0 scales w and every boundary b_k by the same factor, so the sign of w^T x - b_k, and with it every prediction, is unchanged (see the sketch below).
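A minimal sanity check of that invariance, assuming the clf fitted in the block above is still in scope:

# Hypothetical check: scaling the trained projection and boundaries by the same
# positive factor (which is what changing C does) leaves all predictions unchanged.
t = 100.0                                   # any positive scale factor
scores = X_test @ clf.w - clf.boundary[:, None]
scores_scaled = X_test @ (t * clf.w) - (t * clf.boundary)[:, None]
pred = np.sum(scores > 0, axis=0) + 1
pred_scaled = np.sum(scores_scaled > 0, axis=0) + 1
assert np.array_equal(pred, pred_scaled)    # identical labels for every t > 0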
The projection plot above makes it clear why linear KDLOR performs so poorly.
The Non-linear Case:
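How the code below kernelizes the linear model, again as I read it off the code: the data matrix is replaced by the RBF Gram matrix K = k(X, X), the within-class scatter Sw is replaced by

    H = sum_c K_c (I - (1/n_c) 11^T) K_c^T + u * I,

where K_c collects the columns of K belonging to class c and u * I is a small ridge term. The projection is carried by dual coefficients beta = 0.5 * H^{-1} M^T alpha, and a test point x is scored as beta^T k(X_train, x) and thresholded exactly as in the linear case.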
'''
The Non-linear Cases
Date:20210418
'''
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import rbf_kernel
import cvxpy as cp
class KDLOR_Kernel():
def __init__(self,X_train,y_train):
self.X_train = X_train
self.y_train = y_train
self.nSample, self.nAtt = self.X_train.shape
self.labels = np.sort(np.unique(self.y_train))
self.nClass = len(self.labels)
self.nEachClass = self.get_nEachClass()
self.kernelType = "rbf"
self.u = 0.001
self.gamma = 0.01
self.C = 0.003
self.K_Train = rbf_kernel(X=self.X_train,Y=self.X_train,gamma=self.gamma)
self.Mean = self.get_Mean()
# print("meanClass:::")
# print(self.Mean)
self.subMean = self.get_subMean()
self.H = self.get_H()
self.H_inv = np.linalg.inv(self.H)
self.Q = self.subMean @ self.H_inv @ self.subMean.T
print("Q::",self.Q)
# print("Q::",self.Q)
self.alpha = self.Optimal()
print("alpha==",self.alpha)
self.beta = self.get_beta()
print("beta:::",self.beta)
self.threshold = self.get_threshold()
print("阈值::",self.threshold)
def get_nEachClass(self):
nEachClass = []
for i, lab in enumerate(self.labels):
nEachClass.append(len(np.where(self.y_train == lab)[0]))
return nEachClass
def get_Mean(self):
meanClass = np.zeros((self.nClass, self.nSample))
for i, lab in enumerate(self.labels):
meanClass[i] = np.mean(self.K_Train[np.where(self.y_train==lab)[0]],axis=0)
return meanClass
def get_subMean(self):
subMean = np.zeros((self.nClass-1,self.nSample))
for k in range(self.nClass-1):
subMean[k] = self.Mean[k+1] - self.Mean[k]
return subMean
def get_H(self):
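        # H is the kernelized within-class scatter plus a ridge term (my reading):
        # H = sum_c K_c (I - (1/n_c) 11^T) K_c^T + u * I, with K_c the columns of class c.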
H = np.zeros((self.nSample, self.nSample))
for i, lab in enumerate(self.labels):
P = self.K_Train[:, np.where(self.y_train == lab)[0]]
Mid = (np.eye(self.nEachClass[i]) - (1 / self.nEachClass[i]) * np.ones((self.nEachClass[i], self.nEachClass[i])))
H += P @ Mid @ P.T
H += self.u * np.eye(self.nSample)
return H
def Optimal(self):
P = self.Q
x = cp.Variable(self.nClass-1)
        ## inequality constraints: x >= 0
G = -1 * np.eye(self.nClass-1)
h = np.zeros(self.nClass-1)
        ## equality constraint: sum(x) == C
A = np.ones(self.nClass-1)
b = self.C
prob = cp.Problem(cp.Minimize(cp.quad_form(x, P)), [G @ x <= h, A @ x == b])
prob.solve()
# print("x::",x.value)
# aa = np.array([0.0007501,0.0007518,0.0007505,0.0007476])
return x.value
def get_beta(self):
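        # Mirrors the linear solution w = 0.5 * Sw^{-1} M^T alpha, with H in place of Sw.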
        return 0.5 * self.H_inv @ self.subMean.T @ self.alpha  # .T on a 1-D array is a no-op, dropped
def get_threshold(self):
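        # Thresholds at the unweighted midpoints of adjacent class means, unlike the
        # size-weighted midpoints used in the linear version above.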
threshold = np.zeros(self.nClass-1)
for i in range(self.nClass-1):
threshold[i] = self.beta.T @ (self.Mean[i+1] + self.Mean[i])/2
return threshold
def predict(self,X_test,y_test):
self.X_test = X_test
self.y_test = y_test
K_Test = rbf_kernel(X=self.X_train, Y=self.X_test, gamma=self.gamma)
# print("K_Test::",K_Test.shape)
# print("beta::",self.beta.shape)
Z_test = self.beta.T @ K_Test
# print("Z_test::",Z_test)
tmp = Z_test - self.threshold[:, None]
        pred = np.sum(tmp > 0, axis=0).astype(int) + 1
print("pred::",pred)
Acc = accuracy_score(y_pred=pred, y_true=self.y_test)
print("预测精度::",Acc)
if __name__ == '__main__':
path0 = r"D:\OCdata\balance-scale.csv"
path1 = r"D:\OCdata\car.csv"
path2 = r"D:\OCdata\ERA.csv"
path3 = r"D:\OCdata\ESL.csv"
path4 = r"D:\OCdata\eucalyptus.csv"
path5 = r"D:\OCdata\LEV.csv"
path6 = r"D:\OCdata\newthyroid.csv"
path7 = r"D:\OCdata\SWD.csv"
path8 = r"D:\OCdata\toy.csv"
path9 = r"D:\OCdata\winequality-red.csv"
path10 = r"D:\OCdata\regression\abalone15-5bin.csv"
path11 = r"D:\OCdata\regression\bank15-5bin.csv"
path12 = r"D:\OCdata\regression\census15-5bin.csv"
path13 = r"D:\OCdata\regression\computer15-5bin.csv"
path14 = r"D:\OCdata\regression\housing-5bin.csv"
path15 = r"D:\OCdata\regression\machine-5bin.csv"
data = np.array(pd.read_csv(path11, header=None))
X = data[:, :-1]
    y = data[:, -1].astype(int)
    print("class labels:", np.unique(y))
    print("number of classes:", len(np.unique(y)))
    print("number of thresholds:", len(np.unique(y)) - 1)
# X,y = load_iris(return_X_y=True)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=10)
print("训练数据个数:",X_train.shape[0])
print("测试数据个数:",X_test.shape[0])
clf = KDLOR_Kernel(X_train=X_train,y_train=y_train)
# clf.get_Mean_H()
# clf.Optimal()
clf.predict(X_test=X_test,y_test=y_test)