# 我竟然写出如此垃圾的算法!计算复杂度上天!
# (Translation: "I can't believe I wrote such a garbage algorithm — the computational complexity is through the roof!")
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn import datasets
from scipy.spatial.distance import pdist, squareform
from sklearn.model_selection import train_test_split
from collections import OrderedDict
from itertools import combinations, product
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from itertools import combinations, product
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
class pwknn():
    """Pairwise-KNN ordinal classifier.

    A small labeled subset (5 samples per class) is drawn from the training
    indices; every differently-labeled pair of those samples forms a "pair
    set".  A test sample is classified by pairing it with training samples and
    scoring the resulting pairs with a distance-weighted KNN vote over the
    pair set.  A LogisticRegression baseline is fitted on the same labeled
    subset for comparison.
    """

    def __init__(self, X, y, neiborNum, train_idx, test_idx):
        """
        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features) -- feature matrix
        y : ndarray of shape (n_samples,) -- ordinal integer labels
        neiborNum : int -- neighborhood size k for the pair-space vote
        train_idx : array-like -- indices of the training fold
        test_idx : array-like -- indices of the test fold; 20 are subsampled
            at random (requires len(test_idx) >= 20)
        """
        self.X = X
        self.y = y
        self.trainIdx = train_idx
        # Subsample 20 test points to keep the very expensive fit() tractable.
        self.testIdx = np.random.choice(test_idx, size=20, replace=False)
        self.X_test = X[self.testIdx]
        self.y_test = y[self.testIdx]
        self.labels = np.unique(y)
        self.idxlist = [idx for idx in range(len(y))]
        self.absLabeled = self.initLabeledData()
        self.neiNum = neiborNum
        self.pairSet = self.contructPairSet()
        # Metrics for the pairwise model ...
        self.Acc = 0
        self.F1macro = 0
        self.MAE = 0
        # ... and for the baseline fitted on the same labeled subset.
        self.knnAcc = 0
        self.knnF1 = 0
        self.knnMAE = 0

    def D(self, a, b):
        """Euclidean distance between feature vectors ``a`` and ``b``."""
        return np.sqrt(np.sum((a - b) ** 2))

    def PD(self, pairA, pairB):
        """Distance between two index pairs: sum of position-wise distances."""
        return self.D(self.X[pairA[0]], self.X[pairB[0]]) + self.D(self.X[pairA[1]], self.X[pairB[1]])

    def initLabeledData(self):
        """Draw 5 labeled samples per class from the training indices.

        Returns a flat list of sample indices (5 per class).  Requires every
        class to have at least 5 samples in ``trainIdx``.
        """
        labeledIndex = []
        labelDict = OrderedDict()
        # BUG FIX: the original iterated the module-level global ``y`` here
        # instead of ``self.y``, silently coupling the class to the script.
        for label in np.unique(self.y):
            labelDict[label] = []
        for idx in self.trainIdx:
            labelDict[self.y[idx]].append(idx)
        for idxlist in labelDict.values():
            for jdx in np.random.choice(idxlist, size=5, replace=False):
                labeledIndex.append(jdx)
        return labeledIndex

    def contructPairSet(self):
        """Build the labeled pair set.

        Every combination of two labeled samples with *different* labels is
        stored, keyed by the index pair, with its label pattern ordered
        ascending.  Equal-label pairs are skipped.
        """
        pairset = OrderedDict()
        for a, b in combinations(self.absLabeled, 2):
            if self.y[a] < self.y[b]:
                pairset[(a, b)] = (self.y[a], self.y[b])
            # BUG FIX: the original repeated the same ``<`` test in the elif,
            # so every pair whose first label was larger was silently dropped.
            elif self.y[a] > self.y[b]:
                pairset[(a, b)] = (self.y[b], self.y[a])
        print("完成样本对组建")
        return pairset

    def fit(self):
        """Predict the sampled test points and populate the metric attributes.

        For each test sample and each candidate label, the sample is paired
        with training samples of a different label and the resulting pair is
        scored by a distance-weighted KNN vote over ``pairSet``; the label
        with the highest hit ratio wins.

        NOTE(review): only the pair from the *last* ``jdx`` iteration is ever
        scored (``tempObj``/``tempPat`` are overwritten each pass) while
        ``fenmu`` counts every iteration — the voting block may have been
        intended to sit inside the ``jdx`` loop.  Behavior preserved here;
        confirm against the original algorithm before restructuring.
        """
        y_pred = []
        print("测试样本=", len(self.testIdx), self.testIdx)
        for idx in self.testIdx:
            print("测试样本=", idx)
            confidence = OrderedDict()
            for label in self.labels:
                tempObj = 0
                tempPat = 0
                fenzi = 0  # numerator: vote hits
                fenmu = 0  # denominator: differently-labeled train samples
                for jdx in self.trainIdx:
                    if self.y[jdx] == label:
                        continue
                    elif self.y[jdx] < label:
                        fenmu += 1
                        tempObj = (jdx, idx)  # ordered: smaller label first
                        tempPat = (self.y[jdx], label)
                    else:  # self.y[jdx] > label
                        fenmu += 1
                        tempObj = (idx, jdx)
                        tempPat = (label, self.y[jdx])
                # Distance from the candidate pair to every labeled pair.
                pairDistance = []
                pairPattern = []
                for pair, pattern in self.pairSet.items():
                    pairDistance.append(self.PD(pair, tempObj))
                    pairPattern.append(pattern)
                ordidx = np.argsort(pairDistance)  # neighbors by distance
                # Distance-weighted vote among the k nearest pairs, per pattern.
                neiborWeightDict = OrderedDict()
                for i in range(self.neiNum):
                    neiborWeightDict[pairPattern[ordidx[i]]] = 0
                # Hoisted loop-invariant: spread between farthest and nearest
                # of the k neighbors (original recomputed it every iteration).
                down = pairDistance[ordidx[self.neiNum - 1]] - pairDistance[ordidx[0]]
                for i in range(self.neiNum):
                    if down == 0:
                        # All k neighbors equidistant: equal weight.
                        neiborWeightDict[pairPattern[ordidx[i]]] += 1
                    else:
                        neiborWeightDict[pairPattern[ordidx[i]]] += (
                            pairDistance[ordidx[self.neiNum - 1]] - pairDistance[ordidx[i]]
                        ) / down
                if tempPat == max(neiborWeightDict, key=neiborWeightDict.get):
                    fenzi += 1
                confidence[label] = fenzi / fenmu
            y_pred.append(max(confidence, key=confidence.get))
        y_pre = np.array(y_pred)
        self.Acc = accuracy_score(y_true=self.y_test, y_pred=y_pre)
        self.F1macro = f1_score(y_true=self.y_test, y_pred=y_pre, average='macro')
        self.MAE = np.mean(abs(self.y_test - y_pre))
        # Baseline: logistic regression on the same labeled subset.
        knnModel = LogisticRegression(solver='newton-cg', penalty='l2')
        knnModel.fit(self.X[self.absLabeled], self.y[self.absLabeled])
        y_predict = knnModel.predict(self.X_test)
        self.knnAcc = accuracy_score(y_true=self.y_test, y_pred=y_predict)
        self.knnF1 = f1_score(y_true=self.y_test, y_pred=y_predict, average='macro')
        self.knnMAE = np.mean(abs(self.y_test - y_predict))
if __name__ == '__main__':
    # Candidate dataset paths: ordinal-classification benchmarks (path0-9)
    # and discretized ("5-bin") regression datasets (path10-15).
    path0 = r"D:\OCdata\balance-scale.csv"
    path1 = r"D:\OCdata\car.csv"
    path2 = r"D:\OCdata\ERA.csv"
    path3 = r"D:\OCdata\ESL.csv"
    path4 = r"D:\OCdata\eucalyptus.csv"
    path5 = r"D:\OCdata\LEV.csv"
    path6 = r"D:\OCdata\newthyroid.csv"
    path7 = r"D:\OCdata\SWD.csv"
    path8 = r"D:\OCdata\toy.csv"
    path9 = r"D:\OCdata\winequality-red.csv"
    path10 = r"D:\OCdata\regression\abalone15-5bin.csv"
    path11 = r"D:\OCdata\regression\bank15-5bin.csv"
    path12 = r"D:\OCdata\regression\census15-5bin.csv"
    path13 = r"D:\OCdata\regression\computer15-5bin.csv"
    path14 = r"D:\OCdata\regression\housing-5bin.csv"
    path15 = r"D:\OCdata\regression\machine-5bin.csv"
    # Headerless CSV: last column is the label, the rest are features.
    data = np.array(pd.read_csv(path7, header=None))
    X = data[:, :-1]
    y = data[:, -1]
    # 5-fold stratified CV; metrics are averaged across folds.
    SKF = StratifiedKFold(n_splits=5, shuffle=True)
    AccList = []
    F1List = []
    MAEList = []
    knnAccList = []
    knnF1List = []
    knnMAEList = []
    for train_idx, test_idx in SKF.split(X, y):
        print("###########################")
        # BUG FIX: the original passed ``test_idx`` as *both* train_idx and
        # test_idx, so the model never trained on the training fold.
        pw = pwknn(X=X, y=y, neiborNum=5, train_idx=train_idx, test_idx=test_idx)
        pw.fit()
        AccList.append(pw.Acc)
        F1List.append(pw.F1macro)
        MAEList.append(pw.MAE)
        knnAccList.append(pw.knnAcc)
        knnF1List.append(pw.knnF1)
        # BUG FIX: the original appended ``pw.MAE`` (pairwise model) here,
        # so the reported baseline MAE was the wrong model's metric.
        knnMAEList.append(pw.knnMAE)
    print("平均精度=", np.mean(AccList))
    print("平均F1=", np.mean(F1List))
    print("平均MAE=", np.mean(MAEList))
    print("knn平均精度=", np.mean(knnAccList))
    print("knn平均F1=", np.mean(knnF1List))
    print("knn平均MAE=", np.mean(knnMAEList))