import csv
from random import shuffle
from math import sqrt
# Load train.csv and test.csv.
def _read_samples(path, has_label):
    """Read a CSV file into a list of sample rows.

    The first two columns of every row are converted to ``float``.  When
    ``has_label`` is true the third column is additionally converted to
    ``int``.  Any further columns are kept as the strings ``csv.reader``
    produced.  Blank lines in the file are skipped.
    """
    samples = []
    # newline='' lets the csv module do its own newline handling, as the
    # csv documentation requires for file objects passed to csv.reader.
    with open(path, 'rt', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            row[0] = float(row[0])
            row[1] = float(row[1])
            if has_label:
                row[2] = int(row[2])
            samples.append(row)
    return samples


train = _read_samples('D:\\train.csv', has_label=True)
test = _read_samples('D:\\test.csv', has_label=False)
# Split the training set 4:1 into an estimation set (gj) and a validation
# set (yz), taking one fifth of each class for validation.
train0, train1 = [], []
for sample in train:
    # Label 0 goes to train0, every other label to train1.
    (train0 if sample[-1] == 0 else train1).append(sample)

# Size of the validation share of each class (one fifth, rounded down).
l0 = len(train0) // 5
l1 = len(train1) // 5

# Randomise the order within each class before slicing.
shuffle(train0)
shuffle(train1)

yz = train0[:l0] + train1[:l1]
gj = train0[l0:] + train1[l1:]
# Preprocessing
# Standardisation: zero mean, unit variance
def a_v(train):
    """Standardise the feature columns of ``train`` in place.

    Every column except the last one is treated as a feature: it is shifted
    by its mean and divided by its population standard deviation.  The last
    column is left untouched.

    Returns:
        ``(average, variance)`` -- two lists with one entry per feature
        column.  ``average`` holds the column means.  ``variance`` holds the
        divisor that was applied, i.e. the standard deviation, or ``1.0``
        for a constant column whose standard deviation is zero.

    Raises:
        IndexError: if ``train`` is empty.
    """
    n_samples = len(train)
    n_features = len(train[0]) - 1

    average = []
    variance = []
    for j in range(n_features):
        mean = sum(row[j] for row in train) / n_samples
        spread = sqrt(sum((row[j] - mean) ** 2 for row in train) / n_samples)
        average.append(mean)
        # A constant column has zero spread; dividing by 1.0 instead maps it
        # to all zeros and avoids a ZeroDivisionError here and in a_v_t.
        variance.append(spread if spread else 1.0)

    for row in train:
        for j in range(n_features):
            row[j] = (row[j] - average[j]) / variance[j]
    return average, variance
def a_v_t(test, average, variance):
    """Apply previously computed standardisation statistics to ``test`` in place.

    For every row, column ``j`` becomes ``(row[j] - average[j]) / variance[j]``
    for each ``j`` that has an entry in both ``average`` and ``variance``.
    Columns beyond that (for example a trailing label) are left untouched.
    Nothing is returned.
    """
    for row in test:
        for j, (mean, scale) in enumerate(zip(average, variance)):
            row[j] = (row[j] - mean) / scale
# Linear mapping onto [0, 1] (min-max scaling)
def min_max(train):
    """Rescale the first two columns of ``train`` to [0, 1] in place.

    Each of the two feature columns is mapped with
    ``(x - min) / (max - min)``, where min and max are taken over ``train``.
    A constant column (max == min) is mapped to ``0.0``.

    Returns:
        ``(min, max)`` -- two 2-element lists with the per-column minimum
        and maximum found before rescaling, suitable for ``min_max_t``.

    Raises:
        ValueError: if ``train`` is empty.
    """
    if not train:
        raise ValueError("min_max() requires at least one sample")

    # Seed the extrema from the first sample rather than from fixed
    # sentinels, so data of any magnitude is handled.
    lows = [train[0][0], train[0][1]]
    highs = [train[0][0], train[0][1]]
    for row in train:
        for j in range(2):
            if row[j] < lows[j]:
                lows[j] = row[j]
            if row[j] > highs[j]:
                highs[j] = row[j]

    for row in train:
        for j in range(2):
            span = highs[j] - lows[j]
            row[j] = (row[j] - lows[j]) / span if span else 0.0
    return lows, highs


def min_max_t(test, min, max):
    """Apply min-max scaling with the given extrema to ``test`` in place.

    ``min`` and ``max`` are per-column lists as returned by ``min_max``.
    The first two columns of every row become ``(x - min) / (max - min)``;
    a column whose ``max`` equals its ``min`` is mapped to ``0.0``.
    Nothing is returned.
    """
    for row in test:
        for j in range(2):
            span = max[j] - min[j]
            row[j] = (row[j] - min[j]) / span if span else 0.0
# Distance metrics
# Absolute-value (Manhattan, L1) distance
def l1_dis(sample1, sample2):
    """Return the L1 distance between the first two coordinates of two samples."""
    return abs(sample1[0] - sample2[0]) + abs(sample1[1] - sample2[1])
# Euclidean (L2) distance
def l2_dis(sample1, sample2):
    """Return the Euclidean distance between the first two coordinates of two samples."""
    dx = sample1[0] - sample2[0]
    dy = sample1[1] - sample2[1]
    squared = dx ** 2 + dy ** 2
    return sqrt(squared)
# Chebyshev (L-infinity) distance
def l00_dis(sample1, sample2):
    """Return the larger of the two per-coordinate absolute differences."""
    dx = abs(sample1[0] - sample2[0])
    dy = abs(sample1[1] - sample2[1])
    if dx > dy:
        return dx
    return dy
# Preprocessing: standardise the estimation set gj in place, then apply the
# statistics returned by a_v to the validation set yz.
a, v = a_v(gj)
a_v_t(yz,a,v)
# Alternative preprocessing (disabled): min-max scaling.
# NOTE: enabling the two lines below rebinds the names min and max at module
# level, hiding the builtins of the same name.
# min, max = min_max(gj)
# min_max_t(yz, min, max)
# Predict with the KNN model
def KNN(train, test, k, dis_function):
    """Classify ``test`` with k-nearest-neighbours and return the accuracy.

    Args:
        train: labelled samples; the last element of each sample is its label.
        test: labelled samples to classify; the last element of each sample
            is the true label used for scoring.
        k: number of nearest neighbours that vote (must be >= 1).
        dis_function: callable ``dis_function(test_sample, train_sample)``
            returning the distance between two samples.

    Returns:
        The fraction of ``test`` samples whose predicted label equals their
        true label, or ``0.0`` when ``test`` is empty.

    Raises:
        ValueError: if ``k`` is smaller than 1.

    The vote is binary: neighbours labelled 0 count for class 0, all other
    neighbours count for class 1.  A tied vote predicts class 0.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if not test:
        return 0.0

    rig = 0
    for test_sample in test:
        # sorted() is stable, so among equally distant training samples the
        # ones that appear first in ``train`` are preferred.
        ranked = sorted(
            train,
            key=lambda train_sample, query=test_sample: dis_function(
                query, train_sample
            ),
        )
        neighbours = ranked[:k]  # exactly k voters (fewer only if train is smaller)

        sum_0 = sum(1 for m in neighbours if m[-1] == 0)
        sum_1 = len(neighbours) - sum_0
        p = 0 if sum_0 >= sum_1 else 1

        if test_sample[-1] == p:
            rig += 1
    return rig / len(test)
# Print the validation accuracy of the L1-distance KNN for k = 1 .. 10,
# rounded to four decimal places.
for k in range(1, 11):
    print('k= ', k, ':')
    print(round(KNN(gj, yz, k, l1_dis), 4))
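# K-nearest-neighbour classification: a Python implementation of KNN
# classification, including several distance metrics and preprocessing methods.
# First published 2022-05-30 14:32:15.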