import csv
import random
from math import sqrt
# Randomly split a data set into a train part and a test part, stratified
# by class: roughly the fraction `n` of every class is moved to the test set.
# data_sample: list of samples; the last element of each sample is its label.
# n: fraction (0..1) of each class to draw into the test set.
# Returns (train_part, test_part).
def random_divide(data_sample, n):
    class_counts = {}  # remaining count of each class still in train_part
    train_part = list(data_sample)
    test_part = []
    for sample in data_sample:
        class_counts[sample[-1]] = class_counts.get(sample[-1], 0) + 1
    for label, total in class_counts.items():
        take = int(total * n)  # how many samples of this class to extract
        for _ in range(take):
            # pick the index (within this class) of the sample to extract
            target = random.randint(0, class_counts[label] - 1)
            seen = -1
            for position, candidate in enumerate(train_part):
                if candidate[-1] == label:
                    seen += 1
                    if seen == target:
                        test_part.append(train_part.pop(position))
                        class_counts[label] -= 1
                        break
    return train_part, test_part
# Preprocess the training set with z-score normalization (zero mean,
# unit standard deviation), rounding to 3 decimals at the same points as
# the original code so results are unchanged.
# train: list of samples; every element but the last is a numeric feature,
#        the last element is the class label.
# Returns (train_p, average_variance) where average_variance[j] is
# [mean, std] for feature j.
# NOTE(review): a constant feature gives std == 0 and the normalization
# below would raise ZeroDivisionError — confirm inputs never do that.
# Fix applied: the original shadowed the builtin `sum` with a local.
def train_pretreatment(train):
    average_variance = []
    n_features = len(train[0]) - 1
    n_samples = len(train)
    for j in range(n_features):
        column = [sample[j] for sample in train]
        mean = round(sum(column) / n_samples, 3)  # mean is rounded first...
        # ...and the spread is computed against the *rounded* mean,
        # exactly as before.
        sq_total = sum((mean - value) ** 2 for value in column)
        std = round(sqrt(sq_total / n_samples), 3)
        average_variance.append([mean, std])
    train_p = []
    for sample in train:
        normalized = [
            round((sample[j] - average_variance[j][0]) / average_variance[j][1], 3)
            for j in range(n_features)
        ]
        normalized.append(sample[-1])  # class label passes through unchanged
        train_p.append(normalized)
    return train_p, average_variance
#对测试集的预处理
def test_pretreatment(test, average_variance):
test_p = []
for i in range(len(test)):
p = []#每个测试样本的预处理
for j in range(len(test[i])-1):
p.append(round((test[i][j] - average_variance[j][0]) / average_variance[j][1], 3))#第i个样本的第j个特征的预处理
p.append(test[i][-1])
test_p.append(p)
return test_p
# Find the k nearest training samples to test_sample (Euclidean distance).
# Returns a dict of at most k entries: tuple(training sample) -> distance.
# NOTE(review): because samples are stored as dict keys, duplicate training
# samples collapse into a single entry, so fewer than k neighbors may be
# returned — confirm the data set has no exact duplicates.
# Fix applied: the original shadowed the builtin `sum` with a local.
def found_k(test_sample, train_p, k):
    num_k = {}  # nearest samples seen so far: tuple(sample) -> distance
    for i in range(len(train_p)):
        sq_sum = 0  # sum of squared per-feature differences
        for j in range(len(test_sample) - 1):
            sq_sum += (test_sample[j] - train_p[i][j]) ** 2
        distance = round(sqrt(sq_sum), 3)  # Euclidean distance to sample i
        if len(num_k) < k:
            # not full yet: insert unconditionally
            num_k[tuple(train_p[i])] = distance
        else:
            # full: locate the current farthest neighbor...
            tempk = ()
            tempv = distance
            for item in num_k.items():
                if tempv < item[1]:
                    tempk = item[0]
                    tempv = item[1]
            # ...and replace it only if it is farther than the new sample
            if tempv != distance:
                del num_k[tempk]
                num_k[tuple(train_p[i])] = distance
    return num_k
# Tally the class labels among the k nearest neighbors (majority-vote prep).
# num_k: dict mapping tuple(sample) -> distance; a sample's last element
#        is its class label.
# Returns {label: occurrence count}.
def count_class(num_k):
    tally = {}
    for neighbor in num_k:
        label = neighbor[-1]
        tally[label] = tally.get(label, 0) + 1
    return tally
# Predict the class by majority vote over the neighbor label counts.
# Ties go to the label iterated last, matching the original `<=` comparison.
# Returns '' for an empty count dict.
def class_p(count_class):
    winner = ''
    best = 0
    for label, votes in count_class.items():
        if votes >= best:
            winner = label
            best = votes
    return winner
# Run the full pipeline (found_k -> count_class -> class_p) over every
# test sample and return the misclassification rate rounded to 5 decimals.
# test_p: preprocessed test set; train_p: preprocessed training set;
# k: number of neighbors.
def accomplish_p(test_p, train_p, k):
    predictions = []
    for sample in test_p:
        neighbors = found_k(sample, train_p, k)
        votes = count_class(neighbors)
        predictions.append(class_p(votes))
    wrong = 0
    for sample, predicted in zip(test_p, predictions):
        if sample[-1] != predicted:
            wrong += 1
    return round(wrong / len(test_p), 5)
# Pick the k for the k-NN classifier via 10-fold cross-validation: split
# train_data into 10 class-stratified folds, then for each k in 1..9 train
# on 9 folds and measure the error on the held-out fold; keep the k with
# the lowest mean error (ties broken by the smaller spread).
def chose_k(train_data):
    train_data_temp = []  # samples remaining after folds have been drawn
    train_data_k = []  # the 10 drawn folds (list of sample lists)
    good_k = -1  # best k found so far
    err_a = 1  # mean error rate of the best k
    err_v = 1  # spread of the fold errors for the best k
    for i in range(len(train_data)):
        train_data_temp.append(train_data[i])
    n = int(len(train_data) / 10)  # target fold size
    for i in range(10):
        # draw roughly n samples (class-stratified) from what is left
        train_data_temp, train_data_test = random_divide(train_data_temp, n / len(train_data_temp))
        train_data_k.append(train_data_test)
        # NOTE(review): any samples still in train_data_temp after the 10
        # draws are silently discarded from the CV — confirm intended.
    for k in range(1, 10):
        errk = 0  # summed error rate over the 10 folds
        errks = []  # per-fold error rates
        errks_sum = 0  # accumulator for squared deviations from the mean
        for i in range(10):
            train_data_test = train_data_k[i]  # held-out fold
            train_data_train = []  # the other 9 folds, flattened
            for j in range(10):
                if (j != i):
                    for temp in train_data_k[j]:
                        train_data_train.append(temp)
            # normalize with statistics of the training folds only
            train_data_train_p, train_data_train_average_variance = train_pretreatment(train_data_train)
            train_data_test_p = test_pretreatment(train_data_test, train_data_train_average_variance)
            err_temp = accomplish_p(train_data_test_p, train_data_train_p, k)
            errk += err_temp
            errks.append(err_temp)
        errk_a = round(errk / 10, 6)  # mean error rate for this k
        for i in range(10):
            errks_sum += (errk_a - errks[i]) ** 2
        # NOTE(review): called "variance" in the prints, but this is
        # sqrt of the (un-averaged) sum of squared deviations.
        errk_v = round(sqrt(errks_sum), 6)
        print("k=%d, 错误率平均:%f,方差:%f" % (k, errk_a, errk_v))
        # prefer lower mean error; break exact ties by lower spread
        if err_a > errk_a:
            good_k = k
            err_a = errk_a
            err_v = errk_v
        elif err_a == errk_a:
            if err_v > errk_v:
                good_k = k
                err_a = errk_a
                err_v = errk_v
    print("k选取%d最合适,错误率平均:%f,方差:%f" % (good_k, err_a, err_v))
    return good_k
# --- Driver script ---
# Loads the iris CSV (rows: 4 numeric features + class label), splits it
# 80/20 into train/test, chooses k by cross-validation, normalizes both
# sets with the training statistics, and reports the test error rate.
# Fix applied: the original called csv.reader(open(...)) and never closed
# the file; a `with` block now guarantees the handle is released.
print("---------------------获取样本集--------------------------")
iris_data = []
with open("D:/iris.csv", "r") as iris_file:
    for row in csv.reader(iris_file):
        for j in range(len(row) - 1):
            row[j] = float(row[j])  # features arrive as strings
        iris_data.append(row)
iris_class = []  # distinct class labels, in first-seen order
for iris_sample in iris_data:
    if iris_sample[-1] not in iris_class:
        iris_class.append(iris_sample[-1])
print("iris的样本数量:", len(iris_data))
print("iris的特征维数:", len(iris_data[0])-1)
print("iris的类别情况:", iris_class)
print("---------------------随机划分----------------------------")
iris_train, iris_test = random_divide(iris_data, 0.2)
print("len(iris_data) = ", len(iris_data))
print("len(iris_train) = ", len(iris_train))
print("len(iris_test) = ", len(iris_test))
print("----------------------选取k的值--------------------------")
k = chose_k(iris_train)
print("--------------------训练集预处理---------------------------")
iris_train_p, iris_average_variance = train_pretreatment(iris_train)
print("iris_train_p = ", iris_train_p)
print("len(iris_train_p) = ", len(iris_train_p))
print("iris_average_variance = ", iris_average_variance)
print("-------------------测试集预处理-----------------------------")
iris_test_p = test_pretreatment(iris_test, iris_average_variance)
print("iris_test_p = ", iris_test_p)
print("len(iris_test_p) = ", len(iris_test_p))
print("-------------------对测试集进行预测--------------------------")
err = accomplish_p(iris_test_p, iris_train_p, k)
print("错误率 = ", err)
# KNN--用python原生代码实现k近邻分类模型
# 最新推荐文章于 2022-10-26 14:48:42 发布