# Machine learning homework: the KNN algorithm
"""
date: 2020-3-21
description: knn分类算法的实现
by: jing
"""
import operator
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Euclidean distance is used as the distance metric of the KNN classifier.
breast_cancer_data = datasets.load_breast_cancer()
features = breast_cancer_data.data  # feature matrix, noted as [569][30] by the author
targets = breast_cancer_data.target
data_total = np.array(features, dtype=np.float64)
label_total = np.array(targets, dtype=np.float64)
# Remap class label 0 to -1 (labels become -1 / +1) to make analysis easier.
label_total[label_total == 0] = -1
# Split into training (90%) and test sets. Passing features and labels to a
# single call splits both arrays with the same shuffled indices, so rows and
# labels cannot drift out of alignment.
data_train, data_test, label_train, label_test = train_test_split(
    data_total, label_total, train_size=0.9, random_state=1
)
def knn(sample_data, data_tra, label_tra, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distance is the Euclidean distance between ``sample_data`` and each row
    of ``data_tra``.

    Args:
        sample_data: Feature vector of the sample to classify.
        data_tra: 2-D array of training samples, one sample per row.
        label_tra: Training labels, indexable by training row number.
        k: Number of nearest neighbours that vote. Must satisfy
            ``1 <= k <= number of training rows``.

    Returns:
        The label occurring most often among the k nearest neighbours. On a
        tie, the tied label that was counted first (walking the neighbours
        in the order produced by the distance sort) is returned.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            training rows.
    """
    data_tra = np.asarray(data_tra)
    train_row = data_tra.shape[0]
    if not 1 <= k <= train_row:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "({0}), got {1}".format(train_row, k)
        )

    # Broadcasting subtracts the sample from every training row without
    # building a tiled copy of the sample first.
    sq_dist = ((sample_data - data_tra) ** 2).sum(axis=1)  # row-wise sum
    distance = sq_dist ** 0.5

    # argsort yields training-row indices ordered from nearest to farthest.
    nearest = distance.argsort()[:k]

    count = {}
    for idx in nearest:
        label = label_tra[idx]
        count[label] = count.get(label, 0) + 1

    # max() returns the first maximal item, and dicts iterate in insertion
    # order, so ties go to the label that was counted first.
    return max(count.items(), key=operator.itemgetter(1))[0]
if __name__ == '__main__':
    # Report the test-set error rate of the classifier for k = 3 .. 19.
    num = len(label_test)  # test-set size does not change between runs
    for j in range(3, 20):
        error = 0
        # Walk test samples and their true labels in lockstep.
        for sample, expected in zip(data_test, label_test):
            if knn(sample, data_train, label_train, j) != expected:
                error += 1
        print("k为{0}时,错误率为 {1}".format(j, (error / num)))
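# This uses the breast cancer dataset; the resource is also available on CSDN.
# Mainly based on a reference article (the link is missing from this text).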