# Re-implementation of the KNN (k-nearest-neighbours) algorithm in Python:
import pandas as pd
import numpy as np
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    A lazy learner: ``fit`` only stores the training data; all work happens
    in ``get_classify``, which computes the Euclidean distance from each test
    row to every training row and predicts by majority vote among the
    ``topk`` nearest neighbours.
    """

    def __init__(self):
        # Populated by fit(); None until then.
        self.x_train: pd.DataFrame = None
        self.x_test: pd.DataFrame = None
        self.y_train: pd.Series = None

    def euclidean_distance(self, x1: pd.Series, x2: pd.Series) -> float:
        """Return the Euclidean distance between two feature rows."""
        # np.mat is deprecated (numpy.matrix); plain ndarrays suffice.
        diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
        return float(np.sqrt(np.sum(np.square(diff))))

    def fit(self, x_train, y_train):
        """Store the training features and labels (no actual fitting)."""
        self.x_train = x_train
        self.y_train = y_train

    def get_classify(self, x_test: pd.DataFrame, topk: int):
        """Predict a label for every row of ``x_test``.

        Parameters: x_test — feature rows to classify; topk — number of
        nearest neighbours that vote on each prediction.
        Returns a list of predicted labels, or None when the model is unfit,
        ``x_test`` is None, or ``topk`` is not positive.
        """
        x_train = self.x_train
        y_train = self.y_train
        if x_train is None or x_test is None or topk <= 0:
            return None
        # BUG FIX: train and test sets may have different numbers of rows;
        # only the feature (column) count has to agree.
        assert x_train.shape[1] == x_test.shape[1]
        label_classify = []
        for _, test_row in x_test.iterrows():
            # Brute force: distance from this test row to every training row.
            distances = [self.euclidean_distance(train_row, test_row)
                         for _, train_row in x_train.iterrows()]
            nearest = np.argsort(np.array(distances))[:topk]  # ascending
            # BUG FIX: collect only the actual neighbour labels. The original
            # pre-filled a list with zeros, which could win the vote whenever
            # topk exceeded the training-set size. Positional .iloc is used so
            # the lookup is correct for any Series index, not just 0..n-1.
            votes = [y_train.iloc[i] for i in nearest]
            label_classify.append(max(votes, key=votes.count))
        return label_classify
# 整理数据
data = pd.read_csv('测试数据集.csv', header=None)
test_data = pd.read_csv('验证数据集.csv', header=None)
x_train = data.drop(0, axis=1) # 测试数据集的特征
y_train = data[0] # 测试数据集的分类
x_test = test_data.drop(0, axis=1) # 验证数据集的特征
# 调用模型
knn_model = KNN()
knn_model.fit(x_train=x_train, y_train=y_train)
y_test = knn_model.get_classify(x_test=x_test, topk=20)
# 测试
y_test_true = test_data[0]
err = 0 # 表示错误的个数
for i, data in enumerate(y_test):
true_num = y_test_true[i]
if data != true_num:
err += 1