本文非原创,原博主地址
目的
原生python实现knn分类算法(使用鸢尾花数据集)
KNN算法的核心思想是:要确定测试样本属于哪一类,就在所有训练样本中寻找与该测试样本“距离”最近的前K个样本,然后看这K个样本大部分属于哪一类,就认为这个测试样本也属于这一类。简单地说,就是让最相似的K个样本来投票决定。因此,实现 K 近邻算法时,主要考虑的问题是如何对训练数据进行快速 K 近邻搜索。
算法设计
源码展示
# -*- coding: utf-8 -*-
import string
import csv # 用于处理csv文件
import random # 用于随机数
import math
import operator #
# Load the dataset and split it into a training set and a test set
def loadDataset(filename, split, trainingSet=None, testSet=None):
    """Read a comma-separated data file and randomly split its rows.

    Each non-blank line is split on commas. Every column except the last is
    converted to ``float``; the last column is kept as the string class
    label. A row is appended to ``trainingSet`` when ``random.random()`` is
    below ``split`` and to ``testSet`` otherwise.

    Args:
        filename: Path of the text file to read.
        split: Threshold compared against ``random.random()``; rows whose
            draw is below it go to the training set.
        trainingSet: Optional list extended in place with training rows.
        testSet: Optional list extended in place with test rows.

    Returns:
        The ``(trainingSet, testSet)`` pair, i.e. the lists passed in or
        freshly created ones when they were omitted.

    Raises:
        ValueError: If a feature column cannot be converted to ``float``.
    """
    # Create fresh lists per call; a mutable default would be shared by
    # every call that omits the argument.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # The context manager guarantees the file is closed.
    with open(filename, "r") as handle:
        for line in handle:
            # strip() removes whatever line terminator is present instead of
            # blindly cutting two characters, which could truncate the label.
            line = line.strip()
            if not line:
                # Skip blank lines explicitly so that no real row (including
                # the last one) has to be dropped to avoid them.
                continue
            fields = line.split(",")
            sample = [float(value) for value in fields[:-1]]
            sample.append(fields[-1])
            if random.random() < split:
                trainingSet.append(sample)
            else:
                testSet.append(sample)
    return trainingSet, testSet
# Compute the distance between two samples
def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance over the first ``length`` components.

    Args:
        instance1: First indexable sample.
        instance2: Second indexable sample.
        length: Number of leading components to compare; anything at or
            beyond this index (e.g. a trailing label) is ignored.

    Returns:
        The square root of the summed squared component differences.
    """
    squared_total = 0
    for position in range(length):
        delta = instance1[position] - instance2[position]
        squared_total += delta ** 2
    return math.sqrt(squared_total)
# Return the k nearest neighbours
def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` training instances closest to ``testInstance``.

    Distances are computed with ``euclideanDistance`` over every component
    of ``testInstance`` except the last one.

    Args:
        trainingSet: Sequence of training instances.
        testInstance: The instance to find neighbours for.
        k: Number of neighbours requested.

    Returns:
        A list of training instances ordered from nearest to farthest. It
        holds at most ``k`` items: when ``k`` exceeds the number of training
        instances, all of them are returned instead of raising IndexError.
    """
    # The last component is excluded from the distance computation.
    length = len(testInstance) - 1
    distances = [
        (candidate, euclideanDistance(testInstance, candidate, length))
        for candidate in trainingSet
    ]
    # Sort ascending by distance; the sort is stable, so equally distant
    # instances keep their training-set order.
    distances.sort(key=operator.itemgetter(1))
    # Slicing caps k at the available number of instances; max() keeps a
    # negative k yielding an empty list, as range(k) did before.
    return [candidate for candidate, _ in distances[:max(k, 0)]]
# Tally the neighbours' labels and return the label with the most votes
def getResponse(neighbors):
    """Return the most frequent label among ``neighbors``.

    The label of each neighbour is its last element. When several labels
    share the highest count, the one encountered first wins, because the
    sort is stable and the tally keeps insertion order.

    Args:
        neighbors: Sequence of instances whose last element is the label.

    Returns:
        The label with the highest vote count.

    Raises:
        IndexError: If ``neighbors`` is empty.
    """
    tally = {}
    for neighbor in neighbors:
        label = neighbor[-1]
        tally[label] = tally.get(label, 0) + 1
    # Highest vote count first.
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    winner = ranked[0]
    return winner[0]
# Compute the classification accuracy
def getAccuracy(testSet, predictions):
    """Return the percentage of test rows whose label was predicted correctly.

    Args:
        testSet: Sequence of instances whose last element is the true label.
        predictions: Predicted labels, aligned by index with ``testSet``.

    Returns:
        The accuracy as a float between 0.0 and 100.0. An empty ``testSet``
        yields 0.0 instead of raising ZeroDivisionError.

    Raises:
        IndexError: If ``predictions`` is shorter than ``testSet``.
    """
    if not testSet:
        # With no samples there is nothing to divide by; report 0.0 so a
        # degenerate split does not crash the caller.
        return 0.0
    correct = sum(
        1
        for index, row in enumerate(testSet)
        if row[-1] == predictions[index]
    )
    return (correct / float(len(testSet))) * 100.0
def main():
    """Run the KNN demo on ``iris.txt`` and print the results.

    Loads and splits the data, predicts a label for every test row from its
    3 nearest training rows, prints each prediction next to the actual
    label, and finally prints the overall accuracy.
    """
    split = 0.67  # threshold deciding whether a row goes to the training set
    k = 3  # number of neighbours that vote
    trainingSet, testSet = [], []
    loadDataset(r"iris.txt", split, trainingSet, testSet)
    print("Train set :" + repr(len(trainingSet)))
    print("Test set :" + repr(len(testSet)))

    predictions = []
    for sample in testSet:
        result = getResponse(getNeighbors(trainingSet, sample, k))
        predictions.append(result)
        print(">predicted = " + repr(result) + ",actual = " + repr(sample[-1]))

    accuracy = getAccuracy(testSet, predictions)
    print("Accuracy:" + repr(accuracy) + "%")


# Only run the demo when executed as a script, not when imported.
if __name__ == "__main__":
    main()
结果展示