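# The code below implements the algorithm described in the article at
# http://blog.csdn.net/bdss58/article/details/40928827 ; see that article for
# an explanation of the algorithm, which is not repeated here.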
__author__ = 'jianyong'
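# Load the data from a CSV file and divide it into a training set and a test
# set; each row is put in the training set with probability `split`.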
import csv
import random
def loadDataset(filename, split, trainingset=None, testset=None):
    """Load a CSV dataset and randomly partition its rows into two lists.

    Every column except the last one is converted to ``float``; the last
    column is kept unchanged as the row's label. A row is appended to
    ``trainingset`` when ``random.random() < split`` and to ``testset``
    otherwise. Empty rows (such as a trailing blank line) are skipped.

    Args:
        filename: Path of the CSV file to read.
        split: Probability, between 0 and 1, that a row goes to the
            training set.
        trainingset: Optional list that training rows are appended to.
            A new list is created when omitted.
        testset: Optional list that test rows are appended to. A new list
            is created when omitted.

    Returns:
        The tuple ``(trainingset, testset)``. When lists were passed in,
        these are the same (mutated) objects.

    Raises:
        ValueError: If a feature column cannot be converted to ``float``.
    """
    # Create fresh lists per call; a mutable default would be shared by
    # every call that omits the argument.
    if trainingset is None:
        trainingset = []
    if testset is None:
        testset = []

    # Plain text mode works with csv.reader on both Python 2 and Python 3;
    # 'rb' yields bytes on Python 3, which csv.reader rejects.
    with open(filename, 'r') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # A blank line carries no sample.
                continue
            # All columns but the last are numeric features.
            sample = [float(value) for value in row[:-1]] + row[-1:]
            if random.random() < split:
                trainingset.append(sample)
            else:
                testset.append(sample)
    return trainingset, testset
# 计算欧几里得几何距离
import math
def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance between two instances.

    Only the first ``length`` components of each instance are compared, so
    trailing non-numeric fields (for example a class label) can be excluded
    by the caller.

    Args:
        instance1: Indexable sequence whose first ``length`` items are numeric.
        instance2: Indexable sequence whose first ``length`` items are numeric.
        length: Number of leading components to include.

    Returns:
        The square root of the sum of squared component differences, as a
        float (``0.0`` when ``length`` is 0).

    Raises:
        IndexError: If either instance has fewer than ``length`` items.
    """
    squared_sum = 0.0
    for index in range(length):
        difference = instance1[index] - instance2[index]
        squared_sum += difference * difference
    # The square root turns the accumulated squared sum into a distance.
    return math.sqrt(squared_sum)
# 根据欧几里得距离确定k个邻居
import operator
def getNeighbors(trainingset, testinstance, k):
    """Return the ``k`` training instances closest to ``testinstance``.

    Distances are computed with ``euclideanDistance`` over every component
    of ``testinstance`` except the last one.

    Args:
        trainingset: Sequence of training instances.
        testinstance: Instance to find neighbours for; its last element is
            not used in the distance.
        k: Maximum number of neighbours to return.

    Returns:
        A list of at most ``k`` training instances ordered from nearest to
        farthest. Fewer than ``k`` are returned when the training set is
        smaller than ``k``; an empty list is returned when ``k`` is not
        positive.
    """
    # The last element is excluded from the distance computation.
    feature_count = len(testinstance) - 1
    distances = [
        (candidate, euclideanDistance(candidate, testinstance, feature_count))
        for candidate in trainingset
    ]
    # Sort on the distance only; the sort is stable, so equally distant
    # candidates keep their training-set order.
    distances.sort(key=operator.itemgetter(1))
    # Slicing (rather than indexing 0..k-1) tolerates k > len(trainingset).
    return [candidate for candidate, _ in distances[:max(k, 0)]]
# Find the majority class label among the neighbours
def getResponse(neighbors):
    """Return the label that occurs most often among ``neighbors``.

    The label of a neighbour is its last element.

    Args:
        neighbors: Non-empty sequence of instances.

    Returns:
        The label with the highest vote count. When several labels tie,
        the one that comes first in the vote dict's iteration order wins.

    Raises:
        ValueError: If ``neighbors`` is empty, since no vote can be held.
    """
    if not neighbors:
        raise ValueError('getResponse() requires at least one neighbor')

    votes = {}
    for neighbor in neighbors:
        label = neighbor[-1]
        votes[label] = votes.get(label, 0) + 1
    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    winner = max(votes.items(), key=operator.itemgetter(1))
    return winner[0]
# 计算正确率
def getAccuracy(testset, predictions):
    """Return the percentage of test instances that were predicted correctly.

    The true label of a test instance is its last element; it is compared
    with the prediction at the same position.

    Args:
        testset: Sequence of test instances.
        predictions: Sequence of predicted labels, aligned with ``testset``.
            Test instances without a matching prediction count as wrong.

    Returns:
        A float between 0.0 and 100.0; ``0.0`` when ``testset`` is empty.
    """
    if not testset:
        # Avoid dividing by zero when there is nothing to score.
        return 0.0
    # Compare with ==: `is` tests object identity, and two equal label
    # strings are not guaranteed to be the same object.
    correct = sum(
        1
        for instance, predicted in zip(testset, predictions)
        if instance[-1] == predicted
    )
    return (correct / float(len(testset))) * 100.0
# 运行一下试试
def main():
    """Run the classifier on ``iris.data`` and print the results.

    Loads the dataset with a 0.67 training split, predicts the label of
    every test instance from its 3 nearest training neighbours, prints
    each prediction next to the actual label, and finally prints the
    overall accuracy.
    """
    trainingset = []
    testset = []
    split = 0.67
    loadDataset('iris.data', split, trainingset, testset)
    # Parenthesised single-argument prints behave the same on Python 2
    # and Python 3.
    print('training set:' + repr(len(trainingset)))
    print('test set:' + repr(len(testset)))

    predictions = []
    k = 3
    for instance in testset:
        neighbors = getNeighbors(trainingset, instance, k)
        result = getResponse(neighbors)
        predictions.append(result)
        print('predicted:' + repr(result) + ',actual:' + repr(instance[-1]))

    accuracy = getAccuracy(testset, predictions)
    print('accuracy:' + repr(accuracy) + '%')