最近邻 Nearest Neighbors
1、利用自带鸢尾花数据简单实现最近邻算法
from sklearn import neighbors
from sklearn import datasets
# Build a k-nearest-neighbours classifier using scikit-learn's default arguments.
knn = neighbors.KNeighborsClassifier()
# Load the iris data set bundled with scikit-learn.
iris = datasets.load_iris()
# Fit the model with iris.data as the training samples and iris.target as the labels.
knn.fit(iris.data, iris.target)
# Predict the class label of one sample, given as
# [sepal length, sepal width, petal length, petal width].
predictedLabel = knn.predict([[0.1, 0.2, 0.3, 0.4]])
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(predictedLabel)
print(iris.target_names[predictedLabel])
预测[萼片长度,萼片宽度,花瓣长度,花瓣宽度]分别为[0.1,0.2,0.3,0.4](cm)的样本所对应的类别
打印结果为:
[0]
['setosa']
即预测结果为setosa(山鸢尾)。
2、独立实现最近邻算法
利用鸢尾花数据集进行测试
#!/usr/bin/python
# -*- coding: utf-8 -*-
import csv
import random
import math
import operator
def loadDataset(filename, split, trainingSet=None, testSet=None):
    """Read a CSV data set and randomly split its rows into two lists.

    The first four columns of every row are converted to ``float``; any
    further columns are kept exactly as read. A row is appended to
    ``trainingSet`` when ``random.random() < split`` and to ``testSet``
    otherwise.

    Args:
        filename: Path of the CSV file to read.
        split: Threshold compared against ``random.random()``; larger values
            send more rows to the training set.
        trainingSet: List that training rows are appended to. A new list is
            created when omitted.
        testSet: List that test rows are appended to. A new list is created
            when omitted.

    Returns:
        The ``(trainingSet, testSet)`` pair, so callers that do not pass
        their own lists can still obtain the result.

    Raises:
        ValueError: If one of the first four fields of a row is not numeric.
        IndexError: If a non-empty row has fewer than four fields.
    """
    import sys  # local import: only needed to pick the file mode below

    # Create fresh lists per call; mutable default arguments would be shared
    # between calls and keep accumulating rows.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # The csv module expects a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(filename, 'rb')
    else:
        csvfile = open(filename, 'r', newline='')

    with csvfile:
        for row in csv.reader(csvfile):
            # Blank lines (e.g. a trailing empty line) come back as empty
            # rows. Skip them instead of blindly dropping the last row,
            # which loses a real sample when the file has no blank line.
            if not row:
                continue
            for col in range(4):
                row[col] = float(row[col])
            if random.random() < split:
                # Row goes to the training data.
                trainingSet.append(row)
            else:
                # Row goes to the test data.
                testSet.append(row)
    return trainingSet, testSet
def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance between two instances.

    Only the first ``length`` components of each instance are compared, so
    trailing fields beyond that index are never touched.

    Args:
        instance1: Indexable sequence whose first ``length`` items are numbers.
        instance2: Indexable sequence whose first ``length`` items are numbers.
        length: Number of leading components to include in the distance.

    Returns:
        The square root of the sum of squared component differences, as a
        float (``0.0`` when ``length`` is 0).

    Raises:
        IndexError: If either instance has fewer than ``length`` components.
    """
    # Sum of squared per-dimension differences over the first `length` axes.
    squared_sum = sum(
        pow(instance1[i] - instance2[i], 2) for i in range(length)
    )
    return math.sqrt(squared_sum)
def getNeighobors(trainingSet, testInstance, k):
    """Return the ``k`` training instances closest to ``testInstance``.

    Distances are computed with ``euclideanDistance`` over every component
    of ``testInstance`` except the last one.

    Args:
        trainingSet: Sequence of training instances.
        testInstance: The instance whose neighbours are wanted.
        k: Maximum number of neighbours to return.

    Returns:
        A list of at most ``k`` instances from ``trainingSet``, ordered from
        nearest to farthest. Fewer than ``k`` are returned when the training
        set is smaller than ``k``; an empty list when ``k`` is not positive.
    """
    # The final element of an instance is left out of the distance
    # (getResponse reads that element as the class label).
    length = len(testInstance) - 1
    # Pair every training instance with its distance to testInstance.
    distances = [
        (candidate, euclideanDistance(testInstance, candidate, length))
        for candidate in trainingSet
    ]
    # Stable sort by distance: equally distant instances keep training order.
    distances.sort(key=operator.itemgetter(1))
    # Slicing instead of indexing 0..k-1 tolerates k > len(trainingSet);
    # max() keeps a negative k from slicing from the end.
    return [candidate for candidate, _ in distances[:max(k, 0)]]
def getResponse(neighbors):
    """Return the class label that occurs most often among ``neighbors``.

    The label of a neighbour is its last element.

    Args:
        neighbors: Non-empty sequence of instances.

    Returns:
        The label with the highest vote count.

    Raises:
        IndexError: If ``neighbors`` is empty.
    """
    # Count one vote per neighbour for its label.
    classVotes = {}
    for neighbor in neighbors:
        label = neighbor[-1]
        classVotes[label] = classVotes.get(label, 0) + 1
    # Order labels by vote count, highest first. dict.items() exists on both
    # Python 2 and Python 3, whereas iteritems() is Python 2 only.
    sortedVotes = sorted(
        classVotes.items(), key=operator.itemgetter(1), reverse=True
    )
    # The first entry holds the winning label.
    return sortedVotes[0][0]
def getAccuracy(testSet, predictions):
    """Return the percentage of test instances whose label was predicted.

    The true label of a test instance is its last element; it is compared
    with the prediction at the same index.

    Args:
        testSet: Sequence of test instances.
        predictions: Sequence of predicted labels, index-aligned with
            ``testSet``.

    Returns:
        The accuracy as a float between 0.0 and 100.0. An empty ``testSet``
        yields 0.0 instead of raising ``ZeroDivisionError``.

    Raises:
        IndexError: If ``predictions`` is shorter than ``testSet``.
    """
    total = len(testSet)
    if total == 0:
        # Nothing was evaluated, so there is no ratio to compute.
        return 0.0
    # Number of instances whose true label equals the prediction.
    correct = sum(
        1 for i, instance in enumerate(testSet)
        if instance[-1] == predictions[i]
    )
    # float() keeps this a true division on Python 2 as well.
    return (correct / float(total)) * 100.0
def main():
    """Run the k-nearest-neighbours demo on ``irisdata.txt``.

    Loads the data, splits it randomly into training and test sets,
    classifies every test instance from its 3 nearest training instances,
    and prints each prediction followed by the overall accuracy.
    """
    trainingSet = []
    testSet = []
    # Threshold passed to loadDataset to decide which set each row joins.
    split = 0.67
    # Load the data into the (initially empty) training and test lists.
    loadDataset(r'irisdata.txt', split, trainingSet, testSet)
    # print(...) with a single argument works on both Python 2 and Python 3.
    print('Train set:' + repr(len(trainingSet)))
    print('Test set:' + repr(len(testSet)))
    predictions = []
    # Use the 3 nearest neighbours.
    k = 3
    for instance in testSet:
        # Find the neighbours of this test instance and let them vote.
        neighbors = getNeighobors(trainingSet, instance, k)
        result = getResponse(neighbors)
        predictions.append(result)
        print('>predicted=' + repr(result) + ',actual=' + repr(instance[-1]))
    # Compare the predictions with the true labels of the test set.
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy:' + repr(accuracy) + '%')


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
输出:
Train set:100
Test set:50
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-setosa',actual='Iris-setosa'
>predicted='Iris-versicolor',actual='Iris-versicolor'
>predicted='Iris-versicolor',actual='Iris-versicolor'
>predicted='Iris-versicolor',actual='Iris-versicolor'
>predicted='Iris-versicolor',actual='Iris-versicolor'
>predicted='Iris-versicolor',actual='Iris-versicolor'
>predicted='Iris-versicolor',actual='Iris-versicolor'
>predicted='Iris-versicolor',actual='Iris-versicolor'
>predicted='Iris-versicolor',actual='Iris-versicolor'
>predicted='Iris-virginica',actual='Iris-versicolor'
>predicted='Iris-versicolor',actual='Iris-versicolor'
>predicted='Iris-versicolor',actual='Iris-versicolor'
>predicted='Iris-versicolor',actual='Iris-versicolor'
>predicted='Iris-versicolor',actual='Iris-versicolor'
>predicted='Iris-versicolor',actual='Iris-versicolor'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
>predicted='Iris-virginica',actual='Iris-virginica'
Accuracy:98.0%
本次运行中,50个测试样本里有49个预测正确,因此最终预测正确率为98.0%。由于训练集和测试集是随机划分的,每次运行的结果可能略有不同。