# Quick demo: fit scikit-learn's KNN classifier on the bundled iris dataset
# and predict the class of a single hand-written sample.
from sklearn import neighbors
from sklearn import datasets #scikit-learn's bundled sample datasets
knn = neighbors.KNeighborsClassifier() #use the KNN classifier
iris = datasets.load_iris() #iris sample data: four measurements per flower are used to identify its species
print(iris)
# Train on the whole dataset (no train/test split in this demo).
knn.fit(iris.data, iris.target)
# predict() takes a list of samples, hence the nested list for one sample.
predictedLabel = knn.predict([[0.1, 0.2, 0.3, 0.4]])
print('predictedLabel:', predictedLabel)
'''
Hand-written implementation of the KNN classification algorithm
'''
import csv
import random
import math
import operator
def loadDataset(filename, split, trainingSet=None, testSet=None):
    """Load a CSV dataset and randomly split its rows into two lists.

    The first four columns of every row are converted to ``float``; any
    remaining columns are left as the strings read from the file. Each row
    is appended to ``trainingSet`` when ``random.random() < split`` and to
    ``testSet`` otherwise, so the split sizes vary from run to run.

    Args:
        filename: Path of the CSV file to read.
        split: Threshold compared against ``random.random()``; roughly the
            fraction of rows that end up in ``trainingSet``.
        trainingSet: Optional list that is extended in place with the
            training rows. A new list is created when omitted.
        testSet: Optional list that is extended in place with the test
            rows. A new list is created when omitted.

    Returns:
        The ``(trainingSet, testSet)`` tuple, i.e. the same list objects
        that were passed in (or the newly created ones).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If one of the first four columns is not numeric.
        IndexError: If a non-empty row has fewer than four columns.
    """
    # Create fresh lists per call: a mutable default would be shared by
    # every call that omits the argument and silently accumulate rows.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    num_features = 4  # leading numeric columns in each row

    # newline='' is the mode the csv module documents for csv.reader input.
    with open(filename, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # Blank lines (typically a trailing one) parse as empty
                # rows; skip them instead of failing on row[0].
                continue
            for col in range(num_features):
                row[col] = float(row[col])
            # random.random() returns a value in [0, 1).
            if random.random() < split:
                trainingSet.append(row)
            else:
                testSet.append(row)
    return trainingSet, testSet
def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance between two instances.

    Only the first ``length`` positions of each instance are compared:
    the squared differences are accumulated and the square root of the
    total is returned.
    """
    squared_total = 0
    for idx in range(length):
        delta = instance1[idx] - instance2[idx]
        squared_total += delta ** 2
    return math.sqrt(squared_total)
def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` rows of ``trainingSet`` closest to ``testInstance``.

    Closeness is the Euclidean distance computed by ``euclideanDistance``
    over every position of ``testInstance`` except the last one.

    Args:
        trainingSet: Sequence of training rows.
        testInstance: The row to find neighbours for.
        k: Number of neighbours wanted.

    Returns:
        A list of at most ``k`` training rows, nearest first. Fewer rows
        are returned when the training set holds fewer than ``k`` rows,
        and an empty list when ``k`` is zero or negative.
    """
    # The last position is left out of the distance; in this file's data
    # rows it holds the class label rather than a numeric feature.
    length = len(testInstance) - 1
    distances = [
        (trainRow, euclideanDistance(testInstance, trainRow, length))
        for trainRow in trainingSet
    ]
    # list.sort is stable, so equidistant rows keep their training order.
    distances.sort(key=operator.itemgetter(1))
    # Slice rather than index so that k > len(trainingSet) no longer raises
    # IndexError; max() keeps a negative k returning an empty list.
    return [trainRow for trainRow, _ in distances[:max(k, 0)]]
def getResponse(neighbors):
    """Return the class label that occurs most often among ``neighbors``.

    The label of a neighbour is its last element. Labels are tallied in a
    dict and ranked by count with a stable descending sort, so among
    labels with equal counts the one tallied first is returned (on Python
    versions where dicts preserve insertion order).
    """
    tally = {}
    for neighbor in neighbors:
        label = neighbor[-1]  # last element is the class name
        tally[label] = tally.get(label, 0) + 1
    ranked = list(tally.items())  # (label, count) pairs
    ranked.sort(key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
def getAccuracy(testSet, predictions):
    """Return the percentage of test rows whose label was predicted correctly.

    Args:
        testSet: Sequence of rows whose last element is the actual label.
        predictions: Predicted labels, where ``predictions[i]`` belongs to
            ``testSet[i]``. Extra trailing predictions are ignored.

    Returns:
        A float between 0.0 and 100.0. An empty ``testSet`` yields 0.0
        rather than raising ZeroDivisionError, which matters because the
        random split can leave the test set empty.

    Raises:
        IndexError: If ``predictions`` is shorter than ``testSet``.
    """
    if not testSet:
        return 0.0
    # Index predictions explicitly so a too-short list still fails loudly
    # instead of being silently truncated as zip() would do.
    correct = sum(
        1 for idx, row in enumerate(testSet) if row[-1] == predictions[idx]
    )
    return (correct / float(len(testSet))) * 100.0
def main():
    """Run the hand-written KNN classifier on 'irisData.txt'.

    Splits the file into training and test rows, predicts a label for
    every test row from its 5 nearest training rows, prints each
    prediction next to the actual label, and finally prints the accuracy.
    """
    split_ratio = 0.67
    num_neighbors = 5

    # loadDataset fills both lists in place.
    train_rows, test_rows = [], []
    loadDataset('irisData.txt', split_ratio, train_rows, test_rows)
    print('Train set:', len(train_rows))
    print('Test set:', len(test_rows))

    predicted = []
    for sample in test_rows:
        nearest = getNeighbors(train_rows, sample, num_neighbors)
        label = getResponse(nearest)
        predicted.append(label)
        print('>prediction=' + repr(label) + ',actual=' + repr(sample[-1]))

    print('Accuracy:' + repr(getAccuracy(test_rows, predicted)) + '%')
# Run the hand-written KNN demo only when this file is executed as a script.
if __name__ == "__main__":
    main()