knn拟合sklearn中的iris数据集
class sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=1, **kwargs)
-
n_neighbors:选取几个邻居
-
weights:邻居的权重。'uniform' 表示所有邻居权重相同(平均投票);'distance' 表示按距离的倒数加权,距离越近的邻居话语权越大
-
algorithm:后面再展开
'ball_tree' will use BallTree; 'kd_tree' will use KDTree; 'brute' will use a brute-force search; 'auto' 是默认选项,会根据传给 fit 的数据自动选择最合适的算法
-
leaf_size:只有ball_tree和kd_tree才有必要
-
p与metric:距离度量。metric 默认为 'minkowski',p 是闵可夫斯基距离的幂参数:p=1 即曼哈顿距离(manhattan_distance),p=2 即欧氏距离(euclidean_distance)。
# k-nearest-neighbours on the iris data set: first with scikit-learn's
# KNeighborsClassifier, then with a small from-scratch implementation that
# reads the samples from a CSV file.
import csv
import math
import operator
import random
from collections import Counter


def sklearn_demo():
    """Fit scikit-learn's k-NN classifier on the bundled iris data.

    Prints and returns the predicted label for one hard-coded sample.
    """
    # Imported here so the pure-Python helpers below stay importable (and the
    # module has no import-time side effects) when scikit-learn is missing.
    from sklearn import neighbors
    from sklearn import datasets

    knn = neighbors.KNeighborsClassifier()
    iris = datasets.load_iris()
    knn.fit(iris.data, iris.target)
    predictedlabel = knn.predict([[0.1, 0.2, 0.3, 0.4]])
    print(predictedlabel)
    return predictedlabel


def loadDataset(filename, split, trainingSet=None, testSet=None):
    """Read a CSV data set and randomly split it into training and test rows.

    Every column except the last is converted to ``float``; the last column
    is kept as the class label. Blank lines are skipped.

    Args:
        filename: Path of the CSV file to read.
        split: Probability (0..1) that a row is put into the training set.
        trainingSet: Optional list that receives the training rows in place.
        testSet: Optional list that receives the test rows in place.

    Returns:
        The ``(trainingSet, testSet)`` pair (the passed-in lists when given).

    Raises:
        ValueError: If a feature column cannot be converted to ``float``.
    """
    # Fresh lists per call: a mutable default would be shared between calls.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []
    # newline='' is what the csv module expects from the file object.
    with open(filename, 'rt', newline='') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one).
                continue
            row[:-1] = [float(value) for value in row[:-1]]
            target = trainingSet if random.random() < split else testSet
            target.append(row)
    return trainingSet, testSet


def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance over the first ``length`` components."""
    squared = 0
    for i in range(length):
        squared += (instance1[i] - instance2[i]) ** 2
    return math.sqrt(squared)


def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` training rows closest to ``testInstance``.

    The last element of ``testInstance`` is treated as its label and ignored
    when measuring distance. If ``k`` exceeds the number of training rows,
    all rows are returned (nearest first); ``k <= 0`` yields an empty list.
    """
    length = len(testInstance) - 1
    scored = [
        (row, euclideanDistance(testInstance, row, length))
        for row in trainingSet
    ]
    # Stable sort: rows at equal distance keep their training-set order.
    scored.sort(key=operator.itemgetter(1))
    return [row for row, _ in scored[:max(k, 0)]]


def getResponse(neighbors):
    """Return the majority class label (last element) among ``neighbors``.

    Ties are resolved in favour of the label that was encountered first.

    Raises:
        IndexError: If ``neighbors`` is empty.
    """
    votes = Counter(row[-1] for row in neighbors)
    return votes.most_common(1)[0][0]


def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose label equals its prediction.

    An empty ``testSet`` gives ``0.0`` instead of dividing by zero.
    """
    total = len(testSet)
    if total == 0:
        return 0.0
    correct = sum(
        1 for row, predicted in zip(testSet, predictions)
        if row[-1] == predicted
    )
    return (correct / float(total)) * 100.0


def main(filename='irisdata.txt', split=0.8, k=3):
    """Load the data, classify every test row with k-NN and print accuracy.

    Args:
        filename: CSV file holding the samples (label in the last column).
        split: Probability that a row goes into the training set.
        k: Number of neighbours that vote on each prediction.
    """
    # Prepare data.
    trainingSet = []
    testSet = []
    loadDataset(filename, split, trainingSet, testSet)
    print('Train set: ' + repr(len(trainingSet)))
    print('Test set: ' + repr(len(testSet)))
    # Generate predictions.
    predictions = []
    for row in testSet:
        neighbors = getNeighbors(trainingSet, row, k)
        result = getResponse(neighbors)
        predictions.append(result)
        print('>predicted=' + repr(result) + ', actual=' + repr(row[-1]))
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: ' + repr(accuracy) + '%')


if __name__ == '__main__':
    sklearn_demo()
    main()