# Multi-class KNN learning notes
# Adapted from: https://blog.csdn.net/pengjunlee/article/details/82713047
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# Generate a 5-class synthetic dataset; make_blobs returns an (X, y) tuple.
data = make_blobs(n_samples=500, n_features=2, centers=5, cluster_std=1.0, random_state=8)
X, Y = data  # X is a 500x2 feature matrix, Y holds the integer class labels
# Visualize the raw dataset (uncomment to inspect):
# plt.scatter(X[:, 0], X[:, 1], s=50, c=Y, cmap=plt.cm.spring, edgecolors='k')
# plt.show()
clf = KNeighborsClassifier(n_neighbors=7)
clf.fit(X, Y)
# Build a dense grid covering the data range (1-unit margin) so the decision
# regions can be painted with pcolormesh.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02), np.arange(y_min, y_max, .02))
z = clf.predict(np.c_[xx.ravel(), yy.ravel()])  # predict every grid point
z = z.reshape(xx.shape)
plt.pcolormesh(xx, yy, z, shading='auto', cmap='Greens')
plt.scatter(X[:, 0], X[:, 1], s=50, c=Y, cmap=plt.cm.spring, edgecolors='k')
plt.grid(True)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier:KNN")
# Mark the point to classify with a star.
plt.scatter(0, 5, marker='*', c='red', s=200)
# predict returns an array; take the scalar element so the annotation shows
# the label itself rather than the array repr "[n]".
res = clf.predict([[0, 5]])
plt.text(0.2, 4.6, 'Classification flag: ' + str(res[0]))
plt.text(3.75, -13, 'Model accuracy: {:.2f}'.format(clf.score(X, Y)))
plt.show()
# Example 2: KNN binary classification implemented with custom functions
import math
import csv
import operator
import random
# import numpy as np
# from sklearn.datasets import make_blobs
# 生成样本数据集 samples(样本数量) features(特征向量的维度) centers(类别个数)
# def createDataSet(samples=100, features=2, centers=2):
# return make_blobs(n_samples=samples, n_features=features, centers=centers, cluster_std=1.0, random_state=8)
# 加载鸢尾花卉数据集 filename(数据集文件存放路径),作出初步处理
def loadIrisDataset(filename):
    """Load the iris dataset from the CSV file at *filename*.

    Returns a list of rows of the form [f0, f1, f2, f3, label] where the
    four feature columns are converted to float and the label stays a string.

    The original code unconditionally popped the last row to discard the
    trailing blank line some copies of iris.data end with; that silently
    dropped a real sample whenever the file had no trailing blank line.
    Filtering out empty rows handles both cases correctly.
    """
    with open(filename, 'r') as csvfile:
        # csv.reader yields [] for blank lines; drop them wherever they occur.
        dataset = [row for row in csv.reader(csvfile) if row]
    for row in dataset:  # csv yields strings; the feature columns must be float
        for y in range(4):
            row[y] = float(row[y])
    return dataset
# 拆分数据集 dataset(要拆分的数据集) split(训练集所占比例) trainingSet(训练集) testSet(测试集)
def splitDataSet(dataSet, split, trainingSet=None, testSet=None):
    """Randomly partition *dataSet* into a training and a test set.

    Each row lands in the training set with probability *split* (roughly the
    training fraction) and in the test set otherwise.

    Parameters
    ----------
    dataSet : list
        Rows to partition.
    split : float
        Probability in [0, 1] that a row goes to the training set.
    trainingSet, testSet : list, optional
        Output lists appended to in place; fresh lists are created when omitted.

    Returns
    -------
    tuple(list, list)
        (trainingSet, testSet), useful when the output lists were omitted.
    """
    # Mutable default arguments ([]) are shared across calls in Python, so
    # repeated no-arg calls would keep appending to the same lists; use None
    # sentinels and allocate fresh lists per call instead.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []
    for row in dataSet:
        if random.random() <= split:
            trainingSet.append(row)
        else:
            testSet.append(row)
    return trainingSet, testSet
# 计算欧氏距离
def euclideanDistance(instance1, instance2, length):
    """Euclidean distance over the first *length* coordinates of two points."""
    squared_diffs = [
        (a - b) ** 2 for a, b in zip(instance1[:length], instance2[:length])
    ]
    return math.sqrt(sum(squared_diffs))
# 选取距离最近的K个实例
def getNeighbors(trainingSet, testInstance, k):
    """Return the k training samples closest (Euclidean) to *testInstance*.

    The last element of testInstance is taken to be the class label and is
    excluded from the distance computation.  Python's sort is stable, so
    equidistant samples keep their original order, exactly as the explicit
    (sample, distance) sort in the previous version did.
    """
    feature_count = len(testInstance) - 1
    ranked = sorted(
        trainingSet,
        key=lambda sample: euclideanDistance(testInstance, sample, feature_count),
    )
    return ranked[:k]
# 获取距离最近的K个实例中占比例较大的分类
def getResponse(neighbors):
    """Majority vote over the class labels (last element) of *neighbors*.

    Ties are broken in favour of the label encountered first, matching the
    stable reverse sort of the previous implementation.
    """
    votes = {}
    for sample in neighbors:
        label = sample[-1]
        votes[label] = votes.get(label, 0) + 1
    # max scans in insertion order, so the first label to reach the top
    # count wins — identical tie-breaking to a stable descending sort.
    return max(votes, key=votes.get)
# 计算准确率
def getAccuracy(testSet, predictions):
    """Percentage of test rows whose label (last element) matches its prediction."""
    hits = sum(
        1 for i, row in enumerate(testSet) if row[-1] == predictions[i]
    )
    return hits / float(len(testSet)) * 100.0
def main(dataPath='D:/pycharm/dataset/iris/iris.data.csv', split=0.75, k=7):
    """Run the hand-rolled KNN classifier on the iris dataset and print results.

    Parameters
    ----------
    dataPath : str
        Path to the iris CSV file (four float features + label per row).
        Defaults to the original hard-coded location for compatibility.
    split : float
        Fraction of samples assigned to the training set.
    k : int
        Number of nearest neighbours used in the vote.
    """
    # To use the custom synthetic dataset instead:
    # x, y = createDataSet(features=2)
    # dataSet = np.c_[x, y]
    dataSet = loadIrisDataset(dataPath)
    print(dataSet)
    trainingSet = []
    testSet = []
    splitDataSet(dataSet, split, trainingSet, testSet)
    print('Train set:' + repr(len(trainingSet)))
    print('Test set:' + repr(len(testSet)))
    predictions = []
    for sample in testSet:
        neighbors = getNeighbors(trainingSet, sample, k)
        result = getResponse(neighbors)
        predictions.append(result)
        print('>predicted=' + repr(result) + ',actual=' + repr(sample[-1]))
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: ' + repr(accuracy) + '%')


# Guard the entry point so importing this module for its helper functions
# does not immediately try to read the dataset and run the classifier.
if __name__ == "__main__":
    main()