1、KNN(保存为 KNN.py,第 2 部分通过 `import KNN` 调用)
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 21 12:16:41 2021
@author: Administrator
"""
import numpy as np
def knn(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean. When several labels receive the same number
    of votes, the label met first while walking the neighbours from
    nearest to farthest wins.

    Args:
        inX: Feature vector of the unknown (test) sample.
        dataSet: Training samples, one row per sample.
        labels: Class label of each training sample, indexable by row number.
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.

    Returns:
        The label that received the most votes.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataSet`` is empty.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    samples = np.asarray(dataSet, dtype=float)
    if samples.shape[0] == 0:
        raise ValueError("dataSet must contain at least one sample")
    query = np.asarray(inX, dtype=float)

    # Euclidean distance from the query to every training sample.
    dist = (((samples - query) ** 2).sum(1)) ** 0.5
    # argsort returns row indices (nearest first), so each neighbour's label
    # can be looked up directly; never ask for more rows than exist.
    nearest = dist.argsort()[:min(k, samples.shape[0])]

    votes = {}
    for idx in nearest:
        voteLabel = labels[idx]
        votes[voteLabel] = votes.get(voteLabel, 0) + 1

    # max() returns the first key with the highest count; the dict keeps
    # insertion order, so ties go to the label seen nearest to the query.
    return max(votes, key=votes.get)
2、具体实现
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 21 12:16:51 2021
@author: Administrator
"""
#%%
import numpy as np
import random
import matplotlib.pyplot as plt
import KNN
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['axes.unicode_minus'] = False
#%%
def file2Matrix(filename):
    """Load a comma-separated data file and split it into features and labels.

    The rows are shuffled with a fixed seed so that every run sees the same
    order. Columns 0-6 are returned as the feature matrix and column 7 as
    the integer class label of each row.

    Args:
        filename: Path of the comma-delimited text file to parse.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: the feature matrix and
        the matching vector of integer class labels.
    """
    # atleast_2d keeps the column slicing below valid for a one-row file.
    data = np.atleast_2d(np.genfromtxt(filename, delimiter=','))
    # NOTE: this reseeds NumPy's global random generator on every call.
    np.random.seed(1)
    np.random.shuffle(data)
    returnMat = data[:, :7]
    # The np.int alias was removed in NumPy 1.24; the builtin int is the
    # documented replacement.
    classLabelVector = data[:, 7].astype(int)
    return returnMat, classLabelVector
#%%
def autoNorm(dataSet):
    """Rescale every feature column to the range [0, 1] (min-max scaling).

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range; it is mapped to
    0 instead of producing NaN from a division by zero.

    Args:
        dataSet: Feature matrix, one row per sample.

    Returns:
        A new float array of the same shape holding the scaled features.
    """
    data = np.asarray(dataSet, dtype=float)
    minVals = data.min(0)  # column-wise minimum
    valueRange = data.max(0) - minVals
    # Divide constant columns by 1 so they become 0 rather than NaN.
    safeRange = np.where(valueRange == 0, 1.0, valueRange)
    return (data - minVals) / safeRange
#%%
# Load the data file, then echo the parsed feature matrix and label vector.
datingDataMat, datingLabels = file2Matrix('./seeds_dataset.xls')
print(datingDataMat, datingLabels, sep='\n')
#%%
# Relationship between features: scatter every pair of adjacent feature
# columns, coloured by class label.
# Only (number of columns - 1) adjacent pairs exist; looping over all columns
# would index one column past the end of the matrix on the last iteration.
# NOTE(review): the original indentation was lost; one figure per pair is
# assumed here -- confirm that plt.show() belongs inside the loop.
for i in range(datingDataMat.shape[1] - 1):
    plt.scatter(datingDataMat[:, i], datingDataMat[:, i + 1], c=datingLabels)
    plt.show()
#%%
# Normalise the feature matrix before any distances are computed.
dataSet = autoNorm(datingDataMat)
# dataSet = datingDataMat  (alternative: use the raw, un-normalised features)
print(dataSet)
#%%
# Split into training and test sets: the first fraction `m` of the rows is
# used for training, the remaining rows for testing.
m = 0.8
dataSize = dataSet.shape[0]
print("数据集总行数:",dataSize)
# No reshuffle here: the rows were already shuffled when the file was loaded.
trainSize = int(m * dataSize)
# Use the exact complement of the training rows. Deriving the test size from
# (1 - m) * dataSize depends on floating-point rounding and is not guaranteed
# to add up to dataSize together with trainSize.
testSize = dataSize - trainSize
print(trainSize, testSize)
#%%
# Evaluate the kNN classifier on the held-out rows.
k = 5
predictlist = []
truthlist = []
correct = 0
# The training slice does not change between predictions, so build it once.
trainX = dataSet[0:trainSize, :]
trainY = datingLabels[0:trainSize]
# Test rows are exactly the rows after the training slice. Starting at
# trainSize (not trainSize - 1) keeps the last training row out of the test
# set and makes sure the final row of the data set is evaluated.
for i in range(trainSize, dataSize):
    predict = KNN.knn(dataSet[i, :], trainX, trainY, k)
    predictlist.append(predict)
    truth = datingLabels[i]
    truthlist.append(truth)
    print("预测值为:{0} 真实值为:{1}".format(predict,truth))
    if predict == truth:
        correct = correct + 1
# Divide by the number of rows actually tested (guarded against an empty
# test set so the script does not stop with a ZeroDivisionError).
print('正确率:',correct/max(len(truthlist), 1))
#%%
# Evaluate the model with several metrics: accuracy, precision, recall and F1.
# scikit-learn's metric functions take (y_true, y_pred) in that order; passing
# the predictions first swaps precision with recall and weights the averages
# by the predicted class counts instead of the true ones.
accuracy = accuracy_score(truthlist, predictlist)
F1_Measure = f1_score(truthlist, predictlist, average='weighted')
precision = precision_score(truthlist, predictlist, average='weighted')
recall = recall_score(truthlist, predictlist, average='weighted')
print("准确率(accuracy):", accuracy)
print("召回率(recall):", recall)
print("精确率(Precision):", precision)
print("F1-Measure:", F1_Measure)
3、数据集
链接: https://pan.baidu.com/s/1OZz3lpWXSU_Gc1fMVWv2Og?pwd=iwak 提取码: iwak 复制这段内容后打开百度网盘手机App,操作更方便哦