1.KNN算法简介
K近邻法(k-nearest neighbors, KNN)是一种很基本的机器学习方法了,在我们平常的生活中也会不自主的应用。比如,我们判断一个人的人品,只需要观察他来往最密切的几个人的人品好坏就可以得出了,这里就运用了KNN的思想。KNN方法既可以做分类,也可以做回归。
2.KNN流程框图
在建立训练集时,就要确定训练数据及其对应的类别标签;然后把待分类的测试数据与训练集数据依次进行特征比较;从训练集中挑选出最相近的k个数据,这k个数据中投票最多的分类,即为新样本的类别。
3.问题描述
4.KNN代码
from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import datasets
import seaborn as sns
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score
def file2matrix(filename):
    """Parse a tab-separated dating-data file into features and labels.

    Each line must contain at least four tab-separated fields: three
    numeric features followed by an integer class label.

    Parameters:
        filename: path to the data file.

    Returns:
        (returnMat, classLabelVector): an (n, 3) float array of features
        and a list of n integer class labels.
    """
    # 'with' guarantees the handle is closed even if parsing fails;
    # the original opened the file and never closed it.
    with open(filename) as fr:
        arrayOLines = fr.readlines()
    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    # enumerate replaces the original hand-maintained index counter.
    for index, line in enumerate(arrayOLines):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
#datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
#print(datingDataMat)
#print(datingLabels[0:10])
def corCoefficient():
    """Explore the dating data set: print each feature's correlation with
    the class label, draw a heatmap of feature-feature correlations, and
    scatter-plot the first two features colored by label.
    """
    features, targets = file2matrix('datingTestSet2.txt')
    # Correlation of each of the three feature columns with the labels.
    for col in range(3):
        coef = corrcoef(features[:, col], targets)
        print("第%d个特征与输出的相关系数:" % (col + 1), coef[0][1])
    # Pairwise correlations between features (columns as variables).
    feat_corr = corrcoef(features, rowvar=0)
    print(feat_corr[0][1], feat_corr[0][2], feat_corr[1][2])
    plt.figure(figsize=(5, 5))
    sns.heatmap(feat_corr, annot=True)
    # Second figure: flight miles vs. gaming time, colored by class.
    scatter_fig = plt.figure()
    axes = scatter_fig.add_subplot(111)
    plt.xlabel("Miles")
    plt.ylabel("Time")
    axes.scatter(features[:, 0], features[:, 1], s=[15], c=array(targets))
    plt.show()
def autoNorm(dataSet):
    """Min-max normalize each column of dataSet into [0, 1].

    Parameters:
        dataSet: (n, m) numeric array.

    Returns:
        (normDataSet, minVals, ranges): the normalized array plus the
        per-column minima and ranges, which callers reuse to transform
        new samples via (x - minVals) / ranges.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Guard constant columns: a zero range would produce nan on division;
    # dividing by 1 maps such columns to 0 instead.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting replaces the original tile() copies and removes a dead
    # zeros() allocation that was immediately overwritten.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, minVals, ranges
#normMat,minVals,ranges = autoNorm(datingDataMat)
#print(normMat)
#print(minVals)
#print(ranges)
#print('------------------------------------------')
def classify(inX, dataSet, labels, k):
    """Classify sample inX by majority vote among its k nearest neighbors.

    Parameters:
        inX: 1-D feature vector of the query sample.
        dataSet: (n, m) training feature matrix.
        labels: sequence of n training labels, row-aligned with dataSet.
        k: number of neighbors to vote.

    Returns:
        The label with the most votes among the k closest training rows
        (ties broken by first occurrence, as in a stable sort).
    """
    # Euclidean distance from inX to every training row via broadcasting.
    deltas = dataSet - inX
    dists = sqrt((deltas ** 2).sum(axis=1))
    nearest = dists.argsort()[:k]
    # Tally votes; dict preserves first-seen order, so max() breaks ties
    # the same way the original stable sort did.
    votes = {}
    for idx in nearest:
        lab = labels[idx]
        votes[lab] = votes.get(lab, 0) + 1
    return max(votes, key=votes.get)
def datingClassVolidate():
    """Select the KNN neighbor count via 5-fold cross-validation on the
    training split, print per-k mean accuracies, and plot accuracy vs. k.

    Reads 'datingTestSet2.txt', normalizes features, holds out 20% as a
    test split, then cross-validates each candidate k on the remaining 80%.
    """
    # Best k obtained by this procedure: k = 5
    datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
    normMat,minVals,ranges = autoNorm(datingDataMat)
    X_train,X_test,y_train,y_test=train_test_split(normMat,datingLabels,
    test_size=0.2,random_state=2)
    #print ("train:",len(X_train), "test:",len(X_test))
    folds = 5
    k_choices = [1,3,5,7,9,11,13,15,17,19,21,23,25]
    #k_choices = [1,3,5,7,9,13,15,20,25]
    X_folds = []
    y_folds = []
    # vsplit/hsplit require len(X_train) to be divisible by folds;
    # NOTE(review): this presumably holds for the 1000-row data set — confirm.
    X_folds = vsplit(X_train,folds)
    y_folds = hsplit(array(y_train),folds)
    # accuracy_of_k[k] collects one validation accuracy per fold.
    accuracy_of_k = {}
    for k in k_choices:
        accuracy_of_k[k] = []
    for i in range(folds):
        # NOTE(review): X_train/y_train are rebound here, clobbering the
        # original hold-out split inside this function; harmless because
        # the test split is never used below, but fragile.
        X_train = vstack(X_folds[:i] + X_folds[i+1:])
        X_val = X_folds[i]
        y_train = hstack(y_folds[:i] + y_folds[i+1:])
        y_val = y_folds[i]
        #print (X_train.shape,X_val.shape,y_train.shape,y_val.shape)
        #print(y_val)
        for k in k_choices:
            accuracyCount = 0.0
            # NOTE(review): this inner loop reuses 'i', shadowing the fold
            # index; the outer for-loop still iterates correctly, but the
            # reuse invites bugs.
            for i in range(X_val.shape[0]):
                y_val_pred = classify(X_val[i,:],X_train,y_train,k)
                if(y_val_pred == y_val[i]):
                    accuracyCount += 1.0
            #print("the total error rate is: %f,k =" %(errorCount/float(numVoVecs)),k)
            accuracy_of_k[k].append(accuracyCount/float(X_val.shape[0]))
    #for k in sorted(k_choices):
        #for accuracy in accuracy_of_k[k]:
            #print ('k = %d,准确率 = %f' %(k,accuracy))
    print('-------------------------')
    # Average the per-fold accuracies for each k and report the best one.
    a = {}
    for k in k_choices:
        a[k] = []
    for k,v in accuracy_of_k.items():
        a[k].append(mean(v))
        print('k = %d,平均准确率:%f'%(k,mean(v)))
    b = sorted(a.items(),key=operator.itemgetter(1),reverse=True)
    print('最大准确率%f时,k = %d'%(b[0][1][0],b[0][0]))
    # Scatter each fold's accuracy per k, then overlay mean +/- std.
    for k in k_choices:
        plt.scatter([k]*len(accuracy_of_k[k]), accuracy_of_k[k])
    accuracies_mean = array([mean(v) for k,v in accuracy_of_k.items()])
    accuracies_std = array([std(v) for k,v in accuracy_of_k.items()])
    plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
    plt.title('cross volidate on K')
    plt.xlabel('K')
    plt.ylabel('cross-volidate accuracy')
    plt.show()
def datingClassTest():
    """Hold-out evaluation of the KNN classifier (k=5).

    Splits the normalized data 80/20, classifies every test row, and
    prints accuracy, macro precision, macro recall, and macro F1.
    """
    data, labels = file2matrix('datingTestSet2.txt')
    scaled, minVals, ranges = autoNorm(data)
    X_train, X_test, y_train, y_test = train_test_split(
        scaled, labels, test_size=0.2, random_state=2)
    # Predict every test sample with the k=5 model.
    predictions = [classify(row, X_train, y_train, 5) for row in X_test]
    wrong = 0.0
    for predicted, actual in zip(predictions, y_test):
        if predicted != actual:
            wrong += 1.0
    print("准确率: %f," % (1 - wrong / float(X_test.shape[0])))
    # Macro-averaged metrics treat the three classes equally.
    precision_s = precision_score(y_test, predictions, average='macro')
    recall_s = recall_score(y_test, predictions, average='macro')
    f1_s = f1_score(y_test, predictions, average='macro')
    print("精确度: %f," % precision_s)
    print("召回率: %f," % recall_s)
    print("F1得分: %f," % f1_s)
def classifyPerson():
    """Interactively classify one person: prompt for the three features,
    normalize them with the training statistics, and print the predicted
    attractiveness category.
    """
    resultList = ['不喜欢的人','魅力一般的人','极具魅力的人']
    # Prompt order must match the feature-column order of the data file.
    miles = float(input("每年的飞行里程数:"))
    game_time = float(input("玩游戏时间:"))
    ice_cream = float(input("冰淇淋消耗量:"))
    data, labels = file2matrix('datingTestSet2.txt')
    scaled, minVals, ranges = autoNorm(data)
    sample = array([miles, game_time, ice_cream])
    # Normalize the query with the same min/range as the training data.
    verdict = classify((sample - minVals) / ranges, scaled, labels, 5)
    # Labels are 1-based, so shift down to index resultList.
    print("你对这个人的喜欢程度:", resultList[verdict - 1])
if __name__ == '__main__':
    # Full experiment pipeline: correlation analysis, 5-fold cross-validation
    # to choose k, hold-out evaluation, then an interactive prediction
    # (the last step blocks on input()).
    corCoefficient()
    datingClassVolidate()
    datingClassTest()
    classifyPerson()
5.结果展示
5.1 数据预处理
由于是标准数据,所以在数据导入后直接对数据进行分析,得到以下结果:
第1个特征与标签的相关系数: -0.4118504507209078
第2个特征与标签的相关系数: 0.3430459507835666
第3个特征与标签的相关系数: 0.025614191377917542
可以看出,第1,2特征对分类结果较为重要,由于特征仅有3个就全部使用进行训练模型,其次分析特征之间的相关性,得到下图:
通过观察数据,看到各个特征之间数值差距较大,所以先对其进行归一化处理,再进行之后的训练,可见代码(def autoNorm(dataSet): )部分
5.2 模型训练
首先将数据随机分成80%训练集,20%测试集,使之后能够验证模型的准确率
其次在80%训练集上进行交叉验证,采用的是5折(5-fold)交叉验证法,来对KNN中的近邻数k进行验证,得到一个较好的k值,如下图:
5.3 模型测试
用之前得到的20%的测试集进行测试,通过对准确率,精确率,召回率,F1值分析来测试该模型好坏,得到以下值:
5.4 分类预测
通过运行程序可以进行预测,如输入数据(44000,12,0.5),预测结果是"极具魅力的人"