第一周:KNN算法求解约会网站问题

1.KNN算法简介

K近邻法(k-nearest neighbors, KNN)是一种很基本的机器学习方法,在我们平常的生活中也会不自主地应用。比如,我们判断一个人的人品,只需要观察他来往最密切的几个人的人品好坏就可以得出了,这里就运用了KNN的思想。KNN方法既可以做分类,也可以做回归。

2.KNN流程框图

在建立训练集时,就要确定训练数据及其对应的类别标签;然后把待分类的测试数据与训练集数据依次进行特征比较;从训练集中挑选出最相近的k个数据,这k个数据中投票最多的分类,即为新样本的类别。
在这里插入图片描述

3.问题描述

在这里插入图片描述

4.KNN代码

from numpy import * 
import operator
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import datasets
import seaborn as sns
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score

def file2matrix(filename):
    """Parse a tab-separated dating data file into features and labels.

    Each line must hold three numeric feature columns followed by an
    integer class label.

    Args:
        filename: path to the tab-separated data file.

    Returns:
        (returnMat, classLabelVector): an (n, 3) float ndarray of
        features and a list of n int labels, one per input line.
    """
    # 'with' guarantees the handle is closed (the original leaked it).
    with open(filename) as fr:
        arrayOLines = fr.readlines()
    numberOfLines = len(arrayOLines)
    returnMat = zeros((numberOfLines, 3))
    classLabelVector = []
    # enumerate replaces the manual index counter of the original.
    for index, line in enumerate(arrayOLines):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = listFromLine[0:3]  # numpy coerces str -> float
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

#datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
#print(datingDataMat)
#print(datingLabels[0:10])
def corCoefficient(filename='datingTestSet2.txt'):
    """Print feature/label and feature/feature correlations, then plot them.

    Args:
        filename: dating data file to analyse.  The default keeps the
            original hard-coded path, so existing callers are unaffected.

    Side effects:
        Prints correlation coefficients, shows a seaborn heatmap of the
        feature correlation matrix and a scatter plot of the first two
        features coloured by class label.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    for i in range(3):
        a = datingDataMat[:, i]
        b = corrcoef(a, datingLabels)
        # b[0][1] is the off-diagonal entry: corr(feature_i, labels).
        print("第%d个特征与输出的相关系数:" % (i + 1), b[0][1])
    # rowvar=0 treats each column (feature) as a variable.
    c = corrcoef(datingDataMat, rowvar=0)
    print(c[0][1], c[0][2], c[1][2])
    plt.figure(figsize=(5, 5))
    sns.heatmap(c, annot=True)

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    plt.xlabel("Miles")
    plt.ylabel("Time")
    ax1.scatter(datingDataMat[:, 0], datingDataMat[:, 1], s=[15],
                c=array(datingLabels))
    plt.show()

def autoNorm(dataSet):
    """Min-max normalize each column of dataSet into [0, 1].

    Args:
        dataSet: (n, m) numeric ndarray.

    Returns:
        (normDataSet, minVals, ranges): the normalized array plus the
        per-column minima and ranges, so a new sample can be normalized
        the same way (see classifyPerson).

    NOTE(review): a constant column gives range 0 and divides by zero —
    the dating data has none, but confirm before reusing elsewhere.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Broadcasting replaces the original tile() calls (and the dead
    # zeros() pre-allocation); the result is numerically identical.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, minVals, ranges

#normMat,minVals,ranges = autoNorm(datingDataMat)
#print(normMat)
#print(minVals)
#print(ranges)
#print('------------------------------------------')

def classify(inX, dataSet, labels, k):
    """Classify sample inX by a majority vote of its k nearest neighbours.

    Args:
        inX: 1-D feature vector of the sample to classify.
        dataSet: (n, m) training feature matrix.
        labels: sequence of n training labels.
        k: number of neighbours to poll.

    Returns:
        The label with the most votes among the k nearest training rows
        (Euclidean distance).
    """
    # Broadcasting subtracts inX from every row — same result as the
    # original tile() expansion.
    deltas = dataSet - inX
    dists = ((deltas ** 2).sum(axis=1)) ** 0.5
    # Indices of the k closest training samples.
    nearest = dists.argsort()[:k]
    votes = {}
    for idx in nearest:
        label = labels[idx]
        votes[label] = votes.get(label, 0) + 1
    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]


def datingClassVolidate():
    """Choose the best KNN k via 5-fold cross-validation (best is k=5).

    Loads the dating data, normalizes it, holds out 20% as a test set,
    then cross-validates each candidate k on the remaining 80%.  Prints
    the mean accuracy per k, reports the best one, and plots accuracy
    vs. k with error bars.

    Side effects: reads 'datingTestSet2.txt', prints results, shows a
    matplotlib figure.
    """
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, minVals, ranges = autoNorm(datingDataMat)
    X_train, X_test, y_train, y_test = train_test_split(
        normMat, datingLabels, test_size=0.2, random_state=2)

    folds = 5
    k_choices = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25]
    # Split the training data into `folds` equal parts.  vsplit/hsplit
    # require the row count to be divisible by `folds`.
    X_folds = vsplit(X_train, folds)
    y_folds = hsplit(array(y_train), folds)

    # k -> list of per-fold validation accuracies.
    accuracy_of_k = {k: [] for k in k_choices}

    for fold in range(folds):
        # Fold `fold` is the validation set; the rest are training data.
        # (The original reused `i` for both the fold and the sample loop
        # and clobbered X_train/y_train; distinct names avoid that.)
        X_tr = vstack(X_folds[:fold] + X_folds[fold + 1:])
        X_val = X_folds[fold]
        y_tr = hstack(y_folds[:fold] + y_folds[fold + 1:])
        y_val = y_folds[fold]
        for k in k_choices:
            accuracyCount = 0.0
            for j in range(X_val.shape[0]):
                y_val_pred = classify(X_val[j, :], X_tr, y_tr, k)
                if y_val_pred == y_val[j]:
                    accuracyCount += 1.0
            accuracy_of_k[k].append(accuracyCount / float(X_val.shape[0]))

    print('-------------------------')
    # Mean accuracy per k; keep each mean in a 1-element list so the
    # sorted()/b[0][1][0] reporting below matches the original output.
    a = {k: [] for k in k_choices}
    for k, v in accuracy_of_k.items():
        a[k].append(mean(v))
        print('k = %d,平均准确率:%f' % (k, mean(v)))
    b = sorted(a.items(), key=operator.itemgetter(1), reverse=True)
    print('最大准确率%f时,k = %d' % (b[0][1][0], b[0][0]))

    # Scatter every fold's accuracy per k, plus mean +/- std error bars.
    for k in k_choices:
        plt.scatter([k] * len(accuracy_of_k[k]), accuracy_of_k[k])
    accuracies_mean = array([mean(v) for k, v in accuracy_of_k.items()])
    accuracies_std = array([std(v) for k, v in accuracy_of_k.items()])
    plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
    plt.title('cross validate on K')       # typo fixed (was "volidate")
    plt.xlabel('K')
    plt.ylabel('cross-validate accuracy')  # typo fixed (was "volidate")
    plt.show()
    
def datingClassTest():
    """Evaluate the k=5 KNN classifier on a held-out 20% test split.

    Side effects: reads 'datingTestSet2.txt' and prints accuracy,
    macro precision, macro recall and macro F1.
    """
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, minVals, ranges = autoNorm(datingDataMat)
    X_train, X_test, y_train, y_test = train_test_split(
        normMat, datingLabels, test_size=0.2, random_state=2)

    # Predict every test sample with the k=5 classifier.
    classifylist = [classify(X_test[row, :], X_train, y_train, 5)
                    for row in range(X_test.shape[0])]

    # Count misclassifications against the held-out labels.
    errorCount = 0.0
    for pred, truth in zip(classifylist, y_test):
        if pred != truth:
            errorCount += 1.0
    print("准确率: %f," % (1 - errorCount / float(X_test.shape[0])))

    precision_s = precision_score(y_test, classifylist, average='macro')
    recall_s = recall_score(y_test, classifylist, average='macro')
    f1_s = f1_score(y_test, classifylist, average='macro')
    print("精确度: %f," % precision_s)
    print("召回率: %f," % recall_s)
    print("F1得分: %f," % f1_s)
    
def classifyPerson():
    """Interactively classify one person from three user-entered features.

    Side effects: prompts on stdin for flight miles, gaming time and
    ice-cream consumption, reads 'datingTestSet2.txt', prints the
    predicted attractiveness category.
    """
    resultList = ['不喜欢的人', '魅力一般的人', '极具魅力的人']
    # Gather the three features from the user.
    Miles = float(input("每年的飞行里程数:"))
    Time = float(input("玩游戏时间:"))
    IceCream = float(input("冰淇淋消耗量:"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, minVals, ranges = autoNorm(datingDataMat)
    # Normalize the new sample with the training min/range before voting.
    sample = (array([Miles, Time, IceCream]) - minVals) / ranges
    classifierResult = classify(sample, normMat, datingLabels, 5)
    # Labels are 1-based, resultList is 0-based.
    print("你对这个人的喜欢程度:", resultList[classifierResult - 1])

if __name__ == '__main__':
    # Full pipeline: correlation analysis, cross-validation to pick k,
    # hold-out evaluation, then one interactive prediction.
    corCoefficient()
    datingClassVolidate()
    datingClassTest()
    classifyPerson()

5.结果展示

5.1 数据预处理

由于是标准数据,所以在数据导入后直接对数据进行分析,得到以下结果:
第1个特征与标签的相关系数: -0.4118504507209078
第2个特征与标签的相关系数: 0.3430459507835666
第3个特征与标签的相关系数: 0.025614191377917542
可以看出,第1,2特征对分类结果较为重要,由于特征仅有3个就全部使用进行训练模型,其次分析特征之间的相关性,得到下图:
在这里插入图片描述
通过观察数据,看到各个特征之间数值差距较大,所以先对其进行归一化处理,再进行之后的训练,可见代码(def autoNorm(dataSet): )部分

5.2 模型训练

首先将数据随机分成80%训练集,20%测试集,使之后能够验证模型的准确率
其次在80%训练集中进行交叉验证,采用的是k=5的交叉验证法,来对KNN中的k值进行验证,得到一个较好的k值,如下图:
在这里插入图片描述

5.3 模型测试

用之前得到的20%的测试集进行测试,通过对准确率,精确率,召回率,F1值分析来测试该模型好坏,得到以下值:
在这里插入图片描述

5.4 分类预测

通过运行程序可以进行预测,如输入数据(44000,12,0.5),预测结果是"极具魅力的人"
在这里插入图片描述

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值