kNN_约会网站匹配效果改进实现代码

最新推荐文章于 2022-12-14 12:57:25 发布

hikaru112

最新推荐文章于 2022-12-14 12:57:25 发布

阅读量313

点赞数

分类专栏：机器学习实战 文章标签：机器学习

本文链接：https://blog.csdn.net/qq_38780574/article/details/77718830

版权

机器学习实战专栏收录该内容

2 篇文章 0 订阅

订阅专栏

kNN_约会网站匹配效果改进

【准备数据】数据处理函数

import numpy as np
import os
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and labels.

    Every non-blank line is split on tab characters. The first three
    fields are stored as one row of the feature matrix and the last field
    is translated to a class code with ``label2int``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is an
        ``(n, 3)`` float ndarray with one row per data line, and
        ``classLabelVector`` is a list holding the ``label2int`` result
        for each of those ``n`` lines.

    Raises:
        ValueError: If one of the first three fields of a line is not a
            valid number.
    """
    # The context manager closes the handle even if parsing fails later.
    with open(filename) as fr:
        # Blank lines (typically a trailing newline) carry no sample; the
        # matrix is sized from the real data rows only.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        # Convert explicitly instead of relying on NumPy's implicit
        # string-to-float casting during assignment.
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(label2int(fields[-1]))
    return returnMat, classLabelVector

def label2int(labelName):
    """Translate a textual class label into its integer code.

    ``'didntLike'`` maps to 0, ``'smallDoses'`` to 1 and ``'largeDoses'``
    to 2. Any other value yields ``None``.
    """
    knownLabels = ('didntLike', 'smallDoses', 'largeDoses')
    # The position of a label in the tuple is its integer code.
    for code, candidate in enumerate(knownLabels):
        if labelName == candidate:
            return code
    return None

# Load the dating data set: a 3-column feature matrix plus integer class labels.
datingDataMat,datingLabels = file2matrix('datingTestSet.txt')

【分析数据】绘制数据散点图

import matplotlib
import matplotlib.pyplot as plt

def arrColor(labels):
    """Map integer class labels to matplotlib color codes.

    Args:
        labels: Iterable of class codes; 0 maps to ``'r'``, 1 to ``'y'``
            and 2 to ``'g'``.

    Returns:
        A list of color codes in the order of ``labels``. Values other
        than 0, 1 or 2 are skipped, so the result can be shorter than the
        input when unknown codes are present.
    """
    palette = {0: 'r', 1: 'y', 2: 'g'}
    # Iterate the argument itself; the previous version ignored it and
    # read the module-level ``datingLabels`` instead.
    return [palette[label] for label in labels if label in palette]


# One tall figure with three stacked scatter plots, one per pair of feature
# columns, every point colored by its class label.
fig = plt.figure(figsize=(8, 20))
#plt.axis([-1,22,-0.1,1.8])
ax1, ax2, ax3 = (fig.add_subplot(3, 1, row) for row in (1, 2, 3))

# Top: column 0 against column 1.
ax1.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
            c=arrColor(datingLabels))
# Middle: column 1 against column 2.
ax2.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
            c=arrColor(datingLabels))
# Bottom: column 0 against column 2.
ax3.scatter(datingDataMat[:, 0], datingDataMat[:, 2],
            c=arrColor(datingLabels))

plt.show()

这里写图片描述

由数据两两对比的三幅散点图分布可知，取第一列和第二列为x，y轴绘制散点图（图一）时，三种类型的人基本分属于不同的区域。

注:用scatter绘制散点图时，当数据在列表中未分类时，无法按照颜色给出图例。
想要显示图例，需对数据进行分类，然后分别用不同的scatter绘制，则可有不同分类的图例。

import matplotlib.font_manager as fm
# Font properties passed to the axis labels and legend in showClassify, whose
# texts are Chinese.
# NOTE(review): hard-coded Windows font path — confirm the file exists on the
# machine running this, or substitute a locally installed CJK font.
myfont = fm.FontProperties(fname='C:/Windows/Fonts/msyh.ttf')

def showClassify(datingDataMat,datingLabels,x,y,x_name='',y_name=''):
    """Draw a scatter plot of two feature columns with a per-class legend.

    Points are split by class code (0, 1, 2) and each class is drawn with
    its own ``scatter`` call so that the legend can name every class.

    Args:
        datingDataMat: Indexable rows of feature values.
        datingLabels: Class code of each row, parallel to ``datingDataMat``.
        x: Column index plotted on the horizontal axis.
        y: Column index plotted on the vertical axis.
        x_name: Label text for the horizontal axis.
        y_name: Label text for the vertical axis.

    Relies on the module-level ``myfont`` for label and legend text.
    """
    # One (xs, ys) bucket per class code; rows carrying any other label are
    # not plotted.
    points = {0: ([], []), 1: ([], []), 2: ([], [])}
    for i, label in enumerate(datingLabels):
        if label in points:
            xs, ys = points[label]
            xs.append(datingDataMat[i][x])
            ys.append(datingDataMat[i][y])

    fig = plt.figure()
    # Create the axes first and label it directly. Calling plt.xlabel()
    # before any axes exists makes pyplot create an implicit one, and the
    # labels then belong to that axes rather than the one plotted on.
    ax = fig.add_subplot(111)
    ax.set_xlabel(x_name, fontproperties=myfont)
    ax.set_ylabel(y_name, fontproperties=myfont)

    type1 = ax.scatter(points[0][0], points[0][1], c='r')
    type2 = ax.scatter(points[1][0], points[1][1], c='y')
    type3 = ax.scatter(points[2][0], points[2][1], c='g')
    ax.legend((type1, type2, type3), (u'不喜欢', u'魅力一般', u'极具魅力'),loc=2,prop=myfont)
    plt.show()

# Plot column 0 (axis label: frequent flier miles earned per year) against
# column 1 (axis label: percentage of time spent playing video games).
showClassify(datingDataMat,datingLabels,0,1,u'每年获取的飞行常客里程数',u'玩视频游戏所耗时间百分比')

这里写图片描述

【准备数据】归一化特征值函数

def autoNorm(dataSet):
    """Rescale every feature column of a 2-D array to the range [0, 1].

    Each value is transformed as ``(value - column_min) / column_range``.

    Args:
        dataSet: 2-D ndarray with one sample per row and one feature per
            column.

    Returns:
        A tuple ``(normDataSet, ranges, minValues)``: the normalized array,
        the per-column ``max - min`` and the per-column minimum. The last
        two let callers scale new samples the same way.
    """
    minValues = dataSet.min(0)
    maxValues = dataSet.max(0)
    ranges = maxValues - minValues
    # A constant column has a range of 0, and dividing by it would fill the
    # column with NaN. Divide by 1 there so the column normalizes to 0; the
    # returned ``ranges`` still reports the true range.
    divisors = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row, so no tiled
    # copies of them are needed.
    normDataSet = (dataSet - minValues) / divisors
    return normDataSet, ranges, minValues

# Normalize the features; the ranges and minimums are kept so that new samples
# can be scaled the same way before classification.
normMat, ranges, minValues = autoNorm(datingDataMat)

array([[ 0.44832535,  0.39805139,  0.56233353],
       [ 0.15873259,  0.34195467,  0.98724416],
       [ 0.28542943,  0.06892523,  0.47449629],
       ..., 
       [ 0.29115949,  0.50910294,  0.51079493],
       [ 0.52711097,  0.43665451,  0.4290048 ],
       [ 0.47940793,  0.3768091 ,  0.78571804]])

k-近邻算法

import operator
def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbors.

    Args:
        inX: Feature vector of the sample to classify.
        dataSet: 2-D ndarray of known samples, one per row.
        labels: Label of each row of ``dataSet``.
        k: Number of nearest neighbors that vote.

    Returns:
        The label with the most votes. When several labels tie, the one
        met first while walking the neighbors from nearest to farthest
        is returned.
    """
    rowCount = dataSet.shape[0]
    # Euclidean distance from inX to every row of dataSet.
    offsets = np.tile(inX, (rowCount, 1)) - dataSet
    distances = (offsets ** 2).sum(axis=1) ** 0.5
    nearestFirst = distances.argsort()

    votes = {}
    for rank in range(k):
        neighborLabel = labels[nearestFirst[rank]]
        if neighborLabel in votes:
            votes[neighborLabel] += 1
        else:
            votes[neighborLabel] = 1

    # The stable descending sort keeps first-seen order among equal counts.
    ranked = list(votes.items())
    ranked.sort(key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]

# Classify one already-normalized feature vector against the normalized data
# set with k = 5. The result is not stored, so it is only visible when this
# runs in an interactive session.
classify0([ 0.28542943,  0.06892523,  0.47449629],normMat,datingLabels,5)

【测试算法】测试错误率函数

def datingClassTest(hoRatio=0.1, filename='datingTestSet.txt', k=5):
    """Estimate the classifier's error rate with a hold-out split.

    The first ``hoRatio`` fraction of the rows is used as the test set and
    the remaining rows as the training set. Each test row is classified
    with ``classify0`` and the share of wrong predictions is printed.

    Args:
        hoRatio: Fraction of the rows held out for testing.
        filename: Data file passed to ``file2matrix``.
        k: Number of neighbors passed to ``classify0``.

    Raises:
        ValueError: If the split leaves no test rows or no training rows.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _ranges, _minValues = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    # Without this check an empty test set ends in a division by zero and an
    # empty training set fails inside classify0 with an opaque IndexError.
    if not 0 < numTestVecs < m:
        raise ValueError(
            "hoRatio=%r splits %d rows into %d test rows; both the test and "
            "the training set must be non-empty" % (hoRatio, m, numTestVecs)
        )

    # The training slices are the same for every test row, so build them once.
    trainMat = normMat[numTestVecs:, :]
    trainLabels = datingLabels[numTestVecs:]
    errorCount = 0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        if classifierResult != datingLabels[i]:
            errorCount += 1
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))

# Run the hold-out evaluation; it prints the total error rate.
datingClassTest()

the total error rate is: 0.040000

【使用算法】采集数据并输出预测结果

def classifyPerson():
    """Ask for one person's three feature values and print the prediction.

    The values are read from standard input, scaled with the minimums and
    ranges of the data set in 'datingTestSet.txt', and classified with
    ``classify0`` using k = 5.
    """
    gameShare = float(input("percentage of thime spent playing video games?"))
    flierMiles = float(input("frequent flier miles earned per year?"))
    iceCreamLiters = float(input("liters of ice cream consumed per year?"))

    samples, sampleLabels = file2matrix('datingTestSet.txt')
    scaledSamples, spans, floors = autoNorm(samples)

    # The feature vector is assembled as miles, game time, ice cream, which
    # differs from the order in which the values were prompted for.
    rawQuery = np.array([flierMiles, gameShare, iceCreamLiters])
    scaledQuery = (rawQuery - floors) / spans
    verdict = classify0(scaledQuery, scaledSamples, sampleLabels, 5)

    # The class code returned by classify0 indexes the description.
    outcomes = ['not at all', 'in small doses', 'in large doses']
    print("You will probably like this person:", outcomes[verdict])

# Prompt for one person's feature values and print the predicted class.
classifyPerson()

percentage of thime spent playing video games?8
frequent flier miles earned per year?40000
liters of ice cream consumed per year?0.95
You will probably like this person: in large doses