机器学习实战--KNN约会数据分类

约会数据: https://github.com/pbharrin/machinelearninginaction/blob/master/Ch02/datingTestSet2.txt

代码如下:

import numpy as np
import operator
import matplotlib
import matplotlib.pyplot as plt

# Data loading and preprocessing
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and labels.

    Every non-blank line must contain at least three numeric feature
    columns; the last column of the line is read as the integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLableVector)``: ``returnMat`` is a float
        ndarray of shape (number of data lines, 3) holding the first three
        columns of each line, and ``classLableVector`` is a list with the
        int label taken from the last column of each line.

    Raises:
        ValueError: If a line has fewer than three columns, a feature is
            not a valid float, or the label is not a valid int.
    """
    featureRows = []
    classLableVector = []
    with open(filename) as f:
        for lineNumber, line in enumerate(f, start=1):
            line = line.strip()
            # Blank lines (typically a trailing newline at the end of the
            # file) carry no sample; parsing them would fail on float('').
            if not line:
                continue
            listFromLine = line.split('\t')
            features = listFromLine[0:3]
            # Reject short rows explicitly instead of letting a single
            # value be broadcast across the whole feature row.
            if len(features) != 3:
                raise ValueError(
                    'line %d of %s has fewer than 3 feature columns'
                    % (lineNumber, filename))
            featureRows.append([float(value) for value in features])
            classLableVector.append(int(listFromLine[-1]))
    # reshape keeps the (n, 3) shape even when the file has no data lines.
    returnMat = np.array(featureRows, dtype=float).reshape(-1, 3)
    return returnMat,classLableVector

# Min-max normalization: newvalue = (prevalue - min) / (max - min)
def autoNorm(dataset):
    """Rescale each column of a 2-D array to the range [0, 1].

    Args:
        dataset: 2-D NumPy array with one sample per row and one feature
            per column.

    Returns:
        A new array of the same shape where every column has been shifted
        by its minimum and divided by its (max - min) range. A column whose
        values are all equal is mapped to zeros.
    """
    minVals = dataset.min(0)
    ranges = dataset.max(0) - minVals
    # A constant column has max == min; dividing by that zero range would
    # produce NaN. Dividing by 1 instead leaves the shifted zeros intact.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column min and range to every row, so
    # no explicit np.tile copies of those vectors are needed.
    return (dataset - minVals) / safeRanges

# k-nearest-neighbours classifier
def classfy0KNN(intX,dataset,labels,K):
    """Classify one sample by majority vote among its K nearest neighbours.

    Args:
        intX: Feature vector of the sample to classify.
        dataset: 2-D array of known samples, one per row.
        labels: Sequence giving the label of each row in ``dataset``.
        K: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the K rows of ``dataset`` that
        have the smallest Euclidean distance to ``intX``. When several
        labels share the top count, the one that entered the vote first
        (i.e. from the nearer neighbour) is returned.
    """
    sampleCount = dataset.shape[0]
    # Repeat the query once per known sample so it can be subtracted row-wise.
    offsets = dataset - np.tile(intX, (sampleCount, 1))
    # Euclidean distance from the query to every known sample.
    distances = (offsets ** 2).sum(axis=1) ** 0.5
    nearestFirst = distances.argsort()

    # Tally the labels of the K closest samples.
    votes = {}
    for rank in range(K):
        label = labels[nearestFirst[rank]]
        if label in votes:
            votes[label] += 1
        else:
            votes[label] = 1

    # Stable descending sort by vote count; the first entry is the winner.
    ranking = list(votes.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking[0][0]

def datingClassTest(filename='datingTestSet2.txt', hoRatio=0.1, K=4):
    """Run a hold-out evaluation of the kNN classifier and print the results.

    The data file is loaded and normalized, the first ``hoRatio`` fraction
    of the rows is used as test samples, and the remaining rows serve as
    the known samples for ``classfy0KNN``. Each prediction and the final
    error rate are printed.

    Args:
        filename: Path of the tab-separated data file to evaluate.
        hoRatio: Fraction of the rows (taken from the top of the file)
            held out as test samples.
        K: Number of neighbours used for voting. K is a hyperparameter;
            per the original author's experiments, 4 worked best.
    """
    dataset, labels = file2matrix(filename)
    dataset = autoNorm(dataset)
    m = dataset.shape[0]
    numTestVecs = int(m*hoRatio)
    # With too few rows the hold-out slice is empty; bail out instead of
    # dividing by zero when computing the error rate.
    if numTestVecs == 0:
        print('no test vectors available, cannot compute an error rate')
        return
    # The known-sample slices do not change between iterations.
    trainSet = dataset[numTestVecs:,:]
    trainLabels = labels[numTestVecs:]
    errorCount = 0
    for i in range(numTestVecs):
        predictClass = classfy0KNN(dataset[i,:],trainSet,trainLabels,K)
        print('predict output is : ',predictClass,' the real is : ',labels[i])
        if predictClass != labels[i]:
            errorCount += 1
    print('the total error rate is : ',(errorCount/numTestVecs))



if __name__ == '__main__':
    # Run the hold-out evaluation when this file is executed as a script.
    datingClassTest()
    # The commented-out lines below are optional scratch code for plotting
    # the normalized data and for classifying a single hand-entered sample.
    # dataset,labels = file2matrix('datingTestSet2.txt')
    # dataset = autoNorm(dataset)
    # fig = plt.figure()
    # A three-digit add_subplot argument such as 349 means: split the canvas
    # into 3 rows and 4 columns and draw in the 9th cell, counting left to
    # right, top to bottom. The 221 used below is a 2x2 grid, first cell.
    # ax = fig.add_subplot(221)
    # Use marker size and color to show each data point's class label
    # ax.scatter(dataset[:,0],dataset[:,1],15.0*np.array(labels),15.0*np.array(labels))
    # plt.show()
    # print(dataset)
    # print(labels)
    # intX = []
    # predictLable = classfy0KNN(intX,dataset,labels,20)

运行结果:

 

 

 

 

 

 

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值