机器学习实战(①)——KNN算法改进约会网站的配对效果和手写字识别系统

最新推荐文章于 2021-01-14 14:49:40 发布

BRUCE_WUANG

最新推荐文章于 2021-01-14 14:49:40 发布

阅读量1.3k

点赞数 1

分类专栏：机器学习 python 文章标签：机器学习

本文链接：https://blog.csdn.net/sinat_36458870/article/details/77045410

版权

python 同时被 2 个专栏收录

21 篇文章 1 订阅

订阅专栏

机器学习

16 篇文章 0 订阅

订阅专栏

KNN 又名K近邻算法
用于：客户流失预测、欺诈检测等。
算法思想：近朱者赤近墨者黑！

步骤：

 1. 算距离、计算新数据和训练数据之间的距离
 2. 去排序、对算出来的距离进行排序
 3. 找邻居、确定最近的K个训练对象
 4. 做分类、统计K个近邻对象所属的类别，把出现次数最多的类别作为测试对象的分类

# -*- coding: utf-8 -*-
"""
Created on Thu Aug  3 09:53:27 2017

@author: steve
"""

import sys
sys.path.append("C:\\Users\\Administrator.PC-201707110905\\Desktop")
from numpy import *
import operator
# 导入数据
def createDataSet():
    """Build the small hand-made data set used to try out the classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 array holding one
        point per row and ``labels`` is a list with the class name of each
        row, in the same order.
    """
    # Each entry pairs a 2-D point with its class name.
    samples = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    ]
    group = array([point for point, _ in samples])
    labels = [tag for _, tag in samples]
    return group, labels

# KNN算法
def classify0(Input, training_data, labels, k):
    """Classify ``Input`` by majority vote among its k nearest training rows.

    Args:
        Input: feature vector to classify.
        training_data: 2-D array with one training sample per row.
        labels: sequence giving the class of each training row.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the k nearest rows. On a tie the
        label that was encountered first (i.e. closest to ``Input``) wins.
    """
    row_count = training_data.shape[0]

    # Euclidean distance from Input to every training row.
    offsets = tile(Input, (row_count, 1)) - training_data
    squared_offsets = offsets ** 2
    distances = squared_offsets.sum(axis=1) ** 0.5

    # Row indices ordered from nearest to farthest.
    nearest_first = distances.argsort()

    # Tally the classes of the k closest rows.
    votes = {}
    for rank in range(k):
        neighbour_label = labels[nearest_first[rank]]
        if neighbour_label in votes:
            votes[neighbour_label] += 1
        else:
            votes[neighbour_label] = 1

    # Stable descending sort by vote count keeps first-seen labels ahead on ties.
    ranking = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    winner, _ = ranking[0]
    return winner
# 完成上面的函数后，就可以试试点 [1, 2] 会被分到哪一类啦。
# >>> import knn
# >>> group, labels = knn.createDataSet()
# >>> knn.classify0([1,2], group, labels, 3)
# 'A'

# 读取数据
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Each non-blank line must hold at least three numeric feature columns; the
    last column of the line is read as an integer class label.

    Args:
        filename: path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array built from the first three columns of each
        record and ``classLabelVector`` is a list of ``n`` integer labels.

    Raises:
        ValueError: if a feature or label field cannot be converted to a
            number, or a record has fewer than three feature fields.
    """
    # The context manager closes the file even if parsing fails later on.
    with open(filename) as fr:
        # Blank lines (e.g. an empty line at the end of the file) carry no
        # record, so they are skipped instead of breaking the conversion.
        records = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, fields in enumerate(records):
        # Convert explicitly rather than relying on implicit str -> float casts.
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
# 完成上面的函数后，就可以把数据转换成后面需要用的格式啦
# >>> datingDataMat, datingLabels = knn.file2matrix('datingTestSet2.txt')
# >>> datingDataMat
# array([[  4.09200000e+04,   8.32697600e+00,   9.53952000e-01],
       # [  1.44880000e+04,   7.15346900e+00,   1.67390400e+00],
       # [  2.60520000e+04,   1.44187100e+00,   8.05124000e-01],
       # ..., 
       # [  2.65750000e+04,   1.06501020e+01,   8.66627000e-01],
       # [  4.81110000e+04,   9.13452800e+00,   7.28045000e-01],
       # [  4.37570000e+04,   7.88260100e+00,   1.33244600e+00]])
# >>> datingLabels[1]
# 2

# 归一化特征值：因为数值差值较大的属性对计算结果影响较大
def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``
    so that features with large numeric ranges do not dominate the distance
    computation.

    Args:
        dataSet: 2-D array with one sample per row and one feature per column.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is the
        rescaled array, ``ranges`` is the per-column ``max - min`` and
        ``minVals`` is the per-column minimum. ``ranges`` is returned
        unmodified, so it contains 0 for a constant column.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # A constant column has range 0; dividing by it would yield NaN. Divide by
    # 1 instead so such a column normalises to all zeros.
    divisors = where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
# 完成这个上面的函数，就可以进行归一化特征啦
# 特征归一化的原因：数值差值较大的属性对结果影响较大
# 特征归一化的手段：将所有数据转化成0-1之间的数值
# >>> import knn
# >>> datingDataMat, datingLabels = knn.file2matrix('datingTestSet2.txt')
# >>> normMat,ranges, minVals= knn.autoNorm(datingDataMat)
# >>> normMat
# array([[ 0.44832535,  0.39805139,  0.56233353],
       # [ 0.15873259,  0.34195467,  0.98724416],
       # [ 0.28542943,  0.06892523,  0.47449629],
       # ..., 
       # [ 0.29115949,  0.50910294,  0.51079493],
       # [ 0.52711097,  0.43665451,  0.4290048 ],
       # [ 0.47940793,  0.3768091 ,  0.78571804]])
# >>> ranges
# array([  9.12730000e+04,   2.09193490e+01,   1.69436100e+00])
# >>> minVals
# array([ 0.      ,  0.      ,  0.001156])
# >>> 

# 分类器针对约会网站的测试代码
def datingClassTest(filename='datingTestSet2.txt', hoRatio=0.10, k=3):
    """Run a hold-out evaluation of the kNN classifier on the dating data.

    The data file is loaded and normalised; the first ``hoRatio`` fraction of
    the rows is used as test vectors and the remaining rows as training data.
    Every prediction is printed next to the real label, followed by the total
    error rate and the error count.

    Args:
        filename: path of the tab-separated data file. Defaults to the file
            name that was previously hard-coded.
        hoRatio: fraction of the rows held out for testing.
        k: number of neighbours passed to ``classify0``.

    Returns:
        The classifier result for the last test vector.

    Raises:
        ValueError: if ``hoRatio`` leaves no test rows or no training rows.
    """
    # Per the original author's note, the book's listing uses the file name
    # without the trailing '2'.
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)

    # Without this guard an empty test split ends in a ZeroDivisionError and
    # an empty training split fails inside classify0.
    if not 0 < numTestVecs < m:
        raise ValueError(
            "hoRatio=%r leaves %d test rows out of %d; need at least one "
            "test row and one training row" % (hoRatio, numTestVecs, m))

    errorCount = 0.0
    classifierResult = None
    for i in range(numTestVecs):
        # Rows [numTestVecs, m) are the training set; row i is the query.
        classifierResult = classify0(normMat[i, :],
            normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], k)
        print("the classifier came back with: %d, the real answer is: %d"
            % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
    return classifierResult
# 完成这个上面的函数，就可以进行测试啦
# 测试一般拿90%做训练数据，用10% 做测试数据
# >>> import knn
# >>> knn.datingClassTest()
# the classifier came back with: 3, the real answer is: 3
# the classifier came back with: 2, the real answer is: 2
# the classifier came back with: 1, the real answer is: 1
# the classifier came back with: 1, the real answer is: 1
# the classifier came back with: 1, the real answer is: 1
# the classifier came back with: 1, the real answer is: 1
# .
# .
# .
# the classifier came back with: 1, the real answer is: 1
# the classifier came back with: 3, the real answer is: 3
# the classifier came back with: 3, the real answer is: 3
# the classifier came back with: 2, the real answer is: 2
# the classifier came back with: 1, the real answer is: 1
# the classifier came back with: 3, the real answer is: 1
# the total error rate is: 0.050000
# 5.0
# 3
# >>> 

# 构建完整可用系统
def classifyPerson():
    """Interactively predict how much the user would like a person.

    Asks three questions on standard input, normalises the answers with the
    minimum and range of the data in 'datingTestSet2.txt', classifies the
    resulting vector with ``classify0`` (k=3) and prints the verdict.
    """
    # Verdict text for class labels 1, 2 and 3 respectively.
    verdicts = ('not at all', 'in small doses', 'in large doses')

    game_share = float(input("percentage of time spent playing video games?"))
    flier_miles = float(input("frequent flier miles earned per years?"))
    ice_cream_liters = float(input("liters of ice cream consumed per year?"))

    features, known_labels = file2matrix('datingTestSet2.txt')
    normalised, spans, minima = autoNorm(features)

    # The query vector is ordered miles, game time, ice cream, and is scaled
    # with the same minimum and range as the training data.
    raw_query = array([flier_miles, game_share, ice_cream_liters])
    scaled_query = (raw_query - minima) / spans

    predicted = classify0(scaled_query, normalised, known_labels, 3)
    verdict = verdicts[int(predicted) - 1]
    print("you will probably like this person:", verdict)
# 完成这个上面的函数 之后就可以进行预测啦！！！
# >>> import knn
# >>> knn.classifyPerson()
# percentage of time spent playing video games?5
# frequent flier miles earned per years?5000
# liters of ice cream consumed per year?0.2
# you will probably like this person: in small doses

有关数据的格式咱们来看看

# Draw a scatter plot of the raw data: column 1 against column 2, with the
# marker size and colour of each point scaled by its class label.
import matplotlib
import matplotlib.pyplot as plt
from numpy import *

# The loader lives in the knn module defined earlier in this article.
import knn

# plt.figure must be called to obtain a Figure; the bare attribute is only
# the factory function and has no add_subplot.
fig = plt.figure()
ax = fig.add_subplot(111)
datingDataMat, datingLabels = knn.file2matrix('datingTestSet2.txt')
# Third argument is the marker size, fourth the colour value; both are
# derived from the label so the classes are visually distinct.
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
    15.0 * array(datingLabels), 15.0 * array(datingLabels))
plt.show()

好像有点样子了哇