机器学习实战笔记六_Python3

程序清单2-5:约会网站预测函数 classifyPerson()

本笔记的重点是理解代码,大家直接阅读代码中的注释即可。

代码片段(并非伪代码,而是可运行的 Python3 代码,依赖下文“完整”部分中定义的函数)

def classifyPerson():
    """Ask for three feature values and print the predicted liking level.

    The answers are read with input(), scaled with the min/range statistics
    of 'datingTestSet.txt', and classified by a 3-nearest-neighbour vote
    against the normalised data set.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']

    # The prompts are asked in this order; the feature vector built below is
    # ordered (miles, game time, ice cream).
    percentTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per year? "))
    iceCream = float(input("liters of ice cream consumed per year?"))

    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # Scale the query exactly like the training rows before classifying it.
    scaled_query = (array([ffMiles, percentTats, iceCream]) - minVals) / ranges
    predicted_code = classify0(scaled_query, normMat, datingLabels, 3)

    # The predicted code is used as a 1-based index into resultList.
    print("You will probably like this person:", resultList[predicted_code - 1])

完整代码

#批量注释、批量取消注释 Ctrl+/
# from __future__ import print_function
from  numpy import *
import operator#运算符模块
import matplotlib.pyplot as plt
def createDataSet():
    """Return the four-sample toy data set together with its class labels.

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4x2 array of sample points
        and ``labels`` lists the class name of each row, in row order.
    """
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    class_names = ['A', 'A', 'B', 'B']
    return array(samples), class_names

# Build the toy data set at import time so `group` and `labels` exist at module level.
group,labels=createDataSet()

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Args:
        inX: Feature vector to classify (same length as a row of dataSet).
        dataSet: 2-D array of training samples, one sample per row.
        labels: Class label of each training row, in row order.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the k training rows closest
        to ``inX`` (Euclidean distance). On a tie, the label that entered the
        vote first, i.e. the one owned by the closer neighbour, wins.
    """
    dataSetSize = dataSet.shape[0]

    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    # axis=1 sums across each row: one squared distance per training sample.
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    # Indices of the training rows ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()

    # Tally the labels of the k nearest rows.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Rank the (label, votes) pairs by vote count, highest first. The previous
    # code indexed the key list with len(...) - 1, which always returned the
    # last-inserted label regardless of its count.
    # NOTE: sorted() + itemgetter is used rather than built-in max(), because
    # a star import from numpy may shadow built-ins of that name.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# group,labels = createDataSet()
# a=classify0([0,0], group,labels,3)
# print(a)

# Python 3 adaptation of the loader, rewritten by the note's author.
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label codes.

    Every non-blank line must contain three numeric fields followed by a
    textual label in its last field. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        ``(returnMat, classLabelVector_Value)``: ``returnMat`` is an N x 3
        float array holding the first three fields of each line, and
        ``classLabelVector_Value`` is a list of N ints: 3 for 'largeDoses',
        2 for 'smallDoses' and 1 for any other label.
    """
    # Any label not listed here falls back to code 1.
    label_codes = {'largeDoses': 3, 'smallDoses': 2}

    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    # Size the matrix from the parsed rows so skipped blank lines do not
    # leave all-zero rows behind.
    returnMat = zeros((len(rows), 3))
    classLabelVector_Value = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector_Value.append(label_codes.get(fields[-1], 1))
    return returnMat, classLabelVector_Value
# def file2matrix(filename):
#     fr = open(filename)
#     numberOfLines = len(fr.readlines())         #get the number of lines in the file
#     returnMat = zeros((numberOfLines,3))        #prepare matrix to return
#     classLabelVector = []                       #prepare labels return
#     fr = open(filename)
#     index = 0
#     for line in fr.readlines():
#         line = line.strip()
#         listFromLine = line.split('\t')
#         returnMat[index,:] = listFromLine[0:3]
#         classLabelVector.append(int(listFromLine[-1]))
#         index += 1
#     return returnMat,classLabelVector
def autoNorm(dataSet):
    """Min-max scale each column of ``dataSet``.

    Every value becomes ``(value - column_min) / (column_max - column_min)``.

    Args:
        dataSet: 2-D array with one sample per row.

    Returns:
        ``(normDataSet, ranges, minVals)``: the scaled matrix, the per-column
        spread (max - min) and the per-column minimum.
    """
    minVals = dataSet.min(0)            # axis 0: one minimum per column
    ranges = dataSet.max(0) - minVals
    row_count = dataSet.shape[0]
    # Repeat the per-column vectors once per row so both operations below
    # are plain element-wise arithmetic.
    offsets = tile(minVals, (row_count, 1))
    spreads = tile(ranges, (row_count, 1))
    return (dataSet - offsets) / spreads, ranges, minVals


def datingClassTest():
    """Hold out the first 10% of the dating data and print the kNN error rate.

    Loads 'datingTestSet.txt', normalises it, classifies every held-out row
    against the remaining rows with k=3, and prints each prediction next to
    the true label, then the overall error rate and the raw error count.
    """
    hoRatio = 0.10  # fraction of rows held out for testing

    # NOTE(review): the original author's note says datingTestSet2.txt stores
    # the labels as 3/2/1 rather than as strings; this function reads
    # datingTestSet.txt and relies on file2matrix() to turn its labels into ints.
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    total_rows = normMat.shape[0]
    numTestVecs = int(total_rows * hoRatio)

    # Rows before numTestVecs are the test queries; everything after them is
    # the training set.
    training_rows = normMat[numTestVecs:total_rows, :]
    training_labels = datingLabels[numTestVecs:total_rows]

    errorCount = 0.0
    for row_idx in range(numTestVecs):
        predicted = classify0(normMat[row_idx, :], training_rows, training_labels, 3)
        actual = datingLabels[row_idx]
        print("the classifier came back with: %d, the real answer is: %d" % (predicted, actual))
        if predicted != actual:
            errorCount += 1.0

    print("the total error rate is: %f"% (errorCount / float(numTestVecs)))
    print(errorCount)

def classifyPerson():
    """Interactively predict how much the user would like a person.

    Prompts for three feature values, scales them with the min/range
    statistics of 'datingTestSet.txt', runs a 3-nearest-neighbour vote over
    the normalised data and prints the predicted category.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']

    # Prompt order differs from the feature order used for the query vector
    # below, which is (miles, game time, ice cream).
    percentTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per year? "))
    iceCream = float(input("liters of ice cream consumed per year?"))

    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # Normalise the query with the same statistics as the training rows.
    raw_query = array([ffMiles, percentTats, iceCream])
    normalised_query = (raw_query - minVals) / ranges
    label_code = classify0(normalised_query, normMat, datingLabels, 3)

    # label_code is treated as 1-based when indexing resultList.
    print("You will probably like this person:", resultList[label_code - 1])

测试

# Run the interactive predictor: it reads three values via input() and prints the result.
classifyPerson()
完成!
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值