机器学习实战(二)

# -*- coding: utf-8 -*-
"""
Created on Fri Nov 06 16:51:05 2015

@author: hzh
"""
from numpy import*
from os import listdir
import operator
import matplotlib
import matplotlib.pyplot as plt

# Create random data with numpy
import numpy as np
def createDataSet():
    """Build the tiny four-point sample set used to try out the classifier.

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4x2 array of points and
        ``labels`` is the list of class letters, one per row of ``group``.
    """
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    return array(samples), ['A', 'A', 'B', 'B']

# k-近邻算法 (k-nearest neighbors, kNN)
def classify0(inX, dataSet, labels, k ):
    """Classify ``inX`` by majority vote among its ``k`` nearest training rows.

    Distance is Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: the point to classify; it is tiled to one copy per training row,
            so it must have as many features as ``dataSet`` has columns.
        dataSet: 2-D array with one training sample per row.
        labels: sequence giving the class of each row of ``dataSet``.
        k: number of nearest neighbours that vote.

    Returns:
        The label that received the most votes.

    Raises:
        IndexError: if ``k`` is 0 (no votes to rank) or larger than the
            number of rows in ``dataSet``.
    """
    dataSetSize = dataSet.shape[0]
    # Repeat inX once per training row so the subtraction is row-by-row.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() exists on both Python 2 and Python 3 dicts; the former
    # iteritems() call is Python 2 only and fails on Python 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

#将文本记录转换为NumPy的解析程序
def file2matrix( filename ):
    """Parse a tab-separated text file into a feature matrix and label list.

    Each non-blank line contributes one row: its first three fields become
    the row's float features and its last field becomes the integer label.

    Args:
        filename: path of the text file to read.

    Returns:
        ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array and ``classLabelVector`` is a list of ``n``
        ints, ``n`` being the number of non-blank lines.

    Raises:
        ValueError: if a feature field is not a number, the last field is
            not an integer, or a line has fewer than three feature fields.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        records = [line.strip() for line in fr]
    # Drop blank lines (e.g. a trailing newline at end of file) so they do
    # not reach int()/float() and abort the whole load.
    records = [record for record in records if record]
    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, record in enumerate(records):
        listFromLine = record.split('\t')
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

#归一化特征值
def autoNorm( dataSet ):
    """Rescale every column of ``dataSet`` linearly onto the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal (range 0) is mapped to 0 instead of
    dividing by zero.

    Args:
        dataSet: 2-D array with one sample per row and one feature per
            column.

    Returns:
        ``(normDataSet, ranges, minVals)``: the normalised float array, the
        per-column ``max - min`` and the per-column minimum. ``ranges`` is
        returned exactly as computed, including any zeros.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 rather than 0; their shifted values are
    # already all 0, so they normalise to 0.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row. The float
    # cast keeps integer input from being floor-divided on Python 2.
    normDataSet = (dataSet - minVals).astype(float) / safeRanges
    return normDataSet, ranges, minVals

#使用Matplotlib创建散点图
# Scatter-plot demo: 100 random points; one set of random values, scaled
# by 100 and then by 5, drives both the marker size and the marker colour.
x = random.rand(100) * 100
c = random.rand(100, 2)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(c[:, 0], c[:, 1], s=5 * x, c=5 * x)
plt.show()

def img2vector(filename):
    """Flatten a 32x32 text image of digit characters into a 1x1024 vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int()`` and stored row after row.

    Args:
        filename: path of the text image file.

    Returns:
        A ``(1, 1024)`` float array; element ``32 * i + j`` holds the value
        of character ``j`` on line ``i``.

    Raises:
        IndexError: if a line has fewer than 32 characters (or the file has
            fewer than 32 lines).
        ValueError: if a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file; the original left it open.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

#手写数字识别系统的测试代码
def handwritingClassTest():
    """Run the kNN digit classifier over ``testDigits`` and print the results.

    Every file in ``trainingDigits`` is loaded with ``img2vector`` as a
    training row. Every file in ``testDigits`` is then classified with
    ``classify0`` (k = 3) and compared with the label taken from its file
    name. One line is printed per test file, followed by the error count and
    the error rate.

    The label is the integer before the first ``_`` in the file name with
    its extension removed; a name whose leading field is not an integer
    raises ValueError.
    """
    hwLabels = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        fileStr = fileNameStr.split('.')[0]
        hwLabels.append(int(fileStr.split('_')[0]))
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)
    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # Single-argument print(...) prints the same text on Python 2 and 3.
        print("the classifier came back with : %d, the real answer is : %d " % (
            classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print('the total number of errors is : %d' % errorCount)
    # An empty test directory would otherwise divide by zero.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print('the total error rate is :%f' % errorRate)
###############################################################################
# Ad-hoc checks run when the script executes: show a random 5x2 array and
# its column minima, normalise a small integer matrix, then run the
# handwriting test.
# print(...) with one argument prints the same on Python 2 and 3, whereas
# the statement form is a syntax error on Python 3.
d = random.rand(5,2)
print(d.min(0))
print(d)

xx = [ [1,2],[2,1]]
xx = array(xx)

print(xx)
print(autoNorm(xx)[0])

handwritingClassTest()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值