自学《机器学习实战》一书,书中的代码亲自敲一遍,努力搞懂每句代码的含义:
今天将第二章kNN分类算法的笔记总结一下。
# -*- coding: utf-8 -*-
"""
k-近邻算法小结:
k-近邻算法是基于实例的学习,k-近邻算法必须保存全部数据集,
如果训练数据集很大,必须使用大量的存储空间。
由于必须对数据集中的每个数据计算距离值,实际使用时可能非常耗时。
可修改的参数为:k,训练集大小.
优点:简单
缺点:耗时,计算量大,占用存储空间
注意:并非k越大越好,对于手写数字识别系统的例子,k=3时,错误率为0.011628
k=5时,错误率为0.017970
Annotation:<jianzhang.zhang@foxmail.com>
"""
from numpy import *
import os
import operator
import matplotlib
import matplotlib.pyplot as plt
# Build the tiny toy data set used to exercise the classifier.
def createDataSet():
    """Return a 4x2 sample matrix and the matching list of class labels."""
    samples = array([
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ])
    sampleLabels = ['A', 'A', 'B', 'B']
    return samples, sampleLabels
# k-nearest-neighbour classifier using the Euclidean distance formula.
def classify0(inX, dataSet, labels, k):
    """Classify one record against a labelled training set.

    Parameters:
        inX: 1-D feature vector of the record to classify.
        dataSet: (m, n) training matrix, one row per record.
        labels: length-m sequence of class labels, one per training row.
        k: number of nearest neighbours that vote on the class.

    Returns the label occurring most often among the k nearest rows.
    """
    # One row per training record.
    dataSetSize = dataSet.shape[0]
    # Tile the query so it can be subtracted from every training row at once.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    # Squared per-feature differences.
    sqDiffmat = diffMat**2
    # Per-row sum of squares, then square root -> Euclidean distance.
    sqDistances = sqDiffmat.sum(axis = 1)
    distances = sqDistances**0.5
    # Indices of training rows ordered by increasing distance.
    sortedDisIndices = distances.argsort()
    # Vote counter: label -> number of occurrences among the k closest rows.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDisIndices[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # BUG FIX: dict.iteritems() exists only in Python 2; dict.items() behaves
    # the same here and works on both Python 2 and 3.
    sortedClassCount = sorted(classCount.items(),
                              key = operator.itemgetter(1), reverse = True)
    # Most frequent label among the k nearest neighbours.
    return sortedClassCount[0][0]
# Parse a tab-separated data file into a feature matrix and a label vector.
def file2matrix(filename):
    """Read `filename` (tab-separated, 3 features + integer label per line)
    and return (m x 3 feature matrix, list of int labels)."""
    # `with` guarantees the file is closed (the book's original leaked it).
    with open(filename) as fr:
        rawLines = fr.readlines()
    # One matrix row per input line; values are coerced from str by numpy.
    returnMat = zeros((len(rawLines), 3))
    classLabelVector = []
    for row, line in enumerate(rawLines):
        fields = line.strip().split('\t')
        # First three columns are features.
        returnMat[row, :] = fields[0:3]
        # Last column is the class label.
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
# Normalise features column-wise: newValue = (oldValue - min) / (max - min),
# mapping every feature into the [0, 1] interval.
def autoNorm(dataSet):
    """Return (normalised matrix, per-column ranges, per-column minimums).

    `ranges` and `minVals` are kept so a later query vector can be
    normalised with the same transform (see classifyPerson).
    """
    # Column-wise minimum and maximum.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    m = dataSet.shape[0]
    # IMPROVED: removed a dead zeros(shape(dataSet)) allocation that was
    # immediately overwritten by the subtraction below.
    normDataSet = dataSet - tile(minVals, (m, 1))
    # Element-wise division (not matrix division).
    normDataSet = normDataSet / tile(ranges, (m, 1))
    return normDataSet, ranges, minVals
# Hold-out test for the dating classifier: randomly pick ~10% of the records
# as test queries, classify each against the remaining rows, and print the
# overall error rate.
def datingClassTest():
    # Fraction of the data set used for testing.
    hoRatio = 0.10
    # NOTE(review): hard-coded Windows path -- adjust to the local data layout.
    datingSetMat,datingLabels = file2matrix('E:/mlcode/ch02/datingTestSet2.txt')
    normMat,ranges,minVals = autoNorm(datingSetMat)
    m = normMat.shape[0]
    numTestVecs = int(m*hoRatio)
    errorCount = 0.0
    # Draw numTestVecs random row indices; the book's original code simply
    # used the first numTestVecs rows as the test set.
    # NOTE(review): `random` here is numpy.random (pulled in by
    # `from numpy import *`), whose randint excludes the upper bound.
    # Duplicates are possible, and any index >= numTestVecs also lies inside
    # the training slice below, which biases the error rate -- confirm intent.
    numList = [random.randint(0,m-1) for i in range(numTestVecs)]
    #for i in range(numTestVecs):
    for i in numList:
        # Train on rows numTestVecs..m-1, query with row i, k = 3.
        classifierResult = classify0(normMat[i,:],normMat[numTestVecs:m],\
        datingLabels[numTestVecs:m],3)
        print "Record index: %d,The classifier came back with: %d, the real answer is: %d"\
        % (i,classifierResult,datingLabels[i])
        # Tally misclassifications.
        if (classifierResult != datingLabels[i]):errorCount += 1.0
    print "The total error rate is: %f" %(errorCount/float(numTestVecs))
# Interactively classify a single person from three typed-in feature values.
def classifyPerson():
    # classify0 returns a label in 1..3; index resultList with label - 1.
    resultList = ['Not at all','In small doses','In large doses']
    percentTats = float(raw_input(\
    "Percentage of time spent playing video games? "))
    ffMiles = float(raw_input("Frequent flier miles earned per year? "))
    iceCream = float(raw_input("Liters of ice cream consumed per year? "))
    # NOTE(review): hard-coded Windows path -- adjust to the local data layout.
    datingDataMat,datingLabels = file2matrix('E:/mlcode/ch02/datingTestSet2.txt')
    normMat,ranges,minVals = autoNorm(datingDataMat)
    # Feature order must match the file's columns: miles, games, ice cream.
    inArr = array([ffMiles,percentTats,iceCream])
    # Normalise the query with the training min/range before classifying.
    classifierResult = classify0((inArr-minVals)/ranges,normMat,datingLabels,3)
    print "You will probably like this person: ",\
    resultList[classifierResult - 1]
# Convert a 32x32 binary text image into a 1x1024 vector.
def img2vector(filename):
    """Read `filename` (32 lines of at least 32 '0'/'1' characters) and
    return the digits flattened into a 1x1024 numpy row vector."""
    vec = zeros((1, 1024))
    # `with` guarantees the file is closed (the book's original leaked it).
    with open(filename) as fr:
        content = fr.readlines()
    # Row r occupies vector slots 32*r .. 32*r+31.
    for r, row in enumerate(content[:32]):
        for c, ch in enumerate(row[:32]):
            vec[0, 32 * r + c] = int(ch)
    return vec
# Test harness for the handwritten-digit recogniser. Every record (including
# its label) is digitised: file names encode the class as '<digit>_<index>.txt'.
def handwritingClassTest():
    hwLabels = []
    # List the contents (files and folders) of the training directory.
    # NOTE(review): hard-coded Windows path -- adjust to the local data layout.
    trainingFileList = os.listdir('E:/mlcode/ch02/trainingDigits')
    m = len(trainingFileList)
    # Placeholder matrix: one 1x1024 image vector per training file.
    trainingMat = zeros((m,1024))
    # Load every training record into trainingMat.
    for i in range(m):
        # Parse the class digit out of the file name.
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumberStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumberStr)
        trainingMat[i,:] = img2vector('E:/mlcode/ch02/trainingDigits/%s' %fileNameStr)
    testFileList = os.listdir('E:/mlcode/ch02/testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    # Classify each test record and tally the errors.
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('E:/mlcode/ch02/testDigits/%s' %fileNameStr)
        # k = 5 here; the module docstring reports a lower error rate at k = 3.
        classifierResult = classify0(vectorUnderTest,\
        trainingMat,hwLabels,5)
        print "The classifier came back with: %d, the real answer is: %d"\
        %(classifierResult,classNumStr)
        if (classifierResult != classNumStr):errorCount += 1.0
    print "\nThe total number of errors is: %d." %errorCount
    print "\nThe total error rate is: %f" %(errorCount/float(mTest))
##if __name__ == "__main__":
## group,labels = createDataSet()
## classify0([0,0],group,labels,3)
## datingDataMat,datingLabels = file2matrix('E:/mlcode/ch02/datingTestSet2.txt')
## # 使用matplotlib绘制原始数据的散点图
## fig = plt.figure()
## ax = fig.add_subplot(111)
## # 绘制第二列和第三列数据的散点图
## ax.scatter(datingDataMat[:,1],datingDataMat[:,2])
## # 利用颜色及尺寸标识了数据点的属性类别,
## # 第一个和第二个特征能够更好标识三个不同的样本分类区域
## ax.scatter(datingDataMat[:,0],datingDataMat[:,1],15.0*array\
## (datingLabels),15.0*array(datingLabels))
## plt.show()
## normMat,ranges,minVals = autoNorm(datingDataMat)
## datingClassTest()
## classifyPerson()
## # 查看trainingDigits中的文件内容
## with open('E:/mlcode/ch02/trainingDigits/0_0.txt') as f:
## l = f.readlines()
## testVector = img2vector('E:/mlcode/ch02/trainingDigits/0_13.txt')
## print testVector[0,0:32]
## print testVector[0,32:64]
## # 测试手写数字识别系统
## handwritingClassTest()
学习完之后自己写了一个人脸识别的小例子,人脸识别单纯靠像素值数据挖掘还是不够的,下面的例子准确率不高只有60%左右,但还是帮助自己巩固了kNN算法。
# -*- coding: utf-8 -*-
"""
学习机器学习实战的第二章kNN算法后,自己写一个人脸识别的小例子,效果不太好,准确率只有60%左右
我对人脸识别的算法不了解,这里只是使用一下kNN分类算法,作为对所学知识的巩固
这个例子从某图片网站上获取10位明星的图片,每位明星500张,
然后调用openCV对图片中的人脸进行识别,并生成灰度格式的面部图片,对面部图片进行过滤,
只保留尺寸大于50X50的面部图片,然后将灰度格式的面部图片全部压缩为50X50的尺寸.
从每位明星的图片中挑选10%的面部图片作为测试集,其余作为训练集,使用kNN分类算法,
进行分类测试.
主要思路就是把人脸图片变为矩阵进而转化为一维向量从而套用书上手写数字识别的例子
"""
# Author:<jianzhang.zhang@foxmail.com>
import numpy as np
import cv2,os,random,shutil
from PIL import Image
from kNN import *
# Extract the face region from every image under ./raw/<person> and save the
# grey-scale crop under ./face/<person>.
def detectFace(filename):
    """`filename` is a person's folder name under ./raw, not a single file."""
    # BUG FIX: the local was named `list`, shadowing the builtin.
    items = os.listdir('./raw/%s' % filename)
    if not os.path.exists('./face/%s' % filename):
        os.mkdir('./face/%s' % filename)
    # Load OpenCV's pre-trained frontal-face Haar cascade.
    # BUG FIX: the original built this path with a backslash continuation
    # *inside* the string literal, which splices the next line's leading
    # whitespace into the path; use one unbroken literal instead.
    face_cascade = cv2.CascadeClassifier(
        'F:/opencv/build/etc/haarcascades/haarcascade_frontalface_default.xml')
    for item in items:
        img = cv2.imread('./raw/%s/%s' % (filename, item))
        # Work on the grey-scale version of the input image.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Returns the bounding rectangles Rect(x, y, w, h) of detected faces.
        face = face_cascade.detectMultiScale(gray, 1.3, 5)
        try:
            # Crop the first detected face: rows y..y+h, columns x..x+w.
            fc = gray[face[0][1]:(face[0][1]+face[0][3]), face[0][0]:(face[0][0]+face[0][2])]
            cv2.imwrite('./face/%s/%s' % (filename, item), fc)
        except IndexError:
            # BUG FIX: was a bare `except:` that swallowed every error;
            # only skip images in which no face was detected.
            continue
# Shrink every saved face image to (at most) 50x50 and store it as jpg
# under ./temp/<person>.
def modifyImage(filename):
    """`filename` is a person's folder name under ./face."""
    # BUG FIX: the local was named `list`, shadowing the builtin.
    faceFiles = os.listdir('./face/%s' % filename)
    if not os.path.exists('./temp/%s' % filename):
        os.mkdir('./temp/%s' % filename)
    for img in faceFiles:
        im = Image.open('./face/%s/%s' % (filename, img))
        width, height = im.size
        # Keep only faces at least 50x50; smaller crops are discarded.
        if width >= 50 and height >= 50:
            # NOTE(review): thumbnail() preserves aspect ratio, so a
            # non-square crop ends up smaller than 50 on one side, which
            # would break imgTest's fixed 7500-element rows -- confirm the
            # detector always yields square regions.
            im.thumbnail((50, 50))
            # img[:-4] strips the original extension before adding .jpg.
            im.save('./temp/%s/%s.jpg' % (filename, img[:-4]), 'jpeg')
# Flatten an image file into a 1-D pixel vector.
def imgTovector(filename):
    """Read `filename` with OpenCV and return its pixel array flattened
    into a one-dimensional vector."""
    pixels = cv2.imread(filename)
    return pixels.flatten()
# For each person's folder, randomly pick 10% of the images as test samples
# and copy the rest as training samples.
def randomSelectTest():
    """Split ./temp/<person>/ images into ./test (10%) and ./train (90%)."""
    directoryList = os.listdir('./temp')
    for direc in directoryList:
        fileList = os.listdir('./temp/%s' % direc)
        # Total number of images in this folder; 10% become test samples.
        m = len(fileList)
        totalTest = int(m * 0.1)
        # IMPROVED: the original drew random names in a rejection loop until
        # it happened to collect totalTest distinct ones; shuffling a copy
        # and slicing picks exactly totalTest distinct files in one pass.
        shuffled = fileList[:]
        random.shuffle(shuffled)
        testImg = shuffled[:totalTest]
        for img in testImg:
            shutil.copy('./temp/%s/%s' % (direc, img), './test/%s' % img)
        # Everything not chosen for testing becomes training data.
        for img in set(fileList).difference(testImg):
            shutil.copy('./temp/%s/%s' % (direc, img), './train/%s' % img)
# kNN classification test over the face images, mirroring the book's
# handwriting example: file names encode the class as '<label>_<index>.jpg'.
def imgTest():
    imgLabels = []
    # List the contents (files and folders) of the training directory.
    trainingFileList = os.listdir('./train')
    m = len(trainingFileList)
    # Placeholder training matrix: one row of 7500 values per image.
    # NOTE(review): 7500 assumes exactly 50x50 pixels x 3 colour channels
    # from imgTovector -- confirm modifyImage always produces that shape.
    trainingMat = zeros((m,7500))
    # Load every training image into trainingMat.
    for i in range(m):
        # Parse the class label out of the file name.
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumberStr = int(fileStr.split('_')[0])
        imgLabels.append(classNumberStr)
        trainingMat[i,:] = imgTovector('./train/%s' %fileNameStr)
    testFileList = os.listdir('./test')
    errorCount = 0.0
    mTest = len(testFileList)
    # Classify each test image (k = 3) and tally the errors.
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = imgTovector('./test/%s' %fileNameStr)
        classifierResult = classify0(vectorUnderTest,\
        trainingMat,imgLabels,3)
        print "The classifier came back with: %d, the real answer is: %d"\
        %(classifierResult,classNumStr)
        if (classifierResult != classNumStr):errorCount += 1.0
    print "\nThe total number of errors is: %d." %errorCount
    print "\nThe total error rate is: %f" %(errorCount/float(mTest))
if __name__ == '__main__':
    # Pipeline: crop faces from every person's folder under ./raw, shrink
    # them into ./temp, then split into ./test and ./train and run the
    # kNN classification test.
    l = os.listdir('./raw')
    for item in l:
        detectFace(item)
        modifyImage(item)
    # NOTE(review): indentation was lost in the paste; these two steps are
    # assumed to run once after all folders are processed, since they
    # iterate the whole ./temp, ./train and ./test trees themselves.
    randomSelectTest()
    imgTest()
图片spider代码就不附上了,很简单,大家可以动手练习一下。