一、利用k-近邻算法实现约会对象类型分类(Python 3.5.3,Ubuntu 16.04)
1、创建文件knn.py,里面添加代码:
from numpy import *
import operator
from os import listdir
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training rows.

    Args:
        inX: Feature vector to classify.
        dataSet: Training samples, one row per sample.
        labels: Class label for each row of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label occurring most often among the k nearest rows.
    """
    sampleCount = dataSet.shape[0]
    # Repeat inX once per training row so the subtraction is row-by-row.
    offsets = tile(inX, (sampleCount, 1)) - dataSet
    # Euclidean distance from inX to every training row.
    distances = ((offsets ** 2).sum(axis=1)) ** 0.5
    nearestFirst = distances.argsort()  # row indices, closest first

    votes = {}
    for rank in range(k):
        neighbourLabel = labels[nearestFirst[rank]]
        if neighbourLabel in votes:
            votes[neighbourLabel] += 1
        else:
            votes[neighbourLabel] = 1

    # sorted() is stable, so tied labels keep the dict's iteration order.
    ranked = sorted(votes.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[0][0]
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line supplies one sample: its first three tab-separated
    fields are the features and its last field is the integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array
        holding the first three columns, and a list of the ``n`` integer
        labels, where ``n`` is the number of non-blank lines.

    Raises:
        ValueError: If a feature or label field is not a valid number.
    """
    # Read the file once and close the handle as soon as the lines are in
    # memory, instead of opening it twice and leaving both handles open.
    with open(filename) as fr:
        rows = [line.strip() for line in fr]
    # Blank lines (e.g. an extra newline at end of file) carry no sample.
    rows = [row for row in rows if row]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
2、利用散点图分析数据(以下代码保存为单独的脚本,与knn.py放在同一目录下运行):
import numpy as np
import knn
import matplotlib
import matplotlib.pyplot as plt
from array import array
# Load the dating samples and draw columns 0 and 1 as a scatter plot, with
# one series per class label.
datingDataMat, datingLabels = knn.file2matrix('datingTestSet2.txt')
datingLabels = np.array(datingLabels)

fig = plt.figure()
ax = fig.add_subplot(111)

# (class label, marker size, colour, legend text), drawn in this order.
series = (
    (1, 20, 'r', 'Do Not Like'),
    (2, 10, 'b', 'Liked in Small Doses'),
    (3, 30, 'g', 'Liked in Large Doses'),
)
for classLabel, markerSize, colour, legendText in series:
    rows = np.where(datingLabels == classLabel)
    ax.scatter(datingDataMat[rows, 0], datingDataMat[rows, 1],
               s=markerSize, marker='o', c=colour, label=legendText)

plt.legend(loc='upper left', fontsize=10)
plt.xlabel('Frequent-flier miles per year')
plt.ylabel('The percentage of time spent on playing video games(%)')
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig("examples_1.jpg")
plt.show()
散点图:
3、归一化特征值(将下面的autoNorm函数添加到knn.py中):
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` into the range [0, 1].

    Each value becomes ``(value - column_min) / column_range``.

    Args:
        dataSet: 2-D numeric array with one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the rescaled array, the
        per-column divisor that was applied, and the per-column minimum.
        For a column whose values are all equal the divisor is 1 rather
        than 0, so that column normalises to 0 instead of NaN and callers
        can safely reuse ``ranges`` to scale new inputs.
    """
    minVals = dataSet.min(0)  # axis 0: one minimum per column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has max == min; dividing by that zero range would
    # fill the column with NaN, so divide by 1 there instead.
    ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals
4、测试代码
def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Measure the classifier's error rate on a hold-out slice of the data.

    The first ``hoRatio`` fraction of the rows is classified against the
    remaining rows, and each prediction is printed next to the true label.

    Args:
        hoRatio: Fraction of the rows (taken from the start of the file)
            used as test samples.
        filename: Data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        None. Prints one line per test sample, then the error rate and the
        error count.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    # The training portion is the same for every test sample; slice it once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # With no test samples there is nothing to divide by; report 0 errors.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is: %f" % errorRate)
    print(errorCount)
在终端中进入到knn.py文件所在文件夹,输入python进入到python环境
>>> import knn
>>> datingDataMat, datingLabels = knn.file2matrix('datingTestSet2.txt')  # datingTestSet2.txt 可从《机器学习实战》官网下载
>>> knn.datingClassTest()  # 运行测试代码
结果如下:
5、约会网站预测函数
def classifyPerson():
    """Ask for one person's three features and print the predicted liking.

    Reads the three feature values from standard input, normalises them
    with the minimum and range of the training data, classifies them with
    the 3 nearest neighbours and prints the result.

    Returns:
        None. The prediction is printed.

    Raises:
        ValueError: If an answer cannot be parsed as a float.
    """
    # Labels in the data file are 1..3; index 0 corresponds to label 1.
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per year?"))
    # The ice-cream feature is measured per week, so ask for it per week.
    iceCream = float(input("liters of ice cream consumed per week?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Feature order follows the data file's columns: miles, game time,
    # ice cream.
    inArr = array([ffMiles, percentTats, iceCream])
    # Scale the new sample exactly like the training data before comparing.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    print("you will probably like this person: %s" % (resultList[classifierResult - 1]))
测试结果:
二、手写字识别系统
1、将图像转换为测试向量
def img2vector(filename):
    """Flatten a 32x32 text image into a 1x1024 row vector.

    The file is read as 32 lines; the first 32 characters of each line are
    converted with ``int`` and stored row after row.

    Args:
        filename: Path of the text image.

    Returns:
        A ``(1, 1024)`` float array; element ``32 * i + j`` holds the
        character at line ``i``, column ``j``.

    Raises:
        IndexError: If a line has fewer than 32 characters, or the file has
            fewer than 32 lines.
        ValueError: If a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # This function runs once per image, so close each handle promptly
    # rather than leaving it to the garbage collector.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
def handwritingClassTest():
    """Run the digit classifier over ``testDigits`` and print its error rate.

    Every file in ``trainingDigits`` becomes one row of the training matrix.
    Each file in ``testDigits`` is then classified against that whole matrix
    using the 3 nearest neighbours. The true digit is the part of the file
    name before the first ``_``.

    Returns:
        None. Prints one line per test file, then the error count and the
        error rate.
    """
    trainingNames = listdir('trainingDigits')
    trainingCount = len(trainingNames)
    # One row per training image, 1024 values each.
    trainingMat = zeros((trainingCount, 1024))
    hwLabels = []
    for row, trainingName in enumerate(trainingNames):
        # Strip the extension, then keep the text before the first "_".
        hwLabels.append(int(trainingName.split('.')[0].split('_')[0]))
        trainingMat[row, :] = img2vector('trainingDigits/%s' % trainingName)

    testNames = listdir('testDigits')
    mTest = len(testNames)
    errorCount = 0.0
    for testName in testNames:
        expectedDigit = int(testName.split('.')[0].split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % testName)
        # There is no training step: every prediction compares the sample
        # with all training rows.
        predictedDigit = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classfier came back with: %d, the real answer is: %d" %(predictedDigit, expectedDigit))
        if predictedDigit != expectedDigit:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" %errorCount)
    print("\nthe total error rate is: %f" %(errorCount/float(mTest)))
测试结果: