一 约会网站配对效果实例
K近邻算法:
1 计算已知类别数据集中的点与当前点之间的距离;
2 按照距离递增次序排序;
3 选取与当前点距离最小的k个点;
4 确定前k个点所在类别的出现频率;
5 返回前k个点出现频率最高的类别作为当前点的预测分类。
#!/usr/bin/python
# coding:utf-8
# 机器学习实战 第02章 KNN
# 约会网站配对效果实例
# 读取数据并处理
from numpy import *
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must contain tab-separated fields where the first
    three are numeric features and the last one is an integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array holding the first three fields of each line
        and ``classLabelVector`` is a list with the ``n`` integer labels.

    Raises:
        ValueError: If a feature is not a valid float or the last field is
            not a valid integer.
    """
    # Read the file a single time and close it deterministically; blank
    # lines (e.g. a trailing empty line) are ignored instead of crashing
    # the label conversion below.
    with open(filename) as fr:
        lines = [line.strip() for line in fr]
    lines = [line for line in lines if line]

    # One row per record, three feature columns.
    returnMat = zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        # Split the record on the tab character into its fields.
        listFromLine = line.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
# 数据归一化
def autoNorm(dataSet):
    """Min-max normalize every column of ``dataSet``.

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of 0; such a column is
    normalized to 0 instead of producing NaN from a division by zero.

    Args:
        dataSet: 2-D numpy array with one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalized array
        (same shape as ``dataSet``), the per-column value range
        (``max - min``, returned unmodified, so it may contain zeros) and
        the per-column minimum.
    """
    # Axis 0 reduces over the rows, giving one min/max per column.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Use 1 as divisor for constant columns so they come out as 0, not NaN.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
# K近邻算法
import operator
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean. The label that occurs most often among the
    ``k`` closest rows of ``dataSet`` is returned.

    Args:
        inX: Feature vector to classify (one value per column of
            ``dataSet``).
        dataSet: 2-D numpy array of known samples, one per row.
        labels: Sequence with the label of each row of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the highest vote count among the ``k`` neighbours.

    Raises:
        IndexError: If ``k`` is larger than the number of rows in
            ``dataSet`` or if no vote was cast (``k`` < 1).
    """
    # Broadcasting subtracts every row of dataSet from inX.
    diffMat = asarray(inX) - dataSet
    # Sum the squared differences per row (axis=1) and take the root.
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    # Row indices ordered from the closest to the farthest sample.
    sortedDistIndicies = distances.argsort()

    # Count how often each label appears among the k closest samples.
    classCount = {}
    for i in range(k):
        voteLabel = labels[sortedDistIndicies[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # e.g. [('A', 3), ('B', 2)]: highest count first. items() is used
    # because iteritems() does not exist on Python 3 dictionaries.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
# 手写数字分类
import os
# 将数据转换为一个一维向量
def img2vector(filename):
    """Flatten a 32x32 text image into a ``(1, 1024)`` row vector.

    The first 32 characters of each of the first 32 lines of the file are
    converted with ``int()`` and stored row by row.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        A ``(1, 1024)`` float array; element ``32*i + j`` holds the value
        of character ``j`` on line ``i``.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32
            characters.
        ValueError: If a character cannot be converted with ``int()``.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if parsing fails; this
    # function is called once per image, so leaked handles add up quickly.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32*i + j] = int(lineStr[j])
    return returnVect
约会网站配对效果实例:
if __name__ == '__main__':
    # # Draw the scatter plots
    # import matplotlib.pyplot as plt
    # datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    # fig = plt.figure()
    # ax = fig.add_subplot(111)
    # ax.scatter(datingDataMat[:, 1], datingDataMat[:,2])
    # ax.scatter(datingDataMat[:, 1], datingDataMat[:,2], 15.0*array(datingLabels), 15.0*array(datingLabels))
    # plt.show()

    # Hold-out test of the classifier on the dating data set.
    # Fraction of the samples that is used for testing.
    hoRatio = 0.10
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    # Normalize the features.
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Number of samples.
    m = normMat.shape[0]
    # Number of samples used for testing.
    numTestVecs = int(m*hoRatio)
    errorCount = 0.0
    # The first numTestVecs rows are test samples; rows numTestVecs..m-1
    # form the training set.
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print("the classifier came back, with: %d, the real answer is: % d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # Number of wrong predictions divided by the number of test samples.
    print("the total error rate is: %f" % (errorCount/float(numTestVecs)))
打印结果:
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 2
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 3, the real answer is: 1
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 3, the real answer is: 1
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 2, the real answer is: 3
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 3, the real answer is: 3
the classifier came back, with: 2, the real answer is: 2
the classifier came back, with: 1, the real answer is: 1
the classifier came back, with: 3, the real answer is: 1
the total error rate is: 0.050000
散点图1:
散点图2:
手写数字分类:
if __name__ == '__main__':
    # # Handwritten digit classification: quick check of img2vector
    # print img2vector(r'digits/testDigits/0_0.txt')[0, 0:31]

    hwLabels = []
    # os.listdir(path) returns the names of the entries in the directory.
    trainingFileList = os.listdir('digits/trainingDigits')
    # Number of training files (1934 according to the original author).
    m = len(trainingFileList)
    # One 1024-element row per training image.
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        # File name without the .txt extension.
        fileStr = fileNameStr.split('.')[0]
        # The part before the underscore is the digit, i.e. the label.
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        # Convert the image to a vector and store it as row i.
        trainingMat[i, :] = img2vector(r'digits/trainingDigits/%s' % fileNameStr)

    # Names of the files in the testDigits directory.
    testFileList = os.listdir(r'digits/testDigits')
    errorCount = 0.0
    # Number of test files.
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        # File name without the .txt extension.
        fileStr = fileNameStr.split('.')[0]
        # The part before the underscore is the expected digit.
        classNumStr = int(fileStr.split('_')[0])
        # Convert the test image to a vector.
        vectorUnderTest = img2vector(r'digits/testDigits/%s' % fileNameStr)
        # Classify the test vector against the whole training matrix.
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    # Total number of misclassified images.
    print("\n the total number of errors is: %d" % errorCount)
    # Error rate over the test set.
    print("\n the total error rate is: %f" % (errorCount / float(mTest)))
输出:
the classifier came back with: 8, the real answer is: 8
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 5, the real answer is: 5
the classifier came back with: 6, the real answer is: 6
.
.
.
the classifier came back with: 0, the real answer is: 0
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 6, the real answer is: 6
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 5, the real answer is: 5
the classifier came back with: 7, the real answer is: 7
the classifier came back with: 5, the real answer is: 5
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 4, the real answer is: 4
the classifier came back with: 8, the real answer is: 8
the total number of errors is: 12
the total error rate is: 0.012685