第二章 k近邻算法(《机器学习实战》)
本章主要介绍k近邻(kNN)算法的基本原理,并附上测试用例代码。
算法介绍
测试代码
测试一
简单的算法测试代码,kNN_1.py。
from numpy import *
def createDataSet():
    """Build the four-sample toy data set used by the kNN demo.

    Returns:
        A ``(group, label)`` tuple: a 4x2 float array of sample points and a
        4-element array with the class label ('A' or 'B') of each row.
    """
    samples = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0.0, 1.0], 'B'),
        ([0.0, 1.1], 'B'),
    ]
    points = array([point for point, _ in samples])
    tags = array([tag for _, tag in samples])
    return points, tags
def classify(input, dataset, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between ``input`` and every row of
    ``dataset``.

    Args:
        input: Feature vector of the sample to classify; any sequence or
            array that broadcasts against the rows of ``dataset``.
        dataset: 2-D array with one training sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to
            ``dataset[i]``.
        k: Number of nearest neighbours that vote. A value larger than the
            number of training samples is clamped to that number.

    Returns:
        The label that received the most votes. On a tie, the label whose
        first vote came from the closest neighbour wins.

    Raises:
        ValueError: If ``dataset`` has no rows or ``k`` is smaller than 1.
    """
    size = dataset.shape[0]
    if size == 0:
        raise ValueError("dataset must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Never ask for more neighbours than there are training samples.
    k = min(k, size)

    # Broadcasting subtracts the query from every training row at once.
    diffmat = asarray(input) - dataset
    distances = (diffmat ** 2).sum(axis=1) ** 0.5

    # argsort yields indices ordered from the nearest to the farthest sample.
    sortedindex = distances.argsort()

    classcount = {}
    for index in sortedindex[:k]:
        votelabel = labels[index]
        classcount[votelabel] = classcount.get(votelabel, 0) + 1

    # max() returns the first maximal key in insertion order, i.e. the label
    # that was voted for by the closest neighbour among the tied ones.
    return max(classcount, key=classcount.get)
if __name__ == '__main__':
    # Classify one query point against the toy data set using its three
    # nearest neighbours and print the predicted label.
    samples, sample_labels = createDataSet()
    print(classify([1.0, 1.2], samples, sample_labels, 3))
测试二
约会网站配对示例代码,kNN_2.py。
from numpy import *
import matplotlib
import matplotlib.pyplot as plt
from kNN_1 import *
def file2Matrix(filepath):
    """Load a tab-separated sample file into a feature matrix and label list.

    Each non-blank line holds tab-separated fields: the first three are
    parsed as float features and the last one as an integer class label.
    Blank lines are skipped.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A ``(matrix, labels)`` tuple: an ``(n, 3)`` float array with one row
        per sample and a list of ``n`` integer labels in file order.

    Raises:
        ValueError: If a feature or label field cannot be parsed as a number.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filepath) as handle:
        rows = [line.strip().split("\t") for line in handle if line.strip()]

    matrix = zeros((len(rows), 3))
    labels = []
    for i, fields in enumerate(rows):
        matrix[i, :] = [float(field) for field in fields[:3]]
        labels.append(int(fields[-1]))
    return matrix, labels
def datasetAnalysis(dataset, labels):
    """Show a scatter plot of the first two feature columns.

    Both the marker size and the marker colour of each point are set to
    15.0 times its label value, so the classes are visually separable.

    Args:
        dataset: 2-D array; column 0 is plotted on x and column 1 on y.
        labels: Numeric label per row of ``dataset``.
    """
    # One scaled copy of the labels drives both size and colour.
    scaled = 15.0 * array(labels)

    figure = plt.figure()
    # 111 = one row, one column, first (only) subplot.
    axes = figure.add_subplot(111)
    axes.scatter(dataset[:, 0], dataset[:, 1], s=scaled, c=scaled)
    plt.show()
def autoNorm(dataset):
    """Min-max normalise every column of ``dataset``.

    Each value is mapped to ``(value - column_min) / column_range``. A
    column whose values are all equal has a range of zero; it is divided by
    1 instead, so it normalises to all zeros rather than NaN.

    Args:
        dataset: 2-D numeric array with one sample per row.

    Returns:
        A ``(returnmatrix, ranges, minvals)`` tuple: the normalised array,
        the per-column range (max - min, zero for constant columns) and the
        per-column minimum.
    """
    minvals = dataset.min(axis=0)
    maxvals = dataset.max(axis=0)
    ranges = maxvals - minvals

    # Guard against division by zero for constant columns; the unmodified
    # ranges are still what gets returned to the caller.
    safe_ranges = where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column offset and scale to every row.
    returnmatrix = (dataset - minvals) / safe_ranges
    return returnmatrix, ranges, minvals
def datasetTest(dataset, labels, testratio=0.10, k=10):
    """Measure the kNN error rate with a simple hold-out split.

    The first ``int(n * testratio)`` rows are the test samples; the
    remaining rows (and their labels) are the training set passed to
    ``classify``. Every misclassified sample is printed, followed by a
    summary line.

    Args:
        dataset: 2-D array with one sample per row.
        labels: Sequence of labels aligned with the rows of ``dataset``.
        testratio: Fraction of leading rows held out for testing.
        k: Number of neighbours passed to ``classify``.

    Returns:
        The error rate (misclassified / tested), or 0.0 when the hold-out
        set is empty.
    """
    allnum = dataset.shape[0]
    testnum = int(allnum * testratio)
    error = 0
    for i in range(testnum):
        res = classify(dataset[i], dataset[testnum:allnum], labels[testnum:allnum], k)
        if res != labels[i]:
            print("pre: ", res, "======> gt: ", labels[i])
            error += 1

    # A tiny data set can produce an empty hold-out set; avoid dividing by 0.
    ratio = error / testnum if testnum else 0.0
    print("error: ", error, ", ratio: ", ratio)
    return ratio
if __name__ == '__main__':
    # Load the dating data set: feature matrix plus one label per row.
    features, targets = file2Matrix("/home/tgj/machinelearninginaction/Ch02/datingTestSet2.txt")
    print(features, targets)

    # Optional visual inspection of the raw data:
    # datasetAnalysis(features, targets)

    # Normalise the features; only the normalised matrix is needed here,
    # the ranges and minimums returned alongside it are discarded.
    scaled = autoNorm(features)[0]

    # Run the hold-out evaluation on the normalised data.
    datasetTest(scaled, targets)
测试三
MNIST手写数字识别示例,kNN_3.py。
from numpy import *
import matplotlib, os
import matplotlib.pyplot as plt
from kNN_1 import *
def img2Vector(digit_dir):
    """Load every digit file in a directory into a flat feature matrix.

    Each file is read as text lines of digit characters; line ``i`` fills
    columns ``i*32`` to ``(i+1)*32`` of that file's 1024-wide row. The class
    label is the integer before the first ``_`` in the file name. Files are
    processed in sorted name order so the row order is reproducible.

    Args:
        digit_dir: Directory containing the digit text files.

    Returns:
        A ``(returnmatrix, labels)`` tuple: a ``(number_of_files, 1024)``
        float array and the list of integer labels, row-aligned.

    Raises:
        ValueError: If a file name has no integer prefix or a line contains
            a non-digit character or has the wrong width.
    """
    # os.listdir returns names in arbitrary order; sort for determinism.
    files = sorted(os.listdir(digit_dir))
    labels = []
    returnmatrix = zeros((len(files), 1024))
    for row, filename in enumerate(files):
        labels.append(int(filename.split("_")[0]))
        # The context manager closes each file as soon as it has been read.
        with open(os.path.join(digit_dir, filename)) as handle:
            for i, line in enumerate(handle):
                digits = line.strip()
                if not digits:
                    # Blank (e.g. trailing) lines carry no pixels.
                    continue
                returnmatrix[row, i * 32:(i + 1) * 32] = [int(ch) for ch in digits]
    return returnmatrix, labels
def datasetTest(testpath, dataset, labels, k=10):
    """Classify every digit file in ``testpath`` and report the error rate.

    Each file is converted to a ``(1, 1024)`` vector (line ``i`` fills
    columns ``i*32`` to ``(i+1)*32``) and passed to ``classify`` together
    with the training data. The expected label is the integer before the
    first ``_`` in the file name. Every misclassification is printed,
    followed by a summary line.

    Args:
        testpath: Directory containing the digit text files to classify.
        dataset: Training feature matrix passed to ``classify``.
        labels: Training labels aligned with the rows of ``dataset``.
        k: Number of neighbours passed to ``classify``.

    Returns:
        The error rate (misclassified / tested), or 0.0 when ``testpath``
        contains no files.
    """
    files = os.listdir(testpath)
    error = 0
    testnum = len(files)
    for filename in files:
        label = filename.split("_")[0]
        testvec = zeros((1, 1024))
        # The context manager closes each file as soon as it has been read.
        with open(os.path.join(testpath, filename)) as handle:
            for i, line in enumerate(handle):
                digits = line.strip()
                if digits:
                    testvec[0, i * 32:(i + 1) * 32] = [int(ch) for ch in digits]
        pre = classify(testvec, dataset, labels, k)
        if pre != int(label):
            error += 1
            print("pre: ", pre, " ===> gt: ", label)

    # An empty directory would otherwise divide by zero.
    ratio = error / testnum if testnum else 0.0
    print("error: ", error, ", ratio: ", ratio)
    return ratio
if __name__ == '__main__':
    # Build the training matrix and labels from the training digit files.
    returnmatrix, label = img2Vector("/home/tgj/machinelearninginaction/Ch02/digits/trainingDigits")
    # Evaluate on the held-out test digits. Pointing this at the training
    # directory would score the classifier on samples it already contains.
    # NOTE(review): assumes a sibling "testDigits" directory exists - confirm.
    testdir = "/home/tgj/machinelearninginaction/Ch02/digits/testDigits"
    datasetTest(testdir, returnmatrix, label)