机器学习算法:第二章 knn算法
2.1
knn.py
from numpy import *
import operator
#
# def classify0(inX, dataSet, labels, k):
# dataSet_Size = dataSet.shape[0]
# diffMat = tile(inX, (dataSet_Size,1)) - dataSet
# sqDiffMat = diffMat**2
# sqDistances = sqDiffMat.sum(axis=1)
# distances = sqDistances**0.5
# sortedDistindicies = distances.argsort()
# classCount={}
# for i in range(k):
# voteIlabel = labels[sortedDistindicies[i]]
# classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1
# sortedClassCount= sorted(classCount.iteritems(),key=operator.itemgetter(1), reverse=True)
# return sortedClassCount[0][0]
def classify0(X, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean, measured between ``X`` and every row of
    ``dataSet``.

    :param X: feature vector to classify; a flat sequence of n values or a
        1 x n array.
    :param dataSet: training samples as an m x n array, one sample per row.
    :param labels: sequence of m class labels; ``labels[i]`` belongs to
        ``dataSet[i]``.
    :param k: number of nearest neighbours that vote; values larger than m
        are reduced to m.
    :return: the label with the most votes. On a tie the label that received
        its first vote from the nearer neighbour wins.
    :raises ValueError: if ``dataSet`` has no rows or ``k`` is less than 1.
    """
    data_size = dataSet.shape[0]
    if data_size == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Never ask for more neighbours than there are training samples.
    if k > data_size:
        k = data_size

    # Broadcasting subtracts X from every row without building a tiled copy.
    diff_mat = asarray(X) - dataSet
    distances = ((diff_mat ** 2).sum(axis=1)) ** 0.5
    # argsort yields row indices ordered from nearest to farthest.
    nearest = distances.argsort()

    class_count = {}
    for i in range(k):
        vote = labels[nearest[i]]
        class_count[vote] = class_count.get(vote, 0) + 1

    # The sort is stable, so among equal counts the label inserted first
    # (i.e. seen at the smaller distance) stays in front.
    ranked = sorted(class_count.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
def createDataSet():
    """Return the four-point toy training set together with its class labels.

    :return: ``(group, labels)`` - a 4 x 2 array of points and the list of
        their labels, in matching order.
    """
    samples = (
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    )
    group = array([point for point, _ in samples])
    labels = [tag for _, tag in samples]
    return group, labels
# Demo: classify the point [0, 0] against the toy data set using k = 3.
# NOTE(review): `knn` is not imported in this snippet - it needs `import knn` first.
group, labels = knn.createDataSet()
#print(group)
#print(labels)
result = knn.classify0([0,0], group, labels,3)
print(result)
B
2.2 使用 k-近邻算法改进约会网站的配对结果
过程:
1.从文本解析数据
文本转化为numpy的程序:
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must hold at least three numeric feature columns
    and an integer class label in its last column. Blank lines are skipped.

    :param filename: path of the text file to read.
    :return: ``(Mat, label_vector)`` - an n x 3 float array with the first
        three columns of each line, and a list of the n integer labels.
    """
    # The with-block closes the file even if parsing fails later on.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    Mat = zeros((len(rows), 3))
    label_vector = []
    for index, fields in enumerate(rows):
        Mat[index, :] = [float(value) for value in fields[0:3]]
        label_vector.append(int(fields[-1]))
    return Mat, label_vector
分析数据:使用matplotlib创建散点图
from numpy import *
import matplotlib
import matplotlib.pyplot as plt
# Select the SimHei font for the sans-serif family
# (presumably so the Chinese axis labels below render - confirm).
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
filename = 'datingTestSet2.txt'
# NOTE(review): `knn` is not imported in this snippet - it needs `import knn` first.
data_mat, labels = knn.file2matrix(filename)
fig = plt.figure(figsize=(20,6),dpi=180)
ax = fig.add_subplot(111)
# plt.plot(data_mat[:,1],data_mat[:,2])
# plt.scatter(data_mat[:,1],data_mat[:,2])
# Plot column 1 against column 2; 15 * label is passed both as the marker
# size and as the colour value, so each class gets its own size and colour.
ax.scatter(data_mat[:,1],data_mat[:,2],15.0*array(labels), 15.0*array(labels))
plt.xlabel('玩视频游戏所耗时间百分比')
plt.ylabel('每周所消费的冰淇淋公升数')
plt.show()
数据归一化:
def autoNorm(data):
    """Rescale every column of ``data`` linearly into the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.

    :param data: 2-D numeric array, one sample per row.
    :return: ``(norm_data, ranges, min_vals)`` - the rescaled array, the
        per-column ``max - min`` and the per-column minimum.
    """
    min_vals = data.min(0)
    ranges = data.max(0) - min_vals
    # A constant column has range 0; divide by 1 there so its values become
    # 0 instead of nan. The returned ``ranges`` keeps the true value.
    safe_ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    norm_data = (data - min_vals) / safe_ranges
    return norm_data, ranges, min_vals
修改后更简洁版:
def autoNorm(data):
    """Rescale every column of ``data`` linearly into the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.

    :param data: 2-D numeric array, one sample per row.
    :return: ``(norm_data, ranges, min_vals)`` - the rescaled array, the
        per-column ``max - min`` and the per-column minimum.
    """
    min_vals = data.min(0)
    ranges = data.max(0) - min_vals
    # A constant column has range 0; divide by 1 there so its values become
    # 0 instead of nan. The returned ``ranges`` keeps the true value.
    safe_ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    norm_data = (data - min_vals) / safe_ranges
    return norm_data, ranges, min_vals
约会网站预测函数:
def data_class_test(filename="datingTestSet2.txt", ratio=0.1, k=4):
    """Run a hold-out evaluation of the kNN classifier on a dating data file.

    The file is parsed with ``file2matrix`` and normalised with ``autoNorm``.
    The first ``int(m * ratio)`` rows are classified with ``classify0``, using
    the remaining rows as the training set. Each prediction is printed next
    to the true label, followed by the overall error rate.

    :param filename: data file to evaluate.
    :param ratio: share of the rows (taken from the top) used as test samples.
    :param k: number of neighbours passed to ``classify0``.
    :return: the error rate as a float.
    :raises ValueError: if ``ratio`` leaves no rows to test.
    """
    data_mat, data_labels = file2matrix(filename)
    norm_mat = autoNorm(data_mat)[0]
    m = norm_mat.shape[0]
    vec_num = int(m * ratio)
    if vec_num == 0:
        # Without this guard the final division would fail with ZeroDivisionError.
        raise ValueError("ratio leaves no samples to test")

    train_mat = norm_mat[vec_num:m, :]
    train_labels = data_labels[vec_num:m]
    error_count = 0
    for i in range(vec_num):
        classifier_result = classify0(norm_mat[i, :], train_mat, train_labels, k)
        print("分类结果:%d , 真实的结果:%d"%(classifier_result, data_labels[i]))
        if classifier_result != data_labels[i]:
            error_count += 1

    error_rate = error_count / float(vec_num)
    print("误差:%f" %(error_rate))
    return error_rate
2.手写字体系统:
from os import listdir
def hand_writing_test(train_dir='trainingDigits', test_dir='testDigits', k=3):
    """Evaluate the kNN classifier on the handwritten-digit text images.

    Every ``.txt`` file in ``train_dir`` becomes one training row. Every
    ``.txt`` file in ``test_dir`` is classified with ``classify0`` and
    compared with the digit encoded in its file name (the part before the
    first ``_``). Each prediction is printed, followed by the error count
    and the error rate.

    :param train_dir: directory holding the training images.
    :param test_dir: directory holding the test images.
    :param k: number of neighbours passed to ``classify0``.
    :return: the error rate as a float.
    :raises ValueError: if either directory holds no ``.txt`` files.
    """
    # Only .txt entries are digit images; anything else in the directory
    # would fail label parsing.
    file_list = [name for name in listdir(train_dir) if name.endswith('.txt')]
    test_file_list = [name for name in listdir(test_dir) if name.endswith('.txt')]
    if not file_list or not test_file_list:
        raise ValueError("training and test directories must both contain .txt files")

    labels = []
    training_mat = zeros((len(file_list), 1024))
    for i, filename in enumerate(file_list):
        file_str = filename.split('.')[0]
        labels.append(int(file_str.split('_')[0]))
        training_mat[i, :] = img2vector('%s/%s' % (train_dir, filename))

    error_count = 0.0
    m_test = len(test_file_list)
    # Visit the test files in random order.
    # NOTE(review): `random` is presumably numpy.random from the star import;
    # no `import random` is visible in this file - confirm.
    index = list(range(m_test))
    random.shuffle(index)
    for i in index:
        filename = test_file_list[i]
        label = int(filename.split('.')[0].split('_')[0])
        test_vector = img2vector('%s/%s' % (test_dir, filename))
        classifier_result = classify0(test_vector, training_mat, labels, k)
        print("分类结果:%d , 真实的结果:%d" % (classifier_result, label))
        if classifier_result != label:
            error_count += 1.0

    error_rate = error_count / float(m_test)
    print("误差数:%f" % error_count )
    print("误差率:%f" % (error_rate))
    return error_rate
使用
knn.hand_writing_test()
运行可得:
完整代码如下:
knn.py
from numpy import *
import operator
#
# def classify0(inX, dataSet, labels, k):
# dataSet_Size = dataSet.shape[0]
# diffMat = tile(inX, (dataSet_Size,1)) - dataSet
# sqDiffMat = diffMat**2
# sqDistances = sqDiffMat.sum(axis=1)
# distances = sqDistances**0.5
# sortedDistindicies = distances.argsort()
# classCount={}
# for i in range(k):
# voteIlabel = labels[sortedDistindicies[i]]
# classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1
# sortedClassCount= sorted(classCount.iteritems(),key=operator.itemgetter(1), reverse=True)
# return sortedClassCount[0][0]
def classify0(X, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean, measured between ``X`` and every row of
    ``dataSet``.

    :param X: feature vector to classify; a flat sequence of n values or a
        1 x n array.
    :param dataSet: training samples as an m x n array, one sample per row.
    :param labels: sequence of m class labels; ``labels[i]`` belongs to
        ``dataSet[i]``.
    :param k: number of nearest neighbours that vote; values larger than m
        are reduced to m.
    :return: the label with the most votes. On a tie the label that received
        its first vote from the nearer neighbour wins.
    :raises ValueError: if ``dataSet`` has no rows or ``k`` is less than 1.
    """
    data_size = dataSet.shape[0]
    if data_size == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Never ask for more neighbours than there are training samples.
    if k > data_size:
        k = data_size

    # Broadcasting subtracts X from every row without building a tiled copy.
    diff_mat = asarray(X) - dataSet
    distances = ((diff_mat ** 2).sum(axis=1)) ** 0.5
    # argsort yields row indices ordered from nearest to farthest.
    nearest = distances.argsort()

    class_count = {}
    for i in range(k):
        vote = labels[nearest[i]]
        class_count[vote] = class_count.get(vote, 0) + 1

    # The sort is stable, so among equal counts the label inserted first
    # (i.e. seen at the smaller distance) stays in front.
    ranked = sorted(class_count.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
def createDataSet():
    """Return the four-point toy training set together with its class labels.

    :return: ``(group, labels)`` - a 4 x 2 array of points and the list of
        their labels, in matching order.
    """
    samples = (
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    )
    group = array([point for point, _ in samples])
    labels = [tag for _, tag in samples]
    return group, labels
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must hold at least three numeric feature columns
    and an integer class label in its last column. Blank lines are skipped.

    :param filename: path of the text file to read.
    :return: ``(Mat, label_vector)`` - an n x 3 float array with the first
        three columns of each line, and a list of the n integer labels.
    """
    # The with-block closes the file even if parsing fails later on.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    Mat = zeros((len(rows), 3))
    label_vector = []
    for index, fields in enumerate(rows):
        Mat[index, :] = [float(value) for value in fields[0:3]]
        label_vector.append(int(fields[-1]))
    return Mat, label_vector
def autoNorm(data):
    """Rescale every column of ``data`` linearly into the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.

    :param data: 2-D numeric array, one sample per row.
    :return: ``(norm_data, ranges, min_vals)`` - the rescaled array, the
        per-column ``max - min`` and the per-column minimum.
    """
    min_vals = data.min(0)
    ranges = data.max(0) - min_vals
    # A constant column has range 0; divide by 1 there so its values become
    # 0 instead of nan. The returned ``ranges`` keeps the true value.
    safe_ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    norm_data = (data - min_vals) / safe_ranges
    return norm_data, ranges, min_vals
# def autoNorm_(data):
# min = data.min(0)
# max = data.max(0)
# norm_data = zeros(shape(data))
# m = data.shape[0]
# norm_data = (data - tile(min, (m, 1)))/tile(max-min, (m,1))
# return norm_data, max - min, min
def data_class_test(filename="datingTestSet2.txt", ratio=0.1, k=4):
    """Run a hold-out evaluation of the kNN classifier on a dating data file.

    The file is parsed with ``file2matrix`` and normalised with ``autoNorm``.
    The first ``int(m * ratio)`` rows are classified with ``classify0``, using
    the remaining rows as the training set. Each prediction is printed next
    to the true label, followed by the overall error rate.

    :param filename: data file to evaluate.
    :param ratio: share of the rows (taken from the top) used as test samples.
    :param k: number of neighbours passed to ``classify0``.
    :return: the error rate as a float.
    :raises ValueError: if ``ratio`` leaves no rows to test.
    """
    data_mat, data_labels = file2matrix(filename)
    norm_mat = autoNorm(data_mat)[0]
    m = norm_mat.shape[0]
    vec_num = int(m * ratio)
    if vec_num == 0:
        # Without this guard the final division would fail with ZeroDivisionError.
        raise ValueError("ratio leaves no samples to test")

    train_mat = norm_mat[vec_num:m, :]
    train_labels = data_labels[vec_num:m]
    error_count = 0
    for i in range(vec_num):
        classifier_result = classify0(norm_mat[i, :], train_mat, train_labels, k)
        print("分类结果:%d , 真实的结果:%d"%(classifier_result, data_labels[i]))
        if classifier_result != data_labels[i]:
            error_count += 1

    error_rate = error_count / float(vec_num)
    print("误差:%f" %(error_rate))
    return error_rate
def img2vector(filename):
    """Flatten a 32 x 32 text image of digit characters into a row vector.

    :param filename: path of a text file whose first 32 lines each start
        with 32 digit characters.
    :return: 1 x 1024 float array; element ``32 * i + j`` holds the digit
        found at line ``i``, column ``j``.
    """
    returnvect = zeros((1, 32 * 32))
    # The with-block closes the file once the 32 lines have been read.
    with open(filename) as f:
        for i in range(32):
            row = f.readline()
            for j in range(32):
                returnvect[0, 32 * i + j] = int(row[j])
    return returnvect
from os import listdir
def hand_writing_test(train_dir='trainingDigits', test_dir='testDigits', k=3):
    """Evaluate the kNN classifier on the handwritten-digit text images.

    Every ``.txt`` file in ``train_dir`` becomes one training row. Every
    ``.txt`` file in ``test_dir`` is classified with ``classify0`` and
    compared with the digit encoded in its file name (the part before the
    first ``_``). Each prediction is printed, followed by the error count
    and the error rate.

    :param train_dir: directory holding the training images.
    :param test_dir: directory holding the test images.
    :param k: number of neighbours passed to ``classify0``.
    :return: the error rate as a float.
    :raises ValueError: if either directory holds no ``.txt`` files.
    """
    # Only .txt entries are digit images; anything else in the directory
    # would fail label parsing.
    file_list = [name for name in listdir(train_dir) if name.endswith('.txt')]
    test_file_list = [name for name in listdir(test_dir) if name.endswith('.txt')]
    if not file_list or not test_file_list:
        raise ValueError("training and test directories must both contain .txt files")

    labels = []
    training_mat = zeros((len(file_list), 1024))
    for i, filename in enumerate(file_list):
        file_str = filename.split('.')[0]
        labels.append(int(file_str.split('_')[0]))
        training_mat[i, :] = img2vector('%s/%s' % (train_dir, filename))

    error_count = 0.0
    m_test = len(test_file_list)
    # Visit the test files in random order.
    # NOTE(review): `random` is presumably numpy.random from the star import;
    # no `import random` is visible in this file - confirm.
    index = list(range(m_test))
    random.shuffle(index)
    for i in index:
        filename = test_file_list[i]
        label = int(filename.split('.')[0].split('_')[0])
        test_vector = img2vector('%s/%s' % (test_dir, filename))
        classifier_result = classify0(test_vector, training_mat, labels, k)
        print("分类结果:%d , 真实的结果:%d" % (classifier_result, label))
        if classifier_result != label:
            error_count += 1.0

    error_rate = error_count / float(m_test)
    print("误差数:%f" % error_count )
    print("误差率:%f" % (error_rate))
    return error_rate
knn_main.py
import knn
from numpy import *
import matplotlib
import matplotlib.pyplot as plt
# Select the SimHei font for the sans-serif family
# (presumably so Chinese plot labels render - confirm).
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
#----------------convert file to matrix--------------------
# group, labels = knn.createDataSet()
# print(group)
# print(labels)
# result = knn.classify0([0,0], group, labels,3)
# print(result)
# filename = 'datingTestSet2.txt'
# data_mat, labels = knn.file2matrix(filename)
#
#----------------visualisation----------------
# # print("data_mat:",data_mat[1,:])
# # print("labels:",labels[1:])
#
# fig = plt.figure(figsize=(20,8),dpi=180)
# ax = fig.add_subplot(111)
# # plt.plot(data_mat[:,1],data_mat[:,2])
# # plt.scatter(data_mat[:,1],data_mat[:,2])
# ax.scatter(data_mat[:,1],data_mat[:,2],15.0*array(labels), 15.0*array(labels))
# plt.xlabel('玩视频游戏所耗时间百分比')
# plt.ylabel('每周所消费的冰淇淋公升数')
# plt.show()
#-----------------normalisation---------------------
# norm_data, ranges, min=knn.autoNorm(data_mat)
# print(norm_data)
# print(ranges)
# print(min)
#---------------dating-site classifier test--------------------
# knn.data_class_test()
# testvector = knn.img2vector('testDigits/0_13.txt')
# print(testvector[0,0:31])
#------------------handwritten digit recognition--------------------
# Only this step is currently enabled; the earlier experiments above are commented out.
knn.hand_writing_test()