'''kNN 分类步骤:
1计算训练集中数据与当前点之间的距离
2按距离依次递增排序
3选取与当前距离最近的k个点
4确定这k个点所在类别的出现频率
5返回前k个点出现频率最高的类别作为当前点的预测分类'''
from numpy import *
import operator
def classify(inX, data_set, labels, k):
    """Classify inX with the k-nearest-neighbors vote.

    Computes the Euclidean distance from inX to every row of
    data_set, takes the k closest rows, and returns the label that
    occurs most often among them.

    Args:
        inX: the sample to classify (sequence of features).
        data_set: training samples, one per row.
        labels: class label for each row of data_set.
        k: number of neighbors that vote.

    Returns:
        The winning label.
    """
    # Distance from inX to every training sample in one vectorized pass.
    deltas = tile(inX, (len(data_set), 1)) - data_set
    dists = (deltas ** 2).sum(axis=1) ** 0.5
    # Indices of the training samples ordered nearest-first.
    nearest = dists.argsort()
    # Tally the labels of the k closest samples.
    votes = {}
    for idx in nearest[:k]:
        neighbor_label = labels[idx]
        votes[neighbor_label] = votes.get(neighbor_label, 0) + 1
    # max() returns the first-encountered label on a tie, matching a
    # stable descending sort on the vote counts.
    return max(votes.items(), key=operator.itemgetter(1))[0]
# Demo: four 2-D training points, two per class.
group=array([[1.0, 1.1], [1.0, 1.0], [0,0], [0, 0.1]])
labers=['A', 'A', 'B', 'B']  # class label for each row of group
k=3  # number of neighbors that vote
inx=[0, 0]  # query point to classify
print(classify(inx, group, labers,k))  # expected output: B
'''输出:
B
(即inx属于B这个类别)'''
'''约会网站的配对'''
from numpy import *
import operator
def file2_matrix(file_name):
    """Parse a tab-separated dating-data file.

    Each line holds three numeric feature columns followed by an
    integer class label.

    Args:
        file_name: path to the data file.

    Returns:
        (return_mat, class_laber_vector): an (n, 3) float matrix of
        features and a list of n int labels.
    """
    # 'with' guarantees the file is closed; the original wrote
    # 'fr.close' without parentheses, so it never actually closed it.
    with open(file_name) as fr:
        array_of_line = fr.readlines()
    return_mat = zeros((len(array_of_line), 3))
    class_laber_vector = []
    for index, line in enumerate(array_of_line):
        list_from_line = line.strip().split('\t')
        # Assign exactly one row; the original 'return_mat[index:]'
        # broadcast the row to every remaining row on each pass
        # (correct final result only by accident, and O(n^2)).
        return_mat[index, :] = list_from_line[0:3]
        class_laber_vector.append(int(list_from_line[-1]))
    return return_mat, class_laber_vector
# Load the dating data set (hard-coded Windows path -- adjust locally).
dating_data_mat, dating_laber=file2_matrix(r"C:\Users\Administrator\Desktop\机器学习\机器学习实战(数据)\Ch02\datingTestSet2.txt")
import matplotlib #scatter-plot section
import matplotlib.pyplot as plt
fig=plt.figure()
ax=fig.add_subplot(111) #add_subplot splits the canvas: 1x1 grid, first plot
ax.scatter(dating_data_mat[:,1], dating_data_mat[:,2], #plot columns 2 and 3 of the matrix
15.0*array(dating_laber), 15.0*array(dating_laber)) #marker size and color scaled by class label
plt.show()
def auto_norm(data_set):
    """Scale every feature column of data_set into [0, 1].

    Normalization gives all features equal weight in the distance
    computation regardless of their raw magnitudes:
    norm = (value - column_min) / (column_max - column_min).

    Args:
        data_set: 2-D numeric array, one sample per row.

    Returns:
        (norm_data_set, ranges, min_vals): the normalized matrix,
        the per-column range, and the per-column minimum.
    """
    min_vals = data_set.min(0)  # per-column minima
    max_vals = data_set.max(0)  # per-column maxima
    ranges = max_vals - min_vals
    # NumPy broadcasting applies the per-column min/range to every
    # row, so the original's dead zeros() allocation and the two
    # tile() copies are unnecessary.
    # NOTE(review): a constant column (range 0) divides by zero here,
    # exactly as in the original.
    norm_data_set = (data_set - min_vals) / ranges
    return norm_data_set, ranges, min_vals
# Normalize the full data set once at module level.
norm_mat, ranges, min_vals=auto_norm(dating_data_mat)
#print(norm_mat)
#print(ranges)
#print(min_vals)
import sys
# Make the kNN module (this same code saved as kNN.py) importable.
sys.path.append(r"C:\Users\Administrator\Desktop\机器学习")
import kNN
def dating_class_test():
    """Evaluate the kNN classifier with a hold-out test.

    The last 90% of the samples form the training set; the first 10%
    are classified and compared against their true labels.

    Returns:
        The error ratio over the test samples (float).  The original
        returned None, so print(dating_class_test()) printed a stray
        'None'.
    """
    ho_ratio = 0.10  # fraction of the samples held out for testing
    dating_data_mat, dating_labers = file2_matrix(
        r"C:\Users\Administrator\Desktop\机器学习\机器学习实战(数据)\Ch02\datingTestSet2.txt")
    norm_mat, ranges, min_vals = auto_norm(dating_data_mat)
    m = len(norm_mat)
    num_test_vecs = int(m * ho_ratio)  # number of test samples
    error_count = 0.0
    for i in range(num_test_vecs):
        # classify(inx, data_set, labels, k): rows past num_test_vecs
        # form the training set, row i is the query.
        class_ifier_result = kNN.classify(norm_mat[i, :],
                                          norm_mat[num_test_vecs:m, :],
                                          dating_labers[num_test_vecs:m], 3)
        # Fixed typo in the message: "anwer" -> "answer".
        print("the classifier came back with: %d, the real answer is: %d"
              % (class_ifier_result, dating_labers[i]))
        if class_ifier_result != dating_labers[i]:
            error_count += 1
    error_ratio = error_count / float(num_test_vecs)
    print("the total error ratio is ", error_ratio)
    return error_ratio
# Run the hold-out evaluation and show its result.
print(dating_class_test())
def class_ify_person():
    """Interactive predictor for the dating example.

    Prompts for the three feature values, classifies the resulting
    sample with kNN, and prints how much the user is likely to like
    the person.  (Prompt-string typos from the original are fixed:
    "pecentage", "spend", "filer", "lites", "icecream consume".)
    """
    # Index result_list with (label - 1): labels are 1-based.
    result_list = ['not at all', 'in small doses', 'in large doses']
    # Feature values typed by the user.
    percent_tats = float(input("percentage of time spent playing video games?"))
    ff_miles = float(input("frequent flier miles earned per year?"))
    ice_cream = float(input("liters of ice cream consumed per year?"))
    dating_data_mat, dating_labers = file2_matrix(
        r"C:\Users\Administrator\Desktop\机器学习\机器学习实战(数据)\Ch02\datingTestSet2.txt")
    norm_mat, ranges, min_vals = auto_norm(dating_data_mat)
    inArr = array([ff_miles, percent_tats, ice_cream])  # query sample
    # Normalize the query with the training min/range before classifying.
    class_ifier_result = kNN.classify((inArr - min_vals) / ranges,
                                      norm_mat, dating_labers, 3)
    print("you will like this person", result_list[class_ifier_result - 1])

class_ify_person()
'''结果输出:
[[ 4.09200000e+04 8.32697600e+00 9.53952000e-01]
[ 1.44880000e+04 7.15346900e+00 1.67390400e+00]
[ 2.60520000e+04 1.44187100e+00 8.05124000e-01]
...,
[ 2.65750000e+04 1.06501020e+01 8.66627000e-01]
[ 4.81110000e+04 9.13452800e+00 7.28045000e-01]
[ 4.37570000e+04 7.88260100e+00 1.33244600e+00]]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]
[[ 0.44832535 0.39805139 0.56233353]
[0.15873259 0.34195467 0.98724416]
[0.28542943 0.06892523 0.47449629]
...,
[0.29115949 0.50910294 0.51079493]
[0.52711097 0.43665451 0.4290048 ]
[0.47940793 0.3768091 0.78571804]]
[ 9.12730000e+04 2.09193490e+01 1.69436100e+00]
[ 0. 0. 0.001156]
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
.....
the total error ratio is 0.05
None
percentage of time spent playing video games?10  <- 输入的用于判定类别的数据
frequent flier miles earned per year?10000
liters of ice cream consumed per year?0.5
you will like this person in small doses
由输出可知 具备该数据特征(10, 10000, 0.5)的人魅力一般'''