（机器学习实战）2.2 使用k-近邻算法改进约会网站的配对效果（详细注释）

最新推荐文章于 2023-05-23 15:34:09 发布

修炼果

最新推荐文章于 2023-05-23 15:34:09 发布

阅读量197

点赞数

分类专栏： 06-机器学习实战+西瓜书

本文链接：https://blog.csdn.net/eettttttt/article/details/80238624

版权

06-机器学习实战+西瓜书专栏收录该内容

6 篇文章 2 订阅

订阅专栏

样本集下载：https://pan.baidu.com/s/1GJoeIRlMLRWbzjsG4EU_RA

运行环境：Python 3.6

运行可用：

import operator 
from numpy import *
from scipy import *
from matplotlib import *



"""
	KNN近邻分类思路：
	1，用shape[0]得到训练集的行数
	2，把待分类数据用tile扩展成与训练集行数相同的矩阵
	3，（该矩阵-训练数据）逐元素平方，按行求和后开方，得到待分类点到每个训练样本的距离
	4，对距离按升序排序（argsort返回的是下标而不是距离）
	5，统计前K个最近样本中各标签出现的次数并按次数降序排列，[0][0]即为出现最多的标签，作为待分类数据的预测标签
    """
#inX为待分类的输入向量，dataSet为训练集，labels为标签集，k为kNN中的K值
#shape[0]得到训练样本的行数
#tile把inX在行方向重复dataSetSize次、列方向重复1次，得到与训练集同形状的矩阵
#sum(axis=1)按行求和
#argsort返回按值递增排序后的下标
def classify0(inX,dataSet,labels,k):
	"""Classify ``inX`` by majority vote among its k nearest training samples.

	Distances are Euclidean, measured between ``inX`` and every row of
	``dataSet``.

	Args:
		inX: Feature vector to classify; it is tiled to the shape of
			``dataSet``, so its length must equal the number of columns.
		dataSet: 2-D numpy array of training samples, one sample per row.
		labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
			of ``dataSet``. Labels are used as dict keys, so they must be
			hashable.
		k: Number of nearest neighbours that vote. A value larger than the
			number of training rows is clamped to that number.

	Returns:
		The label that received the most votes. On a tie the label that was
		voted for first (i.e. by the closest neighbour) wins; this relies on
		dict insertion order and on ``sorted`` being stable.

	Raises:
		IndexError: If no vote is cast (``k`` < 1 or no training rows).
	"""
	dataSetSize=dataSet.shape[0]
	# Repeat inX once per training row so the subtraction is element-wise.
	diffMat=tile(inX,(dataSetSize,1))-dataSet
	distances=((diffMat**2).sum(axis=1))**0.5
	# argsort yields row indices ordered by increasing distance, not distances.
	sortedDistIndicies=distances.argsort()
	# Never ask for more neighbours than exist: the original loop indexed past
	# the end of sortedDistIndicies when k > dataSetSize. A conditional is used
	# instead of min() because `from numpy import *` may shadow the builtin.
	voters=k if k<dataSetSize else dataSetSize
	classCount={}
	for i in range(voters):
		voteIlabel=labels[sortedDistIndicies[i]]
		# get() returns 0 for a label that has not been seen yet.
		classCount[voteIlabel]=classCount.get(voteIlabel,0)+1
	# Sort (label, votes) pairs by votes, highest first; sorted() returns a new
	# list and leaves classCount untouched.
	sortedClassCount=sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
	return sortedClassCount[0][0]




"""
	从文本中解析数据思路：
	1，读取文件
	2，取得文件所有行，取得行数
	3，建立除标签外同等大小的矩阵returnMat
	4，处理文件前后回车
	5，把处理后的类依次放入矩阵中
	6，把标签放入列表中
	7，返回矩阵和列表
	"""
#open() 函数用于打开一个文件，创建一个 file 对象，相关的方法才可以调用它进行读写。
#file.readline() 返回一行
#file.readlines([hint]) 返回由文件各行组成的列表；hint是已读取内容总大小的上限提示（不是行数），未指定则返回全部行
def file2matrix(filename):
	"""Parse a tab-separated data file into a feature matrix and a label list.

	On every non-blank line the first three tab-separated fields are read as
	numeric features and the last field is kept as the class label. Blank
	lines are ignored.

	Args:
		filename: Path of the text file to read.

	Returns:
		A tuple ``(returnMat, classLabelVector)``: an ``(n, 3)`` numpy float
		array with one row per data line, and a list with the ``n`` labels as
		raw strings (wrap them in ``int()`` yourself if the file stores
		numeric class codes).

	Raises:
		ValueError: If a feature field cannot be parsed as a float.
	"""
	# The context manager closes the file even when parsing fails; the original
	# never closed the handle.
	with open(filename) as fr:
		# strip() removes the surrounding whitespace/newline. Blank lines (for
		# example a trailing empty line) are dropped up front so they neither
		# break the numeric conversion nor enlarge the matrix.
		arrayOlines=[line.strip() for line in fr if line.strip()]
	numberOflines=len(arrayOlines)
	returnMat=zeros((numberOflines,3))    # one row per sample, three features
	classLabelVector=[]
	for index,line in enumerate(arrayOlines):
		listFromLine=line.split('\t')
		returnMat[index,:]=[float(value) for value in listFromLine[0:3]]
		classLabelVector.append(listFromLine[-1])
	return returnMat,classLabelVector

#显示一下
#datingDataMat,datingLabels=file2matrix('datingTestSet2.txt')
#print(datingDataMat)
#print(datingLabels)




"""
	图片显示思路：
	1,导入库
	2,建立图片
	3，图片在画布上的放置
	4，scatter依赖特征确定位置和标签来确定颜色
	5，显示
	"""
# ~ print("**********************************************")
# ~ import matplotlib.pyplot as plt
# ~ fig=plt.figure()
# ~ ax=fig.add_subplot(121) #111,参数111的意思是：将画布分割成1行1列，图像画在从左到右从上到下的第1块
# ~ ax.scatter(datingDataMat[:,1],datingDataMat[:,2],10.0*array(datingLabels),10.0*array(datingLabels))      #scatter参考链接https://blog.csdn.net/qiu931110/article/details/68130199
# ~ ax=fig.add_subplot(122) 
# ~ ax.scatter(datingDataMat[:,0],datingDataMat[:,2],5.0*array(datingLabels),5.0*array(datingLabels))     
# ~ plt.show()




"""
	归一化思路：
	1，找列中最小值、最大值
	2，（本身值-最小值）/（最大值-最小值）
	3，返回归一化值，范围，最小值
	"""
# Print a visual separator line; this runs at module level, i.e. every time
# the file is executed or imported.
print("**********************************************")
# Normalize the feature values
def autoNorm(dataSet):
	"""Rescale every column of ``dataSet`` to the range [0, 1].

	Each value is mapped to ``(value - columnMin) / (columnMax - columnMin)``.

	Args:
		dataSet: 2-D numpy array, one sample per row and one feature per
			column.

	Returns:
		A tuple ``(normDataSet, ranges, minVals)``:
		normDataSet is an array shaped like ``dataSet`` with the scaled
		values (a constant column, where max == min, becomes all zeros);
		ranges is the per-column ``max - min`` (0 for a constant column);
		minVals is the per-column minimum.
	"""
	minVals=dataSet.min(0)      # axis 0: minimum of every column
	maxVals=dataSet.max(0)
	ranges=maxVals-minVals
	# A constant column has range 0, and dividing by it produced NaN (0/0) in
	# the original. Adding the boolean mask turns exactly those zero divisors
	# into 1 and leaves every other range unchanged.
	divisor=ranges+(ranges==0)
	# Broadcasting applies the per-column vectors to every row, so no explicit
	# tile() copies are needed.
	normDataSet=(dataSet-minVals)/divisor
	return normDataSet,ranges,minVals
#显示一下
# ~ normDataSet,ranges,minVals=autoNorm(datingDataMat)
# ~ print(normDataSet)
# ~ print("**********************************************")
# ~ print(ranges)
# ~ print(minVals)




"""
	分类器构建思路：
	1，找到文档的数据集和标签
	2，归一化
	3，取百分之10作为测试集
	4，用分类函数分类
	5，统计分类错误率
	"""
def datingClassTest(hoRatio=0.10,filename='datingTestSet.txt',k=3):
	"""Print the kNN error rate measured on a hold-out slice of a data file.

	The file is parsed with ``file2matrix`` and normalized with ``autoNorm``.
	The first ``int(rows * hoRatio)`` rows are then classified with
	``classify0`` against all remaining rows. Every prediction is printed
	next to the true label, followed by the overall error rate.

	Args:
		hoRatio: Fraction of the rows (taken from the top of the file) used as
			test vectors.
		filename: Path of the tab-separated data file.
		k: Number of neighbours passed to ``classify0``.

	Raises:
		ValueError: If the split leaves no test rows or no training rows; the
			original divided by zero when there were no test rows.
	"""
	datingDataMat,datingLabels=file2matrix(filename)
	normMat,ranges,minVals=autoNorm(datingDataMat)
	m=normMat.shape[0]
	numTestVecs=int(m*hoRatio)
	if numTestVecs<1 or numTestVecs>=m:
		raise ValueError(
			'hoRatio=%r gives %d test rows and %d training rows; both must be at least 1'
			%(hoRatio,numTestVecs,m-numTestVecs))
	# The training slice is the same for every test vector, so build it once.
	trainMat=normMat[numTestVecs:m,:]
	trainLabels=datingLabels[numTestVecs:m]
	errorCount=0
	for i in range(numTestVecs):
		classifierResult=classify0(normMat[i,:],trainMat,trainLabels,k)
		# %s formats string labels and integer labels alike.
		print ("the classfier come back with: %s,the real answer is :%s"\
			%(classifierResult,datingLabels[i]))
		if classifierResult!=datingLabels[i]:
			errorCount+=1
	print ("the total error rate is :%f" %(errorCount/float(numTestVecs)))

# ~ datingClassTest()


"""
	约会网站预测函数：
	1,输入特征值
	2、建立特征值数组
	3、归一化特征值
	4、预测
	"""
def classifyPerson():
	"""Interactively predict how much the user would like a person.

	Asks for three feature values on standard input, scales them with the
	column minimum and range of the training data in 'datingTestSet.txt',
	classifies the scaled vector with the 3 nearest neighbours and prints the
	predicted label.
	"""
	# Names for numeric class codes. Not consulted below, because the label
	# returned by classify0 is printed as-is.
	resultList=['not at all','in small doses','in large doses']
	gameShare=float(input('percentage of time spent playing video games?'))
	flierMiles=float(input('frequent filer miles eared per year?'))
	iceCreamLitres=float(input('liters of ice cream consumed per year?'))
	trainingMat,trainingLabels=file2matrix('datingTestSet.txt')
	scaledMat,spans,lows=autoNorm(trainingMat)
	# NOTE(review): assumes the data file columns are ordered miles, game
	# time, ice cream - confirm against the data set.
	query=array([flierMiles,gameShare,iceCreamLitres])
	# Scale the query exactly like the training rows before measuring distance.
	scaledQuery=(query-lows)/spans
	prediction=classify0(scaledQuery,scaledMat,trainingLabels,3)
	print('you will probably like this person:%s'%prediction)
# Start the interactive predictor only when the file is run as a script, so
# that importing it as a module does not block on input().
if __name__ == "__main__":
	classifyPerson()

修炼果

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
（机器学习实战）2.2 使用k-近邻算法改进约会网站的配对效果（详细注释）

样本集下载：https://pan.baidu.com/s/1GJoeIRlMLRWbzjsG4EU_RA编译：python3.6运行可用：import operator from numpy import *from scipy import *from matplotlib import *""" KNN近邻分类思路： 1，用shape[0]得到行数 2，把带划分数据用ti...
复制链接

扫一扫