# -*- coding: utf-8 -*-
"""
Created on Sun Oct 21 22:44:59 2018
@author: 国涛
"""
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import operator
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    filename: path to the data file; each line holds three numeric
        features followed by a class-label string, separated by tabs.

    Returns (returnMat, classLabelVector):
        returnMat        -- (n, 3) float ndarray of features
        classLabelVector -- list of n label strings (last column of each line)
    """
    # 'with' guarantees the handle is closed even on error
    # (the original opened the file and never closed it).
    with open(filename) as fr:
        arrayOLines = fr.readlines()
    numberOfLines = len(arrayOLines)
    # Pre-allocate the feature matrix: one row per input line.
    returnMat = np.zeros((numberOfLines, 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromLine = line.strip().split('\t')
        # First three columns are the numeric features.
        returnMat[index, :] = listFromLine[0:3]
        # Last column is the class label.
        classLabelVector.append(listFromLine[-1])
    return returnMat, classLabelVector
def createMap(returnMat, classLabelVector):
    """Draw a 3-D scatter plot of the dataset, colored by class label.

    returnMat:        (n, 3) feature matrix, one point per row
    classLabelVector: list of label strings aligned with the rows

    'largeDoses' -> red circles, 'smallDoses' -> blue stars,
    anything else -> green triangles.
    """
    # Map each known label to its (color, marker) pair; unknown labels
    # fall back to the green-triangle style, as in the if/elif chain.
    styles = {'largeDoses': ('r', 'o'), 'smallDoses': ('b', '*')}
    fig = plt.figure()
    axes = fig.add_subplot(111, projection='3d')
    n_rows = returnMat.shape[0]
    for row_idx in range(n_rows):
        color, marker = styles.get(classLabelVector[row_idx], ('g', '^'))
        axes.scatter(returnMat[row_idx, 0],
                     returnMat[row_idx, 1],
                     returnMat[row_idx, 2],
                     c=color, marker=marker)
def autoNorm(returnMat):
    """Min-max normalize each column of the dataset into [0, 1].

    returnMat: (m, n) feature matrix.

    Returns (normDataSet, ranges, minVals):
        normDataSet -- (m, n) array, (x - min) / (max - min) per column
        ranges      -- per-column (max - min)
        minVals     -- per-column minimum
    """
    minVals = returnMat.min(0)
    maxVals = returnMat.max(0)
    ranges = maxVals - minVals
    m = returnMat.shape[0]
    # tile() repeats the per-column stats across all m rows.
    # (Removed a dead np.zeros allocation that the original overwrote
    # immediately on the next line.)
    normDataSet = returnMat - np.tile(minVals, (m, 1))
    normDataSet = normDataSet / np.tile(ranges, (m, 1))
    return normDataSet, ranges, minVals
def datingClassTest(dataingDataMat, dataingLabels):
    """Estimate the classifier's error rate with a hold-out split.

    dataingDataMat: (m, n) feature matrix
    dataingLabels:  list of m label strings aligned with the rows

    The first 10% of rows are used as test samples; the remaining 90%
    serve as the training set.  Prints each prediction and the final
    error rate as a percentage.
    """
    hoRate = 0.10  # fraction of samples held out for testing
    errorCount = 0.0
    normData, ranges, minVals = autoNorm(dataingDataMat)
    m = dataingDataMat.shape[0]
    numTestVecs = int(hoRate * m)
    for i in range(numTestVecs):
        # Classify test row i against the back (m - numTestVecs) rows.
        classifierResult = classify0(normData[i, :], normData[numTestVecs:m, :],
                                     dataingLabels[numTestVecs:m], 3)
        print('the classifier came back with: %s,the real answer is : %s'
              % (classifierResult, dataingLabels[i]))
        # BUG FIX: compare against the i-th label, not the whole list --
        # the original `!= dataingLabels` was always True, so every
        # prediction counted as an error.
        if classifierResult != dataingLabels[i]:
            errorCount += 1.0
    # BUG FIX: scale by 100 so the %% suffix prints a true percentage.
    print("the total error rate is %f%%" % (errorCount / float(numTestVecs) * 100))
def classifyPerson():
    """Interactively classify one person from console input.

    Prompts for the three features, loads and normalizes the training
    data, then prints the predicted label from a 3-nearest-neighbour vote.
    """
    percentTats = float(input("percent of time spend playing video games:"))
    ffMiles = float(input("frequent flier miles earned per years:"))
    iceCream = float(input("liters of icr creams per years:"))
    filePath = 'E:\\机器学习实战\\datingTestSet.txt'
    returnMat, classLabelVector = file2matrix(filePath)
    normData, ranges, minVals = autoNorm(returnMat)
    # Feature order must match the training data columns.
    inArr = np.array([ffMiles, percentTats, iceCream])
    # BUG FIX: the original passed the undefined name `nArr`, which
    # raised NameError at runtime; normalize inArr with the training
    # min/range before classifying.
    classifierResult = classify0((inArr - minVals) / ranges, normData,
                                 classLabelVector, 3)
    print('the classifier came back with: %s'
          % (classifierResult))
def classify0(inX, dataSet, labels, k):
    """Classify inX by a k-nearest-neighbour majority vote.

    inX:     test sample (1-D feature array)
    dataSet: training samples, one per row
    labels:  training labels aligned with dataSet rows
    k:       number of nearest neighbours to poll

    Returns the label with the most votes among the k closest samples.
    """
    # Euclidean distance from inX to every training row
    # (broadcasting replaces the original np.tile copy).
    deltas = dataSet - inX
    dists = (deltas ** 2).sum(axis=1) ** 0.5
    # Row indices ordered nearest-first.
    nearest = dists.argsort()
    # Tally the labels of the k closest samples.
    votes = {}
    for row_idx in nearest[:k]:
        label = labels[row_idx]
        votes[label] = votes.get(label, 0) + 1
    # Highest vote count wins; on a tie the first-inserted label is
    # returned, matching the original stable reverse sort.
    return max(votes.items(), key=operator.itemgetter(1))[0]
if __name__ == '__main__':
    # BUG FIX: returnMat/classLabelVector were used below without ever
    # being defined -- load the dataset first.
    returnMat, classLabelVector = file2matrix('E:\\机器学习实战\\datingTestSet.txt')
    # Visualize the raw features as a 3-D scatter plot.
    createMap(returnMat, classLabelVector)
    # Features differ widely in scale; datingClassTest normalizes
    # internally via autoNorm before classifying.
    datingClassTest(returnMat, classLabelVector)
    # Interactively classify a new person from console input.
    classifyPerson()
# Results shown when run:
# - scatter plot of the three features
# - classifier test error rate
# - prediction for user-entered data