# -*- coding: utf-8 -*-
"""
Created on Wed May 22 10:43:50 2019
@author: 激光雷达
"""from numpy import*import operator
'''Part 1 '''


def creatDataSet():
    """Build the four-point toy data set used to try out the classifier.

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4x2 array of points and
        ``labels`` is the list of class names, one per row of ``group``.
    """
    # Keep every point next to its class name so the pairing is obvious.
    samples = (
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    )
    points = array([point for point, _ in samples])
    names = [name for _, name in samples]
    return points, names
def classfiy0(iinX, dataSet, labels, k):
    """Classify ``iinX`` by majority vote among its ``k`` nearest rows.

    Args:
        iinX: The point to classify.
        dataSet: Array of known points, one point per row.
        labels: Class label for each row of ``dataSet``.
        k: Number of nearest rows that get a vote.

    Returns:
        The label that collected the most votes among the ``k`` rows of
        ``dataSet`` closest to ``iinX``. When vote counts tie, the label
        that was encountered first (i.e. the nearer one) wins.
    """
    # Step 1: Euclidean distance from iinX to every row of dataSet.
    rowCount = dataSet.shape[0]
    offsets = tile(iinX, (rowCount, 1)) - dataSet  # iinX repeated per row
    distances = (offsets ** 2).sum(axis=1) ** 0.5
    nearestFirst = distances.argsort()  # row indices, closest row first

    # Step 2: tally the labels of the k closest rows.
    votes = {}
    for rank in range(k):
        neighbourLabel = labels[nearestFirst[rank]]
        if neighbourLabel in votes:
            votes[neighbourLabel] += 1
        else:
            votes[neighbourLabel] = 1

    # Step 3: order labels by vote count, highest first, and pick the top.
    # The sort is stable, so equal counts keep their insertion order.
    ranking = list(votes.items())
    ranking.sort(key=operator.itemgetter(1), reverse=True)
    return ranking[0][0]
# Quick demonstration of the classifier on the toy data set.
group, labels = creatDataSet()
print(classfiy0([0, 0], group, labels, 3))

''' Part 2 '''


def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    For every non-blank line, the first three tab-separated fields are
    stored as one row of features and the last field is parsed as the
    integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair: an ``N x 3`` float array
        holding the first three fields of each line, and a list of the
        ``N`` integer labels.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a feature or label field is not numeric, or a line
            has fewer than three fields.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        # Skip blank lines (e.g. a trailing newline at EOF); they would
        # otherwise break the numeric conversion below.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        # Convert explicitly rather than relying on numpy's implicit
        # string-to-float cast during assignment.
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
# Load the dating data set and print a sample of what was parsed.
datingDataMat, datingDataLabels = file2matrix('datingTestSet2.txt')
print(datingDataMat)
print(datingDataLabels[0:20])

import matplotlib
import matplotlib.pyplot as plt

# Figure 1: columns 1 and 2 of the data, every point drawn the same way.
fig = plt.figure()
ax = fig.add_subplot(111)  # "111" = 1x1 grid, first subplot
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2])
ax.set_xlabel("Percentage of time spent playing video games")
ax.set_ylabel("Ice cream kilograms consumed per week")
plt.show()

# Figure 2: same columns, with marker size and colour driven by the label.
plt2 = matplotlib.pyplot
fig2 = plt2.figure()
ax2 = fig2.add_subplot(111)
ax2.scatter(
    datingDataMat[:, 1],
    datingDataMat[:, 2],
    15.0 * array(datingDataLabels),  # marker size
    15.0 * array(datingDataLabels),  # marker colour
)
ax2.set_xlabel("Percentage of time spent playing video games")
ax2.set_ylabel("Ice cream kilograms consumed per week")
plt2.show()

# Figure 3: columns 0 and 1, again sized and coloured by the label.
plt3 = matplotlib.pyplot
fig3 = plt3.figure()
ax3 = fig3.add_subplot(111)
ax3.scatter(
    datingDataMat[:, 0],
    datingDataMat[:, 1],
    15.0 * array(datingDataLabels),  # marker size
    15.0 * array(datingDataLabels),  # marker colour
)
ax3.set_xlabel("Frequent Flight Miles Obtained Annually")
ax3.set_ylabel("Percentage of time spent playing video games")
ax3.legend()
plt3.show()

# Figure 4: one scatter series per label so that a legend can be attached.
plt4 = matplotlib.pyplot
plt4.rcParams['font.sans-serif'] = ['Simhei']
plt4.rcParams['axes.unicode_minus'] = False
datingDataMat4, datingLabels4 = file2matrix('datingTestSet2.txt')
plt4.figure()
axes4 = plt4.subplot(111)

type1_x, type1_y = [], []
type2_x, type2_y = [], []
type3_x, type3_y = [], []

# Route each row's first two columns into the coordinate lists of its label.
pointsByLabel = {
    1: (type1_x, type1_y),
    2: (type2_x, type2_y),
    3: (type3_x, type3_y),
}
for i, rowLabel in enumerate(datingLabels4):
    target = pointsByLabel.get(rowLabel)
    if target is not None:
        target[0].append(datingDataMat4[i][0])
        target[1].append(datingDataMat4[i][1])

# Each label gets its own marker size and colour.
type1 = axes4.scatter(type1_x, type1_y, s=20, c='r')
type2 = axes4.scatter(type2_x, type2_y, s=40, c='b')
type3 = axes4.scatter(type3_x, type3_y, s=60, c='k')
plt4.legend((type1, type2, type3), ('Dislike', 'Charming general', 'Glamour'))
# Show the current figure(s).
plt4.show()

''' Part 3 '''


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max -
    column_min)``. A column whose values are all equal has a range of 0;
    it is mapped to 0 instead of producing NaN from a 0/0 division.

    Args:
        dataSet: 2-D numeric array, one sample per row.

    Returns:
        A ``(normDataSet, ranges, minValues)`` tuple: the rescaled array,
        the per-column ``max - min`` (reported unmodified, so it is 0 for
        a constant column), and the per-column minimum.
    """
    minValues = dataSet.min(0)
    maxValues = dataSet.max(0)
    ranges = maxValues - minValues
    # Divide by 1 where the range is 0 so constant columns become 0, not NaN.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row, so no
    # explicit tiling or pre-allocated result array is needed.
    normDataSet = (dataSet - minValues) / safeRanges
    return normDataSet, ranges, minValues
# Normalise the loaded data and print each result preceded by a blank line.
normDataSet, ranges, minValues = autoNorm(datingDataMat)
for normalisedPart in (normDataSet, ranges, minValues):
    print()
    print(normalisedPart)
print()

''' Part 4 '''


def datingClassTest():
    """Measure the classifier's error rate on a hold-out slice of the data.

    Loads ``datingTestSet2.txt``, normalises it, and uses the first 10% of
    the rows as test samples. Each test sample is classified against the
    remaining rows with ``k = 3``. Every prediction is printed next to the
    real label, followed by the overall error rate.
    """
    holdOutFraction = 0.10
    features, answers = file2matrix('datingTestSet2.txt')
    scaled, _, _ = autoNorm(features)
    totalRows = scaled.shape[0]
    testCount = int(totalRows * holdOutFraction)

    # Rows after the hold-out slice act as the reference set for every test.
    referenceRows = scaled[testCount:totalRows, :]
    referenceLabels = answers[testCount:totalRows]

    misses = 0.
    for row in range(testCount):
        predicted = classfiy0(scaled[row, :], referenceRows,
                              referenceLabels, 3)
        actual = answers[row]
        print("The classfier came back with: %d, the real is : %d"
              % (predicted, actual))
        if predicted != actual:
            misses += 1
    print("The total error rate is : %f" % (misses / float(testCount)))
# Run the hold-out evaluation.
datingClassTest()

''' Part 5 '''


def classfiyPerson():
    """Ask for one person's three feature values and print the prediction.

    Reads three numbers from standard input, normalises them with the
    minimum and range of ``datingTestSet2.txt``, classifies the result
    against that data set with ``k = 3`` and prints the matching
    description.
    """
    resultList = ['Not at all', 'Small doses', 'Large doses']
    questions = (
        "Percecntage of time spent on video games ?",
        "Frequent flier miles earned per year ?",
        "Liters icecream consumed per year ?",
    )
    # Answers are read strictly in the order the questions are listed.
    gamePercent, flierMiles, iceCream = [
        float(input(question)) for question in questions
    ]

    features, knownLabels = file2matrix('datingTestSet2.txt')
    scaled, ranges, minValues = autoNorm(features)

    # The feature vector is ordered miles, game percentage, ice cream.
    person = array([flierMiles, gamePercent, iceCream])
    scaledPerson = (person - minValues) / ranges
    verdict = classfiy0(scaledPerson, scaled, knownLabels, 3)
    # Labels start at 1, list indices at 0.
    print("You will probably like this person: ", resultList[verdict - 1])
# Run the interactive classifier: prompts for three values, prints a prediction.
classfiyPerson()