# Python 3 study notes [machine learning] [k-nearest neighbors algorithm] [dating-site matching algorithm]

# -*- coding: utf-8 -*-
"""
Created on Wed May 22 10:43:50 2019

@author: 激光雷达
"""

from numpy import *
import operator

'''Part 1 '''

def creatDataSet():
    """Return a tiny hard-coded training set for trying out the classifier.

    Returns
    -------
    group : ndarray of shape (4, 2)
        Four two-feature samples.
    labels : list of str
        Class of each sample, in row order ('A', 'A', 'B', 'B').
    """
    labels = list('AABB')
    group = array([
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ])
    return group, labels

def classfiy0(iinX,dataSet,labels,k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Parameters
    ----------
    iinX : sequence or 1-D array of numbers
        Feature vector to classify; it must have as many entries as a row
        of dataSet.
    dataSet : 2-D array-like
        Training samples, one per row (nested lists are accepted).
    labels : sequence
        labels[i] is the class of row i of dataSet; values must be hashable.
    k : int
        Number of nearest neighbours that vote.  A value larger than the
        number of training rows is clamped to that number.

    Returns
    -------
    The label that collected the most votes.  On a tie the label that was
    voted for first (i.e. by the nearer neighbour) wins, because the vote
    dictionary keeps insertion order and sorted() is stable.

    Raises
    ------
    IndexError
        If no vote is cast (k < 1 or dataSet has no rows).
    """
    # Step 1: Euclidean distance from the query to every training row.
    dataSet = asarray(dataSet)
    # Broadcasting subtracts the query from each row; no tiled copy needed.
    diffMat = asarray(iinX) - dataSet
    distances = ((diffMat**2).sum(axis=1))**0.5
    # argsort returns row indices ordered from nearest to farthest.
    sortDistanceIndex = distances.argsort()

    # Step 2: let the k nearest rows vote for their label.
    # Never ask for more neighbours than there are training rows.
    k = min(k, dataSet.shape[0])
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortDistanceIndex[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1

    # Step 3: order the (label, votes) pairs by votes, highest first, and
    # return the winning label.
    sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]
   
     
# Demo: build the toy data set and print the class predicted for the point
# [0, 0] using its 3 nearest neighbours.
group,labels = creatDataSet() 

print(classfiy0([0,0],group,labels,3))

''' Part 2 '''

def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Every non-blank line is stripped and split on tabs.  The first three
    fields are stored as floats in one matrix row and the last field is
    converted to int and used as that row's class label.  Blank lines
    (for example a trailing empty line) are skipped.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    returnMat : ndarray of shape (number of non-blank lines, 3)
        The first three fields of every line, as floats.
    classLabelVector : list of int
        The last field of every line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a field cannot be converted to a number.
    """
    # The context manager closes the file even when parsing fails later.
    with open(filename) as fr:
        strippedLines = [line.strip() for line in fr]
    # Drop blank lines so they neither crash parsing nor leave zero rows.
    arrayOLines = [line for line in strippedLines if line]

    returnMat = zeros((len(arrayOLines),3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromline = line.split('\t')
        # Fields 0..2 are the features of this sample.
        returnMat[index,:] = [float(value) for value in listFromline[0:3]]
        # The last field is the class label.
        classLabelVector.append(int(listFromline[-1]))
    return returnMat,classLabelVector

# Load the dating data from 'datingTestSet2.txt' (path is relative to the
# current working directory).  datingDataMat and datingDataLabels are
# module-level names reused by the plotting and normalisation code below.
datingDataMat,datingDataLabels = file2matrix('datingTestSet2.txt')

# Show the parsed feature matrix and the first 20 class labels.
print(datingDataMat)
print(datingDataLabels[0:20])


import matplotlib
import matplotlib.pyplot as plt

# Figure 1: plain scatter plot of feature column 1 against feature column 2.
fig = plt.figure()
ax = fig.add_subplot(111)
# The three digits are rows, columns and subplot index: "111" is the only
# cell of a 1x1 grid, "234" would be the fourth cell of a 2x3 grid.
ax.scatter(datingDataMat[:,1],datingDataMat[:,2])
# Every point is drawn with the default size and colour.
ax.set_xlabel("Percentage of time spent playing video games")
ax.set_ylabel("Ice cream kilograms consumed per week")
plt.show()

# Figure 2: the same two columns, styled by class label.
# plt2 is just another name for the matplotlib.pyplot module.
plt2 = matplotlib.pyplot
fig2 = plt2.figure()
ax2 = fig2.add_subplot(111)
ax2.scatter(datingDataMat[:,1],datingDataMat[:,2],15.0*array(datingDataLabels)
      ,15.0*array(datingDataLabels))
# The third and fourth positional arguments of scatter are the marker sizes
# and the marker colours; both are 15 times the class label stored in
# datingDataLabels, so each class is drawn with its own size and colour.
ax2.set_xlabel("Percentage of time spent playing video games")
ax2.set_ylabel("Ice cream kilograms consumed per week")
plt2.show()

# Figure 3: feature column 0 against feature column 1, styled by class label.
# plt3 is just another name for the matplotlib.pyplot module.
plt3 = matplotlib.pyplot
fig3 = plt3.figure()
ax3 = fig3.add_subplot(111)
# Marker size (s) and marker colour (c) are both 15 times the class label
# stored in datingDataLabels, so each class gets its own size and colour.
ax3.scatter(datingDataMat[:,0],datingDataMat[:,1],
            s=15.0*array(datingDataLabels),
            c=15.0*array(datingDataLabels))
ax3.set_xlabel("Frequent Flight Miles Obtained Annually")
ax3.set_ylabel("Percentage of time spent playing video games")
# No legend is requested here: the single scatter call creates one artist
# without a label, so Axes.legend() would find nothing to list and would
# only emit a "no artists with labels" warning.  A per-class legend needs
# one labelled scatter call per class.
plt3.show()

# Figure 4: column 0 against column 1 again, but drawn one class at a time
# so that every class gets its own colour, marker size and legend entry.
plt4 = matplotlib.pyplot
plt4.rcParams['font.sans-serif']=['Simhei']
plt4.rcParams['axes.unicode_minus']=False

datingDataMat4, datingLabels4 = file2matrix('datingTestSet2.txt')

plt4.figure()
axes4 = plt4.subplot(111)

# Split the samples by class label (1, 2 or 3); rows carrying any other
# label are left out of the plot.
type1_x = [row[0] for row, label in zip(datingDataMat4, datingLabels4)
           if label == 1]
type1_y = [row[1] for row, label in zip(datingDataMat4, datingLabels4)
           if label == 1]
type2_x = [row[0] for row, label in zip(datingDataMat4, datingLabels4)
           if label == 2]
type2_y = [row[1] for row, label in zip(datingDataMat4, datingLabels4)
           if label == 2]
type3_x = [row[0] for row, label in zip(datingDataMat4, datingLabels4)
           if label == 3]
type3_y = [row[1] for row, label in zip(datingDataMat4, datingLabels4)
           if label == 3]

# One scatter call per class; the returned handles feed the legend.
type1 = axes4.scatter(type1_x, type1_y, s=20, c='r')
type2 = axes4.scatter(type2_x, type2_y, s=40, c='b')
type3 = axes4.scatter(type3_x, type3_y, s=60, c='k')

plt4.legend((type1, type2, type3), ('Dislike', 'Charming general', 'Glamour'))
plt4.show()

'''   Part  3    '''

def autoNorm(dataSet):
    """Min-max scale every column of dataSet into the interval [0, 1].

    Each value becomes (value - column minimum) / (column maximum - column
    minimum), so features measured on very different scales contribute
    comparably to a distance computation.

    Parameters
    ----------
    dataSet : 2-D array-like
        Samples in rows, features in columns (nested lists are accepted).

    Returns
    -------
    normDataSet : ndarray
        The scaled data, same shape as dataSet.
    ranges : ndarray
        Per-column divisor that was used: max - min, except that a constant
        column (max == min) reports 1 so that the same transform can be
        applied to new samples without dividing by zero.
    minValues : ndarray
        Per-column minimum.
    """
    dataSet = asarray(dataSet)
    minValues = dataSet.min(0)
    maxValues = dataSet.max(0)
    ranges = maxValues - minValues
    # A constant column would give 0/0 = NaN; dividing by 1 maps it to 0.
    ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column minimum and range to every row.
    normDataSet = (dataSet - minValues)/ranges
    return normDataSet,ranges,minValues

# Normalise the data loaded above and print the three values autoNorm
# returns (scaled matrix, per-column ranges, per-column minimums), each
# separated by a blank line.
normDataSet,ranges,minValues = autoNorm(datingDataMat)

print()
print(normDataSet)
print()
print(ranges)
print()
print(minValues)
print()

'''   Part  4    '''

def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Estimate the classifier's error rate with a simple hold-out test.

    The data file is loaded and normalised, the first hoRatio fraction of
    the rows is used as test samples and the remaining rows as training
    data.  Every test sample is classified, the prediction is printed next
    to the true label, and the overall error rate is printed at the end.

    Parameters
    ----------
    hoRatio : float, optional
        Fraction of the rows held out for testing (default 0.10).
    filename : str, optional
        Data file to evaluate on (default 'datingTestSet2.txt').
    k : int, optional
        Number of neighbours passed to classfiy0 (default 3).

    Returns
    -------
    None.  All results are printed.
    """
    datingDataMat,datingDataLabels = file2matrix(filename)
    normMat = autoNorm(datingDataMat)[0]
    tempVector = normMat.shape[0]
    numTestVecs = int(tempVector*hoRatio)
    if numTestVecs < 1:
        # Nothing to evaluate; avoids dividing by zero in the rate below.
        print("No test vectors available, the error rate is undefined")
        return

    # The training slice is the same for every test sample, so build it once.
    trainMat = normMat[numTestVecs:tempVector,:]
    trainLabels = datingDataLabels[numTestVecs:tempVector]
    errorCount = 0.
    for i in range(numTestVecs):
        classfierResults = classfiy0(normMat[i,:],trainMat,trainLabels,k)
        print("The classfier came back with: %d, the real is : %d"
              %(classfierResults,datingDataLabels[i]))
        if classfierResults != datingDataLabels[i]:
            errorCount += 1
    print( "The total error rate is : %f"%(errorCount/float(numTestVecs)) )
   
# Run the hold-out evaluation; it prints every prediction and the error rate.
datingClassTest()

'''   Part  5    '''

def classfiyPerson():
    """Ask for one person's three features on stdin and print the prediction.

    The three answers are read as floats, the training data is loaded from
    'datingTestSet2.txt' and normalised, the new sample is scaled with the
    training minimums and ranges, and the 3-nearest-neighbour result is
    printed as text.

    Raises
    ------
    ValueError
        If an answer cannot be converted to float.
    """
    verdicts = ('Not at all', 'Small doses', 'Large doses')

    # Prompts are asked in this order; the sample vector below is ordered
    # miles, game time, ice cream.
    gameShare = float(input("Percecntage of time spent on video games ?"))
    flierMiles = float(input("Frequent flier miles earned per year ?"))
    iceCreamLiters = float(input("Liters icecream consumed per year ?"))

    trainMat, trainLabels = file2matrix('datingTestSet2.txt')
    scaledMat, spans, lows = autoNorm(trainMat)

    sample = array([flierMiles, gameShare, iceCreamLiters])
    # Scale the new sample exactly like the training rows were scaled.
    scaledSample = (sample - lows) / spans
    predicted = classfiy0(scaledSample, scaledMat, trainLabels, 3)

    # The predicted label minus one indexes the description tuple
    # (assumes the labels in the file are 1, 2 and 3 - TODO confirm).
    print("You will probably like this person: ", verdicts[predicted - 1])
    
# Interactive demo: prompts on stdin for one person's features and prints
# the predicted class.
classfiyPerson()


# End of script