from numpy import *
import operator
# 创建数据
def createDataBase():
    """Build the four-point toy data set used to try out the kNN classifier.

    Returns:
        A ``(group, labels)`` pair. ``group`` is a 4x2 NumPy array with one
        sample per row; ``labels`` is a list holding the class name of each
        row, in the same order.
    """
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    class_names = ['A'] * 2 + ['B'] * 2
    return array(samples), class_names
# kNN算法
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Distances are Euclidean. When several labels receive the same number of
    votes, the label that was encountered first while walking the neighbours
    from nearest to farthest wins.

    Args:
        inX: Feature vector to classify (sequence or 1-D array).
        dataSet: 2-D array-like of training samples, one sample per row.
        labels: Sequence of class labels, ``labels[i]`` belonging to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of samples are capped at the number of samples.

    Returns:
        The winning class label.

    Raises:
        IndexError: If ``k`` is smaller than 1 or ``dataSet`` has no rows.
            (IndexError is kept because that is what such calls raised
            before, only without an explanatory message.)
    """
    # Local import so the block does not depend on a new file-level import.
    from collections import Counter

    samples = asarray(dataSet)
    sample_count = samples.shape[0]
    if k < 1 or sample_count == 0:
        raise IndexError(
            "classify0 needs k >= 1 and a non-empty dataSet "
            "(got k=%r, %d samples)" % (k, sample_count)
        )
    if k > sample_count:
        # There cannot be more voters than samples.
        k = sample_count

    # Broadcasting subtracts inX from every row; no tiled copy is needed.
    diffs = samples - asarray(inX)
    distances = (diffs ** 2).sum(axis=1) ** 0.5

    # argsort() returns row indices ordered from nearest to farthest.
    nearest = distances.argsort()[:k]
    votes = Counter(labels[i] for i in nearest)
    return votes.most_common(1)[0][0]
# 准备数据:处理读入的数据,只取前三个特征
def file2matrix(filename):
    """Load a tab-separated sample file into a feature matrix and label list.

    Each non-blank line must hold at least three numeric fields followed by a
    class label in the last field. Only the first three fields are used as
    features.

    The label may be an integer, or one of the textual labels
    ``didntLike`` / ``smallDoses`` / ``largeDoses``, which are mapped to
    1 / 2 / 3 respectively.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair: an ``(n, 3)`` float array of
        features and a list of ``n`` integer labels, where ``n`` is the number
        of non-blank lines.

    Raises:
        ValueError: If a feature is not numeric or a label is neither an
            integer nor one of the known textual labels.
    """
    label_codes = {'didntLike': 1, 'smallDoses': 2, 'largeDoses': 3}

    # The context manager closes the file even if parsing fails later on.
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]
    # A blank line (typically a trailing newline) carries no sample.
    rows = [row for row in stripped if row]

    returnMat = zeros((len(rows), 3))  # the shape must be passed as a tuple
    classLabelVector = []
    for index, row in enumerate(rows):
        fields = row.split('\t')
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        raw_label = fields[-1]
        if raw_label in label_codes:
            classLabelVector.append(label_codes[raw_label])
        else:
            classLabelVector.append(int(raw_label))
    return returnMat, classLabelVector
# 归一化特征值
def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range 0..1.

    Each value is transformed as ``(value - column_min) / column_range``.
    A column whose values are all equal (range 0) is mapped to 0 instead of
    producing NaN from a 0/0 division.

    Args:
        dataSet: 2-D NumPy array with one sample per row.

    Returns:
        A ``(normDataSet, ranges, minVals)`` tuple: the normalised array, the
        per-column ranges (max - min, left at 0 for constant columns) and the
        per-column minima. The latter two let callers normalise new samples
        the same way.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they come out as 0 rather than NaN.
    divisors = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
# 分类器针对约会代码的测试代码
def datingClassTest(hoRatio=0.10, filename="datingTestSet.txt", k=3):
    """Run a hold-out evaluation of the kNN classifier and print the error rate.

    The data file is loaded and normalised; the first ``hoRatio`` share of the
    rows is used as test vectors and the remaining rows as training samples.
    Every prediction is printed next to the true label, followed by the
    overall error rate.

    Args:
        hoRatio: Fraction of the rows held out for testing.
        filename: Path of the data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)  # number of held-out test vectors
    if numTestVecs == 0:
        # Without test vectors the error rate would be a division by zero.
        print("no test vectors: %d samples with hold-out ratio %f" % (m, hoRatio))
        return

    # The training part is the same for every test vector, so slice it once
    # instead of copying the label list on every iteration.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
# 构建完整可用系统
def classifyPerson():
    """Interactively classify one person described on standard input.

    Prompts for three numeric features, normalises them with the minima and
    ranges of the data in ``datingTestSet.txt``, classifies the result with
    the 3 nearest neighbours and prints the verdict.
    """
    verdicts = ['完全不喜欢', '有点喜欢', '很喜欢']
    # Python 3 uses input(); raw_input() no longer exists.
    # The prompts are asked in the same order as the feature vector is built.
    prompts = (
        "frequent flier miles earned per year? ",
        "percentage of time spent playing video games? ",
        "liters of ice cream consumed per year? ",
    )
    answers = [float(input(prompt)) for prompt in prompts]

    features, knownLabels = file2matrix("datingTestSet.txt")
    scaled, spans, floors = autoNorm(features)
    # The query vector has to be normalised exactly like the training data.
    query = (array(answers) - floors) / spans
    predicted = classify0(query, scaled, knownLabels, 3)
    # Labels are 1-based while the verdict list is 0-based.
    print("你大概对这个男人" + verdicts[predicted - 1])
if __name__ == '__main__':
    # Parse the data set from the text file and show a sample of it.
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    print(datingDataMat)
    print(datingLabels[0:20])

    # Draw a scatter plot with Matplotlib.
    import matplotlib
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Alternative plots using columns 2 and 3, without and with per-class styling:
    # ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2])
    # ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2], 15.0 * array(datingLabels), 15.0 * array(datingLabels))
    # Plot columns 1 and 2; marker size and colour both scale with the class label.
    ax.scatter(
        datingDataMat[:, 0],
        datingDataMat[:, 1],
        15.0 * array(datingLabels),
        15.0 * array(datingLabels),
    )
    # NOTE: the plot window has to be closed before the code below runs.
    plt.show()

    # Normalise the feature values.
    normMat, ranges, minVals = autoNorm(datingDataMat)
    print("-------------------归一化数值-----------------------")
    print(normMat)
    print(ranges)
    print(minVals)

    # Evaluate the classifier on the held-out part of the data.
    print("-------------------测试算法-----------------------")
    datingClassTest()

    # Run the interactive end-to-end classifier.
    print("-------------------构建完整可用系统-----------------------")
    classifyPerson()
# arr = array([[1, 2, 3, 4], [5, 6, 7, 8]])
# print(arr.shape)
# matrix = mat(arr)
# print(matrix.shape)
# print(array([[1, 2],[3, 4]]))
# print(array([(1, 2), (3, 4)]))
# a = array([1, 2])
# print(a.dtype)
# a = [1, 2, 3, 4]
# print(tile(a, 2))
# group, labels = createDataBase()
# print(classify0([0, 0], group, labels, 3)) # 输出B
# Machine Learning in Action, chapter 2.2: using kNN to improve matches on a dating site
# (Web page footer: "latest recommended article published on 2024-02-23 10:15:35")