import numpy as np
import operator
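# K-nearest-neighbours (kNN) classifier that tells romance films from action films
# Films are distinguished by their number of fight scenes and kissing scenes
# Simulated data: each row is [fight scenes, kissing scenes]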
def init_data():
    """Build the toy film data set used by the kNN demo.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is a NumPy array with
        one row per film and two columns (fight scenes, kissing scenes);
        ``labels`` is a list holding the genre string of each row.
    """
    features = np.array([
        [1, 100],
        [2, 96],
        [3, 93],
        [6, 90],
        [80, 3],
        [93, 2],
        [86, 5],
    ])
    # The first four rows are romance films, the last three are action films.
    labels = ['爱情片'] * 4 + ['动作片'] * 3
    return features, labels
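# data: the single sample (feature vector) to classify
# testData: the labelled reference samples, one row per sample
# output: the label of each row in testData
# k: how many of the nearest reference samples take part in the vote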
def kNN(data, testData, output, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Despite the parameter names, ``data`` is the single sample to classify
    and ``testData`` is the labelled reference set it is compared against.

    Args:
        data: Feature vector of the sample to classify (sequence or array
            with one value per feature).
        testData: 2-D array-like of reference samples, one row per sample.
        output: Sequence of labels; ``output[i]`` is the label of row ``i``
            of ``testData``.
        k: Number of nearest neighbours that vote. A value larger than the
            number of reference samples is clamped to that number.

    Returns:
        The most frequent label among the k nearest reference samples. On a
        tie, the label met first while walking the neighbours from nearest
        to farthest wins.

    Raises:
        ValueError: If ``k`` is less than 1 or ``testData`` has no rows.
    """
    # Work in floating point so squaring large integer features cannot
    # overflow, and so plain nested lists are accepted as well as arrays.
    samples = np.asarray(testData, dtype=float)
    query = np.asarray(data, dtype=float).reshape(1, -1)

    n_samples = samples.shape[0]
    if n_samples == 0:
        raise ValueError("testData must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")

    # Broadcasting subtracts the query from every row; no explicit tiling
    # of the query is needed to compute the Euclidean distances.
    diff = samples - query
    distance = np.sqrt((diff ** 2).sum(axis=1))

    # A stable sort ranks equidistant neighbours by their row order, which
    # keeps the result deterministic.
    nearest = distance.argsort(kind="stable")[:min(k, n_samples)]

    # Count how often each label occurs among the selected neighbours.
    votes = {}
    for idx in nearest:
        label = output[idx]
        votes[label] = votes.get(label, 0) + 1

    # max() returns the first maximal key in insertion order, so ties go to
    # the label that appeared earliest (closest) in the ranking.
    return max(votes, key=votes.get)
if __name__ == "__main__":
    # Build the sample films and their genre labels when run as a script.
    data_X, data_Y = init_data()
import operator
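# K-nearest-neighbours (kNN) classifier that tells romance films from action films
# Films are distinguished by their number of fight scenes and kissing scenes
# Simulated data: each row is [fight scenes, kissing scenes]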
def init_data():
    """Build the toy film data set used by the kNN demo.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is a NumPy array with
        one row per film and two columns (fight scenes, kissing scenes);
        ``labels`` is a list holding the genre string of each row.
    """
    features = np.array([
        [1, 100],
        [2, 96],
        [3, 93],
        [6, 90],
        [80, 3],
        [93, 2],
        [86, 5],
    ])
    # The first four rows are romance films, the last three are action films.
    labels = ['爱情片'] * 4 + ['动作片'] * 3
    return features, labels
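# data: the single sample (feature vector) to classify
# testData: the labelled reference samples, one row per sample
# output: the label of each row in testData
# k: how many of the nearest reference samples take part in the vote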
def kNN(data, testData, output, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Despite the parameter names, ``data`` is the single sample to classify
    and ``testData`` is the labelled reference set it is compared against.

    Args:
        data: Feature vector of the sample to classify (sequence or array
            with one value per feature).
        testData: 2-D array-like of reference samples, one row per sample.
        output: Sequence of labels; ``output[i]`` is the label of row ``i``
            of ``testData``.
        k: Number of nearest neighbours that vote. A value larger than the
            number of reference samples is clamped to that number.

    Returns:
        The most frequent label among the k nearest reference samples. On a
        tie, the label met first while walking the neighbours from nearest
        to farthest wins.

    Raises:
        ValueError: If ``k`` is less than 1 or ``testData`` has no rows.
    """
    # Work in floating point so squaring large integer features cannot
    # overflow, and so plain nested lists are accepted as well as arrays.
    samples = np.asarray(testData, dtype=float)
    query = np.asarray(data, dtype=float).reshape(1, -1)

    n_samples = samples.shape[0]
    if n_samples == 0:
        raise ValueError("testData must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")

    # Broadcasting subtracts the query from every row; no explicit tiling
    # of the query is needed to compute the Euclidean distances.
    diff = samples - query
    distance = np.sqrt((diff ** 2).sum(axis=1))

    # A stable sort ranks equidistant neighbours by their row order, which
    # keeps the result deterministic.
    nearest = distance.argsort(kind="stable")[:min(k, n_samples)]

    # Count how often each label occurs among the selected neighbours.
    votes = {}
    for idx in nearest:
        label = output[idx]
        votes[label] = votes.get(label, 0) + 1

    # max() returns the first maximal key in insertion order, so ties go to
    # the label that appeared earliest (closest) in the ranking.
    return max(votes, key=votes.get)
if __name__ == "__main__":
    # Build the sample films, then classify a film with 3 fight scenes and
    # 93 kissing scenes using its three nearest neighbours.
    data_X, data_Y = init_data()
    print(kNN([3, 93], data_X, data_Y, k=3))