1、特点
(1)是监督学习算法
(2)不需要训练模型,但是如果数据庞大,时间复杂度将是 O(n²)
(3)对于属性值较大的属性需要进行归一化,否则该属性将在计算距离的时候占据主导权。
2、代码
import numpy as np
import types
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
def knn(train_set, test_set, k):
    """
    Classify each object in test_set by majority vote of its k nearest
    neighbors in train_set (Euclidean distance on min-max-normalized features).

    :param train_set: [[1, 2, 3, A], [4, 5, 6, B], ...] — last column is the label;
                      list or ndarray
    :param test_set: [[10, 11, 12], [13, 14, 15], ...] — features only; list or ndarray
    :param k: number of nearest neighbors consulted for each test object
    :return: list of predicted labels, one per row of test_set
    """
    # Accept plain Python lists as well as ndarrays.
    if isinstance(train_set, list):
        train_set = np.array(train_set)
    if isinstance(test_set, list):
        test_set = np.array(test_set)
    # Labels may be strings, which forces the whole array to a string dtype;
    # split them off before converting the feature columns to float.
    labels = train_set[:, -1]
    # np.float was removed in NumPy 1.24 — use the builtin float.
    train_features = train_set[:, :-1].astype(float)
    test_features = test_set.astype(float)
    # Min-max normalize so no single large-valued attribute dominates the distance.
    train_features = normalization_process(train_features)
    test_features = normalization_process(test_features)
    predict_label_list = []
    for test_obj in test_features:
        # Vectorized Euclidean distance from this test object to every
        # training object (one C-level pass instead of a Python loop).
        distances = np.sqrt(np.sum((train_features - test_obj) ** 2, axis=1))
        # argsort keeps duplicate distances; the original {distance: label}
        # dict silently dropped training points whose distances tied.
        nearest_idx = np.argsort(distances)[:k]
        k_labels = [labels[i] for i in nearest_idx]
        # Majority vote among the k nearest neighbors.
        predict_label_list.append(Counter(k_labels).most_common(1)[0][0])
    return predict_label_list
def process_text_data(file_path, fea_num):
    """
    Read a tab-separated text file into a 2-D string array.

    :param file_path: path of the text file; one record per line,
                      fields separated by tabs
    :param fea_num: expected number of columns per line (features + label)
    :return: (row_num, fea_num) ndarray of strings, or None on I/O error
    """
    try:
        # 'with' guarantees the file is closed even if parsing raises.
        with open(file_path) as f:
            # strip() removes the trailing newline and surrounding whitespace;
            # split('\t') yields one field per column.
            rows = [line.strip().split('\t') for line in f]
        # Build the array directly from the parsed rows.  This avoids the
        # original pre-allocated np.zeros(...).astype(str) buffer, whose fixed
        # element width could silently truncate long label strings.
        return_array = np.array(rows).astype(str)
        # Sanity check: every line must have produced fea_num fields
        # (a ragged file would otherwise yield an object array).
        if return_array.ndim != 2 or return_array.shape[1] != fea_num:
            raise ValueError('expected %d columns per line' % fea_num)
        return return_array
    except IOError as e:
        # Best-effort loader: report the problem and return None.
        print('error: \n' + str(e))
def normalization_process(data_set):
    """
    Min-max normalize each column to the [0, 1] range.

    :param data_set: ndarray or DataFrame of numeric values
    :return: normalized values as an ndarray
    """
    # BUG FIX: the original tested `type(data_set) is not types.FrameType`,
    # but types.FrameType is the *stack frame* type, not pandas.DataFrame,
    # so the check was always True and only worked by accident.
    if not isinstance(data_set, pd.DataFrame):
        data_set = pd.DataFrame(data_set)
    # Column-wise (x - min) / (max - min).  NOTE(review): a constant column
    # yields 0/0 -> NaN, same as the original — confirm inputs vary per column.
    data_set = (data_set - data_set.min()) / (data_set.max() - data_set.min())
    return np.array(data_set)
if __name__ == '__main__':
    # Load the raw tab-separated data (3 features + 1 label per row).
    file_pth = 'F:/MachineLearning/ML_chapter2/datingTestSet.txt'
    data_array = process_text_data(file_pth, 4)
    print('data_array: \n', data_array)
    # BUG FIX: random.shuffle corrupts 2-D NumPy arrays (its tuple swap
    # operates through row *views*, duplicating rows); use np.random.shuffle,
    # which shuffles along the first axis correctly.
    np.random.shuffle(data_array)
    # 80/20 train/test split.
    split = int(len(data_array) * 0.8)
    train_set = data_array[:split, :]
    test_set = data_array[split:, :]
    # BUG FIX: extract the true labels BEFORE dropping the label column.
    # The original stripped the column first and then read test_set[:, -1],
    # which by then was the last *feature*, not the label, so the reported
    # accuracy was meaningless.
    test_labels = test_set[:, -1]
    test_set = test_set[:, :-1]
    # Predict with k = 100 nearest neighbors.
    predict_result = knn(train_set, test_set, 100)
    print('result: \n', predict_result)
    # Compare predictions against the held-out true labels.
    accu = accuracy_score(test_labels, predict_result)
    print('accu: \n', accu)
    # Optional detailed per-class report:
    # report = classification_report(test_labels, predict_result)
    # print('report: \n', report)