机器学习之KNN
以约会问题为例,展示整个算法的工作流程
源代码
注:这里使用的是 Python 语言
import numpy as np
import operator
def read_data(file_path):
    """Load a tab-separated sample file into a feature matrix and a label list.

    Each non-blank line is expected to hold three numeric feature values
    followed by a class label, separated by tab characters.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(dataset, class_vec)`` tuple. ``dataset`` is a float NumPy array
        with one row per sample and three columns; ``class_vec`` is a list
        with the label string of each sample, in file order.

    Raises:
        ValueError: If a feature value cannot be converted to ``float`` or a
            line does not fit the three-column row.
    """
    # The context manager guarantees the file is closed even if parsing fails.
    with open(file_path) as fr:
        # Blank lines (typically a trailing newline at the end of the file)
        # carry no sample, so they are skipped instead of breaking the parse.
        records = [line.strip().split('\t') for line in fr if line.strip()]

    dataset = np.zeros((len(records), 3))
    class_vec = []
    for index, fields in enumerate(records):
        # Everything except the last field is a feature; the last is the label.
        dataset[index, :] = [float(value) for value in fields[:-1]]
        class_vec.append(fields[-1])
    return dataset, class_vec
def data_classify(dataset, class_vec):
    """Group the three feature columns of the samples by class label.

    Args:
        dataset: Sequence of samples, each with at least three feature
            values (for example a 2-D NumPy array with three columns).
        class_vec: Label of every sample, aligned with ``dataset``.

    Returns:
        A ``(type1, type2, type3, labels)`` tuple. ``labels`` lists the
        distinct labels in order of first appearance. ``typeN`` is a list of
        three lists holding the values of feature 0, 1 and 2 for the samples
        whose label is ``labels[N - 1]``. Samples whose label is not among
        the first three distinct labels are not placed in any group.
    """
    # dict.fromkeys de-duplicates while preserving first-appearance order, so
    # the label -> group mapping is reproducible between runs (iterating a
    # set of strings is not, because of hash randomization).
    labels = list(dict.fromkeys(class_vec))
    groups = [[[], [], []] for _ in range(3)]
    slot_of = {label: slot for slot, label in enumerate(labels[:3])}

    for row, label in zip(dataset, class_vec):
        slot = slot_of.get(label)
        if slot is None:
            # Only three groups are returned; further labels are not grouped.
            continue
        for column in range(3):
            groups[slot][column].append(row[column])

    type1, type2, type3 = groups
    return type1, type2, type3, labels
def view_data(dataset, class_vec):
    """Show pairwise scatter plots of the three features, one series per class.

    Three subplots are drawn side by side: feature 0 vs 1, feature 0 vs 2 and
    feature 1 vs 2. The call blocks in ``plt.show()`` until the window is
    closed.

    Args:
        dataset: Feature matrix passed on to ``data_classify``.
        class_vec: Label of every sample, aligned with ``dataset``.
    """
    # Imported locally so matplotlib is only required when plotting.
    from matplotlib import pyplot as plt

    # Select a font for the sans-serif family and keep the minus sign as a
    # plain ASCII hyphen.
    plt.rcParams['font.sans-serif'] = ['Simhei']
    plt.rcParams['axes.unicode_minus'] = False

    type1, type2, type3, labels = data_classify(dataset, class_vec)
    plt.figure(figsize=(30, 10))

    # (x column, y column) for each of the three subplots.
    feature_pairs = ((0, 1), (0, 2), (1, 2))
    for position, (x_col, y_col) in enumerate(feature_pairs, start=1):
        axes = plt.subplot(1, 3, position)
        # Every class uses the same column pair; the original third subplot
        # plotted column 2 against itself for the third class.
        for group in (type1, type2, type3):
            axes.scatter(group[x_col], group[y_col])
        plt.legend(labels)
    plt.show()
def split_dataset(dataset, class_vec, ratio=0.8):
    """Split samples and labels into a leading training part and a trailing test part.

    The split is positional (no shuffling): the first ``int(n * ratio)``
    rows become the training set and the remaining rows the test set.

    Args:
        dataset: 2-D NumPy array with one sample per row.
        class_vec: Label of every sample, aligned with ``dataset``.
        ratio: Fraction of the samples used for training, between 0 and 1.
            Defaults to 0.8.

    Returns:
        A ``(train_dataset, test_dataset, train_class, test_class)`` tuple.

    Raises:
        ValueError: If ``ratio`` is outside the range [0, 1].
    """
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1, got %r" % (ratio,))

    stop_index = int(dataset.shape[0] * ratio)
    train_dataset, test_dataset = dataset[:stop_index, :], dataset[stop_index:, :]
    train_class, test_class = class_vec[:stop_index], class_vec[stop_index:]
    return train_dataset, test_dataset, train_class, test_class
def data_normalize(train_dataset):
    """Min-max scale every column of ``train_dataset`` into the range [0, 1].

    Args:
        train_dataset: 2-D NumPy array with one sample per row.

    Returns:
        A ``(data_normal, maxv, minv)`` tuple: the scaled array, and the
        per-column maximum and minimum of the input so that other data can
        be scaled the same way.
    """
    maxv = train_dataset.max(0)
    minv = train_dataset.min(0)
    diff = maxv - minv
    # A constant column has a zero range; dividing by 1 instead maps all of
    # its values to 0 rather than producing NaN.
    safe_diff = np.where(diff == 0, 1, diff)
    # Broadcasting applies the per-column offset and scale to every row.
    data_normal = (train_dataset - minv) / safe_diff
    return data_normal, maxv, minv
def KNN(train_dataset, train_class, input_data, k):
    """Classify ``input_data`` by majority vote among its ``k`` nearest samples.

    Distance is the squared Euclidean distance between ``input_data`` and
    every row of ``train_dataset``.

    Args:
        train_dataset: 2-D NumPy array with one training sample per row.
        train_class: Label of every training sample, aligned with the rows.
        input_data: 1-D array with the features of the sample to classify.
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.

    Returns:
        The label with the most votes. On a tie, the label whose first vote
        came from the nearer neighbour wins.

    Raises:
        ValueError: If ``k`` is less than 1 or there are no training samples.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))

    difference = np.sum((train_dataset - input_data) ** 2, 1)
    sorted_index = difference.argsort()
    if len(sorted_index) == 0:
        raise ValueError("train_dataset contains no samples")

    # Never ask for more neighbours than there are training samples.
    neighbour_count = min(k, len(sorted_index))
    class_count = {}
    for neighbour in sorted_index[:neighbour_count]:
        voted_label = train_class[neighbour]
        class_count[voted_label] = class_count.get(voted_label, 0) + 1

    # max() returns the first maximal key in insertion order, i.e. the label
    # that was voted for earliest among equally frequent labels.
    return max(class_count, key=class_count.get)
def main():
    """Run the whole workflow: load, plot, split, normalize, classify, report.

    Reads ``datingTestSet.txt`` from the current working directory, shows the
    scatter plots, classifies every test sample with 3-nearest-neighbours and
    prints each prediction followed by the overall error rate.
    """
    file_path = "datingTestSet.txt"
    dataset, class_vec = read_data(file_path)
    print('dataset = \n', dataset[:5, :])
    print(type(class_vec), 'class_vec = \n', class_vec[:20])

    view_data(dataset, class_vec)
    print('画图完成!')

    train_dataset, test_dataset, train_class, test_class = split_dataset(dataset, class_vec)
    train_data_normal, maxv, minv = data_normalize(train_dataset)

    # Test samples are scaled with the training statistics. The range is
    # loop-invariant, and a zero range (constant training column) is replaced
    # by 1 so the division cannot produce NaN or inf.
    value_range = maxv - minv
    value_range = np.where(value_range == 0, 1, value_range)

    error_num = 0
    for data, actual in zip(test_dataset, test_class):
        data_normal = (data - minv) / value_range
        label = KNN(train_data_normal, train_class, data_normal, 3)
        print('估计出的标签为%s, 实际的标签为:%s' % (label, actual))
        if str(label) != str(actual):
            error_num += 1
            print('*' * 20, '上面一行错误', '*' * 20)

    # An empty test split would otherwise raise ZeroDivisionError.
    test_count = len(test_dataset)
    error_rate = error_num / test_count if test_count else 0.0
    print('错误率:%.2f' % error_rate)
# Run the workflow only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()