第二章.knn
import numpy as np
import operator
def createDataSet():
    """Return the four-point toy data set used to demonstrate kNN.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        sample coordinates and ``labels`` is a list with the class label
        ('A' or 'B') of each row, in the same order.
    """
    # Each record pairs one sample's coordinates with its class label.
    records = (
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    )
    group = np.array([point for point, _ in records])
    labels = [tag for _, tag in records]
    return group, labels
# 下面代码使用knn进行一次简单的分类
def classify0(input_data, data_set, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        input_data: The sample to classify. It is subtracted from
            ``data_set`` with broadcasting, so it must be compatible with
            one row of ``data_set``.
        data_set: Training samples, one per row (expected to be a 2-D NumPy
            array, since the distance sum runs over ``axis=1``).
        labels: Class labels, indexable by the row position in ``data_set``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes. When several labels tie, the one
        met first while walking the neighbours from nearest to farthest
        wins (dict insertion order plus a stable sort).
    """
    # Step 1: Euclidean distance from the sample to every training row.
    offsets = input_data - data_set
    distances = np.sqrt(np.square(offsets).sum(axis=1))

    # Steps 2 and 3: walk the k closest rows and tally their labels.
    nearest_first = distances.argsort()
    votes = {}
    for rank in range(k):
        neighbour_label = labels[nearest_first[rank]]
        if neighbour_label in votes:
            votes[neighbour_label] += 1
        else:
            votes[neighbour_label] = 1

    # Step 4: the most frequent label wins.
    ranking = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    winner, _ = ranking[0]
    return winner
# Build the toy data set, then classify the point [0, 0] using its 3 nearest
# neighbours. The call is a bare expression, so its result is only shown
# when the interpreter echoes it (e.g. as a notebook cell output).
group, labels = createDataSet()
classify0([0,0], group, labels, 3)
'B'
例子1:使用knn进行约会网站的配对
00.使用pandas处理数据
import pandas as pd
# Load the tab-separated dating data set, naming its four columns explicitly.
# NOTE(review): the plots later in this file label column 0 as miles and
# column 1 as game time, and the values in column 0 are in the tens of
# thousands - the 'games'/'miles' names here look swapped; confirm against
# the data set description.
df = pd.read_csv("dataset/datingTestSet2.txt", sep='\t', names=['games', "miles", "ice_cream", "category"])
# Bare expression: displays the DataFrame when echoed by the interpreter.
df
| | games | miles | ice_cream | category |
---|---|---|---|---|
0 | 40920 | 8.326976 | 0.953952 | 3 |
1 | 14488 | 7.153469 | 1.673904 | 2 |
2 | 26052 | 1.441871 | 0.805124 | 1 |
3 | 75136 | 13.147394 | 0.428964 | 1 |
4 | 38344 | 1.669788 | 0.134296 | 1 |
... | ... | ... | ... | ... |
995 | 11145 | 3.410627 | 0.631838 | 2 |
996 | 68846 | 9.974715 | 0.669787 | 1 |
997 | 26575 | 10.650102 | 0.866627 | 3 |
998 | 48111 | 9.134528 | 0.728045 | 3 |
999 | 43757 | 7.882601 | 1.332446 | 3 |
1000 rows × 4 columns
01.文件转变量
# 将文件转换为numpy数组
def file2metrix(filename):
    """Load a tab-separated data file into a feature matrix and a label vector.

    The file is read without a header row. The first three columns are the
    features and the fourth column is the class label; columns are taken by
    position, so no assumption is made about what each feature means.

    Args:
        filename: Path (or any file-like object accepted by
            ``pandas.read_csv``) of the tab-separated data file.

    Returns:
        tuple: ``(features, category)`` where ``features`` is a float array
        of shape (n_rows, 3) and ``category`` is a 1-D array holding the
        fourth column.
    """
    # Read from the path that was passed in; the previous version ignored
    # the argument and always opened a hard-coded file.
    df = pd.read_csv(filename, sep='\t', header=None)
    features = df.iloc[:, :3].to_numpy(dtype=float)
    category = df.iloc[:, 3].to_numpy()
    return features, category
# Load the data set as NumPy arrays, then peek at the first 20 class labels
# (bare expression, shown only when the interpreter echoes it).
features, category = file2metrix("dataset/datingTestSet2.txt")
category[:20]
array([3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3],
dtype=int64)
02.分析数据
# Configure matplotlib so that Chinese text renders in figures, then draw a
# scatter plot of feature column 1 against feature column 2, coloured by
# class label.
import matplotlib.pyplot as plt
plt.rcParams["font.sans-serif"]=["SimHei"] # set the font used for sans-serif text
plt.rcParams["axes.unicode_minus"]=False # avoids the minus sign "-" rendering as a garbled character
plt.scatter(features[:, 1], features[:, 2], c=category)
plt.xlabel("游戏时间")
plt.ylabel("冰淇淋公升数")
plt.show()
![output_12_0](output_12_0.png)
# Plot feature column 0 against feature column 1, one series per class label
# (1, 2, 3), each with its own colour and legend entry.
# Boolean mask selecting the rows whose label is 1.
category_1_mask = category == 1
plt.plot(features[:, 0][category_1_mask], features[:, 1][category_1_mask], 'bo', label="不喜欢")
# Boolean mask selecting the rows whose label is 2.
category_2_mask = category == 2
plt.plot(features[:, 0][category_2_mask], features[:, 1][category_2_mask], 'go', label="魅力一般")
# Boolean mask selecting the rows whose label is 3.
category_3_mask = category == 3
plt.plot(features[:, 0][category_3_mask], features[:, 1][category_3_mask], 'ro', label="极具魅力")
plt.legend(loc="best")
plt.xlabel("里程数")
plt.ylabel("游戏时间")
plt.show()
![output_13_0](output_13_0.png)
03.归一化数据
def autoNorm(dataset):
    """Min-max normalize every column of ``dataset``.

    Each column is shifted by its minimum and divided by its range
    (max - min). A column whose values are all equal has a range of 0; such
    a column is divided by 1 instead, so it normalizes to 0 rather than
    producing NaN from 0/0.

    Args:
        dataset: 2-D array-like with one sample per row.

    Returns:
        tuple: ``(dataset_normilized, ranges, min_value)`` where
        ``dataset_normilized`` is the scaled data, ``ranges`` is the
        per-column divisor that was applied (max - min, with 0 replaced by
        1) and ``min_value`` is the per-column minimum. A new sample can be
        normalized the same way with ``(sample - min_value) / ranges``.
    """
    dataset = np.asarray(dataset)
    min_value = np.min(dataset, axis=0)
    ranges = np.max(dataset, axis=0) - min_value
    # Guard against division by zero for constant columns.
    ranges = np.where(ranges == 0, 1, ranges)
    dataset_normilized = (dataset - min_value) / ranges
    return dataset_normilized, ranges, min_value
# Normalize the loaded features, then show the per-column minimums
# (bare expression, shown only when the interpreter echoes it).
dataset_normilized, ranges, min_value = autoNorm(features)
min_value
array([0. , 0. , 0.001156])
04.测试算法
# 将一部分数据作为测试数据
def dating_class_test():
    """Estimate the kNN error rate on a random 10% hold-out of the dating data.

    Loads the data set, min-max normalizes the features, randomly holds out
    ``test_ratio`` of the rows as test data, classifies every held-out row
    with ``classify0`` (k=3) against the remaining rows, and prints the
    share of misclassified rows as a percentage.
    """
    test_ratio = 0.1
    # 1. Load the data.
    features, labels = file2metrix("dataset/datingTestSet2.txt")
    # 2. Normalize the features. autoNorm returns a tuple; only the scaled
    #    data is needed here. The previous version never used the result,
    #    so distances were computed on raw, unscaled features.
    features_normilized, _, _ = autoNorm(features)
    # 3. Randomly split the row indices into a test part and a training part.
    data_size = features_normilized.shape[0]
    random_index = np.arange(data_size)
    np.random.shuffle(random_index)
    test_data_size = int(data_size * test_ratio)
    test_index = random_index[:test_data_size]
    train_index = random_index[test_data_size:]
    features_test = features_normilized[test_index]
    features_train = features_normilized[train_index]
    labels_test = labels[test_index]
    labels_train = labels[train_index]
    # 4. Count the misclassified test rows.
    error_count = 0
    for sample, expected in zip(features_test, labels_test):
        if classify0(sample, features_train, labels_train, 3) != expected:
            error_count += 1
    # Multiply before dividing so the printed value is a true percentage
    # (the previous version printed the raw fraction followed by '%').
    error_percent = 100.0 * error_count / float(test_data_size)
    print("错误率是:{0}%".format(error_percent))
# Run the hold-out evaluation; it prints the error rate.
dating_class_test()
错误率是:0.22%
05.构建可用系统
numpy.ndarray
def classify_person():
    """Interactively classify one person with kNN (k=3) on the dating data.

    Prompts for three numbers, normalizes them with the same minimums and
    ranges as the training data, classifies the result against the
    normalized data set and prints the matching description.

    Raises:
        ValueError: If any of the entered values cannot be parsed as a float.
    """
    # Descriptions for class labels 1, 2 and 3 (hence the ``- 1`` below).
    result_labels = ['不喜欢', '小魅力', '大魅力']
    dating_features, dating_labels = file2metrix("dataset/datingTestSet2.txt")
    dating_features_normilized, ranges, min_value = autoNorm(dating_features)
    input_game = input("输入他每周花费在游戏上的时间")
    input_miles = input("输入他每周飞行的里程数")
    input_ice_cream = input("输入他每周吃的冰淇淋公升数")
    # The query vector must follow the column order of the data file.
    # NOTE(review): column 0 holds values in the tens of thousands and is
    # plotted elsewhere in this file as miles, with column 1 as game time,
    # so the order used here is [miles, game, ice_cream]. The previous
    # version used [game, miles, ice_cream], scaling the inputs against the
    # wrong columns. Confirm against the data set description.
    raw_feature = np.array([float(input_miles), float(input_game), float(input_ice_cream)])
    input_feature = (raw_feature - min_value) / ranges
    classified_result = classify0(input_feature, dating_features_normilized, dating_labels, 3)
    print("这个人的类型属于:", result_labels[classified_result - 1])
# Start the interactive classifier; it prompts for three values on stdin.
classify_person()
这个人的类型属于: 不喜欢