1.基础概念
1.1 kd树
由于在KNN的预测结果中需要根据训练样本给出每个预测样本的标签值,因此就需要知道每个训练样本的原始标签值,故需要在节点中保存每个样本索引。
class Node(object):
    """A kd-tree node holding one training sample and its original row index.

    The index is kept so that, at prediction time, a matched node can be
    traced back to the label of the training sample it came from.
    """

    def __init__(self, data=None, index=-1):
        self.data = data    # feature vector of the training sample
        self.index = index  # row index of the sample in the training set
        # Children are attached while the tree is being built.
        self.left_child = None
        self.right_child = None

    def __str__(self):
        return f"data({self.data}),index({int(self.index)})"
KD树的构建:
#定义类的初始化函数
class MyKDTree(object):
    """kd-tree built over a 2-D numpy array of sample points."""

    def __init__(self, points):
        self.root = None
        self.dim = points.shape[1]  # number of feature dimensions
        # Append each sample's original row number as a trailing column so
        # every node can remember which training sample it holds.
        indexed = np.hstack([points, np.arange(len(points)).reshape(-1, 1)])
        self.insert(indexed, order=0)  # recursively build the kd-tree

    def is_empty(self):
        """Return True while no root node has been inserted yet."""
        return not self.root
insert()方法的实现过程:
def insert(self, data, order=0):
    """Recursively build a kd subtree and return its root node.

    data:  sequence of rows whose last column is the training-sample index
           (appended in __init__); the remaining columns are features.
    order: recursion depth, used to cycle through the split dimension.
    """
    if len(data) < 1:  # nothing left to place in this subtree
        return
    axis = order % self.dim
    data = sorted(data, key=lambda row: row[axis])  # order along split axis
    logging.debug(f"当前待划分样本点:{data}")
    mid = len(data) // 2  # median row becomes the subtree root
    node = Node(data[mid][:-1], data[mid][-1])
    logging.debug(f"父节点:{data[mid]}")
    left_part = data[:mid]
    logging.debug(f"左子树: {left_part}")
    right_part = data[mid + 1:]
    logging.debug(f"右子树: {right_part}")
    logging.debug("============")
    if self.is_empty():
        # The very first node created is remembered as the tree root.
        self.root = node
    node.left_child = self.insert(left_part, order + 1)
    node.right_child = self.insert(right_part, order + 1)
    return node
1.2 kd树最近邻搜索
1 KNearestNodes, n = [], 0
2 def NearestNodeSearch(curr_node):
3 if curr_node == None:
4 return
5 if n < K:
6 KNearestNodes.insert(curr_node) # 插入后保持有序
7 if n >= K and
8 distance(curr_node, q) < distance(q, KNearestNodes[-1]):
9 KNearestNodes.pop()
10 KNearestNodes.insert(curr_node) # 插入后保持有序
11 if q_i < curr_node_i:
12 NearestNodeSearch(curr_node.left)
13 else:
14 NearestNodeSearch(curr_node.right)
15 if n < K or | curr_node_i - q_i | < distance(q, KNearestNodes[-1]):
16 NearestNodeSearch(curr_node.other)
在实现K近邻搜索之前,我们先来定义一个辅助函数用于对节点进行插入并排序,代码如下:
def append(k_nearest_nodes, curr_node, point):
    """Add curr_node and keep the list ordered by distance to *point*.

    Nearest node first, farthest last, so callers can inspect/evict the
    worst candidate via k_nearest_nodes[-1].
    NOTE(review): the call site uses self.append(...) as if this were a
    method taking self — confirm the intended signature against the class.
    """
    k_nearest_nodes.append(curr_node)
    k_nearest_nodes.sort(key=lambda nd: distance(nd.data, point))
    return k_nearest_nodes
def _k_nearest_search(self, point, k):
    """Return the k nodes of the tree nearest to *point* (nearest first).

    Performs the classic kd-tree descent followed by backtracking: the
    `visited` list records every node touched so the backtrack step can
    pick whichever child subtree has not been explored yet.
    """
    k_nearest_nodes = []  # current best candidates, ordered by distance
    visited = []          # every node touched, used to pick the unexplored sibling
    n = 0                 # how many candidates collected so far (capped at k)

    def k_nearest_node_search(point, curr_node, order=0):
        # `order` is the depth; order % self.dim selects the split axis.
        nonlocal k_nearest_nodes, n
        if curr_node is None:
            return None
        visited.append(curr_node)
        if n < k:  # fewer than k candidates so far: keep unconditionally
            n += 1
            k_nearest_nodes = self.append(k_nearest_nodes, curr_node, point)
        else:  # already have k candidates: keep only if closer than the worst
            # NOTE: `dist` is a boolean (is curr_node closer than the current
            # k-th nearest?), despite the name.
            dist = (distance(curr_node.data, point) < distance(point, k_nearest_nodes[-1].data))
            if dist:
                k_nearest_nodes.pop()  # evict the farthest candidate
                k_nearest_nodes = self.append(k_nearest_nodes, curr_node, point)  # insert and re-sort
        cmp_dim = order % self.dim
        # Descend into the half-space that contains the query point.
        if point[cmp_dim] < curr_node.data[cmp_dim]:
            k_nearest_node_search(point, curr_node.left_child, order + 1)
        else:
            k_nearest_node_search(point, curr_node.right_child, order + 1)
        # Backtrack: visit the other side if we still need candidates, or if
        # the splitting hyperplane is closer than the current k-th nearest
        # (so the other half-space could contain a closer point).
        if n < k or np.abs(curr_node.data[cmp_dim] - point[cmp_dim]) < distance(point,
                                                                                k_nearest_nodes[-1].data):
            # Whichever child was not reached by the descent above.
            child = curr_node.left_child if curr_node.left_child not in visited else curr_node.right_child
            k_nearest_node_search(point, child, order + 1)

    k_nearest_node_search(point, self.root, 0)
    return k_nearest_nodes
需要再定义一个函数即可循环完成多个样本点的K近邻搜索过程,代码如下:
def k_nearest_search(self, points, k):
    """Run the kd-tree k-NN search for every query row in *points*.

    Returns a pair of numpy arrays: the neighbours' feature vectors
    (one row of k vectors per query) and their training-set indices
    (one row of k ints per query).
    """
    all_points, all_indices = [], []
    for point in points:
        nodes = self._k_nearest_search(point, k)
        all_points.append([nd.data for nd in nodes])
        all_indices.append([int(nd.index) for nd in nodes])
    return np.array(all_points), np.array(all_indices)
2.KNN
2.1 模型拟合
对于KNN分类模型来说,所谓的模型拟合其实就是根据给定的训练样本来构造完成对应的树,实现代码如下所示:
class KNN(object):
    """K-nearest-neighbour classifier backed by the hand-written kd-tree."""

    def __init__(self, n_neighbors):
        # Number of neighbours consulted for each prediction.
        self.n_neighbors = n_neighbors

    def fit(self, x, y):
        """Build the kd-tree over x and memorise the labels; returns self."""
        self.kd_tree = MyKDTree(x)
        self._y = y
        return self
模型预测:
def get_pred_labels(query_label):
    """Majority-vote each row of neighbour labels into one predicted label.

    Ties are broken in favour of the label that *first reaches* the highest
    count while scanning the row left to right (nearest neighbour first).
    """
    y_pred = [0] * len(query_label)
    for i, neighbor_labels in enumerate(query_label):
        freq = {}
        best_count = 0
        for lab in neighbor_labels:
            freq[lab] = freq.get(lab, 0) + 1
            if freq[lab] > best_count:  # strictly greater: preserves tie-break
                best_count = freq[lab]
                y_pred[i] = lab
    return np.array(y_pred)
新样本的预测:
def predict(self, x):
    """Label each sample in x by majority vote among its k nearest neighbours."""
    # Search returns (neighbour feature vectors, neighbour training indices);
    # only the indices are needed to look up the stored labels.
    _, neighbor_idx = self.kd_tree.k_nearest_search(x, k=self.n_neighbors)
    neighbor_labels = self._y[neighbor_idx]
    return self.get_pred_labels(neighbor_labels)
最后,掌柜以iris数据集为例来进行实验,并同时与sklearn中KNeighborsClassifier
的分类结果进行对比。
if __name__ == '__main__':
    # Compare the hand-written KNN against sklearn's KNeighborsClassifier
    # on the same train/test split (iris data via load_data()).
    x_train, x_test, y_train, y_test = load_data()
    k = 5

    sk_model = KNeighborsClassifier(n_neighbors=k)
    sk_model.fit(x_train, y_train)
    y_pred = sk_model.predict(x_test)
    logging.info(f"impl_by_sklearn 准确率:{accuracy_score(y_test, y_pred)}")

    my_model = KNN(n_neighbors=k)
    my_model.fit(x_train, y_train)
    y_pred = my_model.predict(x_test)
    logging.info(f"impl_by_ours 准确率:{accuracy_score(y_test, y_pred)}")
以上信息均来源于月来客栈,学习笔记,如有侵权请联系删除!!!