【统计学习笔记】习题三
1. k值选择与模型复杂度及预测准确率的关系
from collections import Counter

import numpy as np
class KNN:
    """Brute-force k-nearest-neighbour classifier.

    Keeps the whole training set and, for each query, scans it once while
    maintaining the n closest points seen so far.
    """

    def __init__(self, X_train, y_train, n_neighbors=3, p=2):
        """
        parameter: n_neighbors  number of neighbours to consider
        parameter: p            order of the Minkowski distance (p=2: Euclidean)
        """
        self.n = n_neighbors
        self.p = p
        self.X_train = X_train
        self.y_train = y_train

    def _nearest(self, X):
        """Return a list of (distance, label) pairs for the n nearest training points to X."""
        # Seed the candidate list with the first n training points.
        knn_list = []
        for i in range(self.n):
            dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
            knn_list.append((dist, self.y_train[i]))
        # Scan the rest; replace the current farthest candidate whenever a
        # closer point is found.
        for i in range(self.n, len(self.X_train)):
            max_index = knn_list.index(max(knn_list, key=lambda x: x[0]))
            dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
            if knn_list[max_index][0] > dist:
                knn_list[max_index] = (dist, self.y_train[i])
        return knn_list

    def predict(self, X):
        """Return the majority label among the n nearest training points."""
        labels = [pair[-1] for pair in self._nearest(X)]
        # BUG FIX: the original sorted the label *values* and took the last
        # one, which returns the numerically largest label, not the most
        # frequent one. Counter.most_common gives the true majority vote.
        count_pairs = Counter(labels)
        return count_pairs.most_common(1)[0][0]

    def segment(self, X):
        """Return sum(label ** 1.5) over the n nearest neighbours.

        Used as a scalar field to colour decision regions when plotting.
        """
        knn_list = np.array(self._nearest(X))
        return np.sum(knn_list[:, 1] ** 1.5)

    def score(self, X_test, y_test):
        """Return the fraction of test points whose prediction matches y_test."""
        right_count = 0
        for X, y in zip(X_test, y_test):
            if self.predict(X) == y:
                right_count += 1
        return right_count / len(X_test)
import matplotlib.pyplot as plt
def plot_decision_boundaries(clusterer, X, resolution=1000, show_xlabels=True, show_ylabels=True):
    """Draw the decision regions of `clusterer.segment` over the bounding box of X.

    Evaluates segment() on a resolution x resolution grid and renders the
    resulting scalar field with filled contours plus black contour lines.
    """
    lo = X.min(axis=0) - 0.1
    hi = X.max(axis=0) + 0.1
    xx, yy = np.meshgrid(np.linspace(lo[0], hi[0], resolution),
                         np.linspace(lo[1], hi[1], resolution))
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    # One segment() call per grid point; reshape back to the grid.
    Z = np.array([clusterer.segment(pt) for pt in grid_points]).reshape(xx.shape)
    extent = (lo[0], hi[0], lo[1], hi[1])
    plt.contourf(Z, extent=extent, cmap="Pastel2")
    plt.contour(Z, extent=extent, linewidths=1, colors='k')
    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        plt.tick_params(labelbottom=False)
    if show_ylabels:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft=False)
生成训练数据(8 个随机二维点,每个点一个互不相同的标签):
# Generate 8 random 2-D training points, each with its own distinct label.
X_train = np.random.random((8, 2))
# BUG FIX: np.arange(1, 10) produces 9 labels for only 8 samples;
# use 1..8 so labels align one-to-one with the training points.
Y_train = np.arange(1, 9)
# Show the raw training points as black dots.
plt.plot(X_train[:, 0], X_train[:, 1], 'ko')
1.1 k=1
# k = 1: every training point dominates its own cell (a Voronoi-like
# partition) — maximum model complexity, i.e. the overfitting extreme.
mod1 = KNN(X_train,Y_train,n_neighbors=1)
# resolution=200 keeps the grid evaluation fast enough for a demo.
plot_decision_boundaries(mod1, X_train,resolution=200)
# Overlay the training points on the decision regions.
plt.plot(X_train[:,0],X_train[:,1],'ko')
1.2 k=2(区域更平滑,模型复杂度低于 k=1)
# k = 2: each region is now shaped by the two nearest points, giving
# coarser, smoother boundaries than k = 1.
mod2 = KNN(X_train, Y_train, n_neighbors=2)
# plot_decision_boundaries has no return value; the original assigned its
# None result to Z, which was misleading dead code — call it directly.
plot_decision_boundaries(mod2, X_train, resolution=200)
# Overlay the training points on the decision regions.
plt.plot(X_train[:, 0], X_train[:, 1], 'ko')
部分区域的 segment 值其实不同,但没有被划分开。可能的原因(待验证):contourf/contour 在未显式指定 levels 时只取有限的默认等级,相近的 segment 值会被归并到同一色块;可向 contourf 显式传入 levels 参数以区分这些区域。