KNN (K-Nearest Neighbors)
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import datasets
class KDNode():
def __init__(self, data, label, depth, left=None, right=None):
        self.data = data    # sample point
        self.label = label  # class label
        self.depth = depth  # depth in the tree (determines the split axis)
        self.left = left    # left subtree
        self.right = right  # right subtree
class KDTree():
def __init__(self):
self.KdTree = None
self.n = 0
self.nearest = None
    # build the kd-tree recursively
    def create(self, X, y, depth=0):
        m, n = X.shape
        if m == 0:
            return None
        self.n = n
        dim = depth % self.n  # cycle through the split axes
        # sort the samples along the current axis
        indices = np.argsort(X[:, dim])
        X = X[indices]
        y = y[indices]
        # use the median sample as the split point
        split_pos = m // 2
        node = KDNode(X[split_pos], y[split_pos], depth)
        # remember the root node
        if depth == 0:
            self.KdTree = node
        # recurse into the left and right subtrees
        node.left = self.create(X[:split_pos], y[:split_pos], depth+1)
        node.right = self.create(X[split_pos+1:], y[split_pos+1:], depth+1)
        return node
    # pre-order traversal (for debugging)
    def preOrder(self, node):
        if node is not None:
            # print(node.depth, node.data)
self.preOrder(node.left)
self.preOrder(node.right)
def search(self, x, k=1):
        # record the k nearest neighbors as [distance, node] pairs,
        # where a distance of -1 marks a still-empty slot
        nearest = []
        for i in range(k):
            nearest.append([-1, None])
        self.nearest = np.array(nearest)
        def travel(node):
            if node is not None:
                axis = node.depth % self.n
                daxis = x[axis] - node.data[axis]
                # if the query point lies left of the split value along this
                # axis, descend into the left subtree first, otherwise the right
                if daxis < 0:
                    travel(node.left)
                else:
                    travel(node.right)
                # Euclidean distance from the query point to this split point
                dist = np.sqrt(sum((x - node.data) ** 2))
                # insert into the sorted neighbor list, dropping the farthest entry
                for i, d in enumerate(self.nearest):
                    if d[0] < 0 or dist < d[0]:
                        self.nearest = np.insert(self.nearest, i, [dist, node], axis=0)
                        self.nearest = self.nearest[:-1]
                        break
                n = list(self.nearest[:, 0]).count(-1)  # number of still-empty slots
                # backtrack: if the hypersphere through the current farthest
                # neighbor crosses the splitting hyperplane, the other subtree
                # may contain closer points and must also be searched
                if self.nearest[-n-1, 0] > abs(daxis):
                    if daxis < 0:
                        travel(node.right)
                    else:
                        travel(node.left)
travel(self.KdTree)
knn = self.nearest[:, 1]
labels = []
for node in knn:
labels.append(node.label)
        # majority vote over the neighbors' labels
        yhat = np.argmax(np.bincount(labels))
return yhat
class KNN():
def __init__(self, n_neighbors, algorithm="brute"):
self.n_neighbors = n_neighbors
self.algorithm = algorithm
def fit(self, X, y):
self.X = X
self.y = y
def predict(self, x_test):
if self.algorithm == "brute":
return self.brute(self.X, self.y, x_test)
elif self.algorithm == "kd_tree":
return self.kd_tree(self.X, self.y, x_test)
    # brute-force search
    def brute(self, X, y, x_test):
        m = x_test.shape[0]
        yhat = np.zeros(m)
        for i in range(m):
            # distance from the query point to every training sample
            dist = np.sqrt(np.sum((x_test[i] - X) ** 2, axis=1))
            # take the k samples with the smallest distances
            indices = np.argsort(dist)[: self.n_neighbors]
            y_select = y[indices]
            # the most frequent class among them is the prediction
            yhat[i] = np.argmax(np.bincount(y_select))
            # an equivalent alternative using Counter:
            # yhat[i] = Counter(y_select).most_common(1)[0][0]
return yhat
    # kd-tree search
def kd_tree(self, X, y, x_test):
kdtree = KDTree()
kdtree.create(X, y)
kdtree.preOrder(kdtree.KdTree)
m = x_test.shape[0]
yhat = np.zeros(m, )
for i in range(m):
            yhat[i] = kdtree.search(x_test[i], self.n_neighbors)  # query the k nearest neighbors
return yhat
def load_data():
iris = datasets.load_iris()
return iris.data, iris.target
X, y = load_data()
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
knn = KNN(n_neighbors=5, algorithm="kd_tree")
knn.fit(x_train, y_train)
yhat = knn.predict(x_test)
print("my knn准确率: ",accuracy_score(y_test, yhat))
knn2 = KNeighborsClassifier(n_neighbors=5, algorithm="kd_tree")
knn2.fit(x_train, y_train)
yhat1 = knn2.predict(x_test)
print("sklearn knn准确率: ",accuracy_score(y_test, yhat1))
MDS
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.manifold import MDS
class MyMDS:
def __init__(self, n_components):
self.n_components = n_components
self.explained_variance_ratio_ = []
def fit(self, X):
m = X.shape[0]
self.x_mean = np.mean(X, axis=0)
self.x_std = np.std(X, axis=0)
# data = (data - self.x_mean) / self.x_std
X = X - self.x_mean
D = self.cal_D(X)
di = np.sum(D, axis=0) / m
dj = np.sum(D, axis=1) / m
dij = np.sum(D) / (m**2)
        # inner-product (Gram) matrix B
B = np.zeros_like(D)
for i in range(m):
for j in range(m):
B[i, j] = -1/2 * (D[i, j] - di[i] - dj[j] + dij)
        # eigendecomposition: B is symmetric, so np.linalg.eigh is appropriate;
        # np.linalg.eig can return spurious small imaginary parts because it
        # treats B as a general (possibly non-symmetric) matrix
vals, vecs = np.linalg.eigh(B)
        # sort eigenvalues in descending order
eigindice = np.argsort(-vals)
n_eigindice = eigindice[:self.n_components]
        # diagonal matrix of the top eigenvalues
A = np.diag(vals[n_eigindice])
V = vecs[:, n_eigindice]
        # per-component variance contribution ratio
        for i in range(1, self.n_components+1):
n_eigindice = eigindice[i-1:i]
n_eigvals = vals[n_eigindice]
n_eigvecs = vecs[:, n_eigindice]
self.explained_variance_ratio_.append(np.round(np.sum(n_eigvals) / np.sum(vals), 8))
return np.dot(V, np.sqrt(A))
    def cal_distance(self, x, y):
        # squared Euclidean distance: classical MDS operates on squared
        # distances, so the square root is skipped on purpose
        return sum((x - y) ** 2)
def cal_D(self, X):
m = X.shape[0]
D = np.zeros((m, m))
for i in range(m):
for j in range(i, m):
D[i, j] = self.cal_distance(X[i], X[j])
D[j, i] = D[i, j]
return D
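For reference, the double-centering loop in fit implements the classical MDS identity that recovers the inner-product matrix B from squared pairwise distances (cal_distance above already returns squared distances):

    b_{ij} = -\frac{1}{2}\Big(d_{ij}^2 - \frac{1}{m}\sum_k d_{ik}^2 - \frac{1}{m}\sum_k d_{kj}^2 + \frac{1}{m^2}\sum_{k,l} d_{kl}^2\Big)

or in matrix form B = -\frac{1}{2} J D^{(2)} J with J = I - \frac{1}{m}\mathbf{1}\mathbf{1}^\top. The embedding returned at the end of fit is Z = V \Lambda^{1/2}, i.e. np.dot(V, np.sqrt(A)).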
# load the data
def load_data():
iris = datasets.load_iris()
return iris.data, iris.target
X, Y = load_data()
mds_X1 = MyMDS(n_components=2).fit(X)
mds_X2 = MDS(n_components=2).fit_transform(X)
fig, ax = plt.subplots(1, 2, figsize=(15, 4))
ax[0].scatter(mds_X1[:, 0], mds_X1[:, 1], c=Y, marker='o', alpha=.5)
ax[0].set_title("My MDS")
ax[1].scatter(mds_X2[:, 0], mds_X2[:, 1], c=Y, marker='o', alpha=.5)
ax[1].set_title("Sklearn MDS")
plt.show()
PCA
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import datasets
class MyPCA:
def __init__(self, n_components: int):
self.n_components = n_components
self.explained_variance_ratio_ = []
def fit(self, data:np.ndarray):
        # center the data
self.x_mean = np.mean(data, axis=0)
# self.x_std = np.std(data, axis=0)
# data = (data - self.x_mean) / self.x_std
data = data - self.x_mean
m = data.shape[0]
        sigma = np.dot(data.T, data) / m  # covariance matrix of the centered data
        vals, vecs = np.linalg.eigh(sigma)  # sigma is symmetric, so eigh is safe
        eigindice = np.argsort(-vals)       # descending eigenvalue order
n_eigindice = eigindice[:self.n_components]
self.n_eigvals = vals[n_eigindice]
self.n_eigvecs = vecs[:, n_eigindice]
        # per-component explained variance ratio
for i in range(1, self.n_components+1):
n_eigindice = eigindice[i-1:i]
n_eigvals = vals[n_eigindice]
n_eigvecs = vecs[:, n_eigindice]
self.explained_variance_ratio_.append(round(np.sum(n_eigvals) / np.sum(vals), 8))
return np.dot(data, self.n_eigvecs)
    # map a low-dimensional representation back to the (centered) original space
def recover(self, Z):
return np.dot(Z, self.n_eigvecs.T)
# small smoke test (kept commented out):
# X = np.array([[1, 1], [2, 1], [3, 2], [-1, -1], [-2, -1], [-3, -2]])
# pca = MyPCA(n_components=1)
# pca.fit(X)
# pca.explained_variance_ratio_
# load the iris data
def load_data():
iris = datasets.load_iris()
return iris.data, iris.target
# compare the two implementations
X, Y = load_data()
pca_X1 = MyPCA(n_components=2).fit(X)
pca_X2 = PCA(n_components=2).fit_transform(X)
fig, ax = plt.subplots(1, 2, figsize=(15, 4))
ax[0].scatter(pca_X1[:, 0], pca_X1[:, 1], c=Y, marker='o', alpha=.5)
ax[0].set_title("My PCA")
ax[1].scatter(pca_X2[:, 0], pca_X2[:, 1], c=Y, marker='o', alpha=.5)
ax[1].set_title("Sklearn PCA")
plt.show()
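The recover method is not exercised above; a small round-trip sketch (project to 2-D, map back, and re-add the stored mean) shows how much information the projection keeps:
pca = MyPCA(n_components=2)
Z = pca.fit(X)
X_rec = pca.recover(Z) + pca.x_mean  # undo the centering applied in fit
print("mean squared reconstruction error: ", np.mean((X - X_rec) ** 2))
print("explained variance ratio: ", pca.explained_variance_ratio_)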
KPCA
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
class KPCA():
def __init__(self, n_components, kernel="linear", gamma=None, degree=3, coef=1):
self.n_components = n_components
self.kernel = kernel
self.gamma = gamma
self.degree = degree
self.coef = coef
def fit(self, X):
N, D = X.shape
K = np.zeros((N, N))
        # default gamma = 1 / n_features for the kernels that use it
        # (the original and/or chain bound too loosely and overwrote a
        # user-supplied gamma whenever the kernel was rbf or sigmoid)
        if self.gamma is None and self.kernel in ("poly", "rbf", "sigmoid"):
            self.gamma = 1 / D
for i in range(N):
for j in range(i, N):
if self.kernel == "linear":
K[i, j] = self.linear(X[i], X[j])
elif self.kernel == "poly":
K[i, j] = self.poly(X[i], X[j])
elif self.kernel == "rbf":
K[i, j] = self.rbf(X[i], X[j])
elif self.kernel == "sigmoid":
K[i, j] = self.sigmoid(X[i], X[j])
K[j, i] = K[i, j]
        # center the kernel matrix in feature space
In = np.ones((N, N)) / N
K = K - In @ K - K @ In.T + In @ K @ In.T
        # eigenvalues and eigenvectors (K is symmetric)
vals, vecs = np.linalg.eigh(K)
eigindices = np.argsort(-vals)
n_indices = eigindices[:self.n_components]
        # normalize each eigenvector by sqrt(lambda) so the corresponding
        # feature-space principal component has unit length
        eig_val = vals[n_indices] ** (1/2)
        u = vecs[:, n_indices] / eig_val
return np.dot(K, u)
    # linear kernel
    def linear(self, x1, x2):
        return np.dot(x1, x2)
    # polynomial kernel: (gamma * <x1, x2> + coef)^degree
    def poly(self, x1, x2):
        x = np.dot(x1, x2)
        return (self.gamma * x + self.coef) ** self.degree
    # Gaussian (RBF) kernel
    def rbf(self, x1, x2):
        x = np.dot((x1 - x2), (x1 - x2))
        return np.exp(-self.gamma * x)
    # sigmoid kernel
    def sigmoid(self, x1, x2):
        x = np.dot(x1, x2)
        return np.tanh(self.gamma * x + self.coef)
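For reference, the centering step near the top of fit is the standard feature-space centering of the kernel matrix, written here with (\mathbf{1}_N)_{ij} = 1/N:

    \tilde{K} = K - \mathbf{1}_N K - K \mathbf{1}_N + \mathbf{1}_N K \mathbf{1}_N

It corresponds to subtracting the feature-space mean \frac{1}{N}\sum_i \phi(x_i) from every mapped sample before taking inner products.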
# load the swiss-roll data
def load_data():
swiss_roll = datasets.make_swiss_roll(n_samples=1000, noise=0.05)
return swiss_roll[0], np.floor(swiss_roll[1])
# visualize the embedding under each kernel
X, Y = load_data()
kernels = ["linear", "poly", "rbf", "sigmoid"]
fig = plt.figure(figsize=(15, 8))
for i, kernel in enumerate(kernels):
ax = fig.add_subplot(2, 2, i+1)
kpca = KPCA(n_components=2, kernel=kernel)
KX = kpca.fit(X)
ax.scatter(KX[:, 0], KX[:, 1], c=Y, alpha=.5)
ax.set_title(kernel + " PCA")
fig.tight_layout()
plt.show()
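For the RBF kernel, gamma is the most influential parameter; a minimal sketch with a few illustrative (untuned) values:
fig = plt.figure(figsize=(15, 4))
for i, gamma in enumerate([0.001, 0.01, 0.1]):  # illustrative values only
    ax = fig.add_subplot(1, 3, i+1)
    KX = KPCA(n_components=2, kernel="rbf", gamma=gamma).fit(X)
    ax.scatter(KX[:, 0], KX[:, 1], c=Y, alpha=.5)
    ax.set_title("rbf, gamma=" + str(gamma))
fig.tight_layout()
plt.show()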
Using sklearn's KernelPCA for comparison:
from sklearn.decomposition import KernelPCA
kernels = ["linear", "poly", "rbf", "sigmoid"]
fig = plt.figure(figsize=(15, 8))
for i, kernel in enumerate(kernels):
ax = fig.add_subplot(2, 2, i+1)
kpca = KernelPCA(n_components=2, kernel=kernel)
KX = kpca.fit_transform(X)
ax.scatter(KX[:, 0], KX[:, 1], c=Y, alpha=.5)
ax.set_title(kernel + " PCA")
fig.tight_layout()
plt.show()
Isomap
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import Isomap
from sklearn import datasets
import warnings
warnings.filterwarnings("ignore")
class MyIsomap():
def __init__(self, n_components, n_neighbors, path_method):
self.n_components = n_components
self.n_neighbors = n_neighbors
self.path_method = path_method
def fit(self, X):
        assert self.n_neighbors < len(X), "n_neighbors must be smaller than the number of samples"
D = self.cal_D(X)
m = D.shape[0]
if self.path_method == "floyd":
D = self.floyd(D)
elif self.path_method == "dijkstra":
D = self.dijkstra_impl(D)
        # feed the geodesic distance matrix into classical MDS
        D = np.square(D)
        T1 = np.ones((m, m)) * np.sum(D) / (m ** 2)
        T2 = np.sum(D, axis=0) / m                   # column means
        T3 = (np.sum(D, axis=1) / m).reshape(-1, 1)  # row means, as a column vector
        # inner-product matrix B; without the reshape above, T3 would broadcast
        # along the wrong axis and B would not be symmetric
        B = -(T1 - T2 - T3 + D) / 2
        # eigendecomposition: B is symmetric, so use np.linalg.eigh
        vals, vecs = np.linalg.eigh(B)
        # sort eigenvalues in descending order
        eigindice = np.argsort(-vals)
        n_eigindice = eigindice[:self.n_components]
        # diagonal matrix of the top eigenvalues
        A = np.diag(vals[n_eigindice])
        V = vecs[:, n_eigindice]
        return np.dot(V, np.sqrt(A))
    # Floyd-Warshall all-pairs shortest paths, O(m^3)
    def floyd(self, D):
m = D.shape[0]
for k in range(m):
for i in range(m):
for j in range(m):
if D[i, k] + D[k, j] < D[i, j]:
D[i, j] = D[i, k] + D[k, j]
return D
    # run Dijkstra from every source node
    def dijkstra_impl(self, D):
m = D.shape[0]
for i in range(m):
D[i] = self.dijkstra(D, i)
return D
    # single-source shortest paths
    def dijkstra(self, D, start):
        m = D.shape[0]
        MAX = float("inf")
        visited = [False] * m  # visited flags
        dist = [MAX] * m
        dist[start] = 0
        while visited.count(False):  # stop once every node has been visited
            min_val = float("inf")
            min_val_id = -1
            for index in range(m):
                if not visited[index] and dist[index] < min_val:
                    min_val = dist[index]
                    min_val_id = index
            if min_val_id == -1:  # remaining nodes are unreachable; avoid looping forever
                break
            visited[min_val_id] = True
            for index in range(m):
                dist[index] = min(dist[index], dist[min_val_id] + D[min_val_id, index])
        return dist
def cal_D(self, X):
m = X.shape[0]
D = np.zeros((m, m))
for i in range(m):
for j in range(i, m):
D[i, j] = self.cal_distance(X[i], X[j])
D[j, i] = D[i, j]
        # keep only each sample's k nearest neighbors; every other
        # distance is set to infinity
        MAX = float("inf")
        D1 = np.ones((m, m)) * MAX
        Dk = np.argsort(D, axis=1)  # sort each row
        for i in range(D.shape[0]):
            # take k+1 columns: the closest "neighbor" of every point is the
            # point itself, so one extra slot is needed
            D1[i, Dk[i, :self.n_neighbors+1]] = D[i, Dk[i, :self.n_neighbors+1]]
return D1
    # Euclidean distance (kept unsquared here; fit squares the geodesic matrix later)
    def cal_distance(self, x1, x2):
return np.sqrt(sum((x1 - x2) ** 2))
X, y = datasets.make_s_curve(n_samples=500, random_state=0)
isomap = MyIsomap(n_components=2, n_neighbors=10, path_method="floyd")
isomap_X1 = isomap.fit(X)
isomap = Isomap(n_components=2, n_neighbors=10, path_method="FW")
isomap_X2 = isomap.fit_transform(X)
fig, ax = plt.subplots(1, 2, figsize=(15, 4))
ax[0].scatter(isomap_X1[:, 0], isomap_X1[:, 1], c=y)
ax[0].set_title("My Isomap")
ax[1].scatter(isomap_X2[:, 0], isomap_X2[:, 1], c=y)
ax[1].set_title("Sklearn Isomap")
plt.show()
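Both shortest-path routines should yield the same geodesic distance matrix. A quick consistency sketch, reusing X from above and assuming the 10-NN graph is connected; each routine gets its own copy, since both modify D in place:
iso = MyIsomap(n_components=2, n_neighbors=10, path_method="floyd")
D_knn = iso.cal_D(X)
D_floyd = iso.floyd(D_knn.copy())
D_dij = iso.dijkstra_impl(D_knn.copy())
print("max abs difference: ", np.max(np.abs(D_floyd - D_dij)))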
LLE
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import LocallyLinearEmbedding
from sklearn import datasets
class LLE():
def __init__(self, n_components, n_neighbors):
self.n_components = n_components
self.n_neighbors = n_neighbors
def fit(self, X):
m, n = X.shape
assert self.n_neighbors < m, f"Expected n_neighbors <= n_samples, but n_samples = {m}, n_neighbors = {self.n_neighbors}"
        # when n_neighbors exceeds the number of features, the local Gram
        # matrix Si is singular, so a small regularizer is needed
        if self.n_neighbors > n:
            tol = 1e-3
        else:
            tol = 0
        D = self.cal_D(X)
        N = self.x_neighbor(D)
W = np.zeros((m, m))
I = np.ones((self.n_neighbors, 1))
for i in range(m):
Xi, Xi_k = X[i].reshape(1,-1), X[N[i]]
            # local Gram matrix Si
            Si = (I @ Xi - Xi_k) @ (I @ Xi - Xi_k).T
            # regularize the diagonal so Si is invertible
            Si = Si + np.eye(self.n_neighbors) * tol * np.trace(Si)
            # closed-form reconstruction weights wi
            wi = (I.T @ np.linalg.pinv(Si)) / (I.T @ np.linalg.pinv(Si) @ I)
            # neighbors receive their weights; all other entries stay 0
            W[i, N[i]] = wi
Im = np.eye(m)
M = (Im - W).T @ (Im - W)
        val, vecs = np.linalg.eigh(M)  # M is symmetric
        # ascending order; skip the first eigenvalue, which is (numerically) zero
        index = np.argsort(val)[1:self.n_components+1]
Y = vecs[:, index]
return Y
    # find the k nearest neighbors of each sample (column 0 is the point itself)
    def x_neighbor(self, D):
m = D.shape[0]
N = np.argsort(D, axis=1)[:, 1:self.n_neighbors + 1]
return N.astype(int)
    # pairwise Euclidean distance matrix
def cal_D(self, X):
m = X.shape[0]
D = np.zeros((m, m))
for i in range(m):
for j in range(i, m):
D[i, j] = self.cal_distance(X[i], X[j])
D[j, i] = D[i, j]
return D
def cal_distance(self, x1, x2):
return np.sqrt(sum((x1 - x2) ** 2))
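For reference, the weights computed in fit are the closed-form solution of the constrained local reconstruction problem:

    \min_{w_i} \Big\| x_i - \sum_{j \in N(i)} w_{ij} x_j \Big\|^2 \quad \text{s.t.} \quad \sum_j w_{ij} = 1

whose solution is w_i^\top = \frac{\mathbf{1}^\top S_i^{-1}}{\mathbf{1}^\top S_i^{-1} \mathbf{1}} with S_i = (\mathbf{1}x_i^\top - X_{N(i)})(\mathbf{1}x_i^\top - X_{N(i)})^\top, exactly the wi expression in the loop above.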
X, y = datasets.make_s_curve(n_samples=500, random_state=0)
lle1 = LLE(n_components=2, n_neighbors=15)
lle_X1 = lle1.fit(X)
lle2 = LocallyLinearEmbedding(n_components=2, n_neighbors=15)
lle_X2 = lle2.fit_transform(X)
fig, ax = plt.subplots(1, 2, figsize=(15, 4))
ax[0].scatter(lle_X1[:, 0], lle_X1[:, 1], c=y)
ax[0].set_title("My LLE")
ax[1].scatter(lle_X2[:, 0], lle_X2[:, 1], c=y)
ax[1].set_title("Sklearn LLE")
plt.show()