# 当我们意识到需要降维时,一般是发现了特征间的高度线性相关,而t-SNE主打的是非线性降维。
# 如果我们发现了线性相关,可能用PCA处理就可以了。即使发现了“非线性相关性”,
# 我们也不会尝试用t-SNE降维再搭配一个线性分类模型,而会直接选择非线性的分类模型去处理。
# 复杂的非线性关系不适合强行降维再分类,而应该用非线性模型直接处理。
# —— t-sne数据可视化算法的作用是啥?- 微调的回答 - 知乎
# Refer:
# https://leovan.me/cn/2018/03/manifold-learning/
# https://github.com/lmcinnes/umap
#
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import manifold, datasets, metrics
from sklearn.utils import check_random_state
import umap
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['SimHei']
rcParams['font.size'] = 8
rcParams['lines.markersize'] = 2
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.ticker import NullFormatter
from mpl_toolkits.mplot3d import Axes3D
from time import time
def genPoints(n_points=1000, func_name='swiss-roll'):
    """Generate a sample point cloud together with one colour value per point.

    Args:
        n_points: Number of samples to draw. For ``'severed-sphere'`` this is
            the number drawn *before* points near the poles are discarded, so
            fewer points are returned.
        func_name: Shape to generate: ``'swiss-roll'``, ``'s-curve'`` or
            ``'severed-sphere'``.

    Returns:
        A ``(points, colors)`` tuple. For the two scikit-learn shapes these
        are the values returned by ``make_swiss_roll`` / ``make_s_curve``;
        for the severed sphere ``points`` has three columns and ``colors`` is
        the azimuth angle of each kept point.

    Raises:
        ValueError: If ``func_name`` is not one of the supported shapes.
    """
    if func_name == 'swiss-roll':
        points, colors = datasets.make_swiss_roll(n_points, random_state=0)
        return points, colors

    if func_name == 's-curve':
        points, colors = datasets.make_s_curve(n_points, random_state=0)
        return points, colors

    if func_name != 'severed-sphere':
        raise ValueError('Unsupported function [%s]' % func_name)

    rng = check_random_state(0)
    # Draw order matters for reproducibility: azimuth first, then polar angle.
    azimuth = rng.rand(n_points) * (2 * np.pi - 0.55)
    polar = rng.rand(n_points) * np.pi

    # Drop the samples that fall within pi/8 of either pole.
    keep = ((polar < (np.pi - (np.pi / 8))) & (polar > ((np.pi / 8))))
    kept_azimuth = azimuth[keep]
    kept_polar = polar[keep]

    # Spherical -> Cartesian coordinates on the unit sphere.
    points = np.c_[np.sin(kept_polar) * np.cos(kept_azimuth),
                   np.sin(kept_polar) * np.sin(kept_azimuth),
                   np.cos(kept_polar)]
    return points, kept_azimuth
# ----------------------
# manifold transforms
# ----------------------
def get_manifold(points, method='lle',
                 n_neighbors=15, n_components=2,
                 max_iter=100, n_init=1,
                 init='pca', random_state=0):
    """Embed ``points`` into a low-dimensional space with the chosen method.

    All estimator arguments are passed by keyword: ``n_neighbors`` and
    ``n_components`` are keyword-only on ``LocallyLinearEmbedding`` and
    ``Isomap`` in current scikit-learn, so positional use raises ``TypeError``.

    Args:
        points: Input samples, handed unchanged to the estimator's
            ``fit_transform``.
        method: One of ``'lle'``, ``'ltsa'``, ``'hessian-lle'``,
            ``'modified-lle'``, ``'isomap'``, ``'mds'``, ``'le'``, ``'tsne'``
            or ``'umap'``.
        n_neighbors: Neighbourhood size (LLE variants, Isomap, Laplacian
            Eigenmaps and UMAP).
        n_components: Dimension of the embedding (all methods).
        max_iter: Iteration limit, used by MDS only.
        n_init: Number of initialisations, used by MDS only.
        init: Initialisation scheme, used by t-SNE only.
        random_state: Seed passed to every estimator except Isomap.

    Returns:
        The value returned by the estimator's ``fit_transform(points)``.

    Raises:
        ValueError: If ``method`` is not supported.
    """
    print('Fitting with {method}'.format(method=method))

    # Public method name -> LocallyLinearEmbedding ``method`` argument.
    lle_variants = {
        'lle': 'standard',
        'ltsa': 'ltsa',
        'hessian-lle': 'hessian',
        'modified-lle': 'modified',
    }

    if method in lle_variants:
        model = manifold.LocallyLinearEmbedding(
            n_neighbors=n_neighbors,
            n_components=n_components,
            eigen_solver='dense',
            method=lle_variants[method],
            random_state=random_state)
    elif method == 'isomap':
        model = manifold.Isomap(n_neighbors=n_neighbors,
                                n_components=n_components)
    elif method == 'mds':
        model = manifold.MDS(n_components=n_components,
                             max_iter=max_iter,
                             n_init=n_init,
                             random_state=random_state)
    elif method == 'le':
        model = manifold.SpectralEmbedding(n_components=n_components,
                                           n_neighbors=n_neighbors,
                                           random_state=random_state)
    elif method == 'tsne':
        model = manifold.TSNE(n_components=n_components,
                              init=init,
                              random_state=random_state)
    elif method == 'umap':
        # n_components is forwarded so UMAP honours the requested dimension
        # like every other method (it was previously ignored here).
        model = umap.UMAP(n_neighbors=n_neighbors,
                          n_components=n_components,
                          min_dist=0.2,
                          random_state=random_state)
    else:
        raise ValueError('Unsupported method [%s] ' % method)

    return model.fit_transform(points)
# -------- 手写数字dataset ------
def plot_mnist_manifold_dim_reduction(save_path, cmap=None):
    """Embed the scikit-learn digits dataset with several methods and plot them.

    Each method in the list below is run through ``get_manifold`` with its
    default settings, timed, and drawn as a 2-D scatter coloured by digit
    label.

    Args:
        save_path: Where to write the finished figure. A falsy value
            (``None`` or ``''``) skips saving.
        cmap: Colormap forwarded to ``Axes.scatter``.
    """
    methods = ['mds', 'isomap', 'lle', 'hessian-lle', 'modified-lle', 'ltsa', 'le', 'tsne']
    labels = ['MDS', 'Isomap', 'LLE', 'Hessian LLE', 'Modified LLE', 'LTSA', 'Laplacian Eigenmaps', 't-SNE']

    digits = datasets.load_digits(n_class=10)
    points = digits.data
    colors = digits.target

    fig = plt.figure(figsize=(6, 6))
    for i, (method, label) in enumerate(zip(methods, labels)):
        t_start = time()
        m_points = get_manifold(points, method=method)
        t_end = time()

        # Panels start at slot 2 of the 3x3 grid; the top-left cell stays empty.
        ax = fig.add_subplot(3, 3, 2 + i)
        ax.scatter(m_points[:, 0], m_points[:, 1], c=colors, cmap=cmap)
        ax.set_title("%s\n(in %.2g sec.)" % (label, t_end - t_start))
        # Embedding coordinates have no meaningful units, so hide tick labels.
        ax.xaxis.set_major_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(NullFormatter())

    fig.tight_layout()
    # save_path was previously accepted but ignored; honour it when provided.
    if save_path:
        fig.savefig(save_path)
    # plt.show() runs the GUI event loop; Figure.show() alone does not, so in
    # a script the window could disappear immediately.
    plt.show()
## --------
def plot_km_manifold(df_data, save_path, cmap=None):
    """Cluster the data with K-Means, then plot it under several embeddings.

    K-Means (3 clusters) is fitted once on the raw feature matrix; its labels
    colour the points in every panel and its silhouette score is repeated in
    every panel title. Each embedding is computed by ``get_manifold`` with
    default settings and timed individually.

    Args:
        df_data: Object exposing the feature matrix as ``.values``
            (presumably a pandas DataFrame; confirm against the caller).
        save_path: File path the finished figure is written to.
        cmap: Colormap forwarded to ``plt.scatter``.
    """
    panels = [
        ('isomap', 'Isomap'),
        ('lle', 'LLE'),
        ('hessian-lle', 'Hessian LLE'),
        ('modified-lle', 'Modified LLE'),
        ('ltsa', 'LTSA'),
        ('le', 'Laplacian Eigenmaps'),
        ('tsne', 't-SNE'),
        ('umap', 'U MAP'),
    ]

    features = df_data.values

    # Use the K-Means cluster labels as the colour of each point.
    clusterer = KMeans(n_clusters=3, max_iter=200, random_state=7).fit(features)
    cluster_ids = clusterer.labels_
    silhouette = metrics.silhouette_score(features, cluster_ids, metric='euclidean')

    figure = plt.figure(figsize=(8, 8))
    for slot, (method_key, title) in enumerate(panels, start=1):
        started = time()
        embedded = get_manifold(features, method=method_key)
        elapsed = time() - started

        axes = figure.add_subplot(3, 3, slot)
        axes.set_facecolor("navy")
        # add_subplot made `axes` the current axes, so pyplot draws onto it.
        plt.scatter(embedded[:, 0], embedded[:, 1], s=2, c=cluster_ids, cmap=cmap)
        plt.title("%s\n(score=%.3g,in %.2g sec.)" % (title, silhouette, elapsed))
        for axis in (axes.xaxis, axes.yaxis):
            axis.set_major_formatter(NullFormatter())

    figure.tight_layout()
    figure.savefig(save_path)
    plt.show()
if __name__ == '__main__':
    # NOTE(review): `load_your_data` and `file_name` are not defined anywhere
    # in the visible file; they look like placeholders to be replaced with a
    # real loader and input path before this script can run.
    output_png = './play_manifold.png'
    features = load_your_data(file_name)
    plot_km_manifold(features, output_png, cmap=plt.cm.Set1)