Table of Contents
- 1. Basic Visualization Tool Configuration
- 2. Data Distribution Visualization
- 3. Training Process Visualization
- 4. Model Result Interpretation
- 5. Feature Importance Analysis
- 6. Decision Boundary Visualization
- 7. Clustering Result Visualization
- 8. Neural Network Visualization
- 9. Time Series Forecast Visualization
- 10. Advanced Interactive Visualization
1. Basic Visualization Tool Configuration
1.1 Environment Setup and Style Configuration

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set global styles
plt.style.use('seaborn-v0_8-whitegrid')  # white grid background
sns.set_palette("husl")                  # color palette
plt.rcParams['font.family'] = 'SimHei'   # CJK font; only needed if labels contain Chinese
plt.rcParams['axes.unicode_minus'] = False  # fix minus-sign rendering with CJK fonts

# Create sample data
np.random.seed(42)
X = np.random.randn(100, 3)
y = X[:, 0] + 2*X[:, 1] - 1.5*X[:, 2] + np.random.randn(100)*0.5
2. Data Distribution Visualization
2.1 Feature Distribution Analysis

def plot_feature_distributions(X, feature_names):
    """Plot the distribution of each feature side by side."""
    fig, axes = plt.subplots(1, len(feature_names), figsize=(15, 4))
    for i, (ax, name) in enumerate(zip(axes, feature_names)):
        # Histogram with a kernel-density curve
        sns.histplot(X[:, i], kde=True, ax=ax, bins=15)
        ax.set_title(f'Distribution of {name}')
        ax.set_xlabel('Value')
        ax.set_ylabel('Count')
    plt.tight_layout()
    plt.show()

# Usage example
plot_feature_distributions(X, ['Feature 1', 'Feature 2', 'Feature 3'])
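Distribution plots show each feature in isolation; a correlation heatmap is a common companion view for spotting linear relationships between features and the target. A minimal sketch reusing the `X` and `y` defined above (the helper name and column labels are illustrative, not from the original):

import pandas as pd

def plot_correlation_heatmap(X, y, feature_names):
    """Heatmap of pairwise Pearson correlations, target included."""
    df = pd.DataFrame(X, columns=feature_names)
    df['target'] = y
    plt.figure(figsize=(6, 5))
    sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Feature/target correlation matrix')
    plt.tight_layout()
    plt.show()

plot_correlation_heatmap(X, y, ['Feature 1', 'Feature 2', 'Feature 3'])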
2.2 Target Variable Analysis

from scipy import stats  # needed for the Q-Q plot (missing in the original)

def plot_target_analysis(y):
    """Combined analysis of the target variable."""
    fig = plt.figure(figsize=(14, 4))
    # Distribution
    ax1 = fig.add_subplot(131)
    sns.histplot(y, kde=True, ax=ax1)
    ax1.set_title('Target distribution')
    # Box plot
    ax2 = fig.add_subplot(132)
    sns.boxplot(y=y, ax=ax2)
    ax2.set_title('Target box plot')
    # Q-Q plot
    ax3 = fig.add_subplot(133)
    stats.probplot(y, dist="norm", plot=ax3)
    ax3.set_title('Normality check (Q-Q plot)')
    plt.tight_layout()
    plt.show()

plot_target_analysis(y)
3. Training Process Visualization
3.1 Learning Curves

from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y):
    """Plot a learning curve with +/- one standard deviation bands."""
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=5, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10))
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.figure(figsize=(10, 6))
    plt.fill_between(train_sizes, train_mean - train_std,
                     train_mean + train_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_mean - test_std,
                     test_mean + test_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.title('Learning curve')
    plt.xlabel('Number of training samples')
    plt.ylabel('Score (R²)')
    plt.legend(loc="best")
    plt.grid(True)
    plt.show()

# Usage example
from sklearn.linear_model import LinearRegression
plot_learning_curve(LinearRegression(), X, y)
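A learning curve varies the amount of training data; the closely related validation curve varies a single hyperparameter instead, which helps diagnose over- and under-fitting. A minimal sketch using scikit-learn's `validation_curve` (the Ridge model and the `alpha` range are illustrative choices, not from the original):

from sklearn.model_selection import validation_curve
from sklearn.linear_model import Ridge

param_range = np.logspace(-3, 3, 7)
train_scores, test_scores = validation_curve(
    Ridge(), X, y, param_name="alpha", param_range=param_range, cv=5)

plt.figure(figsize=(10, 6))
plt.semilogx(param_range, train_scores.mean(axis=1), 'o-', color='r',
             label='Training score')
plt.semilogx(param_range, test_scores.mean(axis=1), 'o-', color='g',
             label='Cross-validation score')
plt.title('Validation curve (Ridge)')
plt.xlabel('alpha')
plt.ylabel('Score (R²)')
plt.legend(loc='best')
plt.grid(True)
plt.show()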
3.2 Training Loss Visualization

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

def plot_loss_history(X, y):
    """Train a small network and visualize the loss per epoch."""
    # Build the model inside the function; the original took a `model`
    # argument and immediately overwrote it
    model = Sequential([
        Input(shape=(X.shape[1],)),
        Dense(10, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    # fit() already returns a History object; no explicit callback needed
    history = model.fit(X, y, epochs=100, verbose=0)
    plt.figure(figsize=(10, 5))
    plt.plot(history.history['loss'], label='Training loss')
    plt.title('Training loss over epochs')
    plt.xlabel('Epoch')
    plt.ylabel('MSE')
    plt.legend()
    plt.grid(True)
    plt.show()

plot_loss_history(X, y)
4. Model Result Interpretation
4.1 Regression Model Visualization

def plot_regression_results(y_true, y_pred):
    """Visualize regression results: predictions and residuals."""
    plt.figure(figsize=(12, 5))
    # True vs. predicted values
    plt.subplot(121)
    max_val = max(y_true.max(), y_pred.max())
    min_val = min(y_true.min(), y_pred.min())
    plt.scatter(y_true, y_pred, alpha=0.5)
    plt.plot([min_val, max_val], [min_val, max_val], '--r')  # ideal-fit line
    plt.xlabel('True value')
    plt.ylabel('Predicted value')
    plt.title('True vs. predicted values')
    # Residual plot
    plt.subplot(122)
    residuals = y_true - y_pred
    plt.scatter(y_pred, residuals, alpha=0.5)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel('Predicted value')
    plt.ylabel('Residual')
    plt.title('Residual analysis')
    plt.tight_layout()
    plt.show()

# Usage example
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)
plot_regression_results(y, y_pred)
4.2 Classification Model Visualization

from sklearn.metrics import confusion_matrix, roc_curve, auc

def plot_classification_results(y_true, y_pred, y_score):
    """Visualize classification results: confusion matrix, ROC curve, score distributions."""
    plt.figure(figsize=(15, 5))
    # Confusion matrix
    plt.subplot(131)
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion matrix')
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    # ROC curve
    plt.subplot(132)
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, color='darkorange', lw=2,
             label='ROC curve (AUC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc="lower right")
    # Predicted-probability distributions per class
    plt.subplot(133)
    # `fill` replaces the `shade` argument deprecated in recent seaborn
    sns.kdeplot(y_score[y_true == 0], label='Negative class', fill=True)
    sns.kdeplot(y_score[y_true == 1], label='Positive class', fill=True)
    plt.title('Class probability distributions')
    plt.xlabel('Predicted probability')
    plt.ylabel('Density')
    plt.legend()
    plt.tight_layout()
    plt.show()

# Usage example (binary classification)
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
X_clf, y_clf = make_classification(n_samples=1000, n_features=4, random_state=42)
clf = RandomForestClassifier().fit(X_clf, y_clf)
y_pred = clf.predict(X_clf)
y_score = clf.predict_proba(X_clf)[:, 1]
plot_classification_results(y_clf, y_pred, y_score)
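On imbalanced datasets the ROC curve can look deceptively good; a precision-recall curve is a common complement. A minimal sketch reusing `y_clf` and `y_score` from above:

from sklearn.metrics import precision_recall_curve, average_precision_score

precision, recall, _ = precision_recall_curve(y_clf, y_score)
ap = average_precision_score(y_clf, y_score)

plt.figure(figsize=(6, 5))
plt.plot(recall, precision, color='darkorange', lw=2,
         label='PR curve (AP = %0.2f)' % ap)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curve')
plt.legend(loc='lower left')
plt.grid(True)
plt.show()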
5. Feature Importance Analysis
5.1 Tree-Model Feature Importance

def plot_feature_importance(model, feature_names):
    """Bar chart of feature importances, sorted in descending order."""
    importance = model.feature_importances_
    indices = np.argsort(importance)[::-1]
    plt.figure(figsize=(10, 6))
    plt.title("Feature importance")
    plt.bar(range(len(importance)), importance[indices],
            color="lightblue", align="center")
    plt.xticks(range(len(importance)),
               [feature_names[i] for i in indices], rotation=45)
    plt.xlim([-1, len(importance)])
    plt.tight_layout()
    plt.show()

# Usage example
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor().fit(X, y)
plot_feature_importance(model, ['Feature 1', 'Feature 2', 'Feature 3'])
5.2 SHAP Value Visualization

import shap

def plot_shap_summary(model, X, feature_names):
    """Visualize per-feature SHAP contributions."""
    explainer = shap.Explainer(model)
    shap_values = explainer(X)
    # Mean absolute SHAP value per feature
    plt.figure(figsize=(10, 6))
    shap.summary_plot(shap_values, X, feature_names=feature_names,
                      plot_type="bar", show=False)
    plt.title("SHAP feature importance (mean |SHAP value|)")
    plt.tight_layout()
    plt.show()
    # Full distribution of SHAP values per feature
    plt.figure(figsize=(10, 6))
    shap.summary_plot(shap_values, X, feature_names=feature_names,
                      show=False)
    plt.title("SHAP value distribution")
    plt.tight_layout()
    plt.show()

# Usage example (reuses the random forest fitted above)
plot_shap_summary(model, X, ['Feature 1', 'Feature 2', 'Feature 3'])
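Beyond summary plots, a dependence plot shows how a single feature's value drives its SHAP contribution, with the strongest interacting feature shown through color. A minimal sketch, assuming the same explainer setup as above and a reasonably recent shap version:

# Recompute the Explanation object (plot_shap_summary keeps it local)
explainer = shap.Explainer(model)
shap_values = explainer(X)
# Value of the first feature vs. its SHAP contribution
shap.plots.scatter(shap_values[:, 0], color=shap_values)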
6. Decision Boundary Visualization
6.1 Two-Dimensional Decision Boundaries

from sklearn.datasets import make_classification
from mlxtend.plotting import plot_decision_regions

def plot_2d_decision_boundary(model, X, y):
    """Decision boundary in a two-dimensional feature space."""
    plt.figure(figsize=(10, 6))
    plot_decision_regions(X, y, clf=model, legend=2)
    plt.title("Decision boundary")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.show()

# Usage example (two features)
X_2d, y_2d = make_classification(n_samples=200, n_features=2,
                                 n_redundant=0, random_state=42)
clf_2d = RandomForestClassifier().fit(X_2d, y_2d)
plot_2d_decision_boundary(clf_2d, X_2d, y_2d)
6.2 Decision Boundaries After High-Dimensional Projection

from sklearn.decomposition import PCA

def plot_pca_decision_boundary(X, y):
    """Project to two principal components, then fit and plot a boundary."""
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    # A fresh classifier is fit in the projected space; the boundary of the
    # original high-dimensional model cannot be drawn directly. (The original
    # took an unused `model` argument, dropped here.)
    model_pca = RandomForestClassifier().fit(X_pca, y)
    plt.figure(figsize=(12, 5))
    plt.subplot(121)
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='coolwarm', alpha=0.6)
    plt.title("PCA projection")
    plt.xlabel("Principal component 1")
    plt.ylabel("Principal component 2")
    plt.subplot(122)
    plot_decision_regions(X_pca, y, clf=model_pca, legend=2)
    plt.title("Decision boundary in the projected space")
    plt.xlabel("Principal component 1")
    plt.tight_layout()
    plt.show()

# Usage example
plot_pca_decision_boundary(X_clf, y_clf)
7. Clustering Result Visualization
7.1 Two-Dimensional Clustering

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples  # missing in the original

def plot_cluster_results(X, n_clusters=3):
    """Visualize k-means assignments and per-sample silhouette scores."""
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(X)
    centers = kmeans.cluster_centers_
    plt.figure(figsize=(12, 5))
    # Cluster assignments and centers
    plt.subplot(121)
    plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', alpha=0.5)
    plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.8)
    plt.title("Clustering result")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    # Silhouette plot
    plt.subplot(122)
    silhouette_vals = silhouette_samples(X, clusters)
    y_lower = 10
    for i in range(n_clusters):
        ith_cluster_silhouette_vals = silhouette_vals[clusters == i]
        ith_cluster_silhouette_vals.sort()
        size_cluster_i = ith_cluster_silhouette_vals.shape[0]
        y_upper = y_lower + size_cluster_i
        plt.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_vals,
                          alpha=0.7)
        plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10
    plt.title("Silhouette analysis")
    plt.xlabel("Silhouette coefficient")
    plt.ylabel("Cluster label")
    plt.tight_layout()
    plt.show()

# Usage example
X_blobs, _ = make_blobs(n_samples=300, centers=3, random_state=42)
plot_cluster_results(X_blobs)
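The silhouette plot evaluates a fixed k; in practice you usually need to choose k first. A minimal elbow-method sketch that plots k-means inertia over a range of candidate cluster counts (the 2-9 range is an illustrative choice):

def plot_elbow_curve(X, k_range=range(2, 10)):
    """Plot k-means inertia for each candidate number of clusters."""
    inertias = [KMeans(n_clusters=k, n_init=10, random_state=42).fit(X).inertia_
                for k in k_range]
    plt.figure(figsize=(8, 5))
    plt.plot(list(k_range), inertias, 'o-')
    plt.title('Elbow method for choosing k')
    plt.xlabel('Number of clusters k')
    plt.ylabel('Inertia (within-cluster sum of squares)')
    plt.grid(True)
    plt.show()

plot_elbow_curve(X_blobs)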
8. Neural Network Visualization
8.1 Model Architecture Visualization

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import plot_model

def visualize_keras_model():
    """Render a Keras model's architecture (requires pydot and graphviz)."""
    model = Sequential([
        Input(shape=(20,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    plot_model(model, to_file='model.png', show_shapes=True,
               show_layer_names=True, rankdir='LR')
    # Display directly in a notebook
    from IPython.display import Image
    return Image(filename='model.png')

visualize_keras_model()
8.2 Activation Heatmap Visualization

import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing import image

def visualize_activation_heatmap(img_path):
    """Grad-CAM style activation heatmap for a CNN.

    Rewritten with tf.GradientTape: the original used K.gradients,
    which only works in TensorFlow 1.x graph mode.
    """
    model = VGG16(weights='imagenet')
    # Preprocess the image
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    # Model mapping the input to the last conv layer's output and the predictions
    last_conv_layer = model.get_layer('block5_conv3')
    grad_model = tf.keras.Model(model.input,
                                [last_conv_layer.output, model.output])
    # Gradient of the top-class score w.r.t. the conv feature maps
    with tf.GradientTape() as tape:
        conv_output, preds = grad_model(x)
        class_idx = int(tf.argmax(preds[0]))
        class_score = preds[:, class_idx]
    grads = tape.gradient(class_score, conv_output)
    pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))
    # Weight each channel by its pooled gradient, then average into a heatmap
    heatmap = tf.reduce_sum(conv_output[0] * pooled_grads, axis=-1).numpy()
    heatmap = np.maximum(heatmap, 0)
    heatmap /= np.max(heatmap)
    # Visualize
    plt.figure(figsize=(10, 5))
    plt.subplot(121)
    plt.imshow(img)
    plt.title('Original image')
    plt.subplot(122)
    plt.imshow(heatmap, cmap='jet')
    plt.title('Activation heatmap')
    plt.tight_layout()
    plt.show()
    return heatmap

# Usage example (requires an actual image file)
# visualize_activation_heatmap('elephant.jpg')
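In practice the heatmap is usually resized and blended over the original photo. A minimal overlay sketch using OpenCV, assuming `visualize_activation_heatmap` returns the heatmap as defined above (the `cv2` dependency, file name, and 0.4 blend weight are assumptions):

import cv2

heatmap = visualize_activation_heatmap('elephant.jpg')  # hypothetical image file
img = cv2.imread('elephant.jpg')
# Resize the heatmap to the image and map it to a color scale
heatmap_resized = cv2.resize(heatmap, (img.shape[1], img.shape[0]))
heatmap_color = cv2.applyColorMap(np.uint8(255 * heatmap_resized),
                                  cv2.COLORMAP_JET)
# Blend heatmap and photo
overlay = cv2.addWeighted(img, 0.6, heatmap_color, 0.4, 0)
plt.imshow(cv2.cvtColor(overlay, cv2.COLOR_BGR2RGB))
plt.title('Grad-CAM overlay')
plt.axis('off')
plt.show()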
9. Time Series Forecast Visualization
9.1 Forecast vs. Actual Comparison

def plot_time_series_forecast(y_true, y_pred, title="Time series forecast"):
    """Visualize a time series forecast against the actual values."""
    plt.figure(figsize=(12, 6))
    plt.plot(y_true, label='Actual', color='blue', alpha=0.6)
    plt.plot(y_pred, label='Forecast', color='red', linestyle='--', alpha=0.8)
    # Illustrative uncertainty band (0.2 standard deviations,
    # not a proper confidence interval)
    plt.fill_between(range(len(y_true)),
                     y_pred - 0.2*np.std(y_true),
                     y_pred + 0.2*np.std(y_true),
                     color='pink', alpha=0.3)
    plt.title(title)
    plt.xlabel('Time step')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    plt.show()

# Usage example
np.random.seed(42)
time_series = np.sin(np.linspace(0, 10, 100)) + np.random.randn(100)*0.2
prediction = np.sin(np.linspace(0, 10, 100)+0.1) + np.random.randn(100)*0.1
plot_time_series_forecast(time_series, prediction)
10. Advanced Interactive Visualization
10.1 Interactive Visualization with Plotly

import plotly.express as px
import pandas as pd

def interactive_feature_analysis(X, y, feature_names):
    """Interactive feature analysis with Plotly Express."""
    df = pd.DataFrame(X, columns=feature_names)
    df['target'] = y
    # Interactive scatter-plot matrix
    fig = px.scatter_matrix(df, dimensions=feature_names,
                            color='target', opacity=0.7,
                            title="Interactive scatter-plot matrix")
    fig.update_traces(diagonal_visible=False)
    fig.show()
    # Parallel-coordinates plot
    fig2 = px.parallel_coordinates(df, color='target',
                                   labels={col: col for col in df.columns},
                                   title="Parallel coordinates")
    fig2.show()

# Usage example
interactive_feature_analysis(X, y, ['Feature 1', 'Feature 2', 'Feature 3'])
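Plotly figures can also be saved as standalone HTML files, which keep their interactivity without a running notebook. A minimal sketch (the file name is illustrative; static PNG export via `fig.write_image` additionally requires the `kaleido` package):

fig = px.scatter(x=X[:, 0], y=y,
                 labels={'x': 'Feature 1', 'y': 'target'},
                 title="Feature 1 vs. target")
fig.write_html("feature_analysis.html")  # self-contained, shareable HTML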
With the visualization techniques above you can understand a model's behavior end to end, diagnose problems, and explain its decisions effectively. Each method targets a specific interpretation need; choose and combine them according to your scenario.