Preface
Purpose
Model comparison:
A Taylor diagram gives an intuitive comparison of model performance: it summarizes each model by its standard deviation and its correlation with the observations, and overlays RMSE contours.
Standard deviation ratio: the radial distance from the origin to a point is the ratio of the prediction's standard deviation to the observation's standard deviation.
Correlation angle: the angle measured from the x-axis (0 radians) to a point encodes the correlation between observations and predictions.
Red dashed line: a reference arc at a normalized standard deviation of 1, serving as a quick visual benchmark for how closely a model reproduces the observed variability.
The RMSE contours show the centered root-mean-square error between predictions and observations and are usually drawn as dashed lines (the relation behind them is given below); points with a standard deviation ratio near 1 and a small angle, i.e. high correlation, lie close to the reference point and indicate better models.
The final Taylor diagram compares each model against the observed data, giving an intuitive picture of their statistical characteristics.
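The geometry behind the diagram is the law of cosines linking the centered RMS difference E' to the two standard deviations (σ_f for the prediction, σ_r for the observation) and the correlation R:

E'^2 = σ_f^2 + σ_r^2 - 2 σ_f σ_r R

After normalizing both standard deviations by σ_r, as the code below does, this reduces to E'^2 = 1 + r^2 - 2 r cos θ, which is exactly the expression used to draw the dashed contours.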
Code implementation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from mpl_toolkits.axisartist import floating_axes
from mpl_toolkits.axisartist.grid_finder import FixedLocator, DictFormatter
from matplotlib.projections import PolarAxes
# Set global plot parameters
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['axes.unicode_minus'] = False
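# Note: 'Times New Roman' must be available to Matplotlib; if the font is missing,
# Matplotlib falls back to its default font and prints a findfont warning.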
# 1. Data Reading and Splitting
df = pd.read_excel('机器学习AI.xlsx')
# Split features and target variable
X = df.drop(['待预测变量Y'], axis=1)
y = df['待预测变量Y']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
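# Note: the Excel file name and the target column '待预测变量Y' are placeholders;
# point them at your own dataset before running.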
# Function to evaluate and collect metrics
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Calculate evaluation metrics
    metrics = {
        'Standard Deviation (Pred)': np.std(y_pred),
        'Standard Deviation (Observed)': np.std(y_test),
        'Correlation': np.corrcoef(y_test, y_pred)[0, 1],
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R2 Score': r2_score(y_test, y_pred)
    }
    return metrics
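# Note: when a GridSearchCV object is passed as `model`, .fit() runs the
# cross-validated search and .predict() delegates to the refit best estimator,
# so the reported metrics describe the tuned model.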
# 2. XGBoost Model Training
params_xgb = {
'learning_rate': 0.02,
'booster': 'gbtree',
'objective': 'reg:squarederror',
'max_leaves': 127,
'verbosity': 1,
'seed': 42,
'nthread': -1,
'colsample_bytree': 0.6,
'subsample': 0.7
}
# Grid search for XGBoost
param_grid_xgb = {
'n_estimators': [100, 200],
'max_depth': [3, 4],
'min_child_weight': [1, 2],
}
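# 2 x 2 x 2 = 8 parameter combinations x 5 CV folds = 40 cross-validation fits
# (plus one final refit on the full training set).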
grid_search_xgb = GridSearchCV(
estimator=xgb.XGBRegressor(**params_xgb),
param_grid=param_grid_xgb,
scoring='neg_root_mean_squared_error',
cv=5,
n_jobs=-1,
verbose=1
)
metrics_xgb = evaluate_model(grid_search_xgb, X_train, y_train, X_test, y_test)
metrics_xgb['Model'] = 'XGBoost'
# 3. Random Forest Model Training
rf_regressor = RandomForestRegressor(random_state=42)
param_grid_rf = {
'n_estimators': [100, 200],
'max_depth': [None, 10],
}
grid_search_rf = GridSearchCV(
estimator=rf_regressor,
param_grid=param_grid_rf,
scoring='neg_mean_squared_error',
cv=5,
n_jobs=-1,
verbose=1
)
metrics_rf = evaluate_model(grid_search_rf, X_train, y_train, X_test, y_test)
metrics_rf['Model'] = 'RF'
# 4. CatBoost Model Training
params_catboost = {
'learning_rate': 0.02,
'depth': 6,
'loss_function': 'RMSE',
'verbose': 100,
'random_seed': 42,
'thread_count': -1,
'subsample': 0.7,
'l2_leaf_reg': 3.0
}
grid_search_catboost = GridSearchCV(
estimator=CatBoostRegressor(**params_catboost),
param_grid={
'iterations': [100, 200],
'depth': [3, 4],
'learning_rate': [0.01, 0.02],
},
scoring='neg_mean_squared_error',
cv=5,
n_jobs=-1,
verbose=1
)
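# Note: values listed in param_grid (e.g. learning_rate, depth) override the
# matching entries in params_catboost during the search.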
metrics_catboost = evaluate_model(grid_search_catboost, X_train, y_train, X_test, y_test)
metrics_catboost['Model'] = 'CatBoost'
# Compile all metrics into a single DataFrame
metrics_df = pd.DataFrame([metrics_xgb, metrics_rf, metrics_catboost])
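# A small inspection step (a sketch, not part of the original listing): print the
# collected metrics and the best hyper-parameters found by each search.
print(metrics_df[['Model', 'Correlation', 'RMSE', 'R2 Score']])
for name, gs in [('XGBoost', grid_search_xgb), ('RF', grid_search_rf), ('CatBoost', grid_search_catboost)]:
    print(name, 'best params:', gs.best_params_)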
# 5. Visualization: Taylor Diagram
def set_tayloraxes(fig, location):
    # Polar transform without the implicit theta transform (requires a recent Matplotlib)
    trans = PolarAxes.PolarTransform(apply_theta_transforms=False)
    # Correlation ticks along the arc: theta = arccos(R)
    r1_locs = np.hstack((np.arange(1, 10)/10.0, [0.95, 0.99]))
    t1_locs = np.arccos(r1_locs)
    gl1 = FixedLocator(t1_locs)
    tf1 = DictFormatter(dict(zip(t1_locs, map(str, r1_locs))))
    # Normalized standard deviation ticks along the radius
    r2_locs = np.arange(0, 2, 0.25)
    r2_labels = ['0', '0.25', '0.50', '0.75', '1.00', '1.25', '1.50', '1.75']
    gl2 = FixedLocator(r2_locs)
    tf2 = DictFormatter(dict(zip(r2_locs, r2_labels)))
    ghelper = floating_axes.GridHelperCurveLinear(trans, extremes=(0, np.pi/2, 0, 1.75),
                                                  grid_locator1=gl1, tick_formatter1=tf1,
                                                  grid_locator2=gl2, tick_formatter2=tf2)
    ax = floating_axes.FloatingSubplot(fig, location, grid_helper=ghelper)
    fig.add_subplot(ax)
    ax.axis["top"].set_axis_direction("bottom")
    ax.axis["top"].toggle(ticklabels=True, label=True)
    ax.axis["top"].major_ticklabels.set_axis_direction("top")
    ax.axis["top"].label.set_text("Correlation")
    ax.axis["top"].label.set_fontsize(14)
    ax.axis["left"].set_axis_direction("bottom")
    ax.axis["left"].label.set_text("Standard deviation")
    ax.axis["left"].label.set_fontsize(14)
    ax.axis["right"].set_axis_direction("top")
    ax.axis["right"].toggle(ticklabels=True)
    ax.axis["right"].major_ticklabels.set_axis_direction("left")
    ax.axis["bottom"].set_visible(False)
    ax.grid(True)
    polar_ax = ax.get_aux_axes(trans)
    # Create contours for the Taylor Diagram: centered RMSE for a reference field
    # normalized to standard deviation 1 (E'^2 = 1 + r^2 - 2*r*cos(theta))
    rs, ts = np.meshgrid(np.linspace(0, 1.75, 100), np.linspace(0, np.pi/2, 100))
    rms = np.sqrt(1 + rs**2 - 2 * rs * np.cos(ts))
    CS = polar_ax.contour(ts, rs, rms, colors='gray', linestyles='--')
    plt.clabel(CS, inline=1, fontsize=10)
    return polar_ax
def plot_taylor(ax, std_obs, std_pred, correlation, **kwargs):
    # Angle encodes the correlation, radius the normalized standard deviation
    theta = np.arccos(correlation)
    radius = std_pred / std_obs
    ax.plot(theta, radius, **kwargs)
# Create and display the Taylor Diagram
fig = plt.figure(figsize=(8, 8), dpi=1200)
ax = set_tayloraxes(fig, 111)
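# A minimal sketch (an assumption, not part of the original listing): draw the
# reference arc at normalized standard deviation 1 (the red dashed line described
# above) and mark the observation reference point at correlation 1.
theta_ref = np.linspace(0, np.pi / 2, 100)
ax.plot(theta_ref, np.ones_like(theta_ref), 'r--', linewidth=1)
plot_taylor(ax, 1.0, 1.0, 1.0, marker='*', markersize=12, color='black', label='Observation')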
# Plot each model's data point on the Taylor Diagram
for _, row in metrics_df.iterrows():
    plot_taylor(ax, row['Standard Deviation (Observed)'], row['Standard Deviation (Pred)'],
                row['Correlation'], marker='o', markersize=8, label=row['Model'])
# Add legend and save the figure
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.1))
plt.savefig("Python机器学习AI——1.pdf", format='pdf', bbox_inches='tight')
plt.show()