Preface
Purpose
Model comparison:
A Taylor diagram gives an intuitive comparison of model performance: it summarizes each model by its standard deviation and its correlation with the observations, and overlays RMSE contours.
Standard deviation ratio: the radial distance from the origin to a point is the ratio of the prediction's standard deviation to the observation's standard deviation.
Correlation angle: the angle measured from the x-axis (0 radians) to a point encodes the correlation between observations and predictions.
Red dashed line: a reference arc at a normalized standard deviation of 1, serving as a quick visual benchmark for how closely a model reproduces the observed variability.
The RMSE contours show the centered root-mean-square error between predictions and observations and are usually drawn as dashed lines (the relation behind them is given below); points with a standard deviation ratio near 1 and a small angle, i.e. high correlation, lie close to the reference point and indicate better models.
The final Taylor diagram compares each model against the observed data, giving an intuitive picture of their statistical characteristics.
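The geometry behind the diagram is the law of cosines linking the centered RMS difference E' to the two standard deviations (σ_f for the prediction, σ_r for the observation) and the correlation R:

E'^2 = σ_f^2 + σ_r^2 - 2 σ_f σ_r R

After normalizing both standard deviations by σ_r, as the code below does, this reduces to E'^2 = 1 + r^2 - 2 r cos θ, which is exactly the expression used to draw the dashed contours.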
Code implementation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from mpl_toolkits.axisartist import floating_axes
from mpl_toolkits.axisartist.grid_finder import FixedLocator, DictFormatter
from matplotlib.projections import PolarAxes
# Set global plot parameters
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['axes.unicode_minus'] = False
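# Note: 'Times New Roman' must be available to Matplotlib; if the font is missing,
# Matplotlib falls back to its default font and prints a findfont warning.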
# 1. Data Reading and Splitting
df = pd.read_excel('机器学习AI.xlsx')
# Split features and target variable
X = df.drop(['待预测变量Y'], axis=1)
y = df['待预测变量Y']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
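# Note: the Excel file name and the target column '待预测变量Y' are placeholders;
# point them at your own dataset before running.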
# Function to evaluate and collect metrics
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Calculate evaluation metrics
    metrics = {
        'Standard Deviation (Pred)': np.std(y_pred),
        'Standard Deviation (Observed)': np.std(y_test),
        'Correlation': np.corrcoef(y_test, y_pred)[0, 1],
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R2 Score': r2_score(y_test, y_pred)
    }
    return metrics
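# Note: when a GridSearchCV object is passed as `model`, .fit() runs the
# cross-validated search and .predict() delegates to the refit best estimator,
# so the reported metrics describe the tuned model.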
# 2. XGBoost Model Training
params_xgb = {
'learning_rate': 0.02,
'booster': 'gbtree',
'objective': 'reg:squarederror',
'max_leaves': 127,
'verbosity': 1,
'seed': 42,
'nthread': -1,
'colsample_bytree': 0.6,
'subsample': 0.7
}
# Grid search for XGBoost
param_grid_xgb = {
'n_estimators': [100, 200],
'max_depth': [3, 4],
'min_child_weight': [1, 2],
}
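# 2 x 2 x 2 = 8 parameter combinations x 5 CV folds = 40 cross-validation fits
# (plus one final refit on the full training set).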
grid_search_xgb = GridSearchCV(
estimator=xgb.XGBRegressor(**params_xgb),
param_grid=param_grid_xgb,
scoring='neg_root_mean_squared_error',
cv=5,
n_jobs=-1,
verbose=1
)
metrics_xgb = evaluate_model(grid_search_xgb, X_train, y_train, X_test, y_test)
metrics_xgb['Model'] = 'XGBoost'
# 3. Random Forest Model Training
rf_regressor = RandomForestRegressor(random_state=42)
param_grid_rf = {
'n_estimators': [100, 200],
'max_depth': [None, 10],
}
grid_search_rf = GridSearchCV(
estimator=rf_regressor,
param_grid=param_grid_rf,
scoring='neg_mean_squared_error',
cv=5,
n_jobs=-1,
verbose=1
)
metrics_rf = evaluate_model(grid_search_rf, X_train, y_train, X_test, y_test)
metrics_rf['Model'] = 'RF'
# 4. CatBoost Model Training
params_catboost = {
'learning_rate': 0.02,
'depth': 6,
'loss_function': 'RMSE',
'verbose': 100,
'random_seed': 42,
'thread_count': -1,
'subsample': 0.7,
'l2_leaf_reg': 3.0
}
grid_search_catboost = GridSearchCV(
estimator=CatBoostRegressor(**params_catboost),
param_grid={
'iterations': [100, 200],
'depth': [3, 4],
'learning_rate': [0.01, 0.02],
},
scoring='neg_mean_squared_error',
cv=5,
n_jobs=-1,
verbose=1
)
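# Note: values listed in param_grid (e.g. learning_rate, depth) override the
# matching entries in params_catboost during the search.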
metrics_catboost = evaluate_model(grid_search_catboost, X_train, y_train, X_test, y_test)
metrics_catboost['Model'] = 'CatBoost'
# Compile all metrics into a single DataFrame
metrics_df = pd.DataFrame([metrics_xgb, metrics_rf, metrics_catboost])
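# A small inspection step (a sketch, not part of the original listing): print the
# collected metrics and the best hyper-parameters found by each search.
print(metrics_df[['Model', 'Correlation', 'RMSE', 'R2 Score']])
for name, gs in [('XGBoost', grid_search_xgb), ('RF', grid_search_rf), ('CatBoost', grid_search_catboost)]:
    print(name, 'best params:', gs.best_params_)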
# 5. Visualization: Taylor Diagram
def set_tayloraxes(fig, location):
    # Polar transform without the implicit theta transform (requires a recent Matplotlib)
    trans = PolarAxes.PolarTransform(apply_theta_transforms=False)
    # Correlation ticks along the arc: theta = arccos(R)
    r1_locs = np.hstack((np.arange(1, 10)/10.0, [0.95, 0.99]))
    t1_locs = np.arccos(r1_locs)
    gl1 = FixedLocator(t1_locs)
    tf1 = DictFormatter(dict(zip(t1_locs, map(str, r1_locs))))
    # Normalized standard deviation ticks along the radius
    r2_locs = np.arange(0, 2, 0.25)
    r2_labels = ['0', '0.25', '0.50', '0.75', '1.00', '1.25', '1.50', '1.75']
    gl2 = FixedLocator(r2_locs)
    tf2 = DictFormatter(dict(zip(r2_locs, r2_labels)))
    ghelper = floating_axes.GridHelperCurveLinear(trans, extremes=(0, np.pi/2, 0, 1.75),
                                                  grid_locator1=gl1, tick_formatter1=tf1,
                                                  grid_locator2=gl2, tick_formatter2=tf2)
    ax = floating_axes.FloatingSubplot(fig, location, grid_helper=ghelper)
    fig.add_subplot(ax)
    ax.axis["top"].set_axis_direction("bottom")
    ax.axis["top"].toggle(ticklabels=True, label=True)
    ax.axis["top"].major_ticklabels.set_axis_direction("top")
    ax.axis["top"].label.set_text("Correlation")
    ax.axis["top"].label.set_fontsize(14)
    ax.axis["left"].set_axis_direction("bottom")
    ax.axis["left"].label.set_text("Standard deviation")
    ax.axis["left"].label.set_fontsize(14)
    ax.axis["right"].set_axis_direction("top")
    ax.axis["right"].toggle(ticklabels=True)
    ax.axis["right"].major_ticklabels.set_axis_direction("left")
    ax.axis["bottom"].set_visible(False)
    ax.grid(True)
    polar_ax = ax.get_aux_axes(trans)
    # Create contours for the Taylor Diagram: centered RMSE for a reference field
    # normalized to standard deviation 1 (E'^2 = 1 + r^2 - 2*r*cos(theta))
    rs, ts = np.meshgrid(np.linspace(0, 1.75, 100), np.linspace(0, np.pi/2, 100))
    rms = np.sqrt(1 + rs**2 - 2 * rs * np.cos(ts))
    CS = polar_ax.contour(ts, rs, rms, colors='gray', linestyles='--')
    plt.clabel(CS, inline=1, fontsize=10)
    return polar_ax
def plot_taylor(ax, std_obs, std_pred, correlation, **kwargs):
    # Angle encodes the correlation, radius the normalized standard deviation
    theta = np.arccos(correlation)
    radius = std_pred / std_obs
    ax.plot(theta, radius, **kwargs)
# Create and display the Taylor Diagram
fig = plt.figure(figsize=(8, 8), dpi=1200)
ax = set_tayloraxes(fig, 111)
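# A minimal sketch (an assumption, not part of the original listing): draw the
# reference arc at normalized standard deviation 1 (the red dashed line described
# above) and mark the observation reference point at correlation 1.
theta_ref = np.linspace(0, np.pi / 2, 100)
ax.plot(theta_ref, np.ones_like(theta_ref), 'r--', linewidth=1)
plot_taylor(ax, 1.0, 1.0, 1.0, marker='*', markersize=12, color='black', label='Observation')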
# Plot each model's data point on the Taylor Diagram
for _, row in metrics_df.iterrows():
    plot_taylor(ax, row['Standard Deviation (Observed)'], row['Standard Deviation (Pred)'],
                row['Correlation'], marker='o', markersize=8, label=row['Model'])
# Add legend and save the figure
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.1))
plt.savefig("Python机器学习AI——1.pdf", format='pdf', bbox_inches='tight')
plt.show()