机器学习误差图绘-CSDN博客

本文链接：https://blog.csdn.net/2301_81197800/article/details/147357749

机器学习误差图绘制

绘图类

# Define the ModelComparisonPlot class
class ModelComparisonPlot:
    def __init__(self, model_name):
        self.model_name = model_name
    
    def plot_comparison(self, y_val, y_pred, mse, mae, r2):
        # Create a figure with two subplots
        fig, axes = plt.subplots(1, 2, figsize=(11, 5))

        # Plot the predicted vs true values
        sns.regplot(x=y_val, y=y_pred, color='blue', scatter_kws={'alpha':0.5}, ax=axes[0])
        axes[0].plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], 'k--', lw=2)
        axes[0].set_xlabel('True values', fontsize=12)
        axes[0].set_ylabel('Predicted values', fontsize=12)
        axes[0].set_title('Predicted vs true values')
        axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

        # Plot the residuals vs predicted values
        residuals = y_val - y_pred
        sns.residplot(x=y_pred, y=residuals, color='blue', scatter_kws={'alpha':0.5}, ax=axes[1])
        axes[1].plot([y_val.min(), y_val.max()], [0, 0], 'k--', lw=2)
        axes[1].set_xlabel('Predicted values', fontsize=12)
        axes[1].set_ylabel('Residuals', fontsize=12)
        axes[1].set_title('Residual plot', fontsize=15)
        axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)
               # Add a title to the figure
        fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(self.model_name), fontsize=15)

        # Adjust the spacing between subplots
        plt.subplots_adjust(wspace=0.4)

        # Display the figure with the title
        plt.show()

独热编码

# Assuming 'Entity' is a categorical variable
X_encoded = pd.get_dummies(df, columns=['实体'], drop_first=True)

确定预测值

# Assuming 'gdp_growth' is target variable
X = X_encoded.drop('人均国内生产总值', axis=1)
y = X_encoded['人均国内生产总值']
X_encoded = X_encoded.fillna(0)  # Replace with your preferred imputation method

区分训练集与测试集

# Assuming X and y are features and labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

机器学习代码

ExtraTreesRegressor


# Create an Extra Trees Regressor model
model_ETR = ExtraTreesRegressor(
    max_depth=None,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=300
)

# Fit the model
model_ETR.fit(X_train, y_train)

# Make predictions
y_pred = model_ETR.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f"Model: {type(model_ETR).__name__}, mse: {mse}")
print(f"Model: {type(model_ETR).__name__}, mae: {mae}")
print(f"Model: {type(model_ETR).__name__}, r2: {r2}")
model_ETR_plot = ModelComparisonPlot('ExtraTreesRegressor')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")
# Plot the predicted vs true values
sns.regplot(x=y_test, y=y_pred, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)  # Remove color argument here
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals = y_test - y_pred
sns.residplot(x=y_pred, y=residuals, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)  # Remove color argument here
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(model_ETR_plot.model_name), fontsize=15, color="black")
# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()

DecisionTreeRegressor

# Create a decision tree regression model
dt_model = DecisionTreeRegressor()
# Fit the model
dt_model.fit(X_train, y_train)
# Make predictions
predictions = dt_model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
# Print evaluation metrics
print(f"Model: {type(dt_model).__name__}, mse: {mse}")
print(f"Model: {type(dt_model).__name__}, mae: {mae}")
print(f"Model: {type(dt_model).__name__}, r2: {r2}")
dt_model_plot = ModelComparisonPlot('DecisionTreeRegressor')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")
# Plot the predicted vs true values
sns.regplot(x=y_test, y=predictions, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)  # Remove color argument here
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals = y_test - predictions
sns.residplot(x=predictions, y=residuals, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)  # Remove color argument here
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(dt_model_plot.model_name), fontsize=15, color="black")
# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()

LinearRegression

# Create a linear regression model
linear_model = LinearRegression()

# Fit the model
linear_model.fit(X_train, y_train)

# Make predictions
predictions_linear = linear_model.predict(X_test)

# Evaluate the model
mse_linear = mean_squared_error(y_test, predictions_linear)
mae_linear = mean_absolute_error(y_test, predictions_linear)
r2_linear = r2_score(y_test, predictions_linear)

# Print evaluation metrics
print(f"Model: {type(linear_model).__name__}, mse: {mse_linear}")
print(f"Model: {type(linear_model).__name__}, mae: {mae_linear}")
print(f"Model: {type(linear_model).__name__}, r2: {r2_linear}")
linear_model_plot = ModelComparisonPlot('LinearRegression')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")
# Plot the predicted vs true values
sns.regplot(x=y_test, y=predictions_linear, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)  # Remove color argument here
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals_linear = y_test - predictions_linear
sns.residplot(x=predictions_linear, y=residuals_linear, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)  # Remove color argument here
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)
# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(linear_model_plot.model_name), fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()

KNeighborsRegressor

# Create a KNN regression model
knn_model = KNeighborsRegressor()

# Fit the model
knn_model.fit(X_train, y_train)

# Make predictions
predictions_knn = knn_model.predict(X_test)

# Evaluate the model
mse_knn = mean_squared_error(y_test, predictions_knn)
mae_knn = mean_absolute_error(y_test, predictions_knn)
r2_knn = r2_score(y_test, predictions_knn)

# Print evaluation metrics
print(f"Model: {type(knn_model).__name__}, mse: {mse_knn}")
print(f"Model: {type(knn_model).__name__}, mae: {mae_knn}")
print(f"Model: {type(knn_model).__name__}, r2: {r2_knn}")
knn_model_plot = ModelComparisonPlot('KNeighborsRegressor')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")
# Plot the predicted vs true values
sns.regplot(x=y_test, y=predictions_knn, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)  # Remove color argument here
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals_knn = y_test - predictions_knn
sns.residplot(x=predictions_knn, y=residuals_knn, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)  # Remove color argument here
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)
# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(knn_model_plot.model_name), fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()

XGboost

# Convert 'Density' column to numeric
X_train['Density'] = pd.to_numeric(X_train['Density'], errors='coerce')
X_test['Density'] = pd.to_numeric(X_test['Density'], errors='coerce')

# Drop rows with missing values after conversion
X_train = X_train.dropna()
X_test = X_test.dropna()

# Create an XGBoost regression model
xgb_model = XGBRegressor()

# Fit the model
xgb_model.fit(X_train, y_train)

# Make predictions
predictions_xgb = xgb_model.predict(X_test)

# Evaluate the model
mse_xgb = mean_squared_error(y_test, predictions_xgb)
mae_xgb = mean_absolute_error(y_test, predictions_xgb)
r2_xgb = r2_score(y_test, predictions_xgb)

# Print evaluation metrics
print(f"Model: {type(xgb_model).__name__}, mse: {mse_xgb}")
print(f"Model: {type(xgb_model).__name__}, mae: {mae_xgb}")
print(f"Model: {type(xgb_model).__name__}, r2: {r2_xgb}")
xgb_model_plot = ModelComparisonPlot('XGBRegressor')

# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")
# Plot the predicted vs true values
sns.regplot(x=y_test, y=predictions_xgb, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)  # Remove color argument here
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals_xgb = y_test - predictions_xgb
sns.residplot(x=predictions_xgb, y=residuals_xgb, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test.min(), y_test.max()], [0, 0], 'k--', lw=2)  # Remove color argument here
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)
# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\n{}'.format(xgb_model_plot.model_name), fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()

Gaussian Naive Bayes

# Convert continuous labels to binary categories
y_train_binary = (y_train > y_train.mean()).astype(int)
y_test_binary = (y_test > y_train.mean()).astype(int)
# Create a Naive Bayes model
nb_model = GaussianNB()
# Fit the model
nb_model.fit(X_train, y_train_binary)
# Make predictions
predictions = nb_model.predict(X_test)
X_encoded['Density'] = pd.to_numeric(X_encoded['Density'], errors='coerce')

# Drop 'gdp_growth' as before
X = X_encoded.drop('国内生产总值增长率', axis=1)
y = X_encoded['国内生产总值增长率']
X_encoded = X_encoded.fillna(0)

# Assuming X and y are your features and labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of algorithms to check
algorithms = [
    LinearRegression(),
    DecisionTreeRegressor(),
    KNeighborsRegressor(),
    XGBRegressor()
]

best_mse = float('inf')
best_model = None
# Loop through each algorithm
for model in algorithms:
    # Fit the model
    model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)

    # Evaluate the model using cross-validation with mean squared error
    mse_scores = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    mean_mse = np.mean(mse_scores)
    
    # Print the cross-validation mean squared error
    print(f"{model.__class__.__name__} - Cross-Validation MSE: {mean_mse}")

    # Update the best model if the current model has lower mean squared error
    if mean_mse < best_mse:
        best_mse = mean_mse
        best_model = model

# Print the best model and its mean squared error
print("\nBest Model:")
print(best_model)
print("Best Cross-Validation MSE:", best_mse)
# Set Seaborn style
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "#D3D3D3"})

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(11, 5), facecolor="#F5F5F5")
# Plot the predicted vs true values
sns.regplot(x=y_test_binary, y=predictions, color='red', scatter_kws={'alpha': 0.5}, ax=axes[0])
axes[0].plot([y_test_binary.min(), y_test_binary.max()], [y_test_binary.min(), y_test_binary.max()], 'k--', lw=2)
axes[0].set_xlabel('True values', fontsize=12, color="red")
axes[0].set_ylabel('Predicted values', fontsize=12, color="red")
axes[0].set_title('Predicted vs true values', color="red")
axes[0].grid(color='lightgray', linestyle='--', linewidth=0.5)

# Plot the residuals vs predicted values
residuals = y_test_binary - predictions
sns.residplot(x=predictions, y=residuals, color='green', scatter_kws={'alpha': 0.5}, ax=axes[1])
axes[1].plot([y_test_binary.min(), y_test_binary.max()], [0, 0], 'k--', lw=2)
axes[1].set_xlabel('Predicted values', fontsize=12, color="green")
axes[1].set_ylabel('Residuals', fontsize=12, color="green")
axes[1].set_title('Residual plot', fontsize=15, color="green")
axes[1].grid(color='lightgray', linestyle='--', linewidth=0.5)
# Add a title to the figure
fig.suptitle('Comparison of Predicted vs True Values and Residual Plot\nGaussianNB', fontsize=15, color="black")

# Adjust the spacing between subplots
plt.subplots_adjust(wspace=0.4)

# Display the figure with the title
plt.show()