背景:手头有一个回归任务,出于保护客户数据的考虑,这里用鸢尾花(Iris)数据集代替真实数据,演示随机森林算法的部分功能实现。注意:鸢尾花数据集的目标值是类别标签(0/1/2),本文仅为演示流程而把它当作连续值来做回归。
完整代码见文末第 6 节。
1. 加载数据集
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
# Load the Iris dataset as a stand-in for the real data, then split it into
# training and test sets (20% held out, fixed seed for a repeatable split).
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)
# Print the description text bundled with the dataset.
print(iris.DESCR)
运行后会打印数据集的描述信息,下面是其中的一部分:
:Summary Statistics:

============== ==== ==== ======= ===== ====================
                Min  Max   Mean    SD   Class Correlation
============== ==== ==== ======= ===== ====================
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)
============== ==== ==== ======= ===== ====================

:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988
2. 输出数据特征之间的相关性矩阵
# Feature names label both heatmap axes. They must be defined before plotting,
# otherwise the sns.heatmap call below raises NameError.
feature_names = iris.feature_names
# Correlation matrix between features; rowvar=False treats each column of
# X_train as a variable and each row as an observation.
correlation_matrix = np.corrcoef(X_train, rowvar=False)
# Visualize the correlation matrix as an annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    xticklabels=feature_names,
    yticklabels=feature_names,
)
plt.title('Correlation Matrix of Iris Features')
plt.show()
3. 训练模型并保存joblib文件
# Build a 100-tree random forest regressor (seeded for repeatable results) and
# fit it on the training split; fit() returns the estimator itself.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
# Persist the fitted model to disk.
joblib.dump(rf_model, 'random_forest_model.joblib')
4. 加载模型并预测输出均方误差和R方评估指标
# Reload the model from the file written by joblib.dump.
loaded_model = joblib.load('random_forest_model.joblib')
# Predict on the test split with the reloaded model.
y_pred = loaded_model.predict(X_test)
# Evaluate the predictions: mean squared error and R-squared.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
Mean Squared Error: 0.0013833333333333336
R-squared: 0.9980206677265501
5. 特征重要性分析
# Per-feature importance scores exposed by the fitted model.
feature_importances = loaded_model.feature_importances_
# Names used for the x-axis tick labels; defined here so this step does not
# depend on an earlier snippet having set them.
feature_names = iris.feature_names
print('Feature Importances:')
for i, importance in enumerate(feature_importances):
    print(f'Feature {i+1}: {importance}')
# Plot the importances as a bar chart, most important feature first.
plt.figure(figsize=(10, 6))
sorted_idx = np.argsort(feature_importances)[::-1]  # indices in descending order
positions = list(range(len(feature_importances)))
plt.bar(positions, feature_importances[sorted_idx], align='center')
plt.xticks(positions, np.array(feature_names)[sorted_idx], rotation=0)
plt.xlabel('Feature')
plt.ylabel('Importance Score')
plt.title('Feature Importance Scores')
plt.show()
Feature Importances:
Feature 1: 0.007247638926907056
Feature 2: 0.01241623468021743
Feature 3: 0.4956256973314748
Feature 4: 0.48471042906140077
6. 完整代码
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
# Load the Iris dataset as a stand-in for the real data, then split it into
# training and test sets (20% held out, fixed seed for a repeatable split).
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)
# print(iris.DESCR)
# Build a 100-tree random forest regressor (seeded for repeatable results) and
# fit it on the training split; fit() returns the estimator itself.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
# Persist the fitted model to disk.
joblib.dump(rf_model, 'random_forest_model.joblib')
# Reload the model from the file written by joblib.dump.
loaded_model = joblib.load('random_forest_model.joblib')
# Predict on the test split with the reloaded model.
y_pred = loaded_model.predict(X_test)
# Evaluate the predictions: mean squared error and R-squared.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
# Feature names label both heatmap axes.
feature_names = iris.feature_names
# Correlation matrix between features; rowvar=False treats each column of
# X_train as a variable and each row as an observation.
correlation_matrix = np.corrcoef(X_train, rowvar=False)
# Visualize the correlation matrix as an annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    xticklabels=feature_names,
    yticklabels=feature_names,
)
plt.title('Correlation Matrix of Iris Features')
plt.show()
# Per-feature importance scores exposed by the fitted model.
feature_importances = loaded_model.feature_importances_
print('Feature Importances:')
for i, importance in enumerate(feature_importances):
    print(f'Feature {i+1}: {importance}')
# Plot the importances as a bar chart, most important feature first.
plt.figure(figsize=(10, 6))
sorted_idx = np.argsort(feature_importances)[::-1]  # indices in descending order
positions = list(range(len(feature_importances)))
plt.bar(positions, feature_importances[sorted_idx], align='center')
plt.xticks(positions, np.array(feature_names)[sorted_idx], rotation=0)
plt.xlabel('Feature')
plt.ylabel('Importance Score')
plt.title('Feature Importance Scores')
plt.show()
后续还可以加入超参数寻优逻辑,例如网格搜索(GridSearchCV)、交叉验证(cross_val_score)等。