-
随机森林-重要性权重分析
# Random forest model: fit a regressor on (X, y) and plot the features with
# the highest importance scores.
# NOTE(review): X is expected to be a pandas DataFrame (its .columns are used
# as labels) and y the regression target; both are defined outside this
# snippet.
import matplotlib.pyplot as plt
import numpy as np  # needed for np.argsort below; was not imported before use
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

rf = RandomForestRegressor(n_estimators=50, random_state=1)
rf.fit(X, y)

importances = rf.feature_importances_
included = X.columns.values
# argsort yields ascending positions; reversing gives most-important first.
indices = np.argsort(importances)[::-1]

# Bar chart of the (at most) 20 highest-ranked features.
top20 = indices[:20]
plt.figure(figsize=(20, 5))
plt.bar(x=included[top20], height=importances[top20], color='g', edgecolor='k')
plt.xticks(rotation=-45, fontsize=12, ha='left')
plt.yticks(fontsize=12)
plt.show()
-
相关系数热力图
# import warnings
# warnings.filterwarnings("ignore")
# print(data.corr()) # 计算相关系数矩阵
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
def test(df):
    """Draw, save and show a heatmap of the correlation matrix of *df*.

    The matrix is whatever ``df.corr()`` returns. The figure is written to
    ``./相关系数热力图.png`` (relative to the current working directory) and
    then displayed.

    Side effect: updates the global matplotlib ``rcParams`` for the font
    family and minus-sign rendering, which affects later plots as well.

    Args:
        df: object exposing ``.corr()`` (presumably a pandas DataFrame —
            confirm against the caller).
    """
    dfData = df.corr()
    # Use a font that can render Chinese labels.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Presumably set so that minus signs still render with the font above.
    plt.rcParams['axes.unicode_minus'] = False
    plt.subplots(figsize=(29, 29))  # set the figure size
    # annot=True prints each coefficient in its cell; vmax=1 caps the colour scale.
    sns.heatmap(dfData, annot=True, vmax=1, square=True, cmap="Blues")
    # Save before show(): some backends clear the figure once it is shown.
    plt.savefig('./相关系数热力图.png')
    plt.show()


test(data)
-
多重共线性检查VIF
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np
# Rule of thumb: VIF < 10 -> no multicollinearity; 10 <= VIF < 100 -> strong
# multicollinearity; VIF >= 100 -> severe multicollinearity.
name = X.columns
exog = X.values  # design matrix, extracted once and reused for every column
vif = [variance_inflation_factor(exog, col) for col in range(len(name))]
VIF = pd.DataFrame({'feature': name, "VIF": vif})
方差膨胀因子(Variance Inflation Factor,以下简称 VIF)是指解释变量之间存在多重共线性时的方差与不存在多重共线性时的方差之比,即 $\mathrm{VIF}_i = \dfrac{1}{1 - R_i^2}$,其中 $R_i^2$ 为第 $i$ 个解释变量对其余解释变量做回归所得的决定系数。
注:运行上述代码时会出现警告,但仍然能够得到结果。