数据爬取与基础模型搭建请戳快速问医生数据集分析
降维——统计特征重要性累计95%的特征个数
# Dimensionality reduction: plot the cumulative importance of the features
# (feature_importances is already sorted by importance, descending) and find
# how many features are needed to reach 95% cumulative importance.
sorted_importance = [item[1] for item in feature_importances]
sorted_features = [item[0] for item in feature_importances]
cul_importance = np.cumsum(sorted_importance)
plt.plot(x_values, cul_importance)
# Dashed red reference line at the 95% threshold.
plt.hlines(y=0.95, xmin=0, xmax=len(sorted_importance), colors='r', linestyles='dashed')
plt.xticks(x_values, sorted_features, rotation='vertical')
plt.xlabel('Variable')
plt.ylabel('Cul_Importance')
plt.title('Cul_Importance')
# Count of important features: np.where gives the first INDEX at which the
# cumulative importance exceeds 0.95, so the count is that index + 1
# (the original expression was off by one relative to the comment's intent).
np.where(cul_importance > 0.95)[0][0] + 1
# Keep only the top-8 most important features: look up their column
# indices, then slice both the train and test matrices down to them.
importance_features = [name for name, _ in feature_importances[:8]]
importance_indices = [feature_list.index(name) for name in importance_features]
im_train_features = train_features[:, importance_indices]
im_test_features = test_features[:, importance_indices]
print(im_train_features.shape)
print(im_test_features.shape)
# (1448, 8)
# (966, 8)
# Baseline random forest trained on the reduced (8-feature) data.
from sklearn.ensemble import RandomForestClassifier
# Instantiate model
rf = RandomForestClassifier(n_estimators=10, random_state=44)
rf.fit(im_train_features, train_labels)
predictions = rf.predict(im_test_features)
# Labels are 0/1, so "both 1 or both 0" is simply equality.
correct = [1 if a == b else 0 for a, b in zip(predictions, test_labels)]
# Divide by the actual test-set size instead of the hard-coded 966.
accuracy = sum(correct) / len(test_labels)
# accuracy is a 0-1 fraction; format it as a real percentage rather than
# appending '%' to the raw fraction.
print('accuracy = {0:.2%}'.format(accuracy))
accuracy = 0.8571428571428571%
- 相比于之前的基础模型,反而有些下降,这是因为RF本身就会考虑特征重要性的问题,会优先选有价值的特征。人为去掉一部分特征后,对于算法来说可供选择的特征就少了。
- 但是计算速度会提高
调参
- RF重要参数
rf.get_params()
#获取目前的参数
{'bootstrap': True,
'ccp_alpha': 0.0,
'class_weight': None,
'criterion': 'gini',
'max_depth': None,
'max_features': 'auto',
'max_leaf_nodes': None,
'max_samples': None,
'min_impurity_decrease': 0.0,
'min_impurity_split': None,
'min_samples_leaf': 1,
'min_samples_split': 2,
'min_weight_fraction_leaf': 0.0,
'n_estimators': 10,
'n_jobs': None,
'oob_score': False,
'random_state': 44,
'verbose': 0,
'warm_start': False}
- 先用随机参数筛选
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyperparameter ranges for the random search.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# 'auto' was just an alias of 'sqrt' for classifiers (and was removed in
# scikit-learn 1.3), so ['auto', 'sqrt'] duplicated a single candidate.
max_features = ['sqrt']
max_depth = [int(x) for x in np.linspace(10, 100, num=10)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
rf = RandomForestClassifier()
# Random search over 100 parameter combinations with 3-fold CV.
# This is a classification task, so score with accuracy; for 0/1 labels the
# original 'neg_mean_absolute_error' ranked models identically (it equals
# accuracy - 1), but 'accuracy' states the intent directly.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=100, scoring='accuracy',
                               cv=3, verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(train_features, train_labels)
# Best parameters found by the random search.
rf_random.best_params_
{'n_estimators': 2000,
'min_samples_split': 10,
'min_samples_leaf': 4,
'max_features': 'sqrt',
'max_depth': 10,
'bootstrap': False}
# Retrain with the best parameters from the random search and evaluate on
# the held-out test set.
from sklearn.ensemble import RandomForestClassifier
# random_state is pinned so the run is reproducible (the original omitted
# it, so every run reported a slightly different accuracy).
rf = RandomForestClassifier(n_estimators=2000, min_samples_split=10,
                            min_samples_leaf=4,
                            max_features='sqrt',
                            max_depth=10,
                            bootstrap=False,
                            random_state=44)
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
# Labels are 0/1, so a correct prediction is plain equality.
correct = [1 if a == b else 0 for a, b in zip(predictions, test_labels)]
# Use the real test-set size rather than the hard-coded 966.
accuracy = sum(correct) / len(test_labels)
print('accuracy = {0:.2%}'.format(accuracy))
新模型1:accuracy = 0.8633540372670807%
老模型:accuracy = 0.860248447204969%
- 再用网格搜索,将所有组合都过一遍
from sklearn.model_selection import GridSearchCV
# Exhaustive grid narrowed around the best random-search parameters.
param_grid = {
    'bootstrap': [False],
    'max_depth': [10, 20, 30, 40],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [1000, 1200, 1400, 2000]
}
rf = RandomForestClassifier()
# This is classification, so score with 'accuracy'; for 0/1 labels the
# original 'neg_mean_absolute_error' produced the same ranking (it equals
# accuracy - 1) but obscured the intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='accuracy', cv=3,
                           n_jobs=-1, verbose=2)
grid_search.fit(train_features, train_labels)
grid_search.best_params_
{'bootstrap': False,
'max_depth': 10,
'max_features': 3,
'min_samples_leaf': 5,
'min_samples_split': 8,
'n_estimators': 1000}
# Final model with the grid-search parameters, evaluated on the test set.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(bootstrap=False,
                            max_depth=10,
                            max_features=3,
                            min_samples_leaf=5,
                            min_samples_split=8,
                            n_estimators=1000,
                            random_state=44)  # pinned for reproducibility
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
# Labels are 0/1, so a correct prediction is plain equality.
correct = [1 if a == b else 0 for a, b in zip(predictions, test_labels)]
# Use the real test-set size rather than the hard-coded 966.
accuracy = sum(correct) / len(test_labels)
print('accuracy = {0:.2%}'.format(accuracy))
新模型2:accuracy = 0.865424430641822%
新模型1:accuracy = 0.8633540372670807%
老模型:accuracy = 0.860248447204969%
最终确定下来的参数
'bootstrap': False,
'max_depth': 10,
'max_features': 3,
'min_samples_leaf': 5,
'min_samples_split': 8,
'n_estimators': 1000