# Grid search for a random forest
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

# Load the data and inspect the class distribution
pima_indians_diabetes = pd.read_csv("pima_indians_diabetes.csv", header=None)
pima_indians_diabetes.columns = ['Number of times pregnant', 'Plasma glucose concentration',
                                 'Diastolic blood pressure', 'Triceps skin fold thickness',
                                 '2-Hour serum insulin', 'Body mass index',
                                 'Diabetes pedigree function', 'Age', 'Class']
Class = 'Class'  # 'Class' is the binary classification target column
print(pima_indians_diabetes.head())
print(pima_indians_diabetes.iloc[:, 8].value_counts())  # target distribution -- 0: 500, 1: 268

train, test = train_test_split(pima_indians_diabetes, test_size=0.2, random_state=0)

# Features are all columns except the target in column 8
x_columns = [x for x in train.columns if x not in [Class]]
train_x = train[x_columns]
train_y = train[Class]

# Build the model
features_list = train_x.columns.values  # column names of train_x
forest = RandomForestClassifier(n_estimators=70, max_depth=3, min_samples_leaf=10,
                                min_samples_split=80, oob_score=True, random_state=10)
forest.fit(train_x, train_y)
feature_importance = forest.feature_importances_

# Rescale the importances so the largest is 100
feature_importance = 100.0 * (feature_importance / feature_importance.max())
fi_threshold = 15  # keep features whose importance exceeds 15% of the maximum
important_idx = np.where(feature_importance > fi_threshold)[0]
important_features = features_list[important_idx]
print("\n", important_features.shape[0], "Important features (", fi_threshold,
      "% of max importance):\n", important_features)
# 4 Important features ( 15 % of max importance):
# ['Number of times pregnant' 'Plasma glucose concentration' 'Body mass index' 'Age']

# Sort by importance, descending
sorted_idx = np.argsort(feature_importance[important_idx])[::-1]
print("\nFeatures sorted by importance (DESC):\n", important_features[sorted_idx])
pos = np.arange(sorted_idx.shape[0]) + .5
plt.barh(pos, feature_importance[important_idx][sorted_idx[::-1]], align='center')
plt.yticks(pos, important_features[sorted_idx[::-1]])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

# Hyperparameter tuning
# Grid search exhaustively tries every parameter combination; it usually finds a
# better model but is slow. Random search gives slightly worse results in much less
# time. If the parameter space is large, run a random search first to narrow the
# range, then grid-search within it (a RandomizedSearchCV sketch follows the report
# helper below).

# Print the best hyperparameter sets from a fitted search's cv_results_
def report(cv_results, n_top=5):
    for rank in range(1, n_top + 1):
        for i in np.flatnonzero(cv_results['rank_test_score'] == rank):
            print("Parameters with rank: {0}".format(rank))
            print("Mean validation score: {0:.4f} (std: {1:.4f})".format(
                cv_results['mean_test_score'][i], cv_results['std_test_score'][i]))
            print("Parameters: {0}".format(cv_results['params'][i]))
            print("")
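# --- A minimal sketch of the random-search-then-grid-search idea described above. ---
# The distributions and n_iter here are illustrative assumptions, not part of the
# original tuning run; RandomizedSearchCV samples a fixed number of candidate
# settings instead of enumerating the full grid.
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

rand_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(oob_score=True, random_state=10),
    param_distributions={
        "n_estimators": randint(10, 200),      # sample tree counts
        "max_depth": randint(3, 15),           # sample tree depths
        "min_samples_split": randint(2, 150),  # sample split thresholds
    },
    n_iter=20,  # only 20 sampled combinations rather than an exhaustive grid
    scoring='roc_auc',
    cv=5,
    random_state=10,
)
rand_search.fit(train_x, train_y)
print(rand_search.best_params_, rand_search.best_score_)
# A grid search can then be run over a narrow range around rand_search.best_params_.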
# Square root of the number of features
sqrtfeat = int(np.sqrt(train_x.shape[1]))
# An example parameter space for an exhaustive grid search (defined for reference)
grid_test1 = {"n_estimators": [1000, 2500, 5000],
              "criterion": ["gini", "entropy"],
              "max_features": [sqrtfeat - 1, sqrtfeat, sqrtfeat + 1],
              "max_depth": [5, 10, 25],
              "min_samples_split": [2, 5, 10]}

# Fit with all-default parameters as a baseline
rf0 = RandomForestClassifier(oob_score=True, random_state=10)
rf0.fit(train_x, train_y)
print(rf0.oob_score_)  # 0.682410423453
y_predprob = rf0.predict_proba(train_x)[:, 1]
print("AUC Score (Train): %f" % metrics.roc_auc_score(train_y, y_predprob))
# AUC Score (Train): 0.999528
# The out-of-bag score of 0.682 estimates the model's generalization ability.
# AUC is the probability that a randomly drawn positive sample is ranked above a
# randomly drawn negative one; the near-perfect training AUC against the much lower
# OOB score suggests the default model overfits the training data.

# Grid search over n_estimators, from 10 to 70 in steps of 10
param_test1 = {'n_estimators': range(10, 71, 10)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(min_samples_split=100,
                                                         min_samples_leaf=20, max_depth=8,
                                                         max_features='sqrt', random_state=10),
                        param_grid=param_test1, scoring='roc_auc', cv=5)
gsearch1.fit(train_x, train_y)
report(gsearch1.cv_results_)
print(gsearch1.best_params_, gsearch1.best_score_)

# The best number of trees is n_estimators = 70. Next, grid-search the maximum tree
# depth max_depth together with min_samples_split, the minimum number of samples
# required to split an internal node.
param_test2 = {'max_depth': range(3, 14, 2), 'min_samples_split': range(50, 201, 20)}
gsearch2 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=70, min_samples_leaf=20,
                                                         max_features='sqrt', oob_score=True,
                                                         random_state=10),
                        param_grid=param_test2, scoring='roc_auc', cv=5)
gsearch2.fit(train_x, train_y)
report(gsearch2.cv_results_)
print(gsearch2.best_params_, gsearch2.best_score_)

# Best values: 'max_depth': 3 and 'min_samples_split': 70. Now check the OOB score:
rf1 = RandomForestClassifier(n_estimators=70, max_depth=3, min_samples_split=70,
                             min_samples_leaf=20, max_features='sqrt', oob_score=True,
                             random_state=10)
rf1.fit(train_x, train_y)
print(rf1.oob_score_)  # 0.749185667752

# The OOB score improved somewhat, so generalization is better. min_samples_split
# cannot be fixed yet because it interacts with the other tree parameters, so tune
# min_samples_split together with min_samples_leaf, the minimum number of samples
# required at a leaf node.
param_test3 = {'min_samples_split': range(80, 150, 20), 'min_samples_leaf': range(10, 60, 10)}
gsearch3 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=70, max_depth=3,
                                                         max_features='sqrt', oob_score=True,
                                                         random_state=10),
                        param_grid=param_test3, scoring='roc_auc', cv=5)
gsearch3.fit(train_x, train_y)
report(gsearch3.cv_results_)
print(gsearch3.best_params_, gsearch3.best_score_)
# Best values: 'min_samples_split': 80 (minimum samples to split an internal node)
# and 'min_samples_leaf': 10 (minimum samples at a leaf).

# Finally, tune the maximum number of features, max_features.
# The param_grid argument of GridSearchCV is a dict (or a list of dicts).
# train_x has only 8 features, so the grid must not exceed 8.
param_test4 = {'max_features': range(3, 9, 2)}
gsearch4 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=70, max_depth=3,
                                                         min_samples_leaf=10, min_samples_split=80,
                                                         oob_score=True, random_state=10),
                        param_grid=param_test4, scoring='roc_auc', cv=5)
gsearch4.fit(train_x, train_y)
report(gsearch4.cv_results_)
print(gsearch4.best_params_, gsearch4.best_score_)

# With the best parameters found, fit the final model
rf2 = RandomForestClassifier(n_estimators=70, max_depth=3, min_samples_leaf=10,
                             min_samples_split=80, oob_score=True, random_state=10)
rf2.fit(train_x, train_y)
print(rf2.oob_score_)  # 0.750814332248
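# --- Sketch: score the tuned model on the held-out 20% test split created earlier. ---
# The original walkthrough never scores the test set; the metric choices below are
# illustrative assumptions, not part of the original run.
test_x = test[x_columns]
test_y = test[Class]
test_pred = rf2.predict(test_x)
test_prob = rf2.predict_proba(test_x)[:, 1]
print("Test accuracy: %f" % metrics.accuracy_score(test_y, test_pred))
print("Test AUC: %f" % metrics.roc_auc_score(test_y, test_prob))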