目的:
1、搜索所有给定的参数,优化机器学习流水线
2、输出有助于评估流水线质量的指标
from sklearn.model_selection import GridSearchCV
def get_best_model_and_accuracy(model, params, X, y):
    """Grid-search ``params`` for ``model`` on ``(X, y)`` and print a summary.

    A ``GridSearchCV`` is built with ``error_score=0.``, fitted, and four
    lines are printed: the best mean cross-validated score, the parameter
    combination that achieved it, and the average fit and score times over
    all candidates (taken from ``cv_results_``).

    Args:
        model: Estimator to tune; passed to ``GridSearchCV`` as-is.
        params: Parameter grid passed to ``GridSearchCV`` as-is.
        X: Training features, forwarded to ``grid.fit``.
        y: Training targets, forwarded to ``grid.fit``.

    Returns:
        None. The results are only printed.
    """
    # error_score=0. makes a candidate whose fit raises count as score 0
    # instead of aborting the whole search.
    grid = GridSearchCV(model,
                        params,
                        error_score=0.)
    grid.fit(X, y)  # run the cross-validated search over every candidate

    # best_score_ uses the estimator's default scorer; the "Accuracy" label
    # is accurate for classifiers, whose default scorer is accuracy.
    print("Best Accuracy:{}".format(grid.best_score_))
    print("Best Params:{}".format(grid.best_params_))

    # Average time spent fitting, across all candidates.
    print("Average time to fit (s):{}".format(
        round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    # Average time spent scoring (predicting); an indicator of how the model
    # would perform at inference time in the real world.
    print("Average time to score (s):{}".format(
        round(grid.cv_results_['mean_score_time'].mean(), 3)))
2、创建基准机器学习流水线
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Hyper-parameter grids, one per baseline classifier. Each key is a
# constructor argument of the matching estimator created below.
lr_params = {'C': [1e-1, 1e0, 1e1, 1e2], 'penalty': ['l1', 'l2']}
knn_params = {'n_neighbors': [1, 3, 5, 7]}
tree_params = {'max_depth': [None, 1, 3, 5, 7]}
forest_params = {'n_estimators': [10, 50, 100], 'max_depth': [None, 1, 3, 5, 7]}

# lr_params searches both 'l1' and 'l2'. The default solver ('lbfgs' since
# scikit-learn 0.22) rejects 'l1', and because the grid search runs with
# error_score=0. those candidates would silently be scored 0. 'saga' accepts
# both penalties and multiclass targets; max_iter is raised because saga can
# need more than the default 100 iterations to converge.
lr = LogisticRegression(solver='saga', max_iter=10000)
knn = KNeighborsClassifier()
d_tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
from sklearn.datasets import load_iris
# Load the dataset once and reuse the result for the data, the target and
# the column names instead of calling load_iris() separately for each.
iris = load_iris()
data, target = iris.data, iris.target
# Build one frame: the feature columns followed by the class label, which
# ends up as the last column.
# NOTE(review): `pd` and `DataFrame` are not imported in the visible part of
# this file - confirm they are imported elsewhere.
df = pd.concat([DataFrame(data), DataFrame(target)], axis=1)
df.columns = iris.feature_names + ["target"]
df.head()
# Split the frame once: every column except the last is a feature, the last
# column is the label.
iris_features = df.iloc[:, :-1]
iris_labels = df.iloc[:, -1]

# Run the same grid-search report for each baseline model, in the same order.
for _estimator, _grid_spec in (
    (lr, lr_params),
    (knn, knn_params),
    (d_tree, tree_params),
    (forest, forest_params),
):
    get_best_model_and_accuracy(_estimator, _grid_spec, iris_features, iris_labels)