自定义特征选择转换器
# Transformer that keeps only the features sufficiently correlated with the response variable
class CustomCorrelationChooser(TransformerMixin, BaseEstimator):
    """Select the features whose correlation with a response exceeds a threshold.

    Parameters
    ----------
    response : pandas.Series (or single-response DataFrame)
        Response variable. It is aligned with ``X`` on the index during
        ``fit``. If a DataFrame is given, its last column is used.
    cols_keep : list of column labels, optional
        Columns to keep in ``transform``. Overwritten by ``fit``. ``None``
        (the default) behaves like an empty selection until ``fit`` is called.
    threshold : float, optional
        A feature is kept when the absolute value of its correlation with
        the response is strictly greater than this value. Must be set
        (directly or through ``set_params``) before ``fit`` is called.
    """

    def __init__(self, response, cols_keep=None, threshold=None):
        # Arguments are stored exactly as received: scikit-learn's
        # get_params()/clone() machinery expects __init__ not to modify them.
        # The default for cols_keep is None rather than a shared mutable list.
        self.response = response
        self.cols_keep = cols_keep
        self.threshold = threshold

    def transform(self, X):
        """Return ``X`` restricted to the selected columns."""
        # An unset selection behaves like the empty selection.
        selected = [] if self.cols_keep is None else self.cols_keep
        return X[selected]

    def fit(self, X, *_):
        """Record the columns of ``X`` that correlate strongly with the response.

        Any extra positional arguments (such as ``y``) are ignored; the
        response supplied to the constructor is used instead.

        Raises
        ------
        ValueError
            If ``threshold`` is still ``None``.
        """
        if self.threshold is None:
            raise ValueError(
                "CustomCorrelationChooser.threshold must be set before fit()"
            )

        response = self.response
        if isinstance(response, pd.DataFrame):
            # corrwith() would match a DataFrame by column label, so reduce
            # it to its last column first.
            response = response.iloc[:, -1]

        # Correlate each feature directly with the response instead of
        # building the full feature-by-feature matrix. This is also immune
        # to a response whose name collides with a feature name.
        strength = X.corrwith(response).abs()

        # NaN correlations (e.g. constant columns) compare False and are dropped.
        self.cols_keep = strength[strength > self.threshold].index.tolist()
        return self
# Selector configured with a 0.2 threshold and the "target" column as response.
ccc = CustomCorrelationChooser(response=df["target"], threshold=0.2)
# Fit on every column except the last one.
ccc.fit(df.iloc[:, :-1])
# Column names retained by the fit.
ccc.cols_keep
# Preview of the same columns reduced to the retained ones.
ccc.transform(df.iloc[:, :-1]).head()
组装流水线
from copy import deepcopy

from sklearn.pipeline import Pipeline

# Create the feature selector with the response variable. The threshold is
# left unset here because the parameter grid below supplies it.
ccc = CustomCorrelationChooser(response=df["target"])

# Build the pipeline: the selector followed by the model. The step names must
# match the prefixes used in the parameter grid.
# NOTE(review): the commented-out draft used `d_tree` as the classifier;
# `forest` is used here because it is the estimator the original call passed.
# Confirm which model is intended.
ccc_pipe = Pipeline([("Correlation_select", ccc), ("classifier", forest)])

# Pipeline parameters are addressed as "<step name>__<parameter name>", with a
# double underscore; a single underscore is rejected as an invalid parameter.
tree_pipe_params = {
    "classifier__max_depth": [None, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
}

# Copy the tree parameters, then add the selector's threshold candidates.
ccc_pipe_params = deepcopy(tree_pipe_params)
ccc_pipe_params.update({"Correlation_select__threshold": [0, .1, .2, .3]})
ccc_pipe_params
# Contents of ccc_pipe_params:
# {'Correlation_select__threshold': [0, 0.1, 0.2, 0.3],
#  'classifier__max_depth': [None, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}

# Search the whole pipeline so the threshold and the tree depth are tuned
# together (presumably get_best_model_and_accuracy wraps a grid search; it is
# defined elsewhere).
get_best_model_and_accuracy(ccc_pipe, ccc_pipe_params, df.iloc[:, :-1], df.iloc[:, -1])