建模调参
建模调参目标
1、了解常用的机器学习模型,
2、掌握机器学习模型的建模与调参流程
(主要通过模型的效果来调参)
线性回归模型
# Clean the feature table: drop rows with missing values, replace '-' entries
# with 0, and renumber the rows from zero.
sample_feature = (
    sample_feature
    .dropna()
    .replace('-', 0)
    .reset_index(drop=True)
)
# Cast the column to float32 once the '-' entries have been replaced.
sample_feature['notRepairedDamage'] = sample_feature['notRepairedDamage'].astype(np.float32)

# Training frame: the continuous feature columns plus the 'price' column.
train = sample_feature[continuous_feature_name + ['price']]
train_X = train[continuous_feature_name]
train_Y = train['price']

# Report the feature-matrix and target shapes, one per line.
print(train_X.shape, train_Y.shape, sep='\n')
# Modelling: ordinary least-squares linear regression.
from sklearn.linear_model import LinearRegression

# NOTE: the `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2, so passing it raises TypeError on current releases. For unregularised
# least squares the fitted predictions and the coefficients (which were always
# reported on the original feature scale) should not change beyond numerical
# precision when it is dropped.
model = LinearRegression()
model = model.fit(train_X, train_Y)

# Inspect the fitted intercept and the per-feature weights, largest weight first.
print('intercept:' + str(model.intercept_))
sorted(dict(zip(continuous_feature_name, model.coef_)).items(), key=lambda x: x[1], reverse=True)
可视化效果:
import seaborn as sns

print('It is clear to see the price shows a typical exponential distribution')

# Left panel: every price. Right panel: only prices below the 90th percentile.
price_cutoff = np.quantile(train_Y, 0.9)
price_views = (train_Y, train_Y[train_Y < price_cutoff])

plt.figure(figsize=(15, 5))
for panel, prices in enumerate(price_views, start=1):
    plt.subplot(1, 2, panel)
    sns.distplot(prices)
可见数据存在长尾问题,进行log变换
# Log-transform the target; per the author's note the result looks roughly
# normal. The +1 keeps a price of zero finite (log(1) == 0).
train_Y_ln = np.log(train_Y + 1)
print('The transformed price seems like normal distribution')

# Left panel: every log-price. Right panel: only values below the 90th percentile.
log_cutoff = np.quantile(train_Y_ln, 0.9)
log_views = (train_Y_ln, train_Y_ln[train_Y_ln < log_cutoff])

plt.figure(figsize=(15, 5))
for panel, log_prices in enumerate(log_views, start=1):
    plt.subplot(1, 2, panel)
    sns.distplot(log_prices)
#五折交叉验证
因为在实际的训练中,训练的结果对于训练集的拟合程度通常还是挺好的(初始条件敏感),
但是对于训练集之外的数据的拟合程度通常就不那么令人满意了。
因此我们通常并不会把所有的数据集都拿来训练,而是分出一部分来(这一部分不参加训练)对训练集生成的参数进行测试,
相对客观地判断这些参数对训练集之外的数据的符合程度。这种思想就称为交叉验证(Cross Validation)。
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error,make_scorer
def log_transfer(func):
    """Wrap a metric so that it is evaluated on log-transformed inputs.

    Args:
        func: Callable taking ``(y, yhat)`` (plus optional keyword arguments)
            and returning a score.

    Returns:
        A callable that computes
        ``func(np.log(y), np.nan_to_num(np.log(yhat)), **kwargs)``.
        It carries the wrapped metric's name and docstring.
    """
    # Local import so the block does not rely on a file-level import.
    from functools import wraps

    @wraps(func)
    def wrapper(y, yhat, **kwargs):
        # np.log gives NaN for negative predictions; nan_to_num replaces NaN
        # with 0 (and infinities with large finite numbers) so the metric
        # receives finite predictions.
        result = func(np.log(y), np.nan_to_num(np.log(yhat)), **kwargs)
        return result

    return wrapper
# Five-fold cross-validation of the linear model on the untransformed target.
# The scorer takes the log of both labels and predictions, so the MAE is
# measured in log space.
scores = cross_val_score(model, X=train_X, y=train_Y, verbose=1, cv=5,
                         scoring=make_scorer(log_transfer(mean_absolute_error)))
print("AVG:", np.mean(scores))

# Five-fold cross-validation on the log-transformed target. train_Y_ln is
# already in log space, so plain MAE is used here; wrapping the metric in
# log_transfer would apply the logarithm a second time.
scores = cross_val_score(model, X=train_X, y=train_Y_ln, verbose=1, cv=5,
                         scoring=make_scorer(mean_absolute_error))
print("log_AVG:", np.mean(scores))

# Present the per-fold scores of the second run as a one-row table.
scores = pd.DataFrame(scores.reshape(1, -1))
scores.columns = ['cv' + str(x) for x in range(1, 6)]
scores.index = ['MAE']
scores
由于我们并不具有预知未来的能力,五折交叉验证在某些与时间相关的数据集上反而反映了不真实的情况。
例如,通过2018年的二手车价格预测2017年的二手车价格,这显然是不合理的,因此我们还可以采用时间顺序
对数据集进行分隔。在本例中,我们选用靠前时间的4/5样本当作训练集,靠后时间的1/5当作验证集,
最终结果与五折交叉验证差距不大。
#多种线性模型
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
决策树模型 GBDT模型 XGBoost模型 LightGBM模型
#非线性模型
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm.sklearn import LGBMRegressor

# Candidate regressors to compare by cross-validation.
models = [
    LinearRegression(),
    # SVR rather than SVC: SVC is a classifier and cannot be fitted to a
    # continuous target such as the log price.
    # NOTE(review): kernel SVR can be slow on large training sets; consider
    # subsampling or removing it if the comparison takes too long.
    SVR(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    MLPRegressor(solver='lbfgs', max_iter=100),
    XGBRegressor(n_estimators=100, objective='reg:squarederror'),
    LGBMRegressor(n_estimators=100),
]
# Cross-validate every candidate model on the log-transformed target and
# collect the per-fold MAE, one column per model.
n_folds = 5  # single source of truth for the fold count and the row labels
result = dict()
for model in models:
    start = time.time()
    # Use the estimator's class name as the column label; this does not
    # depend on how the estimator's repr happens to be formatted.
    model_name = type(model).__name__
    scores = cross_val_score(model, X=train_X, y=train_Y_ln, verbose=0, cv=n_folds,
                             scoring=make_scorer(mean_absolute_error))
    result[model_name] = scores
    end = time.time()
    print("total time:", end - start)
    # The original message lacked the separating space and misspelled "Finished".
    print(model_name + ' is Finished!')

result = pd.DataFrame(result)
result.index = ['cv' + str(x) for x in range(1, n_folds + 1)]
result
调参方法
三种调参方法(贪心算法、网格调参、贝叶斯调参)
贪心算法
# Candidate values for the LightGBM parameters explored by the tuning code.
objective = [
    'regression',
    'regression_l1',
    'mape',
    'huber',
    'fair',
]
num_leaves = [3, 5, 10, 15, 20, 40, 55]
max_depth = [3, 5, 10, 15, 20, 40, 55]
# No candidate values are listed for the remaining three parameters.
bagging_fraction = list()
feature_fraction = list()
drop_rate = list()
# Greedy tuning: optimise one parameter at a time, freezing the value with the
# lowest mean cross-validated MAE before moving on to the next parameter.
def _mean_cv_mae(estimator):
    """Return the mean five-fold CV MAE of *estimator* on the log target."""
    fold_scores = cross_val_score(estimator, X=train_X, y=train_Y_ln, verbose=0, cv=5,
                                  scoring=make_scorer(mean_absolute_error))
    return np.mean(fold_scores)


# Step 1: objective function.
best_obj = {obj: _mean_cv_mae(LGBMRegressor(objective=obj)) for obj in objective}
chosen_objective = min(best_obj, key=best_obj.get)

# Step 2: number of leaves, with the objective fixed.
best_leaves = {
    leaves: _mean_cv_mae(LGBMRegressor(objective=chosen_objective, num_leaves=leaves))
    for leaves in num_leaves
}
chosen_leaves = min(best_leaves, key=best_leaves.get)

# Step 3: maximum depth, with objective and number of leaves fixed.
best_depth = {
    depth: _mean_cv_mae(LGBMRegressor(objective=chosen_objective,
                                      num_leaves=chosen_leaves,
                                      max_depth=depth))
    for depth in max_depth
}

# Best score after each greedy step. The first value (0.143) is a hard-coded
# starting score whose origin is not shown in this file.
stage_names = ['0_initial', '1_turning_obj', '2_turning_leaves', '3_turning_depth']
stage_best = [0.143, min(best_obj.values()), min(best_leaves.values()), min(best_depth.values())]
sns.lineplot(x=stage_names, y=stage_best)
网格搜索
from sklearn.model_selection import GridSearchCV

# Exhaustive search over every combination of the candidate parameter values.
start = time.time()
parameters = {'objective': objective, 'num_leaves': num_leaves, 'max_depth': max_depth}
model = LGBMRegressor()
# Search on the log-transformed target and rank candidates by MAE, matching
# how every other model in this file is evaluated. The original fitted on the
# raw price with the estimator's default score, so the selected parameters
# were tuned for a different target and metric than the one reported.
clf = GridSearchCV(model, parameters, cv=5, scoring='neg_mean_absolute_error')
clf = clf.fit(train_X, train_Y_ln)
end = time.time()
print("total Time:", end - start)
clf.best_params_
# Time a five-fold cross-validation of LightGBM with fixed parameter values.
# NOTE(review): these values are presumably the ones the grid search
# reported; confirm against its output.
start = time.time()
model = LGBMRegressor(objective='regression', num_leaves=55, max_depth=15)
fold_mae = cross_val_score(model, X=train_X, y=train_Y_ln, verbose=0, cv=5,
                           scoring=make_scorer(mean_absolute_error))
print(np.mean(fold_mae))
end = time.time()
print("total Time:", end - start)
贝叶斯调参
#贝叶斯调参
from bayes_opt import BayesianOptimization
def rf_cv(num_leaves, max_depth, subsample, min_child_samples):
    """Objective function for Bayesian optimisation of LightGBM parameters.

    Args:
        num_leaves: Proposed number of leaves; truncated to int.
        max_depth: Proposed maximum tree depth; truncated to int.
        subsample: Proposed row-subsampling fraction.
        min_child_samples: Proposed minimum samples per leaf; truncated to int.

    Returns:
        ``1 - mean MAE`` over five-fold cross-validation on the log target,
        so a larger return value means a lower error.
    """
    val = cross_val_score(
        LGBMRegressor(objective='regression_l1',
                      num_leaves=int(num_leaves),
                      max_depth=int(max_depth),
                      subsample=subsample,
                      # LightGBM only applies row subsampling when the bagging
                      # frequency is non-zero; with the default of 0 the
                      # `subsample` value being tuned would have no effect.
                      subsample_freq=1,
                      min_child_samples=int(min_child_samples)
                      ),
        X=train_X, y=train_Y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error)
    ).mean()
    return 1 - val
# (lower, upper) search bounds for each argument of rf_cv.
search_bounds = {
    'num_leaves': (2, 100),
    'max_depth': (2, 100),
    'subsample': (0.1, 1),
    'min_child_samples': (2, 100),
}
rf_bo = BayesianOptimization(rf_cv, search_bounds)
rf_bo.maximize()
# rf_cv returns 1 - MAE, so subtract the best target from 1 to recover the MAE.
1 - rf_bo.max['target']
# Ways to improve accuracy: score after each stage (values are hard-coded here).
stage_labels = ['0_origin', '1_log_transfer', '2_L1_&_L2', '3_change_model', '4_parameter_turning']
stage_scores = [1.36, 0.19, 0.19, 0.14, 0.13]
plt.figure(figsize=(13, 5))
sns.lineplot(x=stage_labels, y=stage_scores)
总结
1、首先需要根据不同的模型进行数据处理,选择符合模型的数据处理方式;
2、不同模型的构建方法;
3、三种不同的调参方法(较好的是贝叶斯);