DataWhale Data Mining (Used Car Price Prediction): Checkpoint 4


Modeling and Parameter Tuning

Goals

1. Understand the commonly used machine learning models.
2. Master the modeling and parameter-tuning workflow (tuning is driven mainly by model performance).

线性回归模型

import numpy as np
import pandas as pd

# sample_feature and continuous_feature_name carry over from the
# feature-engineering checkpoint
sample_feature = sample_feature.dropna().replace('-', 0).reset_index(drop=True)
sample_feature['notRepairedDamage'] = sample_feature['notRepairedDamage'].astype(np.float32)
train = sample_feature[continuous_feature_name + ['price']]

train_X = train[continuous_feature_name]
train_Y = train['price']
print(train_X.shape)
print(train_Y.shape)

# Build the linear regression (LR) model
from sklearn.linear_model import LinearRegression

# Note: the `normalize` argument was removed from LinearRegression in
# scikit-learn 1.2; on recent versions, standardize the features with
# StandardScaler instead
model = LinearRegression(normalize=True)
model = model.fit(train_X, train_Y)

# Inspect the fitted intercept and weights (coefficients)
print('intercept:' + str(model.intercept_))

# Sort the features by weight, largest first
sorted(dict(zip(continuous_feature_name, model.coef_)).items(), key=lambda x: x[1], reverse=True)

Visualizing the label distribution:

import matplotlib.pyplot as plt
import seaborn as sns

# distplot is deprecated in seaborn >= 0.11; histplot(..., kde=True) is the modern equivalent
print('It is clear to see the price shows a typical exponential distribution')
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
sns.distplot(train_Y)
plt.subplot(1, 2, 2)
sns.distplot(train_Y[train_Y < np.quantile(train_Y, 0.9)])

(Figure: distribution of the raw price, full range and bottom 90%.)
The price clearly has a long tail, so we apply a log transform.

# After a log(1 + x) transform the price is roughly normally distributed
train_Y_ln = np.log(train_Y + 1)   # equivalently, np.log1p(train_Y)

print('The transformed price seems like normal distribution')
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
sns.distplot(train_Y_ln)
plt.subplot(1, 2, 2)
sns.distplot(train_Y_ln[train_Y_ln < np.quantile(train_Y_ln, 0.9)])
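
A practical point the post leaves implicit: a model trained on train_Y_ln predicts in log space, so predictions must be mapped back with the inverse transform before being read as prices. A minimal sketch, assuming the model is refit on the log-transformed labels:

# Refit on log labels, then invert log(y + 1) with expm1 to recover prices
model_ln = LinearRegression().fit(train_X, train_Y_ln)
pred_price = np.expm1(model_ln.predict(train_X))   # same as np.exp(pred) - 1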

Five-fold Cross-Validation

In practice a model usually fits its own training set quite well, but fits data outside the training set noticeably worse. So instead of training on the entire dataset, we hold out a portion (which takes no part in training) and use it to test the parameters learned from the rest, giving a relatively objective measure of how well those parameters generalize to unseen data. This idea is called cross-validation.

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer

# Wrap a metric so both y and yhat are log-transformed before scoring
def log_transfer(func):
    def wrapper(y, yhat):
        result = func(np.log(y), np.nan_to_num(np.log(yhat)))
        return result
    return wrapper

# Five-fold CV with the linear model on the raw (untransformed) labels
scores = cross_val_score(model, X=train_X, y=train_Y, verbose=1, cv=5,
                         scoring=make_scorer(log_transfer(mean_absolute_error)))
print("AVG:", np.mean(scores))

# Five-fold CV on the log-transformed labels; they are already in log space,
# so plain MAE is the right metric (applying log_transfer again would double-log)
scores = cross_val_score(model, X=train_X, y=train_Y_ln, verbose=1, cv=5,
                         scoring=make_scorer(mean_absolute_error))
print("log_AVG:", np.mean(scores))

# Show the per-fold scores as a one-row table
scores = pd.DataFrame(scores.reshape(1, -1))
scores.columns = ['cv' + str(x) for x in range(1, 6)]
scores.index = ['MAE']
scores

Since we cannot see into the future, five-fold cross-validation can paint an unrealistic picture on time-dependent data: using 2018 used-car prices to predict 2017 prices is clearly unreasonable. An alternative is to split the dataset by time. Here we take the earliest 4/5 of the samples as the training set and the latest 1/5 as the validation set; the result ends up close to that of five-fold cross-validation. A minimal sketch of such a split follows.
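
This sketch assumes the rows of sample_feature are already sorted by time (in this dataset a date column such as creatDate would serve as the sort key; treat that column name as an assumption):

# Time-ordered split: earliest 4/5 for training, latest 1/5 for validation
split_point = len(sample_feature) // 5 * 4

train_ts = sample_feature.iloc[:split_point]
val_ts = sample_feature.iloc[split_point:]

model_ts = LinearRegression().fit(train_ts[continuous_feature_name],
                                  np.log(train_ts['price'] + 1))
pred = model_ts.predict(val_ts[continuous_feature_name])
print(mean_absolute_error(np.log(val_ts['price'] + 1), pred))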

Multiple Linear Models

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
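
The post imports Ridge and Lasso but stops there; a brief comparison sketch (mine, not the original's) runs all three through the same five-fold CV on the log-transformed labels:

# Compare OLS, L2-regularized (Ridge) and L1-regularized (Lasso) regression
for m in [LinearRegression(), Ridge(), Lasso()]:
    name = str(m).split('(')[0]
    s = cross_val_score(m, X=train_X, y=train_Y_ln, cv=5,
                        scoring=make_scorer(mean_absolute_error))
    print(name, np.mean(s))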

Nonlinear candidates: decision tree, GBDT, XGBoost, and LightGBM models.

Nonlinear Models

import time

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR  # SVR, not SVC: this is a regression task, and SVC rejects continuous labels
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm.sklearn import LGBMRegressor

models = [LinearRegression(),
          SVR(),
          DecisionTreeRegressor(),
          RandomForestRegressor(),
          GradientBoostingRegressor(),
          MLPRegressor(solver='lbfgs', max_iter=100),
          XGBRegressor(n_estimators=100, objective='reg:squarederror'),
          LGBMRegressor(n_estimators=100)]

# Run each model through the same five-fold CV and time it
result = dict()
for model in models:
    start = time.time()
    model_name = str(model).split('(')[0]
    scores = cross_val_score(model, X=train_X, y=train_Y_ln, verbose=0, cv=5,
                             scoring=make_scorer(mean_absolute_error))
    result[model_name] = scores
    end = time.time()
    print("total time:", end - start)
    print(model_name + ' is finished!')

result = pd.DataFrame(result)
result.index = ['cv' + str(x) for x in range(1, 6)]
result

Tuning Methods

Three tuning methods are covered: greedy tuning, grid search, and Bayesian optimization.

Greedy Tuning

The idea: tune one parameter at a time, fix it at the best value found, then move on to the next parameter. This is fast but can miss interactions between parameters.

# Candidate LGBM parameter values:
objective = ['regression', 'regression_l1', 'mape', 'huber', 'fair']

num_leaves = [3, 5, 10, 15, 20, 40, 55]
max_depth = [3, 5, 10, 15, 20, 40, 55]
bagging_fraction = []  # candidate lists left unfilled in the original
feature_fraction = []
drop_rate = []

# Greedy tuning, stage 1: pick the best objective
best_obj = dict()
for obj in objective:
    model = LGBMRegressor(objective=obj)
    score = np.mean(cross_val_score(model, X=train_X, y=train_Y_ln, verbose=0, cv=5,
                                    scoring=make_scorer(mean_absolute_error)))
    best_obj[obj] = score

# Stage 2: fix the best objective, tune num_leaves
best_leaves = dict()
for leaves in num_leaves:
    model = LGBMRegressor(objective=min(best_obj.items(), key=lambda x: x[1])[0],
                          num_leaves=leaves)
    score = np.mean(cross_val_score(model, X=train_X, y=train_Y_ln, verbose=0, cv=5,
                                    scoring=make_scorer(mean_absolute_error)))
    best_leaves[leaves] = score

# Stage 3: fix objective and num_leaves, tune max_depth
best_depth = dict()
for depth in max_depth:
    model = LGBMRegressor(objective=min(best_obj.items(), key=lambda x: x[1])[0],
                          num_leaves=min(best_leaves.items(), key=lambda x: x[1])[0],
                          max_depth=depth)
    score = np.mean(cross_val_score(model, X=train_X, y=train_Y_ln, verbose=0, cv=5,
                                    scoring=make_scorer(mean_absolute_error)))
    best_depth[depth] = score

# Plot the best MAE after each stage (0.143 is the untuned baseline)
sns.lineplot(x=['0_initial', '1_turning_obj', '2_turning_leaves', '3_turning_depth'],
             y=[0.143, min(best_obj.values()), min(best_leaves.values()), min(best_depth.values())])

Grid Search

from sklearn.model_selection import GridSearchCV

# Exhaustive search over the full parameter grid (slow: every combination is tried);
# fit on the log-transformed labels, consistent with the CV above
start = time.time()
parameters = {'objective': objective, 'num_leaves': num_leaves, 'max_depth': max_depth}
model = LGBMRegressor()
clf = GridSearchCV(model, parameters, cv=5)
clf = clf.fit(train_X, train_Y_ln)
end = time.time()
print("total Time:", end - start)

clf.best_params_

# Re-evaluate an LGBM configured with the best parameters found by the grid search
start = time.time()
model = LGBMRegressor(objective='regression',
                      num_leaves=55,
                      max_depth=15)
print(np.mean(cross_val_score(model, X=train_X, y=train_Y_ln, verbose=0, cv=5,
                              scoring=make_scorer(mean_absolute_error))))
end = time.time()
print("total Time:", end - start)

Bayesian Optimization

from bayes_opt import BayesianOptimization

# Objective for the Bayesian optimizer: returns 1 - MAE because
# BayesianOptimization maximizes its target
def rf_cv(num_leaves, max_depth, subsample, min_child_samples):
    val = cross_val_score(
        LGBMRegressor(objective='regression_l1',
                      num_leaves=int(num_leaves),      # bayes_opt proposes floats; cast to int
                      max_depth=int(max_depth),
                      subsample=subsample,
                      min_child_samples=int(min_child_samples)),
        X=train_X, y=train_Y_ln, verbose=0, cv=5,
        scoring=make_scorer(mean_absolute_error)
    ).mean()
    return 1 - val

# Search space bounds for each hyperparameter
rf_bo = BayesianOptimization(
    rf_cv,
    {
        'num_leaves': (2, 100),
        'max_depth': (2, 100),
        'subsample': (0.1, 1),
        'min_child_samples': (2, 100)
    }
)

rf_bo.maximize()
1 - rf_bo.max['target']   # recover the best MAE
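
A follow-up sketch (mine, not from the post) refits LGBM with the best parameters the search found; rf_bo.max['params'] is how bayes_opt exposes the best point:

# Refit with the best parameters from the Bayesian search
best = rf_bo.max['params']
model = LGBMRegressor(objective='regression_l1',
                      num_leaves=int(best['num_leaves']),
                      max_depth=int(best['max_depth']),
                      subsample=best['subsample'],
                      min_child_samples=int(best['min_child_samples']))
print(np.mean(cross_val_score(model, X=train_X, y=train_Y_ln, cv=5,
                              scoring=make_scorer(mean_absolute_error))))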

How each step improved accuracy

# Mean MAE after each step: raw baseline, log transform, L1/L2 regularization,
# switching to LGBM, and parameter tuning
plt.figure(figsize=(13, 5))
sns.lineplot(x=['0_origin', '1_log_transfer', '2_L1_&_L2', '3_change_model', '4_parameter_turning'],
             y=[1.36, 0.19, 0.19, 0.14, 0.13])

Summary

1. Preprocess the data to suit each model; different models call for different data handling.
2. Know how to build each of the different models.
3. Of the three tuning methods, Bayesian optimization gave the best results.
