DataWhale数据挖掘（二手车价格预测）第四次打卡

最新推荐文章于 2022-01-21 23:54:44 发布

JerryZengZ

最新推荐文章于 2022-01-21 23:54:44 发布

阅读量222

点赞数

分类专栏：总结笔记打卡文章标签：机器学习数据挖掘打卡模型调参

本文链接：https://blog.csdn.net/JerryZengZ/article/details/105252207

版权

总结同时被 3 个专栏收录

9 篇文章 0 订阅

订阅专栏

笔记

8 篇文章 0 订阅

订阅专栏

打卡

8 篇文章 0 订阅

订阅专栏

建模调参

学习目标

1、了解常用的机器学习模型；
2、掌握机器学习模型的建模与调参流程
（主要通过模型的效果来调参）。

线性回归模型

# Clean the feature table: drop rows with missing values, replace the '-'
# placeholder with 0, and renumber the remaining rows from 0.
sample_feature = (
    sample_feature
    .dropna()
    .replace('-', 0)
    .reset_index(drop=True)
)

# Store the notRepairedDamage column as float32.
sample_feature['notRepairedDamage'] = (
    sample_feature['notRepairedDamage'].astype(np.float32)
)

# Keep only the continuous feature columns plus the 'price' target.
train = sample_feature[continuous_feature_name + ['price']]

# Split into the feature matrix and the target vector.
train_X = train[continuous_feature_name]
train_Y = train['price']

# Report both shapes as a quick sanity check.
print(train_X.shape)
print(train_Y.shape)

# Build the model (ordinary least-squares linear regression).
from sklearn.linear_model import LinearRegression

# `normalize=True` is intentionally not passed: the parameter was deprecated in
# scikit-learn 1.0 and removed in 1.2, where it raises a TypeError. For
# unregularised least squares the intercept and coefficients are reported on
# the original feature scale either way, so dropping it keeps the fitted
# values (up to floating-point differences).
model = LinearRegression()
model = model.fit(train_X, train_Y)

# Inspect the fitted intercept and the per-feature weights (coef_).
print('intercept:' + str(model.intercept_))

# (feature, coefficient) pairs, largest coefficient first.
sorted(dict(zip(continuous_feature_name, model.coef_)).items(), key=lambda x: x[1], reverse=True)

可视化效果：

import seaborn as sns

print('It is clear to see the price shows a typical exponential distribution')

# Two panels side by side: the full price distribution on the left, and only
# the prices below the 90th percentile on the right.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; confirm
# the installed version still provides it.
plt.figure(figsize=(15, 5))

plt.subplot(1, 2, 1)
sns.distplot(train_Y)

plt.subplot(1, 2, 2)
below_p90 = train_Y < np.quantile(train_Y, 0.9)
sns.distplot(train_Y[below_p90])

在这里插入图片描述
可见数据存在长尾问题，进行log变换

# Apply a log(price + 1) transform to the target; the author expects the
# transformed values to look approximately normal.
train_Y_ln = np.log(train_Y + 1)

print('The transformed price seems like normal distribution')

# Same two-panel layout as for the raw price: full distribution on the left,
# values below the 90th percentile on the right.
plt.figure(figsize=(15, 5))

plt.subplot(1, 2, 1)
sns.distplot(train_Y_ln)

plt.subplot(1, 2, 2)
below_p90_ln = train_Y_ln < np.quantile(train_Y_ln, 0.9)
sns.distplot(train_Y_ln[below_p90_ln])

#五折交叉验证
在实际训练中，模型对训练集的拟合程度通常都比较好（对初始条件敏感），
但对训练集之外的数据的拟合程度往往不那么令人满意。
因此我们通常不会把全部数据都拿来训练，而是留出一部分（这部分不参与训练）用来检验在训练集上得到的参数，
从而相对客观地判断这些参数对训练集之外数据的适用程度，这种思想称为交叉验证（Cross Validation）。
五折交叉验证即把数据均分为5份，轮流取其中1份作验证集、其余4份作训练集，最后对5次的得分取平均。

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error,make_scorer

def log_transfer(func):
    """Wrap a metric so that it is evaluated on log-transformed values.

    Args:
        func: Metric callable invoked as ``func(y_true, y_pred, **kwargs)``.

    Returns:
        A callable with the same calling convention. It applies ``np.log`` to
        both arguments before delegating to ``func``; NaN/infinite values
        produced by the logarithm of the predictions are replaced by
        ``np.nan_to_num``.
    """
    from functools import wraps

    @wraps(func)  # keep the wrapped metric's __name__/__doc__
    def wrapper(y, yhat, **kwargs):
        log_true = np.log(y)
        # Non-positive predictions give nan/-inf under log; nan_to_num turns
        # them into finite numbers before the metric sees them.
        log_pred = np.nan_to_num(np.log(yhat))
        # Extra keyword arguments (e.g. sample_weight) are forwarded unchanged.
        return func(log_true, log_pred, **kwargs)

    return wrapper

# Five-fold cross-validation of the linear model on the raw (untransformed)
# price. The scorer takes the log of both labels and predictions, so the error
# is measured on the log scale.
scores = cross_val_score(model, X=train_X, y=train_Y, verbose=1, cv=5,
                         scoring=make_scorer(log_transfer(mean_absolute_error)))

print("AVG:", np.mean(scores))

# Five-fold cross-validation on the log-transformed price. Labels and
# predictions are already on the log scale here, so plain MAE is used:
# wrapping it in log_transfer would take the logarithm a second time.
scores = cross_val_score(model, X=train_X, y=train_Y_ln, verbose=1, cv=5,
                         scoring=make_scorer(mean_absolute_error))
print("log_AVG:", np.mean(scores))

# Lay the five per-fold scores out as a one-row table.
scores = pd.DataFrame(scores.reshape(1, -1))
scores.columns = ['cv' + str(x) for x in range(1, 6)]
scores.index = ['MAE']
scores

但由于我们并不具有预知未来的能力，五折交叉验证在某些与时间相关的数据集上反而会反映出不真实的情况：
用2018年的二手车价格去预测2017年的二手车价格，这显然是不合理的。因此我们还可以按时间顺序
对数据集进行划分。在本例中，选用时间靠前的4/5样本作为训练集，时间靠后的1/5样本作为验证集，
最终结果与五折交叉验证差距不大。

#多种线性模型
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

决策树模型 GBDT模型 XGBoost模型 LightGBM模型

#非线性模型

import time

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm.sklearn import LGBMRegressor

# Candidate regressors, all compared with the same 5-fold MAE on the
# log-transformed price.
models = [LinearRegression(),
          # SVR rather than SVC: the target is continuous, and SVC is a
          # classifier that cannot be fitted to continuous labels.
          SVR(),
          DecisionTreeRegressor(),
          RandomForestRegressor(),
          GradientBoostingRegressor(),
          MLPRegressor(solver='lbfgs', max_iter=100),
          XGBRegressor(n_estimators=100, objective='reg:squarederror'),
          LGBMRegressor(n_estimators=100)]

# Maps estimator class name -> array of the five per-fold MAE scores.
result = dict()
for model in models:
    start = time.time()
    # The estimator repr starts with its class name, e.g. "SVR(...)".
    model_name = str(model).split('(')[0]
    scores = cross_val_score(model, X=train_X, y=train_Y_ln, verbose=0, cv=5,
                             scoring=make_scorer(mean_absolute_error))
    result[model_name] = scores
    end = time.time()
    print("total time:", end - start)
    print(model_name + 'is Filished!')

# One column per model, one row per fold.
result = pd.DataFrame(result)
result.index = ['cv' + str(x) for x in range(1, 6)]
result

调参方法

三种调参方法（贪心算法、网格调参、贝叶斯调参）

贪心算法

# Candidate values for the LightGBM parameters.
objective = ['regression', 'regression_l1', 'mape', 'huber', 'fair']

num_leaves = [3, 5, 10, 15, 20, 40, 55]
max_depth = [3, 5, 10, 15, 20, 40, 55]
bagging_fraction = []
feature_fraction = []
drop_rate = []

# Greedy tuning: settle one parameter at a time, keeping the value with the
# lowest mean 5-fold MAE before moving on to the next parameter.
mae_scorer = make_scorer(mean_absolute_error)

# Step 1: objective.
best_obj = dict()
for obj in objective:
    model = LGBMRegressor(objective=obj)
    fold_scores = cross_val_score(model, X=train_X, y=train_Y_ln, verbose=0, cv=5, scoring=mae_scorer)
    score = np.mean(fold_scores)
    best_obj[obj] = score
chosen_objective = min(best_obj, key=best_obj.get)

# Step 2: num_leaves, with the objective fixed.
best_leaves = dict()
for leaves in num_leaves:
    model = LGBMRegressor(objective=chosen_objective, num_leaves=leaves)
    fold_scores = cross_val_score(model, X=train_X, y=train_Y_ln, verbose=0, cv=5, scoring=mae_scorer)
    score = np.mean(fold_scores)
    best_leaves[leaves] = score
chosen_leaves = min(best_leaves, key=best_leaves.get)

# Step 3: max_depth, with objective and num_leaves fixed.
best_depth = dict()
for depth in max_depth:
    model = LGBMRegressor(objective=chosen_objective,
                          num_leaves=chosen_leaves,
                          max_depth=depth)
    fold_scores = cross_val_score(model, X=train_X, y=train_Y_ln, verbose=0, cv=5, scoring=mae_scorer)
    score = np.mean(fold_scores)
    best_depth[depth] = score

# Best score after each greedy step; 0.143 is the hard-coded starting value.
step_labels = ['0_initial', '1_turning_obj', '2_turning_leaves', '3_turning_depth']
step_scores = [0.143, min(best_obj.values()), min(best_leaves.values()), min(best_depth.values())]
sns.lineplot(x=step_labels, y=step_scores)

网格搜索

import time

from sklearn.model_selection import GridSearchCV

start = time.time()
# Exhaustively try every combination of the candidate values.
parameters = {'objective': objective, 'num_leaves': num_leaves, 'max_depth': max_depth}
model = LGBMRegressor()
# Select by MAE (negated, because GridSearchCV maximises its score) on the
# log-transformed price, so the search optimises the same quantity that the
# other tuning steps and the follow-up evaluation report. The default scorer
# for a regressor would be R^2 instead.
clf = GridSearchCV(model, parameters, cv=5, scoring='neg_mean_absolute_error')
clf = clf.fit(train_X, train_Y_ln)
end = time.time()
print("total Time:", end - start)

# Best parameter combination found by the search.
clf.best_params_

# Time a 5-fold MAE evaluation of LightGBM with a fixed parameter set
# (objective='regression', num_leaves=55, max_depth=15).
start = time.time()
model = LGBMRegressor(objective='regression', num_leaves=55, max_depth=15)

fold_mae = cross_val_score(model, X=train_X, y=train_Y_ln, verbose=0, cv=5,
                           scoring=make_scorer(mean_absolute_error))
print(np.mean(fold_mae))
end = time.time()
print("total Time:", end - start)

贝叶斯调参

#贝叶斯调参
from bayes_opt import BayesianOptimization

def rf_cv(num_leaves, max_depth, subsample, min_child_samples):
    """Objective for Bayesian optimisation of an L1-objective LGBMRegressor.

    Args:
        num_leaves: Candidate ``num_leaves``; truncated to int.
        max_depth: Candidate ``max_depth``; truncated to int.
        subsample: Candidate row-subsampling fraction, passed through as a float.
        min_child_samples: Candidate ``min_child_samples``; truncated to int.

    Returns:
        ``1 - mean 5-fold MAE`` on the log-transformed price, so that a larger
        return value means a smaller error.
    """
    val = cross_val_score(
        LGBMRegressor(objective = 'regression_l1',
            num_leaves=int(num_leaves),
            max_depth=int(max_depth),
            subsample = subsample,
            # LightGBM only applies `subsample` when subsample_freq > 0 (the
            # default 0 disables bagging); without this the `subsample`
            # dimension of the search has no effect on the model.
            subsample_freq = 1,
            min_child_samples = int(min_child_samples)
        ),
        X=train_X, y=train_Y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error)
    ).mean()
    return 1 - val

# (lower, upper) bounds of the search range for each tuned parameter.
search_bounds = {
    'num_leaves': (2, 100),
    'max_depth': (2, 100),
    'subsample': (0.1, 1),
    'min_child_samples': (2, 100),
}
rf_bo = BayesianOptimization(rf_cv, search_bounds)

# Run the optimiser with its default settings.
rf_bo.maximize()

# The optimised target is 1 - MAE, so convert the best target back to an MAE.
1 - rf_bo.max['target']

# Ways of improving accuracy: the score recorded after each stage of the
# walkthrough, drawn as a single line.
stage_labels = ['0_origin', '1_log_transfer', '2_L1_&_L2', '3_change_model', '4_parameter_turning']
stage_scores = [1.36, 0.19, 0.19, 0.14, 0.13]

plt.figure(figsize=(13, 5))

sns.lineplot(x=stage_labels, y=stage_scores)

总结

1、首先需要根据不同的模型进行数据处理，选择适合该模型的数据处理方式；
2、掌握不同模型的构建方法；
3、掌握三种不同的调参方法（贪心、网格搜索、贝叶斯，其中效果较好的是贝叶斯调参）。

JerryZengZ

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
DataWhale数据挖掘（二手车价格预测）第四次打卡

建模调参特征工程目标1、了解常用的机器学习模型，2、掌握机器学习模型的建模与调参流程（主要通过模型的效果来调参）线性回归模型sample_feature=sample_feature.dropna().replace('-',0).reset_index(drop=True)sample_feature['notRepairedDamage']=sample_feature['not...
复制链接

扫一扫

专栏目录