heywhale Practice Competition: Homestay Price Prediction baseline (online validation 5.258)

I. Notes

The data comes from the Heywhale Community (和鲸社区) practice/newcomer competition: Homestay Price Prediction.
1. Overall workflow:

  1. Data cleaning
  2. Automated hyperparameter tuning + model building
  3. Model ensembling

2. Notes:

  1. This writeup draws on several related articles: the official baseline, another baseline that does automated tuning with Hyperopt (my single model did not reach the score that author reported; only after adjusting part of the data processing did it barely get there, possibly due to differences in hardware and data splits), and an article on model ensembling.
  2. The online score is 5.258. There is still room for optimization, but given hardware, time, and my current knowledge, this is where it stands; the entry peaked around 4th place and finished around 7th.

II. Code

1. Import the required packages

import time
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer

from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.linear_model import BayesianRidge
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import GradientBoostingRegressor as GBDT
from sklearn.ensemble import ExtraTreesRegressor as ET
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.ensemble import AdaBoostRegressor as ADA
from sklearn.ensemble import IsolationForest
from scipy.stats import norm, skew
from scipy.special import boxcox1p

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 300)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

2. Load the data and concatenate the train and test sets

train = pd.read_csv(r'C:\Users\hp\Desktop\新建文件夹\民宿预测\训练集.csv')
test = pd.read_csv(r'C:\Users\hp\Desktop\新建文件夹\民宿预测\测试集.csv')
df_features = pd.concat([train, test], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
df_features.head()
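A quick sanity check (assuming, as in this competition, that the test set carries no 价格 column, so test rows show up as missing prices in the combined frame):

print(train.shape, test.shape, df_features.shape)
print(df_features['价格'].isnull().sum())  # should equal len(test)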

3. Data processing

# Inspect the combined dataset
df_features.info()
# Missing-value rate per column
all_data_na = (df_features.isnull().sum() / len(df_features)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'缺失率': all_data_na})
missing_data.head(20)
# Fill missing values
df_features['房主回复率'].fillna('-1', inplace=True)
df_features['房主回复率'] = df_features['房主回复率'].astype(str).apply(lambda x: x.replace('%', ''))
df_features['房主回复率'] = df_features['房主回复率'].astype(int)
# Fill each feature with a value that matches its real-world meaning
feature1 = ['床的类型', '邮编', '民宿周边', '房主身份是否验证', '房主是否有个人资料图片']
for i in feature1:
    df_features[i] = df_features[i].fillna('na')

feature2 = ['评论个数', '洗手间数量']
for i in feature2:
    df_features[i] = df_features[i].fillna(0)

feature3 = ['民宿评分', '卧室数量', '取消条款', '床的数量', '经度', '维度', '房主回复率']
for i in feature3:
    df_features[i] = df_features[i].fillna(df_features[i].mode()[0])
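A one-line check (not in the original run) to confirm the fills covered these columns:

print(df_features[feature1 + feature2 + feature3].isnull().sum().sum())  # expect 0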
df_features.dtypes[df_features.dtypes != 'object'].index  # inspect which columns are already numeric
# Label-encode the low-cardinality string columns
for feat in ['房主是否有个人资料图片', '房主身份是否验证', '民宿周边', '邮编']:
    lbl = LabelEncoder()
    lbl.fit(df_features[feat])
    df_features[feat] = lbl.transform(df_features[feat])

# Frequency encoding: map each category to its relative frequency in the data
def freq_enc(df, col):
    vc = df[col].value_counts(dropna=True, normalize=True).to_dict()
    df[f'{col}_freq'] = df[col].map(vc)
    return df

for feat in ['容纳人数', '洗手间数量', '床的数量', '床的类型',
             '卧室数量', '取消条款', '所在城市', '清洁费',
             '房主是否有个人资料图片', '房主回复率', '是否支持随即预订',
             '民宿周边', '房产类型', '房型', '邮编']:
    df_features = freq_enc(df_features, feat)
df_features.head(5)
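To make the frequency encoding concrete, here is what freq_enc does on a toy column (the room-type values are made up for illustration):

toy = pd.DataFrame({'房型': ['整套', '整套', '单间', '合住']})
print(freq_enc(toy, '房型'))
# 房型_freq is 0.50 for the two 整套 rows and 0.25 for 单间 and 合住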
# Time features
from tqdm import tqdm

df_features['首次评论日期'] = pd.to_datetime(df_features['首次评论日期']).values.astype(np.int64) // 10 ** 9
df_features['何时成为房主'] = pd.to_datetime(df_features['何时成为房主']).values.astype(np.int64) // 10 ** 9
df_features['最近评论日期'] = pd.to_datetime(df_features['最近评论日期']).values.astype(np.int64) // 10 ** 9

# Gaps between the three timestamps (in seconds)
df_features['待业时间'] = df_features['首次评论日期'] - df_features['何时成为房主']
df_features['开业时间'] = df_features['最近评论日期'] - df_features['首次评论日期']
df_features['购房时间'] = df_features['最近评论日期'] - df_features['何时成为房主']
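One caveat, assuming these date columns can contain missing values: pd.to_datetime turns them into NaT, and casting NaT to int64 yields the minimum int64 value, which becomes an extreme outlier after the division. A safer variant of the conversion above (a sketch, not what was run here) masks those rows back to NaN:

for col in ['首次评论日期', '何时成为房主', '最近评论日期']:
    ts = pd.to_datetime(df_features[col])
    df_features[col] = ts.values.astype(np.int64) // 10 ** 9
    df_features.loc[ts.isna(), col] = np.nan  # keep genuinely missing dates as NaN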


def brute_force(df, features, groups):
    # Group-wise aggregation features: for every (group, feature) pair, add the
    # max/min/mean/median/std of the feature within that group
    for method in tqdm(['max', 'min', 'mean', 'median', 'std']):
        for feature in features:
            for group in groups:
                df[f'{group}_{feature}_{method}'] = df.groupby(group)[feature].transform(method)

    return df


dense_feats = ['待业时间', '开业时间', '购房时间']
# cate_feats = ['房型']
cate_feats = []  # with an empty group list, the call below adds no columns

df_features = brute_force(df_features, dense_feats, cate_feats)
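If you enable the commented-out grouping by 房型, each listing would get group-level statistics of the three time gaps (not used in this baseline's final run):

df_features = brute_force(df_features, dense_feats, ['房型'])
# adds columns like 房型_开业时间_mean, 房型_待业时间_std, ... (5 stats x 3 features = 15 columns)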
# One-hot encoding
for feat in ['床的类型', '所在城市', '是否支持随即预订', '房产类型', '房型']:
    df_oh = pd.get_dummies(df_features[feat], prefix=feat)
    df_features = pd.concat([df_features, df_oh], axis=1)
df_features.head()
# Binary indicators: does the listing have at least one bed / bedroom / bathroom?
def f(x):
    if x > 0:
        return 1
    else:
        return 0
df_features['if_bed'] = df_features['床的数量'].apply(f)
df_features['if_bedroom'] = df_features['卧室数量'].apply(f)
df_features['if_wc'] = df_features['洗手间数量'].apply(f)

# Cross-derived features
df_features['人均床数量'] = df_features['容纳人数'] / (df_features['床的数量'] + 1e-3)  # 1e-3 avoids division by zero
df_features['人均卧室量'] = df_features['容纳人数'] / (df_features['卧室数量'] + 1e-3)
df_features['卧室床均量'] = df_features['床的数量'] / (df_features['卧室数量'] + 1e-3)
df_features['经纬度平方根'] = (df_features['维度'] ** 2 + df_features['经度'] ** 2) ** .5  # Euclidean norm of (lat, lng)

# Pairwise statistics over the derived ratio features
def get_features(df):
    features = [['人均床数量', '人均卧室量'], ['卧室床均量', '人均卧室量']]
    for fea in features:
        df[f'{fea[0]}_{fea[1]}_std'] = df[fea].std(1)
        df[f'{fea[0]}_{fea[1]}_max'] = df[fea].max(1)
        df[f'{fea[0]}_{fea[1]}_min'] = df[fea].min(1)

        df[f'{fea[0]}_{fea[1]}_sub'] = df[fea[0]] - df[fea[1]]

    return df

df_features = get_features(df_features)
# TF-IDF + truncated SVD on the amenities text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

df_features['便利设施数量'] = df_features['便利设施'].apply(lambda x: len(x.lstrip('{').rstrip('}').split(',')))
df_features['便利设施'] = df_features['便利设施'].apply(
    lambda x: x.replace('{', '').replace('}', '').replace('"', '').replace(':', '').replace(',', ' '))
# df_features['便利设施'] = df_features['便利设施'].str.lower()

n_components = 12

X1 = list(df_features['便利设施'].values)
tfv = TfidfVectorizer(ngram_range=(1, 1), max_features=10000)
tfv.fit(X1)
X_tfidf = tfv.transform(X1)
svd = TruncatedSVD(n_components=n_components)
svd.fit(X_tfidf)
X_svd = svd.transform(X_tfidf)

for i in range(n_components):
    df_features[f'便利设施_tfidf_{i}'] = X_svd[:, i]
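To sanity-check the choice of n_components = 12, it is worth looking at how much of the TF-IDF variance the SVD retains (a quick diagnostic, not part of the original run):

print(svd.explained_variance_ratio_.sum())     # total variance retained by the 12 components
print(svd.explained_variance_ratio_.round(3))  # per-component contribution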
# Label-encode the cleaned amenities string itself
le = LabelEncoder()
le.fit(df_features['便利设施'].values.tolist())
# After transform, the column holds integers in [0, n-1], i.e. indices into le.classes_
df_features['便利设施'] = le.transform(df_features['便利设施'].values.tolist())
df_features['便利设施']
# Split back into train (price known) and test (price missing)
df_train = df_features[~df_features['价格'].isnull()]
df_train = df_train.reset_index(drop=True)
df_test = df_features[df_features['价格'].isnull()]
df_train
no_features = ['数据ID', '价格']
# Input feature columns
features = [col for col in df_train.columns if col not in no_features]

X = df_train[features]  # training inputs
y = df_train['价格']  # training labels
X_test = df_test[features]  # test inputs
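Depending on the raw data types, some of the one-hot-encoded source columns (e.g. 床的类型) may still be object dtype at this point, which the sklearn-style tree APIs will reject. A defensive pass like this sketch label-encodes any remaining string columns, fitting on the combined frame so train and test share the mapping:

for col in X.columns[X.dtypes == 'object']:
    lbl = LabelEncoder()
    lbl.fit(df_features[col].astype(str))          # fit on train + test together
    X.loc[:, col] = lbl.transform(X[col].astype(str))
    X_test.loc[:, col] = lbl.transform(X_test[col].astype(str))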

4. Model building and hyperparameter tuning

First model: LightGBM
Hyperparameter tuning with Hyperopt; for the details, see the articles linked at the beginning.

from sklearn.model_selection import train_test_split
x_train_all, x_predict, y_train_all, y_predict = train_test_split(X, y, test_size=0.10, random_state=100)

x_train, x_test, y_train, y_test = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=100)

train_data = lgb.Dataset(data=x_train,label=y_train)
test_data = lgb.Dataset(data=x_test,label=y_test)

from hyperopt import fmin, tpe, hp, partial

# Define the hyperopt search space
space = {"max_depth": hp.randint("max_depth", 200),
         "num_trees": hp.randint("num_trees", 30000),
         'learning_rate': hp.uniform('learning_rate', 1e-3, 5e-1),
         "bagging_fraction": hp.randint("bagging_fraction", 5),
         "num_leaves": hp.randint("num_leaves", 400),
         "min_child_sample": hp.randint("min_child_sample", 100),
         }

def argsDict_tranform(argsDict, isPrint=False):
    # Shift/scale the raw hp.randint samples into usable ranges.
    # Work on a copy so repeated calls do not keep shifting the same dict.
    argsDict = dict(argsDict)
    argsDict["max_depth"] = argsDict["max_depth"] + 10
    argsDict['num_trees'] = argsDict['num_trees'] + 500
    argsDict["learning_rate"] = argsDict["learning_rate"] + 1e-3
    argsDict["bagging_fraction"] = argsDict["bagging_fraction"] * 0.1 + 0.5
    argsDict["num_leaves"] = argsDict["num_leaves"] + 15
    if isPrint:
        print(argsDict)

    return argsDict
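After this shift, the effective search ranges are roughly: max_depth in [10, 209], num_trees in [500, 30499], learning_rate in [0.002, 0.501], bagging_fraction in {0.5, 0.6, 0.7, 0.8, 0.9}, and num_leaves in [15, 414]. Note that min_child_sample is sampled but never forwarded into the LightGBM params below, so it has no effect on the search.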

from sklearn.metrics import mean_squared_error

def lightgbm_factory(argsDict):
    argsDict = argsDict_tranform(argsDict)

    params = {'nthread': -1,  # number of threads
              'max_depth': argsDict['max_depth'],  # maximum tree depth
              'num_trees': argsDict['num_trees'],  # number of trees (alias of num_iterations; may conflict with num_boost_round below)
              'eta': argsDict['learning_rate'],  # learning rate
              'bagging_fraction': argsDict['bagging_fraction'],  # row subsampling ratio
              'num_leaves': argsDict['num_leaves'],  # maximum number of leaves per tree
              'objective': 'regression',
              'feature_fraction': 0.7,  # column subsampling ratio
              'bagging_seed': 100,  # random seed for bagging
              }
    params['metric'] = ['rmse']

    model_lgb = lgb.train(params, train_data, num_boost_round=300, valid_sets=[test_data], early_stopping_rounds=100)

    return get_tranformer_score(model_lgb)

def get_tranformer_score(tranformer):
    # Score a trained model by MSE on the held-out 10% split
    model = tranformer
    prediction = model.predict(x_predict, num_iteration=model.best_iteration)

    return mean_squared_error(y_predict, prediction)

algo = partial(tpe.suggest, n_startup_jobs=1)
best = fmin(lightgbm_factory, space, algo=algo, max_evals=20, pass_expr_memo_ctrl=None)
MSE = lightgbm_factory(best)  # retrain once with the best raw params; returns the held-out MSE
print('best :', best)
print('best param after transform :')
argsDict_tranform(best, isPrint=True)
print('rmse of the best lightgbm:', np.sqrt(MSE))

Five-fold cross-validation

n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=1314)

params = {
    'learning_rate': 0.005716374310109417,
    'bagging_fraction': 0.5800000000000001,
    'num_leaves': 127,
    'n_estimators': 13022,
    'max_depth': 43,
    'min_child_samples': 56,
    'lambda_l1': 12,
    'lambda_l2': 25,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mae',
    'bagging_freq': 5,
    'verbose': -1,
    'seed': 2022,
    'n_jobs': -1,
}
oof_lgb = np.zeros(len(X))
predictions_lgb = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    X_train, X_valid = X[features].iloc[train_index], X[features].iloc[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric='rmse',
              verbose=50, early_stopping_rounds=200)
    y_pred_valid = model.predict(X_valid)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
    oof_lgb[valid_index] = y_pred_valid.reshape(-1, )
    predictions_lgb += y_pred
predictions_lgb /= n_fold
# Out-of-fold RMSE over the full training set
score = mean_squared_error(df_train['价格'].values, oof_lgb, squared=False)
score

Second model: XGBoost

x_train_all, x_predict, y_train_all, y_predict = train_test_split(X, y, test_size=0.10, random_state=100)

x_train, x_test, y_train, y_test = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=100)

dtrain = xgb.DMatrix(data=x_train,label=y_train)
dtest = xgb.DMatrix(data=x_test,label=y_test)

evallist = [(dtest, 'eval'), (dtrain, 'train')]

space = {"max_depth": hp.randint("max_depth", 200),
         "n_estimators": hp.randint("n_estimators", 30000),
         'learning_rate': hp.uniform('learning_rate', 1e-3, 5e-1),
         "subsample": hp.randint("subsample", 5),
         "min_child_weight": hp.randint("min_child_weight", 6),
         }

def argsDict_tranform(argsDict, isPrint=False):
    # Same idea as for LightGBM: map the raw samples into usable ranges (on a copy)
    argsDict = dict(argsDict)
    argsDict["max_depth"] = argsDict["max_depth"] + 10
    argsDict['n_estimators'] = argsDict['n_estimators'] + 300
    argsDict["learning_rate"] = argsDict["learning_rate"] + 5e-4
    argsDict["subsample"] = argsDict["subsample"] * 0.1 + 0.5
    argsDict["min_child_weight"] = argsDict["min_child_weight"] + 1
    if isPrint:
        print(argsDict)

    return argsDict
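The XGBoost search ranges after the shift are analogous: max_depth in [10, 209], n_estimators in [300, 30299], learning_rate in [0.0015, 0.5005], subsample in {0.5, ..., 0.9}, and min_child_weight in [1, 6]. Note that n_estimators ends up in params, but xgb.train below trains for a fixed 300 rounds, so it is effectively unused.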
def xgboost_factory(argsDict):
    argsDict = argsDict_tranform(argsDict)

    params = {'nthread': -1,  # number of threads
              'max_depth': argsDict['max_depth'],  # maximum tree depth
              'n_estimators': argsDict['n_estimators'],  # number of trees
              'eta': argsDict['learning_rate'],  # learning rate
              'subsample': argsDict['subsample'],  # row subsampling ratio
              'min_child_weight': argsDict['min_child_weight'],  # minimum sum of instance weight in a leaf
              'objective': 'reg:squarederror',  # squared-error regression ('reg:linear' in older versions)
              'silent': 0,  # verbosity
              'gamma': 0,  # minimum loss reduction required for a split
              'colsample_bytree': 0.7,  # column subsampling ratio
              'alpha': 0,  # L1 regularization
              'lambda': 0,  # L2 regularization
              'scale_pos_weight': 0,  # values > 0 help convergence on imbalanced data
              'seed': 2022  # random seed
              }
    params['eval_metric'] = ['rmse']

    xrf = xgb.train(params, dtrain, 300, evallist, early_stopping_rounds=100)

    return get_tranformer_score(xrf)

def get_tranformer_score(tranformer):
    # Score a trained booster by MSE on the held-out 10% split
    xrf = tranformer
    dpredict = xgb.DMatrix(x_predict)
    prediction = xrf.predict(dpredict, ntree_limit=xrf.best_ntree_limit)

    return mean_squared_error(y_predict, prediction)

# Run the hyperopt search
algo = partial(tpe.suggest, n_startup_jobs=1)
best = fmin(xgboost_factory, space, algo=algo, max_evals=20, pass_expr_memo_ctrl=None)
MSE = xgboost_factory(best)  # retrain once with the best raw params; returns the held-out MSE
print('best :', best)
print('best param after transform :')
argsDict_tranform(best, isPrint=True)
print('rmse of the best xgboost:', np.sqrt(MSE))
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=1314)
oof_xgb = np.zeros(len(X))
predictions_xgb = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    X_train, X_valid = X[features].iloc[train_index], X[features].iloc[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    model = XGBRegressor(n_estimators=3000,  # number of boosting rounds
                         learning_rate=0.01,  # step size
                         max_depth=19,  # maximum tree depth
                         min_child_weight=7,  # minimum sum of instance weight in a leaf
                         subsample=0.57,  # fraction of rows sampled per tree
                         colsample_bytree=0.8,  # fraction of features sampled per tree (typical: 0.5-1)
                         objective='reg:squarederror',  # squared-error regression ('reg:linear' in older versions)
                         nthread=8,
                         seed=2022)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric='rmse',
              verbose=50, early_stopping_rounds=200)
    y_pred_valid = model.predict(X_valid)
    y_pred = model.predict(X_test, ntree_limit=model.best_ntree_limit)
    oof_xgb[valid_index] = y_pred_valid.reshape(-1, )
    predictions_xgb += y_pred
predictions_xgb /= n_fold
score = mean_squared_error(df_train['价格'].values, oof_xgb, squared=False)
score

Third model: CatBoost

n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=1314)

oof_cat = np.zeros(len(X))
prediction_cat = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    X_train, X_valid = X[features].iloc[train_index], X[features].iloc[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    cate_features = ['房主是否有个人资料图片', '房主身份是否验证', '是否支持随即预订', '房产类型',
                     '房型', 'if_bed', 'if_bedroom', 'if_wc']
    train_pool = Pool(X_train, y_train, cat_features=cate_features)
    eval_pool = Pool(X_valid, y_valid, cat_features=cate_features)
    cbt_model = CatBoostRegressor(iterations=60000,  # note: the reported score used iterations=60000, but it takes quite a while to run
                                  learning_rate=0.01,  # note: in practice convergence was very slow in several runs at lr=0.1; consider raising this later
                                  eval_metric='SMAPE',
                                  use_best_model=True,
                                  random_seed=2022,
                                  logging_level='Verbose',
                                  devices='0',
                                  gpu_ram_part=0.5,
                                  early_stopping_rounds=200)

    cbt_model.fit(train_pool,
                  eval_set=eval_pool,
                  verbose=1000)

    y_pred_valid = cbt_model.predict(X_valid)
    y_pred = cbt_model.predict(X_test)
    oof_cat[valid_index] = y_pred_valid.reshape(-1, )
    prediction_cat += y_pred
prediction_cat /= n_fold

score = mean_squared_error(df_train['价格'].values, oof_cat, squared=False)
print(score)

5. Model ensembling

# Stack the LGB, XGB, and CatBoost predictions with a BayesianRidge meta-model
train_stack = np.vstack([oof_lgb, oof_xgb, oof_cat]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb, prediction_cat]).transpose()
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2022)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, y)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], y.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], y.iloc[val_idx].values
    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)
    oof_stack[val_idx] = clf_3.predict(val_data)
    predictions += clf_3.predict(test_stack) / 10  # 10 folds in total: 5 splits x 2 repeats
mean_squared_error(y.values, oof_stack)
print("CV score: {:<8.8f}".format(mean_squared_error(y.values, oof_stack)))
test['价格'] = predictions
test[['数据ID', '价格']].to_csv(r'C:\Users\hp\Desktop\新建文件夹\民宿预测\sub.csv', index=None)
test[['数据ID', '价格']].head()
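Since BayesianRidge can extrapolate, the stacked predictions can occasionally dip below zero. An optional post-processing step (not applied in the submitted run) is to clip at zero before the to_csv call above:

test['价格'] = np.clip(predictions, 0, None)  # prices cannot be negative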

6. Future work
I put this together near the end of the competition; working during the day and revising my thesis at night left little time, so the result is fairly rough, and this was honestly my first contact with model ensembling, so there is still plenty to work through. Directions to explore (see the blending sketch after this list):
1) More careful data processing
2) Feature selection
3) Further parameter tuning
4) Ensembling more strong models
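As a minimal starting point for 4), a simple weighted average of the three test predictions can be compared against the BayesianRidge stack; the weights here are illustrative, not tuned:

# Hypothetical blend weights; tune them against the out-of-fold predictions
w_lgb, w_xgb, w_cat = 0.4, 0.3, 0.3
blend_oof = w_lgb * oof_lgb + w_xgb * oof_xgb + w_cat * oof_cat
blend_test = w_lgb * predictions_lgb + w_xgb * predictions_xgb + w_cat * prediction_cat
print(mean_squared_error(y.values, blend_oof, squared=False))  # compare to the stacking CV score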
