一、说明
数据来源为(和鲸社区)练习赛-新人赛-民宿价格预测。
1.整体流程:
- 数据清洗
- 自动化调参+模型建立
- 模型融合
2.说明:
- 参考了多篇相关文章,包括官方 baseline、基于 Hyperopt 的自动化调参、另一篇 baseline(但其单模型未能复现作者所说的效果,后来调整了部分数据处理才勉强达到,可能与设备和数据划分有关)以及模型融合。
- 线上得分 5.258,仍有优化空间;限于设备、时间以及知识储备,目前到此为止。排名最高时在第 4 名上下,最终大约在第 7 名上下。
二、代码
1.导入需要的包
import time
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error,make_scorer
from sklearn.model_selection import KFold,RepeatedKFold
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.linear_model import BayesianRidge
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import GradientBoostingRegressor as GBDT
from sklearn.ensemble import ExtraTreesRegressor as ET
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.ensemble import AdaBoostRegressor as ADA
from scipy.stats import norm, skew
from scipy.special import boxcox1p
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
# Widen pandas' display limits so wide/long frames are shown in full.
# Fully-qualified option names are required: the abbreviated patterns
# ('max_columns', 'max_row') match more than one option in current pandas
# (e.g. both display.* and styler.render.*) and raise OptionError.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 300)
# Render floats with five decimals (leading space kept from the original format).
pd.set_option('display.float_format', lambda x: ' %.5f' % x)
2.加载数据,合并训练集和测试集
# Load the training and test CSVs from the author's local paths.
train = pd.read_csv(r'C:\Users\hp\Desktop\新建文件夹\民宿预测\训练集.csv')
test = pd.read_csv(r'C:\Users\hp\Desktop\新建文件夹\民宿预测\测试集.csv')
# Stack train and test so every feature transform is applied to both at once.
# DataFrame.append was removed in pandas 2.0; pd.concat with default
# arguments is the equivalent call (each frame keeps its original index).
df_features = pd.concat([train, test])
df_features.head()
3.数据处理
# Print column dtypes and non-null counts for the combined frame.
df_features.info()

# Missing rate per column, in percent.
na_percent = (df_features.isnull().sum() / len(df_features)) * 100
# Keep only the columns that actually have missing values, largest first.
fully_present = na_percent[na_percent == 0].index
all_data_na = na_percent.drop(fully_present).sort_values(ascending=False)

missing_data = pd.DataFrame({'缺失率': all_data_na})
missing_data.head(20)
# Fill missing values.
# Host response rate: mark missing entries with -1, strip the '%' sign and
# cast to int. The result is assigned back instead of using a chained
# `fillna(..., inplace=True)`, which is deprecated and does not update the
# frame under pandas copy-on-write.
df_features['房主回复率'] = df_features['房主回复率'].fillna('-1')
df_features['房主回复率'] = (
    df_features['房主回复率'].astype(str).str.replace('%', '', regex=False)
)
df_features['房主回复率'] = df_features['房主回复率'].astype(int)

# Fill the remaining columns with a value chosen per feature group.
# Group 1: filled with the placeholder category 'na'.
feature1 = ['床的类型', '邮编', '民宿周边', '房主身份是否验证', '房主是否有个人资料图片']
for col in feature1:
    df_features[col] = df_features[col].fillna('na')

# Group 2: filled with 0.
feature2 = ['评论个数', '洗手间数量']
for col in feature2:
    df_features[col] = df_features[col].fillna(0)

# Group 3: filled with the column's most frequent value (mode).
# '房主回复率' has no missing values left at this point, so it is a no-op here.
feature3 = ['民宿评分', '卧室数量', '取消条款', '床的数量', '经度', '维度', '房主回复率']
for col in feature3:
    df_features[col] = df_features[col].fillna(df_features[col].mode()[0])

# Notebook-style inspection: names of the non-object (numeric) columns.
df_features.dtypes[df_features.dtypes != 'object'].index
# Replace each listed categorical column with integer codes from a
# LabelEncoder fitted on that column's own values.
# NOTE(review): LabelEncoder needs uniformly typed input; '邮编' was filled
# with the string 'na' earlier — confirm the column holds only strings.
for feat in ['房主是否有个人资料图片', '房主身份是否验证', '民宿周边', '邮编']:
    lbl = LabelEncoder()
    df_features[feat] = lbl.fit_transform(df_features[feat])
def freq_enc(df, col):
    """Add a relative-frequency encoding of ``col`` to ``df``.

    A new column named ``f'{col}_freq'`` is written holding, for every row,
    the share of non-null rows that have the same value in ``col``.
    Rows where ``col`` is missing get NaN in the new column.

    Args:
        df: DataFrame to modify; it is changed in place.
        col: Name of the column to encode.

    Returns:
        The same ``df`` object, with the extra ``{col}_freq`` column.
    """
    # value -> fraction of non-null rows carrying that value
    vc = df[col].value_counts(dropna=True, normalize=True).to_dict()
    df[f'{col}_freq'] = df[col].map(vc)
    return df
# Add a `<column>_freq` relative-frequency feature for each listed column.
for feat in ['容纳人数', '洗手间数量', '床的数量', '床的类型',
             '卧室数量', '取消条款', '所在城市', '清洁费',
             '房主是否有个人资料图片', '房主回复率', '是否支持随即预订',
             '民宿周边', '房产类型', '房型', '邮编']:
    df_features = freq_enc(df_features, feat)
df_features.head(5)
# Time feature processing
from tqdm import tqdm

# Re-express each date column as an integer: the int64 view of the parsed
# datetimes floor-divided by 10**9 (epoch seconds, assuming pandas'
# nanosecond datetime resolution — TODO confirm for the pandas version used).
for date_col in ('首次评论日期', '何时成为房主'):
    parsed_dates = pd.to_datetime(df_features[date_col])
    df_features[date_col] = parsed_dates.values.astype(np.int64) // 10 ** 9
df_features['最近评论日期'] = pd.to_datetime(df_features['最近评论日期']).values.astype(np