import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows', None)
# model
import lightgbm as lgb
import catboost as cbt
import xgboost as xgb
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, LassoCV, BayesianRidge
from sklearn.metrics import roc_auc_score, f1_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.impute import KNNImputer
from sklearn.feature_extraction import FeatureHasher
from bayes_opt import BayesianOptimization
# neural network
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import LearningRateScheduler
%matplotlib inline
Train = pd.read_csv('used_car_train_20200313.csv', sep=' ')
Test = pd.read_csv('used_car_testB_20200421.csv', sep=' ')
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} MB ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
Train = reduce_mem_usage(Train)
Test = reduce_mem_usage(Test)
Train.info()
Test.info()
#profile = ProfileReport(Test)
#profile
#profile.to_file('output_test.html')
plt.figure(figsize=(15, 10))
sns.distplot(Train['price'], kde=False)
Pull the price label back to a reasonable distribution with a Box-Cox transform
log_price, lamda = boxcox(Train['price'].values)
Train['price'] = list(log_price)
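The fitted lamda is reused throughout to map model outputs back to the raw price scale via inv_boxcox; a quick round-trip sanity check (a minimal sketch on made-up price values):
# Sketch: inv_boxcox with the fitted lamda undoes the forward transform.
x = np.array([500.0, 1500.0, 30000.0])  # made-up prices, for illustration only
print(np.allclose(inv_boxcox(boxcox(x, lmbda=lamda), lamda), x))  # expect True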
Test['price'] = -999
data = pd.concat([Train, Test], axis=0)
print(f'Number of samples in the combined dataset: {data.shape[0]}')
# seller and offerType are now single-valued features, so just drop them
print(data.shape)
del data['seller']
del data['offerType']
print(data.shape)
Handle unreasonable power values
(data['power'].value_counts().sort_index() / len(data)).cumsum()
# Bin the power feature
def power_map(x):
    if x == 0:
        return 0
    elif 0 < x <= 100:
        return 1
    elif 100 < x <= 200:
        return 2
    elif 200 < x <= 300:
        return 3
    elif 300 < x <= 600:
        return 4

# Cap power at 600 first so every value falls into one of the bins above.
data['power'] = data['power'].apply(lambda x: x if x <= 600 else 600)
data['power_cut'] = data['power'].apply(power_map)
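An equivalent, more compact form of the same binning via pd.cut (a sketch; the right-closed bin edges mirror power_map, and power_cut_alt is a hypothetical column name):
# Same five bins as power_map, expressed with pd.cut (intervals are right-closed).
data['power_cut_alt'] = pd.cut(data['power'], bins=[-1, 0, 100, 200, 300, 600], labels=False)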
Helper functions for building interaction features
def feat_mean(df, df_feature, fe, value, name=""):
    # Group df_feature by fe, take the mean of value within each group, then merge back into df.
    df_count = pd.DataFrame(df_feature.groupby(fe)[value].mean()).reset_index()
    if not name:
        df_count.columns = fe + [value + "_%s_mean" % ("_".join(fe))]
    else:
        df_count.columns = fe + [name]
    df = df.merge(df_count, on=fe, how="left").fillna(0)
    return df

def feat_max(df, df_feature, fe, value, name=""):
    # Group df_feature by fe, take the max of value within each group, then merge back into df.
    df_count = pd.DataFrame(df_feature.groupby(fe)[value].max()).reset_index()
    if not name:
        df_count.columns = fe + [value + "_%s_max" % ("_".join(fe))]
    else:
        df_count.columns = fe + [name]
    df = df.merge(df_count, on=fe, how="left").fillna(0)
    return df

def feat_min(df, df_feature, fe, value, name=""):
    # Group df_feature by fe, take the min of value within each group, then merge back into df.
    df_count = pd.DataFrame(df_feature.groupby(fe)[value].min()).reset_index()
    if not name:
        df_count.columns = fe + [value + "_%s_min" % ("_".join(fe))]
    else:
        df_count.columns = fe + [name]
    df = df.merge(df_count, on=fe, how="left").fillna(0)
    return df

def feat_std(df, df_feature, fe, value, name=""):
    # Group df_feature by fe, take the standard deviation of value within each group, then merge back into df.
    df_count = pd.DataFrame(df_feature.groupby(fe)[value].std()).reset_index()
    if not name:
        df_count.columns = fe + [value + "_%s_std" % ("_".join(fe))]
    else:
        df_count.columns = fe + [name]
    df = df.merge(df_count, on=fe, how="left").fillna(0)
    return df

def feat_skew(df, df_feature, fe, value, name=""):
    # Group df_feature by fe, take the skewness of value within each group, then merge back into df.
    df_count = pd.DataFrame(df_feature.groupby(fe)[value].skew()).reset_index()
    if not name:
        df_count.columns = fe + [value + "_%s_skew" % ("_".join(fe))]
    else:
        df_count.columns = fe + [name]
    df = df.merge(df_count, on=fe, how="left").fillna(0)
    return df

def feat_count(df, df_feature, fe, value, name=""):
    # Group df_feature by fe, count the non-null values of value within each group, then merge back into df.
    df_count = pd.DataFrame(df_feature.groupby(fe)[value].count()).reset_index()
    if not name:
        df_count.columns = fe + [value + "_%s_count" % ("_".join(fe))]
    else:
        df_count.columns = fe + [name]
    df = df.merge(df_count, on=fe, how="left").fillna(0)
    return df

def feat_nunique(df, df_feature, fe, value, name=""):
    # Group df_feature by fe, count the distinct values of value within each group, then merge back into df.
    df_count = pd.DataFrame(df_feature.groupby(fe)[value].nunique()).reset_index()
    if not name:
        df_count.columns = fe + [value + "_%s_nunique" % ("_".join(fe))]
    else:
        df_count.columns = fe + [name]
    df = df.merge(df_count, on=fe, how="left").fillna(0)
    return df

def feature_count(data, df_feature, features=[]):
    # Count group sizes in df_feature over the given feature combination and merge back into data.
    new_feature = 'count'
    for i in features:
        new_feature += '_' + i
    # Drop a stale column left over from a previous run, if any.
    try:
        del data[new_feature]
    except KeyError:
        pass
    temp = df_feature.groupby(features).size().reset_index().rename(columns={0: new_feature})
    data = data.merge(temp, 'left', on=features).fillna(0)
    return data
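All of these helpers compute the statistic on df_feature (the training rows only) and merge it onto the full frame, which keeps the -999 placeholder labels of the test rows out of the target statistics. A minimal sketch of the pattern on a toy frame (made-up values, for illustration only):
toy = pd.DataFrame({'brand': [0, 0, 1, 1, 1],
                    'price': [3.0, 5.0, 2.0, -999, -999]})  # -999 marks test rows
toy_train = toy[toy['price'] != -999]
toy = feat_mean(toy, toy_train, ['brand'], 'price')
# brand 0 -> price_brand_mean == 4.0, brand 1 -> 2.0, computed from train rows only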
df_feature = data[data['price']!=-999]
Fill missing values with the mode
#data['isknown_bodyType'] = data['bodyType'].apply(lambda x: 0 if pd.isnull(x) else 1)
data['bodyType'] = data['bodyType'].fillna(data['bodyType'].mode()[0])
#data['isknown_fuelType'] = data['fuelType'].apply(lambda x: 0 if pd.isnull(x) else 1)
data['fuelType'] = data['fuelType'].fillna(data['fuelType'].mode()[0])
#data['isknown_gearbox'] = data['gearbox'].apply(lambda x: 0 if pd.isnull(x) else 1)
data['gearbox'] = data['gearbox'].fillna(data['gearbox'].mode()[0])
#data['isknown_model'] = data['model'].apply(lambda x: 0 if pd.isnull(x) else 1)
data['model'] = data['model'].fillna(data['model'].mode()[0])
Examining the distributions of features v_0 to v_14
plt.figure(figsize=(15, 10))
sns.distplot(data['v_0'], kde=False)
data['v_0_diverce'] = np.where(data['v_0']<=39, 0, 1)
data['v_0_diverce'].value_counts()
plt.figure(figsize=(15, 10))
sns.distplot(data['v_1'], kde=False)
data['v_1_diverce'] = np.where(data['v_1']<=-2, 0, 1)
data['v_1_diverce'].value_counts()
plt.figure(figsize=(15, 10))
sns.distplot(data['v_5'], kde=False)
data['v_5_diverce'] = np.where(data['v_5']<0.1, 0, 1)
data['v_5_diverce'].value_counts()
plt.figure(figsize=(15, 10))
sns.distplot(data['v_6'], kde=False)
# there is a jump between 8.628845e-03 and 1.047516e-02
data['v_6_diverce'] = np.where(data['v_6']<0.01, 0, 1)
data['v_6_diverce'].value_counts()
plt.figure(figsize=(15, 10))
sns.distplot(data['v_7'], kde=False)
data['v_7_diverce'] = np.where(data['v_7']<0.8, 0, 1)
data['v_7_diverce'].value_counts()
plt.figure(figsize=(15, 10))
sns.distplot(data['v_10'], kde=False)
data['v_10_diverce'] = np.where(data['v_10']<0, 0, np.where(data['v_10']<7.5, 1, 2))
data['v_10_diverce'].value_counts()
plt.figure(figsize=(15, 10))
sns.distplot(data['v_11'], kde=False)
data['v_11_diverce'] = np.where(data['v_11']<8, 0, 1)
data['v_11_diverce'].value_counts()
plt.figure(figsize=(15, 10))
sns.distplot(data['v_13'], kde=False)
plt.figure(figsize=(15, 10))
sns.distplot(data['v_14'], kde=False)
# Some dates are malformed, e.g. 19950001: the month cannot be 00, so add 100 to make it 19950101.
data['regDate'] = data['regDate'].apply(lambda x: x if str(x)[4:6] != '00' else x + 100)
data['creatDate'] = pd.to_datetime(data['creatDate'], format='%Y%m%d')
data['regDate'] = pd.to_datetime(data['regDate'], format='%Y%m%d')
# Extract year, quarter, month, and day time features
data['creatDate_year'] = data['creatDate'].dt.year
data['creatDate_month'] = data['creatDate'].dt.month
data['creatDate_day'] = data['creatDate'].dt.day
data['creatDate_quarter'] = data['creatDate'].dt.quarter
data['regDate_year'] = data['regDate'].dt.year
data['regDate_month'] = data['regDate'].dt.month
data['regDate_day'] = data['regDate'].dt.day
data['regDate_quarter'] = data['regDate'].dt.quarter
# Years of use = total days in use / 365, rounded to one decimal place
data['usedYears'] = round((data['creatDate'] - data['regDate']).dt.days / 365.0, 1)
# Replace the '-' placeholder with the mode
data['notRepairedDamage'] = data['notRepairedDamage'].replace('-', data['notRepairedDamage'].mode()[0])
data.isnull().sum()
Build a count feature grouped by name, then drop name
data = feature_count(data, df_feature, ['name'])
del data['name']
# Average kilometers driven per year = total kilometers / years of use
data['km_per_year'] = data['kilometer'] / data['usedYears']
(data['km_per_year'].value_counts().sort_index()/len(data)).cumsum()
def km_per_year_cut(x):
    if x <= 1:
        return 0
    elif 1 < x <= 3:
        return 1
    elif x > 3:
        return 2

data['km_per_year_cut'] = data['km_per_year'].apply(km_per_year_cut)
Clip outliers in v_13 and v_14
data['v_13'] = data['v_13'].apply(lambda x: x if x<=6 else 6)
data['v_14'] = data['v_14'].apply(lambda x: x if x<=4 else 4)
def usedYears_cut(x):
    if x < 5:
        return 0
    elif 5 <= x < 10:
        return 1
    elif 10 <= x < 15:
        return 2
    elif 15 <= x < 20:
        return 3
    elif x >= 20:
        return 4

data['usedYears_cut'] = data['usedYears'].apply(usedYears_cut)
data.head()
data.to_csv('data/raw_data_0927.csv', index=False)
data = pd.read_csv("data/raw_data_0927.csv")
data['city'] = data['regionCode'].apply(lambda x: str(x)[:2])
del data['regionCode']
Take the training rows for building new interaction features
data_feature = data[data['price']!=-999]
data[data['price']!=-999].corr()['price'].sort_values()
%%time
# For each important categorical feature, build new price statistics (per-group mean, max, min, skew, std)
cat_col = ['model', 'brand', 'bodyType', 'fuelType', 'usedYears_cut',
'gearbox', 'power_cut', 'kilometer', 'notRepairedDamage', 'city',
'regDate_year', 'km_per_year_cut', ]
for i in cat_col:
    for j in ['price']:
        data = feat_mean(data, data_feature, [i], j)
        data = feat_max(data, data_feature, [i], j)
        data = feat_min(data, data_feature, [i], j)
        data = feat_skew(data, data_feature, [i], j)
        data = feat_std(data, data_feature, [i], j)
# Interactions between important categorical features (per-group count, nunique)
count_col = ['usedYears_cut', 'kilometer', 'km_per_year_cut', 'power_cut', 'regDate_year']
# cache1 records combinations that have already been processed
cache1 = []
for i in count_col:
    for j in count_col:
        if i + "_" + j + "count" not in cache1 and i != j:
            data = feat_count(data, data_feature, [i], j)
            data = feat_nunique(data, data_feature, [i], j)
            cache1.append(i + "_" + j + "count")
        elif i + "_" + j + "count" not in cache1 and i == j:
            data = feature_count(data, data_feature, [i])
cross = ['v_3', 'v_0', 'v_12', 'v_8', 'usedYears']
numeric = ['v_0', 'v_3', 'v_8', 'v_12']
cache2 = []
# New features from pairwise products
for i in cross:
    for j in cross:
        if (i + "*" + j not in cache2) and (j + "*" + i not in cache2):
            data[i + "*" + j] = data[i] * data[j]
            cache2.append(i + "*" + j)
# New features from pairwise sums
for i in numeric:
    for j in numeric:
        if (i + "+" + j not in cache2) and (j + "+" + i not in cache2):
            data[i + "+" + j] = data[i] + data[j]
            cache2.append(i + "+" + j)
# New features from pairwise differences
for i in numeric:
    for j in numeric:
        if (i + "-" + j not in cache2) and (j + "-" + i not in cache2):
            data[i + "-" + j] = data[i] - data[j]
            cache2.append(i + "-" + j)
Drop redundant or meaningless features
data.drop(['SaleID', 'regDate', 'creatDate'], axis=1, inplace=True)
data.drop(['v_0-v_0', 'v_3-v_3', 'v_8-v_8', 'v_12-v_12', 'v_0+v_0', 'v_3+v_3', 'v_8+v_8', 'v_12+v_12'], axis=1, inplace=True)
Feature interactions may create single-valued features; check for and delete them
print(f"before: {data.shape[1]}")
for col in data.columns:
if data[col].nunique() == 1:
del data[col]
print(f"after: {data.shape[1]}")
data.to_csv('data/cleaned_data_0927.csv', index=False)
cleaned_data = pd.read_csv('data/cleaned_data_0927.csv')
cat_feature = ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage',
               'power_cut', 'regDate_year', 'km_per_year_cut', 'v_0_diverce',
               'v_1_diverce', 'v_5_diverce', 'v_6_diverce', 'v_7_diverce',
               'v_10_diverce', 'v_11_diverce', 'city',
               'creatDate_month', 'usedYears_cut', 'creatDate_quarter', 'regDate_month',
               'regDate_quarter']
cleaned_data[cat_feature] = cleaned_data[cat_feature].astype(int)
With feature engineering done, restore the train and test sets
train_data = cleaned_data[cleaned_data['price'] != -999]
test_data = cleaned_data[cleaned_data['price'] == -999].drop(['price'], axis=1)
X_train = train_data.drop(['price'], axis=1)
Y_train = train_data['price']
# filtered feature shapes
print(X_train.shape)
print(Y_train.shape)
print(test_data.shape)
One feature-selection method: recursive feature elimination (RFE)
It trains on our data repeatedly; after each round it removes the features with the lowest weights, then runs the next round of selection on the remaining features.
from sklearn.feature_selection import RFE
import time
model = lgb.LGBMRegressor(n_estimators=1000, n_jobs=-1)
rfe = RFE(model, n_features_to_select=101, verbose=1)
start = time.time()
RFE_X_train = rfe.fit_transform(X_train, Y_train)
RFE_test_data = rfe.transform(test_data)
print(time.time() - start)
Use get_support() to recover the selected feature set
choosed_X_train = X_train.loc[:, list(rfe.get_support())]
choosed_feature = X_train.loc[:, list(rfe.get_support())].columns
choosed_test_data = test_data.loc[:, list(rfe.get_support())]
%%time
def lgb_bayesian(n_estimators,
                 learning_rate,
                 max_depth,
                 subsample,
                 colsample_bytree,
                 min_child_samples,
                 min_child_weight,
                 reg_alpha,
                 reg_lambda):
    # BayesianOptimization passes floats; cast the integer-valued params back.
    min_child_samples = int(min_child_samples)
    max_depth = int(max_depth)
    n_estimators = int(n_estimators)
    lgb_model = lgb.LGBMRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        min_child_weight=min_child_weight,
        min_child_samples=min_child_samples,
        objective='mean_absolute_error',
        n_jobs=-1,
        #device='gpu',
        random_state=927)
    cv = KFold(n_splits=5, shuffle=True, random_state=927)
    oof_lgb = np.zeros(len(X_train))
    for train_idx, val_idx in cv.split(X_train, Y_train):
        lgb_model.random_state += 1
        train_x = X_train.iloc[train_idx]
        train_y = Y_train.iloc[train_idx]
        val_x = X_train.iloc[val_idx]
        val_y = Y_train.iloc[val_idx]
        lgb_model.fit(train_x, train_y,
                      eval_set=[(val_x, val_y)],
                      categorical_feature=cat_feature,
                      eval_metric='mean_absolute_error',
                      verbose=0,
                      early_stopping_rounds=200)
        oof_lgb[val_idx] = lgb_model.predict(val_x)
        print(f"******lightgbm MAE******: {mean_absolute_error(inv_boxcox(val_y, lamda), inv_boxcox(lgb_model.predict(val_x), lamda))}")
    mae_error = mean_absolute_error(inv_boxcox(Y_train.values, lamda), inv_boxcox(oof_lgb, lamda))
    # BayesianOptimization maximizes, so return the negative MAE.
    return -mae_error
lgb_bounds = {'n_estimators': (500, 10000),
'learning_rate': (0.001, 0.1),
'max_depth': (5, 15),
'subsample': (0.2, 1),
'colsample_bytree': (0.2, 1),
'min_child_samples': (20, 200),
'min_child_weight': (0.0001, 0.5),
'reg_alpha': (0, 10),
'reg_lambda': (0, 10)
}
lgb_bo = BayesianOptimization(lgb_bayesian, lgb_bounds, random_state=927)
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    lgb_bo.maximize(init_points=10, n_iter=15)
lgb_bo.max
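lgb_bo.max holds the best target (the negative out-of-fold MAE) together with the raw float parameters; a hedged sketch of turning them into the fixed values used in the next cell (the int casts mirror those inside lgb_bayesian):
best = lgb_bo.max['params']
best_params = dict(best,
                   n_estimators=int(best['n_estimators']),
                   max_depth=int(best['max_depth']),
                   min_child_samples=int(best['min_child_samples']))
# best_params could then be unpacked into lgb.LGBMRegressor(**best_params, ...)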
lgb_model = lgb.LGBMRegressor(n_estimators=7013,
colsample_bytree=0.2836,
subsample=0.6611,
learning_rate=0.0355,
max_depth=6,
min_child_samples=56,
min_child_weight=0.3156,
reg_alpha=5.2605,
reg_lambda=5.8944,
objective='mean_absolute_error',
n_jobs=-1,
random_state=927)
cv = KFold(n_splits=5, shuffle=True, random_state=927)
# Collect out-of-fold and test predictions in preparation for stacking later
oof_lgb = np.zeros(len(X_train))
pred_lgb = np.zeros(len(test_data))
for train_idx, val_idx in cv.split(X_train, Y_train):
    lgb_model.random_state += 1
    train_x = X_train.iloc[train_idx]
    train_y = Y_train.iloc[train_idx]
    val_x = X_train.iloc[val_idx]
    val_y = Y_train.iloc[val_idx]
    lgb_model.fit(train_x, train_y,
                  eval_set=[(val_x, val_y)],
                  categorical_feature=cat_feature,
                  eval_metric='mean_absolute_error',
                  verbose=0,
                  early_stopping_rounds=200)
    oof_lgb[val_idx] = lgb_model.predict(val_x)
    pred_lgb += lgb_model.predict(test_data) / cv.n_splits
print(f"lightgbm CV score(mae): {mean_absolute_error(inv_boxcox(Y_train.values, lamda), inv_boxcox(oof_lgb, lamda))}")
Another feature-selection method: tree-model feature importances
plt.figure(figsize=(60, 120))
feat_importance = pd.Series(lgb_model.feature_importances_, index=X_train.columns)
feat_importance.sort_values(ascending=True).plot(kind='barh')
feat_importance.sort_values(ascending=False)
Feature selection: drop the features you judge removable based on the tree model's importances
drop_feat = ['kilometer_regDate_year_nunique', 'regDate_year_usedYears_cut_nunique',
'price_km_per_year_cut_min', 'price_km_per_year_cut_max',
'km_per_year_cut_kilometer_nunique', 'v_7_diverce', 'v_5_diverce',
'v_11_diverce', 'creatDate_year', 'kilometer_km_per_year_cut_nunique']
X_train_choiced = X_train.drop(drop_feat, axis=1)
test_data_chioced = test_data.drop(drop_feat, axis=1)
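For reference, the same low-importance tail could be selected programmatically instead of by hand (a sketch; auto_drop is a hypothetical stand-in for drop_feat, and 10 matches the number of features dropped above):
# Sketch: the 10 least important features according to the trained LightGBM model.
auto_drop = feat_importance.sort_values().head(10).index.tolist()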
From the selected features, rebuild the list of categorical features (for LightGBM and CatBoost)
choosed_feature = X_train_choiced.columns
choosed_cat_feature = []
for fea in cat_feature:
    if fea in choosed_feature:
        choosed_cat_feature.append(fea)
choosed_cat_feature.extend(['km_per_year_cut', 'usedYears_cut'])
%%time
def lgb_bayesian(n_estimators,
                 learning_rate,
                 max_depth,
                 subsample,
                 colsample_bytree,
                 min_child_samples,
                 min_child_weight,
                 reg_alpha,
                 reg_lambda):
    min_child_samples = int(min_child_samples)
    max_depth = int(max_depth)
    n_estimators = int(n_estimators)
    lgb_model = lgb.LGBMRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        min_child_weight=min_child_weight,
        min_child_samples=min_child_samples,
        objective='mean_absolute_error',
        n_jobs=-1,
        #device='gpu',
        random_state=927)
    cv = KFold(n_splits=5, shuffle=True, random_state=927)
    oof_lgb = np.zeros(len(choosed_X_train))
    for train_idx, val_idx in cv.split(choosed_X_train, Y_train):
        lgb_model.random_state += 1
        train_x = choosed_X_train.iloc[train_idx]
        train_y = Y_train.iloc[train_idx]
        val_x = choosed_X_train.iloc[val_idx]
        val_y = Y_train.iloc[val_idx]
        lgb_model.fit(train_x, train_y,
                      eval_set=[(val_x, val_y)],
                      categorical_feature='auto',
                      eval_metric='mean_absolute_error',
                      verbose=0,
                      early_stopping_rounds=200)
        oof_lgb[val_idx] = lgb_model.predict(val_x)
        print(f"******lightgbm MAE******: {mean_absolute_error(inv_boxcox(val_y, lamda), inv_boxcox(lgb_model.predict(val_x), lamda))}")
    mae_error = mean_absolute_error(inv_boxcox(Y_train.values, lamda), inv_boxcox(oof_lgb, lamda))
    return -mae_error
lgb_bounds = {'n_estimators': (6000, 8000),
'learning_rate': (0.02, 0.05),
'max_depth': (5, 8),
'subsample': (0.5, 0.7),
'colsample_bytree': (0.2, 0.4),
'min_child_samples': (50, 70),
'min_child_weight': (0.2, 0.4),
'reg_alpha': (0, 10),
'reg_lambda': (0, 10)
}
lgb_bo = BayesianOptimization(lgb_bayesian, lgb_bounds, random_state=927)
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    lgb_bo.maximize(init_points=10, n_iter=10)
lgb_bo.max
lgb_model = lgb.LGBMRegressor(n_estimators=7013,
colsample_bytree=0.4,
subsample=0.5,
learning_rate=0.05,
max_depth=8,
min_child_samples=56,
min_child_weight=0.2,
reg_alpha=10,
reg_lambda=10,
objective='mean_absolute_error',
n_jobs=-1,
random_state=927)
cv = KFold(n_splits=5, shuffle=True, random_state=927)
# Collect out-of-fold and test predictions in preparation for stacking later
oof_lgb = np.zeros(len(choosed_X_train))
pred_lgb = np.zeros(len(choosed_test_data))
for train_idx, val_idx in cv.split(choosed_X_train, Y_train):
    lgb_model.random_state += 1
    train_x = choosed_X_train.iloc[train_idx]
    train_y = Y_train.iloc[train_idx]
    val_x = choosed_X_train.iloc[val_idx]
    val_y = Y_train.iloc[val_idx]
    lgb_model.fit(train_x, train_y,
                  eval_set=[(val_x, val_y)],
                  categorical_feature='auto',
                  eval_metric='mean_absolute_error',
                  verbose=0,
                  early_stopping_rounds=200)
    oof_lgb[val_idx] = lgb_model.predict(val_x)
    pred_lgb += lgb_model.predict(choosed_test_data) / cv.n_splits
print(f"lightgbm CV score(mae): {mean_absolute_error(inv_boxcox(Y_train.values, lamda), inv_boxcox(oof_lgb, lamda))}")
Final step: write the predictions into the submission file
result = pd.read_csv("used_car_sample_submit.csv")
result['price'] = list(pred_lgb)
result['price'] = result['price'].apply(lambda x: inv_boxcox(x, lamda))
result.head(10)
result.to_csv("output/single_lightgbm_0928.csv", index=False)
A trick worth trying for a small score gain: rounding the predicted labels
result = pd.read_csv("output/single_lightgbm_0928.csv")
result['price'] = result['price'].apply(lambda x: round(x, 2))
result.head()
result.to_csv("output/single_lightgbm_round2_0928.csv")