数据集说明
数据下载地址:https://aistudio.baidu.com/aistudio/competition/detail/51
目标:预测下个月用户是否购买
测试集test1.csv = submission.csv
(customer_id顺序一致,result为预测结果)
train.csv字段说明
字段 | 说明 |
---|---|
order_detail_id | 订单详情id |
order_id | 订单id |
order_detail_id | 订单详情id |
order_total_num | 订单商品总购买数量 |
order_amount | 订单商品总金额 |
order_total_payment | 订单实付金额 |
order_total_discount | 订单优惠金额 |
order_pay_time | 付款时间 |
order_status | 订单状态: 1表示等待买家付款, 2表示卖家部分发货, 3表示卖家发货, 4表示等待买家确认收货, 5表示买家已签收, 6表示交易成功 |
order_count | 订单包含的子订单数量 |
is_customer_rate | 用户是否评价,0没有评价,1已经评价 |
order_detail_status | 订单详细状态 |
order_detail_goods_num | 订单中的商品数量 |
order_detail_amount | 订单应付总金额 |
order_detail_payment | 订单实付金额 |
order_detail_discount | 订单优惠金额 |
member_id | 会员id |
customer_id | 用户id |
customer_gender | 性别:0未知,1男,2女 |
customer_province | 用户省份所在地 |
customer_city | 用户城市所在地 |
member_status | 会员状态:1正常,2冻结,3已删除 |
is_member_active | 会员是否激活 |
goods_id | 商品id |
goods_class_id | 商品分类id |
goods_price | 商品原始价格 |
goods_status | 商品库存状态:1出售中,2库存中 |
goods_has_discount | 是否支持会员折扣:0不支持,1支持 |
goods_list_time | 商品最新上架时间 |
goods_delist_time | 商品最新下架时间 |
submission.csv 字段说明
字段 | 说明 |
---|---|
customer_id | 用户id |
result | 下个月是否会购买:0 不购买,1购买 |
模型构建
import datetime
import numpy as np
import pandas as pd
# Label-encode goods_id: pd.factorize assigns dense integer codes in order of
# first appearance (missing values become -1).
raw['goods_id'] = pd.factorize(raw['goods_id'])[0]
raw['goods_id'].value_counts()
# 对数据进行预处理,各种特征提取
def preprocess(raw, train='train'):
    """Aggregate order-level records into one feature row per customer_id.

    Args:
        raw: DataFrame of order-detail rows; must contain the customer /
            order / goods columns referenced below.
        train: unused; kept so existing call sites keep working.

    Returns:
        DataFrame indexed by customer_id holding "last behavior" columns,
        per-customer aggregate statistics, and time-derived features.
    """
    # Missing gender -> 0 ("unknown").
    data = pd.DataFrame(raw.groupby('customer_id')['customer_gender'].last().fillna(0))
    # Last goods-related behavior. NOTE: columns are selected with a list
    # (groupby(...)[['a', 'b']]); the old tuple-style groupby(...)['a', 'b']
    # indexing was deprecated and then removed in pandas 1.x/2.x.
    data[['goods_id_last', 'goods_status_last', 'goods_price_last', 'goods_has_discount_last', 'goods_list_time_last', 'goods_delist_time_last']] = raw.groupby('customer_id')[['goods_id', 'goods_status', 'goods_price', 'goods_has_discount', 'goods_list_time', 'goods_delist_time']].last()
    # Last order-related behavior. The final output column keeps its original
    # name 'order_detail_discount' (no _last suffix) for backward compatibility.
    data[['order_total_num_last', 'order_amount_last', 'order_total_payment_last', 'order_total_discount_last', 'order_pay_time_last', 'order_status_last', 'order_count_last', 'is_customer_rate_last', 'order_detail_status_last', 'order_detail_goods_num_last', 'order_detail_amount_last', 'order_detail_payment_last', 'order_detail_discount']] = raw.groupby('customer_id')[['order_total_num', 'order_amount', 'order_total_payment', 'order_total_discount', 'order_pay_time', 'order_status', 'order_count', 'is_customer_rate', 'order_detail_status', 'order_detail_goods_num', 'order_detail_amount', 'order_detail_payment', 'order_detail_discount']].last()
    # goods_price statistics: mean / min / max / std.
    data[['goods_price_mean', 'goods_price_min', 'goods_price_max', 'goods_price_std']] = raw.groupby('customer_id')['goods_price'].agg([
        ('goods_price_mean', 'mean'),
        ('goods_price_min', 'min'),
        ('goods_price_max', 'max'),
        ('goods_price_std', 'std')])
    # order_detail_payment statistics: mean / min / max / std.
    data[['order_detail_payment_mean', 'order_detail_payment_min', 'order_detail_payment_max', 'order_detail_payment_std']] = raw.groupby('customer_id')['order_detail_payment'].agg([
        ('order_detail_payment_mean', 'mean'),
        ('order_detail_payment_min', 'min'),
        ('order_detail_payment_max', 'max'),
        ('order_detail_payment_std', 'std')])
    # Number of distinct orders per customer.
    data['count'] = raw.groupby('customer_id')['order_id'].nunique()
    # Total quantity purchased (order_total_num summed per customer).
    data['order_total_sum'] = raw.groupby('customer_id')['order_total_num'].sum()
    # Last known province / city.
    data['customer_province'] = raw.groupby('customer_id')['customer_province'].last()
    data['customer_city'] = raw.groupby('customer_id')['customer_city'].last()
    # Review behavior: ratio and count. String aggregation names ('mean',
    # 'sum') avoid the pandas 2.x FutureWarning for np.mean / np.sum.
    data[['is_customer_rate_ratio', 'is_customer_rate_sum']] = raw.groupby('customer_id')['is_customer_rate'].agg([
        ('is_customer_rate_ratio', 'mean'),
        ('is_customer_rate_sum', 'sum')
    ])
    # Total goods count across order details.
    data['order_detail_total_num'] = raw.groupby('customer_id')['order_detail_goods_num'].sum()
    # Discount-eligibility statistics (sum, mean).
    data[['goods_has_discount_sum', 'goods_has_discount_mean']] = raw.groupby('customer_id')['goods_has_discount'].agg([
        ('goods_has_discount_sum', 'sum'),
        ('goods_has_discount_mean', 'mean')
    ])
    # Order payment statistics (sum, mean).
    data[['order_total_payment_sum', 'order_total_payment_mean']] = raw.groupby('customer_id')['order_total_payment'].agg([
        ('order_total_payment_sum', 'sum'),
        ('order_total_payment_mean', 'mean')
    ])

    def time2multi(x):
        # Split a 'YYYY-mm-dd HH:MM:SS' string into (month, day, weekday, hour).
        t = datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
        return pd.Series([t.month, t.day, t.weekday(), t.hour])
    # Reference epoch for the *_diff features: 2013-01-01 00:00:00.
    t_str = '2013-01-01 00:00:00'
    t = datetime.datetime.strptime(t_str, '%Y-%m-%d %H:%M:%S')
    # Years (days/365) between the epoch and the goods list/delist timestamps.
    data['goods_list_time_diff'] = data['goods_list_time_last'].map(lambda x: (datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') - t).days/365)
    data['goods_delist_time_diff'] = data['goods_delist_time_last'].map(lambda x: (datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') - t).days/365)
    # How long the goods stayed listed.
    data['goods_diff'] = data['goods_delist_time_diff'] - data['goods_list_time_diff']
    # Calendar components of the last payment time.
    data[['order_pay_time_last_m', 'order_pay_time_last_d', 'order_pay_time_last_week', 'order_pay_time_last_h']] = data['order_pay_time_last'].apply(time2multi)
    data['order_pay_time_last_diff'] = data['order_pay_time_last'].map(lambda x: (datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') - t).days/365)
    return data
# Customers with a payment after 2013-07-31 (i.e. in August) — the positive
# class for the "will buy next month" target.
label_raw = set(raw[raw['order_pay_time'] > '2013-07-31 23:59:59']['customer_id'].dropna())
# label = 1 if the customer bought in August, else 0. Every customer present
# in the table has at least one order, so the index covers all candidates.
# NOTE(review): train_raw is used here but is only assigned in the %%time
# cell below — the original notebook executed these cells in a different
# order; confirm execution order before running this as a script.
train_raw['label'] = train_raw.index.map(lambda x:int(x in label_raw))
# Notebook %%time cell-magic residue: the code inside the string arguments is
# executed by IPython. First cell builds test features from the full history;
# second builds train features from pre-August data. Strings kept verbatim.
get_ipython().run_cell_magic('time', '', '# 用全量的用户特征进行预测,raw 即9月份之前的所有数据\ntest = preprocess(raw)')
get_ipython().run_cell_magic('time', '', "# 8月之前的做训练集,8月的数据做验证集\nraw['order_pay_time'].max()\ntrain_raw = raw[raw['order_pay_time'] <= '2013-07-31 23:59:59']\n# 提取训练集 各种特征\ntrain_raw = preprocess(train_raw)\ntrain_raw")
保存和加载模型
import pickle
# Persist both engineered feature tables so later runs can skip the
# expensive preprocessing step.
for obj, path in ((test, 'test.pkl'), (train_raw, 'train_raw.pkl')):
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)
特征处理
# 有些时间戳提取了时间尺度,需要去掉
# Drop the raw timestamp strings whose calendar components were already
# extracted as separate features.
train_data = train_raw.drop(['goods_list_time_last', 'goods_delist_time_last', 'order_pay_time_last'], axis=1)
# Assign the fillna result back instead of inplace=True on a column slice:
# chained inplace fillna triggers pandas' SettingWithCopyWarning and is not
# guaranteed to modify the underlying frame.
train_data['customer_province'] = train_data['customer_province'].fillna('0')
train_data['customer_city'] = train_data['customer_city'].fillna('0')
利用sklearn数据归一化
from sklearn.preprocessing import LabelEncoder
# NOTE(review): the original label-encoded customer_province/customer_city
# and then immediately dropped both columns, so the encoding was dead work;
# the drop alone produces the same final feature set (matching the 43-column
# table shown by train_data.info()).
train_data = train_data.drop(['customer_province', 'customer_city'], axis=1)
# Categorical features for LightGBM. The original list contained
# 'order_pay_time_last_h' twice; the duplicate entry is removed.
cate1 = ['goods_id_last', 'goods_status_last', 'order_status_last', 'customer_gender', 'order_detail_status_last', 'order_pay_time_last_h', 'order_pay_time_last_week']
查看train_data数据的情况
# Inspect dtypes / non-null counts of the final training table (notebook display).
train_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1435404 entries, 1000000 to 2826574
Data columns (total 43 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customer_gender 1435404 non-null float64
1 goods_id_last 1435404 non-null int64
2 goods_status_last 1435404 non-null float64
3 goods_price_last 1435280 non-null float64
4 goods_has_discount_last 1435404 non-null float64
5 order_total_num_last 1435404 non-null float64
6 order_amount_last 1435404 non-null float64
7 order_total_payment_last 1435404 non-null float64
8 order_total_discount_last 1435404 non-null float64
9 order_status_last 1435404 non-null int64
10 order_count_last 1435404 non-null float64
11 is_customer_rate_last 1435404 non-null float64
12 order_detail_status_last 1435404 non-null float64
13 order_detail_goods_num_last 1435404 non-null float64
14 order_detail_amount_last 1435404 non-null float64
15 order_detail_payment_last 1435404 non-null float64
16 order_detail_discount 1435404 non-null float64
17 goods_price_mean 1435280 non-null float64
18 goods_price_min 1435280 non-null float64
19 goods_price_max 1435280 non-null float64
20 goods_price_std 396081 non-null float64
21 order_detail_payment_mean 1435404 non-null float64
22 order_detail_payment_min 1435404 non-null float64
23 order_detail_payment_max 1435404 non-null float64
24 order_detail_payment_std 396177 non-null float64
25 count 1435404 non-null int64
26 order_total_sum 1435404 non-null float64
27 is_customer_rate_ratio 1435404 non-null float64
28 is_customer_rate_sum 1435404 non-null float64
29 order_detail_total_num 1435404 non-null float64
30 goods_has_discount_sum 1435404 non-null float64
31 goods_has_discount_mean 1435404 non-null float64
32 order_total_payment_sum 1435404 non-null float64
33 order_total_payment_mean 1435404 non-null float64
34 goods_list_time_diff 1435404 non-null float64
35 goods_delist_time_diff 1435404 non-null float64
36 goods_diff 1435404 non-null float64
37 order_pay_time_last_m 1435404 non-null int64
38 order_pay_time_last_d 1435404 non-null int64
39 order_pay_time_last_week 1435404 non-null int64
40 order_pay_time_last_h 1435404 non-null int64
41 order_pay_time_last_diff 1435404 non-null float64
42 label 1435404 non-null int64
dtypes: float64(35), int64(8)
memory usage: 481.9 MB
数据集切分
# Train/validation split: 5-fold CV; each fold's LightGBM model contributes
# an equal 1/5 share to the bagged test prediction.
from sklearn.model_selection import KFold
import lightgbm as lgb
# random_state pins the fold assignment so runs are reproducible
# (the original shuffled with no seed).
kf = KFold(n_splits=5, shuffle=True, random_state=2021)
# Hoist loop-invariant work: the original rebuilt train_data.drop('label')
# twice per fold and re-sliced `test` on every iteration.
features = train_data.drop('label', axis=1)
labels = np.array(train_data['label'])
# Align test-set columns with the training feature columns once.
test = test[features.columns]
param = {
    'num_leaves': 121,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.05,
    'metric': 'binary_logloss'
}
y_pred = 0
for train_index, test_index in kf.split(features):
    # Per-fold train / validation partitions.
    X_train, X_valid = features.iloc[train_index], features.iloc[test_index]
    y_train, y_valid = labels[train_index], labels[test_index]
    trn_data = lgb.Dataset(X_train, y_train)
    val_data = lgb.Dataset(X_valid, y_valid)
    # NOTE(review): early_stopping_rounds / verbose_eval are lightgbm<4
    # keyword arguments; lightgbm>=4 requires
    # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)] instead —
    # confirm the installed version.
    lgbm = lgb.train(param, trn_data, valid_sets=[trn_data, val_data], num_boost_round=10000, early_stopping_rounds=100, verbose_eval=50, categorical_feature=cate1)
    # Accumulate this fold's prediction with equal 1/5 weight.
    y_pred = y_pred + lgbm.predict(test)*0.2
Training until validation scores don't improve for 100 rounds.
[50] training's binary_logloss: 0.0659339 valid_1's binary_logloss: 0.0697238
[100] training's binary_logloss: 0.0629397 valid_1's binary_logloss: 0.0697213
[150] training's binary_logloss: 0.0609746 valid_1's binary_logloss: 0.0698468
Early stopping, best iteration is:
[73] training's binary_logloss: 0.0643987 valid_1's binary_logloss: 0.0696613
Training until validation scores don't improve for 100 rounds.
[50] training's binary_logloss: 0.0660194 valid_1's binary_logloss: 0.0693105
[100] training's binary_logloss: 0.0630359 valid_1's binary_logloss: 0.069384
[150] training's binary_logloss: 0.0611705 valid_1's binary_logloss: 0.0694661
Early stopping, best iteration is:
[62] training's binary_logloss: 0.0652081 valid_1's binary_logloss: 0.0692976
Training until validation scores don't improve for 100 rounds.
[50] training's binary_logloss: 0.0658128 valid_1's binary_logloss: 0.0702261
[100] training's binary_logloss: 0.06284 valid_1's binary_logloss: 0.0702096
[150] training's binary_logloss: 0.0608663 valid_1's binary_logloss: 0.0703097
Early stopping, best iteration is:
[77] training's binary_logloss: 0.0640384 valid_1's binary_logloss: 0.0701494
Training until validation scores don't improve for 100 rounds.
[50] training's binary_logloss: 0.0658765 valid_1's binary_logloss: 0.069946
[100] training's binary_logloss: 0.0628522 valid_1's binary_logloss: 0.0699529
[150] training's binary_logloss: 0.060943 valid_1's binary_logloss: 0.0700854
Early stopping, best iteration is:
[73] training's binary_logloss: 0.0643301 valid_1's binary_logloss: 0.0699093
Training until validation scores don't improve for 100 rounds.
[50] training's binary_logloss: 0.0654505 valid_1's binary_logloss: 0.0715896
[100] training's binary_logloss: 0.0625174 valid_1's binary_logloss: 0.0716002
[150] training's binary_logloss: 0.0606312 valid_1's binary_logloss: 0.071723
Early stopping, best iteration is:
[67] training's binary_logloss: 0.0642761 valid_1's binary_logloss: 0.071539
创建DataFrame
# Feature importances of the last fold's model, ascending (notebook display).
pd.DataFrame({
    'column': X_train.columns,
    'importance':lgbm.feature_importance()
}).sort_values(by='importance')
保存要提交的csv
def f(x):
    """Round a predicted probability to 3 decimal places for submission.

    (An earlier experiment that clipped predictions to [0.1, 0.9] lived in
    the original docstring as commented-out code; only rounding remains.)
    """
    # round(x*1000)/1000 is kept as-is (rather than round(x, 3)) so output
    # values stay bit-identical to the previous implementation.
    return round(x*1000)/1000
# Attach the bagged predictions, round each to 3 decimals via f, and write
# the submission keyed by customer_id (the index of `test`).
test['result'] = y_pred
test['result'] = test['result'].map(f)
result = pd.DataFrame(test['result'])
result.to_csv('submission1.csv')
# Notebook display of the final result table.
result
完整代码
#!/usr/bin/env python
# coding: utf-8
import datetime
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
import warnings
# Silence library warnings and show all rows/columns when printing frames.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that can hold their
    observed value range, shrinking the DataFrame's memory footprint.

    Args:
        df: DataFrame to shrink (modified in place and returned).
        verbose: print the achieved memory reduction when True.

    Returns:
        The same DataFrame with downcast numeric columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Inclusive bounds: a column whose max equals e.g. 127 fits
                # int8 exactly; the original strict </> excluded the edges
                # and promoted such columns one size too far.
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    # Already int64-representable; keep/normalize to int64.
                    df[col] = df[col].astype(np.int64)
            else:
                # NOTE(review): float16 keeps only ~3 significant decimal
                # digits, which can distort monetary columns — confirm this
                # precision loss is acceptable for the use case.
                # All-NaN columns fail every comparison and fall to float64,
                # matching the original behavior.
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
# Load the training data with downcast dtypes (reads train.csv from cwd).
df = reduce_mem_usage(pd.read_csv('train.csv'))
print(df.info())
print(df.shape) #(2306871, 29)
print(df['goods_id'].value_counts())
# Label-encode goods_id: factorize assigns dense integer codes in order of
# first appearance (missing values become -1).
df['goods_id'] = pd.factorize(df['goods_id'])[0]
df['goods_id'].value_counts()
# 对数据进行预处理,各种特征提取
def preprocess(raw, train='train'):
    """Aggregate order-level records into one feature row per customer_id.

    Args:
        raw: DataFrame of order-detail rows; must contain the customer /
            order / goods columns referenced below.
        train: unused; kept so existing call sites keep working.

    Returns:
        DataFrame indexed by customer_id holding "last behavior" columns,
        per-customer aggregate statistics, and time-derived features.
    """
    # Missing gender -> 0 ("unknown").
    data = pd.DataFrame(raw.groupby('customer_id')['customer_gender'].last().fillna(0))
    # Last goods-related behavior. NOTE: columns are selected with a list
    # (groupby(...)[['a', 'b']]); the old tuple-style groupby(...)['a', 'b']
    # indexing was deprecated and then removed in pandas 1.x/2.x.
    data[['goods_id_last', 'goods_status_last', 'goods_price_last', 'goods_has_discount_last', 'goods_list_time_last', 'goods_delist_time_last']] = raw.groupby('customer_id')[['goods_id', 'goods_status', 'goods_price', 'goods_has_discount', 'goods_list_time', 'goods_delist_time']].last()
    # Last order-related behavior. The final output column keeps its original
    # name 'order_detail_discount' (no _last suffix) for backward compatibility.
    data[['order_total_num_last', 'order_amount_last', 'order_total_payment_last', 'order_total_discount_last', 'order_pay_time_last', 'order_status_last', 'order_count_last', 'is_customer_rate_last', 'order_detail_status_last', 'order_detail_goods_num_last', 'order_detail_amount_last', 'order_detail_payment_last', 'order_detail_discount']] = raw.groupby('customer_id')[['order_total_num', 'order_amount', 'order_total_payment', 'order_total_discount', 'order_pay_time', 'order_status', 'order_count', 'is_customer_rate', 'order_detail_status', 'order_detail_goods_num', 'order_detail_amount', 'order_detail_payment', 'order_detail_discount']].last()
    # goods_price statistics: mean / min / max / std.
    data[['goods_price_mean', 'goods_price_min', 'goods_price_max', 'goods_price_std']] = raw.groupby('customer_id')['goods_price'].agg([
        ('goods_price_mean', 'mean'),
        ('goods_price_min', 'min'),
        ('goods_price_max', 'max'),
        ('goods_price_std', 'std')])
    # order_detail_payment statistics: mean / min / max / std.
    data[['order_detail_payment_mean', 'order_detail_payment_min', 'order_detail_payment_max', 'order_detail_payment_std']] = raw.groupby('customer_id')['order_detail_payment'].agg([
        ('order_detail_payment_mean', 'mean'),
        ('order_detail_payment_min', 'min'),
        ('order_detail_payment_max', 'max'),
        ('order_detail_payment_std', 'std')])
    # Number of distinct orders per customer.
    data['count'] = raw.groupby('customer_id')['order_id'].nunique()
    # Total quantity purchased (order_total_num summed per customer).
    data['order_total_sum'] = raw.groupby('customer_id')['order_total_num'].sum()
    # Last known province / city.
    data['customer_province'] = raw.groupby('customer_id')['customer_province'].last()
    data['customer_city'] = raw.groupby('customer_id')['customer_city'].last()
    # Review behavior: ratio and count. String aggregation names ('mean',
    # 'sum') avoid the pandas 2.x FutureWarning for np.mean / np.sum.
    data[['is_customer_rate_ratio', 'is_customer_rate_sum']] = raw.groupby('customer_id')['is_customer_rate'].agg([
        ('is_customer_rate_ratio', 'mean'),
        ('is_customer_rate_sum', 'sum')
    ])
    # Total goods count across order details.
    data['order_detail_total_num'] = raw.groupby('customer_id')['order_detail_goods_num'].sum()
    # Discount-eligibility statistics (sum, mean).
    data[['goods_has_discount_sum', 'goods_has_discount_mean']] = raw.groupby('customer_id')['goods_has_discount'].agg([
        ('goods_has_discount_sum', 'sum'),
        ('goods_has_discount_mean', 'mean')
    ])
    # Order payment statistics (sum, mean).
    data[['order_total_payment_sum', 'order_total_payment_mean']] = raw.groupby('customer_id')['order_total_payment'].agg([
        ('order_total_payment_sum', 'sum'),
        ('order_total_payment_mean', 'mean')
    ])

    def time2multi(x):
        # Split a 'YYYY-mm-dd HH:MM:SS' string into (month, day, weekday, hour).
        t = datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
        return pd.Series([t.month, t.day, t.weekday(), t.hour])
    # Reference epoch for the *_diff features: 2013-01-01 00:00:00.
    t_str = '2013-01-01 00:00:00'
    t = datetime.datetime.strptime(t_str, '%Y-%m-%d %H:%M:%S')
    # Years (days/365) between the epoch and the goods list/delist timestamps.
    data['goods_list_time_diff'] = data['goods_list_time_last'].map(lambda x: (datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') - t).days/365)
    data['goods_delist_time_diff'] = data['goods_delist_time_last'].map(lambda x: (datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') - t).days/365)
    # How long the goods stayed listed.
    data['goods_diff'] = data['goods_delist_time_diff'] - data['goods_list_time_diff']
    # Calendar components of the last payment time.
    data[['order_pay_time_last_m', 'order_pay_time_last_d', 'order_pay_time_last_week', 'order_pay_time_last_h']] = data['order_pay_time_last'].apply(time2multi)
    data['order_pay_time_last_diff'] = data['order_pay_time_last'].map(lambda x: (datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') - t).days/365)
    return data
# Features come from everything paid on/before 2013-07-31; the label says
# whether the customer bought again after that cutoff (i.e. in August).
train_raw = df[df['order_pay_time'] <= '2013-07-31 23:59:59']
train_raw = preprocess(train_raw)
label_raw = set(df.loc[df['order_pay_time'] > '2013-07-31 23:59:59', 'customer_id'].dropna())
# A customer appears in train_raw at all only if they have at least one order.
train_raw['label'] = train_raw.index.map(lambda cid: int(cid in label_raw))
# Test-time features use the full purchase history.
test = preprocess(df)
import pickle
# Cache both feature tables so reruns can skip the expensive preprocessing.
for _obj, _path in ((test, 'test.pkl'), (train_raw, 'train_raw.pkl')):
    with open(_path, 'wb') as fh:
        pickle.dump(_obj, fh)
# Drop the raw timestamp strings whose calendar components were already
# extracted as separate features.
train_data = train_raw.drop(['goods_list_time_last', 'goods_delist_time_last', 'order_pay_time_last'], axis=1)
# Assign fillna results back instead of inplace=True on a column slice
# (avoids pandas' SettingWithCopy pitfalls).
train_data['customer_province'] = train_data['customer_province'].fillna('0')
train_data['customer_city'] = train_data['customer_city'].fillna('0')
from sklearn.preprocessing import LabelEncoder
# NOTE(review): the original label-encoded customer_province/customer_city,
# dropped both columns, and then called .astype('int64') on the already
# dropped columns — a guaranteed KeyError at runtime. The dead encoding and
# the broken astype calls are removed; the drop alone yields the 43-column
# table shown by train_data.info().
train_data = train_data.drop(['customer_province', 'customer_city'], axis=1)
# Categorical features for LightGBM. The original list contained
# 'order_pay_time_last_h' twice; the duplicate entry is removed.
cate1 = ['goods_id_last', 'goods_status_last', 'order_status_last', 'customer_gender', 'order_detail_status_last', 'order_pay_time_last_h', 'order_pay_time_last_week']
# Train/validation split: 5-fold CV; each fold's LightGBM model contributes
# an equal 1/5 share to the bagged test prediction.
# random_state pins the fold assignment so runs are reproducible
# (the original shuffled with no seed).
kf = KFold(n_splits=5, shuffle=True, random_state=2021)
# Hoist loop-invariant work: the original rebuilt train_data.drop('label')
# twice per fold and re-sliced `test` on every iteration.
features = train_data.drop('label', axis=1)
labels = np.array(train_data['label'])
# Align test-set columns with the training feature columns once.
test = test[features.columns]
param = {
    'num_leaves': 121,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.05,
    'metric': 'binary_logloss'
}
y_pred = 0
for train_index, test_index in kf.split(features):
    # Per-fold train / validation partitions.
    X_train, X_valid = features.iloc[train_index], features.iloc[test_index]
    y_train, y_valid = labels[train_index], labels[test_index]
    trn_data = lgb.Dataset(X_train, y_train)
    val_data = lgb.Dataset(X_valid, y_valid)
    # NOTE(review): early_stopping_rounds / verbose_eval are lightgbm<4
    # keyword arguments; lightgbm>=4 requires
    # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)] instead —
    # confirm the installed version.
    lgbm = lgb.train(param, trn_data, valid_sets=[trn_data, val_data], num_boost_round=10000, early_stopping_rounds=100, verbose_eval=50, categorical_feature=cate1)
    # Accumulate this fold's prediction with equal 1/5 weight.
    y_pred = y_pred + lgbm.predict(test)*0.2
# Feature importances of the last fold's model, ascending (notebook display).
pd.DataFrame({
    'column': X_train.columns,
    'importance':lgbm.feature_importance()
}).sort_values(by='importance')
def f(x):
    """Round a predicted probability to 3 decimal places for submission.

    (An earlier experiment that clipped predictions to [0.1, 0.9] lived in
    the original docstring as commented-out code; only rounding remains.)
    """
    # round(x*1000)/1000 is kept as-is (rather than round(x, 3)) so output
    # values stay bit-identical to the previous implementation.
    return round(x*1000)/1000
# Attach the bagged predictions, round each to 3 decimals via f, and write
# the submission keyed by customer_id (the index of `test`).
test['result'] = y_pred
test['result'] = test['result'].map(f)
result = pd.DataFrame(test['result'])
result.to_csv('submission_1.csv')