#!/usr/bin/env python
# coding: utf-8
# ## Please input your directory for the top level folder
# folder name : SUBMISSION MODEL
# In[ ]:
dir_ = '/home/artemis/M5/A1-Yeon/' # input only here: top-level project folder
# #### setting other directory
# In[ ]:
raw_data_dir = dir_+'2. data/'              # raw competition files
processed_data_dir = dir_+'2. data/processed/'  # preprocessed feature pickles
log_dir = dir_+'4. logs/'                   # training logs
model_dir = dir_+'5. models/'               # saved LightGBM boosters
# In[ ]:
####################################################################################
########################### 1-1. recursive model by store ##########################
####################################################################################
# In[ ]:
# ver: tag for this run; KKK shifts the training end back by whole 28-day
# folds (see END_TRAIN below) — 0 means train on the full private-LB history.
ver, KKK = 'priv', 0
# One LightGBM model is trained per store.
STORES_IDS = ['CA_1','CA_2','CA_3','CA_4','TX_1','TX_2','TX_3','WI_1','WI_2','WI_3']
# In[ ]:
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random
from multiprocessing import Pool
warnings.filterwarnings('ignore')
# In[ ]:
########################### Helpers
#################################################################################
## Seeder
def seed_everything(seed=0):
    """Fix the state of the `random` and NumPy RNGs for reproducibility."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    """Apply `func` to every element of `t_split` in parallel.

    Each element of `t_split` is mapped to one call of `func`; the
    returned frames are concatenated column-wise into one DataFrame.
    At most N_CORES (module-level constant) worker processes are used.
    """
    num_cores = min(N_CORES, len(t_split))
    # Context manager guarantees the pool is cleaned up even if a worker
    # raises — the original close()/join() pair leaked the pool on error.
    with Pool(num_cores) as pool:
        return pd.concat(pool.map(func, t_split), axis=1)
该代码定义了一个函数 df_parallelize_run,它可以并行地运行一个给定函数 func。参数 t_split 是一个任务列表,列表中的每个元素都会被单独传递给 func 进行计算。
该函数使用 np.min([N_CORES, len(t_split)]) 取可用处理器数量与 t_split 元素数量的最小值,以确保并行进程数不会超过处理器数量。
然后使用 Pool(num_cores) 创建进程池,并使用 pool.map() 并行地把 t_split 的各个元素分配给各个进程运行 func。接着使用 pd.concat(..., axis=1) 按列连接所有进程返回的结果,生成一个 Pandas DataFrame,最后使用 pool.close() 和 pool.join() 关闭并回收进程池。
最终函数返回并行计算后的 Pandas DataFrame。
# In[ ]:
########################### Helper to load data by store ID
#################################################################################
# Read data
def get_data_by_store(store):
    """Load the full feature frame for one store.

    Concatenates the base grid, price and calendar feature pickles
    column-wise, restricts rows to days >= START_TRAIN and to `store`,
    then joins the mean-encoding and lag features by row index.

    Returns:
        (df, features): the frame with ['id', 'd', TARGET] first, and the
        list of feature column names to feed the model.
    """
    # Base features: grid + prices + calendar. The first two columns of
    # the price/calendar frames duplicate id columns of the base grid,
    # hence the iloc[:, 2:] slices.
    df = pd.concat([pd.read_pickle(BASE),
                    pd.read_pickle(PRICE).iloc[:, 2:],
                    pd.read_pickle(CALENDAR).iloc[:, 2:]],
                   axis=1)

    # Keep only the configured training window and the requested store.
    # (A stray non-code annotation line that broke the module here has
    # been removed.)
    df = df[df['d'] >= START_TRAIN]
    df = df[df['store_id'] == store]

    # Mean-encoding features, aligned to the surviving row index.
    df2 = pd.read_pickle(MEAN_ENC)[mean_features]
    df2 = df2[df2.index.isin(df.index)]

    # Lag / rolling features; first 3 columns duplicate id/d/target.
    df3 = pd.read_pickle(LAGS).iloc[:, 3:]
    df3 = df3[df3.index.isin(df.index)]

    df = pd.concat([df, df2], axis=1)
    del df2
    df = pd.concat([df, df3], axis=1)
    del df3

    # Everything except identifier/date/target columns is a model feature.
    features = [col for col in list(df) if col not in remove_features]
    df = df[['id', 'd', TARGET] + features]
    df = df.reset_index(drop=True)
    return df, features
# Recombine Test set after training
def get_base_test():
    """Reload the per-store test frames and stack them into one DataFrame."""
    parts = []
    for store_id in STORES_IDS:
        part = pd.read_pickle(processed_data_dir+'test_'+store_id+'.pkl')
        part['store_id'] = store_id
        parts.append(part)
    return pd.concat(parts).reset_index(drop=True)
########################### Helper to make dynamic rolling lags
#################################################################################
def make_lag(LAG_DAY):
    """Return a one-column frame 'sales_lag_<LAG_DAY>': the target shifted
    by LAG_DAY days within each 'id' group of the global `base_test`."""
    col_name = 'sales_lag_'+str(LAG_DAY)
    lag_df = base_test[['id','d',TARGET]]
    shifted = lag_df.groupby(['id'])[TARGET].transform(lambda s: s.shift(LAG_DAY))
    # float16 keeps the recursive-prediction frame small.
    lag_df[col_name] = shifted.astype(np.float16)
    return lag_df[[col_name]]
def make_lag_roll(LAG_DAY):
    """Return one rolling-mean feature column built from the global `base_test`.

    LAG_DAY is a (shift_day, roll_wind) pair: the target is shifted by
    shift_day within each 'id' group, then averaged over roll_wind days.
    """
    shift_day, roll_wind = LAG_DAY[0], LAG_DAY[1]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    lag_df = base_test[['id','d',TARGET]]
    rolled = lag_df.groupby(['id'])[TARGET].transform(
        lambda s: s.shift(shift_day).rolling(roll_wind).mean())
    lag_df[col_name] = rolled
    return lag_df[[col_name]]
# In[ ]:
########################### Model params
#################################################################################
import lightgbm as lgb
# LightGBM hyper-parameters shared by every per-store model.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'tweedie',          # Tweedie loss suits zero-inflated sales counts
    'tweedie_variance_power': 1.1,   # between Poisson (1.0) and Gamma (2.0)
    'metric': 'rmse',
    'subsample': 0.5,                # row (bagging) fraction
    'subsample_freq': 1,             # re-bag every iteration
    'learning_rate': 0.015,
    'num_leaves': 2**11-1,           # 2047 leaves — deep trees
    'min_data_in_leaf': 2**12-1,     # 4095 — guards the big trees against overfit
    'feature_fraction': 0.5,         # column sampling per tree
    'max_bin': 100,                  # coarser histogram bins
    'n_estimators': 3000,
    'boost_from_average': False,
    'verbose': -1,                   # silence LightGBM logging
}
以上代码定义了一个LightGBM模型的参数字典,这个模型用于回归问题,具体参数如下:
boosting_type: 梯度增强算法类型,这里设置为'gbdt'(Gradient Boosting Decision Tree,梯度增强决策树)
objective: 模型优化目标,这里设置为'tweedie'(Tweedie分布)
tweedie_variance_power: Tweedie分布方差幂,用于控制方差大小,这里设置为1.1
metric: 评估指标,这里设置为均方根误差(rmse)
subsample: 样本抽样比例,这里设置为0.5,表示每次迭代随机抽取50%的样本用于训练
subsample_freq: 抽样频率,这里设置为1,表示每次迭代都进行抽样
learning_rate: 学习率,这里设置为0.015,表示每次迭代参数调整的步长
num_leaves: 每棵决策树的叶子节点数目,这里设置为2的11次方-1,即2047
min_data_in_leaf: 每个叶子节点的最小数据量,这里设置为2的12次方-1,即4095
feature_fraction: 特征抽样比例,这里设置为0.5,表示每次迭代随机抽取50%的特征用于训练
max_bin: 每个特征的最大箱数,这里设置为100,用于离散化连续特征
n_estimators: 决策树的数目,这里设置为3000
boost_from_average: 是否从平均值开始增强,这里设置为False
verbose: 是否输出冗余信息,这里设置为-1,表示不输出
# In[ ]:
# In[ ]:
########################### Vars
#################################################################################
VER = 1                       # model version tag, used in saved file names
SEED = 42                     # global RNG seed for reproducibility
seed_everything(SEED)
lgb_params['seed'] = SEED     # make LightGBM itself deterministic too
N_CORES = psutil.cpu_count()  # logical CPU count; caps the worker pool size
以上代码使用了 psutil 模块中的 cpu_count 函数,它返回当前系统的 CPU 逻辑核心数量。将其赋值给变量 N_CORES,供 df_parallelize_run 用来限制并行进程池的最大进程数。
# LIMITS and const
TARGET = 'sales'              # column we predict
START_TRAIN = 0               # first day ('d') kept when loading data
END_TRAIN = 1941 - 28*KKK     # last training day; KKK shifts back whole 28-day folds
P_HORIZON = 28                # forecast horizon in days
USE_AUX = False               # NOTE(review): not referenced in this chunk — verify downstream
# Columns never fed to the model (identifiers, date keys, and the target itself).
remove_features = ['id','state_id','store_id',
                   'date','wm_yr_wk','d',TARGET]
# Mean-encoding columns pulled from the MEAN_ENC pickle.
mean_features = ['enc_cat_id_mean','enc_cat_id_std',
                 'enc_dept_id_mean','enc_dept_id_std',
                 'enc_item_id_mean','enc_item_id_std']
# Paths of the preprocessed feature pickles.
ORIGINAL = raw_data_dir
BASE = processed_data_dir+'grid_part_1.pkl'       # base grid features
PRICE = processed_data_dir+'grid_part_2.pkl'      # price features
CALENDAR = processed_data_dir+'grid_part_3.pkl'   # calendar features
LAGS = processed_data_dir+'lags_df_28.pkl'        # precomputed lag/rolling features
MEAN_ENC = processed_data_dir+'mean_encoding_df.pkl'
# SPLITS for lags creation
SHIFT_DAY = 28
N_LAGS = 15
LAGS_SPLIT = [col for col in range(SHIFT_DAY,SHIFT_DAY+N_LAGS)]  # lag days 28..42
ROLS_SPLIT = []               # (shift, window) pairs for rolling-mean features
for i in [1,7,14]:
    for j in [7,14,30,60]:
        ROLS_SPLIT.append([i,j])
# In[ ]:
########################### Train Models
#################################################################################
# Train one LightGBM model per store; persist each booster plus the slice
# of data needed later for recursive prediction.
# (Stray non-code annotation lines that broke the script inside this loop
# have been removed.)
for store_id in STORES_IDS:
    print('Train', store_id)

    # Per-store feature frame and the list of model feature columns.
    grid_df, features_columns = get_data_by_store(store_id)

    # Masks: full training history; the last P_HORIZON days of it for
    # validation; and a 100-day tail plus the forecast horizon kept for
    # recursive prediction.
    train_mask = grid_df['d'] <= END_TRAIN
    valid_mask = train_mask & (grid_df['d'] > (END_TRAIN - P_HORIZON))
    preds_mask = (grid_df['d'] > (END_TRAIN - 100)) & (grid_df['d'] <= END_TRAIN + P_HORIZON)

    train_data = lgb.Dataset(grid_df[train_mask][features_columns],
                             label=grid_df[train_mask][TARGET])
    valid_data = lgb.Dataset(grid_df[valid_mask][features_columns],
                             label=grid_df[valid_mask][TARGET])

    # Keep only the prediction slice and drop '_tmp_' rolling columns —
    # they are rebuilt dynamically during recursive prediction.
    grid_df = grid_df[preds_mask].reset_index(drop=True)
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]

    # Blank out the target for future days so prediction cannot leak it.
    # NOTE(review): `substitute` is a view into the column's values, so
    # this assumes 'sales' already has a float dtype — confirm upstream.
    d_sales = grid_df[['d', 'sales']]
    substitute = d_sales['sales'].values
    substitute[(d_sales['d'] > END_TRAIN)] = np.nan
    grid_df['sales'] = substitute

    grid_df.to_pickle(processed_data_dir + 'test_' + store_id + '.pkl')
    del grid_df, d_sales, substitute

    # Re-seed before each fit so every store's model is reproducible.
    seed_everything(SEED)
    estimator = lgb.train(lgb_params,
                          train_data,
                          valid_sets=[valid_data],
                          verbose_eval=100,
                          )
    # display(pd.DataFrame({'name':estimator.feature_name(),
    #          'imp':estimator.feature_importance()}).sort_values('imp',ascending=False).head(25))

    # Save the fitted booster; `with` closes the file even on error
    # (the original left the handle open).
    model_name = model_dir + 'lgb_model_' + store_id + '_v' + str(VER) + '.bin'
    with open(model_name, 'wb') as model_file:
        pickle.dump(estimator, model_file)

    # Free memory before the next store.
    del train_data, valid_data, estimator
    gc.collect()

# The feature list is identical across stores; keep the last for prediction.
MODEL_FEATURES = features_columns
M5-competition第一名代码解析学习-1-1. recursive_store_TRAIN.py
于 2023-07-04 12:10:48 首次发布