数据集介绍:
时间维度:2023-05-01-2023-12-01
字段:[货品编码、时间、期初总量、每日实际销量]
导入数据
#导入库
import pymysql
import numpy as np
import pandas as pd
def sql_query(sql, database):
conn = pymysql.connect(host='localhost',
user='root',
password='*******',
database=database,
charset='utf8')
cursor = conn.cursor()
cursor.execute(sql)
results = cursor.fetchall()
columnDes = cursor.description
cursor.close()
conn.close()
columnNames = [columnDes[i][0] for i in range(len(columnDes))]
results = pd.DataFrame([list(i) for i in results], columns=columnNames)
return results
#历史数据、库存数据获取
sql1 = """select `货品编号`,`日期`,`实际销售量` from `历史销量明细`"""
sql2 = """select `货品编码`,`货品ID`,`日期`,`期初总量` from `库存明细汇总`"""
h_df = sql_query(sql1, '历史销量')
kc_df = sql_query(sql2, '库存明细')
数据处理:
h_df['实际销售量'] = h_df['实际销售量'].astype(float)
h_df = h_df.groupby(['货品编号', '日期']).sum().reset_index()
kc_df['期初总量'] = kc_df['期初总量'].astype(float)
kc_df['货品编码'] = kc_df['货品编码'].fillna(kc_df.groupby('货品ID')['货品编码'].transform('first'))
kc_df = kc_df.drop('货品ID', axis=1)
#该时间格式
h_df['日期'] = pd.to_datetime(h_df['日期'])
kc_df['日期'] = pd.to_datetime(kc_df['日期'])
#统一字段名称
h_df.rename(columns={'货品编号':'货品编码'}, inplace=True)
货品编码 | 日期 | 实际销售量 |
---|---|---|
0200*********** | 2023-10-30 | 1.0 |
0200*********** | 2023-10-30 | 2.0 |
0200*********** | 2023-10-30 | 3.0 |
0200*********** | 2023-10-30 | 5.0 |
0200*********** | 2023-10-30 | 0.0 |
0200*********** | 2023-10-30 | 0.0 |
... | ... | ... |
LHZ************ | 2023-10-30 | 21.0 |
PHS************ | 2023-10-30 | 59.0 |
YX0************ | 2023-10-30 | 71.0 |
YX0************ | 2023-10-30 | 8.0 |
YX0************ | 2023-10-30 | 9.0 |
YX0************ | 2023-10-30 | 900.0 |
YX0************ | 2023-10-30 | 8.0 |
货品编码 | 日期 | 期初总量 |
---|---|---|
6604************ | 2023-05-01 | 104.0 |
6602************ | 2023-05-01 | 5.0 |
6602************ | 2023-05-01 | 152.0 |
0200************ | 2023-05-01 | 25.0 |
6600************ | 2023-05-01 | 157.0 |
... | ... | ... |
6602************ | 2023-12-05 | 1.0 |
6602************ | 2023-12-05 | 67.0 |
6604************ | 2023-12-05 | 29.0 |
6602************ | 2023-12-05 | 1.0 |
6602************ | 2023-12-05 | 506.0 |
6602************ | 2023-12-05 | 102.0 |
6602************ | 2023-12-05 | 111.0 |
6602************ | 2023-12-05 | 10.0 |
数据初步探索
min_date = h_df['日期'].min()
max_date = h_df['日期'].max()
# 生成完整的日期范围
date_range = pd.date_range(start=min_date, end=max_date, freq='D')
df_len = len(date_range)
print('完整日期长度:',df_len)
kc_grouped = kc_df.groupby('货品编码').size()
kc_list = list(kc_grouped[kc_grouped > df_len].index)
print('库存异常编号:',kc_list)
h_grouped = h_df.groupby('货品编码').size()
h_list = list(h_grouped[h_grouped > df_len].index)
print('销量异常编号:',h_list)
#销量历史编号
h_id_set = set(h_df['货品编码'])
kc_id_set = set(kc_df['货品编码'])
hkc_id_set = set([*h_df['货品编码'],*kc_df['货品编码']])
print('销量编号:',len(h_id_set))
print('库存编号:',len(kc_id_set))
print('对齐编号:',len(hkc_id_set))
hid_lis_fill = h_grouped[h_df.groupby('货品编码').size()<df_len].index.to_list()
kcid_lis_fill = kc_grouped[kc_df.groupby('货品编码').size()<df_len].index.to_list()
print('销量长度小于',df_len,'的编码',hid_lis_fill[:2],'....等')
print('库存长度小于',df_len,'的编码',kcid_lis_fill[:2],'....等')
print('销量目标长度:',df_len*len(h_id_set))
print('库存目标长度:',df_len*len(kc_id_set))
完整日期长度: 219 库存异常编号: [] 销量异常编号: [] 销量编号: 4588 库存编号: 5518 对齐编号: 5618 销量长度小于 219 的编码 ['020020010099', '020020010100'] ....等 库存长度小于 219 的编码 ['020020010099', '020020010100'] ....等 销量目标长度: 1004772 库存目标长度: 1208442
并行填充缺失日期
from joblib import Parallel, delayed
#填充缺失日期--销量
def fill_date_h(id_,date_range,temp_df):
temp_df = temp_df.drop_duplicates()
#设置日期为索引
temp_df.set_index('日期', inplace=True)
temp_df = temp_df.reindex(date_range)
missing_dates = temp_df[temp_df.isnull().any(axis=1)].index
temp_df.loc[temp_df.index.isin(missing_dates), '货品编码'] = id_
temp_df.loc[temp_df.index.isin(missing_dates), '实际销售量'] = 0
return temp_df
h_df_values = Parallel(n_jobs=-1, backend="loky", batch_size='auto')(delayed(fill_date_h)(
id_,date_range,h_df[(h_df['货品编码']==id_)&(h_df['实际销售量']!=0)]) for id_ in h_id_set)
#填充缺失日期--库存
def fill_date_kc(id_,date_range,temp_df):
temp_df = temp_df.drop_duplicates()
#设置日期为索引
temp_df.set_index('日期', inplace=True)
temp_df = temp_df.reindex(date_range)
missing_dates = temp_df[temp_df.isnull().any(axis=1)].index
temp_df.loc[temp_df.index.isin(missing_dates), '货品编码'] = id_
temp_df.loc[temp_df.index.isin(missing_dates), '期初总量'] = 0
return temp_df
kc_df_values = Parallel(n_jobs=-1, backend="loky", batch_size='auto')(delayed(fill_date_kc)(
id_,date_range,kc_df[(kc_df['货品编码']==id_)&(kc_df['期初总量']!=0)]) for id_ in kc_id_set)
#重新合并销量
fill_kc_df = pd.concat(kc_df_values)
fill_kc_df = fill_kc_df.reset_index(drop=False).rename(columns={"index":"日期"})
#重新合并销量
fill_h_df = pd.concat(h_df_values)
fill_h_df = fill_h_df.reset_index(drop=False).rename(columns={"index":"日期"})
#合并销量表与库存表
hb_df = fill_kc_df.merge(fill_h_df , on=['货品编码', '日期'],how='outer')
#nan值变为0
hb_df = hb_df.fillna(0)
#量化货品编码
id_dict = dict(zip(range(len(set(hb_df['货品编码']))), set(hb_df['货品编码'])))
reversed_id_dict = {value: key for key, value in id_dict.items()}
#替换
hb_df['货品编码'] = hb_df['货品编码'].map(reversed_id_dict)
节假日日期处理
#节假日日期处理
holidays = {
"春节": "农历-01-01",
"元宵节": "农历-01-15",
"龙抬头": "农历-02-02",
"上巳节": "农历-03-03",
"清明节": "公历-04-05",
"端午节": "农历-05-05",
"七夕节": "农历-07-07",
"中元节": "农历-07-15",
"中秋节": "农历-08-15",
"重阳节": "农历-09-09",
"下元节": "农历-10-15",
"冬至节": "公历-12-22",
"除夕": "农历-12-29",
"元旦": "公历-01-01",
"妇女节": "公历-03-08",
"植树节": "公历-03-12",
"劳动节": "公历-05-01",
"青年节": "公历-05-04",
"儿童节": "公历-06-01",
"建党日": "公历-07-01",
"建军日": "公历-08-01",
"教师节": "公历-09-10",
"国庆节": "公历-10-01" ,
"双十一": "公历-11-11",
"双十二": "公历-12-12"
}
flipped_holidays = {v: k for k, v in holidays.items()}
#当前阳历日期
solar_date_start = datetime(2023,5,1).date()
lunar_date_start = LunarDate.fromSolarDate(2023,5,1)
solar_date_now = datetime.now().date()
lunar_date_now = LunarDate.today()
holidays_list = []
for k,v in flipped_holidays.items():
for a in range(solar_date_start.year,solar_date_now.year+1):
if "公历" in k:
holidays_list.append(k.replace("公历",str(a)))
for b in range(lunar_date_start.year,lunar_date_now.year+1):
if "农历" in k:
lunar_date = LunarDate(b,int(k.split("-")[1]),int(k.split("-")[2]))
lunar_date = lunar_date.toSolarDate()
lunar_date_str = f"{lunar_date.year}-{lunar_date.month}-{lunar_date.day}"
holidays_list.append(lunar_date_str)
时间特征扩充
#时间特征扩充
hb_df['日期'] = pd.to_datetime(hb_df['日期'])
hb_df['年份'] = hb_df['日期'].dt.year
hb_df['月份'] = hb_df['日期'].dt.month
hb_df['日'] = hb_df['日期'].dt.day
hb_df['星期'] = hb_df['日期'].dt.dayofweek
hb_df['周数'] = hb_df['日期'].dt.isocalendar().week
hb_df['季度'] = hb_df['日期'].dt.quarter
holidays_list = pd.to_datetime(holidays_list)
hb_df['节日前1周后1天'] = 0
for holiday in holidays_list:
start_date = holiday - pd.DateOffset(days=7)
end_date = holiday + pd.DateOffset(days=1)
hb_df.loc[(hb_df['日期'] >= start_date) & (hb_df['日期'] <= end_date), '节日前1周后1天'] = 1
hb_df['是否节假日'] = hb_df['日期'].isin(holidays_list)
hb_df['是否工作日'] = hb_df['日期'].dt.dayofweek < 5
hb_df['是否月末'] = (hb_df['日期'] + pd.offsets.MonthEnd(0)) == hb_df['日期']
hb_df['是否季末'] = (hb_df['日期'] + pd.offsets.QuarterEnd(0)) == hb_df['日期']
# hb_df = hb_df.drop('日期', axis=1)
#数据类型转换
hb_df.loc[:,hb_df.columns!="日期"] = hb_df.loc[:,hb_df.columns!="日期"].astype(float)
滞后特征扩充
# 滞后特征
horizon = 1
targets = ["实际销售量","期初总量"]
lags=list(range(horizon, horizon+7))+ [28,35,42,49]
def process_lag_target(hb_df, lag, target):
tmpe = pd.DataFrame()
tmpe[f'{target}_lag_{lag}'] = hb_df.groupby(['货品编码'],as_index=False)[target].shift(lag)
tmpe[f'{target}_MA_lag_1_{lag}_mean'] = hb_df.groupby(['货品编码'])[target].transform(lambda x: x.rolling(window=lag).mean().shift(1))
tmpe[f'{target}_MA_lag_1_{lag}_max'] = hb_df.groupby(['货品编码'])[target].transform(lambda x: x.rolling(window=lag).max().shift(1))
tmpe[f'{target}_MA_lag_1_{lag}_min'] = hb_df.groupby(['货品编码'])[target].transform(lambda x: x.rolling(window=lag).min().shift(1))
tmpe[f'{target}_MA_lag_1_{lag}_std'] = hb_df.groupby(['货品编码'])[target].transform(lambda x: x.rolling(window=lag).std().shift(1))
tmpe[f'{target}_MA_lag_1_{lag}_median'] = hb_df.groupby(['货品编码'])[target].transform(lambda x: x.rolling(window=lag).median().shift(1))
tmpe[f'{target}_lag_{lag}_var'] = hb_df.groupby(['货品编码'])[target].transform(lambda x: x.rolling(window=lag).var().shift(1))
tmpe[f'{target}_lag_{lag}_q1'] = hb_df.groupby(['货品编码'])[target].transform(lambda x: x.rolling(window=lag).quantile(0.25).shift(1))
tmpe[f'{target}_lag_{lag}_q3'] = hb_df.groupby(['货品编码'])[target].transform(lambda x: x.rolling(window=lag).quantile(0.75).shift(1))
return tmpe
results = Parallel(n_jobs=-1)(delayed(process_lag_target)(hb_df.copy(), lag, target) for lag in lags for target in targets)
hb_df = pd.concat([hb_df,*results],axis=1).fillna(0)
无用特征删除
#删除无用特征
hb_df_de = hb_df.describe()
for i in hb_df_de.columns:
if hb_df_de.loc["std",i]==0:
print("删除无用的列:")
hb_df = hb_df.drop(i, axis=1)
print(i)
删除无用的列: 年份 删除无用的列: 实际销售量_MA_lag_1_1_std 删除无用的列: 实际销售量_lag_1_var 删除无用的列: 期初总量_MA_lag_1_1_std 删除无用的列: 期初总量_lag_1_var