资金流入流出预测时间序列规则与baseline
赛题:资金流入流出预测
赛题数据:竞赛中使用的数据主要包含四个部分,分别为用户基本信息数据、用户申购赎回数据、收益率表和银行间拆借利率表。
时间序列规则
- 很多比赛都可基于对背景的理解和数据分析获得有用规则,通过“if A then B“等方式设
计效果良好的基准方案。 - 在企业、公司里也常采用基于规则的方案。
时间序列规则法快速入门
基于周期因子的时间序列预测
支付数据、客流量数据、交通数据等时间序列通常都有明显的周期性。
我们首先要确定周期是月还是周还是日,确定组成一个周期的元素,结合STL分解观察周期变化。缺点就是不考虑节假日、突发事件等。
-
Step1 获得周期因子
- 两种方式
- 方式(1):除以周均值 ,然后按列取中位数
- 方式(2):季节指数的计算方式;获得每日(工作日或周末)均值,再除以整体均值
-
Step2 预测
- 乘以base
- 直接用最后一周的平均客流确定base( 建议:去周期后再平均)
基于时间序列规则的资金流入流出预测
以星期为周期的中位数预测
-
选取时段
-
按星期重新组织数据
-
提取中位数,并做预测
baseline代码
import pandas as pd
import sklearn as skr
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil.relativedelta import relativedelta
# Load the balance data
def load_data(path: str = 'user_balance_table.csv')->pd.DataFrame:
data_balance = pd.read_csv(path)
data_balance = add_timestamp(data_balance)
return data_balance.reset_index(drop=True)
# add tiemstamp to dataset
def add_timestamp(data: pd.DataFrame, time_index: str = 'report_date')->pd.DataFrame:
data_balance = data.copy()
data_balance['date'] = pd.to_datetime(data_balance[time_index], format= "%Y%m%d")
data_balance['day'] = data_balance['date'].dt.day
data_balance['month'] = data_balance['date'].dt.month
data_balance['year'] = data_balance['date'].dt.year
data_balance['week'] = data_balance['date'].dt.week
data_balance['weekday'] = data_balance['date'].dt.weekday
return data_balance.reset_index(drop=True)
# total amount
def get_total_balance(data: pd.DataFrame, date: str = '2014-03-31')->pd.DataFrame:
df_tmp = data.copy()
df_tmp = df_tmp.groupby(['date'])['total_purchase_amt','total_redeem_amt'].sum()
df_tmp.reset_index(inplace=True)
return df_tmp[(df_tmp['date']>= date)].reset_index(drop=True)
# Generate the test data
def generate_test_data(data: pd.DataFrame)->pd.DataFrame:
total_balance = data.copy()
start = datetime.datetime(2014,9,1)
testdata = []
while start != datetime.datetime(2014,10,15):
temp = [start, np.nan, np.nan]
testdata.append(temp)
start += datetime.timedelta(days = 1)
testdata = pd.DataFrame(testdata)
testdata.columns = total_balance.columns
total_balance = pd.concat([total_balance, testdata], axis = 0)
total_balance = total_balance.reset_index(drop=True)
return total_balance.reset_index(drop=True)
# Load user's information
def load_user_information(path: str = 'user_profile_table.csv')->pd.DataFrame:
return pd.read_csv(path)
# 载入数据
balance_data = load_data('C:/Users/87569/Desktop/Time Series/Purchase Redemption Data/user_balance_table.csv')
balance_data = add_timestamp(balance_data)
total_balance = get_total_balance(balance_data, date = '2014-03-01')
total_balance = generate_test_data(total_balance)
total_balance = add_timestamp(total_balance, 'date')
# 创建数据的深层拷贝
data = total_balance.copy()
# 定义生成时间序列规则预测结果的方法
def generate_base(df: pd.DataFrame, month_index: int)->pd.DataFrame:
# 选中固定时间段的数据集
total_balance = df.copy()
total_balance = total_balance[['date','total_purchase_amt','total_redeem_amt']]
total_balance = total_balance[(total_balance['date'] >= datetime.datetime(2014,3,1)) & (total_balance['date'] < datetime.datetime(2014, month_index, 1))]
# 加入时间戳
total_balance['weekday'] = total_balance['date'].dt.weekday
total_balance['day'] = total_balance['date'].dt.day
total_balance['week'] = total_balance['date'].dt.week
total_balance['month'] = total_balance['date'].dt.month
# 统计翌日因子
mean_of_each_weekday = total_balance[['weekday']+['total_purchase_amt','total_redeem_amt']].groupby('weekday',as_index=False).mean()
for name in ['total_purchase_amt','total_redeem_amt']:
mean_of_each_weekday = mean_of_each_weekday.rename(columns={name: name+'_weekdaymean'})
mean_of_each_weekday['total_purchase_amt_weekdaymean'] /= np.mean(total_balance['total_purchase_amt'])
mean_of_each_weekday['total_redeem_amt_weekdaymean'] /= np.mean(total_balance['total_redeem_amt'])
# 合并统计结果到原数据集
total_balance = pd.merge(total_balance, mean_of_each_weekday, on='weekday', how='left')
# 分别统计翌日在(1~31)号出现的频次
weekday_count = total_balance[['day','weekday','date']].groupby(['day','weekday'],as_index=False).count()
weekday_count = pd.merge(weekday_count, mean_of_each_weekday, on='weekday')
# 依据频次对翌日因子进行加权,获得日期因子
weekday_count['total_purchase_amt_weekdaymean'] *= weekday_count['date'] / len(np.unique(total_balance['month']))
weekday_count['total_redeem_amt_weekdaymean'] *= weekday_count['date'] / len(np.unique(total_balance['month']))
day_rate = weekday_count.drop(['weekday','date'],axis=1).groupby('day',as_index=False).sum()
# 将训练集中所有日期的均值剔除日期残差得到base
day_mean = total_balance[['day'] + ['total_purchase_amt','total_redeem_amt']].groupby('day',as_index=False).mean()
day_pre = pd.merge(day_mean, day_rate, on='day', how='left')
day_pre['total_purchase_amt'] /= day_pre['total_purchase_amt_weekdaymean']
day_pre['total_redeem_amt'] /= day_pre['total_redeem_amt_weekdaymean']
# 生成测试集数据
for index, row in day_pre.iterrows():
if month_index in (2,4,6,9) and row['day'] == 31:
break
day_pre.loc[index, 'date'] = datetime.datetime(2014, month_index, int(row['day']))
# 基于base与翌日因子获得最后的预测结果
day_pre['weekday'] = day_pre.date.dt.weekday
day_pre = day_pre[['date','weekday']+['total_purchase_amt','total_redeem_amt']]
day_pre = pd.merge(day_pre, mean_of_each_weekday,on='weekday')
day_pre['total_purchase_amt'] *= day_pre['total_purchase_amt_weekdaymean']
day_pre['total_redeem_amt'] *= day_pre['total_redeem_amt_weekdaymean']
day_pre = day_pre.sort_values('date')[['date']+['total_purchase_amt','total_redeem_amt']]
return day_pre
# 生成预测结果(以及残差)
base_list = []
for i in range(4, 10):
base_list.append(generate_base(data, i).reset_index(drop=True))
base = pd.concat(base_list).reset_index(drop=True)
for i in ['total_purchase_amt','total_redeem_amt']:
base = base.rename(columns={i: i+'_base'})
data = pd.merge(data.reset_index(drop=True), base.reset_index(drop=True), on='date', how='left').reset_index(drop=True)
data['purchase_residual'] = data['total_purchase_amt'] / data['total_purchase_amt_base']
data['redeem_residual'] = data['total_redeem_amt'] / data['total_redeem_amt_base']
# 对结果表重命名
data = data[['date','purchase_residual','redeem_residual','total_purchase_amt_base', 'total_redeem_amt_base']]
for i in data.columns:
if i == 'date':
data[i] = data[i].astype(str)
data[i] = data[i].str.replace('-','')
data.columns = [['date'] + ['total_purchase_amt','total_redeem_amt'] + ['total_purchase_predicted_by_cycle','total_redeem_predicted_by_cycle'] ]
# 保存预测结果到本地
data.to_csv('C:/Users/87569/Desktop/Time Series/Purchase Redemption Data/base.csv',index=False)
天池提交结果:135.5091