时间序列规则与baseline
本节目标:
- 了解挖掘规则做预测的重要性
- 掌握时间序列规则的基本方法
- 利用时间序列规则获得资金流入流出赛题数据的baseline
1. 时间序列规则
1.1 为什么要重视规则
• 很多比赛都可基于对背景的理解和数据分析获得有用规则,通过“if A then B“等方式设计效果良好的基准方
• 在企业、公司里也常采用基于规则的方案
• 在时间序列预测相关比赛中挖掘出的有用规则
1.2 中位数、临近数据等简单统计量
• 简单统计量,可作为特征
1.3 基于周期因子的时间序列预测
• 支付数据、客流量数据、交通数据等时间序列通常都具有明显的周期性
2. 基于时间序列规则的资金流入流出预测
2.1 以星期为周期的中位数预测
2.2 基于周期因子(以星期为周期)的预测
2.3 基于周期因子(以月份为周期)的预测
2.4 效果对比
import pandas as pd
import sklearn as skr
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil.relativedelta import relativedelta
# Load the balance data
def load_data(path: str = 'user_balance_table.csv')->pd.DataFrame:
data_balance = pd.read_csv(path)
data_balance = add_timestamp(data_balance)
return data_balance.reset_index(drop=True)
# add tiemstamp to dataset
def add_timestamp(data: pd.DataFrame, time_index: str = 'report_date')->pd.DataFrame:
data_balance = data.copy()
data_balance['date'] = pd.to_datetime(data_balance[time_index], format= "%Y%m%d")
data_balance['day'] = data_balance['date'].dt.day
data_balance['month'] = data_balance['date'].dt.month
data_balance['year'] = data_balance['date'].dt.year
data_balance['week'] = data_balance['date'].dt.week
data_balance['weekday'] = data_balance['date'].dt.weekday
return data_balance.reset_index(drop=True)
# total amount
def get_total_balance(data: pd.DataFrame, date: str = '2014-03-31')->pd.DataFrame:
df_tmp = data.copy()
df_tmp = df_tmp.groupby(['date'])['total_purchase_amt','total_redeem_amt'].sum()
df_tmp.reset_index(inplace=True)
return df_tmp[(df_tmp['date']>= date)].reset_index(drop=True)
# Generate the test data
def generate_test_data(data: pd.DataFrame)->pd.DataFrame:
total_balance = data.copy()
start = datetime.datetime(2014,9,1)
testdata = []
while start != datetime.datetime(2014,10,15):
temp = [start, np.nan, np.nan]
testdata.append(temp)
start += datetime.timedelta(days = 1)
testdata = pd.DataFrame(testdata)
testdata.columns = total_balance.columns
total_balance = pd.concat([total_balance, testdata], axis = 0)
total_balance = total_balance.reset_index(drop=True)
return total_balance.reset_index(drop=True)
# Load user's information
def load_user_information(path: str = 'user_profile_table.csv')->pd.DataFrame:
return pd.read_csv(path)
# Load the balance data
def load_data(path: str = 'user_balance_table.csv')->pd.DataFrame:
data_balance = pd.read_csv(path)
data_balance = add_timestamp(data_balance)
return data_balance.reset_index(drop=True)
# add tiemstamp to dataset
def add_timestamp(data: pd.DataFrame, time_index: str = 'report_date')->pd.DataFrame:
data_balance = data.copy()
data_balance['date'] = pd.to_datetime(data_balance[time_index], format= "%Y%m%d")
data_balance['day'] = data_balance['date'].dt.day
data_balance['month'] = data_balance['date'].dt.month
data_balance['year'] = data_balance['date'].dt.year
data_balance['week'] = data_balance['date'].dt.week
data_balance['weekday'] = data_balance['date'].dt.weekday
return data_balance.reset_index(drop=True)
# total amount
def get_total_balance(data: pd.DataFrame, date: str = '2014-03-31')->pd.DataFrame:
df_tmp = data.copy()
df_tmp = df_tmp.groupby(['date'])['total_purchase_amt','total_redeem_amt'].sum()
df_tmp.reset_index(inplace=True)
return df_tmp[(df_tmp['date']>= date)].reset_index(drop=True)
# Generate the test data
def generate_test_data(data: pd.DataFrame)->pd.DataFrame:
total_balance = data.copy()
start = datetime.datetime(2014,9,1)
testdata = []
while start != datetime.datetime(2014,10,15):
temp = [start, np.nan, np.nan]
testdata.append(temp)
start += datetime.timedelta(days = 1)
testdata = pd.DataFrame(testdata)
testdata.columns = total_balance.columns
total_balance = pd.concat([total_balance, testdata], axis = 0)
total_balance = total_balance.reset_index(drop=True)
return total_balance.reset_index(drop=True)
# Load user's information
def load_user_information(path: str = 'user_profile_table.csv')->pd.DataFrame:
return pd.read_csv(path)
# 载入数据
balance_data = load_data('user_balance_table.csv')
balance_data = add_timestamp(balance_data)
total_balance = get_total_balance(balance_data, date = '2014-03-01')
total_balance = generate_test_data(total_balance)
total_balance = add_timestamp(total_balance, 'date')
# 创建数据的深层拷贝
data = total_balance.copy()
# 定义生成时间序列规则预测结果的方法
def generate_base(df: pd.DataFrame, month_index: int)->pd.DataFrame:
# 选中固定时间段的数据集
total_balance = df.copy()
total_balance = total_balance[['date','total_purchase_amt','total_redeem_amt']]
total_balance = total_balance[(total_balance['date'] >= datetime(2014,3,1)) & (total_balance['date'] < datetime(2014, month_index, 1))]
# 加入时间戳
total_balance['weekday'] = total_balance['date'].dt.weekday
total_balance['day'] = total_balance['date'].dt.day
total_balance['week'] = total_balance['date'].dt.week
total_balance['month'] = total_balance['date'].dt.month
# 统计翌日因子
mean_of_each_weekday = total_balance[['weekday']+['total_purchase_amt','total_redeem_amt']].groupby('weekday',as_index=False).mean()
for name in ['total_purchase_amt','total_redeem_amt']:
mean_of_each_weekday = mean_of_each_weekday.rename(columns={name: name+'_weekdaymean'})
mean_of_each_weekday['total_purchase_amt_weekdaymean'] /= np.mean(total_balance['total_purchase_amt'])
mean_of_each_weekday['total_redeem_amt_weekdaymean'] /= np.mean(total_balance['total_redeem_amt'])
# 合并统计结果到原数据集
total_balance = pd.merge(total_balance, mean_of_each_weekday, on='weekday', how='left')
# 分别统计翌日在(1~31)号出现的频次
weekday_count = total_balance[['day','weekday','date']].groupby(['day','weekday'],as_index=False).count()
weekday_count = pd.merge(weekday_count, mean_of_each_weekday, on='weekday')
# 依据频次对翌日因子进行加权,获得日期因子
weekday_count['total_purchase_amt_weekdaymean'] *= weekday_count['date'] / len(np.unique(total_balance['month']))
weekday_count['total_redeem_amt_weekdaymean'] *= weekday_count['date'] / len(np.unique(total_balance['month']))
day_rate = weekday_count.drop(['weekday','date'],axis=1).groupby('day',as_index=False).sum()
# 将训练集中所有日期的均值剔除日期残差得到base
day_mean = total_balance[['day'] + ['total_purchase_amt','total_redeem_amt']].groupby('day',as_index=False).mean()
day_pre = pd.merge(day_mean, day_rate, on='day', how='left')
day_pre['total_purchase_amt'] /= day_pre['total_purchase_amt_weekdaymean']
day_pre['total_redeem_amt'] /= day_pre['total_redeem_amt_weekdaymean']
# 生成测试集数据
for index, row in day_pre.iterrows():
if month_index in (2,4,6,9) and row['day'] == 31:
break
day_pre.loc[index, 'date'] = datetime.datetime(2014, month_index, int(row['day']))
# 基于base与翌日因子获得最后的预测结果
day_pre['weekday'] = day_pre.date.dt.weekday
day_pre = day_pre[['date','weekday']+['total_purchase_amt','total_redeem_amt']]
day_pre = pd.merge(day_pre, mean_of_each_weekday,on='weekday')
day_pre['total_purchase_amt'] *= day_pre['total_purchase_amt_weekdaymean']
day_pre['total_redeem_amt'] *= day_pre['total_redeem_amt_weekdaymean']
day_pre = day_pre.sort_values('date')[['date']+['total_purchase_amt','total_redeem_amt']]
return day_pre
# 生成预测结果(以及残差)
base_list = []
for i in range(4, 10):
base_list.append(generate_base(data, i).reset_index(drop=True))
base = pd.concat(base_list).reset_index(drop=True)
for i in ['total_purchase_amt','total_redeem_amt']:
base = base.rename(columns={i: i+'_base'})
data = pd.merge(data.reset_index(drop=True), base.reset_index(drop=True), on='date', how='left').reset_index(drop=True)
data['purchase_residual'] = data['total_purchase_amt'] / data['total_purchase_amt_base']
data['redeem_residual'] = data['total_redeem_amt'] / data['total_redeem_amt_base']
# 对结果表重命名
data = data[['date','purchase_residual','redeem_residual','total_purchase_amt_base', 'total_redeem_amt_base']]
for i in data.columns:
if i == 'date':
data[i] = data[i].astype(str)
data[i] = data[i].str.replace('-','')
data.columns = [['date'] + ['total_purchase_amt','total_redeem_amt'] + ['total_purchase_predicted_by_cycle','total_redeem_predicted_by_cycle'] ]
# 保存预测结果到本地
data.to_csv('/Users/74893/数据分析/base.csv',index=False)
- 了解规则的重要性
- 熟悉基于规则的时间序列分析方法
- 针对资金流入流出预测给出baseline