规则与baseline
前言
规则的好处是简单高效且可解释性强,有时候样本量不足时,使用有用的规则比建模更有效,面对问题的时候不能只想着建模,而是根据问题去寻找解决的方案有哪些,互相比较,找到最好的解决方案
时间序列预测的规则
简单的统计量
- 中位数:很稳健
- 均值:分布较符合正态分布时适用
- 临近数据:指与待预测时刻相近的数据,例如用8点的数据预测9点的数据
周期因子
基于周的周期因子预测
周期通常有一周、一月、一年,支付行为、客流量、交通数据等时间序列都具有明显的周期性,缺点是不考虑特定日子和节假日。
若以周为周期,则如何计算周期因子?
第一种方式:先将每天的数值除以当周的均值,然后按列(周一到周日)取中位数,得到各天的周期因子
第二种方式:先获得列的均值(周内每日),再除以整体均值(整体均值就是所有值的平均值)
第二步是预测:得到周期因子后,就可以预测第四周的客流量。以第一种方式为例,预测值 = 周期因子(中位数)× base,这里的base取第三周的日均值。
base除了用最后一周的均值,还可以先去周期再取平均:用第三周每天的原始客流量除以当天对应的周期因子(中位数),再对得到的7个值取平均。这里用中位数作为周期因子,因为它在一定程度上可以代表算术平均数,且对异常值更稳健。
基于月的周期因子预测
- 对每日的申购总量和赎回总量都求均值
- 统计周一到周日的频次
- 使用基于周的周期因子作为加权均值
- 根据因子和每日均值去计算预测值
规则应用到实战中
以星期为周期的中位数预测
# Split the daily series by date: 2014-03-31..2014-08-03 is used for fitting,
# 2014-08-04..2014-08-31 is held out.
fit_mask = (data['date'] >= datetime.datetime(2014, 3, 31)) & (data['date'] <= datetime.datetime(2014, 8, 3))
holdout_mask = (data['date'] > datetime.datetime(2014, 8, 3)) & (data['date'] <= datetime.datetime(2014, 8, 31))
train = data[fit_mask]
test = data[holdout_mask]

# Result frame with one row per day from 2014-08-04 to 2014-08-31.
pred = pd.DataFrame(columns=['date', 'purchase', 'redeem'])
s = [datetime.datetime(2014, 8, day_of_month) for day_of_month in range(4, 32)]
pred['date'] = s

# Renumber rows from 0 so that the positional `.loc[i, ...]` lookups and the
# index-aligned column assignments below line up.
for frame in (train, test):
    frame.reset_index(inplace=True)
    del frame['index']

# Copy the held-out actual amounts next to the dates (aligned on the 0..n-1 index).
pred['purchase'] = test['total_purchase_amt']
pred['redeem'] = test['total_redeem_amt']
pred['date'] = pd.to_datetime(pred['date'])
# NOTE: `pred['weekday']` is 1-based (Monday == 1), while the loop below treats
# `train['weekday']` as 0-based (it adds 1 for the column name and tests == 6).
pred['weekday'] = pred['date'].dt.weekday + 1

# Pivot the training series into one row per week and one column per weekday.
# A new row starts after every weekday-6 record, so this presumably expects
# `train` to be in date order and to start on weekday 0 -- TODO confirm.
weekday_columns = ['weekday' + str(k) for k in range(1, 8)]
df_purchase = pd.DataFrame(columns=weekday_columns)
df_redeem = pd.DataFrame(columns=weekday_columns)
count = 0
for i in range(len(train)):
    day_of_week = train.loc[i, 'weekday']
    target_column = 'weekday' + str(day_of_week + 1)
    df_purchase.loc[count, target_column] = train.loc[i, 'total_purchase_amt']
    df_redeem.loc[count, target_column] = train.loc[i, 'total_redeem_amt']
    if day_of_week == 6:
        count = count + 1
def predictByWeekdayFactor(df_, weekId):  # weekday periodic factor, first method
    """Forecast one week of values from median weekday factors.

    Each row of ``df_`` is one week and the columns ``weekday1`` ..
    ``weekday7`` hold the daily values.  Every value is divided by the mean
    of its own week; the per-column median of those ratios is the weekday
    factor.  The base level is the mean of week ``weekId`` after dividing
    each of its days by the matching factor, and the forecast is
    ``base * factor`` for each weekday.

    Side effect (kept for backward compatibility): the columns ``week_mean``
    and ``weekdayN_coefficient`` are written onto ``df_``.  They are derived
    only from the seven weekday columns, so calling the function again on
    the same frame gives the same result.

    Args:
        df_: DataFrame with columns ``weekday1`` .. ``weekday7``.
        weekId: Index label of the week used as the base level.

    Returns:
        numpy array of 7 forecast values, ordered weekday1 .. weekday7.
    """
    weekday_cols = ['weekday' + str(k + 1) for k in range(7)]
    coef_list = [col + '_coefficient' for col in weekday_cols]

    # Work on the weekday columns only; a plain `df_.mean(axis=1)` would also
    # average helper columns left behind by an earlier call.
    daily = df_[weekday_cols].astype(float)
    week_mean = daily.mean(axis=1)
    coefficients = daily.div(week_mean, axis=0)
    coefficients.columns = coef_list

    df_['week_mean'] = week_mean
    for col in coef_list:
        df_[col] = coefficients[col]

    weekday_rate = coefficients.median().to_numpy()
    # De-seasonalize the base week, then average it to get the level.
    base = daily.loc[weekId].to_numpy() / weekday_rate
    Pre = np.mean(base) * weekday_rate
    return Pre
以星期为周期的流入流出预测
# Weekday periodic factor, second method: mean of each weekday divided by the
# overall mean of the series.
amount_cols = ['total_purchase_amt', 'total_redeem_amt']
week_cols = ['weekday' + str(k + 1) for k in range(7)]

# Per-weekday mean.  The columns are selected with a list: selecting several
# columns from a groupby with a bare tuple is rejected by pandas >= 2.0.
trade_weekday = train.groupby(['weekday'])[amount_cols].mean()
trade_weekday.reset_index(inplace=True)

# Overall mean of each series.
purchase_mean = np.mean(train['total_purchase_amt'])
redeem_mean = np.mean(train['total_redeem_amt'])

# Periodic factors, one value per weekday in ascending weekday order.
purchase_weekday_rate_2 = trade_weekday['total_purchase_amt'] / purchase_mean
redeem_weekday_rate_2 = trade_weekday['total_redeem_amt'] / redeem_mean

# Base level: take the last row of the weekly table, divide each day by its
# factor (de-seasonalize), and average.  The row is cast to float in case the
# weekly table holds object-dtype cells.
last_purchase_week = df_purchase_copy2.loc[len(df_purchase_copy2) - 1, week_cols].astype(float)
basePurchase_2 = np.mean(last_purchase_week * 1 / np.array(purchase_weekday_rate_2))
purchasePre_2 = basePurchase_2 * purchase_weekday_rate_2

last_redeem_week = df_redeem_copy2.loc[len(df_redeem_copy2) - 1, week_cols].astype(float)
baseRedeem_2 = np.mean(last_redeem_week * 1 / np.array(redeem_weekday_rate_2))
redeemPre_2 = baseRedeem_2 * redeem_weekday_rate_2
以月份为周期的流入流出预测
def predictByMonthDayFactor(df_, purchase_weekday_rate, redeem_weekday_rate, weekday_start=0):
    """Build a day-of-month baseline with the weekday effect removed.

    For every day of the month present in ``df_`` this computes:

    * the mean purchase and redeem amount on that day of the month;
    * how many times that day fell on each weekday (``weekday1`` ..
      ``weekday7``);
    * a day rate: the weekday factors averaged with those counts as weights;
    * ``purchasePre`` / ``redeemPre``: the daily mean divided by the day rate.

    Args:
        df_: DataFrame with columns ``day``, ``weekday``,
            ``total_purchase_amt`` and ``total_redeem_amt``.
        purchase_weekday_rate: 7 purchase weekday factors, in weekday order.
        redeem_weekday_rate: 7 redeem weekday factors, in weekday order.
        weekday_start: Value of ``df_['weekday']`` that corresponds to the
            first factor.  Defaults to 0, matching ``Series.dt.weekday``.

    Returns:
        DataFrame with one row per day of the month and the columns ``day``,
        the two mean amounts, ``weekday1`` .. ``weekday7``,
        ``purchase_day_rate``, ``redeem_day_rate``, ``purchasePre`` and
        ``redeemPre``.

    Raises:
        ValueError: if ``df_['weekday']`` has values outside
            ``weekday_start .. weekday_start + 6``.
    """
    weekday_cols = ['weekday' + str(k + 1) for k in range(7)]
    weekday_values = list(range(weekday_start, weekday_start + 7))
    if not df_['weekday'].isin(weekday_values).all():
        raise ValueError(
            "df_['weekday'] must only contain values in "
            + str(weekday_values) + "; check weekday_start"
        )

    # Mean amount for each day of the month.
    trade_day = df_.groupby(['day'])[['total_purchase_amt', 'total_redeem_amt']].mean()
    trade_day.reset_index(inplace=True)

    # Occurrence count of every (day of month, weekday) pair, laid out so that
    # row order matches trade_day and column order matches the factor order.
    counts = (
        pd.crosstab(df_['day'], df_['weekday'])
        .reindex(index=trade_day['day'].to_numpy(), columns=weekday_values, fill_value=0)
        .to_numpy()
    )
    for k, col in enumerate(weekday_cols):
        trade_day[col] = counts[:, k]

    # Count-weighted average of the weekday factors for each day of the month.
    occurrences = counts.sum(axis=1)
    trade_day['purchase_day_rate'] = counts @ np.asarray(purchase_weekday_rate, dtype=float) / occurrences
    trade_day['redeem_day_rate'] = counts @ np.asarray(redeem_weekday_rate, dtype=float) / occurrences

    # Remove the weekday effect from the daily means.
    trade_day['purchasePre'] = trade_day['total_purchase_amt'] / trade_day['purchase_day_rate']
    trade_day['redeemPre'] = trade_day['total_redeem_amt'] / trade_day['redeem_day_rate']
    return trade_day
# Day-of-month baseline (weekday effect removed) estimated on the training window.
trade_day = predictByMonthDayFactor(train, purchase_weekday_rate_2, redeem_weekday_rate_2)

# Attach the baseline of the matching day of the month to every forecast date.
pred['day'] = pred['date'].dt.day
pred = pd.merge(pred, trade_day[['day', 'purchasePre', 'redeemPre']], on='day')

# Re-apply the weekday factor of each forecast date.  `pred['weekday']` is
# 1-based, so subtract 1 to get the label used to look up the factor series.
factor_label = pred['weekday'] - 1
pred['purchasePre'] = pred['purchasePre'] * factor_label.map(purchase_weekday_rate_2)
pred['redeemPre'] = pred['redeemPre'] * factor_label.map(redeem_weekday_rate_2)