学习笔记-数据挖掘(时间序列)-时间序列ARIMA建模-建模预测

本文详细介绍了使用ARIMA模型进行时间序列预测的过程,包括仅使用IS特征的建模测试,多模型对比,特征劣汰后的效果,以及AIC模型平均和针对残差的建模。在不同阶段,展示了预测结果和实际图的对比,强调了残差处理和周期因子的重要性。
摘要由CSDN通过智能技术生成
import pandas as pd
import sklearn as skr
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil.relativedelta import relativedelta
from typing import *
import random
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')
np.random.seed(1024)

labels = ['total_purchase_amt', 'total_redeem_amt']
# 分割数据集

def split_data_underline(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``data`` into the offline training and validation sets.

    Training rows have ``date`` in [2014-04-01, 2014-08-01); validation rows
    have ``date`` in [2014-08-01, 2014-09-01).

    Args:
        data: Frame with a ``date`` column (datetime64, ``datetime.date``
            objects or parseable date strings).

    Returns:
        ``(trainset, testset)``, both row subsets of ``data``.
    """
    # Compare as Timestamps: recent pandas refuses to order a datetime64
    # column against plain ``datetime.date`` objects.
    dates = pd.to_datetime(data['date'])
    train_start = pd.Timestamp(2014, 4, 1)
    split_point = pd.Timestamp(2014, 8, 1)
    test_end = pd.Timestamp(2014, 9, 1)
    trainset = data[(train_start <= dates) & (dates < split_point)]
    testset = data[(split_point <= dates) & (dates < test_end)]
    return trainset, testset

def split_data_online(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``data`` into the online training set and the September target rows.

    Training rows have ``date`` in [2014-04-01, 2014-09-01); target rows have
    ``date`` in [2014-09-01, 2014-10-01).

    Args:
        data: Frame with a ``date`` column (datetime64, ``datetime.date``
            objects or parseable date strings).

    Returns:
        ``(trainset, testset)``, both row subsets of ``data``.
    """
    # Compare as Timestamps: recent pandas refuses to order a datetime64
    # column against plain ``datetime.date`` objects.
    dates = pd.to_datetime(data['date'])
    train_start = pd.Timestamp(2014, 4, 1)
    split_point = pd.Timestamp(2014, 9, 1)
    test_end = pd.Timestamp(2014, 10, 1)
    trainset = data[(train_start <= dates) & (dates < split_point)]
    testset = data[(split_point <= dates) & (dates < test_end)]
    return trainset, testset
# 定义评价函数

def AE(y: Iterable, yhat: Iterable)->Iterable:
    """Return the element-wise relative absolute error ``|y - yhat| / |y|``."""
    deviation = np.abs(y - yhat)
    magnitude = np.abs(y)
    return deviation / magnitude

def total_AE(purchasehat: Iterable, redeemhat: Iterable, purchase: Iterable, redeem: Iterable, h: float = 0.3) -> float:
    """Return the combined purchase/redeem score of a pair of predictions.

    Each day contributes ``10 * exp(-AE / h)`` where ``AE`` is the relative
    absolute error, so a perfect day is worth 10 points. The purchase total
    is weighted 0.45 and the redeem total 0.55.

    Args:
        purchasehat: Predicted purchase amounts.
        redeemhat: Predicted redeem amounts.
        purchase: Real purchase amounts.
        redeem: Real redeem amounts.
        h: Decay scale of the per-day score (a float, 0.3 by default).

    Returns:
        The weighted score as a single number.
    """
    def _score(real, predicted):
        # Vectorised form of the per-element exp/sum.
        return np.sum(np.exp(-np.asarray(AE(real, predicted)) / h) * 10)

    return _score(purchase, purchasehat) * 0.45 + _score(redeem, redeemhat) * 0.55
# 在不同的时间段对模型进行验证

def week_evalution_single(data: pd.DataFrame, model: object, types: str)->pd.DataFrame:
    """Score ``model`` on seven rolling one-month windows.

    For each window start the model is fitted on the four months before it
    and scored on the month after it. The per-window score is
    ``sum(10 * exp(-AE / 0.3))`` over the test days (higher is better).

    Args:
        data: Frame with ``date``, both label columns and the feature columns.
        model: Estimator exposing ``fit(X=..., y=...)`` and ``predict``.
        types: ``'purchase'`` or ``'redeem'``; selects ``total_<types>_amt``.

    Returns:
        A one-column frame (column ``0``) with one score per window, in the
        order 2014-08-01, 07-25, 07-18, 07-11, 07-04, 06-27, 06-20.
    """
    h = 0.3
    a_month = relativedelta(months=1)
    target = 'total_' + types + '_amt'
    # Every window shares the same columns, so build the feature list once.
    feature = [x for x in data.columns if x not in ['total_purchase_amt','total_redeem_amt','date']]
    # Compare as Timestamps: recent pandas refuses to order a datetime64
    # column against plain ``datetime.date`` objects.
    dates = pd.to_datetime(data['date'])

    def _window(lower, upper):
        # Rows whose date lies in [lower, upper).
        return data[(pd.Timestamp(lower) <= dates) & (dates < pd.Timestamp(upper))]

    results = []
    for i in [datetime.date(2014, 8, 1), datetime.date(2014, 7, 25), datetime.date(2014, 7, 18), datetime.date(2014, 7, 11),
              datetime.date(2014, 7, 4), datetime.date(2014, 6, 27), datetime.date(2014, 6, 20)]:
        trainset = _window(i - 4 * a_month, i)
        testset = _window(i, i + a_month)
        if len(testset) == 0 or len(trainset) == 0:
            # Fallback kept from the original: re-anchor the window at 2014-04-20
            # and test on everything from there up to September.
            i = datetime.date(2014, 4, 20)
            trainset = _window(i - 4 * a_month, i)
            testset = _window(i, datetime.date(2014, 9, 1))

        model.fit(X=trainset[feature], y=trainset[target])
        result_lr = model.predict(testset[feature])

        errors = AE(testset[target], result_lr)
        results.append(sum(errors.apply(lambda x : np.exp(-x/h))*10))
    return pd.DataFrame(results)
# 输出评级表格

def draw_eva_table(df: pd.DataFrame)->pd.DataFrame:
    """Return a copy of ``df`` with an ``interval`` column of window start dates.

    The seven dates run weekly backwards from 2014-08-01 to 2014-06-20, so
    ``df`` is expected to hold one row per evaluation window.
    """
    newest = datetime.date(2014, 8, 1)
    window_starts = [newest - datetime.timedelta(weeks=k) for k in range(7)]
    return df.assign(interval=window_starts)
# 对生成结果进行可视化

def _plot_prediction(dates: pd.Series, predicted: Iterable, real: pd.Series, kind: str) -> None:
    """Draw a predicted-vs-real line chart, then open a bar chart of the per-day error."""
    plt.figure(figsize=(10,4))
    plt.plot(dates, predicted, label='predicted_' + kind)
    plt.plot(dates, real, label='real_' + kind)

    plt.legend(loc='best')
    plt.title("The distribution of real and predict " + kind)
    plt.xlabel("Time")
    plt.ylabel("Amount")
    plt.show()
    plt.figure(figsize=(10,4))
    # x/y are passed by keyword: newer seaborn no longer accepts them positionally.
    sns.barplot(x=dates.dt.day, y=predicted - real)


def visual(result_purchase_lr: Iterable, result_redeem_lr: Iterable, testset: pd.DataFrame)->None:
    """Plot predictions against the real values for purchase, then for redeem.

    For each target a line chart (predicted vs. real over ``testset['date']``)
    is shown, followed by a bar chart of ``predicted - real`` per day of month.

    Args:
        result_purchase_lr: Predicted purchase amounts, aligned with ``testset``.
        result_redeem_lr: Predicted redeem amounts, aligned with ``testset``.
        testset: Frame with a datetime ``date`` column and both label columns.
    """
    # The real purchase curve used to be labelled 'real_redeem'; the shared
    # helper now derives both labels from the target name.
    for kind, predicted in (('purchase', result_purchase_lr), ('redeem', result_redeem_lr)):
        _plot_prediction(testset['date'], predicted, testset['total_' + kind + '_amt'], kind)
# 定义提取线下最好效果特征的函数

def feature_extract(data: pd.DataFrame, model: object, types: str)->Tuple[List[str], float]:
    """Greedy forward feature selection in a random order.

    Candidate features (every column except the labels and ``date``) are
    shuffled, then added one at a time; a feature is kept only if it raises
    the mean rolling-window score from ``week_evalution_single``.

    Args:
        data: Frame with ``date``, both label columns and the feature columns.
        model: Estimator passed through to ``week_evalution_single``.
        types: ``'purchase'`` or ``'redeem'``.

    Returns:
        ``(selected_features, best_mean_score)``.
    """
    features = [x for x in data.columns if x not in labels + ['date']]
    random.shuffle(features)
    results = []
    score = -1
    for i in features:
        candidate = results + [i]
        # Average column 0 explicitly: np.mean over the whole DataFrame returns
        # a Series or a scalar depending on the pandas version.
        score_update = np.mean(week_evalution_single(data[candidate + labels + ['date']], model, types)[0])
        if score_update > score:
            score = score_update
            results.append(i)
    return results, score
    
def robust_feature_extract(data: pd.DataFrame, model: object, types: str):
    """Run ``feature_extract`` ten times and return the best-scoring feature list.

    Every run's feature list and score are printed.
    """
    best_features, best_score = [], -1
    for _ in range(10):
        candidate, candidate_score = feature_extract(data, model, types)
        if candidate_score > best_score:
            best_score, best_features = candidate_score, candidate
        print(candidate, candidate_score)
    return best_features
# 定义AIC,BIC评价指标

def AIC(L: int, delta: float, n_features: int) -> float:
    """Akaike-style criterion ``L * log10(delta) + 2 * (n_features + 1)``.

    Args:
        L: Number of training samples.
        delta: Mean squared error of the fitted model on the training set.
        n_features: Number of features used by the model.

    NOTE(review): the textbook form uses the natural log; log10 is kept here
    so existing numbers stay comparable. Confirm which base is intended.
    """
    return L * np.log10(delta) + 2 * (n_features + 1)


# This was a second ``def AIC`` that silently replaced the one above even
# though it implements the BIC penalty; it now has its own name.
def BIC(L: int, delta: float, n_features: int) -> float:
    """Bayesian-style criterion ``L * log10(delta) + (n_features + 1) * log10(L)``.

    Arguments are the same as for ``AIC``.
    """
    return L * np.log10(delta) + (n_features + 1) * np.log10(L)
# 使用AIC指标融合模型

def feature_extract_AIC(data: pd.DataFrame, model: object, types: str)->Tuple[List[str], float]:
    """Select features greedily, then rate the resulting model with ``AIC``.

    Features are shuffled and added one at a time; a feature is kept only if
    it raises the mean rolling-window score. The model is then refitted on
    the offline training split and its training MSE is fed to ``AIC``.

    Args:
        data: Frame with ``date``, both label columns and the feature columns.
        model: Estimator exposing ``fit(X=..., y=...)`` and ``predict``.
        types: ``'purchase'`` or ``'redeem'``.

    Returns:
        ``(selected_features, aic_value)``.
    """
    features = [x for x in data.columns if x not in labels + ['date']]
    random.shuffle(features)
    results = []
    # The window score is higher-is-better (a perfect day is worth 10 points),
    # so keep a feature when the score goes UP, as feature_extract does. The
    # previous "< 1e9" test kept features that made the score worse.
    test_score = -1
    for i in features:
        test_score_update = np.mean(week_evalution_single(data[results + [i] + labels + ['date']], model, types)[0])
        if test_score_update > test_score:
            test_score = test_score_update
            results.append(i)

    trainset, _ = split_data_underline(data)
    target = 'total_' + types + '_amt'
    model.fit(X=trainset[results], y=trainset[target])
    train_result_lr = model.predict(trainset[results])
    # (y_true, y_pred) order; MSE is symmetric so the value is unchanged.
    delta = mean_squared_error(trainset[target], train_result_lr)
    return results, AIC(len(trainset), delta, len(results))

def multi_model(data: pd.DataFrame, model: object, types: str, n_models: int = 100)->Tuple[List[List[str]], List[float]]:
    """Build ``n_models`` randomised feature subsets and their Akaike weights.

    Args:
        data: Frame with ``date``, both label columns and the feature columns.
        model: Estimator passed through to ``feature_extract_AIC``.
        types: ``'purchase'`` or ``'redeem'``.
        n_models: Number of subsets to draw (100 by default, as before).

    Returns:
        ``(features, weights)``: one feature list per model and the matching
        weights, which are positive and sum to 1.
    """
    features = []
    scores = []
    for _ in range(n_models):
        results_update, score_update = feature_extract_AIC(data, model, types)
        features.append(results_update)
        scores.append(score_update)
    # Akaike weight: w_i is proportional to base ** (-delta_i / 2). The old
    # code called np.power(-x / 2, 10), i.e. (-x/2) ** 10, which rewarded the
    # worst models as much as the best. Base 10 matches the log10 used in AIC.
    # Deltas are taken from the minimum so the largest raw weight is exactly 1.
    deltas = np.asarray(scores, dtype=float) - np.min(scores)
    raw = np.power(10.0, -deltas / 2)
    weights = list(raw / raw.sum())
    return features, weights
# 生成线上结果

def generate_online_result(df: pd.DataFrame, feature: Iterable, model = None, target:str = 'total_purchase_amt')->Iterable:
    """Fit on the online training split and predict the September rows.

    Args:
        df: Frame with ``date``, the label columns and the feature columns.
        feature: Names of the feature columns to use.
        model: Estimator with ``fit``/``predict``; a fresh ``LinearRegression``
            is created when omitted.
        target: Name of the label column to fit.

    Returns:
        Predictions for the test rows returned by ``split_data_online``.
    """
    # Create the default per call: a default built in the signature is a
    # single instance shared (and refitted) by every caller.
    if model is None:
        model = LinearRegression()
    trainset, testset = split_data_online(df)
    model.fit(X=trainset[feature], y=trainset[target])
    return model.predict(testset[feature])
def generate_under_result(df: pd.DataFrame, feature: Iterable, model = None, target:str = 'total_purchase_amt')->Iterable:
    """Fit on the offline training split and predict the August rows.

    Args:
        df: Frame with ``date``, the label columns and the feature columns.
        feature: Names of the feature columns to use.
        model: Estimator with ``fit``/``predict``; a fresh ``LinearRegression``
            is created when omitted.
        target: Name of the label column to fit.

    Returns:
        Predictions for the test rows returned by ``split_data_underline``.
    """
    # Create the default per call: a default built in the signature is a
    # single instance shared (and refitted) by every caller.
    if model is None:
        model = LinearRegression()
    trainset, testset = split_data_underline(df)
    model.fit(X=trainset[feature], y=trainset[target])
    return model.predict(testset[feature])
# 生成线上提交的格式

def normalize_upload_file(result_purchase_lr: Iterable, result_redeem_lr: Iterable, testset: pd.DataFrame)->pd.DataFrame:
    """Build the submission table from two prediction vectors.

    Args:
        result_purchase_lr: Predicted purchase amounts, one per row of ``testset``.
        result_redeem_lr: Predicted redeem amounts, one per row of ``testset``.
        testset: Frame providing the ``date`` column; it is not modified.

    Returns:
        A new frame with columns ``date`` (string with the dashes removed,
        e.g. ``'20140901'``), ``total_purchase_amt`` and ``total_redeem_amt``.
    """
    # Work on a copy: the old version wrote the predictions into the caller's
    # testset, overwriting its real label columns.
    online_upload = testset[['date']].copy()
    online_upload['total_purchase_amt'] = result_purchase_lr
    online_upload['total_redeem_amt'] = result_redeem_lr
    online_upload['date'] = online_upload['date'].astype(str).str.replace('-', '', regex=False)
    return online_upload
# 线上结果可视化

def draw_result(result_purchase_lr: Iterable, result_redeem_lr: Iterable, testset: pd.DataFrame):
    """Plot the predicted purchase and redeem curves against the day of month."""
    plt.figure(figsize=(10,4))
    curves = (
        (result_purchase_lr, 'online_purchase'),
        (result_redeem_lr, 'online_redeem'),
    )
    for values, tag in curves:
        plt.plot(testset['date'].dt.day, values, label=tag)

    plt.legend(loc='best')
    plt.title("The predict values")
    plt.xlabel("Time")
    plt.ylabel("Amount")
# 重载DataFrame加法

def add_two_df(df1, df2, features = None, left_a = 0.45, right_a = 0.55):
    """Return a copy of ``df1`` whose columns are ``left_a * df1 + right_a * df2``.

    When ``features`` is empty or omitted, every column of ``df1`` except
    ``'interval'`` is combined; other columns are copied from ``df1`` as-is.
    """
    blended = df1.copy()
    columns = features if features else [c for c in blended.columns if c != 'interval']
    for col in columns:
        weighted_left = blended[col] * left_a
        weighted_right = df2[col] * right_a
        blended[col] = weighted_left + weighted_right
    return blended
# 重载DataFrame乘法

def scale_df(df1, features = None, eta = 1):
    """Return a copy of ``df1`` with the chosen columns multiplied by ``eta``.

    When ``features`` is empty or omitted, every column except ``'interval'``
    is scaled.
    """
    scaled = df1.copy()
    columns = features if features else [c for c in scaled.columns if c != 'interval']
    for col in columns:
        scaled[col] *= eta
    return scaled

建模测试

一、仅使用IS特征

# Load the feature table and parse 'date' into pandas datetimes.
data = pd.read_csv('Dataset/feature0522.csv')
data['date'] = pd.to_datetime(data['date'])
# Offline split: April-July rows for training, August rows for validation.
trainset, testset = split_data_underline(data)
# Predict August for each target with the default model, using every column
# except the two labels and the date as a feature.
result_purchase_lr = generate_under_result(data, [x for x in data.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], target='total_purchase_amt')
result_redeem_lr = generate_under_result(data, [x for x in data.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], target='total_redeem_amt')

在八月份预测结果

# Combined August score: 0.45 * purchase score + 0.55 * redeem score.
total_AE(result_purchase_lr, result_redeem_lr, testset['total_purchase_amt'], testset['total_redeem_amt'])
189.31445991054966

滑窗测试结果

# Purchase score of a linear model on each of the seven rolling windows.
draw_eva_table(week_evalution_single(data, model=LinearRegression(), types = 'purchase'))
|   | 0 | interval |
|---|---|---|
| 0 | 197.277321 | 2014-08-01 |
| 1 | 167.809363 | 2014-07-25 |
| 2 | 162.569572 | 2014-07-18 |
| 3 | 159.214733 | 2014-07-11 |
| 4 | 144.062633 | 2014-07-04 |
| 5 | 142.332339 | 2014-06-27 |
| 6 | 126.240393 | 2014-06-20 |
# Redeem score of a linear model on each of the seven rolling windows.
draw_eva_table(week_evalution_single(data, LinearRegression(), 'redeem'))
|   | 0 | interval |
|---|---|---|
| 0 | 182.799392 | 2014-08-01 |
| 1 | 172.093440 | 2014-07-25 |
| 2 | 181.210211 | 2014-07-18 |
| 3 | 168.488252 | 2014-07-11 |
| 4 | 175.160622 | 2014-07-04 |
| 5 | 174.465494 | 2014-06-27 |
| 6 | 175.201245 | 2014-06-20 |

八月份预测图与真实图

# Plot predicted vs. real curves and the per-day errors for the August validation rows.
visual(result_purchase_lr, result_redeem_lr, testset)

在这里插入图片描述

在这里插入图片描述
在这里插入图片描述

在这里插入图片描述

# Refit a linear regression on April-August and predict September for both
# targets, again using every non-label, non-date column.
result_purchase_lr = generate_online_result(data, [x for x in trainset.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_purchase_amt')
result_redeem_lr = generate_online_result(data, [x for x in trainset.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_redeem_amt')

九月份预测效果图(线性)

# Rebind testset to the September rows so the plot has matching dates,
# then draw the two predicted curves.
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-QSsS0Zz8-1598254170083)(output_29_0.png)]

# Write the September predictions as a CSV with no header row and no index column.
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('20190612_only_is.csv',index=False,header=None)

二、多模型对比

def multi_model_eva(data, types:str = 'purchase'):
    """Score six regressors on the rolling windows and return one table.

    The result has an ``interval`` column first, followed by one score column
    per model, named after the text before the first '(' in the model's repr.
    """
    candidates = [
        LinearRegression(),
        DecisionTreeRegressor(),
        RandomForestRegressor(),
        GradientBoostingRegressor(),
        MLPRegressor(solver='lbfgs'),
        xgb.XGBRegressor(objective='reg:squarederror'),
    ]
    results = pd.DataFrame()
    for model in candidates:
        model_name = repr(model).split('(')[0]
        table = draw_eva_table(week_evalution_single(data, model, types)).rename(columns={0: model_name})
        # The first table seeds the result; later ones are joined on the window date.
        results = table if results.empty else pd.merge(results, table, on='interval')
    ordered = ['interval'] + [col for col in results.columns if col != 'interval']
    return results[ordered]
# Blend the per-model window scores with add_two_df's default weights:
# 0.45 * purchase + 0.55 * redeem.
add_two_df(multi_model_eva(data, 'purchase'), multi_model_eva(data, 'redeem'))
|   | interval | LinearRegression | DecisionTreeRegressor | RandomForestRegressor | GradientBoostingRegressor | MLPRegressor | XGBRegressor |
|---|---|---|---|---|---|---|---|
| 0 | 2014-08-01 | 189.314460 | 188.743896 | 187.715746 | 188.808471 | 188.744889 | 189.283918 |
| 1 | 2014-07-25 | 170.165605 | 171.123954 | 172.599158 | 171.155189 | 169.383859 | 171.483486 |
| 2 | 2014-07-18 | 172.821924 | 175.689847 | 176.236757 | 175.700583 | 174.238004 | 175.781053 |
| 3 | 2014-07-11 | 164.315168 | 167.489060 | 168.552495 | 167.497959 | 164.755084 | 167.463230 |
| 4 | 2014-07-04 | 161.166527 | 164.436476 | 163.818487 | 164.457772 | 162.257028 | 164.322969 |
| 5 | 2014-06-27 | 160.005574 | 163.849417 | 162.010456 | 163.820346 | 160.703909 | 163.713086 |
| 6 | 2014-06-20 | 153.168861 | 156.563393 | 157.063395 | 156.511847 | 156.539468 | 156.445360 |

三、劣汰后特征对比

# Load the pruned feature tables, one per target, and parse their dates.
data_purchase = pd.read_csv('Feature/purchase_feature_droped_0614.csv')
data_purchase['date'] = pd.to_datetime(data_purchase['date'])
data_redeem = pd.read_csv('Feature/redeem_feature_droped_0614.csv')
data_redeem['date'] = pd.to_datetime(data_redeem['date'])
# August predictions for each target from its own feature table (default model).
trainset_purchase, testset_purchase = split_data_underline(data_purchase)
result_purchase_lr = generate_under_result(data_purchase, [x for x in data_purchase.columns
                                                           if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                           target='total_purchase_amt')
trainset_redeem, testset_redeem = split_data_underline(data_redeem)
result_redeem_lr = generate_under_result(data_redeem, [x for x in data_redeem.columns
                                                           if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                           target='total_redeem_amt')
# Combined August score; the real values come from each target's own test split.
total_AE(result_purchase_lr, result_redeem_lr, testset_purchase['total_purchase_amt'], testset_redeem['total_redeem_amt'])
189.90232809854422
# Same six-model comparison on the pruned tables, blended 0.45 / 0.55.
add_two_df(multi_model_eva(data_purchase, 'purchase'), multi_model_eva(data_redeem, 'redeem'))
|   | interval | LinearRegression | DecisionTreeRegressor | RandomForestRegressor | GradientBoostingRegressor | MLPRegressor | XGBRegressor |
|---|---|---|---|---|---|---|---|
| 0 | 2014-08-01 | 189.902328 | 167.901814 | 170.798802 | 177.216664 | 186.847975 | 178.177163 |
| 1 | 2014-07-25 | 177.650078 | 167.752839 | 177.877918 | 169.722728 | 180.481298 | 171.346027 |
| 2 | 2014-07-18 | 182.113181 | 184.259270 | 176.164814 | 175.773043 | 180.043535 | 177.470257 |
| 3 | 2014-07-11 | 182.092305 | 178.165773 | 184.042765 | 178.501400 | 179.125274 | 181.784144 |
| 4 | 2014-07-04 | 181.210428 | 172.354412 | 164.208141 | 170.061840 | 176.928324 | 165.812636 |
| 5 | 2014-06-27 | 185.309340 | 178.630136 | 182.713216 | 189.882640 | 177.631714 | 186.246480 |
| 6 | 2014-06-20 | 169.342125 | 168.941842 | 173.833505 | 173.421845 | 169.514554 | 172.578523 |

八月份预测效果(线性)

# The real August values are read from the original table `data`; the
# predictions plotted against them come from the pruned tables above.
trainset, testset = split_data_underline(data)
visual(result_purchase_lr, result_redeem_lr, testset)

在这里插入图片描述
在这里插入图片描述

在这里插入图片描述

在这里插入图片描述

# September predictions from the pruned tables with a linear regression.
result_purchase_lr = generate_online_result(data_purchase, [x for x in data_purchase.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_purchase_amt')
result_redeem_lr = generate_online_result(data_redeem, [x for x in data_redeem.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_redeem_amt')

生成线上效果(线性)

可以看到28号的预测值很高(该周日需要上班,work on Sunday)

# Use the September rows of `data` for the x-axis dates, then plot both predicted curves.
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-0WlLjKHq-1598254170104)(output_46_0.png)]

purchase feature

‘dis_to_nowork’, ‘dis_to_work’, ‘dis_from_work’, ‘purchase_weekdayrate’,
‘redeem_dayrate’, ‘weekday_onehot_5’, ‘weekday_onehot_6’,
‘dis_from_nowork’, ‘is_holiday’, ‘weekday_onehot_1’, ‘weekday_onehot_2’,
‘weekday_onehot_0’, ‘dis_from_middleofweek’, ‘dis_from_holiendday’,
‘weekday_onehot_3’, ‘is_lastday_of_holiday’, ‘is_firstday_of_holiday’,
‘weekday_onehot_4’, ‘is_worked_yestday’, ‘is_second_week’,
‘is_third_week’, ‘dis_from_startofmonth’, ‘dis_from_holiday’,
‘dis_to_nowork%%%%dis_from_purchase_peak’, ‘total_purchase_amt’,
‘total_redeem_amt’, ‘date’

Redeem feature

‘is_work’, ‘dis_from_redeem_valley’, ‘purchase_weekdayrate’,
‘redeem_dayrate’, ‘weekday_onehot_5’, ‘is_gonna_work_tomorrow’,
‘is_holiday’, ‘dis_from_nowork’, ‘weekday_onehot_0’, ‘weekday_onehot_1’,
‘is_firstday_of_holiday’, ‘weekday_onehot_2’, ‘is_lastday_of_holiday’,
‘dis_from_holiday’, ‘is_work_on_sunday’, ‘is_firstday_of_work’,
‘is_secday_of_month’, ‘dis_from_holiendday’,
‘dis_from_redeem_valley%%%%dis_from_redeem_peak’, ‘total_purchase_amt’,
‘total_redeem_amt’, ‘date’

# Write the linear-model September predictions as a header-less CSV.
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('20190614_droped.csv',index=False,header=None)

生成线上效果(MLP)

# September predictions from the pruned tables with an MLP (lbfgs solver).
# The *_lr variable names are reused even though the model is no longer linear.
result_purchase_lr = generate_online_result(data_purchase, [x for x in data_purchase.columns 
                                                            if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                            MLPRegressor(solver='lbfgs'),'total_purchase_amt')
result_redeem_lr = generate_online_result(data_redeem, [x for x in data_redeem.columns 
                                                        if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                          MLPRegressor(solver='lbfgs'),'total_redeem_amt')
# September rows of `data` supply the dates for the plot.
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-lQx7lLVn-1598254170106)(output_52_0.png)]

# Write the MLP September predictions as a header-less CSV.
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('20190614_droped_MLP.csv',index=False,header=None)

生成线上效果(Xgboost)

# September predictions from the pruned tables with XGBoost (squared-error objective).
# The *_lr variable names are reused even though the model is no longer linear.
result_purchase_lr = generate_online_result(data_purchase, [x for x in data_purchase.columns 
                                                            if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                            xgb.XGBRegressor(objective='reg:squarederror'),'total_purchase_amt')
result_redeem_lr = generate_online_result(data_redeem, [x for x in data_redeem.columns 
                                                        if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                          xgb.XGBRegressor(objective='reg:squarederror'),'total_redeem_amt')
# September rows of `data` supply the dates for the plot.
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)

在这里插入图片描述

# Write the XGBoost September predictions as a header-less CSV.
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('20190615_droped_XGB.csv',index=False,header=None)

四、AIC模型平均

# Draw the randomised feature subsets and their weights for each target,
# using a linear regression as the base model.
purchase_features, purchase_weight = multi_model(data_purchase, model=LinearRegression(), types = 'purchase')
redeem_features, redeem_weight = multi_model(data_redeem, model=LinearRegression(), types = 'redeem')
def eva_for_aic(data_purchase, purchase_features, purchase_weight, types = 'purchase'):
    """Return the weight-averaged rolling-window score table of a model set.

    Args:
        data_purchase: Feature table of the target being evaluated.
        purchase_features: One feature list per model, as returned by ``multi_model``.
        purchase_weight: Matching weight per model.
        types: ``'purchase'`` or ``'redeem'``; the label the models are scored
            on. Defaults to ``'purchase'``, the value that used to be hard-coded.

    Returns:
        A table shaped like the output of ``multi_model_eva`` whose score
        columns are the weighted sum over all models.
    """
    results = pd.DataFrame()
    for index, feature in enumerate(purchase_features):
        table = multi_model_eva(data_purchase[['date'] + labels + feature], types)
        if results.empty:
            results = scale_df(table, eta = purchase_weight[index])
        else:
            # left_a = 1 keeps the running total; only the new table is weighted.
            results = add_two_df(results, table, left_a = 1, right_a = purchase_weight[index])
    return results
# Score each model set on its own label: the redeem set was previously
# evaluated against the purchase label.
add_two_df(eva_for_aic(data_purchase, purchase_features, purchase_weight, 'purchase'),
           eva_for_aic(data_redeem, redeem_features, redeem_weight, 'redeem'))
|   | interval | LinearRegression | DecisionTreeRegressor | RandomForestRegressor | GradientBoostingRegressor | MLPRegressor | XGBRegressor |
|---|---|---|---|---|---|---|---|
| 0 | 2014-08-01 | 196.963095 | 183.792030 | 186.524287 | 190.406781 | 203.741264 | 191.603561 |
| 1 | 2014-07-25 | 166.890259 | 162.282961 | 165.155036 | 167.600603 | 170.796653 | 168.847170 |
| 2 | 2014-07-18 | 164.275651 | 164.099009 | 165.696601 | 165.439729 | 166.863106 | 166.027208 |
| 3 | 2014-07-11 | 165.406218 | 173.410333 | 175.492379 | 174.605650 | 171.207335 | 172.707320 |
| 4 | 2014-07-04 | 159.754349 | 159.385969 | 157.523226 | 159.920750 | 165.437567 | 159.528064 |
| 5 | 2014-06-27 | 170.536885 | 170.257129 | 172.330545 | 172.594357 | 173.168680 | 172.330513 |
| 6 | 2014-06-20 | 171.443469 | 162.639913 | 164.623205 | 164.944005 | 171.675562 | 166.762065 |

五、针对残差建模

# Replace the per-target tables with the residual-feature versions and parse their dates.
data_purchase = pd.read_csv('Feature/residual_feature_purchase_0621.csv')
data_purchase['date'] = pd.to_datetime(data_purchase['date'])
data_redeem = pd.read_csv('Feature/residual_feature_redeem_0621.csv')
data_redeem['date'] = pd.to_datetime(data_redeem['date'])
# Cycle-factor baseline table. Its 'date' column is not parsed here; that is
# done later inside generate_residual_result.
base = pd.read_csv('Dataset/base.csv')
def generate_residual_result(data, base, model=None, types = 'purchase', split_time = datetime.date(2014,8,1)):
    """Predict one month as ``model output * cycle-factor baseline``.

    The model is fitted on rows of ``data`` dated from 2014-04-01 up to
    ``split_time`` and predicts the following month; each prediction is then
    multiplied by the matching ``total_<types>_predicted_by_cycle`` value
    from ``base``.

    Args:
        data: Feature table with ``date``, the label columns and the features.
        base: Baseline table with a ``date`` column in ``YYYYMMDD`` form and
            the ``total_<types>_predicted_by_cycle`` columns.
        model: Estimator with ``fit``/``predict``; a fresh ``LinearRegression``
            is created when omitted.
        types: ``'purchase'`` or ``'redeem'``.
        split_time: First day of the predicted month.

    Returns:
        Array with one combined prediction per day of the predicted month.

    Side effects:
        ``base['date']`` is converted to datetimes in place. Later cells
        filter ``base`` by date and rely on this conversion.
    """
    # Create the default per call instead of sharing one instance across calls.
    if model is None:
        model = LinearRegression()
    a_month = relativedelta(months=1)
    # Compare as Timestamps: recent pandas refuses to order a datetime64
    # column against plain ``datetime.date`` objects.
    train_start = pd.Timestamp(2014, 4, 1)
    window_start = pd.Timestamp(split_time)
    window_end = pd.Timestamp(split_time + a_month)
    dates = pd.to_datetime(data['date'])
    trainset = data[(train_start <= dates) & (dates < window_start)]
    testset = data[(window_start <= dates) & (dates < window_end)]
    # Take the feature names from the table that was passed in, not from the
    # global data_purchase.
    feature = [x for x in data.columns
               if x not in ['total_purchase_amt','total_redeem_amt','date']]
    model.fit(X=trainset[feature], y=trainset['total_' + types + '_amt'])
    result_purchase_rate = model.predict(testset[feature])

    base['date'] = pd.to_datetime(base['date'], format= "%Y%m%d")
    in_window = (base['date'] >= window_start) & (base['date'] < window_end)
    result_purchase_cycle = np.array(base[in_window]['total_'+types+'_predicted_by_cycle'])
    return result_purchase_rate * result_purchase_cycle
def generate_evaluate_for_residual(model=None):
    """Score the residual approach on the seven rolling windows.

    Reads the module-level ``data_purchase``, ``data_redeem``, ``base`` and
    ``data`` tables; the real values are taken from ``data``.

    Args:
        model: Estimator used for both targets; a fresh ``LinearRegression``
            is created when omitted.

    Returns:
        A one-column frame (column ``0``) with one combined score per window.
    """
    if model is None:
        model = LinearRegression()
    a_month = relativedelta(months=1)
    data_dates = pd.to_datetime(data['date'])
    result = []
    for i in [datetime.date(2014, 8, 1), datetime.date(2014, 7, 25), datetime.date(2014, 7, 18), datetime.date(2014, 7, 11),
              datetime.date(2014, 7, 4), datetime.date(2014, 6, 27), datetime.date(2014, 6, 20)]:
        result_purchase_residual = generate_residual_result(data_purchase, base, model=model, types='purchase', split_time = i)
        # Redeem is predicted from the redeem feature table; data_purchase was
        # passed here before.
        result_redeem_residual = generate_residual_result(data_redeem, base, model=model, types='redeem', split_time= i)
        testset = data[(data_dates >= pd.Timestamp(i)) & (data_dates < pd.Timestamp(i + a_month))]
        real_purchase = testset['total_purchase_amt']
        real_redeem = testset['total_redeem_amt']
        result.append(total_AE(result_purchase_residual, result_redeem_residual, real_purchase, real_redeem))
    return pd.DataFrame(result)
def multi_model_eva_for_residual():
    """Score six regressors with the residual approach and return one table.

    The result has an ``interval`` column first, followed by one score column
    per model, named after the text before the first '(' in the model's repr.
    """
    candidates = [
        LinearRegression(),
        DecisionTreeRegressor(),
        RandomForestRegressor(),
        GradientBoostingRegressor(),
        MLPRegressor(solver='lbfgs'),
        xgb.XGBRegressor(objective='reg:squarederror'),
    ]
    results = pd.DataFrame()
    for model in candidates:
        model_name = repr(model).split('(')[0]
        table = draw_eva_table(generate_evaluate_for_residual(model)).rename(columns={0: model_name})
        # No explicit key: pandas joins on the columns the two tables share.
        results = table if results.empty else pd.merge(results, table)
    ordered = ['interval'] + [col for col in results.columns if col != 'interval']
    return results[ordered]
def generate_evaluate_for_cycle():
    """Score the pure cycle-factor baseline on the seven rolling windows.

    Reads the module-level ``base`` (predictions) and ``data`` (real values).

    Returns:
        A one-column frame named ``PureTimeSeries`` with one combined score
        per window.
    """
    a_month = relativedelta(months=1)
    # Parse the dates locally so this works whether or not
    # generate_residual_result has already converted base['date'], and compare
    # as Timestamps, which recent pandas requires for datetime64 columns.
    base_dates = pd.to_datetime(base['date'], format="%Y%m%d")
    data_dates = pd.to_datetime(data['date'])
    result = []
    for i in [datetime.date(2014, 8, 1), datetime.date(2014, 7, 25), datetime.date(2014, 7, 18), datetime.date(2014, 7, 11),
              datetime.date(2014, 7, 4), datetime.date(2014, 6, 27), datetime.date(2014, 6, 20)]:
        window_start = pd.Timestamp(i)
        window_end = pd.Timestamp(i + a_month)
        # Both slices are re-indexed from 0 so the Series line up row by row.
        cycle = base[(base_dates >= window_start) & (base_dates < window_end)].reset_index(drop=True)
        real = data[(data_dates >= window_start) & (data_dates < window_end)].reset_index(drop=True)
        result.append(total_AE(cycle['total_purchase_predicted_by_cycle'],
                               cycle['total_redeem_predicted_by_cycle'],
                               real['total_purchase_amt'],
                               real['total_redeem_amt']))
    return pd.DataFrame(result).rename(columns={0: 'PureTimeSeries'})
# Put the six residual-model score columns next to the pure cycle-factor baseline.
pd.merge(multi_model_eva_for_residual(), draw_eva_table(generate_evaluate_for_cycle()))
|   | interval | LinearRegression | DecisionTreeRegressor | RandomForestRegressor | GradientBoostingRegressor | MLPRegressor | XGBRegressor | PureTimeSeries |
|---|---|---|---|---|---|---|---|---|
| 0 | 2014-08-01 | 180.766534 | 155.580904 | 167.712675 | 171.177173 | 171.180097 | 175.335191 | 175.933714 |
| 1 | 2014-07-25 | 161.447048 | 159.351623 | 161.705225 | 162.192265 | 158.220680 | 163.794727 | 155.916275 |
| 2 | 2014-07-18 | 172.796145 | 153.972984 | 170.188452 | 169.297901 | 156.933699 | 173.068241 | 169.890622 |
| 3 | 2014-07-11 | 165.437181 | 154.819818 | 166.759593 | 161.607800 | 158.148139 | 165.108007 | 165.668307 |
| 4 | 2014-07-04 | 150.715908 | 140.131046 | 148.064148 | 147.667609 | 148.554344 | 151.872781 | 155.458113 |
| 5 | 2014-06-27 | 161.207466 | 157.575158 | 161.413391 | 160.319942 | 145.957128 | 160.949321 | 160.758547 |
| 6 | 2014-06-20 | 146.011833 | 147.507212 | 149.043333 | 154.453050 | 142.023337 | 155.435619 | 154.336379 |

(1) 只使用周期因子在8月份的预测效果

# Real August values from the original table.
_, testset = split_data_underline(data)
real_purchase = testset['total_purchase_amt']
real_redeem = testset['total_redeem_amt']
# August cycle-factor predictions. NOTE: these filters need base['date'] to
# already hold datetimes, which generate_residual_result does as a side effect.
result_purchase_cycle = np.array(base[(base['date'] >= datetime.date(2014,8,1))&(base['date'] < datetime.date(2014,9,1))]['total_purchase_predicted_by_cycle'])
result_redeem_cycle = np.array(base[(base['date'] >= datetime.date(2014,8,1))&(base['date'] < datetime.date(2014,9,1))]['total_redeem_predicted_by_cycle'])
# Combined August score of the baseline alone.
total_AE(result_purchase_cycle, result_redeem_cycle, real_purchase, real_redeem)
175.93371418259747
# Plot the cycle-factor predictions against the real August values.
trainset, testset = split_data_underline(data)
visual(result_purchase_cycle, result_redeem_cycle, testset)

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

在这里插入图片描述

(2) 只使用周期因子+预测残差在8月份的预测效果(比单纯用因子好)

# August model outputs for each target from the residual feature tables
# (default model). They are used as multipliers on the cycle-factor
# predictions in the total_AE call below.
trainset_purchase, testset_purchase = split_data_underline(data_purchase)
result_purchase_rate = generate_under_result(data_purchase, [x for x in data_purchase.columns
                                                           if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                           target='total_purchase_amt')
trainset_redeem, testset_redeem = split_data_underline(data_redeem)
result_redeem_rate = generate_under_result(data_redeem, [x for x in data_redeem.columns
                                                           if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                           target='total_redeem_amt')
# Combined August score of rate * cycle-factor prediction.
total_AE(result_purchase_rate * result_purchase_cycle, result_redeem_rate * result_redeem_cycle, real_purchase, real_redeem)
182.628220303351
# Plot the rate * cycle-factor predictions against the real August values.
trainset, testset = split_data_underline(data)
visual(result_purchase_rate * result_purchase_cycle, result_redeem_rate * result_redeem_cycle, testset)

在这里插入图片描述
在这里插入图片描述

在这里插入图片描述
在这里插入图片描述

(3) 生成线上结果

# September model outputs for each target from the residual feature tables (default model).
trainset_purchase, testset_purchase = split_data_online(data_purchase)
result_purchase_rate = generate_online_result(data_purchase, [x for x in data_purchase.columns
                                                           if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                           target='total_purchase_amt')
trainset_redeem, testset_redeem = split_data_online(data_redeem)
result_redeem_rate = generate_online_result(data_redeem, [x for x in data_redeem.columns
                                                           if x not in ['total_purchase_amt','total_redeem_amt','date']], 
                                           target='total_redeem_amt')
# Experimental correction: rescale each rate series so that its mean is 1.
result_purchase_rate = result_purchase_rate / np.mean(result_purchase_rate)
result_redeem_rate = result_redeem_rate / np.mean(result_redeem_rate)
# September cycle-factor predictions from the baseline table.
result_purchase_cycle = np.array(base[(base['date'] >= datetime.date(2014,9,1))&(base['date'] < datetime.date(2014,10,1))]['total_purchase_predicted_by_cycle'])
result_redeem_cycle = np.array(base[(base['date'] >= datetime.date(2014,9,1))&(base['date'] < datetime.date(2014,10,1))]['total_redeem_predicted_by_cycle'])
# Final prediction = rescaled rate * cycle-factor prediction.
result_purchase_residual = result_purchase_rate * result_purchase_cycle
result_redeem_residual = result_redeem_rate * result_redeem_cycle
月份周期因子线上结果(135)
# Plot the September cycle-factor predictions; testset_redeem only supplies the dates.
draw_result(result_purchase_cycle, result_redeem_cycle, testset_redeem)

在这里插入图片描述

残差处理后结果
# Plot the September predictions after the rate correction.
draw_result(result_purchase_residual, result_redeem_residual, testset_redeem)

在这里插入图片描述

# Write the corrected September predictions as a header-less CSV.
normalize_upload_file(result_purchase_residual, result_redeem_residual, testset_redeem).to_csv('20190622_residual_liner.csv',index=False,header=None)
仅用于修正节假日的结果
# Earlier submission read without a header row, so its columns are 0, 1 and 2;
# the loop below treats column 0 as the date and 1/2 as purchase/redeem.
result_score135 = pd.read_csv('Result/timeseries0606.csv',header=None)
# Corrected predictions, re-indexed from 0.
result_residual = normalize_upload_file(result_purchase_residual, result_redeem_residual, testset_redeem).reset_index(drop=True)
result_residual['date'] = result_residual['date'].astype(int)
# Only these dates are overwritten with the corrected predictions.
days_need_to_change = [
    20140906,
    20140907,
    20140908,
    20140928
]
# NOTE(review): rows are matched by position (same index in both tables), not
# by date; this assumes both tables list the same days in the same order.
for index,row in result_score135.iterrows():
    if row[0] in days_need_to_change:
        result_score135.loc[index, 1] =  result_residual.loc[index, 'total_purchase_amt']
        result_score135.loc[index, 2] =   result_residual.loc[index, 'total_redeem_amt']
# Save the patched submission as a header-less CSV.
result_score135.to_csv('result135_fixed_by_residual_0621.csv',index=False,header=None)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值
>