赛题简介
蚂蚁金服拥有上亿会员并且业务场景中每天都涉及大量的资金流入和流出,面对如此庞大的用户群,资金管理压力会非常大。在既保证资金流动性风险最小,又满足日常业务运转的情况下,精准地预测资金的流入流出情况变得尤为重要。此届大赛以《资金流入流出预测》为题,期望参赛者能够通过对例如余额宝用户的申购赎回数据的把握,精准预测未来每日的资金流入流出情况。对货币基金而言,资金流入意味着申购行为,资金流出为赎回行为。
赛题与数据
竞赛中使用的数据主要包含四个部分,分别为用户基本信息数据、用户申购赎回数据、收益率表和银行间拆借利率表。https://tianchi.aliyun.com/competition/entrance/231573/information
建模预测
import pandas as pd
import sklearn as skr
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil.relativedelta import relativedelta
from typing import *
import random
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import warnings
# Silence every warning for the rest of the run.
warnings.filterwarnings('ignore')
# Seed NumPy's global random generator.
# NOTE(review): the stdlib `random` module (used via random.shuffle in the
# feature-selection helpers below) is not seeded here.
np.random.seed(1024)
# Names of the two target columns (purchase and redemption totals).
labels = ['total_purchase_amt', 'total_redeem_amt']
# Split the dataset
def split_data_underline(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``data`` into the offline (local validation) train and test sets.

    Rows dated 2014-04-01 .. 2014-07-31 form the training set and rows dated
    2014-08-01 .. 2014-08-31 form the test set.

    The ``date`` column is normalised with ``pd.to_datetime`` and compared
    against ``pd.Timestamp`` bounds: recent pandas versions raise ``TypeError``
    when a datetime64 column is compared with a ``datetime.date`` object.

    Args:
        data: Frame with a ``date`` column (datetime64 or ``datetime.date``).

    Returns:
        ``(trainset, testset)`` row subsets of ``data``.
    """
    dates = pd.to_datetime(data['date'])
    in_train = (dates >= pd.Timestamp(2014, 4, 1)) & (dates < pd.Timestamp(2014, 8, 1))
    in_test = (dates >= pd.Timestamp(2014, 8, 1)) & (dates < pd.Timestamp(2014, 9, 1))
    return data[in_train], data[in_test]
def split_data_online(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``data`` into the online (submission) train and test sets.

    Rows dated 2014-04-01 .. 2014-08-31 form the training set and rows dated
    2014-09-01 .. 2014-09-30 form the test set.

    The ``date`` column is normalised with ``pd.to_datetime`` and compared
    against ``pd.Timestamp`` bounds: recent pandas versions raise ``TypeError``
    when a datetime64 column is compared with a ``datetime.date`` object.

    Args:
        data: Frame with a ``date`` column (datetime64 or ``datetime.date``).

    Returns:
        ``(trainset, testset)`` row subsets of ``data``.
    """
    dates = pd.to_datetime(data['date'])
    in_train = (dates >= pd.Timestamp(2014, 4, 1)) & (dates < pd.Timestamp(2014, 9, 1))
    in_test = (dates >= pd.Timestamp(2014, 9, 1)) & (dates < pd.Timestamp(2014, 10, 1))
    return data[in_train], data[in_test]
# Evaluation functions
def AE(y: Iterable, yhat: Iterable)->Iterable:
    """Return the element-wise absolute relative error ``|y - yhat| / |y|``.

    Both arguments must support element-wise arithmetic (NumPy arrays or
    pandas Series); the result has whatever type that arithmetic produces.
    """
    deviation = np.abs(y - yhat)
    magnitude = np.abs(y)
    return deviation / magnitude
def total_AE(purchasehat: Iterable, redeemhat: Iterable, purchase: Iterable, redeem: Iterable, h: float = 0.3) -> float:
    """Return the combined score of a purchase and a redeem forecast.

    Each element's relative error ``e`` (from ``AE``) is turned into
    ``10 * exp(-e / h)``. These are summed per series, and the two sums are
    blended with weights 0.45 (purchase) and 0.55 (redeem).

    Args:
        purchasehat: Predicted purchase amounts.
        redeemhat: Predicted redeem amounts.
        purchase: Real purchase amounts.
        redeem: Real redeem amounts.
        h: Scale of the exponential decay applied to the relative error.

    Returns:
        The weighted score as a scalar.
    """
    def _series_score(real, predicted):
        # Sum of the per-element scores for one series.
        return sum(np.exp(-err / h) * 10 for err in AE(real, predicted))

    return _series_score(purchase, purchasehat) * 0.45 + _series_score(redeem, redeemhat) * 0.55
# Validate the model over several different time windows
def week_evalution_single(data: pd.DataFrame, model: object, types: str)->pd.DataFrame:
    """Score ``model`` on seven sliding train/test windows.

    For each split date the model is fitted on the four months before it and
    evaluated on the month starting at it. If either window is empty, the
    split falls back to 2014-04-20, with the test window running up to
    2014-09-01. The score of a window is the sum over its test rows of
    ``10 * exp(-AE / 0.3)``.

    Date bounds are compared as ``pd.Timestamp`` values, because recent pandas
    versions raise ``TypeError`` when a datetime64 column is compared with a
    ``datetime.date`` object.

    Args:
        data: Frame with ``date``, both target columns and the feature columns.
        model: Estimator exposing ``fit(X=, y=)`` and ``predict``.
        types: ``'purchase'`` or ``'redeem'``; selects ``total_<types>_amt``.

    Returns:
        One-column DataFrame (column ``0``) with one score per split date, in
        the order of the split dates below.
    """
    a_month = relativedelta(months=1)
    h = 0.3
    target = 'total_' + types + '_amt'
    # The feature columns do not depend on the window, so compute them once.
    feature = [x for x in data.columns if x not in ['total_purchase_amt', 'total_redeem_amt', 'date']]
    dates = pd.to_datetime(data['date'])

    def _rows(start, end):
        # Rows of `data` whose date lies in [start, end).
        return data[(dates >= pd.Timestamp(start)) & (dates < pd.Timestamp(end))]

    results = []
    for i in [datetime.date(2014, 8, 1), datetime.date(2014, 7, 25), datetime.date(2014, 7, 18), datetime.date(2014, 7, 11),
              datetime.date(2014, 7, 4), datetime.date(2014, 6, 27), datetime.date(2014, 6, 20)]:
        trainset = _rows(i - 4 * a_month, i)
        testset = _rows(i, i + a_month)
        if len(testset) == 0 or len(trainset) == 0:
            # Fallback split used when the regular window has no rows.
            i = datetime.date(2014, 4, 20)
            trainset = _rows(i - 4 * a_month, i)
            testset = _rows(i, datetime.date(2014, 9, 1))
        model.fit(X=trainset[feature], y=trainset[target])
        result_lr = model.predict(testset[feature])
        results.append(sum(AE(testset[target], result_lr).apply(lambda x: np.exp(-x / h)) * 10))
    return pd.DataFrame(results)
# Output the evaluation table
def draw_eva_table(df: pd.DataFrame)->pd.DataFrame:
    """Return a copy of ``df`` with an ``interval`` column of split dates.

    The seven dates are the ones used by the sliding-window evaluation, in the
    same order, so ``df`` must have exactly seven rows.
    """
    month_day_pairs = [(8, 1), (7, 25), (7, 18), (7, 11), (7, 4), (6, 27), (6, 20)]
    table = df.copy()
    table['interval'] = [datetime.date(2014, month, day) for month, day in month_day_pairs]
    return table
# Visualise the generated results
def visual(result_purchase_lr: Iterable, result_redeem_lr: Iterable, testset: pd.DataFrame)->None:
    """Plot predictions against real values for purchase and redeem.

    For each of the two targets this draws a line chart of predicted vs. real
    amounts over ``testset['date']``, then a bar chart of the prediction error
    (predicted minus real) by day of month.

    Args:
        result_purchase_lr: Predicted purchase amounts, aligned with ``testset``.
        result_redeem_lr: Predicted redeem amounts, aligned with ``testset``.
        testset: Frame with a datetime ``date`` column and both target columns.
    """
    for kind, predicted in (('purchase', result_purchase_lr), ('redeem', result_redeem_lr)):
        real = testset['total_' + kind + '_amt']
        plt.figure(figsize=(10, 4))
        plt.plot(testset['date'], predicted, label='predicted_' + kind)
        # The real curve is labelled after its own target; the purchase plot
        # used to be labelled 'real_redeem'.
        plt.plot(testset['date'], real, label='real_' + kind)
        plt.legend(loc='best')
        plt.title("The distribution of real and predict " + kind)
        plt.xlabel("Time")
        plt.ylabel("Amount")
        plt.show()
        plt.figure(figsize=(10, 4))
        # x/y are passed by keyword: newer seaborn no longer accepts them positionally.
        sns.barplot(x=testset['date'].dt.day, y=predicted - real)
# Select the features that give the best offline score
def feature_extract(data: pd.DataFrame, model: object, types: str)->Tuple[List[str], float]:
    """Greedy forward feature selection in a random feature order.

    The candidate features (every column except the labels and ``date``) are
    shuffled. Each one is then tried in turn and kept only if adding it raises
    the mean sliding-window score from ``week_evalution_single``.

    The mean is taken over column ``0`` of the score table, so the score is a
    plain float, as in ``feature_extract_AIC``.

    Args:
        data: Frame with ``date``, both target columns and candidate features.
        model: Estimator exposing ``fit`` and ``predict``.
        types: ``'purchase'`` or ``'redeem'``.

    Returns:
        ``(selected_features, best_score)``.
    """
    features = [x for x in data.columns if x not in labels + ['date']]
    random.shuffle(features)
    results = []
    score = -1
    for i in features:
        candidate_columns = results + [i] + labels + ['date']
        score_update = np.mean(week_evalution_single(data[candidate_columns], model, types)[0])
        if score_update > score:
            score = score_update
            results.append(i)
    return results, score
def robust_feature_extract(data: pd.DataFrame, model: object, types: str, n_rounds: int = 10):
    """Run ``feature_extract`` several times and keep the best feature set.

    ``feature_extract`` depends on a random feature order, so it is repeated
    ``n_rounds`` times. Each round's result is printed.

    Args:
        data: Frame with ``date``, both target columns and candidate features.
        model: Estimator exposing ``fit`` and ``predict``.
        types: ``'purchase'`` or ``'redeem'``.
        n_rounds: Number of random restarts (defaults to the former fixed 10).

    Returns:
        The feature list with the highest score, or ``[]`` if no round beat
        the initial score of -1.
    """
    best_features = []
    best_score = -1
    for _ in range(n_rounds):
        round_features, round_score = feature_extract(data, model, types)
        if round_score > best_score:
            best_score = round_score
            best_features = round_features
        print(round_features, round_score)
    return best_features
# Define the AIC and BIC evaluation criteria
def AIC(L: int, delta: float, n_features: int):
    """Return ``L * log10(delta) + 2 * (n_features + 1)``.

    Args:
        L: Number of samples.
        delta: Error measure of the fitted model (callers pass the training MSE).
        n_features: Number of features used by the model.
    """
    return L * np.log10(delta) + 2 * (n_features + 1)


def BIC(L: int, delta: float, n_features: int):
    """Return ``L * log10(delta) + (n_features + 1) * log10(L)``.

    This formula used to be defined under the name ``AIC`` as well, which
    silently replaced the AIC definition above.

    Args:
        L: Number of samples.
        delta: Error measure of the fitted model.
        n_features: Number of features used by the model.
    """
    return L * np.log10(delta) + (n_features + 1) * np.log10(L)
# Feature selection scored with the AIC criterion (used for model averaging)
def feature_extract_AIC(data: pd.DataFrame, model: object, types: str)->Tuple[List[str], float]:
    """Pick a random feature subset and score it with ``AIC``.

    The candidate features are shuffled and tried one at a time. A feature is
    kept when the mean sliding-window score with it included is lower than the
    best value seen so far. The model is then refitted on the offline training
    split using the kept features, and the training MSE is passed to ``AIC``.

    NOTE(review): ``feature_extract`` keeps a feature when the same score goes
    UP; here the comparison is ``<``. Confirm this is intentional.

    Returns:
        ``(selected_features, aic_value)``.
    """
    target = 'total_' + types + '_amt'
    candidates = [col for col in data.columns if col not in labels + ['date']]
    random.shuffle(candidates)
    selected = []
    best_seen = 1e9
    for name in candidates:
        window_scores = week_evalution_single(data[selected + [name] + labels + ['date']], model, types)
        mean_score = np.mean(window_scores[0])
        if mean_score < best_seen:
            best_seen = mean_score
            selected.append(name)
    trainset, _ = split_data_underline(data)
    model.fit(X=trainset[selected], y=trainset[target])
    fitted = model.predict(trainset[selected])
    delta = mean_squared_error(fitted, trainset[target])
    #delta = np.sum(AE(trainset['total_' + types + '_amt'], train_result_lr).apply(lambda x : np.exp(-x/0.1))*10)
    return selected, AIC(len(trainset), delta, len(selected))
def multi_model(data: pd.DataFrame, model: object, types: str)->Tuple[List[List[str]], List[float]]:
    """Build 100 random feature subsets and their model-averaging weights.

    Each subset and its criterion value come from ``feature_extract_AIC``.
    The weight of subset ``i`` is proportional to ``10 ** (-delta_i / 2)``,
    where ``delta_i`` is its criterion value minus the smallest one, so a
    lower (better) value gets a larger weight. Base 10 matches the ``log10``
    used in the criterion.

    The previous code computed ``np.power(-x / 2, 10)``, i.e. ``(-x/2) ** 10``
    with base and exponent swapped. That gave large weights to any subset far
    from the average, good or bad.

    Offsetting by the minimum instead of the mean does not change the
    normalised weights, and it keeps every exponent <= 0 so nothing overflows.

    Returns:
        ``(feature_subsets, weights)``; the weights sum to 1.
    """
    features = []
    scores = []
    for _ in range(100):
        subset, criterion = feature_extract_AIC(data, model, types)
        features.append(subset)
        scores.append(criterion)
    best = np.min(scores)
    raw_weights = [np.power(10.0, -(score - best) / 2) for score in scores]
    total = np.sum(raw_weights)
    weights = [w / total for w in raw_weights]
    return features, weights
# Generate the online result
def generate_online_result(df: pd.DataFrame, feature: Iterable, model = None, target:str = 'total_purchase_amt')->Iterable:
    """Fit ``model`` on the online training split and predict the online test split.

    Args:
        df: Frame with ``date``, the target column and the feature columns.
        feature: Names of the feature columns to use.
        model: Estimator exposing ``fit`` and ``predict``. When omitted, a new
            ``LinearRegression`` is created on each call, so no estimator
            instance is shared between calls through the default argument.
        target: Name of the column to predict.

    Returns:
        The model's predictions for the online test split.
    """
    if model is None:
        model = LinearRegression()
    trainset, testset = split_data_online(df)
    model.fit(X=trainset[feature], y=trainset[target])
    return model.predict(testset[feature])
def generate_under_result(df: pd.DataFrame, feature: Iterable, model = None, target:str = 'total_purchase_amt')->Iterable:
    """Fit ``model`` on the offline training split and predict the offline test split.

    Args:
        df: Frame with ``date``, the target column and the feature columns.
        feature: Names of the feature columns to use.
        model: Estimator exposing ``fit`` and ``predict``. When omitted, a new
            ``LinearRegression`` is created on each call, so no estimator
            instance is shared between calls through the default argument.
        target: Name of the column to predict.

    Returns:
        The model's predictions for the offline test split.
    """
    if model is None:
        model = LinearRegression()
    trainset, testset = split_data_underline(df)
    model.fit(X=trainset[feature], y=trainset[target])
    return model.predict(testset[feature])
# Build the frame in the online submission format
def normalize_upload_file(result_purchase_lr: Iterable, result_redeem_lr: Iterable, testset: pd.DataFrame)->pd.DataFrame:
    """Return the predictions as a three-column submission frame.

    The result has the columns ``date`` (as a ``YYYYMMDD`` string),
    ``total_purchase_amt`` and ``total_redeem_amt``, and keeps the index of
    ``testset``.

    ``testset`` itself is left untouched. Previously its target columns were
    overwritten with the predictions as a side effect.

    Args:
        result_purchase_lr: Predicted purchase amounts, aligned with ``testset``.
        result_redeem_lr: Predicted redeem amounts, aligned with ``testset``.
        testset: Frame providing the ``date`` column.
    """
    online_upload = testset[['date']].copy()
    online_upload['total_purchase_amt'] = result_purchase_lr
    online_upload['total_redeem_amt'] = result_redeem_lr
    # '2014-09-01' -> '20140901'
    online_upload['date'] = online_upload['date'].astype(str).str.replace('-', '')
    return online_upload
# Visualise the online predictions
def draw_result(result_purchase_lr: Iterable, result_redeem_lr: Iterable, testset: pd.DataFrame):
    """Plot predicted purchase and redeem amounts against the day of month.

    Both curves are drawn on one new figure, using the day component of
    ``testset['date']`` as the x axis.
    """
    plt.figure(figsize=(10, 4))
    curves = (
        (result_purchase_lr, 'online_purchase'),
        (result_redeem_lr, 'online_redeem'),
    )
    for values, curve_label in curves:
        plt.plot(testset['date'].dt.day, values, label=curve_label)
    plt.legend(loc='best')
    plt.title("The predict values")
    plt.xlabel("Time")
    plt.ylabel("Amount")
# Weighted addition of two DataFrames
def add_two_df(df1, df2, features = None, left_a = 0.45, right_a = 0.55):
    """Return a copy of ``df1`` whose columns are ``left_a * df1 + right_a * df2``.

    Args:
        df1: Left frame; its other columns are copied unchanged.
        df2: Right frame; must contain every blended column.
        features: Columns to blend. When empty or ``None``, every column of
            ``df1`` except ``interval`` is blended.
        left_a: Weight applied to ``df1``.
        right_a: Weight applied to ``df2``.
    """
    blended = df1.copy()
    if not features:
        features = [col for col in blended.columns if col != 'interval']
    for col in features:
        left_part = blended[col] * left_a
        right_part = df2[col] * right_a
        blended[col] = left_part + right_part
    return blended
# Scalar multiplication of a DataFrame
def scale_df(df1, features = None, eta = 1):
    """Return a copy of ``df1`` with the chosen columns multiplied by ``eta``.

    Args:
        df1: Source frame; it is not modified.
        features: Columns to scale. When empty or ``None``, every column
            except ``interval`` is scaled.
        eta: Multiplier.
    """
    scaled = df1.copy()
    if not features:
        features = [col for col in scaled.columns if col != 'interval']
    for col in features:
        scaled[col] = scaled[col] * eta
    return scaled
建模测试
仅使用IS特征
# Load the feature table and parse its date column.
data = pd.read_csv('Dataset/feature0522.csv')
data['date'] = pd.to_datetime(data['date'])
# Offline split: train on April-July 2014, test on August 2014.
trainset, testset = split_data_underline(data)
# Predict the August purchase and redeem amounts with the default model, using
# every column except the two labels and the date as features.
result_purchase_lr = generate_under_result(data, [x for x in data.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], target='total_purchase_amt')
result_redeem_lr = generate_under_result(data, [x for x in data.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], target='total_redeem_amt')
在八月份预测结果
# Combined score of the August predictions against the real August values.
total_AE(result_purchase_lr, result_redeem_lr, testset['total_purchase_amt'], testset['total_redeem_amt'])
滑窗测试结果
# Sliding-window scores of a linear regression, one row per split date,
# first for purchase and then for redeem.
draw_eva_table(week_evalution_single(data, model=LinearRegression(), types = 'purchase'))
draw_eva_table(week_evalution_single(data, LinearRegression(), 'redeem'))
八月份预测图与真实图
# Plot the August predictions against the real values.
visual(result_purchase_lr, result_redeem_lr, testset)
# Refit on the online split (train April-August, test September) and replace
# the offline predictions with the September ones.
result_purchase_lr = generate_online_result(data, [x for x in trainset.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_purchase_amt')
result_redeem_lr = generate_online_result(data, [x for x in trainset.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_redeem_amt')
九月份预测效果图(线性)
# Re-split so that `testset` is the September window matching the predictions.
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)
# Write the submission file without an index column or header row.
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('20190612_only_is.csv',index=False,header=None)
多模型对比
def multi_model_eva(data, types:str = 'purchase'):
    """Compare six regressors with the sliding-window evaluation.

    Each model is scored by ``week_evalution_single``. Its score column is
    renamed to the model's class name (the text before the first ``(`` in its
    repr), and the per-model tables are merged on ``interval``.

    Returns:
        DataFrame with ``interval`` first, then one score column per model.
    """
    candidates = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor(), GradientBoostingRegressor(), MLPRegressor(solver='lbfgs'), xgb.XGBRegressor(objective='reg:squarederror')]
    results = pd.DataFrame()
    for model in candidates:
        table = draw_eva_table(week_evalution_single(data, model, types))
        table = table.rename(columns={0: repr(model).split('(')[0]})
        results = table if results.empty else pd.merge(results, table, on='interval')
    score_columns = [x for x in results.columns if x != 'interval']
    return results[['interval'] + score_columns]
# Blend the purchase and redeem score tables with the default 0.45 / 0.55 weights.
add_two_df(multi_model_eva(data, 'purchase'), multi_model_eva(data, 'redeem'))
劣汰后特征对比
# Load the separate purchase and redeem feature tables and parse their dates.
data_purchase = pd.read_csv('Feature/purchase_feature_droped_0614.csv')
data_purchase['date'] = pd.to_datetime(data_purchase['date'])
data_redeem = pd.read_csv('Feature/redeem_feature_droped_0614.csv')
data_redeem['date'] = pd.to_datetime(data_redeem['date'])
# Offline (August) predictions, each target using its own feature table.
trainset_purchase, testset_purchase = split_data_underline(data_purchase)
result_purchase_lr = generate_under_result(data_purchase, [x for x in data_purchase.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
target='total_purchase_amt')
trainset_redeem, testset_redeem = split_data_underline(data_redeem)
result_redeem_lr = generate_under_result(data_redeem, [x for x in data_redeem.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
target='total_redeem_amt')
# Combined August score, then the per-model sliding-window comparison.
total_AE(result_purchase_lr, result_redeem_lr, testset_purchase['total_purchase_amt'], testset_redeem['total_redeem_amt'])
add_two_df(multi_model_eva(data_purchase, 'purchase'), multi_model_eva(data_redeem, 'redeem'))
八月份预测效果(线性)
# The real August values for the plot are taken from the original `data` frame.
trainset, testset = split_data_underline(data)
visual(result_purchase_lr, result_redeem_lr, testset)
# Refit on the online split to get the September predictions.
result_purchase_lr = generate_online_result(data_purchase, [x for x in data_purchase.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_purchase_amt')
result_redeem_lr = generate_online_result(data_redeem, [x for x in data_redeem.columns if x not in ['total_purchase_amt','total_redeem_amt','date']], LinearRegression(),'total_redeem_amt')
生成线上效果(线性)
# Re-split so that `testset` is the September window, then plot the predictions.
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)
purchase feature
‘dis_to_nowork’, ‘dis_to_work’, ‘dis_from_work’, ‘purchase_weekdayrate’,
‘redeem_dayrate’, ‘weekday_onehot_5’, ‘weekday_onehot_6’,
‘dis_from_nowork’, ‘is_holiday’, ‘weekday_onehot_1’, ‘weekday_onehot_2’,
‘weekday_onehot_0’, ‘dis_from_middleofweek’, ‘dis_from_holiendday’,
‘weekday_onehot_3’, ‘is_lastday_of_holiday’, ‘is_firstday_of_holiday’,
‘weekday_onehot_4’, ‘is_worked_yestday’, ‘is_second_week’,
‘is_third_week’, ‘dis_from_startofmonth’, ‘dis_from_holiday’,
‘dis_to_nowork%%%%dis_from_purchase_peak’, ‘total_purchase_amt’,
‘total_redeem_amt’, ‘date’
Redeem feature
‘is_work’, ‘dis_from_redeem_valley’, ‘purchase_weekdayrate’,
‘redeem_dayrate’, ‘weekday_onehot_5’, ‘is_gonna_work_tomorrow’,
‘is_holiday’, ‘dis_from_nowork’, ‘weekday_onehot_0’, ‘weekday_onehot_1’,
‘is_firstday_of_holiday’, ‘weekday_onehot_2’, ‘is_lastday_of_holiday’,
‘dis_from_holiday’, ‘is_work_on_sunday’, ‘is_firstday_of_work’,
‘is_secday_of_month’, ‘dis_from_holiendday’,
‘dis_from_redeem_valley%%%%dis_from_redeem_peak’, ‘total_purchase_amt’,
‘total_redeem_amt’, ‘date’
# Write the linear-model submission file without an index column or header row.
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('20190614_droped.csv',index=False,header=None)
生成线上效果(MLP)
# September predictions from an MLP regressor (lbfgs solver), one fresh model per target.
result_purchase_lr = generate_online_result(data_purchase, [x for x in data_purchase.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
MLPRegressor(solver='lbfgs'),'total_purchase_amt')
result_redeem_lr = generate_online_result(data_redeem, [x for x in data_redeem.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
MLPRegressor(solver='lbfgs'),'total_redeem_amt')
# Plot against the September window and write the submission file.
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('20190614_droped_MLP.csv',index=False,header=None)
生成线上效果(Xgboost)
# September predictions from an XGBoost regressor, one fresh model per target.
result_purchase_lr = generate_online_result(data_purchase, [x for x in data_purchase.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
xgb.XGBRegressor(objective='reg:squarederror'),'total_purchase_amt')
result_redeem_lr = generate_online_result(data_redeem, [x for x in data_redeem.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
xgb.XGBRegressor(objective='reg:squarederror'),'total_redeem_amt')
# Plot against the September window and write the submission file.
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv('20190615_droped_XGB.csv',index=False,header=None)
AIC模型平均
# Build the random feature subsets and their averaging weights for each target.
purchase_features, purchase_weight = multi_model(data_purchase, model=LinearRegression(), types = 'purchase')
redeem_features, redeem_weight = multi_model(data_redeem, model=LinearRegression(), types = 'redeem')
def eva_for_aic(data_purchase, purchase_features, purchase_weight, types='purchase'):
    """Return the weighted sum of the model-comparison tables of several feature subsets.

    For every feature subset the six-model sliding-window table is computed
    with ``multi_model_eva`` and scaled by that subset's weight; the scaled
    tables are then added up.

    Args:
        data_purchase: Frame with ``date``, both label columns and the features.
        purchase_features: List of feature-name lists.
        purchase_weight: One weight per feature subset.
        types: Target to evaluate, ``'purchase'`` or ``'redeem'``. It defaults
            to ``'purchase'``, which was previously hard-coded, so redeem data
            was evaluated against the purchase target.

    Returns:
        The weighted score table, or an empty DataFrame if there are no subsets.
    """
    results = pd.DataFrame()
    for feature, weight in zip(purchase_features, purchase_weight):
        table = multi_model_eva(data_purchase[['date'] + labels + feature], types)
        if results.empty:
            results = scale_df(table, eta=weight)
        else:
            results = add_two_df(results, table, left_a=1, right_a=weight)
    return results


# Blend the purchase and redeem ensembles, each evaluated on its own target.
add_two_df(eva_for_aic(data_purchase, purchase_features, purchase_weight, types='purchase'),
           eva_for_aic(data_redeem, redeem_features, redeem_weight, types='redeem'))
针对残差建模
# Load the residual-modelling feature tables and parse their dates.
data_purchase = pd.read_csv('Feature/residual_feature_purchase_0621.csv')
data_purchase['date'] = pd.to_datetime(data_purchase['date'])
data_redeem = pd.read_csv('Feature/residual_feature_redeem_0621.csv')
data_redeem['date'] = pd.to_datetime(data_redeem['date'])
# Baseline table read by the code below for its `total_*_predicted_by_cycle` columns.
# Its date column is not parsed here; generate_residual_result converts it
# in place (format "%Y%m%d") the first time it runs.
base = pd.read_csv('Dataset/base.csv')
def generate_residual_result(data, base, model=None, types = 'purchase', split_time = datetime.date(2014,8,1)):
    """Predict one month as (model output) x (cycle-based baseline).

    The model is fitted on rows from 2014-04-01 up to ``split_time`` and
    predicts the month starting at ``split_time``. Each prediction is then
    multiplied by the matching ``total_<types>_predicted_by_cycle`` value
    from ``base``.

    Changes from the previous version:
      * the feature columns come from ``data``, not the global ``data_purchase``;
      * date bounds are compared as ``pd.Timestamp`` values, because recent
        pandas versions raise ``TypeError`` for datetime64 vs ``datetime.date``;
      * a new ``LinearRegression`` is created when ``model`` is omitted.

    Side effect: ``base['date']`` is converted to datetime in place (format
    ``%Y%m%d``). This is kept because other code reads ``base['date']`` as
    datetimes afterwards.

    Args:
        data: Frame with ``date``, both label columns and the feature columns.
        base: Frame with ``date`` and ``total_<types>_predicted_by_cycle``.
        model: Estimator exposing ``fit`` and ``predict``.
        types: ``'purchase'`` or ``'redeem'``.
        split_time: First day of the predicted month.

    Returns:
        Array of predictions for the month starting at ``split_time``.
    """
    if model is None:
        model = LinearRegression()
    a_month = relativedelta(months=1)
    window_start = pd.Timestamp(split_time)
    window_end = pd.Timestamp(split_time + a_month)
    dates = pd.to_datetime(data['date'])
    trainset = data[(dates >= pd.Timestamp(2014, 4, 1)) & (dates < window_start)]
    testset = data[(dates >= window_start) & (dates < window_end)]
    feature = [x for x in data.columns
               if x not in ['total_purchase_amt', 'total_redeem_amt', 'date']]
    model.fit(X=trainset[feature], y=trainset['total_' + types + '_amt'])
    result_rate = model.predict(testset[feature])
    base['date'] = pd.to_datetime(base['date'], format="%Y%m%d")
    in_window = (base['date'] >= window_start) & (base['date'] < window_end)
    result_cycle = np.array(base[in_window]['total_' + types + '_predicted_by_cycle'])
    return result_rate * result_cycle


def generate_evaluate_for_residual(model=None):
    """Score the residual approach on the seven sliding split dates.

    For each split date the purchase forecast is produced from the global
    ``data_purchase`` and the redeem forecast from the global ``data_redeem``.
    The redeem forecast used to be trained on ``data_purchase``. Both
    forecasts are scored with ``total_AE`` against the real values in the
    global ``data``.

    Args:
        model: Estimator reused for every fit; a new ``LinearRegression`` is
            created when omitted.

    Returns:
        One-column DataFrame (column ``0``) with one score per split date.
    """
    if model is None:
        model = LinearRegression()
    a_month = relativedelta(months=1)
    real_dates = pd.to_datetime(data['date'])
    result = []
    for i in [datetime.date(2014, 8, 1), datetime.date(2014, 7, 25), datetime.date(2014, 7, 18), datetime.date(2014, 7, 11),
              datetime.date(2014, 7, 4), datetime.date(2014, 6, 27), datetime.date(2014, 6, 20)]:
        result_purchase_residual = generate_residual_result(data_purchase, base, model=model, types='purchase', split_time=i)
        result_redeem_residual = generate_residual_result(data_redeem, base, model=model, types='redeem', split_time=i)
        testset = data[(real_dates >= pd.Timestamp(i)) & (real_dates < pd.Timestamp(i + a_month))]
        real_purchase = testset['total_purchase_amt']
        real_redeem = testset['total_redeem_amt']
        result.append(total_AE(result_purchase_residual, result_redeem_residual, real_purchase, real_redeem))
    return pd.DataFrame(result)
def multi_model_eva_for_residual():
    """Compare six regressors under the residual approach.

    Each model is scored by ``generate_evaluate_for_residual``. Its score
    column is renamed to the model's class name (the text before the first
    ``(`` in its repr), and the per-model tables are merged on their shared
    columns.

    Returns:
        DataFrame with ``interval`` first, then one score column per model.
    """
    candidates = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor(), GradientBoostingRegressor(), MLPRegressor(solver='lbfgs'), xgb.XGBRegressor(objective='reg:squarederror')]
    results = pd.DataFrame()
    for model in candidates:
        table = draw_eva_table(generate_evaluate_for_residual(model))
        table = table.rename(columns={0: repr(model).split('(')[0]})
        results = table if results.empty else pd.merge(results, table)
    score_columns = [x for x in results.columns if x != 'interval']
    return results[['interval'] + score_columns]
def generate_evaluate_for_cycle():
    """Score the pure cycle-based baseline on the seven sliding split dates.

    For each split date the ``total_*_predicted_by_cycle`` columns of the
    global ``base`` are scored with ``total_AE`` against the real values in
    the global ``data`` for the month starting at that date.

    Dates are parsed locally and compared as ``pd.Timestamp`` values, so this
    function no longer depends on ``base['date']`` having been converted
    elsewhere. It also avoids the ``TypeError`` recent pandas versions raise
    for datetime64 vs ``datetime.date`` comparisons.

    Returns:
        One-column DataFrame named ``PureTimeSeries`` with one score per split date.
    """
    a_month = relativedelta(months=1)
    base_dates = pd.to_datetime(base['date'], format="%Y%m%d")
    real_dates = pd.to_datetime(data['date'])
    result = []
    for i in [datetime.date(2014, 8, 1), datetime.date(2014, 7, 25), datetime.date(2014, 7, 18), datetime.date(2014, 7, 11),
              datetime.date(2014, 7, 4), datetime.date(2014, 6, 27), datetime.date(2014, 6, 20)]:
        start = pd.Timestamp(i)
        end = pd.Timestamp(i + a_month)
        # Indexes are reset on both sides so that predictions and real values
        # line up by position when the two Series are subtracted.
        predicted = base[(base_dates >= start) & (base_dates < end)].reset_index(drop=True)
        result_purchase_residual = predicted['total_purchase_predicted_by_cycle']
        result_redeem_residual = predicted['total_redeem_predicted_by_cycle']
        testset = data[(real_dates >= start) & (real_dates < end)].reset_index(drop=True)
        real_purchase = testset['total_purchase_amt']
        real_redeem = testset['total_redeem_amt']
        result.append(total_AE(result_purchase_residual, result_redeem_residual, real_purchase, real_redeem))
    return pd.DataFrame(result).rename(columns={0: 'PureTimeSeries'})
# Put the residual-model scores and the pure cycle baseline side by side,
# merged on their shared columns.
pd.merge(multi_model_eva_for_residual(), draw_eva_table(generate_evaluate_for_cycle()))
只使用周期因子在8月份的预测效果
# Real August values.
_, testset = split_data_underline(data)
real_purchase = testset['total_purchase_amt']
real_redeem = testset['total_redeem_amt']
# Select the August rows of the cycle-based baseline. Dates are parsed locally
# and compared with pd.Timestamp bounds: recent pandas versions raise
# TypeError when a datetime64 column is compared with datetime.date.
base_dates = pd.to_datetime(base['date'], format="%Y%m%d")
in_august = (base_dates >= pd.Timestamp(2014, 8, 1)) & (base_dates < pd.Timestamp(2014, 9, 1))
result_purchase_cycle = np.array(base[in_august]['total_purchase_predicted_by_cycle'])
result_redeem_cycle = np.array(base[in_august]['total_redeem_predicted_by_cycle'])
# Score and plot the baseline alone.
total_AE(result_purchase_cycle, result_redeem_cycle, real_purchase, real_redeem)
trainset, testset = split_data_underline(data)
visual(result_purchase_cycle, result_redeem_cycle, testset)
只使用周期因子+预测残差在8月份的预测效果(比单纯用因子好)
# Fit the default model on each residual feature table (offline split) to get
# one multiplier per August day.
trainset_purchase, testset_purchase = split_data_underline(data_purchase)
result_purchase_rate = generate_under_result(data_purchase, [x for x in data_purchase.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
target='total_purchase_amt')
trainset_redeem, testset_redeem = split_data_underline(data_redeem)
result_redeem_rate = generate_under_result(data_redeem, [x for x in data_redeem.columns
if x not in ['total_purchase_amt','total_redeem_amt','date']],
target='total_redeem_amt')
# Score and plot (multiplier x cycle baseline) against the real August values.
total_AE(result_purchase_rate * result_purchase_cycle, result_redeem_rate * result_redeem_cycle, real_purchase, real_redeem)
trainset, testset = split_data_underline(data)
visual(result_purchase_rate * result_purchase_cycle, result_redeem_rate * result_redeem_cycle, testset)
生成线上结果
# Fit on the online split to get one multiplier per September day.
trainset_purchase, testset_purchase = split_data_online(data_purchase)
result_purchase_rate = generate_online_result(data_purchase, [x for x in data_purchase.columns
                                              if x not in ['total_purchase_amt','total_redeem_amt','date']],
                                              target='total_purchase_amt')
trainset_redeem, testset_redeem = split_data_online(data_redeem)
result_redeem_rate = generate_online_result(data_redeem, [x for x in data_redeem.columns
                                            if x not in ['total_purchase_amt','total_redeem_amt','date']],
                                            target='total_redeem_amt')
# Experimental correction: rescale the multipliers so their mean is 1.
result_purchase_rate = result_purchase_rate / np.mean(result_purchase_rate)
result_redeem_rate = result_redeem_rate / np.mean(result_redeem_rate)
# Select the September rows of the cycle-based baseline. Dates are parsed
# locally and compared with pd.Timestamp bounds: recent pandas versions raise
# TypeError when a datetime64 column is compared with datetime.date.
base_dates = pd.to_datetime(base['date'], format="%Y%m%d")
in_september = (base_dates >= pd.Timestamp(2014, 9, 1)) & (base_dates < pd.Timestamp(2014, 10, 1))
result_purchase_cycle = np.array(base[in_september]['total_purchase_predicted_by_cycle'])
result_redeem_cycle = np.array(base[in_september]['total_redeem_predicted_by_cycle'])
result_purchase_residual = result_purchase_rate * result_purchase_cycle
result_redeem_residual = result_redeem_rate * result_redeem_cycle
# Plot the uncorrected baseline for comparison.
draw_result(result_purchase_cycle, result_redeem_cycle, testset_redeem)
残差处理后的结果
# Plot the residual-corrected September predictions and write them out.
draw_result(result_purchase_residual, result_redeem_residual, testset_redeem)
normalize_upload_file(result_purchase_residual, result_redeem_residual, testset_redeem).to_csv('20190622_residual_liner.csv',index=False,header=None)
# Earlier submission (header-less CSV, so its columns are 0, 1 and 2).
result_score135 = pd.read_csv('Result/timeseries0606.csv',header=None)
# Residual predictions with a 0-based index and integer YYYYMMDD dates.
result_residual = normalize_upload_file(result_purchase_residual, result_redeem_residual, testset_redeem).reset_index(drop=True)
result_residual['date'] = result_residual['date'].astype(int)
# Days whose values are replaced by the residual predictions below.
days_need_to_change = [
20140906,
20140907,
20140908,
20140928
]
# Overwrite the selected days of the earlier submission with the residual
# predictions. Rows are matched by index label: both frames are assumed to
# list the same days in the same order.
for index, row in result_score135.iterrows():
    if row[0] not in days_need_to_change:
        continue
    result_score135.loc[index, 1] = result_residual.loc[index, 'total_purchase_amt']
    result_score135.loc[index, 2] = result_residual.loc[index, 'total_redeem_amt']
result_score135.to_csv('result135_fixed_by_residual_0621.csv',index=False,header=None)