import pandas as pd
import numpy as np
import datetime
import shap
import eli5
import seaborn as sns
import matplotlib.pyplot as plt
from mvtpy import mvtest
from wordcloud import WordCloud
from scipy import stats
from eli5.sklearn import PermutationImportance
from sklearn import tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from typing import *
import warnings
warnings.filterwarnings('ignore')
The sklearn.metrics.scorer module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.
The sklearn.feature_selection.base module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.feature_selection. Anything that cannot be imported from sklearn.feature_selection is now part of the private API.
# Global column-name lists, defined once for convenience in later steps.
# `labels` names the purchase / redeem total columns.
labels = [
    'total_purchase_amt',
    'total_redeem_amt',
]
# `date_indexs` names the calendar fields that add_timestamp() derives.
date_indexs = [
    'week',
    'year',
    'month',
    'weekday',
    'day',
]
# Read the raw user balance table
def load_data(path: str = 'user_balance_table.csv') -> pd.DataFrame:
    """Read the balance CSV and return it with a fresh positional index.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed table, re-indexed 0..n-1.
    """
    return pd.read_csv(path).reset_index(drop=True)
# Add a timestamp column and calendar fields to the dataset
def add_timestamp(data: pd.DataFrame, time_index: str = 'report_date') -> pd.DataFrame:
    """Return a copy of ``data`` with a parsed ``date`` column and calendar fields.

    The column named by ``time_index`` is parsed with the ``%Y%m%d`` format and
    stored as ``date``; ``day``, ``month``, ``year``, ``week`` (ISO week number)
    and ``weekday`` (Monday == 0) are derived from it. The input frame is not
    modified.

    Args:
        data: Source table containing the ``time_index`` column.
        time_index: Name of the column holding the dates to parse.

    Returns:
        The augmented copy, re-indexed 0..n-1.
    """
    data_balance = data.copy()
    stamps = pd.to_datetime(data_balance[time_index], format="%Y%m%d")
    data_balance['date'] = stamps
    data_balance['day'] = stamps.dt.day
    data_balance['month'] = stamps.dt.month
    data_balance['year'] = stamps.dt.year
    if hasattr(stamps.dt, 'isocalendar'):
        # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
        # isocalendar().week is the replacement. It yields a nullable UInt32
        # column, so cast back to a plain dtype (float only if dates are missing).
        iso_week = stamps.dt.isocalendar().week
        plain_dtype = 'float64' if iso_week.isna().any() else 'int64'
        data_balance['week'] = iso_week.astype(plain_dtype)
    else:
        # Older pandas without isocalendar(): keep the legacy accessor.
        data_balance['week'] = stamps.dt.week
    data_balance['weekday'] = stamps.dt.weekday
    return data_balance.reset_index(drop=True)
# Daily total purchase / redeem amounts
def get_total_balance(data: pd.DataFrame, date: str = '2014-03-31') -> pd.DataFrame:
    """Sum purchase and redeem amounts per day, keeping days on or after ``date``.

    Args:
        data: Table with a ``date`` column plus ``total_purchase_amt`` and
            ``total_redeem_amt`` columns.
        date: Earliest day (inclusive) to keep in the result.

    Returns:
        A frame with columns ``date``, ``total_purchase_amt`` and
        ``total_redeem_amt``, one row per day, re-indexed 0..n-1.
    """
    # Columns must be selected with a list: tuple-style selection on a groupby
    # (`[...]['a', 'b']`) was deprecated in pandas 1.0 and raises in 2.0.
    amount_columns = ['total_purchase_amt', 'total_redeem_amt']
    daily_totals = data.groupby('date')[amount_columns].sum().reset_index()
    return daily_totals[daily_totals['date'] >= date].reset_index(drop=True)
# Generate the test data (rows are created up to Oct 15 because Oct 1 is
# National Day, which affects fund inflow/outflow at the end of September)
def generate_test_data(
    data: pd.DataFrame,
    start: datetime.datetime = datetime.datetime(2014, 9, 1),
    end: datetime.datetime = datetime.datetime(2014, 10, 15),
) -> pd.DataFrame:
    """Append one placeholder row per day in ``[start, end)`` to ``data``.

    The first column of ``data`` receives the generated dates; every other
    column is filled with NaN. The input frame is not modified.

    Args:
        data: Daily totals table whose first column holds the dates.
        start: First day to generate (inclusive).
        end: Day at which generation stops (exclusive).

    Returns:
        A copy of ``data`` followed by the placeholder rows, re-indexed 0..n-1.
        If ``end`` is not after ``start`` no rows are appended.
    """
    total_balance = data.copy()
    first_day = pd.Timestamp(start)
    n_days = (pd.Timestamp(end) - first_day).days
    if n_days <= 0:
        # Nothing to generate; also avoids concatenating an empty frame.
        return total_balance.reset_index(drop=True)

    date_column, *value_columns = total_balance.columns
    testdata = pd.DataFrame(
        {date_column: pd.date_range(first_day, periods=n_days, freq='D')}
    )
    for column in value_columns:
        testdata[column] = np.nan
    return pd.concat([total_balance, testdata], axis=0).reset_index(drop=True)
# Read the user profile table
def load_user_information(path: str = 'user_profile_table.csv') -> pd.DataFrame:
    """Read the user profile CSV and return it unchanged.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed table exactly as ``pd.read_csv`` produces it.
    """
    user_profiles = pd.read_csv(path)
    return user_profiles
# Load the datasets
# Per-record balance table, annotated with calendar fields parsed from report_date.
balance_data = add_timestamp(
    load_data('Dataset/user_balance_table.csv'),
    time_index='report_date',
)
# Daily totals, extended with placeholder rows for the test period, then
# re-annotated so the appended rows also carry calendar fields.
total_balance = generate_test_data(get_total_balance(balance_data))
total_balance = add_timestamp(total_balance, 'date')
# User profile table.
user_information = load_user_information('Dataset/user_profile_table.csv')
特征提取
一、基于日期的静态特征
1.1 提取 is 特征
# Build the set of public-holiday dates
def get_holiday_set() -> Set[datetime.date]:
    """Return the holiday dates hard-coded for 2013 and 2014.

    Returns:
        A new set of ``datetime.date`` objects, one per holiday day.
    """
    # (first day, number of consecutive days) for each holiday period
    holiday_periods = [
        (datetime.date(2014, 4, 5), 3),    # Qingming Festival
        (datetime.date(2014, 5, 1), 3),    # Labour Day
        (datetime.date(2014, 5, 31), 3),   # Dragon Boat Festival
        (datetime.date(2014, 9, 6), 3),    # Mid-Autumn Festival
        (datetime.date(2014, 10, 1), 7),   # National Day
        (datetime.date(2013, 9, 19), 3),   # Mid-Autumn Festival (2013)
        (datetime.date(2013, 10, 1), 7),   # National Day (2013)
    ]
    holidays = set()
    for first_day, n_days in holiday_periods:
        holidays.update(
            first_day + datetime.timedelta(days=offset) for offset in range(n_days)
        )
    return holidays
set() 函数创建一个无序不重复元素集,可进行关系测试,删除重复数据,还可以计算交集、差集、并集等。
# 提取所有 is特征
def extract_is_feature(data: pd.DataFrame)->pd.DataFrame:
total_balance = data.copy().reset_index(drop=True)
# 是否是Weekend
total_balance['is_weekend'] = 0
total_balance.loc[total_balance['weekday'].isin((5,6)), 'is_weekend'] = 1
# 是否是假期
total_balance['is_holiday'] = 0
total_balance.loc[total_balance['date'].isin(get_holiday_set()), 'is_holiday'] = 1
# 是否是节假日的第一天
last_day_flag = 0
total_balance['is_firstday_of_holiday'] = 0
for index, row in total_balance.iterrows():
if last_day_flag == 0 and row['is_holiday'] == 1:
total_balance.loc[index, 'is_firstday_of_holiday'] = 1
last_day_flag = row['is_holiday']
# 是否是节假日的最后一天
total_balance['is_lastday_of_holiday'] = 0
for index, row in total_balance.iterrows():
if row['is_holiday'] == 1 and total_balance.loc[index+1, 'is_holiday'] == 0:
total_balance.loc[index, 'is_lastday_of_holiday'] = 1
# 是否是节假日后的上班第一天
total_balance['is_firstday_of_work'] = 0
last_day_flag = 0
for index, row in total_balance.iterrows():
if last_day_flag == 1 and row['is_holiday'] == 0:
total_balance.loc[index, 'is_firstday_of_work'] = 1
last_day_flag = row['is_lastday_of_holiday']
# 是否不用上班
total_balance['is_work'] = 1
total_balance.loc[(total_balance['is_holiday'] == 1) | (total_balance['is_weekend'] == 1), 'is_work'] = 0
special_work_day_set = {
datetime.date(2014,5,4), datetime.date(2014,9,28)}
total_balance.loc[total_balance['date'].isin(special_work_day_set), 'is_work'] = 1
# 是否明天要上班
total_balance['is_gonna_work_tomorrow'] = 0
for index, row in total_balance.iterrows():
if index == len(total_balance)-1:
break
if row['is_work'] == 0 and total_balance.loc[index+1, 'is_work'] == 1:
total_balance.loc[index, 'is_gonna_work_tomorrow'] = 1<