import numpy as np
import pandas as pd
# import pyreadstat
import time
import datetime as dt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNetCV
import sklearn.datasets
from pprint import pprint
from sklearn.preprocessing import PolynomialFeatures, StandardScaler,MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import warnings
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from statsmodels.tools.sm_exceptions import HessianInversionWarning, warnings
warnings.filterwarnings(action='ignore', category=HessianInversionWarning)#
path = 'G:/data/'#数据文件路径
# path = 'E:/data/'
file_name = '2019gaoya.csv'#数据集文件名
# file_name = 'eledata.xlsx'
output_file_name = 'out_put_result' #输出文件名
#确定now是今天还是19年底
# now = pd.to_datetime('today')
now = dt.datetime(2019,12,31)
file_type = 0 #0是高压数据,1是低压数据
'''设置rfm取值参数观察期(3个月,6个月,12个月)'''
time_window = 12
#观察期的长短,用户近3/6/12个月内的缴费次数和缴费总额确定F与M分值
'''设置R参数(最近一次缴费时间距离今天的天数)'''
param_r = [15,30,60,90]
#exp:param_r = [15,30,60,90]
#小于15天得分5,15-30天得分4,30-60得分3,60-90得分2,90以上得分1
'''
未加rfm变量模型结果
总体训练集mae: 5.57585903604288
总体测试集mae: 6.811149742550856
'''
'''#=================================================
path = 'D:/tool/model/input/'#数据文件路径
file_name = 'high_data_all.csv'#数据集文件名
path_out = 'D:/tool/model/output/'#输出文件路径
output_file_name = 'out_put_result' #输出文件名
#确定now是今天还是19年底
# now = pd.to_datetime('today')
now = dt.datetime(2019,12,31)
time_window = 3
#=================================================
'''
#1.数据处理
def preprocessing(df,idx,date,date_type,amt,tw):
'''
idx:索引名
date:日期名
date_type:date字段的类型('date' or 'gap')
amt:金额名
tw:时间窗口
====
return:加工好的df
'''
starttime = dt.datetime.now()
df1 = df[[idx,date,amt]]
if date_type=='date':
df1[date] = pd.to_datetime(df1[date])
df1['time_span'] = df1[date].map(lambda x:(now-x).days)
elif date=='time_span':
pass
else:
df1['time_span'] = df1[date]
df_mid = df1[[idx,'time_span']].groupby(idx).agg([('last_pay_span',lambda x:x.min())])['time_span'].reset_index()
df_mid = df_mid.merge(df1[df1['time_span']<=tw*30][[idx,amt]].groupby(idx).agg(
[('last_'+str(time_window)+'m_fee_cnt',lambda x:x.count()),
('last_'+str(time_window)+'m_fee_sum',lambda x:x.sum())])[amt].reset_index(),how='left',on=idx)
# df_mid = df_mid.merge(df1[df1['time_span']<=tw*30][[idx,amt]].groupby(idx).sum().reset_index(),how='left',on=idx,suffixes=('_x', '_y'))
df_mid = df_mid.apply(lambda x:x.fillna(0))
# df_mid.rename(columns={'time_span':'last_pay_span','pay_fee_x':'last_'+str(time_window)+'m_fee_cnt','pay_fee_y':'last_'+str(time_window)+'m_fee_sum'},inplace=True)
# df2 = df1[df1['time_span']<=tw*30][[idx,amt]].groupby(idx).agg(['sum','count']).reset_index()
endtime = dt.datetime.now()
print('用时',(endtime - starttime).seconds/60,'分钟')
return df_mid
#2.计算Recency分值
def R_score(param_r,x):
#param_r: R分值划分区间
#x: 需要被计算的具体数值
if x<=param_r[0]:
return 5
elif x>param_r[0] and x<=param_r[1]:
return 4
elif x>param_r[1] and x<=param_r[2]:
return 3
elif x>param_r[2] and x<=param_r[3]:
return 2
elif x>param_r[3]:
return 1
#3.计算f,m,得分及最终分类
def RFM_score(series,q,score_labels):
a = series.quantile(q=q)
if len(set(a))>len(score_labels):
return pd.qcut(series,q=len(score_labels),labels=score_labels)
else:
return pd.cut(series,bins=len(score_labels),labels=score_labels)
#4.更换列名称
def Change_col_name(df,file_type):
'''重新给列命名'''
df.rename(columns={
'单位':'unit','交费ID':'payfee_id','交费方式':'pay_chnl','用户编号':'acct_id','编号':'tq_id','名称':'tq_name',
'收费金额':'pay_fee','收费时间':'pay_date','等级':'vo_lvl','用户类别':'user_cat'},
inplace=True)
'''取出有用字段'''
df = df[['acct_id', 'tq_id', 'tq_name', 'voltage_lvl', 'user_cat' ,'pay_chnl',
'pay_fee', 'pay_date']]
'''对id进行取整'''
df['acct_id'] = df['acct_id'].astype('int64')
df['tq_id'] = df['tq_id'].astype('str')
if df['pay_date'].dtype=='object':
df['pay_date'] = pd.to_datetime(df['pay_date'])
return df
#5.加工数据集
def Deal_data_fin(df,data_type):
'''一、加工自变量'''
if data_type == 'train':
print('处理训练集')
elif data_type == 'test':
print('处理测试集')
else:
print('处理打分集')
starttime1 = dt.datetime.now()
df1 = df.copy()
df1['time_span'] = df1['pay_date'].map(lambda x:(now-x).days)
df1 = df1.sort_values(by=['acct_id','time_span']).reset_index(drop=True)
# df1.drop(columns='index',inplace=True)
df2 = df1[['acct_id','time_span']].drop_duplicates().groupby('acct_id').count().reset_index()
df1 = df1[df1['acct_id'].isin(df2[df2['time_span']>5]['acct_id'])]
#
# if df1[['acct_id','tq_id','voltage_lvl','user_cat']].drop_duplicates().shape[0]==len(df_mid['acct_id'].unique()):
# df_mid = df1[['acct_id','tq_id','voltage_lvl','user_cat']].drop_duplicates().reset_index()
# df_mid.drop(columns='index',inplace=True)
# else:
df1['rn'] = df1[['acct_id','tq_id','tq_name','voltage_lvl','user_cat']].groupby('acct_id')['acct_id'].rank(ascending=True,method='first')
df_mid = df1[df1['rn']==1][['acct_id','tq_id','tq_name','voltage_lvl','user_cat']].reset_index()
df_mid.drop(columns='index',inplace=True)
#缴费渠道个数
df_mid = df_mid.merge(df1[['acct_id','pay_chnl']].drop_duplicates().groupby('acct_id').count().reset_index(),on='acct_id',how='left')
#按照日汇总金额
df1 = df1[['acct_id','time_span','pay_fee']].groupby(['acct_id','time_span']).agg({
'pay_fee':lambda x:x.sum()}).reset_index()
df1['rn'] = df1.groupby('acct_id')['acct_id'].rank(ascending=True,method='first')
#加工缴费间隔
df_time = df1[['acct_id','time_span','rn']]
df_time['rn'] = df_time['rn']+1
df_time = df1.merge(df_time,on=['acct_id','rn'])
df_time['rn'] = df_time['rn']-1
df_time['gap'] = df_time['time_span_x']-df_time['time_span_y']
df_time['gap_rate'] = df_time['pay_fee']/df_time['gap']
if data_type=='train':
df2 = df_time[df_time['rn']==2][['acct_id','gap']]
df2['y'] = df2['gap']
df_mid = df_mid.merge(df2[['acct_id','y']],on='acct_id',how='left')
df_time = df_time[df_time['rn']>2]
df1 = df1[df1['rn']>2]
elif data_type=='test':
df2 = df_time[df_time['rn']==1][['acct_id','gap']]
df2['y'] = df2['gap']
df_mid = df_mid.merge(df2[['acct_id','y']],on='acct_id',how='left')
df_time = df_time[df_time['rn']>1]
df1 = df1[df1['rn']>1]
else:
pass
#统一重置rn
df1['rn'] = df1.groupby('acct_id')['acct_id'].rank(ascending=True,method='first')
df_time['rn'] = df_time.groupby('acct_id')['acct_id'].rank(ascending=True,method='first')
'''近3次缴费时间,缴费金额'''
df2 = df1[df1.rn==1][['acct_id','time_span','pay_fee']]
df2.rename(columns={
'time_span':'last_pay_date','pay_fee':'last_pay_fee'},inplace=True)
df3 = df1[df1.rn==2][['acct_id','time_span','pay_fee']]
df2 = df2.merge(df3,on='acct_id',how='left')
df2.rename(columns={
'time_span':'last_pay2_date','pay_fee':'last_pay2_fee'},inplace=True)
df3 = df1[df1.rn==3][['acct_id','time_span','pay_fee']]
df2 = df2.merge(df3,on='acct_id',how='left')
df2.rename(columns={
'time_span':'last_pay3_date','pay_fee':'last_pay3_fee'},inplace=True)
df3 = df1[df1.rn==4][['acct_id','pay_fee']]
df2 = df2.merge(df3,on='acct_id',how='left')
df2.rename(columns={
'pay_fee':'last_pay4_fee'},inplace=True)
df_mid = df_mid.merge(df2,on='acct_id',how='left')
'''最近三次缴费间隔'''
df2 = df_time[df_time['rn']==1][['acct_id','gap']]
df2.rename(columns={
'gap':'last_pay_gap'},inplace=True)
df3 = df_time[df_time['rn']==2][['acct_id','gap']]
df2 = df2.merge(df3,on='acct_id',how='left')
df2.rename(columns={
'gap':'last_pay2_gap'},inplace=True)
df3 = df_time[df_time['rn']==3][['acct_id','gap']]
df2 = df2.merge(df3,on='acct_id',how='left')
df2.rename(columns={
'gap':'last_pay3_gap'},inplace=True)
df_mid = df_mid.merge(df2,on='acct_id',how='left')
'''最近三次缴费间隔率'''
df2 = df_time[df_time['rn']==1][['acct_id','gap_rate']]
df2.rename(columns={
'gap_rate':'last_pay_gap_rate'},inplace=True)
df3 = df_time[df_time['rn']==2][['acct_id','gap_rate']]
df2 = df2.merge(df3,on='acct_id',how='left')
df2.rename(columns={
'gap_rate':'last_pay2_gap_rate'},inplace=True)
df3 = df_time[df_time['rn']==3][['acct_id','gap_rate']]
df2 = df2.merge(df3,on='acct_id',how='left')
df2.rename(columns={
'gap_rate':'last_pay3_gap_rate'},inplace=True)
df_mid = df_mid.merge(df2,on='acct_id',how='left')
'''近n次缴费间隔的最大,最小,极差,均值,众数,中位数,标准差,离散系数,偏度,峰度(偏度加工时会存在nan,故舍弃)'''
df2 = df_time[df_time['rn']<=3][['acct_id','gap']]
df3 = df2.groupby('acct_id').agg(
[('last_pay3_gap_max',lambda x:x.max()),
('last_pay3_gap_min',lambda x:x.min()),
('last_pay3_gap_range',lambda x:x.max()-x.min()),
('last_pay3_gap_mean',lambda x:x.mean()),
('last_pay3_gap_mode',lambda x:x.mode()[0]),
('last_pay3_gap_median',lambda x:x.median()),
('last_pay3_gap_std',lambda x:x.std()),
('last_pay3_gap_dc',lambda x:x.std()/x.mean()),
('last_pay3_gap_skew',lambda x:x.skew())])['gap'].reset_index()
df_mid = df_mid.merge(df3,on='acct_id',how='left')
'''近n次缴费间隔率的最大,最小,极差,均值,众数,中位数,标准差,离散系数,偏度,峰度(偏度加工时会存在nan,故舍弃)'''
df2 = df_time[df_time['rn']<=3][['acct_id','gap_rate']]
df3 = df2.groupby('acct_id').agg(
[('last_pay3_gap_rate_max',lambda x:x.max()),
('last_pay3_gap_rate_min',lambda x:x.min()),
('last_pay3_gap_rate_range',lambda x:x.max()-x.min()),
('last_pay3_gap_rate_mean',lambda x:x.mean()),
('last_pay3_gap_rate_mode',lambda x:x.mode()[0]),
('last_pay3_gap_rate_median',lambda x:x.median()),
('last_pay3_gap_rate_std',lambda x:x.std()),
('last_pay3_gap_rate_dc',lambda x:x.std()/x.mean()),
('last_pay3_gap_rate_skew',lambda x:x.skew())])['gap_rate'].reset_index()
df_mid = df_mid.merge(df3,on='acct_id',how='left')
'''加工近12个月内缴费间隔的最大,最小,极差,均值,众数,中位数,标准差,离散系数,偏度,峰度'''
df2 = df_time[df_time['time_span_y']<=365][['acct_id','gap']]
df3 = df2.groupby('acct_id').agg(
[('last_12m_gap_max',lambda x:x.max()),
('last_12m_gap_min',lambda x:x.min()),
('last_12m_gap_range',lambda x:x.max()-x.min()),
('last_12m_gap_mean',lambda x:x.mean()),
('last_12m_gap_mode',lambda x:x.mode()[0]),
('last_12m_gap_median',lambda x:x.median()),
('last_12m_gap_std',lambda x:x.std()),
('last_12m_gap_dc',lambda x:x.std()/x.mean()),
('last_12m_gap_skew',lambda x:x.skew())])['gap'].reset_index()
df_mid = df_mid.merge(df3,on='acct_id',how='left')
'''加工近12个月内缴费间隔率的最大,最小,极差,均值,众数,中位数,标准差,离散系数,偏度,峰度'''
df2 = df_time[df_time['time_span_y']<=365][['acct_id','gap_rate']]
df3 = df2.groupby('acct_id').agg(
[('last_12m_gap_rate_max',lambda x:x.max()),
('last_12m_gap_rate_min',lambda x:x.min()),
('last_12m_gap_rate_range',lambda x:x.max()-x.min()),
('last_12m_gap_rate_mean',lambda x:x.mean()),
('last_12m_gap_rate_mode',lambda x:x.mode()[0]),
('last_12m_gap_rate_median',lambda x:x.median()),
('last_12m_gap_rate_std',lambda x:x.std()),
('last_12m_gap_rate_dc',lambda x:x.std()/x.mean()),
('last_12m_gap_rate_skew',lambda x:x.skew())])['gap_rate'].reset_index()
df_mid = df_mid.merge(df3,on='acct_id',how='left')
'''加工最近2-4次缴费金额的总额,最大,最小,极差,均值,众数,中位数,标准差,离散系数,偏度,峰度'''
df2 = df1[df1['rn']<=2][['acct_id','pay_fee']]
df3 = df2.groupby('acct_id').agg(
[('last_pay2_fee_sum',lambda x:x.sum()),
('last_pay2_fee_max',lambda x:x.max()),
('last_pay2_fee_min',lambda x:x.min()),
('last_pay2_fee_range',lambda x:x.max()-x.min()),
('last_pay2_fee_mean',lambda x:x.mean()),
('last_pay2_fee_mode',lambda x:x.mode()[0]),
('last_pay2_fee_median',lambda x:x.median()),
('last_pay2_fee_std',lambda x:x.std()),
('last_pay2_fee_dc',lambda x:x.std()/x.mean())])['pay_fee'].reset_index()
df_mid = df_mid.merge(df3,on='acct_id',how='left')
df2 = df1[df1['rn']<=3][['acct_id','pay_fee']]
df3 = df2.groupby('acct_id').agg(
[('last_pay3_fee_sum',lambda x:x.sum()),
('last_pay3_fee_max',lambda x:x.max()),
('last_pay3_fee_min',lambda x:x.min()),
('last_pay3_fee_range',lambda x:x.max()-x.min()),
('last_pay3_fee_mean',lambda x:x.mean()),
('last_pay3_fee_mode',lambda x:x.mode()[0]),
('last_pay3_fee_median',lambda x:x.median()),
('last_pay3_fee_std',lambda x:x.std()),
('last_pay3_fee_dc',lambda x:x.std()/x.mean()),
('last_pay3_fee_skew',lambda x:x.skew())])['pay_fee'].reset_index()
df_mid = df_mid.merge(df3,on='acct_id',how='left')
df2 = df1[df1['rn']<=4][['acct_id','pay_fee']]
df3 = df2.groupby('acct_id').agg(
[('last_pay4_fee_sum',lambda x:x.sum()),
('last_pay4_fee_max',lambda x:x.max()
python回归预测,从变量加工到建模预测(分子模型)RF,LGB,SVR,DNN实现
最新推荐文章于 2024-09-09 18:38:17 发布
该博客详细介绍了使用机器学习算法(随机森林RF,轻量级梯度提升LGB,支持向量机SVR以及深度神经网络DNN)进行回归预测的过程。通过一系列的图表展示了每个模型的性能和效果。
摘要由CSDN通过智能技术生成