Python regression prediction, from feature engineering to modeling and prediction (分子模型): RF, LGB, SVR, and DNN implementations

This post walks through regression prediction with machine-learning algorithms (random forest RF, LightGBM LGB, support vector regression SVR, and a deep neural network DNN), from variable engineering through model building and prediction, and illustrates each model's performance with a series of charts.
import time
import datetime as dt
import warnings
from pprint import pprint

import numpy as np
import pandas as pd
# import pyreadstat
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats

import sklearn.datasets
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import DBSCAN, AgglomerativeClustering, Birch
from sklearn.metrics import mean_squared_error, mean_absolute_error
import lightgbm as lgb

from statsmodels.tools.sm_exceptions import HessianInversionWarning
warnings.filterwarnings(action='ignore', category=HessianInversionWarning)


path = 'G:/data/'  # data file path
# path = 'E:/data/'
file_name = '2019gaoya.csv'  # dataset file name
# file_name = 'eledata.xlsx'
output_file_name = 'out_put_result'  # output file name
# decide whether "now" means today or the end of 2019
# now = pd.to_datetime('today')
now = dt.datetime(2019, 12, 31)
file_type = 0  # 0 = high-voltage data, 1 = low-voltage data
'''RFM observation window (3, 6 or 12 months)'''
time_window = 12
# length of the observation window: the payment count and total payment amount
# within the last 3/6/12 months determine the F and M scores


'''R parameter (days between the most recent payment and today)'''
param_r = [15, 30, 60, 90]
# e.g. param_r = [15,30,60,90]
# <=15 days scores 5, 15-30 days scores 4, 30-60 scores 3, 60-90 scores 2, above 90 scores 1
'''
Results of the model without the RFM variables:
overall training-set MAE: 5.57585903604288
overall test-set MAE: 6.811149742550856
'''

'''#=================================================
path = 'D:/tool/model/input/'  # data file path
file_name = 'high_data_all.csv'  # dataset file name
path_out = 'D:/tool/model/output/'  # output file path
output_file_name = 'out_put_result'  # output file name
# decide whether "now" means today or the end of 2019
# now = pd.to_datetime('today')
now = dt.datetime(2019,12,31)
time_window = 3
#=================================================
'''

# 1. Data preprocessing
def preprocessing(df, idx, date, date_type, amt, tw):
    '''
    idx: name of the id column
    date: name of the date column
    date_type: type of the date field ('date' or 'gap')
    amt: name of the amount column
    tw: time window in months
    ====
    return: the engineered DataFrame
    '''
    starttime = dt.datetime.now()

    df1 = df[[idx, date, amt]].copy()
    if date_type == 'date':
        # real dates: convert to days elapsed up to `now`
        df1[date] = pd.to_datetime(df1[date])
        df1['time_span'] = df1[date].map(lambda x: (now - x).days)
    elif date == 'time_span':
        # the day gap already sits in a column with the expected name
        pass
    else:
        # gap values stored under another name: copy them over
        df1['time_span'] = df1[date]
    # R: days since the most recent payment
    df_mid = df1[[idx, 'time_span']].groupby(idx).agg([('last_pay_span', lambda x: x.min())])['time_span'].reset_index()
    # F and M: payment count and total amount within the tw-month window
    df_mid = df_mid.merge(df1[df1['time_span'] <= tw * 30][[idx, amt]].groupby(idx).agg(
        [('last_' + str(tw) + 'm_fee_cnt', lambda x: x.count()),
         ('last_' + str(tw) + 'm_fee_sum', lambda x: x.sum())])[amt].reset_index(), how='left', on=idx)
#     df_mid = df_mid.merge(df1[df1['time_span']<=tw*30][[idx,amt]].groupby(idx).sum().reset_index(),how='left',on=idx,suffixes=('_x', '_y'))
    df_mid = df_mid.apply(lambda x: x.fillna(0))
#     df_mid.rename(columns={'time_span':'last_pay_span','pay_fee_x':'last_'+str(time_window)+'m_fee_cnt','pay_fee_y':'last_'+str(time_window)+'m_fee_sum'},inplace=True)
    # df2 = df1[df1['time_span']<=tw*30][[idx,amt]].groupby(idx).agg(['sum','count']).reset_index()
    endtime = dt.datetime.now()
    print('elapsed', (endtime - starttime).seconds / 60, 'minutes')
    return df_mid
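
# A minimal illustration of preprocessing() on a tiny made-up payment table.
# The column names 'acct_id', 'pay_date', 'pay_fee' match those produced later
# by Change_col_name (section 4); the values themselves are hypothetical.
_demo_pay = pd.DataFrame({'acct_id': [1, 1, 2],
                          'pay_date': ['2019-10-01', '2019-12-01', '2019-06-15'],
                          'pay_fee': [100.0, 80.0, 230.0]})
print(preprocessing(_demo_pay, idx='acct_id', date='pay_date',
                    date_type='date', amt='pay_fee', tw=time_window))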




# 2. Compute the Recency score
def R_score(param_r, x):
    # param_r: boundaries of the R score bins
    # x: the value to score (days since the most recent payment)
    if x <= param_r[0]:
        return 5
    elif param_r[0] < x <= param_r[1]:
        return 4
    elif param_r[1] < x <= param_r[2]:
        return 3
    elif param_r[2] < x <= param_r[3]:
        return 2
    elif x > param_r[3]:
        return 1
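
# Worked examples with the thresholds defined above (param_r = [15, 30, 60, 90]):
print(R_score(param_r, 10))   # 5: paid within the last 15 days
print(R_score(param_r, 45))   # 3: between 30 and 60 days
print(R_score(param_r, 120))  # 1: more than 90 days ago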

# 3. Compute the F and M scores and the final segmentation
def RFM_score(series, q, score_labels):
    # q: quantiles used to check how spread out the values are
    # score_labels: the scores assigned to the bins
    a = series.quantile(q=q)
    if len(set(a)) > len(score_labels):
        # enough distinct quantile edges: equal-frequency binning
        return pd.qcut(series, q=len(score_labels), labels=score_labels)
    else:
        # too many ties: fall back to equal-width binning
        return pd.cut(series, bins=len(score_labels), labels=score_labels)
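
# A minimal illustration of RFM_score() on a made-up series (the q and label
# values here are assumptions, not taken from the original post). With these
# inputs the quantile check routes the series to equal-width pd.cut.
_demo_fee_cnt = pd.Series([1, 1, 2, 3, 5, 8, 13, 21, 34, 55])
print(RFM_score(_demo_fee_cnt, q=[0.2, 0.4, 0.6, 0.8], score_labels=[1, 2, 3, 4, 5]))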
    

# 4. Rename the columns
def Change_col_name(df, file_type):
    '''Rename the raw (Chinese) columns to English names'''
    df.rename(columns={'单位': 'unit', '交费ID': 'payfee_id', '交费方式': 'pay_chnl',
                       '用户编号': 'acct_id', '编号': 'tq_id', '名称': 'tq_name',
                       '收费金额': 'pay_fee', '收费时间': 'pay_date',
                       '等级': 'voltage_lvl', '用户类别': 'user_cat'},
              inplace=True)

    '''Keep only the useful fields'''
    df = df[['acct_id', 'tq_id', 'tq_name', 'voltage_lvl', 'user_cat', 'pay_chnl',
             'pay_fee', 'pay_date']]
    '''Cast the ids and parse the payment date'''
    df['acct_id'] = df['acct_id'].astype('int64')
    df['tq_id'] = df['tq_id'].astype('str')
    if df['pay_date'].dtype == 'object':
        df['pay_date'] = pd.to_datetime(df['pay_date'])
    return df
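
# A hedged sketch of how the helpers above chain together. The 'gbk' encoding,
# the q/label choices, and the 'last_12m_*' column names (which assume
# time_window = 12) are assumptions, not the author's original code.
def load_and_engineer():
    df_raw = pd.read_csv(path + file_name, encoding='gbk')
    df_clean = Change_col_name(df_raw, file_type)
    df_rfm = preprocessing(df_clean, idx='acct_id', date='pay_date',
                           date_type='date', amt='pay_fee', tw=time_window)
    # R score from recency; F/M scores from count and amount in the window
    df_rfm['r_score'] = df_rfm['last_pay_span'].map(lambda x: R_score(param_r, x))
    df_rfm['f_score'] = RFM_score(df_rfm['last_12m_fee_cnt'], [0.2, 0.4, 0.6, 0.8], [1, 2, 3, 4, 5])
    df_rfm['m_score'] = RFM_score(df_rfm['last_12m_fee_sum'], [0.2, 0.4, 0.6, 0.8], [1, 2, 3, 4, 5])
    return df_clean, df_rfm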


# 5. Build the modeling dataset
def Deal_data_fin(df, data_type):
    '''Part 1: engineer the independent variables'''
    if data_type == 'train':
        print('processing the training set')
    elif data_type == 'test':
        print('processing the test set')
    else:
        print('processing the scoring set')
    starttime1 = dt.datetime.now()



    df1 = df.copy()
    df1['time_span'] = df1['pay_date'].map(lambda x:(now-x).days)
    df1 = df1.sort_values(by=['acct_id','time_span']).reset_index(drop=True)
#     df1.drop(columns='index',inplace=True)
    df2 = df1[['acct_id','time_span']].drop_duplicates().groupby('acct_id').count().reset_index()
    df1 = df1[df1['acct_id'].isin(df2[df2['time_span']>5]['acct_id'])]
    #
    # if df1[['acct_id','tq_id','voltage_lvl','user_cat']].drop_duplicates().shape[0]==len(df_mid['acct_id'].unique()):
    #     df_mid = df1[['acct_id','tq_id','voltage_lvl','user_cat']].drop_duplicates().reset_index()
    #     df_mid.drop(columns='index',inplace=True)
    # else:
    df1['rn'] = df1[['acct_id','tq_id','tq_name','voltage_lvl','user_cat']].groupby('acct_id')['acct_id'].rank(ascending=True,method='first')
    df_mid = df1[df1['rn']==1][['acct_id','tq_id','tq_name','voltage_lvl','user_cat']].reset_index()
    df_mid.drop(columns='index',inplace=True)

    # number of distinct payment channels per account
    df_mid = df_mid.merge(df1[['acct_id','pay_chnl']].drop_duplicates().groupby('acct_id').count().reset_index(),on='acct_id',how='left')
    # aggregate the payment amount by day
    df1 = df1[['acct_id','time_span','pay_fee']].groupby(['acct_id','time_span']).agg(
        {'pay_fee': lambda x: x.sum()}).reset_index()
    df1['rn'] = df1.groupby('acct_id')['acct_id'].rank(ascending=True,method='first')
    # engineer the gap (days between consecutive payments) and the gap rate
    df_time = df1[['acct_id','time_span','rn']]
    df_time['rn'] = df_time['rn']+1
    df_time = df1.merge(df_time,on=['acct_id','rn'])
    df_time['rn'] = df_time['rn']-1
    df_time['gap'] = df_time['time_span_x']-df_time['time_span_y']
    df_time['gap_rate'] = df_time['pay_fee']/df_time['gap']

    if data_type=='train':
        df2 = df_time[df_time['rn']==2][['acct_id','gap']]
        df2['y'] = df2['gap']
        df_mid = df_mid.merge(df2[['acct_id','y']],on='acct_id',how='left')
        df_time = df_time[df_time['rn']>2]
        df1 = df1[df1['rn']>2]
    elif data_type=='test':
        df2 = df_time[df_time['rn']==1][['acct_id','gap']]
        df2['y'] = df2['gap']
        df_mid = df_mid.merge(df2[['acct_id','y']],on='acct_id',how='left')
        df_time = df_time[df_time['rn']>1]
        df1 = df1[df1['rn']>1]
    else:
        pass
    # re-rank rn consistently after the filtering above
    df1['rn'] = df1.groupby('acct_id')['acct_id'].rank(ascending=True,method='first')
    df_time['rn'] = df_time.groupby('acct_id')['acct_id'].rank(ascending=True,method='first')

    '''Dates and amounts of the last three payments'''
    df2 = df1[df1.rn==1][['acct_id','time_span','pay_fee']]
    df2.rename(columns={'time_span':'last_pay_date','pay_fee':'last_pay_fee'},inplace=True)

    df3 = df1[df1.rn==2][['acct_id','time_span','pay_fee']]
    df2 = df2.merge(df3,on='acct_id',how='left')
    df2.rename(columns={'time_span':'last_pay2_date','pay_fee':'last_pay2_fee'},inplace=True)

    df3 = df1[df1.rn==3][['acct_id','time_span','pay_fee']]
    df2 = df2.merge(df3,on='acct_id',how='left')
    df2.rename(columns={'time_span':'last_pay3_date','pay_fee':'last_pay3_fee'},inplace=True)

    df3 = df1[df1.rn==4][['acct_id','pay_fee']]
    df2 = df2.merge(df3,on='acct_id',how='left')
    df2.rename(columns={'pay_fee':'last_pay4_fee'},inplace=True)
    df_mid = df_mid.merge(df2,on='acct_id',how='left')

    '''Gaps between the last three payments'''
    df2 = df_time[df_time['rn']==1][['acct_id','gap']]
    df2.rename(columns={'gap':'last_pay_gap'},inplace=True)

    df3 = df_time[df_time['rn']==2][['acct_id','gap']]
    df2 = df2.merge(df3,on='acct_id',how='left')
    df2.rename(columns={'gap':'last_pay2_gap'},inplace=True)

    df3 = df_time[df_time['rn']==3][['acct_id','gap']]
    df2 = df2.merge(df3,on='acct_id',how='left')
    df2.rename(columns={'gap':'last_pay3_gap'},inplace=True)
    df_mid = df_mid.merge(df2,on='acct_id',how='left')

    '''Gap rates of the last three payments'''
    df2 = df_time[df_time['rn']==1][['acct_id','gap_rate']]
    df2.rename(columns={'gap_rate':'last_pay_gap_rate'},inplace=True)

    df3 = df_time[df_time['rn']==2][['acct_id','gap_rate']]
    df2 = df2.merge(df3,on='acct_id',how='left')
    df2.rename(columns={'gap_rate':'last_pay2_gap_rate'},inplace=True)

    df3 = df_time[df_time['rn']==3][['acct_id','gap_rate']]
    df2 = df2.merge(df3,on='acct_id',how='left')
    df2.rename(columns={'gap_rate':'last_pay3_gap_rate'},inplace=True)
    df_mid = df_mid.merge(df2,on='acct_id',how='left')

    '''Max, min, range, mean, mode, median, std, coefficient of variation and skewness of the last n payment gaps (kurtosis is dropped because it can produce NaN)'''
    df2 = df_time[df_time['rn']<=3][['acct_id','gap']]
    df3 = df2.groupby('acct_id').agg(
        [('last_pay3_gap_max',lambda x:x.max()),    
         ('last_pay3_gap_min',lambda x:x.min()),
         ('last_pay3_gap_range',lambda x:x.max()-x.min()),
         ('last_pay3_gap_mean',lambda x:x.mean()),
         ('last_pay3_gap_mode',lambda x:x.mode()[0]),
         ('last_pay3_gap_median',lambda x:x.median()),
         ('last_pay3_gap_std',lambda x:x.std()),
         ('last_pay3_gap_dc',lambda x:x.std()/x.mean()),
         ('last_pay3_gap_skew',lambda x:x.skew())])['gap'].reset_index()
    df_mid = df_mid.merge(df3,on='acct_id',how='left')

    '''Max, min, range, mean, mode, median, std, coefficient of variation and skewness of the last n payment gap rates (kurtosis is dropped because it can produce NaN)'''
    df2 = df_time[df_time['rn']<=3][['acct_id','gap_rate']]
    df3 = df2.groupby('acct_id').agg(
        [('last_pay3_gap_rate_max',lambda x:x.max()),    
         ('last_pay3_gap_rate_min',lambda x:x.min()),
         ('last_pay3_gap_rate_range',lambda x:x.max()-x.min()),
         ('last_pay3_gap_rate_mean',lambda x:x.mean()),
         ('last_pay3_gap_rate_mode',lambda x:x.mode()[0]),
         ('last_pay3_gap_rate_median',lambda x:x.median()),
         ('last_pay3_gap_rate_std',lambda x:x.std()),
         ('last_pay3_gap_rate_dc',lambda x:x.std()/x.mean()),
         ('last_pay3_gap_rate_skew',lambda x:x.skew())])['gap_rate'].reset_index()
    df_mid = df_mid.merge(df3,on='acct_id',how='left')

    '''Max, min, range, mean, mode, median, std, coefficient of variation and skewness of the payment gaps within the last 12 months'''
    df2 = df_time[df_time['time_span_y']<=365][['acct_id','gap']]
    df3 = df2.groupby('acct_id').agg(
        [('last_12m_gap_max',lambda x:x.max()),    
         ('last_12m_gap_min',lambda x:x.min()),
         ('last_12m_gap_range',lambda x:x.max()-x.min()),
         ('last_12m_gap_mean',lambda x:x.mean()),
         ('last_12m_gap_mode',lambda x:x.mode()[0]),
         ('last_12m_gap_median',lambda x:x.median()),
         ('last_12m_gap_std',lambda x:x.std()),
         ('last_12m_gap_dc',lambda x:x.std()/x.mean()),
         ('last_12m_gap_skew',lambda x:x.skew())])['gap'].reset_index()
    df_mid = df_mid.merge(df3,on='acct_id',how='left')

    '''Max, min, range, mean, mode, median, std, coefficient of variation and skewness of the payment gap rates within the last 12 months'''
    df2 = df_time[df_time['time_span_y']<=365][['acct_id','gap_rate']]
    df3 = df2.groupby('acct_id').agg(
        [('last_12m_gap_rate_max',lambda x:x.max()),    
         ('last_12m_gap_rate_min',lambda x:x.min()),
         ('last_12m_gap_rate_range',lambda x:x.max()-x.min()),
         ('last_12m_gap_rate_mean',lambda x:x.mean()),
         ('last_12m_gap_rate_mode',lambda x:x.mode()[0]),
         ('last_12m_gap_rate_median',lambda x:x.median()),
         ('last_12m_gap_rate_std',lambda x:x.std()),
         ('last_12m_gap_rate_dc',lambda x:x.std()/x.mean()),
         ('last_12m_gap_rate_skew',lambda x:x.skew())])['gap_rate'].reset_index()
    df_mid = df_mid.merge(df3,on='acct_id',how='left')


    '''Sum, max, min, range, mean, mode, median, std and coefficient of variation (plus skewness where computed) of the last 2, 3 and 4 payment amounts'''
    df2 = df1[df1['rn']<=2][['acct_id','pay_fee']]
    df3 = df2.groupby('acct_id').agg(
        [('last_pay2_fee_sum',lambda x:x.sum()),
         ('last_pay2_fee_max',lambda x:x.max()),    
         ('last_pay2_fee_min',lambda x:x.min()),
         ('last_pay2_fee_range',lambda x:x.max()-x.min()),
         ('last_pay2_fee_mean',lambda x:x.mean()),
         ('last_pay2_fee_mode',lambda x:x.mode()[0]),
         ('last_pay2_fee_median',lambda x:x.median()),
         ('last_pay2_fee_std',lambda x:x.std()),
         ('last_pay2_fee_dc',lambda x:x.std()/x.mean())])['pay_fee'].reset_index()
    df_mid = df_mid.merge(df3,on='acct_id',how='left')

    df2 = df1[df1['rn']<=3][['acct_id','pay_fee']]
    df3 = df2.groupby('acct_id').agg(
        [('last_pay3_fee_sum',lambda x:x.sum()),
         ('last_pay3_fee_max',lambda x:x.max()),    
         ('last_pay3_fee_min',lambda x:x.min()),
         ('last_pay3_fee_range',lambda x:x.max()-x.min()),
         ('last_pay3_fee_mean',lambda x:x.mean()),
         ('last_pay3_fee_mode',lambda x:x.mode()[0]),
         ('last_pay3_fee_median',lambda x:x.median()),
         ('last_pay3_fee_std',lambda x:x.std()),
         ('last_pay3_fee_dc',lambda x:x.std()/x.mean()),
         ('last_pay3_fee_skew',lambda x:x.skew())])['pay_fee'].reset_index()
    df_mid = df_mid.merge(df3,on='acct_id',how='left')

    df2 = df1[df1['rn']<=4][['acct_id','pay_fee']]
    df3 = df2.groupby('acct_id').agg(
        [('last_pay4_fee_sum',lambda x:x.sum()),
         ('last_pay4_fee_max',lambda x:x.max()),
         ('last_pay4_fee_min',lambda x:x.min()),
         ('last_pay4_fee_range',lambda x:x.max()-x.min()),
         ('last_pay4_fee_mean',lambda x:x.mean()),
         ('last_pay4_fee_mode',lambda x:x.mode()[0]),
         ('last_pay4_fee_median',lambda x:x.median()),
         ('last_pay4_fee_std',lambda x:x.std()),
         ('last_pay4_fee_dc',lambda x:x.std()/x.mean()),
         ('last_pay4_fee_skew',lambda x:x.skew())])['pay_fee'].reset_index()
    df_mid = df_mid.merge(df3,on='acct_id',how='left')
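
# --- Modeling sketch (not the author's original code) -----------------------
# A minimal, hedged sketch of the modeling stage named in the title (RF, LGB,
# SVR, DNN). Assumptions: `df_model` is the feature table built by
# Deal_data_fin(...) with the target in column 'y'; the hyper-parameters are
# illustrative only; sklearn's MLPRegressor stands in for the DNN.
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

def run_models(df_model):
    # drop id/text columns; categorical encoding is out of scope for this sketch
    drop_cols = ['acct_id', 'tq_id', 'tq_name', 'voltage_lvl', 'user_cat', 'y']
    data = df_model.dropna(subset=['y'])
    X = data.drop(columns=[c for c in drop_cols if c in data.columns]).fillna(0)
    y = data['y']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # tree models take the raw features; SVR and the DNN use standardized ones
    scaler = StandardScaler().fit(X_train)
    X_train_s, X_test_s = scaler.transform(X_train), scaler.transform(X_test)

    models = {
        'RF':  RandomForestRegressor(n_estimators=200, random_state=42),
        'LGB': lgb.LGBMRegressor(n_estimators=500, learning_rate=0.05),
        'SVR': SVR(C=10, epsilon=0.1),
        'DNN': MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42),
    }
    for name, model in models.items():
        scaled = name in ('SVR', 'DNN')
        model.fit(X_train_s if scaled else X_train, y_train)
        pred = model.predict(X_test_s if scaled else X_test)
        print(name, 'test MAE:', mean_absolute_error(y_test, pred))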