Single Variable Analysis

IEEE-CIS Fraud Detection

  • 链接:https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt

代码理解

  1. 数据融合:

     # Left-join the identity table onto the transaction table; transactions
     # without an identity record keep NaN in the identity columns.
     train = train_transaction.merge(train_identity, how='left', on='TransactionID')
     test = test_transaction.merge(test_identity, how='left', on='TransactionID')
    

注:将两个数据集合并,合并的依据是 'TransactionID',并采用左连接(how='left')保留所有交易记录

  1. 整体查看数据的一些性质

     def resumetable(df):
         """Summarize *df* column-by-column.

         Returns a DataFrame with one row per column of *df*: name, dtype,
         missing-value count, unique-value count, the first three values, and
         the base-2 Shannon entropy of the column's value distribution.
         """
         print(f"Dataset Shape: {df.shape}")
         summary = pd.DataFrame(df.dtypes, columns=['dtypes'])
         summary = summary.reset_index()
         summary['Name'] = summary['index']
         summary = summary[['Name', 'dtypes']]
         summary['Missing'] = df.isnull().sum().values
         summary['Uniques'] = df.nunique().values
         # Use positional indexing so the preview works for any index,
         # not only the default RangeIndex starting at 0.
         summary['First Value'] = df.iloc[0].values
         summary['Second Value'] = df.iloc[1].values
         summary['Third Value'] = df.iloc[2].values

         # Entropy (in bits) of each column's normalized value counts;
         # iterating the names directly replaces the value_counts() detour.
         for name in summary['Name']:
             summary.loc[summary['Name'] == name, 'Entropy'] = round(
                 stats.entropy(df[name].value_counts(normalize=True), base=2), 2)

         return summary
    

注:包括特征名,类型,缺失的数量,该特征中不同值的数量,并列出该特征的前3个值和熵(不确定度)
3. 减少内存的使用

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of *df* in place to the smallest dtype that
    holds their observed value range, and return *df*.

    NOTE: float columns may be downcast to float16, which keeps only ~3
    significant decimal digits — fine for EDA, risky for modeling features.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Inclusive bounds: values exactly at a dtype's min/max
                # still fit in that dtype (e.g. 127 fits in int8).
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
  1. 计算异常值

    def CalcOutliers(df_num):
        """Print outlier statistics for *df_num* using the 3-sigma rule.

        A value is an outlier when it lies below mean - 3*std or above
        mean + 3*std. Prints counts of lower/upper/total outliers, the count
        of non-outliers, and the outlier ratio; returns None.
        """
        # Mean and standard deviation define the 3-sigma band.
        data_mean, data_std = np.mean(df_num), np.std(df_num)

        # Cut line for both sides of the band; widen/narrow it here if needed.
        cut = data_std * 3
        lower, upper = data_mean - cut, data_mean + cut

        # Partition the data against the band in four passes.
        outliers_lower = [x for x in df_num if x < lower]
        outliers_higher = [x for x in df_num if x > upper]
        outliers_total = [x for x in df_num if x < lower or x > upper]
        outliers_removed = [x for x in df_num if x > lower and x < upper]

        print('Identified lowest outliers: %d' % len(outliers_lower))
        print('Identified upper outliers: %d' % len(outliers_higher))
        print('Total outlier observations: %d' % len(outliers_total))
        print('Non-outlier observations: %d' % len(outliers_removed))
        # Guard: when every observation is an outlier the original code
        # raised ZeroDivisionError here.
        if outliers_removed:
            print("Total percentual of Outliers: ", round((len(outliers_total) / len(outliers_removed)) * 100, 4))
        else:
            print("Total percentual of Outliers: undefined (no non-outlier observations)")

        return
    

注:这里主要通过均值和标准差来确定某值是否是异常值,异常值为小于 mean-3std 或大于 mean+3std 的值

Target Distribution

# --- Target distribution ---
# Left panel: counts of isFraud classes; right panel: total TransactionAmt
# per class. NOTE(review): relies on df_trans, plt and sns defined earlier.
df_trans['TransactionAmt'] = df_trans['TransactionAmt'].astype(float)
total = len(df_trans)
# Grand total amount across both classes (per-class sums, summed again).
total_amt = df_trans.groupby(['isFraud'])['TransactionAmt'].sum().sum()
plt.figure(figsize=(16,6))

plt.subplot(121)
g = sns.countplot(x='isFraud', data=df_trans, )
g.set_title("Fraud Transactions Distribution \n# 0: No Fraud | 1: Fraud #", fontsize=22)
g.set_xlabel("Is fraud?", fontsize=18)
g.set_ylabel('Count', fontsize=18)
# Annotate each count bar with its share (%) of all transactions.
for p in g.patches:
    height = p.get_height()
    g.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:1.2f}%'.format(height/total*100),
            ha="center", fontsize=15) 

# Per-class TransactionAmt sums feed the right-hand bar plot.
perc_amt = (df_trans.groupby(['isFraud'])['TransactionAmt'].sum())
perc_amt = perc_amt.reset_index()
plt.subplot(122)
g1 = sns.barplot(x='isFraud', y='TransactionAmt',  dodge=True, data=perc_amt)
g1.set_title("% Total Amount in Transaction Amt \n# 0: No Fraud | 1: Fraud #", fontsize=22)
g1.set_xlabel("Is fraud?", fontsize=18)
g1.set_ylabel('Total Transaction Amount Scalar', fontsize=18)
# Annotate each amount bar with its share (%) of the grand total amount.
for p in g1.patches:
    height = p.get_height()
    g1.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:1.2f}%'.format(height/total_amt * 100),
            ha="center", fontsize=15) 
    
plt.show()

注:这里就是画出Fraud的计数图(欺诈和未欺诈的个数),和欺诈和未欺诈案例涉及的总金额的柱状图
3. 查看数值特征的数据分布百分比

# Inspect the spread of transaction amounts at key percentiles,
# including the extreme tails (1% / 99%).
df_trans['TransactionAmt'] = df_trans['TransactionAmt'].astype(float)
quantile_levels = [.01, .025, .1, .25, .5, .75, .9, .975, .99]
print("Transaction Amounts Quantiles:")
print(df_trans['TransactionAmt'].quantile(quantile_levels))
  1. 类别特征的可视化

     # Row-normalized cross-tab of ProductCD vs isFraud: after * 100,
     # column 1 is the fraud rate (%) within each ProductCD category.
     tmp = pd.crosstab(df_trans['ProductCD'], df_trans['isFraud'], normalize='index') * 100
     tmp = tmp.reset_index()
     tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)
     
     # NOTE(review): uses the globals total, df_trans, plt, sns from earlier.
     plt.figure(figsize=(14,10))
     plt.suptitle('ProductCD Distributions', fontsize=22)
     
     # Top-left: raw category counts, each bar annotated with its % of all rows.
     plt.subplot(221)
     g = sns.countplot(x='ProductCD', data=df_trans)
     # plt.legend(title='Fraud', loc='upper center', labels=['No', 'Yes'])
     
     g.set_title("ProductCD Distribution", fontsize=19)
     g.set_xlabel("ProductCD Name", fontsize=17)
     g.set_ylabel("Count", fontsize=17)
     g.set_ylim(0,500000)
     for p in g.patches:
         height = p.get_height()
         g.text(p.get_x()+p.get_width()/2.,
                 height + 3,
                 '{:1.2f}%'.format(height/total*100),
                 ha="center", fontsize=14) 
     
     # Top-right: counts split by target, with the per-category fraud rate
     # from tmp drawn on a secondary y-axis (twinx) as a point plot.
     plt.subplot(222)
     g1 = sns.countplot(x='ProductCD', hue='isFraud', data=df_trans)
     plt.legend(title='Fraud', loc='best', labels=['No', 'Yes'])
     gt = g1.twinx()
     gt = sns.pointplot(x='ProductCD', y='Fraud', data=tmp, color='black', order=['W', 'H',"C", "S", "R"], legend=False)
     gt.set_ylabel("% of Fraud Transactions", fontsize=16)
     
     g1.set_title("Product CD by Target(isFraud)", fontsize=19)
     g1.set_xlabel("ProductCD Name", fontsize=17)
     g1.set_ylabel("Count", fontsize=17)
     
     # Bottom: transaction-amount distribution per category and target,
     # capped at 2000 so the boxen plot stays readable.
     plt.subplot(212)
     g3 = sns.boxenplot(x='ProductCD', y='TransactionAmt', hue='isFraud', 
                   data=df_trans[df_trans['TransactionAmt'] <= 2000] )
     g3.set_title("Transaction Amount Distribuition by ProductCD and Target", fontsize=20)
     g3.set_xlabel("ProductCD Name", fontsize=17)
     g3.set_ylabel("Transaction Values", fontsize=17)
     
     plt.subplots_adjust(hspace = 0.6, top = 0.85)
     
     plt.show()
    

注:
a. 第一个图是每个类别(类别数较少)的分布以及比例。
b. 根据目标变量,画出二分类条件下每个类别的分布以及同种类别下,两种情况的比例的折线图
c. 图3画出的是连续性变量在目标变量下的分布图

5. 查看类别变量在目标变量的分类下的类别分布

# Treat missing values in the M1–M9 match flags as their own "Miss" category.
m_columns = ['M' + str(i) for i in range(1, 10)]
for m_col in m_columns:
    df_trans[m_col] = df_trans[m_col].fillna("Miss")
    
def ploting_dist_ratio(df, col, lim=2000):
    """Plot distribution and fraud ratio for one categorical column *col*.

    Left panel: category counts with the per-category fraud rate (%)
    overlaid on a secondary axis. Right panel: TransactionAmt box plots
    (values <= *lim*) per category and target, with each category's share
    of the total fraud amount overlaid.

    Relies on the module-level globals ``total`` (row count), ``total_amt``
    (grand total TransactionAmt), and the plotting aliases plt / sns.
    """
    # Row-normalized cross-tab: column 1 becomes the fraud rate (%) per category.
    tmp = pd.crosstab(df[col], df['isFraud'], normalize='index') * 100
    tmp = tmp.reset_index()
    tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)

    plt.figure(figsize=(20,5))
    plt.suptitle(f'{col} Distributions ', fontsize=22)

    plt.subplot(121)
    g = sns.countplot(x=col, data=df, order=list(tmp[col].values))
    # plt.legend(title='Fraud', loc='upper center', labels=['No', 'Yes'])
    g.set_title(f"{col} Distribution\nCound and %Fraud by each category", fontsize=18)
    g.set_ylim(0,400000)
    gt = g.twinx()
    gt = sns.pointplot(x=col, y='Fraud', data=tmp, order=list(tmp[col].values),
                       color='black', legend=False, )
    gt.set_ylim(0,20)
    gt.set_ylabel("% of Fraud Transactions", fontsize=16)
    g.set_xlabel(f"{col} Category Names", fontsize=16)
    g.set_ylabel("Count", fontsize=17)
    # NOTE(review): pointplot draws lines/markers, not bars, so gt.patches is
    # typically empty and this annotation loop is a no-op — confirm intent.
    for p in gt.patches:
        height = p.get_height()
        gt.text(p.get_x()+p.get_width()/2.,
                height + 3,
                '{:1.2f}%'.format(height/total*100),
                ha="center",fontsize=14) 

    # Share (%) of the grand total amount per (isFraud, category) pair.
    # Fix: group the *df* argument, not the global df_trans, so the function
    # actually operates on the DataFrame passed in.
    perc_amt = (df.groupby(['isFraud',col])['TransactionAmt'].sum() / total_amt * 100).unstack('isFraud')
    perc_amt = perc_amt.reset_index()
    perc_amt.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)

    plt.subplot(122)
    g1 = sns.boxplot(x=col, y='TransactionAmt', hue='isFraud', 
                     data=df[df['TransactionAmt'] <= lim], order=list(tmp[col].values))
    g1t = g1.twinx()
    g1t = sns.pointplot(x=col, y='Fraud', data=perc_amt, order=list(tmp[col].values),
                       color='black', legend=False, )
    g1t.set_ylim(0,5)
    g1t.set_ylabel("%Fraud Total Amount", fontsize=16)
    g1.set_title(f"{col} by Transactions dist", fontsize=18)
    g1.set_xlabel(f"{col} Category Names", fontsize=16)
    g1.set_ylabel("Transaction Amount(U$)", fontsize=16)
        
    plt.subplots_adjust(hspace=.4, wspace = 0.35, top = 0.80)
    
    plt.show()

注:对于有缺失值的特征,先填充为Miss,然后将Miss当作该特征的一个类别

  1. 第一个图就是每个类别的计数,然后和是Fraud的比例的折线图。

  2. 第二个图是连续性变量的箱型图及比例。
    6. 对于时间问题的处理
    # Derive calendar features from the TransactionDT second-offset.
    import datetime

     # TransactionDT counts seconds elapsed from a reference date.
     # NOTE(review): the real epoch is not given by the competition; this
     # start date is an assumption commonly used in public kernels — confirm.
     START_DATE = '2017-12-01'
     startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
     df_trans['Date'] = df_trans['TransactionDT'].apply(lambda x: (startdate+datetime.timedelta(seconds=x)))
     
     # Break the reconstructed timestamp into weekday / hour / day-of-month
     # features for per-period fraud analysis.
     df_trans['_Weekdays'] = df_trans['Date'].dt.dayofweek
     df_trans['_Hours'] = df_trans['Date'].dt.hour
     df_trans['_Days'] = df_trans['Date'].dt.day
    

注:首先给出一个起始时间,将它转化为标准格式,因为TransactionDT记录的是秒,这里想根据这个信息将它转化为具体的时间,然后按星期,按天,按小时进行分析。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值