IEEE-CIS Fraud Detection
- 链接:https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt
代码理解
-
数据融合:
# Left-join the identity table onto the transaction table: every transaction
# row is kept; identity columns are NaN where no identity record exists.
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
注:将交易表与身份表左连接合并,合并的键是 'TransactionID'
-
整体查看数据的一些性质
def resumetable(df):
    """Summarize a DataFrame column-by-column.

    For each column reports: name, dtype, number of missing values, number
    of unique values, the first three values, and the Shannon entropy
    (base 2) of the column's value distribution.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. Assumes a default RangeIndex with at least 3 rows,
        since rows 0, 1 and 2 are sampled via ``df.loc``.

    Returns
    -------
    pd.DataFrame
        One summary row per column of ``df``.
    """
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes, columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name', 'dtypes']]
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values
    summary['First Value'] = df.loc[0].values
    summary['Second Value'] = df.loc[1].values
    summary['Third Value'] = df.loc[2].values
    # Entropy of the normalized value counts quantifies how spread out
    # (uncertain) each column's distribution is.
    for name in summary['Name'].value_counts().index:
        summary.loc[summary['Name'] == name, 'Entropy'] = round(
            stats.entropy(df[name].value_counts(normalize=True), base=2), 2)
    return summary
注:包括特征名,类型,缺失的数量,该特征中不同值的数量,并列出该特征的前3个值和熵(不确定度)
3. 减少内存的使用
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that holds their range.

    Mutates ``df`` in place (and returns it) to cut the DataFrame's memory
    footprint — useful for the large IEEE-CIS transaction tables.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to shrink; modified in place.
    verbose : bool
        When True, print the memory reduction achieved.

    Returns
    -------
    pd.DataFrame
        The same (mutated) frame.

    NOTE: float columns may be downcast to float16, which carries only ~3
    significant decimal digits — fine for EDA, risky for exact arithmetic.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Inclusive bounds (>=, <=): a value exactly at a dtype's
                # min/max still fits that dtype; the original strict
                # comparisons bumped such columns into a wider type.
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
-
计算异常值
def CalcOutliers(df_num):
    """Print outlier statistics for a 1-D numeric array (3-sigma rule).

    A point x is an outlier when x < mean - 3*std or x > mean + 3*std.
    Prints the counts of lower/upper/total outliers, the non-outlier count,
    and the outliers as a percentage of the non-outlier observations.

    Parameters
    ----------
    df_num : array-like of numbers

    Returns
    -------
    None
    """
    data_mean, data_std = np.mean(df_num), np.std(df_num)
    # Cut line at 3 standard deviations on each side; change the factor to
    # tighten or loosen the definition of "outlier".
    cut = data_std * 3
    lower, upper = data_mean - cut, data_mean + cut
    outliers_lower = [x for x in df_num if x < lower]
    outliers_higher = [x for x in df_num if x > upper]
    outliers_total = [x for x in df_num if x < lower or x > upper]
    # NOTE: strict comparisons mean a point exactly on a cut line is counted
    # neither as an outlier nor as a kept observation (matches the original).
    outliers_removed = [x for x in df_num if x > lower and x < upper]
    print('Identified lowest outliers: %d' % len(outliers_lower))
    print('Identified upper outliers: %d' % len(outliers_higher))
    print('Total outlier observations: %d' % len(outliers_total))
    print('Non-outlier observations: %d' % len(outliers_removed))
    # Guard the ratio: the original divided by len(outliers_removed), which
    # raises ZeroDivisionError when every point is an outlier.
    if outliers_removed:
        print("Total percentual of Outliers: ",
              round((len(outliers_total) / len(outliers_removed)) * 100, 4))
    else:
        print("Total percentual of Outliers: ", 0.0)
    return
注:这里主要通过均值和标准差来确定该值是否是异常值,异常值为小于 mean-3std 或大于 mean+3std 的值
Target Distribution
# Target distribution: class counts (left) and total transaction amount per
# class (right), each bar annotated with its percentage share.
df_trans['TransactionAmt'] = df_trans['TransactionAmt'].astype(float)
total = len(df_trans)
# Overall amount across both classes — denominator for the right-hand plot.
total_amt = df_trans.groupby(['isFraud'])['TransactionAmt'].sum().sum()
plt.figure(figsize=(16,6))

plt.subplot(121)
g = sns.countplot(x='isFraud', data=df_trans, )
g.set_title("Fraud Transactions Distribution \n# 0: No Fraud | 1: Fraud #", fontsize=22)
g.set_xlabel("Is fraud?", fontsize=18)
g.set_ylabel('Count', fontsize=18)
# Annotate each bar with its share of all transactions.
for p in g.patches:
    height = p.get_height()
    g.text(p.get_x()+p.get_width()/2.,
           height + 3,
           '{:1.2f}%'.format(height/total*100),
           ha="center", fontsize=15)

perc_amt = (df_trans.groupby(['isFraud'])['TransactionAmt'].sum())
perc_amt = perc_amt.reset_index()
plt.subplot(122)
g1 = sns.barplot(x='isFraud', y='TransactionAmt', dodge=True, data=perc_amt)
g1.set_title("% Total Amount in Transaction Amt \n# 0: No Fraud | 1: Fraud #", fontsize=22)
g1.set_xlabel("Is fraud?", fontsize=18)
g1.set_ylabel('Total Transaction Amount Scalar', fontsize=18)
# Annotate each bar with its share of the total transaction amount.
for p in g1.patches:
    height = p.get_height()
    g1.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:1.2f}%'.format(height/total_amt * 100),
            ha="center", fontsize=15)
plt.show()
注:这里就是画出Fraud的计数图(欺诈和未欺诈的个数),和欺诈和未欺诈案例涉及的总金额的柱状图
4. 查看数值特征的数据分布百分比
# Inspect the spread of TransactionAmt at a fixed set of quantile points.
df_trans['TransactionAmt'] = df_trans['TransactionAmt'].astype(float)
quantile_points = [.01, .025, .1, .25, .5, .75, .9, .975, .99]
print("Transaction Amounts Quantiles:")
print(df_trans['TransactionAmt'].quantile(quantile_points))
-
类别特征的可视化
# ProductCD visualization: overall category distribution, per-category fraud
# split with a fraud-rate overlay, and amount distribution per category/target.
# Relies on the module-level global `total` (row count of df_trans).
tmp = pd.crosstab(df_trans['ProductCD'], df_trans['isFraud'], normalize='index') * 100
tmp = tmp.reset_index()
tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)

plt.figure(figsize=(14,10))
plt.suptitle('ProductCD Distributions', fontsize=22)

plt.subplot(221)
g = sns.countplot(x='ProductCD', data=df_trans)
# plt.legend(title='Fraud', loc='upper center', labels=['No', 'Yes'])
g.set_title("ProductCD Distribution", fontsize=19)
g.set_xlabel("ProductCD Name", fontsize=17)
g.set_ylabel("Count", fontsize=17)
g.set_ylim(0,500000)
# Annotate each bar with its share of all transactions.
for p in g.patches:
    height = p.get_height()
    g.text(p.get_x()+p.get_width()/2.,
           height + 3,
           '{:1.2f}%'.format(height/total*100),
           ha="center", fontsize=14)

plt.subplot(222)
g1 = sns.countplot(x='ProductCD', hue='isFraud', data=df_trans)
plt.legend(title='Fraud', loc='best', labels=['No', 'Yes'])
# Secondary axis: fraud rate (%) within each category.
gt = g1.twinx()
gt = sns.pointplot(x='ProductCD', y='Fraud', data=tmp, color='black',
                   order=['W', 'H',"C", "S", "R"], legend=False)
gt.set_ylabel("% of Fraud Transactions", fontsize=16)
g1.set_title("Product CD by Target(isFraud)", fontsize=19)
g1.set_xlabel("ProductCD Name", fontsize=17)
g1.set_ylabel("Count", fontsize=17)

plt.subplot(212)
g3 = sns.boxenplot(x='ProductCD', y='TransactionAmt', hue='isFraud',
                   data=df_trans[df_trans['TransactionAmt'] <= 2000] )
g3.set_title("Transaction Amount Distribuition by ProductCD and Target", fontsize=20)
g3.set_xlabel("ProductCD Name", fontsize=17)
g3.set_ylabel("Transaction Values", fontsize=17)

plt.subplots_adjust(hspace = 0.6, top = 0.85)
plt.show()
注:
a. 第一个图是每个类别(类别数较少)的分布以及比例。
b. 根据目标变量,画出二分类条件下每个类别的分布以及同种类别下,两种情况的比例的折线图
c. 图3画出的是连续性变量在目标变量下的分布图
5. 查看类别变量在目标变量的分类下的类别分布
# Treat missing values in the M1-M9 match flags as their own "Miss" category,
# so NaN becomes an explicit level of each categorical feature.
m_columns = ['M%d' % i for i in range(1, 10)]
for m_col in m_columns:
    df_trans[m_col] = df_trans[m_col].fillna("Miss")
def ploting_dist_ratio(df, col, lim=2000):
    """Plot a categorical column against the fraud target.

    Left: category counts with a pointplot overlay of the fraud rate (%)
    per category. Right: boxplots of TransactionAmt (capped at ``lim``) per
    category and target, with an overlay of each category's share of the
    total fraud amount.

    Parameters
    ----------
    df : pd.DataFrame
        Transaction frame with 'isFraud' and 'TransactionAmt' columns.
    col : str
        Categorical column to analyze (NaNs should already be filled).
    lim : float
        Upper TransactionAmt cut for the boxplot (default 2000).

    Relies on the module-level globals ``total`` (row count) and
    ``total_amt`` (overall transaction amount) as percentage denominators.
    """
    tmp = pd.crosstab(df[col], df['isFraud'], normalize='index') * 100
    tmp = tmp.reset_index()
    tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)

    plt.figure(figsize=(20,5))
    plt.suptitle(f'{col} Distributions ', fontsize=22)

    plt.subplot(121)
    g = sns.countplot(x=col, data=df, order=list(tmp[col].values))
    # plt.legend(title='Fraud', loc='upper center', labels=['No', 'Yes'])
    g.set_title(f"{col} Distribution\nCound and %Fraud by each category", fontsize=18)
    g.set_ylim(0,400000)
    # Secondary axis: fraud rate (%) per category.
    gt = g.twinx()
    gt = sns.pointplot(x=col, y='Fraud', data=tmp, order=list(tmp[col].values),
                       color='black', legend=False, )
    gt.set_ylim(0,20)
    gt.set_ylabel("% of Fraud Transactions", fontsize=16)
    g.set_xlabel(f"{col} Category Names", fontsize=16)
    g.set_ylabel("Count", fontsize=17)
    for p in gt.patches:
        height = p.get_height()
        gt.text(p.get_x()+p.get_width()/2.,
                height + 3,
                '{:1.2f}%'.format(height/total*100),
                ha="center",fontsize=14)

    # BUG FIX: the original grouped the global df_trans here instead of the
    # `df` parameter; use the parameter so the function works on any frame
    # (behavior is unchanged for the kernel's actual call with df_trans).
    perc_amt = (df.groupby(['isFraud',col])['TransactionAmt'].sum() / total_amt * 100).unstack('isFraud')
    perc_amt = perc_amt.reset_index()
    perc_amt.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)

    plt.subplot(122)
    g1 = sns.boxplot(x=col, y='TransactionAmt', hue='isFraud',
                     data=df[df['TransactionAmt'] <= lim], order=list(tmp[col].values))
    # Secondary axis: each category's share of the total fraud amount.
    g1t = g1.twinx()
    g1t = sns.pointplot(x=col, y='Fraud', data=perc_amt, order=list(tmp[col].values),
                        color='black', legend=False, )
    g1t.set_ylim(0,5)
    g1t.set_ylabel("%Fraud Total Amount", fontsize=16)
    g1.set_title(f"{col} by Transactions dist", fontsize=18)
    g1.set_xlabel(f"{col} Category Names", fontsize=16)
    g1.set_ylabel("Transaction Amount(U$)", fontsize=16)
    plt.subplots_adjust(hspace=.4, wspace = 0.35, top = 0.80)
    plt.show()
注:对于有缺失值的特征,先填充为Miss,然后将Miss当作该特征的一个类别
-
第一个图就是每个类别的计数,然后和是Fraud的比例的折线图。
-
第二个图是连续性变量的箱型图及比例。
6. 对于时间问题的处理
import datetime

# TransactionDT is an elapsed-seconds counter with an undisclosed origin;
# anchor it to an assumed start date so calendar features can be derived.
# NOTE(review): '2017-12-01' is this kernel's guess at the origin — the
# competition does not publish the true reference date.
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
df_trans['Date'] = df_trans['TransactionDT'].apply(lambda x: (startdate+datetime.timedelta(seconds=x)))
# Calendar features: day of week (0 = Monday), day of month, hour of day.
df_trans['_Weekdays'] = df_trans['Date'].dt.dayofweek
df_trans['_Hours'] = df_trans['Date'].dt.hour
df_trans['_Days'] = df_trans['Date'].dt.day
注:首先给出一个起始时间,将它转化为标准格式,因为TransactionDT记录的是秒,这里想根据这个信息将它转化为具体的时间,然后按星期,按天,按小时进行分析。