Kaggle项目:信用贷款欺诈检测
Credit Card Fraud Detection
数据背景: 在批准客户的贷款时,探索所有的可能性。
数据内容: 数据集分为3个csv表,其中:'application_data.csv’为主表,有307510行和122列,包括目标列;'columns_description.csv’用于对其他表各字段的含义进行解释;'previous_application.csv’为以前的申请数据。
数据来源链接: https://www.kaggle.com/datasets/mishra5001/credit-card?datasetId=263888&sortBy=voteCount&select=application_data.csv
以下通过Python对信用卡数据进行清理和探索性数据分析(Exploratory Data Analysis),并尝试预测信用欺诈:
一、数据准备及概览
#引入库
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
#引入机器学习库
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score,precision_score, recall_score
from sklearn.metrics import confusion_matrix
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
# 正常显示中文和负号
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
data = pd.read_csv('application_data.csv')
origin = data.copy()
data.head()
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 122 columns
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB
#查看数据集是否平衡
fraud = len(data[data.TARGET == 1])
legit = len(data[data.TARGET == 0])
print("欺诈交易的数量:", fraud)
print("欺诈交易占比: {:.4f} %".format(fraud/(fraud+legit)*100))
欺诈交易的数量: 24825
欺诈交易占比: 8.0729 %
二、数据清洗
#检查是否有重复值
print('数据集中重复记录个数: ', data.duplicated().sum())
数据集中重复记录个数: 0
#新增列——是否有申请过
data2 = pd.read_csv('previous_application.csv')
id_num = data2['SK_ID_CURR'][data2['AMT_APPLICATION']!=0.0].unique()
data['HAS_APPLIED'] = data['SK_ID_CURR'].isin(id_num).astype(int)
#删除ID字段
data.drop('SK_ID_CURR',axis=1,inplace=True)
#找出含缺失值的列,并求缺失值占比
per_null = data.isnull().sum().sort_values(ascending = False)/len(data)
per_null[per_null>0].head()
COMMONAREA_MODE 0.698723
COMMONAREA_AVG 0.698723
COMMONAREA_MEDI 0.698723
NONLIVINGAPARTMENTS_MEDI 0.694330
NONLIVINGAPARTMENTS_AVG 0.694330
dtype: float64
#缺失值占比可视化
fig, ax = plt.subplots(figsize=(6, 12))
per_null[per_null>0].plot(kind='barh',ax=ax,title='缺失值占比')
<AxesSubplot:title={'center':'缺失值占比'}>
#删除缺失值占比大于40%的列
null_col_name = per_null[per_null>0.4].index
data.drop(null_col_name,axis=1,inplace=True)
三、探索性数据分析(EDA)
(一)Object 类型字段
#Object类型字段
print("Object类型字段:",np.count_nonzero(data.select_dtypes('object').columns))
print(data.select_dtypes('object').columns)
Object类型字段: 12
Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE'],
dtype='object')
#求缺失值数量和占比的函数
def missing(df):
total = df.isnull().sum().sort_values(ascending = False)
total = total[total>0]
percent = df.isnull().sum().sort_values(ascending = False)/len(df)*100
percent = percent[percent>0]
return pd.concat([total, percent], axis=1, keys=['Total','Percentage'])
#查看是否有缺失值
missing(data.select_dtypes('object'))