'''
数据集下载链接:https://pan.baidu.com/s/1GiPu5A_9FIFOn7EveLHEzQ 密码:1582
'''
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import warnings
import seaborn as sns
import gc
warnings. simplefilter( 'ignore' )
sns. set ( )
plt. rcParams[ 'font.sans-serif' ] = [ 'KaiTi' ]
plt. rcParams[ 'font.serif' ] = [ 'KaiTi' ]
sns. set_style( "darkgrid" , { "font.sans-serif" : [ 'KaiTi' , 'Arial' ] } )
% matplotlib inline
train_id_file = './train_identity.csv'
train_transaction_file = './train_transaction.csv'
test_id_file = './test_identity.csv'
test_transaction_file = './test_transaction.csv'
def _smallest_fitting_dtype(c_min, c_max, candidates, limits_of):
    """Return the first dtype in ``candidates`` whose range holds [c_min, c_max], else None."""
    for candidate in candidates:
        limits = limits_of(candidate)
        # Inclusive bounds: a value equal to the dtype's limit still fits.
        # NaN bounds (empty / all-NaN column) compare False and yield None.
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
    that holds their min and max; float columns to float16 or float32 when
    their range fits; object (and pandas string) columns become ``category``.
    Any other dtype (bool, unsigned, datetime, categorical, extension types)
    is left untouched, as is a column whose range cannot be determined.

    NOTE: float16 keeps only about three significant decimal digits, so the
    float downcast is lossy for values that need more precision.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Text columns: store as categories.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed-int / float columns are downcast; taking
        # min()/max() of e.g. an unordered categorical would raise.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'i':
            target = _smallest_fitting_dtype(c_min, c_max, int_candidates, np.iinfo)
        else:
            target = _smallest_fitting_dtype(c_min, c_max, float_candidates, np.finfo)

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so an empty frame does not divide by zero.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(decreased))
    return df
# Read each CSV and immediately shrink its dtypes via reduce_mem_usage.
train_id = pd.read_csv(train_id_file).pipe(reduce_mem_usage)
train_transaction = pd.read_csv(train_transaction_file).pipe(reduce_mem_usage)
test_id = pd.read_csv(test_id_file).pipe(reduce_mem_usage)
# NOTE(review): the name is spelled "trainsaction" in the original; kept as-is
# because later code may refer to it.
test_trainsaction = pd.read_csv(test_transaction_file).pipe(reduce_mem_usage)
Memory usage of dataframe is 45.12 MB
Memory usage after optimization is: 10.02 MB
Decreased by 77.8%
Memory usage of dataframe is 1775.15 MB
Memory usage after optimization is: 487.16 MB
Decreased by 72.6%
Memory usage of dataframe is 44.39 MB
Memory usage after optimization is: 9.86 MB
Decreased by 77.8%
Memory usage of dataframe is 1519.24 MB
Memory usage after optimization is: 425.24 MB
Decreased by 72.0%
# Widen the amount column back to 64-bit float for the aggregations below.
# NOTE(review): reduce_mem_usage may already have stored it as float16, in which
# case the precision lost there is not recovered by this cast.
train_transaction['TransactionAmt'] = train_transaction['TransactionAmt'].astype('float64')

# Row count of the training transactions; denominator for the count percentages.
total = train_transaction.shape[0]
total
590540
# Sum of all transaction amounts; denominator for the amount percentages.
total_amt = train_transaction.groupby(['isFraud'])['TransactionAmt'].sum().sum()

plt.figure(figsize=(12, 5))

# Left panel: number of rows per isFraud value, each bar labelled with its share of rows.
plt.subplot(121)
plot_tr = sns.countplot(x='isFraud', data=train_transaction)
plot_tr.set_title('Fraud Tran Distribution \n 0: No | 1: is', fontsize=16)
plot_tr.set_xlabel('is Fraud', fontsize=16)
plot_tr.set_ylabel('count', fontsize=16)
for bar in plot_tr.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    bar_label = '{:1.2f}%'.format(bar_height / total * 100)
    plot_tr.text(bar_center, bar_height + 3, bar_label, ha='center', fontsize=15)

# Right panel: summed amount per isFraud value, labelled with its share of total_amt.
percent_amt = train_transaction.groupby(['isFraud'])['TransactionAmt'].sum().reset_index()
plt.subplot(122)
plot_tr_2 = sns.barplot(x='isFraud', y='TransactionAmt', dodge=True, data=percent_amt)
plot_tr_2.set_title('% Total Amount in Transaction Amt \n 0: No Fraud | 1: Fraud', fontsize=18)
plot_tr_2.set_xlabel('Is Fraud?', fontsize=16)
plot_tr_2.set_ylabel('Total Transaction Amout Scalar', fontsize=16)
for bar in plot_tr_2.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    bar_label = '{:1.2f}%'.format(bar_height / total_amt * 100)
    plot_tr_2.text(bar_center, bar_height + 3, bar_label, ha='center', fontsize=15)

plt.subplots_adjust(wspace=0.65)
plt.show()
# Row selectors and sorted amounts per class, computed once and reused by the scatter plots.
transaction_amounts = train_transaction['TransactionAmt']
no_fraud_sorted = np.sort(
    train_transaction.loc[train_transaction['isFraud'] == 0, 'TransactionAmt'].values)
is_fraud_sorted = np.sort(
    train_transaction.loc[train_transaction['isFraud'] == 1, 'TransactionAmt'].values)

# Figure 1: distribution of amounts up to 1000, and of log(amount).
plt.figure(figsize=(16, 12))
plt.suptitle('Tran Values Distribution', fontsize=22)

plt.subplot(221)
sub_plot_1 = sns.distplot(transaction_amounts[transaction_amounts <= 1000])
sub_plot_1.set_title('小于1000元的消费对比', fontsize=16)
sub_plot_1.set_xlabel('')
sub_plot_1.set_ylabel('probability', fontsize=15)

plt.subplot(222)
sub_plot_2 = sns.distplot(np.log(transaction_amounts))
sub_plot_2.set_title('正态转换后的消费情况', fontsize=16)
sub_plot_2.set_xlabel('')
sub_plot_2.set_ylabel('probability', fontsize=15)

# Figure 2: sorted amounts of both classes overlaid against their rank.
plt.figure(figsize=(16, 12))
plt.subplot(212)
sub_plot_3 = plt.scatter(range(len(no_fraud_sorted)), no_fraud_sorted,
                         label='NoFraud', alpha=.2)
sub_plot_3 = plt.scatter(range(len(is_fraud_sorted)), is_fraud_sorted,
                         label='IsFraud', alpha=.2)
sub_plot_3 = plt.title('ECDF \n 违约用户与正常用户的消费金额对比', fontsize=18)
sub_plot_3 = plt.legend()

# Figure 3: the same sorted amounts, one panel per class.
plt.figure(figsize=(16, 12))
plt.subplot(321)
sub_plot_4 = plt.scatter(range(len(no_fraud_sorted)), no_fraud_sorted,
                         label='NoFraud', alpha=.4)
plt.title('正常用户的消费区间', fontsize=16)
plt.xlabel('Index', fontsize=15)
plt.ylabel('消费金额', fontsize=15)

plt.subplot(322)
sub_plot_5 = plt.scatter(range(len(is_fraud_sorted)), is_fraud_sorted,
                         label='IsFraud', alpha=.2)
plt.title('违约用户的消费区间', fontsize=16)
plt.xlabel('Index', fontsize=15)
plt.ylabel('消费金额', fontsize=15)
plt.show()
# Row-normalised crosstab: per ProductCD, the percentage of rows in each isFraud class.
tmp = (
    pd.crosstab(train_transaction['ProductCD'], train_transaction['isFraud'], normalize='index')
    .mul(100)
    .reset_index()
    .rename(columns={0: 'NoFraud', 1: 'IsFraud'})
)

# Fixed category order shared by every panel.
product_order = ['W', 'H', 'C', 'S', 'R']

plt.figure(figsize=(14, 10))
plt.suptitle('产品类型展示', fontsize=22)

# Top-left: row count per product code, labelled with its share of all rows.
plt.subplot(221)
plot_1 = sns.countplot(x='ProductCD', data=train_transaction, order=product_order)
plot_1.set_title('产品代码展示', fontsize=18)
plot_1.set_xlabel('产品代码', fontsize=16)
plot_1.set_ylabel('消费次数', fontsize=17)
plot_1.set_ylim(0, 500000)
for bar in plot_1.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    bar_label = '{:1.2f}%'.format(bar_height / total * 100)
    plot_1.text(bar_center, bar_height + 3, bar_label, ha='center', fontsize=14)

# Top-right: counts split by isFraud, with the fraud percentage on a twin y-axis.
plt.subplot(222)
plot_2 = sns.countplot(x='ProductCD', hue='isFraud', data=train_transaction,
                       order=product_order)
plt.legend(title='Fraud', loc='best', labels=['No', 'Yes'])
plot_2_2 = plot_2.twinx()
plot_2_2 = sns.pointplot(x='ProductCD', y='IsFraud', data=tmp, color='black',
                         order=product_order, legend=False)
plot_2.set_title('单产品违约消费者对比', fontsize=16)
plot_2.set_xlabel('产品代码', fontsize=16)
plot_2.set_ylabel('消费次数', fontsize=16)

# Bottom: amount spread per product code and class, restricted to amounts <= 2000.
plt.subplot(212)
amount_capped = train_transaction[train_transaction['TransactionAmt'] <= 2000]
plot_3 = sns.boxplot(x='ProductCD', y='TransactionAmt', hue='isFraud',
                     data=amount_capped, order=product_order)
plot_3.set_title('单种产品消费金额', fontsize=16)
plot_3.set_xlabel('产品代码', fontsize=16)
plot_3.set_ylabel('消费金额', fontsize=16)

plt.subplots_adjust(hspace=0.6, top=0.85)
plt.show()
# Collapse card3 / card5 values that occur fewer than 200 times into 'Others'.
for col, min_count in (('card3', 200), ('card5', 200)):
    counts = train_transaction[col].value_counts()  # computed once per column
    rare_mask = train_transaction[col].isin(counts[counts < min_count].index)
    if rare_mask.any():
        # Writing a string into a numeric column relies on an implicit upcast
        # that pandas has deprecated; cast to object explicitly first.
        if pd.api.types.is_numeric_dtype(train_transaction[col]):
            train_transaction[col] = train_transaction[col].astype(object)
        train_transaction.loc[rare_mask, col] = 'Others'
# Per-value fraud percentages for card3 (tmp) and card5 (tmp2).
tmp = (
    pd.crosstab(train_transaction['card3'], train_transaction['isFraud'], normalize='index')
    .mul(100)
    .reset_index()
    .rename(columns={0: 'NoFraud', 1: 'IsFraud'})
)
tmp2 = (
    pd.crosstab(train_transaction['card5'], train_transaction['isFraud'], normalize='index')
    .mul(100)
    .reset_index()
    .rename(columns={0: 'NoFraud', 1: 'IsFraud'})
)

# Row selectors for the two classes and the x-axis orders, computed once.
fraud_rows = train_transaction['isFraud'] == 1
no_fraud_rows = train_transaction['isFraud'] == 0
card3_order = list(tmp['card3'].values)
card5_order = list(tmp2['card5'].values)

plt.figure(figsize=(14, 22))

# Panel 1: card1 distribution per class.
plt.subplot(411)
plot_1 = sns.distplot(train_transaction.loc[fraud_rows, 'card1'], label='Fraud')
plot_1 = sns.distplot(train_transaction.loc[no_fraud_rows, 'card1'], label='NoFraud')
plot_1.legend()
plot_1.set_title('Card1正常用户与违约用户对比', fontsize=20)
plot_1.set_xlabel('Card1值', fontsize=18)
plot_1.set_ylabel('可能性', fontsize=18)

# Panel 2: card2 distribution per class (missing values dropped).
plt.subplot(412)
plot_2 = sns.distplot(train_transaction.loc[fraud_rows, 'card2'].dropna(), label='Fraud')
plot_2 = sns.distplot(train_transaction.loc[no_fraud_rows, 'card2'].dropna(), label='NoFraud')
plot_2.legend()
plot_2.set_title('Card2正常用户与异常用户对比', fontsize=20)
plot_2.set_xlabel('Card2值', fontsize=18)
plot_2.set_ylabel('可能性', fontsize=18)

# Panel 3: card3 counts with the fraud percentage on a twin y-axis.
plt.subplot(413)
plot_3 = sns.countplot(x='card3', data=train_transaction, order=card3_order)
plot_3_2 = plot_3.twinx()
plot_3_2 = sns.pointplot(x='card3', y='IsFraud', data=tmp,
                         color='black', order=card3_order)
plot_3_2.set_ylabel('异常占比', fontsize=16)
plot_3.set_title('Card3正常用户与异常用户对比', fontsize=20)
plot_3.set_xlabel('Card3值', fontsize=18)
plot_3.set_ylabel('消费总笔数', fontsize=18)
for bar in plot_3.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    bar_label = '{:1.2f}%'.format(bar_height / total * 100)
    plot_3.text(bar_center, bar_height + 25, bar_label, ha='center')

# Panel 4: card5 counts with the fraud percentage on a twin y-axis.
plt.subplot(414)
plot_4 = sns.countplot(x='card5', data=train_transaction, order=card5_order)
plot_4_2 = plot_4.twinx()
plot_4_2 = sns.pointplot(x='card5', y='IsFraud', data=tmp2,
                         color='black', order=card5_order)
plot_4_2.set_ylabel('异常占比', fontsize=16)
plot_4.set_title('Card5正常用户与异常用户对比', fontsize=20)
plot_4.set_xlabel('Card5值', fontsize=18)
plot_4.set_ylabel('消费总笔数', fontsize=18)
for bar in plot_4.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    bar_label = '{:1.2f}%'.format(bar_height / total * 100)
    plot_4.text(bar_center, bar_height + 25, bar_label, ha='center')

plt.subplots_adjust(hspace=0.6, top=0.85)
plt.show()
# Per-card4 percentage of rows in each isFraud class.
tmp = (
    pd.crosstab(train_transaction['card4'], train_transaction['isFraud'], normalize='index')
    .mul(100)
    .reset_index()
    .rename(columns={0: 'NoFraud', 1: 'IsFraud'})
)

# Fixed x-axis order used by the two upper panels.
card4_order = ['american express', 'discover', 'mastercard', 'visa']

plt.figure(figsize=(14, 10))
plt.suptitle('Card4')

# Top-left: row count per card4 value, labelled with its share of all rows.
plt.subplot(221)
plot_1 = sns.countplot(x='card4', data=train_transaction, order=card4_order)
plot_1.set_title('Card4种类以及数量', fontsize=20)
plot_1.set_xlabel('Card4种类')
plot_1.set_ylabel('总量')
for bar in plot_1.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    bar_label = '{:1.2f}%'.format(bar_height / total * 100)
    plot_1.text(bar_center, bar_height + 3, bar_label, ha='center', fontsize=14)

# Top-right: counts split by isFraud, with the fraud percentage on a twin y-axis.
plt.subplot(222)
plot_2 = sns.countplot(x='card4', hue='isFraud', data=train_transaction,
                       order=card4_order)
plot_2_2 = plot_2.twinx()
plot_2_2 = sns.pointplot(x='card4', y='IsFraud', data=tmp, color='black', legend=False,
                         order=card4_order)
plot_2_2.set_ylabel('违约占比')
plot_2.set_title('card4正常与违约用户对比', fontsize=20)
plot_2.set_xlabel('Card4种类')
plot_2.set_ylabel('总量')

# Bottom: amount spread per card4 value and class, restricted to amounts <= 2000.
# No explicit order is passed here, unlike the panels above.
plt.subplot(212)
amount_capped = train_transaction[train_transaction['TransactionAmt'] <= 2000]
plot_3 = sns.boxplot(x='card4', y='TransactionAmt', hue='isFraud', data=amount_capped)
plot_3.set_title('Card4按产品和目标分配', fontsize=20)
plot_3.set_xlabel('Card4种类')
plot_3.set_ylabel('交易数值')

plt.subplots_adjust(hspace=0.6, top=0.85)
plt.show()
# Per-card6 percentage of rows in each isFraud class.
tmp = (
    pd.crosstab(train_transaction['card6'], train_transaction['isFraud'], normalize='index')
    .mul(100)
    .reset_index()
    .rename(columns={0: 'NoFraud', 1: 'IsFraud'})
)

plt.figure(figsize=(16, 10))
plt.suptitle('Card6', fontsize=20)

# Top-left: row count per card6 value, labelled with its share of all rows.
plt.subplot(221)
plot_1 = sns.countplot(x='card6', data=train_transaction)
plot_1.set_title('Card6各种类占比', fontsize=18)
plot_1.set_xlabel('Card6种类', fontsize=16)
plot_1.set_ylabel('总量', fontsize=16)
for bar in plot_1.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    bar_label = '{:1.2f}%'.format(bar_height / total * 100)
    plot_1.text(bar_center, bar_height + 3, bar_label, ha='center', fontsize=14)

# Top-right: counts split by isFraud, with the fraud percentage on a twin y-axis.
# The bars are ordered by the crosstab index while the point plot uses a literal list.
plt.subplot(222)
plot_2 = sns.countplot(x='card6', hue='isFraud', data=train_transaction,
                       order=list(tmp['card6'].values))
plot_2_2 = plot_2.twinx()
plot_2_2 = sns.pointplot(x='card6', y='IsFraud', data=tmp,
                         order=['charge card', 'credit', 'debit', 'debit or credit'],
                         color='black', legend=False)
plot_2_2.set_ylim(0, 20)
plot_2_2.set_ylabel('Card6各种类异常占比', fontsize=15)
plot_2.set_title('Card6正常用户与异常用户占比', fontsize=18)
plot_2.set_xlabel('Card6种类', fontsize=15)
plot_2.set_ylabel('总量', fontsize=15)

# Bottom: amount spread per card6 value and class, restricted to amounts <= 2000.
plt.subplot(212)
amount_capped = train_transaction[train_transaction['TransactionAmt'] <= 2000]
plot_3 = sns.boxplot(x='card6', y='TransactionAmt', hue='isFraud', data=amount_capped)
plot_3.set_title('Card6正常用户与异常用户交易金额对比', fontsize=18)
plot_3.set_xlabel('Card6种类', fontsize=15)
plot_3.set_ylabel('交易金额', fontsize=15)

plt.subplots_adjust(hspace=0.6, top=0.85)
plt.show()
# Replace missing values in the categorical M1..M9 columns with the label 'Miss'.
for Columns in ['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']:
    flag_values = train_transaction[Columns]
    # 'Miss' has to be a known category before it can be used as a fill value.
    if 'Miss' not in flag_values.cat.categories:
        flag_values = flag_values.cat.add_categories(['Miss'])
    train_transaction[Columns] = flag_values.fillna('Miss')
def ploting_dist_ratio(DataFile, Column, lim=2000):
    """Plot counts, fraud rate and amount spread for one categorical column.

    Draws a two-panel figure and prints one table:
      * left: row count per value of ``Column`` (bars labelled with their
        share of all rows) plus the fraud percentage per value on a twin axis;
      * right: box plot of ``TransactionAmt`` per value and ``isFraud`` class
        for rows with ``TransactionAmt <= lim``, plus each value's fraud
        amount as a percentage of the total amount on a twin axis.
    The per-class amount percentages are printed before the right panel.

    All statistics are computed from ``DataFile`` itself rather than from
    module-level globals, so the function also works on other frames.

    Args:
        DataFile: DataFrame with ``Column``, ``isFraud`` (values 0/1) and
            ``TransactionAmt`` columns.
        Column: name of the categorical column to analyse.
        lim: upper bound on ``TransactionAmt`` for rows shown in the box plot.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Denominators for the percentage labels, derived from the frame passed in.
    total = len(DataFile)
    total_amt = DataFile.groupby(['isFraud'])['TransactionAmt'].sum().sum()

    # Per-value percentage of rows in each isFraud class.
    tmp = pd.crosstab(DataFile[Column], DataFile['isFraud'], normalize='index') * 100
    tmp = tmp.reset_index()
    tmp.rename(columns={0: 'NoFraud', 1: 'IsFraud'}, inplace=True)
    order = list(tmp[Column].values)

    plt.figure(figsize=(20, 5))
    plt.suptitle(f'{Column}', fontsize=20)

    # Left panel: counts with the fraud percentage overlaid.
    plt.subplot(121)
    plot_1 = sns.countplot(x=Column, data=DataFile, order=order)
    plot_1.set_title(f'{Column}的常规展示以及异常占比展示', fontsize=18)
    plot_1.set_ylim(0, 400000)
    plot_1_1 = plot_1.twinx()
    # Draw on the twin axis explicitly instead of relying on it being current.
    plot_1_1 = sns.pointplot(x=Column, y='IsFraud', data=tmp, order=order,
                             color='black', legend=False, ax=plot_1_1)
    plot_1_1.set_ylim(0, 20)
    plot_1_1.set_ylabel('异常占比', fontsize=16)
    plot_1.set_xlabel(f'{Column} 种类')
    plot_1.set_ylabel('总量', fontsize=16)
    for p in plot_1.patches:
        height = p.get_height()
        plot_1.text(p.get_x() + p.get_width() / 2,
                    height + 3,
                    '{:1.2f}%'.format(height / total * 100),
                    ha='center', fontsize=14)

    # Amount per (class, value) as a percentage of the total amount.
    perc_amt = (DataFile.groupby(['isFraud', Column])['TransactionAmt'].sum()
                / total_amt * 100).unstack('isFraud')
    print(perc_amt)
    perc_amt = perc_amt.reset_index()
    perc_amt.rename(columns={0: 'NoFraud', 1: 'IsFraud'}, inplace=True)

    # Right panel: amount spread with the fraud amount share overlaid.
    plt.subplot(122)
    plot_2 = sns.boxplot(x=Column, y='TransactionAmt', hue='isFraud',
                         data=DataFile[DataFile['TransactionAmt'] <= lim],
                         order=order)
    plot_2_2 = plot_2.twinx()
    plot_2_2 = sns.pointplot(x=Column, y='IsFraud', data=perc_amt, order=order,
                             color='black', legend=False, ax=plot_2_2)
    plot_2_2.set_ylim(0, 5)
    plot_2_2.set_ylabel('欺诈总额', fontsize=16)
    plot_2.set_title(f'{Column} 种类消费金额')
    plot_2.set_xlabel(f'{Column} 种类')
    plot_2.set_ylabel('金额 (U$)')

    plt.subplots_adjust(hspace=0.65, top=0.85, wspace=0.35)
    plt.show()
# One distribution / fraud-ratio figure per M flag column.
m_flag_columns = ('M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9')
for Columns in m_flag_columns:
    ploting_dist_ratio(DataFile=train_transaction, Column=Columns)
isFraud 0 1
M1
F 0.004843 NaN
T 59.554396 1.898156
Miss 36.573351 1.969254
isFraud 0 1
M2
F 6.912159 0.342055
T 52.647080 1.556101
Miss 36.573351 1.969254
isFraud 0 1
M3
F 13.725183 0.554626
T 45.834056 1.343530
Miss 36.573351 1.969254
isFraud 0 1
M4
M0 31.412624 1.958153
M1 8.834237 0.280888
M2 3.381058 0.449007
Miss 52.504671 1.179362
isFraud 0 1
M5
F 24.187301 1.255668
T 15.947904 0.945038
Miss 55.997385 1.666704
isFraud 0 1
M6
F 38.162428 1.351420
T 40.035646 1.146065
Miss 17.934516 1.369926
isFraud 0 1
M7
F 41.357407 1.295796
T 6.205311 0.270727
Miss 48.569872 2.300887
isFraud 0 1
M8
F 31.564724 1.096196
T 16.006749 0.470327
Miss 48.561117 2.300887
isFraud 0 1
M9
F 7.812824 0.366394
T 39.758649 1.200130
Miss 48.561117 2.300887
# Summed transaction amount for every (isFraud, M1) combination; shown as cell output.
m1_amounts_grouped = train_transaction.groupby(['isFraud', 'M1'])['TransactionAmt']
m1_amounts_grouped.sum()
isFraud M1
0 F 3.862156e+03
T 4.748811e+07
Miss 2.916324e+07
1 T 1.513572e+06
Miss 1.570264e+06
Name: TransactionAmt, dtype: float64
# Collapse infrequent address codes into 'Others':
# addr1 values seen at most 5000 times, addr2 values seen at most 50 times.
for col, max_rare_count in (('addr1', 5000), ('addr2', 50)):
    counts = train_transaction[col].value_counts()  # computed once per column
    rare_mask = train_transaction[col].isin(counts[counts <= max_rare_count].index)
    if rare_mask.any():
        # Writing a string into a numeric column relies on an implicit upcast
        # that pandas has deprecated; cast to object explicitly first.
        if pd.api.types.is_numeric_dtype(train_transaction[col]):
            train_transaction[col] = train_transaction[col].astype(object)
        train_transaction.loc[rare_mask, col] = 'Others'
def ploting_cnt_amt(DataFile, column, lim=2000):
    """Plot row counts, summed amounts and fraud shares for one column.

    Draws a two-panel figure:
      * top: row count per value of ``column`` (bars labelled with their share
        of all rows) plus the fraud percentage of rows per value on a twin axis;
      * bottom: summed ``TransactionAmt`` per value (bars labelled with their
        share of the total amount) plus the fraud percentage of that value's
        amount on a twin axis.

    The percentage denominators are computed from ``DataFile`` itself rather
    than from module-level globals, so the labels stay correct for any frame.

    Args:
        DataFile: DataFrame with ``column``, ``isFraud`` (values 0/1) and
            ``TransactionAmt`` columns.
        column: name of the column to analyse.
        lim: unused; kept so existing callers that pass it keep working.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Denominators for the percentage labels, derived from the frame passed in.
    total = len(DataFile)
    total_amt = DataFile.groupby(['isFraud'])['TransactionAmt'].sum().sum()

    # Per-value percentage of rows in each isFraud class.
    tmp = pd.crosstab(DataFile[column], DataFile['isFraud'], normalize='index') * 100
    tmp.reset_index(inplace=True)
    tmp.rename(columns={0: 'NoFraud', 1: 'IsFraud'}, inplace=True)
    count_order = list(tmp[column].values)

    plt.figure(figsize=(16, 15))
    plt.suptitle(f'{column}', fontsize=24)

    # Top panel: counts with the fraud percentage overlaid.
    plt.subplot(211)
    plot_1 = sns.countplot(x=column, data=DataFile, order=count_order)
    plot_1_2 = plot_1.twinx()
    # Draw on the twin axis explicitly instead of relying on it being current.
    plot_1_2 = sns.pointplot(x=column, y='IsFraud', data=tmp, order=count_order,
                             color='black', legend=False, ax=plot_1_2)
    plot_1_2.set_ylim(0, tmp['IsFraud'].max() * 1.1)
    plot_1_2.set_ylabel('异常消费占比', fontsize=14)
    plot_1.set_title(f'最常见的{column}的值与异常消费占比', fontsize=20)
    plot_1.set_xlabel(f'{column}的种类', fontsize=14)
    plot_1.set_ylabel('总计', fontsize=14)
    plot_1.set_xticklabels(plot_1.get_xticklabels(), rotation=45)
    sizes = []
    for p in plot_1.patches:
        height = p.get_height()
        sizes.append(height)
        plot_1.text(p.get_x() + p.get_width() / 2.,
                    height + 3,
                    '{:1.2f}%'.format(height / total * 100),
                    ha='center', fontsize=12)
    # Leave headroom above the tallest bar for its label.
    plot_1.set_ylim(0, max(sizes) * 1.15)

    # Bottom panel: summed amount per value with the fraud amount share overlaid.
    plt.subplot(212)
    # Amount per (class, value) as a percentage of that value's total amount.
    perc_mat = (DataFile.groupby(['isFraud', column])['TransactionAmt'].sum()
                / DataFile.groupby([column])['TransactionAmt'].sum() * 100).unstack('isFraud')
    perc_mat.reset_index(inplace=True)
    perc_mat.rename(columns={0: 'NoFraud', 1: 'IsFraud'}, inplace=True)
    amt = DataFile.groupby([column])['TransactionAmt'].sum().reset_index()
    perc_mat = perc_mat.fillna(0)
    amount_order = list(amt[column].values)

    plot_2 = sns.barplot(x=column, y='TransactionAmt', data=amt, order=amount_order)
    plot_2_2 = plot_2.twinx()
    plot_2_2 = sns.pointplot(x=column, y='IsFraud', data=perc_mat, order=amount_order,
                             color='black', legend=False, ax=plot_2_2)
    plot_2_2.set_ylim(0, perc_mat['IsFraud'].max() * 1.1)
    plot_2_2.set_ylabel('异常消费占比', fontsize=14)
    plot_2.set_xticklabels(plot_2.get_xticklabels(), rotation=45)
    plot_2.set_title(f'{column} 各类消费金额 以及异常消费占比', fontsize=20)
    plot_2.set_xlabel(f'{column}种类', fontsize=14)
    plot_2.set_ylabel('消费金($)', fontsize=14)
    for p in plot_2.patches:
        height = p.get_height()
        plot_2.text(p.get_x() + p.get_width() / 2.,
                    height + 3,
                    '{:1.2f}%'.format(height / total_amt * 100),
                    ha='center', fontsize=12)

    plt.subplots_adjust(hspace=0.65, top=0.85, wspace=0.65)
    plt.show()
# Count / amount figures for the two (already bucketed) address columns.
for address_column in ('addr1', 'addr2'):
    ploting_cnt_amt(train_transaction, address_column)

# Purchaser e-mail domains that occur at most 1000 times; shown as cell output.
p_email_counts = train_transaction.P_emaildomain.value_counts()
p_email_counts[p_email_counts <= 1000].index
CategoricalIndex(['charter.net', 'live.com.mx', 'rocketmail.com', 'mail.com',
'earthlink.net', 'gmail', 'outlook.es', 'mac.com',
'juno.com', 'aim.com', 'roadrunner.com', 'hotmail.es',
'windstream.net', 'hotmail.fr', 'frontier.com',
'embarqmail.com', 'web.de', 'twc.com', 'netzero.com',
'prodigy.net.mx', 'centurylink.net', 'netzero.net',
'frontiernet.net', 'q.com', 'suddenlink.net', 'cfl.rr.com',
'sc.rr.com', 'cableone.net', 'gmx.de', 'yahoo.fr',
'yahoo.es', 'hotmail.co.uk', 'protonmail.com', 'yahoo.de',
'ptd.net', 'live.fr', 'yahoo.co.uk', 'hotmail.de',
'servicios-ta.com', 'yahoo.co.jp'],
categories=['aim.com', 'anonymous.com', 'aol.com', 'att.net', 'bellsouth.net', 'cableone.net', 'centurylink.net', 'cfl.rr.com', ...], ordered=False, dtype='category')
# Register the grouping labels as categories of P_emaildomain. Each label is
# listed once (duplicate categories raise ValueError) and labels that already
# exist are skipped so the cell can be re-run.
p_email_labels = ['NoInf', 'Others', 'Microsoft', 'Yahoo Mail', 'Google']
p_email_missing_labels = [
    label for label in p_email_labels
    if label not in train_transaction['P_emaildomain'].cat.categories
]
if p_email_missing_labels:
    train_transaction['P_emaildomain'] = (
        train_transaction['P_emaildomain'].cat.add_categories(p_email_missing_labels))

# Merge provider-specific domains into one label per provider.
train_transaction.loc[train_transaction['P_emaildomain'].isin(['gmail.com', 'gmail']),
                      'P_emaildomain'] = 'Google'
train_transaction.loc[train_transaction['P_emaildomain'].isin(['yahoo.com', 'yahoo.com.mx', 'yahoo.co.uk',
                                                               'yahoo.co.jp', 'yahoo.de', 'yahoo.fr', 'yahoo.es']),
                      'P_emaildomain'] = 'Yahoo Mail'
train_transaction.loc[train_transaction['P_emaildomain'].isin(['hotmail.com', 'outlook.com', 'msn.com', 'live.com.mx',
                                                               'hotmail.es', 'hotmail.co.uk', 'hotmail.de',
                                                               'outlook.es', 'live.com', 'live.fr', 'hotmail.fr']),
                      'P_emaildomain'] = 'Microsoft'

# Remaining domains seen at most 500 times become 'Others'.
p_email_counts = train_transaction['P_emaildomain'].value_counts()
p_email_rare = p_email_counts[p_email_counts <= 500].index
train_transaction.loc[train_transaction['P_emaildomain'].isin(p_email_rare),
                      'P_emaildomain'] = 'Others'

# Assign the filled column back: an in-place fillna on an attribute-accessed
# Series is a chained update that may not reach the DataFrame.
train_transaction['P_emaildomain'] = train_transaction['P_emaildomain'].fillna('NoInf')

# NOTE(review): category 0 is added to R_emaildomain but nothing visible here
# assigns it - possibly a fillna(0) was intended; confirm.
train_transaction['R_emaildomain'] = train_transaction['R_emaildomain'].cat.add_categories([0])
ploting_cnt_amt(train_transaction, 'R_emaildomain')
# For C1 and C2: collapse values seen at most 400 times into 'Others', then plot.
for col in ('C1', 'C2'):
    counts = train_transaction[col].value_counts()  # computed once per column
    rare_mask = train_transaction[col].isin(counts[counts <= 400].index)
    if rare_mask.any():
        # Writing a string into a numeric column relies on an implicit upcast
        # that pandas has deprecated; cast to object explicitly first.
        if pd.api.types.is_numeric_dtype(train_transaction[col]):
            train_transaction[col] = train_transaction[col].astype(object)
        train_transaction.loc[rare_mask, col] = 'Others'
    ploting_cnt_amt(train_transaction, col)
# 未完待续 (to be continued)