Santander unhappy customer_santander数据集-CSDN博客

本文链接：https://blog.csdn.net/HE19930303/article/details/52355966

import pandas as pd
import numpy as np
import warnings 
#drop warnings generated by 
warnings.filterwarnings('ignore')
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
sns.set(style='white',color_codes=True)

#load training and testing set
train=pd.read_csv('./input/train.csv')
test=pd.read_csv('./input/test.csv')

train.head()

	ID	var3	var15	imp_op_var39_comer_ult1	imp_op_var39_comer_ult3	…	var38
0	1	2	23	0	0	…	39205.170000
1	3	2	34	0	0	…	49278.030000
2	4	2	23	0	0	…	67333.770000
3	8	2	37	195	195	…	64007.970000
4	10	2	39	0	0	…	117310.979016

5 rows × 371 columns

df=pd.DataFrame(train.TARGET.value_counts())
df['Percentage']=100*df.TARGET/train.shape[0]
df

	TARGET	Percentage
0	73012	96.043147
1	3008	3.956853

#from abrove result, it is a unbalanced dataset

#top ten most common values
train.var3.value_counts()[:10]

2 74165 8 138 -999999 116 9 110 3 108 1 105 13 98 7 97 4 86 12 85 Name: var3, dtype: int64

# var3 is the nationality of customer
# -99999 represent that nationality is unknown 
# replace -99999 with the most common value (2)
train=train.replace(-999999,2)
train.loc[train.var3==-999999].shape

(0, 371)

#add feature that counts the number of zeros each row
X=train.iloc[:,:-1]
y=train.TARGET
X['n0']=(X==0).sum(axis=1)
train['n0']=X['n0']

train.describe()

	ID	var3	var15	imp_ent_var16_ult1	imp_op_var39_comer_ult1	imp_op_var39_comer_ult3	imp_op_var40_comer_ult1	imp_op_var40_comer_ult3	imp_op_var40_efect_ult1	imp_op_var40_efect_ult3	…	saldo_medio_var33_hace3	saldo_medio_var33_ult1	saldo_medio_var33_ult3	saldo_medio_var44_hace2	saldo_medio_var44_hace3	saldo_medio_var44_ult1	saldo_medio_var44_ult3	var38	TARGET	n0
count	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	…	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000
mean	75964.050723	2.716483	33.212865	86.208265	72.363067	119.529632	3.559130	6.472698	0.412946	0.567352	…	1.365146	12.215580	8.784074	31.505324	1.858575	76.026165	56.614351	117235.809430	0.039569	335.426888
std	43781.947379	9.447971	12.956486	1614.757313	339.315831	546.266294	93.155749	153.737066	30.604864	36.513513	…	113.959637	783.207399	538.439211	2013.125393	147.786584	4040.337842	2852.579397	182664.598503	0.194945	17.836658
min	1.000000	0.000000	5.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	…	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	5163.750000	0.000000	220.000000
25%	38104.750000	2.000000	23.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	…	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	67870.612500	0.000000	325.000000
50%	76043.000000	2.000000	28.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	…	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	106409.160000	0.000000	340.000000
75%	113748.750000	2.000000	40.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	…	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	118756.252500	0.000000	348.000000
max	151838.000000	238.000000	105.000000	210000.000000	12888.030000	21024.810000	8237.820000	11073.570000	6600.000000	6600.000000	…	20385.720000	138831.630000	91778.730000	438329.220000	24650.010000	681462.900000	397884.300000	22034738.760000	1.000000	361.000000

8 rows × 372 columns

#num_var4 is the number of products
#plot the number of products
train.num_var4.hist(bins=100)
plt.xlabel('number of products')
plt.ylabel('number of customers')
plt.title('most customer with one product')
plt.show()

这里写图片描述

#let's look at the density of happy / unhappy customer in the function of bank  products
sns.FacetGrid(train,hue='TARGET',size=7).map(plt.hist,'num_var4').add_legend()
plt.title('unhappy customer with less bank products')
plt.show()

这里写图片描述

train[train.TARGET==1].num_var4.hist(bins=6)
plt.title('the number of unhappy customer in function of bank products')
plt.show()

这里写图片描述

# var38 is supposed to be the value of customers
train.var38.describe()

count 76020.000000 mean 117235.809430 std 182664.598503 min 5163.750000 25% 67870.612500 50% 106409.160000 75% 118756.252500 max 22034738.760000 Name: var38, dtype: float64

#how is var38 looking when customer is unhappy
train.loc[train.TARGET==1,'var38'].describe()

count 3008.000000 mean 99678.280590 std 106309.811490 min 11136.630000 25% 57160.942500 50% 86219.970000 75% 117310.979016 max 3988595.100000 Name: var38, dtype: float64

#histogram  for var_38 
train.var38.hist(bins=1000)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb7d54b4da0>

这里写图片描述

train.var38.map(np.log).hist(bins=1000)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb7d69cadd8>

这里写图片描述

train.var38.value_counts()

117310.979016 14868 451931.220000 16 463625.160000 12 104563.800000 11 288997.440000 11 236690.340000 8 67088.310000 7 128318.520000 7 329603.970000 7 125722.440000 7 104644.410000 7 70813.800000 6 163432.470000 6 105260.880000 6 97639.560000 6 185385.690000 6 100466.730000 6 168733.620000 6 127141.500000 5 227397.720000 5 71302.530000 5 235476.720000 5 192920.760000 5 33184.020000 5 185784.720000 5 208961.790000 5 83174.280000 5 171932.700000 5 121603.020000 5 229351.650000 5 … 67239.600000 1 84077.580000 1 83315.520000 1 84145.410000 1 84203.250000 1 111706.230000 1 84202.590000 1 215271.630000 1 84198.450000 1 84196.500000 1 84194.940000 1 83321.910000 1 117971.910000 1 84190.050000 1 84187.410000 1 84185.040000 1 84182.670000 1 84181.950000 1 215252.280000 1 84179.850000 1 84178.770000 1 84177.090000 1 477388.740000 1 84169.110000 1 84167.880000 1 84162.270000 1 84160.980000 1 215230.230000 1 84150.720000 1 131072.070000 1 Name: var38, dtype: int64

#what if exclude the most common values
train.loc[~np.isclose(train.var38,117310.979016),'var38'].value_counts()

451931.22 16 463625.16 12 104563.80 11 288997.44 11 236690.34 8 128318.52 7 104644.41 7 125722.44 7 329603.97 7 67088.31 7 70813.80 6 185385.69 6 100466.73 6 168733.62 6 163432.47 6 105260.88 6 97639.56 6 171932.70 5 148781.16 5 131353.47 5 185784.72 5 53324.46 5 127141.50 5 63820.89 5 121603.02 5 235476.72 5 83174.28 5 85814.04 5 276030.57 5 71302.53 5 .. 84483.51 1 84482.01 1 477748.14 1 84535.56 1 84577.77 1 84536.04 1 84574.05 1 215645.88 1 84571.65 1 84570.66 1 84569.88 1 84567.84 1 84565.59 1 84563.91 1 84562.95 1 84560.67 1 84559.17 1 84558.96 1 84557.91 1 84556.50 1 84555.27 1 84545.04 1 215616.45 1 65688.57 1 84541.53 1 84540.33 1 84539.79 1 84538.08 1 84537.51 1 131072.07 1 Name: var38, dtype: int64

#exclude the most common values and look at it's distribution
train.loc[~np.isclose(train.var38,117310.979016),'var38'].map(np.log).hist(bins=100)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb77b54ed30>

这里写图片描述

#above results imply us to split up var38 into two varibles
#var38mc=1 when var38 has the most common values otherwise var38mc=0
train['var38mc']=np.isclose(train.var38,117310.979016)
#logvar38 is log(var38) when var38mc is zero, otherwise is zero
train['logvar38']=train.loc[~np.isclose(train.var38,117310.979016),'var38'].map(np.log)
train.loc[train['var38mc'],'logvar38']=0

#check for nan
print("the number of nan in var38mc is",train.var38mc.isnull().sum())
print('the number of nan in logvar38 is ',train.logvar38.isnull().sum())

the number of nan in var38mc is 0 the number of nan in logvar38 is 0

#var15 is the age of customer 
train.var15.describe()

count 76020.000000 mean 33.212865 std 12.956486 min 5.000000 25% 23.000000 50% 28.000000 75% 40.000000 max 105.000000 Name: var15, dtype: float64

train.var15.hist(bins=100)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb7e0ac9630>

这里写图片描述

sns.FacetGrid(train,hue='TARGET',size=6).map(sns.kdeplot,'var15').add_legend()
plt.title('unhappy customers are slightly order')

<matplotlib.text.Text at 0x7fb7e0d1e240>

这里写图片描述

train.saldo_var30.hist(bins=100)
plt.xlim(0,train.saldo_var30.max())

(0, 3458077.3199999998)

# improve the plot by making the x axis logarithmic
#train['log_saldo_var30'] = train.saldo_var30.map(np.log)
sns.FacetGrid(train, hue="TARGET", size=20) \
   .map(sns.kdeplot, "saldo_var30") \
   .add_legend();

这里写图片描述

#explore the interaction of var15 and var38
sns.FacetGrid(train,hue='TARGET',size=10).map(plt.scatter,'var38','var15').add_legend()

<seaborn.axisgrid.FacetGrid at 0x7fb7d6189748>

这里写图片描述



# Exclude most common value for var38 
sns.FacetGrid(train[~train.var38mc], hue="TARGET", size=10) \
   .map(plt.scatter, "logvar38", "var15") \
   .add_legend()
plt.ylim([0,120]);

这里写图片描述

sns.FacetGrid(train, hue="TARGET", size=10) \
   .map(plt.scatter, "logvar38", "var15") \
   .add_legend()
plt.ylim([0,120]); # Age must be positive ;-)

这里写图片描述



# What is distribution of the age when var38 has it's most common value ?
sns.FacetGrid(train[train.var38mc], hue="TARGET", size=6) \
   .map(sns.kdeplot, "var15") \
   .add_legend();

这里写图片描述

# What is density of n0 ?
sns.FacetGrid(train, hue="TARGET", size=6) \
   .map(sns.kdeplot, "n0") \
   .add_legend()
plt.title('Unhappy customers have a lot of features that are zero');

![png](output_28_0.png)



from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif,chi2
from sklearn.preprocessing import Binarizer, scale

# First select features based on chi2 and f_classif
p = 3

X_bin = Binarizer().fit_transform(scale(X))
selectChi2 = S gelectPercentile(chi2, percentile=p).fit(X_bin, y)
selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, y)

chi2_selected = selectChi2.get_support()
chi2_selected_features = [ f for i,f in enumerate(X.columns) if chi2_selected[i]]
print('Chi2 selected {} features {}.'.format(chi2_selected.sum(),
   chi2_selected_features))
f_classif_selected = selectF_classif.get_support()
f_classif_selected_features = [ f for i,f in enumerate(X.columns) if f_classif_selected[i]]
print('F_classif selected {} features {}.'.format(f_classif_selected.sum(),
   f_classif_selected_features))
selected = chi2_selected & f_classif_selected
print('Chi2 & F_classif selected {} features'.format(selected.sum()))
features = [ f for f,s in zip(X.columns, selected) if s]
print (features)

Chi2 selected 12 features [‘var15’, ‘ind_var5’, ‘ind_var8_0’, ‘ind_var30’, ‘num_var5’, ‘num_var8_0’, ‘num_var30_0’, ‘num_var30’, ‘num_var42’, ‘saldo_var30’, ‘var36’, ‘num_meses_var5_ult3’]. F_classif selected 12 features [‘var15’, ‘ind_var5’, ‘ind_var8_0’, ‘ind_var30’, ‘num_var4’, ‘num_var5’, ‘num_var30’, ‘num_var35’, ‘num_var42’, ‘var36’, ‘num_meses_var5_ult3’, ‘n0’]. Chi2 & F_classif selected 9 features [‘var15’, ‘ind_var5’, ‘ind_var8_0’, ‘ind_var30’, ‘num_var5’, ‘num_var30’, ‘num_var42’, ‘var36’, ‘num_meses_var5_ult3’]

X_sel=train[features+['TARGET']]

X_sel.info()

X_sel.describe()

	var15	ind_var5	ind_var8_0	ind_var30	num_var5	num_var30	num_var42	var36	num_meses_var5_ult3	TARGET
count	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000	76020.000000
mean	33.212865	0.663760	0.032833	0.732833	1.999171	2.382873	2.217995	40.449079	1.979979	0.039569
std	12.956486	0.472425	0.178202	0.442483	1.431902	1.642787	1.497703	47.362719	1.298924	0.194945
min	5.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	23.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2.000000	0.000000	0.000000
50%	28.000000	1.000000	0.000000	1.000000	3.000000	3.000000	3.000000	3.000000	3.000000	0.000000
75%	40.000000	1.000000	0.000000	1.000000	3.000000	3.000000	3.000000	99.000000	3.000000	0.000000
max	105.000000	1.000000	1.000000	1.000000	15.000000	33.000000	18.000000	99.000000	3.000000	1.000000

sns.FacetGrid(X_sel,hue='TARGET',size=6).map(sns.kdeplot,'var36').add_legend()
plt.title('the unhappy customer is smaller when var36 is not 99')

<matplotlib.text.Text at 0x7fb7d84f56d8>

p![这里写图片描述

X_sel.var36.value_counts()

99    30064
3     22177
1     14664
2      8704
0       411
Name: var36, dtype: int64

# var36 in function of var38 (most common value excluded) 
sns.FacetGrid(train[~train.var38mc], hue="TARGET", size=10) \
   .map(plt.scatter, "var36", "logvar38") \
   .add_legend();

这里写图片描述

sns.FacetGrid(train[(~train.var38mc)&(train.var36<4)],hue='TARGET',size=10).map(plt.scatter,'var36','logvar38').add_legend()
plt.title('when var36 is zero there is all  unhappy customer')

<matplotlib.text.Text at 0x7fb7bde30ef0>

这里写图片描述

#look at the value of var38 when var36==99
sns.FacetGrid(train[(~train.var38mc)&(train.var36==99)], hue="TARGET", size=10) \
   .map(sns.kdeplot, 'logvar38') \
   .add_legend();

这里写图片描述

train.num_var5.value_counts()

3     50265
0     25561
6       190
9         3
15        1
Name: num_var5, dtype: int64

train[train.TARGET==0].num_var5.value_counts()

3     49223
0     23602
6       183
9         3
15        1
Name: num_var5, dtype: int64

train[train.TARGET==1].num_var5.value_counts()

0    1959
3    1042
6       7
Name: num_var5, dtype: int64

sns.FacetGrid(train,hue='TARGET',size=10).map(plt.hist,'num_var5').add_legend()

<seaborn.axisgrid.FacetGrid at 0x7fb7d5344da0>

png

sns.FacetGrid(train, hue="TARGET", size=6) \
   .map(sns.kdeplot, "num_var5") \
   .add_legend();

这里写图片描述