import pandas as pd
import numpy as np
import warnings
#suppress all warnings (e.g. library deprecation notices) so they do not clutter the output
warnings.filterwarnings('ignore')
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
sns.set(style='white',color_codes=True)
# Load the training and test sets.
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

# var3 is described by the author as the customer's nationality, with -999999
# used as a sentinel for "unknown". Replace the sentinel with the most common
# value (2). The replacement is restricted to var3 so that an equal value in
# any other column is left untouched.
train['var3'] = train['var3'].replace(-999999, 2)
# Sanity check: no row should still carry the sentinel (expect 0 rows).
train.loc[train.var3 == -999999].shape
(0, 371)
# Add a feature counting how many columns are zero in each row.
# X is every column except the last one (TARGET at this point). Take an
# explicit copy so that adding 'n0' to X is an unambiguous write to X itself
# rather than a chained assignment on a slice of train.
X = train.iloc[:, :-1].copy()
y = train.TARGET
X['n0'] = (X == 0).sum(axis=1)
# Mirror the new feature onto train so the exploration below can use it.
train['n0'] = X['n0']
train.describe()
ID
var3
var15
imp_ent_var16_ult1
imp_op_var39_comer_ult1
imp_op_var39_comer_ult3
imp_op_var40_comer_ult1
imp_op_var40_comer_ult3
imp_op_var40_efect_ult1
imp_op_var40_efect_ult3
…
saldo_medio_var33_hace3
saldo_medio_var33_ult1
saldo_medio_var33_ult3
saldo_medio_var44_hace2
saldo_medio_var44_hace3
saldo_medio_var44_ult1
saldo_medio_var44_ult3
var38
TARGET
n0
count
76020.000000
76020.000000
76020.000000
76020.000000
76020.000000
76020.000000
76020.000000
76020.000000
76020.000000
76020.000000
…
76020.000000
76020.000000
76020.000000
76020.000000
76020.000000
76020.000000
76020.000000
76020.000000
76020.000000
76020.000000
mean
75964.050723
2.716483
33.212865
86.208265
72.363067
119.529632
3.559130
6.472698
0.412946
0.567352
…
1.365146
12.215580
8.784074
31.505324
1.858575
76.026165
56.614351
117235.809430
0.039569
335.426888
std
43781.947379
9.447971
12.956486
1614.757313
339.315831
546.266294
93.155749
153.737066
30.604864
36.513513
…
113.959637
783.207399
538.439211
2013.125393
147.786584
4040.337842
2852.579397
182664.598503
0.194945
17.836658
min
1.000000
0.000000
5.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
…
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
5163.750000
0.000000
220.000000
25%
38104.750000
2.000000
23.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
…
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
67870.612500
0.000000
325.000000
50%
76043.000000
2.000000
28.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
…
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
106409.160000
0.000000
340.000000
75%
113748.750000
2.000000
40.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
…
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
118756.252500
0.000000
348.000000
max
151838.000000
238.000000
105.000000
210000.000000
12888.030000
21024.810000
8237.820000
11073.570000
6600.000000
6600.000000
…
20385.720000
138831.630000
91778.730000
438329.220000
24650.010000
681462.900000
397884.300000
22034738.760000
1.000000
361.000000
8 rows × 372 columns
# num_var4 is described by the author as the number of bank products per customer.
# Overall histogram of num_var4.
train['num_var4'].hist(bins=100)
plt.xlabel('number of products')
plt.ylabel('number of customers')
plt.title('most customer with one product')
plt.show()

# Histogram of num_var4 split by TARGET (happy vs. unhappy customers).
(
    sns.FacetGrid(train, hue='TARGET', size=7)
    .map(plt.hist, 'num_var4')
    .add_legend()
)
plt.title('unhappy customer with less bank products')
plt.show()

# Histogram of num_var4 restricted to rows with TARGET == 1.
train.loc[train['TARGET'] == 1, 'num_var4'].hist(bins=6)
plt.title('the number of unhappy customer in function of bank products')
plt.show()
# Summary statistics of var38 (supposed by the author to be the customer's value).
train['var38'].describe()
count 76020.000000 mean 117235.809430 std 182664.598503 min 5163.750000 25% 67870.612500 50% 106409.160000 75% 118756.252500 max 22034738.760000 Name: var38, dtype: float64
# Summary statistics of var38 restricted to unhappy customers (TARGET == 1).
train[train['TARGET'] == 1]['var38'].describe()
count 3008.000000 mean 99678.280590 std 106309.811490 min 11136.630000 25% 57160.942500 50% 86219.970000 75% 117310.979016 max 3988595.100000 Name: var38, dtype: float64
# Histogram of the raw var38 values.
train['var38'].hist(bins=1000)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7d54b4da0>
# Histogram of log(var38).
train['var38'].map(np.log).hist(bins=1000)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7d69cadd8>
# Drop the rows where var38 equals its most common value (117310.979016 per the
# author) and plot the distribution of log(var38) for the remaining rows.
train[~np.isclose(train['var38'], 117310.979016)]['var38'].map(np.log).hist(bins=100)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb77b54ed30>
# The histograms above suggest splitting var38 into two variables:
#   var38mc  - True where var38 is (numerically close to) its most common value
#   logvar38 - log(var38) where var38mc is False, and 0 where var38mc is True
train['var38mc'] = np.isclose(train['var38'], 117310.979016)
# Rows excluded by the mask are NaN after this assignment (index alignment) ...
train['logvar38'] = train.loc[~train['var38mc'], 'var38'].map(np.log)
# ... and are filled with 0 here.
train.loc[train['var38mc'], 'logvar38'] = 0

# Check that neither new column contains NaN.
var38mc_nan_count = train['var38mc'].isnull().sum()
logvar38_nan_count = train['logvar38'].isnull().sum()
print("the number of nan in var38mc is", var38mc_nan_count)
print('the number of nan in logvar38 is ', logvar38_nan_count)
the number of nan in var38mc is 0 the number of nan in logvar38 is 0
# Summary statistics of var15 (the customer's age, according to the author).
train['var15'].describe()
count 76020.000000 mean 33.212865 std 12.956486 min 5.000000 25% 23.000000 50% 28.000000 75% 40.000000 max 105.000000 Name: var15, dtype: float64
# Histogram of var15.
train['var15'].hist(bins=100)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7e0ac9630>
# Kernel density estimate of var15 (age), one curve per TARGET class.
sns.FacetGrid(train, hue='TARGET', size=6).map(sns.kdeplot, 'var15').add_legend()
# Title typo fixed: "order" -> "older".
plt.title('unhappy customers are slightly older')
# Density of saldo_var30 per TARGET class, on its raw scale. (The author's note
# suggests a logarithmic x axis as an improvement; no log transform is applied here.)
(
    sns.FacetGrid(train, hue="TARGET", size=20)
    .map(sns.kdeplot, "saldo_var30")
    .add_legend()
);
# Explore the interaction of var15 and var38, colored by TARGET.
(
    sns.FacetGrid(train, hue='TARGET', size=10)
    .map(plt.scatter, 'var38', 'var15')
    .add_legend()
)
<seaborn.axisgrid.FacetGrid at 0x7fb7d6189748>
# var15 against logvar38, leaving out rows where var38 takes its most common value.
(
    sns.FacetGrid(train.loc[~train['var38mc']], hue="TARGET", size=10)
    .map(plt.scatter, "logvar38", "var15")
    .add_legend()
)
plt.ylim([0, 120]);
# The same scatter over all rows.
(
    sns.FacetGrid(train, hue="TARGET", size=10)
    .map(plt.scatter, "logvar38", "var15")
    .add_legend()
)
plt.ylim([0, 120]);  # age must be positive ;-)
# Distribution of age (var15) for rows where var38 takes its most common value.
(
    sns.FacetGrid(train.loc[train['var38mc']], hue="TARGET", size=6)
    .map(sns.kdeplot, "var15")
    .add_legend()
);
# Density of n0 (the per-row count of zero-valued columns), one curve per TARGET class.
(
    sns.FacetGrid(train, hue="TARGET", size=6)
    .map(sns.kdeplot, "n0")
    .add_legend()
)
plt.title('Unhappy customers have a lot of features that are zero');
![png](output_28_0.png)
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif,chi2
from sklearn.preprocessing import Binarizer, scale
# Select features with two univariate tests (chi2 and f_classif) and keep the
# features that both tests rank within the top p percent.
p = 3  # percentile of highest-scoring features kept by each selector

# chi2 only accepts non-negative features, so it is scored on a binarized copy
# of the standardized data (Binarizer's default threshold is 0.0).
X_bin = Binarizer().fit_transform(scale(X))
# Fixed: the selector class name was garbled ("S gelectPercentile"), a syntax error.
selectChi2 = SelectPercentile(chi2, percentile=p).fit(X_bin, y)
selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, y)

# get_support() returns a boolean mask aligned with X.columns.
chi2_selected = selectChi2.get_support()
chi2_selected_features = [f for f, keep in zip(X.columns, chi2_selected) if keep]
print('Chi2 selected {} features {}.'.format(chi2_selected.sum(),
                                             chi2_selected_features))

f_classif_selected = selectF_classif.get_support()
f_classif_selected_features = [f for f, keep in zip(X.columns, f_classif_selected) if keep]
print('F_classif selected {} features {}.'.format(f_classif_selected.sum(),
                                                  f_classif_selected_features))

# Keep only the features chosen by both selectors.
selected = chi2_selected & f_classif_selected
print('Chi2 & F_classif selected {} features'.format(selected.sum()))
features = [f for f, s in zip(X.columns, selected) if s]
print(features)
# logvar38 against var36, leaving out rows where var38 takes its most common value.
(
    sns.FacetGrid(train.loc[~train['var38mc']], hue="TARGET", size=10)
    .map(plt.scatter, "var36", "logvar38")
    .add_legend()
);
# The same plot restricted to rows with var36 < 4.
(
    sns.FacetGrid(train.loc[(train['var36'] < 4) & ~train['var38mc']], hue='TARGET', size=10)
    .map(plt.scatter, 'var36', 'logvar38')
    .add_legend()
)
plt.title('when var36 is zero there is all unhappy customer')
<matplotlib.text.Text at 0x7fb7bde30ef0>
# Density of logvar38 for rows with var36 == 99 (rows where var38 takes its
# most common value are excluded), one curve per TARGET class.
(
    sns.FacetGrid(train.loc[(train['var36'] == 99) & ~train['var38mc']], hue="TARGET", size=10)
    .map(sns.kdeplot, 'logvar38')
    .add_legend()
);