Santander unhappy customer

import pandas as pd
import numpy as np
import warnings 
#suppress all warnings so they do not clutter the notebook output
warnings.filterwarnings('ignore')
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
sns.set(style='white',color_codes=True)

#load training and testing set
train=pd.read_csv('./input/train.csv')
test=pd.read_csv('./input/test.csv')
train.head()
IDvar3var15imp_ent_var16_ult1imp_op_var39_comer_ult1imp_op_var39_comer_ult3imp_op_var40_comer_ult1imp_op_var40_comer_ult3imp_op_var40_efect_ult1imp_op_var40_efect_ult3saldo_medio_var33_hace2saldo_medio_var33_hace3saldo_medio_var33_ult1saldo_medio_var33_ult3saldo_medio_var44_hace2saldo_medio_var44_hace3saldo_medio_var44_ult1saldo_medio_var44_ult3var38TARGET
0122300000000000000039205.1700000
1323400000000000000049278.0300000
2422300000000000000067333.7700000
38237019519500000000000064007.9700000
410239000000000000000117310.9790160

5 rows × 371 columns

# Class balance: absolute count and percentage of each TARGET value.
# to_frame('TARGET') pins the count column's name explicitly; on pandas >= 2.0
# value_counts() names that column 'count', which would break the lookup below.
df=train.TARGET.value_counts().to_frame('TARGET')
df['Percentage']=100*df['TARGET']/train.shape[0]
df
TARGETPercentage
07301296.043147
130083.956853
#from the above result, it is an unbalanced dataset

#top ten most common values
train.var3.value_counts()[:10]
2 74165 8 138 -999999 116 9 110 3 108 1 105 13 98 7 97 4 86 12 85 Name: var3, dtype: int64
# var3 is the nationality of customer
# -999999 represents that the nationality is unknown
# replace -999999 with the most common value (2); the replacement is limited
# to var3 so that no other column is silently modified
train['var3']=train['var3'].replace(-999999,2)
# sanity check: no row should still carry the sentinel in var3
train.loc[train.var3==-999999].shape
(0, 371)
#add a feature that counts the number of zeros in each row
# X holds every column except the last one; .copy() makes it an independent
# frame so that adding 'n0' below is not an assignment on a slice of train
X=train.iloc[:,:-1].copy()
y=train.TARGET
# row-wise count of cells equal to 0
X['n0']=(X==0).sum(axis=1)
# mirror the new feature onto train so later cells can plot it
train['n0']=X['n0']
train.describe()
IDvar3var15imp_ent_var16_ult1imp_op_var39_comer_ult1imp_op_var39_comer_ult3imp_op_var40_comer_ult1imp_op_var40_comer_ult3imp_op_var40_efect_ult1imp_op_var40_efect_ult3saldo_medio_var33_hace3saldo_medio_var33_ult1saldo_medio_var33_ult3saldo_medio_var44_hace2saldo_medio_var44_hace3saldo_medio_var44_ult1saldo_medio_var44_ult3var38TARGETn0
count76020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.000000
mean75964.0507232.71648333.21286586.20826572.363067119.5296323.5591306.4726980.4129460.5673521.36514612.2155808.78407431.5053241.85857576.02616556.614351117235.8094300.039569335.426888
std43781.9473799.44797112.9564861614.757313339.315831546.26629493.155749153.73706630.60486436.513513113.959637783.207399538.4392112013.125393147.7865844040.3378422852.579397182664.5985030.19494517.836658
min1.0000000.0000005.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000005163.7500000.000000220.000000
25%38104.7500002.00000023.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.00000067870.6125000.000000325.000000
50%76043.0000002.00000028.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000106409.1600000.000000340.000000
75%113748.7500002.00000040.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000118756.2525000.000000348.000000
max151838.000000238.000000105.000000210000.00000012888.03000021024.8100008237.82000011073.5700006600.0000006600.00000020385.720000138831.63000091778.730000438329.22000024650.010000681462.900000397884.30000022034738.7600001.000000361.000000

8 rows × 372 columns

#num_var4 is the number of products
#plot the number of products
train.num_var4.hist(bins=100)
plt.xlabel('number of products')
plt.ylabel('number of customers')
plt.title('most customer with one product')
plt.show()

这里写图片描述

#let's look at the distribution of happy / unhappy customers as a function of the number of bank products
sns.FacetGrid(train,hue='TARGET',size=7).map(plt.hist,'num_var4').add_legend()
plt.title('unhappy customer with less bank products')
plt.show()

这里写图片描述

train[train.TARGET==1].num_var4.hist(bins=6)
plt.title('the number of unhappy customer in function of bank products')
plt.show()

这里写图片描述

# var38 is supposed to be the value of customers
train.var38.describe()
count 76020.000000 mean 117235.809430 std 182664.598503 min 5163.750000 25% 67870.612500 50% 106409.160000 75% 118756.252500 max 22034738.760000 Name: var38, dtype: float64
#how is var38 looking when customer is unhappy
train.loc[train.TARGET==1,'var38'].describe()
count 3008.000000 mean 99678.280590 std 106309.811490 min 11136.630000 25% 57160.942500 50% 86219.970000 75% 117310.979016 max 3988595.100000 Name: var38, dtype: float64
#histogram  for var_38 
train.var38.hist(bins=1000)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7d54b4da0>

这里写图片描述

train.var38.map(np.log).hist(bins=1000)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7d69cadd8>

这里写图片描述

train.var38.value_counts()
117310.979016 14868 451931.220000 16 463625.160000 12 104563.800000 11 288997.440000 11 236690.340000 8 67088.310000 7 128318.520000 7 329603.970000 7 125722.440000 7 104644.410000 7 70813.800000 6 163432.470000 6 105260.880000 6 97639.560000 6 185385.690000 6 100466.730000 6 168733.620000 6 127141.500000 5 227397.720000 5 71302.530000 5 235476.720000 5 192920.760000 5 33184.020000 5 185784.720000 5 208961.790000 5 83174.280000 5 171932.700000 5 121603.020000 5 229351.650000 5 … 67239.600000 1 84077.580000 1 83315.520000 1 84145.410000 1 84203.250000 1 111706.230000 1 84202.590000 1 215271.630000 1 84198.450000 1 84196.500000 1 84194.940000 1 83321.910000 1 117971.910000 1 84190.050000 1 84187.410000 1 84185.040000 1 84182.670000 1 84181.950000 1 215252.280000 1 84179.850000 1 84178.770000 1 84177.090000 1 477388.740000 1 84169.110000 1 84167.880000 1 84162.270000 1 84160.980000 1 215230.230000 1 84150.720000 1 131072.070000 1 Name: var38, dtype: int64
#what if we exclude the most common value
train.loc[~np.isclose(train.var38,117310.979016),'var38'].value_counts()
451931.22 16 463625.16 12 104563.80 11 288997.44 11 236690.34 8 128318.52 7 104644.41 7 125722.44 7 329603.97 7 67088.31 7 70813.80 6 185385.69 6 100466.73 6 168733.62 6 163432.47 6 105260.88 6 97639.56 6 171932.70 5 148781.16 5 131353.47 5 185784.72 5 53324.46 5 127141.50 5 63820.89 5 121603.02 5 235476.72 5 83174.28 5 85814.04 5 276030.57 5 71302.53 5 .. 84483.51 1 84482.01 1 477748.14 1 84535.56 1 84577.77 1 84536.04 1 84574.05 1 215645.88 1 84571.65 1 84570.66 1 84569.88 1 84567.84 1 84565.59 1 84563.91 1 84562.95 1 84560.67 1 84559.17 1 84558.96 1 84557.91 1 84556.50 1 84555.27 1 84545.04 1 215616.45 1 65688.57 1 84541.53 1 84540.33 1 84539.79 1 84538.08 1 84537.51 1 131072.07 1 Name: var38, dtype: int64
#exclude the most common value and look at its distribution
train.loc[~np.isclose(train.var38,117310.979016),'var38'].map(np.log).hist(bins=100)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb77b54ed30>

这里写图片描述

#the results above suggest splitting var38 into two variables
#var38mc is True where var38 equals its most common value, False otherwise
train['var38mc']=np.isclose(train.var38,117310.979016)
#logvar38 is log(var38) for rows where var38mc is False ...
train['logvar38']=np.log(train.loc[~train['var38mc'],'var38'])
#... and 0 for rows where var38mc is True
train.loc[train['var38mc'],'logvar38']=0
#check that neither new column contains NaN
print("the number of nan in var38mc is",train.var38mc.isnull().sum())
print('the number of nan in logvar38 is ',train.logvar38.isnull().sum())
the number of nan in var38mc is 0 the number of nan in logvar38 is 0
#var15 is the age of customer 
train.var15.describe()
count 76020.000000 mean 33.212865 std 12.956486 min 5.000000 25% 23.000000 50% 28.000000 75% 40.000000 max 105.000000 Name: var15, dtype: float64
train.var15.hist(bins=100)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7e0ac9630>

这里写图片描述

# kernel density of var15, one curve per TARGET value
sns.FacetGrid(train,hue='TARGET',size=6).map(sns.kdeplot,'var15').add_legend()
# title fixed: the intended word is "older", not "order"
plt.title('unhappy customers are slightly older')
<matplotlib.text.Text at 0x7fb7e0d1e240>

这里写图片描述

train.saldo_var30.hist(bins=100)
plt.xlim(0,train.saldo_var30.max())
(0, 3458077.3199999998)
# improve the plot by making the x axis logarithmic
#train['log_saldo_var30'] = train.saldo_var30.map(np.log)
sns.FacetGrid(train, hue="TARGET", size=20) \
   .map(sns.kdeplot, "saldo_var30") \
   .add_legend();

这里写图片描述

#explore the interaction of var15 and var38
sns.FacetGrid(train,hue='TARGET',size=10).map(plt.scatter,'var38','var15').add_legend()
<seaborn.axisgrid.FacetGrid at 0x7fb7d6189748>

这里写图片描述



# Exclude most common value for var38 
sns.FacetGrid(train[~train.var38mc], hue="TARGET", size=10) \
   .map(plt.scatter, "logvar38", "var15") \
   .add_legend()
plt.ylim([0,120]);

这里写图片描述

sns.FacetGrid(train, hue="TARGET", size=10) \
   .map(plt.scatter, "logvar38", "var15") \
   .add_legend()
plt.ylim([0,120]); # Age must be positive ;-)

这里写图片描述



# What is the distribution of the age when var38 has its most common value?
sns.FacetGrid(train[train.var38mc], hue="TARGET", size=6) \
   .map(sns.kdeplot, "var15") \
   .add_legend();

这里写图片描述

# What is density of n0 ?
sns.FacetGrid(train, hue="TARGET", size=6) \
   .map(sns.kdeplot, "n0") \
   .add_legend()
plt.title('Unhappy customers have a lot of features that are zero');

![png](output_28_0.png)


from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif,chi2
from sklearn.preprocessing import Binarizer, scale

# First select features based on chi2 and f_classif
# p is the percentile of top-scoring features each selector keeps
p = 3

# X is standardized and then binarized before the chi2 selector is fitted;
# the f_classif selector is fitted on the raw X
X_bin = Binarizer().fit_transform(scale(X))
selectChi2 = SelectPercentile(chi2, percentile=p).fit(X_bin, y)
selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, y)

# boolean mask over X.columns -> names of the features kept by chi2
chi2_selected = selectChi2.get_support()
chi2_selected_features = [f for i, f in enumerate(X.columns) if chi2_selected[i]]
print('Chi2 selected {} features {}.'.format(chi2_selected.sum(),
    chi2_selected_features))
# same for f_classif
f_classif_selected = selectF_classif.get_support()
f_classif_selected_features = [f for i, f in enumerate(X.columns) if f_classif_selected[i]]
print('F_classif selected {} features {}.'.format(f_classif_selected.sum(),
    f_classif_selected_features))
# keep only the features that both selectors agree on
selected = chi2_selected & f_classif_selected
print('Chi2 & F_classif selected {} features'.format(selected.sum()))
features = [f for f, s in zip(X.columns, selected) if s]
print(features)

Chi2 selected 12 features [‘var15’, ‘ind_var5’, ‘ind_var8_0’, ‘ind_var30’, ‘num_var5’, ‘num_var8_0’, ‘num_var30_0’, ‘num_var30’, ‘num_var42’, ‘saldo_var30’, ‘var36’, ‘num_meses_var5_ult3’]. F_classif selected 12 features [‘var15’, ‘ind_var5’, ‘ind_var8_0’, ‘ind_var30’, ‘num_var4’, ‘num_var5’, ‘num_var30’, ‘num_var35’, ‘num_var42’, ‘var36’, ‘num_meses_var5_ult3’, ‘n0’]. Chi2 & F_classif selected 9 features [‘var15’, ‘ind_var5’, ‘ind_var8_0’, ‘ind_var30’, ‘num_var5’, ‘num_var30’, ‘num_var42’, ‘var36’, ‘num_meses_var5_ult3’]
X_sel=train[features+['TARGET']]
X_sel.info()
X_sel.describe()
var15ind_var5ind_var8_0ind_var30num_var5num_var30num_var42var36num_meses_var5_ult3TARGET
count76020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.00000076020.000000
mean33.2128650.6637600.0328330.7328331.9991712.3828732.21799540.4490791.9799790.039569
std12.9564860.4724250.1782020.4424831.4319021.6427871.49770347.3627191.2989240.194945
min5.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
25%23.0000000.0000000.0000000.0000000.0000000.0000000.0000002.0000000.0000000.000000
50%28.0000001.0000000.0000001.0000003.0000003.0000003.0000003.0000003.0000000.000000
75%40.0000001.0000000.0000001.0000003.0000003.0000003.00000099.0000003.0000000.000000
max105.0000001.0000001.0000001.00000015.00000033.00000018.00000099.0000003.0000001.000000
sns.FacetGrid(X_sel,hue='TARGET',size=6).map(sns.kdeplot,'var36').add_legend()
plt.title('the unhappy customer is smaller when var36 is not 99')
<matplotlib.text.Text at 0x7fb7d84f56d8>

这里写图片描述

X_sel.var36.value_counts()
99    30064
3     22177
1     14664
2      8704
0       411
Name: var36, dtype: int64
# var36 in function of var38 (most common value excluded) 
sns.FacetGrid(train[~train.var38mc], hue="TARGET", size=10) \
   .map(plt.scatter, "var36", "logvar38") \
   .add_legend();

这里写图片描述

sns.FacetGrid(train[(~train.var38mc)&(train.var36<4)],hue='TARGET',size=10).map(plt.scatter,'var36','logvar38').add_legend()
plt.title('when var36 is zero there is all  unhappy customer')
<matplotlib.text.Text at 0x7fb7bde30ef0>

这里写图片描述

#look at the value of var38 when var36==99
sns.FacetGrid(train[(~train.var38mc)&(train.var36==99)], hue="TARGET", size=10) \
   .map(sns.kdeplot, 'logvar38') \
   .add_legend();

这里写图片描述

train.num_var5.value_counts()
3     50265
0     25561
6       190
9         3
15        1
Name: num_var5, dtype: int64
train[train.TARGET==0].num_var5.value_counts()
3     49223
0     23602
6       183
9         3
15        1
Name: num_var5, dtype: int64
train[train.TARGET==1].num_var5.value_counts()
0    1959
3    1042
6       7
Name: num_var5, dtype: int64
sns.FacetGrid(train,hue='TARGET',size=10).map(plt.hist,'num_var5').add_legend()
<seaborn.axisgrid.FacetGrid at 0x7fb7d5344da0>

png

sns.FacetGrid(train, hue="TARGET", size=6) \
   .map(sns.kdeplot, "num_var5") \
   .add_legend();

这里写图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

hebastast

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值