关闭

Santander unhappy customer

407人阅读 评论(0) 收藏 举报
分类:
import pandas as pd
import numpy as np
import warnings 
#drop warnings generated by 
warnings.filterwarnings('ignore')
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
sns.set(style='white',color_codes=True)

#load training and testing set
train=pd.read_csv('./input/train.csv')
test=pd.read_csv('./input/test.csv')
train.head()
ID var3 var15 imp_ent_var16_ult1 imp_op_var39_comer_ult1 imp_op_var39_comer_ult3 imp_op_var40_comer_ult1 imp_op_var40_comer_ult3 imp_op_var40_efect_ult1 imp_op_var40_efect_ult3 saldo_medio_var33_hace2 saldo_medio_var33_hace3 saldo_medio_var33_ult1 saldo_medio_var33_ult3 saldo_medio_var44_hace2 saldo_medio_var44_hace3 saldo_medio_var44_ult1 saldo_medio_var44_ult3 var38 TARGET
0 1 2 23 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 39205.170000 0
1 3 2 34 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 49278.030000 0
2 4 2 23 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 67333.770000 0
3 8 2 37 0 195 195 0 0 0 0 0 0 0 0 0 0 0 0 64007.970000 0
4 10 2 39 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 117310.979016 0

5 rows × 371 columns

df=pd.DataFrame(train.TARGET.value_counts())
df['Percentage']=100*df.TARGET/train.shape[0]
df
TARGET Percentage
0 73012 96.043147
1 3008 3.956853
#from abrove result, it is a unbalanced dataset

#top ten most common values
train.var3.value_counts()[:10]
2 74165 8 138 -999999 116 9 110 3 108 1 105 13 98 7 97 4 86 12 85 Name: var3, dtype: int64
# var3 is the nationality of customer
# -99999 represent that nationality is unknown 
# replace -99999 with the most common value (2)
train=train.replace(-999999,2)
train.loc[train.var3==-999999].shape
(0, 371)
#add feature that counts the number of zeros each row
X=train.iloc[:,:-1]
y=train.TARGET
X['n0']=(X==0).sum(axis=1)
train['n0']=X['n0']
train.describe()
ID var3 var15 imp_ent_var16_ult1 imp_op_var39_comer_ult1 imp_op_var39_comer_ult3 imp_op_var40_comer_ult1 imp_op_var40_comer_ult3 imp_op_var40_efect_ult1 imp_op_var40_efect_ult3 saldo_medio_var33_hace3 saldo_medio_var33_ult1 saldo_medio_var33_ult3 saldo_medio_var44_hace2 saldo_medio_var44_hace3 saldo_medio_var44_ult1 saldo_medio_var44_ult3 var38 TARGET n0
count 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000
mean 75964.050723 2.716483 33.212865 86.208265 72.363067 119.529632 3.559130 6.472698 0.412946 0.567352 1.365146 12.215580 8.784074 31.505324 1.858575 76.026165 56.614351 117235.809430 0.039569 335.426888
std 43781.947379 9.447971 12.956486 1614.757313 339.315831 546.266294 93.155749 153.737066 30.604864 36.513513 113.959637 783.207399 538.439211 2013.125393 147.786584 4040.337842 2852.579397 182664.598503 0.194945 17.836658
min 1.000000 0.000000 5.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5163.750000 0.000000 220.000000
25% 38104.750000 2.000000 23.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 67870.612500 0.000000 325.000000
50% 76043.000000 2.000000 28.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 106409.160000 0.000000 340.000000
75% 113748.750000 2.000000 40.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 118756.252500 0.000000 348.000000
max 151838.000000 238.000000 105.000000 210000.000000 12888.030000 21024.810000 8237.820000 11073.570000 6600.000000 6600.000000 20385.720000 138831.630000 91778.730000 438329.220000 24650.010000 681462.900000 397884.300000 22034738.760000 1.000000 361.000000

8 rows × 372 columns

#num_var4 is the number of products
#plot the number of products
train.num_var4.hist(bins=100)
plt.xlabel('number of products')
plt.ylabel('number of customers')
plt.title('most customer with one product')
plt.show()

这里写图片描述

#let's look at the density of happy / unhappy customer in the function of bank  products
sns.FacetGrid(train,hue='TARGET',size=7).map(plt.hist,'num_var4').add_legend()
plt.title('unhappy customer with less bank products')
plt.show()

这里写图片描述

train[train.TARGET==1].num_var4.hist(bins=6)
plt.title('the number of unhappy customer in function of bank products')
plt.show()

这里写图片描述

# var38 is supposed to be the value of customers
train.var38.describe()
count 76020.000000 mean 117235.809430 std 182664.598503 min 5163.750000 25% 67870.612500 50% 106409.160000 75% 118756.252500 max 22034738.760000 Name: var38, dtype: float64
#how is var38 looking when customer is unhappy
train.loc[train.TARGET==1,'var38'].describe()
count 3008.000000 mean 99678.280590 std 106309.811490 min 11136.630000 25% 57160.942500 50% 86219.970000 75% 117310.979016 max 3988595.100000 Name: var38, dtype: float64
#histogram  for var_38 
train.var38.hist(bins=1000)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7d54b4da0>

这里写图片描述

train.var38.map(np.log).hist(bins=1000)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7d69cadd8>

这里写图片描述

train.var38.value_counts()
117310.979016 14868 451931.220000 16 463625.160000 12 104563.800000 11 288997.440000 11 236690.340000 8 67088.310000 7 128318.520000 7 329603.970000 7 125722.440000 7 104644.410000 7 70813.800000 6 163432.470000 6 105260.880000 6 97639.560000 6 185385.690000 6 100466.730000 6 168733.620000 6 127141.500000 5 227397.720000 5 71302.530000 5 235476.720000 5 192920.760000 5 33184.020000 5 185784.720000 5 208961.790000 5 83174.280000 5 171932.700000 5 121603.020000 5 229351.650000 5 … 67239.600000 1 84077.580000 1 83315.520000 1 84145.410000 1 84203.250000 1 111706.230000 1 84202.590000 1 215271.630000 1 84198.450000 1 84196.500000 1 84194.940000 1 83321.910000 1 117971.910000 1 84190.050000 1 84187.410000 1 84185.040000 1 84182.670000 1 84181.950000 1 215252.280000 1 84179.850000 1 84178.770000 1 84177.090000 1 477388.740000 1 84169.110000 1 84167.880000 1 84162.270000 1 84160.980000 1 215230.230000 1 84150.720000 1 131072.070000 1 Name: var38, dtype: int64
#what if exclude the most common values
train.loc[~np.isclose(train.var38,117310.979016),'var38'].value_counts()
451931.22 16 463625.16 12 104563.80 11 288997.44 11 236690.34 8 128318.52 7 104644.41 7 125722.44 7 329603.97 7 67088.31 7 70813.80 6 185385.69 6 100466.73 6 168733.62 6 163432.47 6 105260.88 6 97639.56 6 171932.70 5 148781.16 5 131353.47 5 185784.72 5 53324.46 5 127141.50 5 63820.89 5 121603.02 5 235476.72 5 83174.28 5 85814.04 5 276030.57 5 71302.53 5 .. 84483.51 1 84482.01 1 477748.14 1 84535.56 1 84577.77 1 84536.04 1 84574.05 1 215645.88 1 84571.65 1 84570.66 1 84569.88 1 84567.84 1 84565.59 1 84563.91 1 84562.95 1 84560.67 1 84559.17 1 84558.96 1 84557.91 1 84556.50 1 84555.27 1 84545.04 1 215616.45 1 65688.57 1 84541.53 1 84540.33 1 84539.79 1 84538.08 1 84537.51 1 131072.07 1 Name: var38, dtype: int64
#exclude the most common values and look at it's distribution
train.loc[~np.isclose(train.var38,117310.979016),'var38'].map(np.log).hist(bins=100)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb77b54ed30>

这里写图片描述

#above results imply us to split up var38 into two varibles
#var38mc=1 when var38 has the most common values otherwise var38mc=0
train['var38mc']=np.isclose(train.var38,117310.979016)
#logvar38 is log(var38) when var38mc is zero, otherwise is zero
train['logvar38']=train.loc[~np.isclose(train.var38,117310.979016),'var38'].map(np.log)
train.loc[train['var38mc'],'logvar38']=0
#check for nan
print("the number of nan in var38mc is",train.var38mc.isnull().sum())
print('the number of nan in logvar38 is ',train.logvar38.isnull().sum())
the number of nan in var38mc is 0 the number of nan in logvar38 is 0
#var15 is the age of customer 
train.var15.describe()
count 76020.000000 mean 33.212865 std 12.956486 min 5.000000 25% 23.000000 50% 28.000000 75% 40.000000 max 105.000000 Name: var15, dtype: float64
train.var15.hist(bins=100)
<matplotlib.axes._subplots.AxesSubplot at 0x7fb7e0ac9630>

这里写图片描述

sns.FacetGrid(train,hue='TARGET',size=6).map(sns.kdeplot,'var15').add_legend()
plt.title('unhappy customers are slightly order')
<matplotlib.text.Text at 0x7fb7e0d1e240>

这里写图片描述

train.saldo_var30.hist(bins=100)
plt.xlim(0,train.saldo_var30.max())
(0, 3458077.3199999998)
# improve the plot by making the x axis logarithmic
#train['log_saldo_var30'] = train.saldo_var30.map(np.log)
sns.FacetGrid(train, hue="TARGET", size=20) \
   .map(sns.kdeplot, "saldo_var30") \
   .add_legend();

这里写图片描述

#explore the interaction of var15 and var38
sns.FacetGrid(train,hue='TARGET',size=10).map(plt.scatter,'var38','var15').add_legend()
<seaborn.axisgrid.FacetGrid at 0x7fb7d6189748>

这里写图片描述



# Exclude most common value for var38 
sns.FacetGrid(train[~train.var38mc], hue="TARGET", size=10) \
   .map(plt.scatter, "logvar38", "var15") \
   .add_legend()
plt.ylim([0,120]);

这里写图片描述

sns.FacetGrid(train, hue="TARGET", size=10) \
   .map(plt.scatter, "logvar38", "var15") \
   .add_legend()
plt.ylim([0,120]); # Age must be positive ;-)

这里写图片描述



# What is distribution of the age when var38 has it's most common value ?
sns.FacetGrid(train[train.var38mc], hue="TARGET", size=6) \
   .map(sns.kdeplot, "var15") \
   .add_legend();

这里写图片描述

# What is density of n0 ?
sns.FacetGrid(train, hue="TARGET", size=6) \
   .map(sns.kdeplot, "n0") \
   .add_legend()
plt.title('Unhappy customers have a lot of features that are zero');

![png](output_28_0.png)


from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif,chi2
from sklearn.preprocessing import Binarizer, scale

# First select features based on chi2 and f_classif
p = 3

X_bin = Binarizer().fit_transform(scale(X))
selectChi2 = S gelectPercentile(chi2, percentile=p).fit(X_bin, y)
selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, y)

chi2_selected = selectChi2.get_support()
chi2_selected_features = [ f for i,f in enumerate(X.columns) if chi2_selected[i]]
print('Chi2 selected {} features {}.'.format(chi2_selected.sum(),
   chi2_selected_features))
f_classif_selected = selectF_classif.get_support()
f_classif_selected_features = [ f for i,f in enumerate(X.columns) if f_classif_selected[i]]
print('F_classif selected {} features {}.'.format(f_classif_selected.sum(),
   f_classif_selected_features))
selected = chi2_selected & f_classif_selected
print('Chi2 & F_classif selected {} features'.format(selected.sum()))
features = [ f for f,s in zip(X.columns, selected) if s]
print (features)

Chi2 selected 12 features [‘var15’, ‘ind_var5’, ‘ind_var8_0’, ‘ind_var30’, ‘num_var5’, ‘num_var8_0’, ‘num_var30_0’, ‘num_var30’, ‘num_var42’, ‘saldo_var30’, ‘var36’, ‘num_meses_var5_ult3’]. F_classif selected 12 features [‘var15’, ‘ind_var5’, ‘ind_var8_0’, ‘ind_var30’, ‘num_var4’, ‘num_var5’, ‘num_var30’, ‘num_var35’, ‘num_var42’, ‘var36’, ‘num_meses_var5_ult3’, ‘n0’]. Chi2 & F_classif selected 9 features [‘var15’, ‘ind_var5’, ‘ind_var8_0’, ‘ind_var30’, ‘num_var5’, ‘num_var30’, ‘num_var42’, ‘var36’, ‘num_meses_var5_ult3’]
X_sel=train[features+['TARGET']]
X_sel.info()
X_sel.describe()
var15 ind_var5 ind_var8_0 ind_var30 num_var5 num_var30 num_var42 var36 num_meses_var5_ult3 TARGET
count 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000 76020.000000
mean 33.212865 0.663760 0.032833 0.732833 1.999171 2.382873 2.217995 40.449079 1.979979 0.039569
std 12.956486 0.472425 0.178202 0.442483 1.431902 1.642787 1.497703 47.362719 1.298924 0.194945
min 5.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 23.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.000000 0.000000
50% 28.000000 1.000000 0.000000 1.000000 3.000000 3.000000 3.000000 3.000000 3.000000 0.000000
75% 40.000000 1.000000 0.000000 1.000000 3.000000 3.000000 3.000000 99.000000 3.000000 0.000000
max 105.000000 1.000000 1.000000 1.000000 15.000000 33.000000 18.000000 99.000000 3.000000 1.000000
sns.FacetGrid(X_sel,hue='TARGET',size=6).map(sns.kdeplot,'var36').add_legend()
plt.title('the unhappy customer is smaller when var36 is not 99')
<matplotlib.text.Text at 0x7fb7d84f56d8>

p![这里写图片描述

X_sel.var36.value_counts()
99    30064
3     22177
1     14664
2      8704
0       411
Name: var36, dtype: int64
# var36 in function of var38 (most common value excluded) 
sns.FacetGrid(train[~train.var38mc], hue="TARGET", size=10) \
   .map(plt.scatter, "var36", "logvar38") \
   .add_legend();

这里写图片描述

sns.FacetGrid(train[(~train.var38mc)&(train.var36<4)],hue='TARGET',size=10).map(plt.scatter,'var36','logvar38').add_legend()
plt.title('when var36 is zero there is all  unhappy customer')
<matplotlib.text.Text at 0x7fb7bde30ef0>

这里写图片描述

#look at the value of var38 when var36==99
sns.FacetGrid(train[(~train.var38mc)&(train.var36==99)], hue="TARGET", size=10) \
   .map(sns.kdeplot, 'logvar38') \
   .add_legend();

这里写图片描述

train.num_var5.value_counts()
3     50265
0     25561
6       190
9         3
15        1
Name: num_var5, dtype: int64
train[train.TARGET==0].num_var5.value_counts()
3     49223
0     23602
6       183
9         3
15        1
Name: num_var5, dtype: int64
train[train.TARGET==1].num_var5.value_counts()
0    1959
3    1042
6       7
Name: num_var5, dtype: int64
sns.FacetGrid(train,hue='TARGET',size=10).map(plt.hist,'num_var5').add_legend()
<seaborn.axisgrid.FacetGrid at 0x7fb7d5344da0>

png

sns.FacetGrid(train, hue="TARGET", size=6) \
   .map(sns.kdeplot, "num_var5") \
   .add_legend();

这里写图片描述

1
0

查看评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场
    个人资料
    • 访问:86004次
    • 积分:4590
    • 等级:
    • 排名:第7078名
    • 原创:374篇
    • 转载:7篇
    • 译文:0篇
    • 评论:5条
    最新评论