# 数据分析-python

import numpy as np
import pandas as pd
df = pd.read_csv("D:/pythonproject/train.csv");
label = df['TARGET']
df = df.drop(['ID','TARGET'],axis=1)
missSet=[np.nan,9999999999,-999999]
#取出第0列，所有行的数据，并且计算有多少不同的值
len(df.iloc[:,0].unique())
208

count_un=df.iloc[:,0:3].apply(lambda x:len(x.unique()))
print count_un
var3                  208
var15                 100
imp_ent_var16_ult1    596
dtype: int64

np.sum(df.iloc[:,0]==0)
75

count_zero = df.iloc[:,0:3].apply(lambda x:sum(x==0))
print count_zero
var3                     75
var15                     0
imp_ent_var16_ult1    72301
dtype: int64

np.mean(df.iloc[:,0])
-1523.1992765061825

df.iloc[:,0][~np.isin(df.iloc[:,0],missSet)]
0          2
1          2
2          2
3          2
4          2
5          2
6          2
7          2
8          2
9          2
10         2
11         2
12         2
13         2
14         2
15         2
16         2
17         2
18       229
19         2
20         2
21         2
22         2
23         2
24         2
25         2
26         2
27         2
28         2
29         2
...
75990      2
75991      2
75992      2
75993      2
75994      2
75995      2
75996      2
75997      2
75998      2
75999      2
76000      2
76001      2
76002      2
76003      2
76004      2
76005      2
76006      2
76007      2
76008      2
76009      2
76010      2
76011      2
76012      4
76013      2
76014      2
76015      2
76016      2
76017      2
76018      2
76019      2
Name: var3, Length: 75904, dtype: int64

np.mean(df.iloc[:,0][~np.isin(df.iloc[:,0],missSet)])
2.717577466273187

df_mean=df.iloc[:,0:3].apply(lambda x:np.mean(x[~np.isin(x,missSet)]))
print  df_mean
var3                   2.717577
var15                 33.212865
imp_ent_var16_ult1    86.208265
dtype: float64

print(np.__version__)
1.13.1

np.median(df.iloc[:,0][~np.isin(df.iloc[:,0],missSet)])
2.0

df_median=df.iloc[:,0:3].apply(lambda x:np.median(x[~np.isin(x,missSet)]))
print df_median
var3                   2.0
var15                 28.0
imp_ent_var16_ult1     0.0
dtype: float64

#mode（）返回值是两个数组，分别是众数和众数出现的次数
from  scipy import  stats
stats.mode(df.iloc[:,0][~np.isin(df.iloc[:,0],missSet)])
ModeResult(mode=array([2], dtype=int64), count=array([74165]))

df_mode = df.iloc[:,0:3].apply(lambda x:stats.mode(x[~np.isin(x,missSet)])[0][0])
print df_mode
var3                   2.0
var15                 23.0
imp_ent_var16_ult1     0.0
dtype: float64

df_mode_count = df.iloc[:,0:3].apply(lambda x:stats.mode(x[~np.isin(x,missSet)])[1][0])
print df_mode_count
var3                  74165
var15                 20170
imp_ent_var16_ult1    72301
dtype: int64

df_mode_percentage = df_mode_count/df.shape[0]
print df_mode_percentage
var3                  0.975599
var15                 0.265325
imp_ent_var16_ult1    0.951079
dtype: float64

np.min(df.iloc[:,0])
-999999

np.min(df.iloc[:,0][~np.isin(df.iloc[:,0],missSet)])
0

df_min = df.iloc[:,0:3].apply(lambda x:np.min(x[~np.isin(x,missSet)]))
print df_min
var3                  0.0
var15                 5.0
imp_ent_var16_ult1    0.0
dtype: float64

df_max= df.iloc[:,0:3].apply(lambda x:np.max(x[~np.isin(x,missSet)]))
print df_max
var3                     238.0
var15                    105.0
imp_ent_var16_ult1    210000.0
dtype: float64

np.percentile(df.iloc[:,0],(1,5,25,50,75,95,99))
array([  2.,   2.,   2.,   2.,   2.,   2.,  11.])

df_percentile = np.percentile(df.iloc[:,0][~np.isin(df.iloc[:,0],missSet)],(1,5,25,50,75,95,99))
print df_percentile
[  2.   2.   2.   2.   2.   2.  11.]

json_quan = {}
for i,name in enumerate(df.iloc[:,0:3].columns):
json_quan[name]=np.percentile(df[name][~np.isin(df[name],missSet)],(1,5,25,50,75,95,99))
print("the {} is {} ={}").format(i,name,json_quan[name])
the 0 is var3 =[  2.   2.   2.   2.   2.   2.  11.]
the 1 is var15 =[ 22.  23.  23.  28.  40.  60.  77.]
the 2 is imp_ent_var16_ult1 =[    0.     0.     0.     0.     0.     0.  1500.]

df_quan = pd.DataFrame(json_quan)[df.iloc[:,0:3].columns].T
print df_quan
                       0     1     2     3     4     5       6
var3                 2.0   2.0   2.0   2.0   2.0   2.0    11.0
var15               22.0  23.0  23.0  28.0  40.0  60.0    77.0
imp_ent_var16_ult1   0.0   0.0   0.0   0.0   0.0   0.0  1500.0

#统计频数出现前5的数据
df.iloc[:,0].value_counts().iloc[0:5,]
 2         74165
8           138
-999999      116
9           110
3           108
Name: var3, dtype: int64

df.iloc[:,0][~np.isin(df.iloc[:,0],missSet)].value_counts().iloc[0:5,]
2    74165
8      138
9      110
3      108
1      105
Name: var3, dtype: int64

json_fre_name={}
json_fre_counts ={}

def fill_fre_top_5(x):
if(len(x))<=5:
new_array = np.full(5,np.nan)
new_array[0:len(x)]=x
return new_array
df['ind_var1_0'].value_counts()
df['imp_sal_var16_ult1'].value_counts()
0.00        75924
3000.00         9
1500.00         5
600.00          4
450.00          3
750.00          3
150.00          3
3600.00         2
3300.00         2
1620.00         2
45.00           2
300.00          2
12000.00        2
210.00          2
30.00           2
2100.00         2
6000.00         2
165.00          1
900.00          1
2244.00         1
15.00           1
360.00          1
1809.00         1
8850.00         1
390.00          1
120.00          1
99.00           1
6270.00         1
10500.00        1
1350.00         1
...
29250.00        1
1800.00         1
2550.00         1
4200.00         1
1650.00         1
3570.00         1
1020.00         1
907.14          1
60.00           1
630.00          1
6060.00         1
1770.00         1
1509.75         1
7050.00         1
105.00          1
2010.00         1
2460.00         1
4608.00         1
1227.00         1
1.50            1
1470.00         1
6900.00         1
86.01           1
5850.00         1
58048.05        1
2872.50         1
5570.13         1
8187.00         1
574.56          1
540.00          1
Name: imp_sal_var16_ult1, Length: 66, dtype: int64

json_fre_name={}
json_fre_count ={}
for i,name in enumerate(df[['ind_var1_0','imp_sal_var16_ult1']].columns):
index_name = df[name][~np.isin(df[name],missSet)].value_counts().iloc[0:5,].index.values
index_name = fill_fre_top_5(index_name)
json_fre_name[name] =index_name
value_count = df[name][~np.isin(df[name],missSet)].value_counts().iloc[0:5,].values
value_count = fill_fre_top_5(value_count)
json_fre_count[name]=value_count
df_fre_name = pd.DataFrame(json_fre_name)[df[['ind_var1_0','imp_sal_var16_ult1']].columns].T
df_fre_count = pd.DataFrame(json_fre_count)[df[['ind_var1_0','imp_sal_var16_ult1']].columns].T

df_fre = pd.concat([df_fre_name,df_fre_count],axis=1)
print df_fre
                      0       1       2      3      4        0      1    2  \
ind_var1_0          0.0     1.0     NaN    NaN    NaN  75149.0  871.0  NaN
imp_sal_var16_ult1  0.0  3000.0  1500.0  600.0  450.0  75924.0    9.0  5.0

3    4
ind_var1_0          NaN  NaN
imp_sal_var16_ult1  4.0  3.0

#计算缺失值的数量
df_miss = df.iloc[:,0:3].apply(lambda x:np.sum(np.isin(x,missSet)))
print df_miss
var3                  116
var15                   0
imp_ent_var16_ult1      0
dtype: int64