import numpy as np
import pandas as pd
# Column labels assigned to the 21 fields of german.data, in file order.
# NOTE: "lable" (sic) is kept as spelled because it is the column key used
# to refer to the last column.
names = [
    "Balance",
    "Duration",
    "History",
    "Purpose",
    "Credit amount",
    "Savings",
    "Employment",
    "instPercent",
    "sexMarried",
    "Guarantors",
    "Residence duration",
    "Assets",
    "Age",
    "concCredit",
    "Apartment",
    "Credits",
    "Occupation",
    "Dependents",
    "hasPhone",
    "Foreign",
    "lable",
]

# Fields are separated by runs of whitespace. The separator is a regular
# expression, so it is written as a raw string: "\s" in a plain string
# literal is an invalid escape sequence and triggers a warning on current
# Python versions. Because `names` is supplied, the first line of the file
# is read as data rather than as a header.
data = pd.read_csv(
    "Desktop/sunshengyun/data/german/german.data",
    sep=r"\s+",
    names=names,
)
data.head()
| | Balance | Duration | History | Purpose | Credit amount | Savings | Employment | instPercent | sexMarried | Guarantors | … | Assets | Age | concCredit | Apartment | Credits | Occupation | Dependents | hasPhone | Foreign | lable |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A11 | 6 | A34 | A43 | 1169 | A65 | A75 | 4 | A93 | A101 | … | A121 | 67 | A143 | A152 | 2 | A173 | 1 | A192 | A201 | 1 |
| 1 | A12 | 48 | A32 | A43 | 5951 | A61 | A73 | 2 | A92 | A101 | … | A121 | 22 | A143 | A152 | 1 | A173 | 1 | A191 | A201 | 2 |
| 2 | A14 | 12 | A34 | A46 | 2096 | A61 | A74 | 2 | A93 | A101 | … | A121 | 49 | A143 | A152 | 1 | A172 | 2 | A191 | A201 | 1 |
| 3 | A11 | 42 | A32 | A42 | 7882 | A61 | A74 | 2 | A93 | A103 | … | A122 | 45 | A143 | A153 | 1 | A173 | 2 | A191 | A201 | 1 |
| 4 | A11 | 24 | A33 | A40 | 4870 | A61 | A73 | 3 | A93 | A101 | … | A124 | 53 | A143 | A153 | 2 | A173 | 2 | A191 | A201 | 2 |
5 rows × 21 columns
# List the distinct codes that occur in the "Balance" column.
data["Balance"].unique()
array(['A11', 'A12', 'A14', 'A13'], dtype=object)
# Count the non-missing entries in every column (a quick completeness check).
data.count()
Balance               1000
Duration              1000
History               1000
Purpose               1000
Credit amount         1000
Savings               1000
Employment            1000
instPercent           1000
sexMarried            1000
Guarantors            1000
Residence duration    1000
Assets                1000
Age                   1000
concCredit            1000
Apartment             1000
Credits               1000
Occupation            1000
Dependents            1000
hasPhone              1000
Foreign               1000
lable                 1000
dtype: int64
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()
| | Duration | Credit amount | instPercent | Residence duration | Age | Credits | Dependents | lable |
|---|---|---|---|---|---|---|---|---|
| count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
| mean | 20.903000 | 3271.258000 | 2.973000 | 2.845000 | 35.546000 | 1.407000 | 1.155000 | 1.300000 |
| std | 12.058814 | 2822.736876 | 1.118715 | 1.103718 | 11.375469 | 0.577654 | 0.362086 | 0.458487 |
| min | 4.000000 | 250.000000 | 1.000000 | 1.000000 | 19.000000 | 1.000000 | 1.000000 | 1.000000 |
| 25% | 12.000000 | 1365.500000 | 2.000000 | 2.000000 | 27.000000 | 1.000000 | 1.000000 | 1.000000 |
| 50% | 18.000000 | 2319.500000 | 3.000000 | 3.000000 | 33.000000 | 1.000000 | 1.000000 | 1.000000 |
| 75% | 24.000000 | 3972.250000 | 4.000000 | 4.000000 | 42.000000 | 2.000000 | 1.000000 | 2.000000 |
| max | 72.000000 | 18424.000000 | 4.000000 | 4.000000 | 75.000000 | 4.000000 | 2.000000 | 2.000000 |
# List the distinct values that occur in the "Duration" column.
data["Duration"].unique()
array([ 6, 48, 12, 42, 24, 36, 30, 15, 9, 10, 7, 60, 18, 45, 11, 27, 8, 54, 20, 14, 33, 21, 16, 4, 47, 13, 22, 39, 28, 5, 26, 72, 40], dtype=int64)
# List the distinct codes that occur in the "History" column.
data["History"].unique()
array(['A34', 'A32', 'A33', 'A30', 'A31'], dtype=object)
data.groupby('Balance').size().order(ascending=