以德国信用数据为例,用logistic regression算法做信用评分卡的原理性实现,因此并未考虑feature selection。
第一步:导入必要的库
1
2
3
|
import numpy as np
import pandas as pd

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20, so it is kept only as a
# fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
|
第二步:导入数据
1
2
3
4
5
|
# Load the space-separated data file; it has no header row, so the column
# names are assigned explicitly below (20 attributes followed by the target).
german = pd.read_csv('D:/CreditDatasets/german.data', sep=' ', header=None)
german.columns = [
    'Status_of_existing_checking_account',
    'Duration_in_month',
    'Credit_history',
    'Purpose',
    'Credit_amount',
    'Savings_account',
    'Present_employment_since',
    'Installment_rate',
    'Personal_status_and_sex',
    'Other_debtors',
    'Present_residence_since',
    'Property',
    'Age',
    'Other_installment_plans',
    'Housing',
    'Number_of_existing_credits',
    'Job',
    'Number_of_people',
    'Telephone',
    'foreign_worker',
    'default',
]

# Row counts per target label: label 1 is taken as "good", label 2 as "bad".
Grp = german.groupby('default')
total_good, total_bad = Grp.size().loc[[1, 2]]
|
第三步:分别计算名义变量和数值变量的woe值,对取值较少的数值变量也用名义变量woe计算方法实现,其余数值变量均5等分
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
|
def CalcWOE(VarName, data=None):
    """Compute weight of evidence (WOE) and information value (IV) per category.

    Every distinct value of ``data[VarName]`` is treated as its own class.
    For each class::

        good_ratio = goods_in_class / total_goods
        bad_ratio  = bads_in_class  / total_bads
        woe        = ln(bad_ratio / good_ratio)
        iv         = (bad_ratio - good_ratio) * woe

    where rows with ``default == 1`` are goods and ``default == 2`` are bads.

    Args:
        VarName: name of the column to analyse.
        data: frame holding ``VarName`` and a ``'default'`` column. When
            omitted, the module-level ``german`` frame is used.

    Returns:
        DataFrame with columns ``variable``, ``class``, ``woe``, ``iv`` and
        one row per distinct value, in sorted order.

    Raises:
        ValueError: if ``'default'`` does not contain both labels 1 and 2.
    """
    if data is None:
        data = german  # module-level frame loaded earlier in the script

    target = data['default']
    n_good = float((target == 1).sum())
    n_bad = float((target == 2).sum())
    if not n_good or not n_bad:
        raise ValueError("'default' must contain both label 1 (good) and label 2 (bad)")

    rows = []
    for v in np.unique(data[VarName]):
        in_class = target[data[VarName] == v]
        Good = (in_class == 1).sum()
        Bad = (in_class == 2).sum()
        # A class with no goods (or no bads) would give log(0) or a division
        # by zero; substitute half an observation so the WOE stays finite.
        good_ratio = (Good if Good else 0.5) / n_good
        bad_ratio = (Bad if Bad else 0.5) / n_bad
        WOE = np.log(bad_ratio / good_ratio)
        IV = (bad_ratio - good_ratio) * WOE
        rows.append([VarName, v, WOE, IV])

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=['variable', 'class', 'woe', 'iv'])
# WOE tables for the nominal (categorical) variables: one class per value.
status_checking_account_woe = CalcWOE('Status_of_existing_checking_account')
Credit_history_woe = CalcWOE('Credit_history')
Purpose_woe = CalcWOE('Purpose')
Savings_account_woe = CalcWOE('Savings_account')
Present_employment_since_woe = CalcWOE('Present_employment_since')
Personal_status_and_sex_woe = CalcWOE('Personal_status_and_sex')
Other_debtors_woe = CalcWOE('Other_debtors')
Property_woe = CalcWOE('Property')
Other_installment_plans_woe = CalcWOE('Other_installment_plans')
Housing_woe = CalcWOE('Housing')
Job_woe = CalcWOE('Job')
Telephone_woe = CalcWOE('Telephone')
foreign_worker_woe = CalcWOE('foreign_worker')

# Numeric variables handled without binning: each distinct value is its own
# class, exactly like a nominal variable.
Installment_rate_woe = CalcWOE('Installment_rate')
Present_residence_since_woe = CalcWOE('Present_residence_since')
Number_of_existing_credits_woe = CalcWOE('Number_of_existing_credits')
Number_of_people_woe = CalcWOE('Number_of_people')
def CalcWOE_bin(VarName, N, data=None):
    """Compute WOE and IV for a numeric column split into N equal-width bins.

    The range ``[min, max]`` of ``data[VarName]`` is cut into ``N`` bins of
    equal width. The first bin is closed on both sides so that the minimum
    value belongs to it; every later bin is ``(lower, upper]``. For each bin::

        woe = ln(bad_ratio / good_ratio)
        iv  = (bad_ratio - good_ratio) * woe

    with ``good_ratio``/``bad_ratio`` the bin's share of all goods
    (``default == 1``) and all bads (``default == 2``).

    Args:
        VarName: name of the numeric column to bin.
        N: number of equal-width bins.
        data: frame holding ``VarName`` and a ``'default'`` column. When
            omitted, the module-level ``german`` frame is used.

    Returns:
        DataFrame with columns ``variable``, ``class+woe``, ``woe``, ``iv``
        and one row per bin. ``class+woe`` holds the list
        ``[lower, upper, woe]`` for the bin.

    Raises:
        ValueError: if ``'default'`` does not contain both labels 1 and 2.
    """
    if data is None:
        data = german  # module-level frame loaded earlier in the script

    values = data[VarName]
    target = data['default']
    is_good = target == 1
    is_bad = target == 2
    n_good = float(is_good.sum())
    n_bad = float(is_bad.sum())
    if not n_good or not n_bad:
        raise ValueError("'default' must contain both label 1 (good) and label 2 (bad)")

    # N + 1 shared edges: adjacent bins reuse the same edge (no gaps from
    # recomputing it) and the last edge is exactly the maximum, so the largest
    # value cannot be lost to floating-point rounding.
    edges = np.linspace(values.min(), values.max(), N + 1)

    rows = []
    for i in range(N):
        bin_L = float(edges[i])
        bin_U = float(edges[i + 1])
        if i == 0:
            # Only the first bin includes its lower edge (the column minimum).
            in_bin = (values >= bin_L) & (values <= bin_U)
        else:
            in_bin = (values > bin_L) & (values <= bin_U)

        Good = (in_bin & is_good).sum()
        Bad = (in_bin & is_bad).sum()
        if Good == 0 and Bad == 0:
            # Empty bin: no observations, so it carries no evidence.
            WOE = 0.0
            IV = 0.0
        else:
            # A bin missing one of the two labels would give log(0) or a
            # division by zero; substitute half an observation instead.
            good_ratio = (Good if Good else 0.5) / n_good
            bad_ratio = (Bad if Bad else 0.5) / n_bad
            WOE = np.log(bad_ratio / good_ratio)
            IV = (bad_ratio - good_ratio) * WOE
        rows.append([VarName, [bin_L, bin_U, WOE], WOE, IV])

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=['variable', 'class+woe', 'woe', 'iv'])
# Continuous variables: WOE per bin, using five equal-width bins each.
Duration_in_month_woe = CalcWOE_bin('Duration_in_month', 5)
Credit_amount_woe = CalcWOE_bin('Credit_amount', 5)
Age_woe = CalcWOE_bin('Age', 5)
|
第四步:用woe值替代原来的值
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
def ReplaceWOE(VarName, SourceDF, VarWOE):
    """Replace each value of a column by the WOE of its class.

    Args:
        VarName: name of the column in ``SourceDF`` to overwrite.
        SourceDF: frame to transform. It is modified in place.
        VarWOE: WOE table with a ``'class'`` column (the raw values) and a
            ``'woe'`` column (their WOE), row-aligned.

    Returns:
        ``SourceDF`` itself, with ``VarName`` mapped to WOE values. Values
        that do not appear in ``VarWOE['class']`` become NaN.
    """
    # Pair classes and WOE values row by row. This does not rely on dict
    # iteration order or on VarWOE carrying a 0..n-1 index.
    woe_by_class = dict(zip(VarWOE['class'], VarWOE['woe']))
    SourceDF[VarName] = SourceDF[VarName].map(woe_by_class)
    return SourceDF
# NOTE: this binds a second name to the same frame, it does not copy it.
german_woe = german

# Apply the WOE substitution column by column, feeding each result into the
# next call. `temp` always holds the latest result and `temp1` the one before.
temp = german_woe
for _column, _woe_table in [
    ('Status_of_existing_checking_account', status_checking_account_woe),
    ('Credit_history', Credit_history_woe),
    ('Purpose', Purpose_woe),
    ('Savings_account', Savings_account_woe),
    ('Present_employment_since', Present_employment_since_woe),
    ('Personal_status_and_sex', Personal_status_and_sex_woe),
    ('Other_debtors', Other_debtors_woe),
    ('Property', Property_woe),
    ('Other_installment_plans', Other_installment_plans_woe),
    ('Housing', Housing_woe),
    ('Job', Job_woe),
    ('Telephone', Telephone_woe),
    ('foreign_worker', foreign_worker_woe),
    ('Installment_rate', Installment_rate_woe),
    ('Present_residence_since', Present_residence_since_woe),
    ('Number_of_existing_credits', Number_of_existing_credits_woe),
    ('Number_of_people', Number_of_people_woe),
]:
    temp1 = temp
    temp = ReplaceWOE(_column, temp1, _woe_table)
def ReplaceWOE_bin(VarName, SourceDF, VarWOE):
    """Replace each value of a binned numeric column by the WOE of its bin.

    Args:
        VarName: name of the column in ``SourceDF`` to overwrite.
        SourceDF: frame to transform. It is modified in place.
        VarWOE: WOE table whose ``'class+woe'`` column holds one
            ``[lower, upper, woe]`` triple per bin, in bin order.

    Returns:
        ``SourceDF`` itself, with ``VarName`` mapped to WOE values. A value
        that falls in no bin becomes NaN.
    """
    # Read the bins positionally so the table's index labels do not matter.
    bins = list(VarWOE['class+woe'])
    first_bin_woe = bins[0][2]
    lowest = SourceDF[VarName].min()

    woe_by_value = {}
    for value in np.unique(SourceDF[VarName]):
        if value == lowest:
            # Bins are open on the left, so the column minimum is assigned to
            # the first bin explicitly.
            woe_by_value[value] = first_bin_woe
            continue
        # Bins are (lower, upper]. There is deliberately no early exit, so if
        # rounding ever makes two bins overlap, the later bin wins.
        for lower, upper, woe in bins:
            if lower < value <= upper:
                woe_by_value[value] = woe

    SourceDF[VarName] = SourceDF[VarName].map(woe_by_value)
    return SourceDF
# Substitute bin WOE values for the three continuous columns, chaining the
# result of each call into the next. `temp1` ends up as the final frame.
temp = ReplaceWOE_bin(
    'Credit_amount',
    ReplaceWOE_bin('Duration_in_month', temp, Duration_in_month_woe),
    Credit_amount_woe,
)
temp1 = ReplaceWOE_bin('Age', temp, Age_woe)
|
第五步:将数据集拆分为训练集和测试集
1
2
3
|
# Features are every column except the last one ('default').
X = temp1[temp1.columns[:-1]]
# Shift the target from {1, 2} to {0, 1}.
y = temp1['default'] - 1

# Hold out 10% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)
|
第六步:在训练集上应用logistic regression算法
1
2
3
4
|
# Import from the public package: `sklearn.linear_model.logistic` is a private
# module path that newer scikit-learn releases no longer provide.
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the WOE-encoded
# training split, then predict labels for the held-out rows.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
|
第七步:评估模型分类精度
1
2
3
4
5
6
|
from sklearn.metrics import accuracy_score

# cross_val_score lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20, so it is kept only as a
# fallback for very old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Hold-out accuracy (left disabled, as in the original script):
# print('Accuracy:', accuracy_score(y_test, predictions))

# 5-fold cross-validation on the training split, using the estimator's
# default scorer.
scores = cross_val_score(classifier, X_train, y_train, cv=5)
# print(np.mean(scores), scores)
|
第八步:创建评分卡
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
|
# Scorecard scaling: score = A - B*log(theta), where theta is the odds of the
# positive class of the fitted model (the target is 'default' - 1).
# Calibration: P0 = A - B*log(theta0) and P0 - PDO = A - B*log(2*theta0),
# i.e. doubling the odds lowers the score by PDO points. Solving gives
# B = PDO / log(2) and A = P0 + B*log(theta0).
P0 = 600
PDO = 20
theta0 = 1.0 / 60
B = PDO / np.log(2)
A = P0 + B * np.log(theta0)

coef = classifier.coef_
beta0 = classifier.intercept_

# WOE tables listed in the same order as the feature columns the model was
# trained on, so that the i-th coefficient pairs with the i-th table.
_woe_tables = [
    status_checking_account_woe,
    Duration_in_month_woe,
    Credit_history_woe,
    Purpose_woe,
    Credit_amount_woe,
    Savings_account_woe,
    Present_employment_since_woe,
    Installment_rate_woe,
    Personal_status_and_sex_woe,
    Other_debtors_woe,
    Present_residence_since_woe,
    Property_woe,
    Age_woe,
    Other_installment_plans_woe,
    Housing_woe,
    Number_of_existing_credits_woe,
    Job_woe,
    Number_of_people_woe,
    Telephone_woe,
    foreign_worker_woe,
]
if len(coef[0]) != len(_woe_tables):
    raise ValueError(
        'expected %d coefficients, got %d' % (len(_woe_tables), len(coef[0]))
    )

# log(theta) = beta0 + sum_i coef_i * woe_i, so
# score = (A - B*beta0) - sum_i B * coef_i * woe_i.
# The constant part is split evenly across the features so that summing one
# 'score' per feature gives the total. intercept_ is a length-1 array, hence
# the [0].
_base_points = (A - B * beta0[0]) / len(_woe_tables)
for _weight, _table in zip(coef[0], _woe_tables):
    _table['score'] = _base_points - B * _weight * _table['woe']
|