以德国信用数据为例,用 logistic regression 算法做信用评分卡的原理性实现,因此并未考虑 feature selection。
第一步:导入必要的库
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
第二步:导入数据
# Load the German credit data set: space-separated values, no header row.
german = pd.read_csv('D:/CreditDatasets/german.data', sep=' ', header=None)

# Name the 21 columns; the last one ('default') is the target label.
german.columns = [
    'Status_of_existing_checking_account',
    'Duration_in_month',
    'Credit_history',
    'Purpose',
    'Credit_amount',
    'Savings_account',
    'Present_employment_since',
    'Installment_rate',
    'Personal_status_and_sex',
    'Other_debtors',
    'Present_residence_since',
    'Property',
    'Age',
    'Other_installment_plans',
    'Housing',
    'Number_of_existing_credits',
    'Job',
    'Number_of_people',
    'Telephone',
    'foreign_worker',
    'default',
]

# Overall sample counts per target label: label 1 is counted as "good",
# label 2 as "bad". These totals are the denominators used for the WOE ratios.
Grp = german.groupby('default')
_class_sizes = Grp.size()
total_good = _class_sizes[1]
total_bad = _class_sizes[2]
第三步:分别计算名义变量和数值变量的woe值,对取值较少的数值变量也用名义变量woe计算方法实现,其余数值变量均5等分
def CalcWOE(VarName):
    """Compute the WOE and IV contribution of every distinct value of a column.

    Reads the module-level ``german`` DataFrame and the module-level totals
    ``total_good`` / ``total_bad``. For each distinct value ``v`` of
    ``german[VarName]``:

        good_ratio = (# rows with value v and default == 1) / total_good
        bad_ratio  = (# rows with value v and default == 2) / total_bad
        woe        = ln(bad_ratio / good_ratio)
        iv         = (bad_ratio - good_ratio) * woe

    Args:
        VarName: Name of the column in ``german`` to analyse.

    Returns:
        A DataFrame with columns ``variable``, ``class``, ``woe`` and ``iv``,
        one row per distinct value (in ``np.unique`` order), indexed 0..n-1.

    Raises:
        KeyError: If some value of the column has no good (1) or no bad (2)
            samples, in which case its WOE is undefined.
    """
    rows = []
    for v in np.unique(german[VarName]):
        # Good/bad counts among the rows that take this particular value.
        counts = german.loc[german[VarName] == v, 'default'].value_counts()
        good = counts.get(1, 0)
        bad = counts.get(2, 0)
        if good == 0 or bad == 0:
            # log(0) or a division by zero would follow; fail with a clear message
            # (same exception type the original label lookup produced).
            raise KeyError(
                "WOE undefined for %s == %r: good=%d, bad=%d"
                % (VarName, v, good, bad)
            )

        good_ratio = float(good) / total_good
        bad_ratio = float(bad) / total_bad
        WOE = np.log(bad_ratio / good_ratio)
        IV = (bad_ratio - good_ratio) * WOE
        rows.append([VarName, v, WOE, IV])

    # Build the frame once: DataFrame.append was removed in pandas 2.0, and
    # appending row by row re-copied the whole table on every iteration.
    return pd.DataFrame(rows, columns=['variable', 'class', 'woe', 'iv'])
# nominal variable woe
status_checking_account_woe