######### WOE 评分模型：logistic regression 算法在评分卡上的实践

最新推荐文章于 2023-05-16 22:42:08 发布

mishidemudong

最新推荐文章于 2023-05-16 22:42:08 发布

阅读量3k

点赞数

分类专栏：数据挖掘

数据挖掘专栏收录该内容

132 篇文章 12 订阅

订阅专栏

以德国信用数据为例，用 logistic regression 算法做信用评分卡的原理性实现，因此并未考虑 feature selection。

第一步：导入必要的库

 
        import  
        pandas as pd 
       
        import  
        numpy as np 
       
        from  
        sklearn.cross_validation  
        import  
        train_test_split

第二步：导入数据

 
   
        # Read the space-separated credit file; header=None because the file
        # carries no header row of its own.
        german = pd.read_csv('D:/CreditDatasets/german.data', sep=' ', header=None)

        # Name the columns in file order; the last one, 'default', is the label.
        german.columns = [
            'Status_of_existing_checking_account',
            'Duration_in_month',
            'Credit_history',
            'Purpose',
            'Credit_amount',
            'Savings_account',
            'Present_employment_since',
            'Installment_rate',
            'Personal_status_and_sex',
            'Other_debtors',
            'Present_residence_since',
            'Property',
            'Age',
            'Other_installment_plans',
            'Housing',
            'Number_of_existing_credits',
            'Job',
            'Number_of_people',
            'Telephone',
            'foreign_worker',
            'default',
        ]

        # Overall row counts per label code: code 1 is counted as "good" and
        # code 2 as "bad".  These are the denominators of the WOE ratios.
        Grp = german.groupby('default')
        total_good = Grp.size()[1]
        total_bad = Grp.size()[2]
       
 
 

第三步：分别计算名义变量和数值变量的woe值，对取值较少的数值变量也用名义变量woe计算方法实现，其余数值变量均5等分

 
   
        def CalcWOE(VarName, data=None, good_total=None, bad_total=None):
            """Compute WOE and IV for every distinct value of one column.

            Each distinct value of ``VarName`` is treated as its own class.  For a
            class, ``woe = ln(bad_ratio / good_ratio)`` and
            ``iv = (bad_ratio - good_ratio) * woe``, where ``good_ratio`` is the
            class's share of all label-1 rows and ``bad_ratio`` its share of all
            label-2 rows.

            Args:
                VarName: Name of the column to analyse.
                data: Frame containing ``VarName`` and a ``'default'`` label column
                    coded 1 (good) / 2 (bad).  When omitted, the module-level
                    ``german`` frame is used.
                good_total: Total number of good rows (denominator of
                    ``good_ratio``).  When omitted it is the module-level
                    ``total_good`` if ``data`` is omitted, otherwise it is counted
                    from ``data``.
                bad_total: Total number of bad rows; resolved like ``good_total``
                    (module-level ``total_bad`` or counted from ``data``).

            Returns:
                DataFrame with columns ``variable``, ``class``, ``woe``, ``iv``;
                one row per distinct value, in the sorted order of ``np.unique``.

            Raises:
                ValueError: If some class contains no good rows or no bad rows;
                    its WOE would be infinite.
            """
            if data is None:
                # Original behaviour: operate on the module-level frame and totals.
                data = german
                if good_total is None:
                    good_total = total_good
                if bad_total is None:
                    bad_total = total_bad
            else:
                label_counts = data['default'].value_counts()
                if good_total is None:
                    good_total = label_counts.get(1, 0)
                if bad_total is None:
                    bad_total = label_counts.get(2, 0)

            rows = []
            for v in np.unique(data[VarName]):
                counts = data.loc[data[VarName] == v, 'default'].value_counts()
                good = counts.get(1, 0)
                bad = counts.get(2, 0)
                if good == 0 or bad == 0:
                    raise ValueError(
                        'WOE is undefined for %s == %r: %d good / %d bad rows'
                        % (VarName, v, good, bad)
                    )
                good_ratio = float(good) / good_total
                bad_ratio = float(bad) / bad_total
                woe = np.log(bad_ratio / good_ratio)
                iv = (bad_ratio - good_ratio) * woe
                rows.append([VarName, v, woe, iv])

            # Build the frame once from the collected rows; DataFrame.append, used
            # previously to grow the result row by row, no longer exists in pandas 2.
            return pd.DataFrame(rows, columns=['variable', 'class', 'woe', 'iv'])
       

           
       
 
        # Categorical attributes: every distinct code becomes one WOE class.
        status_checking_account_woe = CalcWOE(VarName='Status_of_existing_checking_account')
        Credit_history_woe = CalcWOE(VarName='Credit_history')
        Purpose_woe = CalcWOE(VarName='Purpose')
        Savings_account_woe = CalcWOE(VarName='Savings_account')
        Present_employment_since_woe = CalcWOE(VarName='Present_employment_since')
        Personal_status_and_sex_woe = CalcWOE(VarName='Personal_status_and_sex')
        Other_debtors_woe = CalcWOE(VarName='Other_debtors')
        Property_woe = CalcWOE(VarName='Property')
        Other_installment_plans_woe = CalcWOE(VarName='Other_installment_plans')
        Housing_woe = CalcWOE(VarName='Housing')
        Job_woe = CalcWOE(VarName='Job')
        Telephone_woe = CalcWOE(VarName='Telephone')
        foreign_worker_woe = CalcWOE(VarName='foreign_worker')

        # Numeric attributes handled without binning: each distinct value is
        # used as its own class, exactly like the categorical ones above.
        Installment_rate_woe = CalcWOE(VarName='Installment_rate')
        Present_residence_since_woe = CalcWOE(VarName='Present_residence_since')
        Number_of_existing_credits_woe = CalcWOE(VarName='Number_of_existing_credits')
        Number_of_people_woe = CalcWOE(VarName='Number_of_people')
       

           
       

           
       
 
        def CalcWOE_bin(VarName, N, data=None, good_total=None, bad_total=None):
            """Compute WOE and IV of a numeric column over N equal-width bins.

            The range ``[min, max]`` of the column is cut into ``N`` intervals of
            equal width.  The first interval is closed on both sides so that the
            minimum belongs to it; every later interval is ``(lower, upper]``.  Per
            bin, ``woe = ln(bad_ratio / good_ratio)`` and
            ``iv = (bad_ratio - good_ratio) * woe``.

            Args:
                VarName: Name of the numeric column to bin.
                N: Number of equal-width bins.
                data: Frame containing ``VarName`` and a ``'default'`` label column
                    coded 1 (good) / 2 (bad).  When omitted, the module-level
                    ``german`` frame is used.
                good_total: Total number of good rows.  When omitted it is the
                    module-level ``total_good`` if ``data`` is omitted, otherwise
                    it is counted from ``data``.
                bad_total: Total number of bad rows; resolved like ``good_total``
                    (module-level ``total_bad`` or counted from ``data``).

            Returns:
                DataFrame with columns ``variable``, ``class+woe``, ``woe``, ``iv``
                and one row per bin.  Each ``class+woe`` cell is the list
                ``[lower, upper, woe]`` describing the bin.

            Raises:
                ValueError: If some bin contains no good rows or no bad rows; its
                    WOE would be infinite.
            """
            if data is None:
                # Original behaviour: operate on the module-level frame and totals.
                data = german
                if good_total is None:
                    good_total = total_good
                if bad_total is None:
                    bad_total = total_bad
            else:
                label_counts = data['default'].value_counts()
                if good_total is None:
                    good_total = label_counts.get(1, 0)
                if bad_total is None:
                    bad_total = label_counts.get(2, 0)

            values = data[VarName]
            min_value = values.min()
            max_value = values.max()
            width = float(max_value - min_value) / N

            rows = []
            for i in range(N):
                lower = min_value + i * width
                if i == N - 1:
                    # Pin the last edge to the true maximum so float rounding in
                    # min + N * width cannot push the largest value out of range.
                    upper = float(max_value)
                else:
                    upper = min_value + (i + 1) * width

                if i == 0:
                    # Only the first bin includes its lower edge (the minimum).
                    in_bin = (values >= lower) & (values <= upper)
                else:
                    in_bin = (values > lower) & (values <= upper)

                counts = data.loc[in_bin, 'default'].value_counts()
                good = counts.get(1, 0)
                bad = counts.get(2, 0)
                if good == 0 or bad == 0:
                    raise ValueError(
                        'WOE is undefined for %s bin %d [%r, %r]: %d good / %d bad rows'
                        % (VarName, i, lower, upper, good, bad)
                    )
                good_ratio = float(good) / good_total
                bad_ratio = float(bad) / bad_total
                woe = np.log(bad_ratio / good_ratio)
                iv = (bad_ratio - good_ratio) * woe
                rows.append([VarName, [lower, upper, woe], woe, iv])

            # Build the frame once from the collected rows; DataFrame.append, used
            # previously to grow the result row by row, no longer exists in pandas 2.
            return pd.DataFrame(rows, columns=['variable', 'class+woe', 'woe', 'iv'])
       

           
       
 
        # Wide-range numeric attributes: WOE over five equal-width bins each.
        Duration_in_month_woe = CalcWOE_bin(VarName='Duration_in_month', N=5)
        Credit_amount_woe = CalcWOE_bin(VarName='Credit_amount', N=5)
        Age_woe = CalcWOE_bin(VarName='Age', N=5)
       
 
 

第四步：用woe值替代原来的值

 
   
        def ReplaceWOE(VarName, SourceDF, VarWOE):
            """Overwrite a column with the WOE of each of its values.

            Args:
                VarName: Name of the column in ``SourceDF`` to overwrite.
                SourceDF: Frame to transform.  It is modified in place.
                VarWOE: Table with a ``'class'`` column (raw values) and a ``'woe'``
                    column (their WOE), one row per class.

            Returns:
                ``SourceDF`` itself (the same object), with ``VarName`` replaced.
                Values that have no row in ``VarWOE`` become NaN, which is what
                ``Series.map`` yields for keys missing from the dict.
            """
            # Pair class and WOE row by row.  Walking the two columns together is
            # positional, so it works whatever index labels VarWOE carries; the
            # previous counter-based lookup treated the counter as an index label.
            woe_by_class = dict(zip(VarWOE['class'], VarWOE['woe']))
            SourceDF[VarName] = SourceDF[VarName].map(woe_by_class)
            return SourceDF
       

           
       
 
        # Work on a copy: ReplaceWOE overwrites columns of the frame it receives,
        # so a plain alias would silently replace the raw values in ``german``.
        german_woe = german.copy()

        # Columns whose WOE table is keyed by distinct value (the tables built by
        # CalcWOE): first the categorical attributes, then the unbinned numeric ones.
        temp = german_woe
        for _var_name, _woe_table in [
            ('Status_of_existing_checking_account', status_checking_account_woe),
            ('Credit_history', Credit_history_woe),
            ('Purpose', Purpose_woe),
            ('Savings_account', Savings_account_woe),
            ('Present_employment_since', Present_employment_since_woe),
            ('Personal_status_and_sex', Personal_status_and_sex_woe),
            ('Other_debtors', Other_debtors_woe),
            ('Property', Property_woe),
            ('Other_installment_plans', Other_installment_plans_woe),
            ('Housing', Housing_woe),
            ('Job', Job_woe),
            ('Telephone', Telephone_woe),
            ('foreign_worker', foreign_worker_woe),
            ('Installment_rate', Installment_rate_woe),
            ('Present_residence_since', Present_residence_since_woe),
            ('Number_of_existing_credits', Number_of_existing_credits_woe),
            ('Number_of_people', Number_of_people_woe),
        ]:
            temp = ReplaceWOE(_var_name, temp, _woe_table)

        # Both names stay bound for the statements that follow; ReplaceWOE returns
        # the frame it was given, so they refer to the same transformed frame.
        temp1 = temp
       

           
       
 
        def ReplaceWOE_bin(VarName, SourceDF, VarWOE):
            """Overwrite a binned numeric column with the WOE of each value's bin.

            Args:
                VarName: Name of the column in ``SourceDF`` to overwrite.
                SourceDF: Frame to transform.  It is modified in place.
                VarWOE: Table whose ``'class+woe'`` column holds one
                    ``[lower, upper, woe]`` triple per bin.

            Returns:
                ``SourceDF`` itself (the same object), with ``VarName`` replaced.
                A value that falls in no bin is left out of the lookup table and
                therefore maps to NaN.
            """
            column = SourceDF[VarName]
            distinct_values = np.unique(column)
            smallest = min(column)

            woe_by_value = {}
            for value in distinct_values:
                if value != smallest:
                    # Bins are open on the left: lower < value <= upper.  Every
                    # bin is checked, so a later match overrides an earlier one.
                    for lower, upper, woe in VarWOE['class+woe']:
                        if lower < value <= upper:
                            woe_by_value[value] = woe
                    continue
                # The minimum sits exactly on the first bin's lower edge, which the
                # open-left test above would reject, so it is assigned the WOE of
                # the row labelled 0 directly.
                woe_by_value[value] = VarWOE['class+woe'][0][2]

            SourceDF[VarName] = column.map(woe_by_value)
            return SourceDF
       

           
       
 
        # Binned numeric columns: swap each raw value for the WOE of its bin.
        # The result alternates between the two names; the final frame is temp1.
        temp1 = ReplaceWOE_bin(VarName='Duration_in_month', SourceDF=temp,
                               VarWOE=Duration_in_month_woe)
        temp = ReplaceWOE_bin(VarName='Credit_amount', SourceDF=temp1,
                              VarWOE=Credit_amount_woe)
        temp1 = ReplaceWOE_bin(VarName='Age', SourceDF=temp, VarWOE=Age_woe)
       
 
 

第五步：将数据集拆分为训练集和测试集

 
   
        # Features are every column except the last one (the 'default' label).
        _feature_columns = list(temp1.columns)[:-1]
        X = temp1[_feature_columns]

        # Shift the label codes from 1/2 down to 0/1.
        y = temp1['default'] - 1

        # Hold out 10% of the rows for testing; the fixed seed makes the split
        # reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=0.1,
            random_state=0,
        )
       
 
 

第六步：在训练集上应用logistic regression算法

 
        # Import from the public package path: ``sklearn.linear_model.logistic``
        # was a private module that newer scikit-learn releases no longer expose.
        from sklearn.linear_model import LogisticRegression

        # Fit on the WOE-encoded training rows, then label the held-out rows.
        classifier = LogisticRegression()
        classifier.fit(X_train, y_train)
        predictions = classifier.predict(X_test)

第七步：评估模型分类精度

 
        from sklearn.metrics import accuracy_score
        # print('Accuracy:', accuracy_score(y_test, predictions))

        # cross_val_score lives in sklearn.model_selection; the older
        # sklearn.cross_validation module was removed in scikit-learn 0.20, so it
        # is only used as a fallback for installations that predate the move.
        try:
            from sklearn.model_selection import cross_val_score
        except ImportError:
            from sklearn.cross_validation import cross_val_score

        # 5-fold cross-validated score of the classifier on the training rows.
        scores = cross_val_score(classifier, X_train, y_train, cv=5)
        # print(np.mean(scores), scores)

第八步：创建评分卡

 
   
        # Scorecard scaling: score = A - B*log(theta), where theta is the odds
        # predicted by the model (label 1 after the shift is the original code 2).
        # Anchors: P0 = A - B*log(theta0) and P0 - PDO = A - B*log(2*theta0),
        # i.e. doubling the odds costs PDO points, which gives B = PDO/log(2)
        # and A = P0 + B*log(theta0).
        P0 = 600
        PDO = 20
        theta0 = 1.0 / 60
        B = PDO / np.log(2)
        A = P0 + B * np.log(theta0)
        coef = classifier.coef_
        beta0 = classifier.intercept_

        # WOE tables listed in the column order of the feature matrix, so the
        # table at position i belongs to coefficient coef[0][i].
        _score_tables = [
            status_checking_account_woe,
            Duration_in_month_woe,
            Credit_history_woe,
            Purpose_woe,
            Credit_amount_woe,
            Savings_account_woe,
            Present_employment_since_woe,
            Installment_rate_woe,
            Personal_status_and_sex_woe,
            Other_debtors_woe,
            Present_residence_since_woe,
            Property_woe,
            Age_woe,
            Other_installment_plans_woe,
            Housing_woe,
            Number_of_existing_credits_woe,
            Job_woe,
            Number_of_people_woe,
            Telephone_woe,
            foreign_worker_woe,
        ]

        # The constant part (A - B*beta0) is spread evenly over the features.
        # The divisor is the number of tables (20), which is the value the
        # original literal had; beta0[0] takes the scalar out of the length-1
        # intercept array so the base is a plain number.
        _base_points = (A - B * beta0[0]) / len(_score_tables)

        # Points of a class = its share of the constant minus B * coef * woe.
        for _position, _table in enumerate(_score_tables):
            _table['score'] = _base_points - B * coef[0][_position] * _table['woe']
       
 
 

mishidemudong

关注

0
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
#########WOE 评分模型在logistics regression算法在评分卡上的实践

以德国信用数据为例，用logistict regression算法做信用评分卡原理性实现，因此并未考虑feature selection.第一步：导入必要的库123import pandas as pdimport numpy as npfrom sklearn.cross_validation import train_te
复制链接

扫一扫