评分卡建模流程

核心代码讲解:

注解:核心代码也参考了前人的代码并加以理解整理;希望读者引用时,附上本文作者地址。

 

def split_cut_result(data, get_col_continuous_cut_points):
    """Discretize the continuous columns of `data` with their cut points.

    Columns without cut points are passed through unchanged.  Missing values
    (NaN, and the 'nan' strings produced by casting cut intervals to str)
    are normalized to the literal string 'null'.
    """
    continuous_cols = {pair[0] for pair in get_col_continuous_cut_points}
    passthrough = [c for c in data.columns if c not in continuous_cols]
    discrete = data[passthrough].copy()
    for col, cut_points in get_col_continuous_cut_points:
        discrete[col] = pd.cut(data[col], cut_points).astype("str")
    discrete = discrete.fillna('null')
    discrete.replace('nan', 'null', inplace=True)
    return discrete

# WOE encoding
def get_woe(data_discrete):
    """Replace each discrete bin label with its WOE value; keep the 'y' target."""
    feature_cols = [c for c in data_discrete.columns if c != 'y']
    data_woe = pd.DataFrame()
    for col in feature_cols:
        woe_table = get_woe_iv(data_discrete, col)
        data_woe[col] = data_discrete[col].replace(list(woe_table['cut_points']),
                                                   list(woe_table['woe']))
    data_woe['y'] = data_discrete['y']
    return data_woe

# Feature reduction via IV threshold, collinearity filtering, and L1 (lasso) logistic regression
def get_iv_corr_logistic_l1_col(data_woe, col_iv, min_iv=0.02, max_corr=0.6, C=0.01, penalty='l1'):
    """Select features in three stages: IV threshold, pairwise correlation, L1 regression.

    Parameters:
        data_woe: DataFrame of WOE-encoded features plus a binary 'y' column.
        col_iv: list of [column, iv] pairs for candidate features.
        min_iv: keep only features with IV strictly above this value.
        max_corr: of each pair with |corr| above this, drop the lower-IV feature.
        C, penalty: regularization settings for the logistic regression.

    Returns:
        List of surviving column names (non-zero L1 coefficients).
    """
    col_iv_filter = [[col, iv] for col, iv in col_iv if iv > min_iv]
    col_filter = [col for col, _ in col_iv_filter]
    data_woe_corr = data_woe[col_filter].corr()
    # Build every ordered feature pair alongside the IV difference.
    pair_rows = []
    for col1, iv1 in col_iv_filter:
        for col2, iv2 in col_iv_filter:
            pair_rows.append([col1, col2, iv1, iv2, iv1 - iv2])
    data_woe_corr_iv = pd.DataFrame(pair_rows, columns=['col1', 'col2', 'iv1', 'iv2', 'iv1_iv2'])
    # BUG FIX: reshape(-1, 1) produces a 2-D (n*n, 1) array; modern pandas
    # rejects 2-D values assigned to a single column.  Flatten to 1-D instead.
    data_woe_corr_iv['corr'] = data_woe_corr.values.ravel()
    # For each highly correlated pair, drop the member with the lower IV
    # (iv1_iv2 < 0 means col1 is the weaker of the two).
    col_delete = data_woe_corr_iv['col1'][(data_woe_corr_iv['corr'] < 1) & (data_woe_corr_iv['corr'] > max_corr) & (
            data_woe_corr_iv['iv1_iv2'] < 0)].unique()
    col_filter_result = [col for col in col_filter if col not in col_delete]

    # L1 regularization zeroes out weak coefficients; keep only non-zero features.
    # NOTE(review): penalty='l1' requires a compatible solver (liblinear/saga)
    # in recent scikit-learn versions — confirm against the installed version.
    lr = linear_model.LogisticRegression(C=C, penalty=penalty).fit(data_woe[col_filter_result], data_woe['y'])
    col_result = [col_filter_result[i] for i in range(len(col_filter_result)) if lr.coef_[0][i] != 0]
    return col_result
def get_Lasso_card(data, get_col_continuous_cut_points, increase_score=50,odds=1/20,base_score=600):
    """Build a scorecard from raw data.

    Discretizes the continuous columns, WOE-encodes every feature, fits an
    L2 logistic regression on the WOE values, and converts each bin's WOE
    into score points with the standard points-to-double-the-odds scaling.

    Parameters:
        data: DataFrame with the feature columns plus a binary 'y' target.
        get_col_continuous_cut_points: list of [column, cut_points] pairs
            for the continuous columns.
        increase_score: points added when the odds double.
        odds: odds anchored at base_score.
        base_score: score assigned at the anchor odds.

    Returns:
        DataFrame with one row per (column, bin): counts, WOE, IV,
        regression coefficient, intercept and the bin's score points.
    """
    col_types = get_colname_d_continue(data)
    col_result = [i for i in data.columns if i != 'y']
    data_discrete = split_cut_result(data, get_col_continuous_cut_points)  
    data_woe = get_woe(data_discrete)  
      
    lr = linear_model.LogisticRegression(C=1, penalty='l2')
    lr.fit(data_woe[col_result], data_woe['y'])
    # b: points per doubling of the odds; a: offset anchoring base_score at `odds`.
    b = increase_score / np.log(2)
    a = base_score + b * np.log(odds)  # NOTE(review): 'a' is unused here; the base score is applied at prediction time
    score_card = pd.DataFrame()
    for col in col_result:
        col_cut_point_woe = get_woe_iv(data_discrete, col)
        col_cut_point_woe['col'] = col
        score_card = pd.concat([score_card, col_cut_point_woe])
    # Coefficients are passed as the index and column names as the data, so after
    # reset_index the first column holds the coefficients, the second the names.
    col_coef = pd.DataFrame(col_result, lr.coef_[0]).reset_index()
    col_coef.columns = ['col_coef', 'col']
    score_card['lr_intercept'] = lr.intercept_[0]
    score_card = pd.merge(score_card, col_coef, on=['col'], how='left')
    # Each bin's contribution to the total score: -b * coefficient * WOE.
    score_card['score'] = score_card['woe'] * score_card['col_coef'] * (- b)
    score_card = pd.merge(score_card, pd.DataFrame(col_types, columns=['col', 'type']), on='col', how='left')
    score_card = pd.merge(score_card, pd.DataFrame(get_col_continuous_cut_points, columns=['col', 'cuts']), on='col',how='left')

        
    # Attach an ordinal id per bin so the card can be sorted in bin order.
    data_cut_points_id = pd.DataFrame()
    for col, cut_point in get_col_continuous_cut_points:
        result = pd.DataFrame()
        result['cut_points'] = pd.cut(data[col], cut_point).astype('str').unique()
        # _codes gives the categorical code (bin order) of each unique interval.
        result['cut_points_id'] = pd.cut(data[col], cut_point).unique()._codes
        result['cut_points'].replace('nan', 'null', inplace=True)
        result['col'] = col
        data_cut_points_id = pd.concat([data_cut_points_id, result])
    score_card = pd.merge(score_card, data_cut_points_id, on=['col', 'cut_points'], how='left').sort_values(
            ['col', 'cut_points_id', 'cut_points'])

    score_card = score_card[
            ['col', 'type', 'cuts', 'cut_points', '1_num', '0_num', 'total_num', '1_pct', '0_pct', 'total_pct',
             '1_rate', 'woe', 'iv', 'total_iv', 'col_coef', 'lr_intercept', 'score']].reset_index(drop=True)
    return score_card
def predict_score_proba(data, score_card, increase_score=50,odds=1/20,base_score=600):
    """Score each row of `data` with the scorecard and derive a probability.

    The scaling parameters must match those used when the card was built.
    Returns a DataFrame with per-feature score columns, the total 'score'
    and the back-computed 'proba'.
    """
    b = increase_score / np.log(2)
    a = base_score + b * np.log(odds) 
    # Base points: scaling anchor minus the intercept's contribution.
    basescore = round(a - b * score_card['lr_intercept'][0], 0)
    col_result = score_card['col'].unique().tolist() + ['y']
    get_col_continuous_cut_points = score_card[['col', 'cuts']][score_card['type'] == 'continuous'].drop_duplicates('col').values.tolist()
    data_discrete = split_cut_result(data[col_result], get_col_continuous_cut_points)
    # NOTE(review): assumes `data` carries a 'MOBILE' identifier column — confirm against callers.
    data_score_proba = data[['MOBILE']]
    for col in score_card['col'].unique():
        col_score = col + 'score'
        cut_points = score_card['cut_points'][score_card['col'] == col].tolist()
        score = score_card['score'][score_card['col'] == col].tolist()
        # Map each bin label to its score points.
        data_score_proba[col_score] = data_discrete[col].replace(cut_points, score)
    # NOTE(review): sum(axis=1) includes 'MOBILE' if that column is numeric — verify.
    data_score_proba['score'] = data_score_proba.sum(axis=1)+ basescore
    # Invert the score scaling back into a probability.
    data_score_proba['proba'] =  1 / (1 + np.e ** ((data_score_proba['score'] - a) / b))
    return data_score_proba
  
def score(data, score_card):
    """Evaluate the scorecard on `data`; return its AUC and KS statistics."""
    proba = predict_score_proba(data, score_card)['proba']
    fpr, tpr, _ = roc_curve(data['y'], proba)
    return {'auc': auc(fpr, tpr), 'ks': max(tpr - fpr)}

# Plot ROC and KS curves

def graph_roc_ks(data, score_card):
    """Plot the ROC curve (left) and the KS curve over score deciles (right).

    Shows the figure via matplotlib; returns nothing.
    """
    data_score_proba = predict_score_proba(data, score_card)
    # drop_intermediate=False keeps every threshold so deciles can be matched below.
    false_positive_rate, recall, thresholds = roc_curve(data['y'], data_score_proba['proba'],drop_intermediate=False)
    roc_auc = auc(false_positive_rate, recall)
    plt.figure(figsize=(10, 5))

      
    plt.subplot(121)
    plt.title('ROC')
    plt.plot(false_positive_rate, recall, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('false_positive_rate')
    plt.ylabel('recall')

       
    plt.subplot(122)
    # Split the sorted probabilities into 10 equal-size buckets (plus the last point)
    # and find the ROC threshold closest to each decile boundary.
    pre = sorted(data_score_proba['proba'], reverse=True)
    num = [(i) * int(len(pre) / 10) for i in range(10)]
    num = num + [(len(pre) - 1)]
    ks_thresholds = [max(thresholds[thresholds <= pre[i]]) for i in num]
    data_ks = pd.DataFrame([false_positive_rate, recall, thresholds]).T
    data_ks.columns = ['fpr', 'tpr', 'thresholds']
    # Keep only the rows at the decile thresholds.
    data_ks = pd.merge(data_ks, pd.DataFrame(ks_thresholds, columns=['thresholds']), on='thresholds', how='inner')
    ks = max(recall - false_positive_rate)
    plt.title('KS')
    plt.plot(np.array(range(len(num))), data_ks['tpr'])
    plt.plot(np.array(range(len(num))), data_ks['fpr'])
    plt.plot(np.array(range(len(num))), data_ks['tpr'] - data_ks['fpr'], label='K-S = %0.4f' % ks)
    plt.legend(loc='lower right')
    plt.xlim([0, 10])
    plt.ylim([0.0, 1.0])
    plt.xlabel('label')
    plt.show()
# Continuous variables can be binned directly afterwards; nominal variables may
# need ordering (ordinal) or transformation (unordered).
def get_colname_d_continue(data):
    """Classify every non-target column as 'continuous' or 'discrete' by dtype."""
    col_type = []
    for name, dtype in data.dtypes.items():
        if name == 'y':
            continue
        kind = 'continuous' if str(dtype) in ('int64', 'float64') else 'discrete'
        col_type.append([name, kind])
    return col_type

# Use a decision tree to pick split points for a continuous column
def split_tree_cut( data, col, max_depth=None, max_leaf_nodes=5, min_samples_leaf=0.05):
    """Derive cut points for one continuous column from an entropy decision tree.

    Rows where `col` is null are ignored.  Returns [-inf, p1, ..., inf] when
    the column has at least two distinct values, otherwise an empty list.
    """
    not_null = data[[col, 'y']][data[col].notnull()]
    cut_point = []
    if len(np.unique(not_null[col])) <= 1:
        return cut_point
    features = not_null[col].values.reshape(-1, 1)
    target = not_null['y'].values
    tree = DecisionTreeClassifier(criterion='entropy',
                                  max_depth=max_depth,
                                  max_leaf_nodes=max_leaf_nodes,
                                  min_samples_leaf=min_samples_leaf)
    tree.fit(features, target)
    distinct_values = np.unique(features)
    for threshold in np.unique(tree.tree_.threshold):
        # sklearn marks leaf nodes with threshold == -2; skip those.
        if threshold != -2:
            # Snap each split to the largest observed value below it, rounded.
            cut_point.append(np.round(max(distinct_values[distinct_values < threshold]), 2))
    cut_point = [float(str(p)) for p in cut_point]
    return [-inf] + cut_point + [inf]
    

def get_woe_iv(data_discrete, col):
    """Compute per-bin counts, WOE and IV for one discretized column.

    Bins containing a single class produce infinite WOE/IV; those infinities
    are replaced by 0 so downstream code tolerates unseen/degenerate bins.
    """
    grouped = data_discrete.groupby(col)['y']
    result = grouped.agg([('1_num', lambda y: (y == 1).sum()),
                          ('0_num', lambda y: (y == 0).sum()),
                          ('total_num', 'count')]).reset_index()
    total_pos = result['1_num'].sum()
    total_neg = result['0_num'].sum()
    result['1_pct'] = result['1_num'] / total_pos
    result['0_pct'] = result['0_num'] / total_neg
    result['total_pct'] = result['total_num'] / result['total_num'].sum()
    result['1_rate'] = result['1_num'] / result['total_num']
    # Weight of evidence and information value per bin.
    result['woe'] = np.log(result['1_pct'] / result['0_pct'])
    result['iv'] = (result['1_pct'] - result['0_pct']) * result['woe']
    result['total_iv'] = result['iv'].sum()
    # Neutralize +/-inf produced by one-class bins.
    result.replace([-inf, inf], [0, 0], inplace=True)
    return result.rename(columns={col: "cut_points"})

  #得到IV信息
def get_iv(data):
    """Return [column, total IV] pairs for every non-target column of `data`."""
    return [[col, get_woe_iv(data, col)['iv'].sum()]
            for col in data.columns if col != 'y']

# Compute predicted scores and probabilities

def get_predict_score_proba(data, score_card, increase_score=50,odds=1/20,base_score=600):
    """Score rows of `data` with the scorecard.

    Scaling parameters must match the ones used to build the card.  Returns
    a DataFrame with per-feature score columns, the total 'score' and 'proba'.
    """
    b = increase_score / np.log(2)
    a = base_score + b * np.log(odds)
    base_points = round(a - b * score_card['lr_intercept'][0], 0)
    used_cols = score_card['col'].unique().tolist()
    continuous_cuts = score_card[['col', 'cuts']][score_card['type'] == 'continuous'].drop_duplicates('col').values.tolist()
    discretized = split_cut_result(data[used_cols], continuous_cuts)
    output = data[['MOBILE']]
    for col in score_card['col'].unique():
        mask = score_card['col'] == col
        # Map each bin label to its score points from the card.
        output[col + 'score'] = discretized[col].replace(
            score_card['cut_points'][mask].tolist(),
            score_card['score'][mask].tolist())
    output['score'] = output.sum(axis=1) + base_points
    output['proba'] = 1 / (1 + np.e ** ((output['score'] - a) / b))
    return output

 

 

if __name__ == "__main__":
    # BUG FIX: the original guard was `if __main__:` (a NameError) and the body
    # was not indented under it; use the standard entry-point idiom.
    # NOTE(review): data_train, test_df and get_col_continuous_cut_points must be
    # defined/loaded before this section runs — they are not created in this file.
    col_name = ['col1', 'col2', 'col3']

    col_result1 = col_name

    # Keep only the cut points belonging to the selected columns.
    get_continuous_cut_points = [col for col in get_col_continuous_cut_points if col[0] in col_result1]
    score_card = get_Lasso_card(data_train[col_name + ['y']], get_continuous_cut_points,
                                increase_score=50, base_score=600)

    graph_roc_ks(data_train, score_card)

    score_card.to_csv('评分卡.csv')
    test_df_score = get_predict_score_proba(test_df, score_card, increase_score=50, odds=1/20, base_score=600)
    test_df_score.to_csv('result.csv')

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值