# Core code walkthrough:
# Note: this core code builds on earlier authors' work; if you quote it,
# please also credit the original author's page.
def split_cut_result(data, get_col_continuous_cut_points):
    """Discretize the continuous columns of ``data`` using the given cut points.

    ``get_col_continuous_cut_points`` is a list of ``[column, cut_points]``
    pairs; those columns are binned via ``pd.cut`` and stringified, every
    other column is copied through unchanged.  Missing values in either kind
    of column end up as the literal string ``'null'``.
    """
    continuous = {pair[0] for pair in get_col_continuous_cut_points}
    passthrough = [c for c in data.columns if c not in continuous]
    result = data[passthrough].copy()
    for col, cut_points in get_col_continuous_cut_points:
        result[col] = pd.cut(data[col], cut_points).astype("str")
    # pd.cut renders NaN as the string 'nan'; normalise both spellings of
    # "missing" to 'null'.
    result = result.fillna('null')
    result.replace('nan', 'null', inplace=True)
    return result
# WOE-encode every discretized feature column.
def get_woe(data_discrete):
    """Replace each category of every non-target column by its WOE value.

    The target column 'y' is carried over unchanged.
    """
    data_woe = pd.DataFrame()
    for col in (c for c in data_discrete.columns if c != 'y'):
        woe_table = get_woe_iv(data_discrete, col)
        data_woe[col] = data_discrete[col].replace(
            list(woe_table['cut_points']), list(woe_table['woe']))
    data_woe['y'] = data_discrete['y']
    return data_woe
# Feature reduction in three passes: IV threshold, pairwise correlation,
# then an L1-penalised logistic regression.
def get_iv_corr_logistic_l1_col(data_woe, col_iv, min_iv=0.02, max_corr=0.6, C=0.01, penalty='l1'):
    """Select feature columns of ``data_woe``.

    1. Drop columns whose IV is not above ``min_iv``.
    2. For every highly correlated pair (corr in (max_corr, 1)), drop the
       member with the lower IV.
    3. Keep only columns with a non-zero L1 logistic-regression coefficient.
    """
    kept = [[col, iv] for col, iv in col_iv if iv > min_iv]
    kept_cols = [col for col, _ in kept]
    # Flattened correlation matrix lines up with the cartesian product below.
    corr_flat = data_woe[kept_cols].corr().values.reshape(-1, 1)
    pairs = [[c1, c2, iv1, iv2, iv1 - iv2]
             for c1, iv1 in kept
             for c2, iv2 in kept]
    pair_df = pd.DataFrame(pairs, columns=['col1', 'col2', 'iv1', 'iv2', 'iv1_iv2'])
    pair_df['corr'] = corr_flat
    highly_correlated = (pair_df['corr'] < 1) & (pair_df['corr'] > max_corr)
    weaker_of_pair = pair_df['iv1_iv2'] < 0
    to_drop = pair_df['col1'][highly_correlated & weaker_of_pair].unique()
    survivors = [c for c in kept_cols if c not in to_drop]
    lr = linear_model.LogisticRegression(C=C, penalty=penalty).fit(
        data_woe[survivors], data_woe['y'])
    return [survivors[i] for i in range(len(survivors)) if lr.coef_[0][i] != 0]
def get_Lasso_card(data, get_col_continuous_cut_points, increase_score=50,odds=1/20,base_score=600):
    """Fit an L2 logistic regression on WOE-encoded features and build a score card.

    Parameters
    ----------
    data : DataFrame with the feature columns plus the binary target 'y'.
    get_col_continuous_cut_points : list of [column, cut_points] pairs used to
        discretize the continuous columns.
    increase_score : points added to the score when the odds double (PDO).
    odds : reference odds anchoring ``base_score`` (1:20 by default).
    base_score : score assigned at the reference odds.

    Returns
    -------
    DataFrame with one row per (column, bin): counts, rates, WOE/IV, the model
    coefficient and intercept, and the per-bin score contribution.
    """
    col_types = get_colname_d_continue(data)
    col_result = [i for i in data.columns if i != 'y']
    # Discretize continuous columns, then WOE-encode every feature.
    data_discrete = split_cut_result(data, get_col_continuous_cut_points)
    data_woe = get_woe(data_discrete)
    lr = linear_model.LogisticRegression(C=1, penalty='l2')
    lr.fit(data_woe[col_result], data_woe['y'])
    # b points per log(2) of odds; a anchors base_score at the reference odds.
    b = increase_score / np.log(2)
    a = base_score + b * np.log(odds)  # NOTE(review): computed but unused here
    score_card = pd.DataFrame()
    for col in col_result:
        col_cut_point_woe = get_woe_iv(data_discrete, col)
        col_cut_point_woe['col'] = col
        score_card = pd.concat([score_card, col_cut_point_woe])
    # Coefficients are placed on the index, so reset_index yields [coef, name];
    # the columns are then relabelled accordingly.
    col_coef = pd.DataFrame(col_result, lr.coef_[0]).reset_index()
    col_coef.columns = ['col_coef', 'col']
    score_card['lr_intercept'] = lr.intercept_[0]
    score_card = pd.merge(score_card, col_coef, on=['col'], how='left')
    # Per-bin score contribution: -b * coefficient * WOE.
    score_card['score'] = score_card['woe'] * score_card['col_coef'] * (- b)
    score_card = pd.merge(score_card, pd.DataFrame(col_types, columns=['col', 'type']), on='col', how='left')
    score_card = pd.merge(score_card, pd.DataFrame(get_col_continuous_cut_points, columns=['col', 'cuts']), on='col',how='left')
    # Attach an ordinal bin id per continuous column so bins sort in cut-point
    # order rather than lexicographically.
    data_cut_points_id = pd.DataFrame()
    for col, cut_point in get_col_continuous_cut_points:
        result = pd.DataFrame()
        result['cut_points'] = pd.cut(data[col], cut_point).astype('str').unique()
        result['cut_points_id'] = pd.cut(data[col], cut_point).unique()._codes
        result['cut_points'].replace('nan', 'null', inplace=True)
        result['col'] = col
        data_cut_points_id = pd.concat([data_cut_points_id, result])
    score_card = pd.merge(score_card, data_cut_points_id, on=['col', 'cut_points'], how='left').sort_values(
        ['col', 'cut_points_id', 'cut_points'])
    score_card = score_card[
        ['col', 'type', 'cuts', 'cut_points', '1_num', '0_num', 'total_num', '1_pct', '0_pct', 'total_pct',
         '1_rate', 'woe', 'iv', 'total_iv', 'col_coef', 'lr_intercept', 'score']].reset_index(drop=True)
    return score_card
def predict_score_proba(data, score_card, increase_score=50,odds=1/20,base_score=600):
    """Score ``data`` with a fitted score card.

    Returns a DataFrame keyed by 'MOBILE' with one '<col>score' column per
    feature, the total 'score', and the implied default probability 'proba'.
    ``data`` must contain every score-card column plus 'y' and 'MOBILE'.
    """
    # b points per doubling of the odds; a anchors base_score at the
    # reference odds.
    b = increase_score / np.log(2)
    a = base_score + b * np.log(odds)
    basescore = round(a - b * score_card['lr_intercept'][0], 0)
    col_result = score_card['col'].unique().tolist() + ['y']
    get_col_continuous_cut_points = score_card[['col', 'cuts']][
        score_card['type'] == 'continuous'].drop_duplicates('col').values.tolist()
    data_discrete = split_cut_result(data[col_result], get_col_continuous_cut_points)
    # BUG FIX: .copy() so the added columns don't write through a slice of
    # `data` (SettingWithCopy).
    data_score_proba = data[['MOBILE']].copy()
    score_cols = []
    for col in score_card['col'].unique():
        col_score = col + 'score'
        score_cols.append(col_score)
        cut_points = score_card['cut_points'][score_card['col'] == col].tolist()
        score = score_card['score'][score_card['col'] == col].tolist()
        data_score_proba[col_score] = data_discrete[col].replace(cut_points, score)
    # BUG FIX: sum only the per-feature score columns; the original summed the
    # whole frame, which silently included MOBILE whenever it is numeric.
    data_score_proba['score'] = data_score_proba[score_cols].sum(axis=1) + basescore
    data_score_proba['proba'] = 1 / (1 + np.e ** ((data_score_proba['score'] - a) / b))
    return data_score_proba
def score(data, score_card):
    """Return ``{'auc': ..., 'ks': ...}`` for ``data`` scored with ``score_card``."""
    scored = predict_score_proba(data, score_card)
    fpr, tpr, _ = roc_curve(data['y'], scored['proba'])
    # KS statistic: the maximum gap between the TPR and FPR curves.
    return {'auc': auc(fpr, tpr), 'ks': max(tpr - fpr)}
# Plot ROC and KS curves side by side.
def graph_roc_ks(data, score_card):
    """Draw the ROC curve (left) and a decile-based KS plot (right).

    ``data`` is scored with ``score_card`` via predict_score_proba; the plot
    is shown with plt.show() and nothing is returned.
    """
    data_score_proba = predict_score_proba(data, score_card)
    false_positive_rate, recall, thresholds = roc_curve(data['y'], data_score_proba['proba'],drop_intermediate=False)
    roc_auc = auc(false_positive_rate, recall)
    plt.figure(figsize=(10, 5))
    plt.subplot(121)
    plt.title('ROC')
    plt.plot(false_positive_rate, recall, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('false_positive_rate')
    plt.ylabel('recall')
    plt.subplot(122)
    # Decile positions: ten evenly spaced ranks through the probabilities
    # sorted descending, plus the final observation.
    pre = sorted(data_score_proba['proba'], reverse=True)
    num = [(i) * int(len(pre) / 10) for i in range(10)]
    num = num + [(len(pre) - 1)]
    # For each decile boundary, the largest ROC threshold not above it.
    ks_thresholds = [max(thresholds[thresholds <= pre[i]]) for i in num]
    data_ks = pd.DataFrame([false_positive_rate, recall, thresholds]).T
    data_ks.columns = ['fpr', 'tpr', 'thresholds']
    # Keep only the ROC rows that fall on the decile thresholds.
    data_ks = pd.merge(data_ks, pd.DataFrame(ks_thresholds, columns=['thresholds']), on='thresholds', how='inner')
    ks = max(recall - false_positive_rate)
    plt.title('KS')
    plt.plot(np.array(range(len(num))), data_ks['tpr'])
    plt.plot(np.array(range(len(num))), data_ks['fpr'])
    plt.plot(np.array(range(len(num))), data_ks['tpr'] - data_ks['fpr'], label='K-S = %0.4f' % ks)
    plt.legend(loc='lower right')
    plt.xlim([0, 10])
    plt.ylim([0.0, 1.0])
    plt.xlabel('label')
    plt.show()
# Continuous columns can be binned directly afterwards; ordinal discrete
# columns may need ordering and nominal ones an encoding step first.
def get_colname_d_continue(data):
    """Classify each non-target column of ``data``.

    Returns [[column, kind], ...] where kind is 'continuous' for int64 /
    float64 dtypes and 'discrete' otherwise; the 'y' column is skipped.
    """
    col_type = []
    for name, dtype in zip(data.columns, data.dtypes):
        if name == 'y':
            continue
        kind = 'continuous' if str(dtype) in ('int64', 'float64') else 'discrete'
        col_type.append([name, kind])
    return col_type
# Use a decision tree to propose cut points for one continuous column.
def split_tree_cut( data, col, max_depth=None, max_leaf_nodes=5, min_samples_leaf=0.05):
    """Return ``[-inf, *split_points, inf]`` for ``col``.

    Rows where ``col`` is null are ignored.  An entropy decision tree is fit
    on the remaining values against 'y'; every internal-node threshold is
    snapped down to the nearest observed value (rounded to 2 decimals).  With
    fewer than two distinct values, only the two infinities are returned.
    """
    not_null = data[[col, 'y']][data[col].notnull()]
    cut_point = []
    if len(np.unique(not_null[col])) > 1:
        features = not_null[col].values.reshape(-1, 1)
        target = not_null['y'].values
        tree = DecisionTreeClassifier(criterion='entropy',
                                      max_depth=max_depth,
                                      max_leaf_nodes=max_leaf_nodes,
                                      min_samples_leaf=min_samples_leaf)
        tree.fit(features, target)
        distinct_values = np.unique(features)
        # tree_.threshold holds -2 for leaf nodes; keep real splits only.
        for threshold in np.unique(tree.tree_.threshold):
            if threshold != -2:
                cut_point.append(np.round(max(distinct_values[distinct_values < threshold]), 2))
        # Round-trip through str to coerce numpy scalars to plain floats.
        cut_point = [float(str(p)) for p in cut_point]
    return [-inf] + cut_point + [inf]
# Per-category WOE/IV table, tolerant of one-class categories in test data.
def get_woe_iv(data_discrete, col):
    """Compute per-category counts, WOE and IV for ``col`` against target 'y'.

    Returns one row per category ('cut_points') with positive/negative counts
    and shares, the event rate, WOE, IV and the column's total IV.  Infinite
    values (categories containing a single class) are replaced by 0 so that
    downstream arithmetic stays finite.
    """
    table = data_discrete.groupby(col)['y'].agg(
        [('1_num', lambda y: (y == 1).sum()),
         ('0_num', lambda y: (y == 0).sum()),
         ('total_num', 'count')]).reset_index()
    positives, negatives = table['1_num'], table['0_num']
    table['1_pct'] = positives / positives.sum()
    table['0_pct'] = negatives / negatives.sum()
    table['total_pct'] = table['total_num'] / table['total_num'].sum()
    table['1_rate'] = positives / table['total_num']
    table['woe'] = np.log(table['1_pct'] / table['0_pct'])
    table['iv'] = (table['1_pct'] - table['0_pct']) * table['woe']
    table['total_iv'] = table['iv'].sum()
    table.replace([-inf, inf], [0, 0], inplace=True)
    return table.rename(columns={col: "cut_points"})
# Collect the total IV of every feature column.
def get_iv(data):
    """Return ``[[column, total_iv], ...]`` for every non-target column."""
    return [[col, get_woe_iv(data, col)['iv'].sum()]
            for col in data.columns if col != 'y']
# Batch scoring: per-row score and predicted probability (no 'y' required).
def get_predict_score_proba(data, score_card, increase_score=50,odds=1/20,base_score=600):
    """Score ``data`` with a fitted score card (prediction-time variant).

    Unlike predict_score_proba this does not require a 'y' column.  Returns a
    DataFrame keyed by 'MOBILE' with one '<col>score' column per feature, the
    total 'score', and the implied probability 'proba'.
    """
    # b points per doubling of the odds; a anchors base_score at the
    # reference odds.
    b = increase_score / np.log(2)
    a = base_score + b * np.log(odds)
    basescore = round(a - b * score_card['lr_intercept'][0], 0)
    col_result = score_card['col'].unique().tolist()
    get_col_continuous_cut_points = score_card[['col', 'cuts']][
        score_card['type'] == 'continuous'].drop_duplicates('col').values.tolist()
    data_discrete = split_cut_result(data[col_result], get_col_continuous_cut_points)
    # BUG FIX: .copy() so the added columns don't write through a slice of
    # `data` (SettingWithCopy).
    data_score_proba = data[['MOBILE']].copy()
    score_cols = []
    for col in score_card['col'].unique():
        col_score = col + 'score'
        score_cols.append(col_score)
        cut_points = score_card['cut_points'][score_card['col'] == col].tolist()
        score = score_card['score'][score_card['col'] == col].tolist()
        data_score_proba[col_score] = data_discrete[col].replace(cut_points, score)
    # BUG FIX: sum only the per-feature score columns; the original summed the
    # whole frame, which silently included MOBILE whenever it is numeric.
    data_score_proba['score'] = data_score_proba[score_cols].sum(axis=1) + basescore
    data_score_proba['proba'] = 1 / (1 + np.e ** ((data_score_proba['score'] - a) / b))
    return data_score_proba
if __name__ == '__main__':
    # BUG FIX: the original `if __main__:` raises NameError at import time;
    # the standard entry-point guard is `__name__ == '__main__'`.
    # NOTE(review): `get_col_continuous_cut_points`, `data_train` and
    # `test_df` must be defined elsewhere (e.g. an interactive session or a
    # part of the file not shown here) before this runs.
    col_name = ['col1', 'col2', 'col3']
    col_result1 = col_name
    get_continuous_cut_points = [col for col in get_col_continuous_cut_points if col[0] in col_result1]
    score_card = get_Lasso_card(data_train[col_name + ['y']], get_continuous_cut_points, increase_score=50, base_score=600)
    graph_roc_ks(data_train, score_card)
    score_card.to_csv('评分卡.csv')
    test_df_score = get_predict_score_proba(test_df, score_card, increase_score=50,odds=1/20,base_score=600)
    test_df_score.to_csv('result.csv')