数据导入及预览
# Load the Kaggle "Give Me Some Credit" training and test sets.
# Assumes pandas was imported earlier as `pd` -- no import is visible in
# this chunk; TODO confirm.
df_train = pd.read_csv("cs-training.csv")
df_test = pd.read_csv("cs-test.csv")
# Quick preview: first rows, dtypes/non-null counts, summary statistics.
df_train.head()
df_train.info()
df_train.describe()
查找缺失值
对于缺失大于15%的数据连续性数据可以尝试使用随机森林进行填充(本次暂时省略)
也可根据特征含义,用中位数,平均数,众数填充
# Per-column missing-value counts, shown both as absolute numbers and as a
# proportion of the training rows.
null_val_sums = df_train.isnull().sum()
pd.DataFrame({"Column": null_val_sums.index, "Number of Null Values": null_val_sums.values,
"Proportion": null_val_sums.values / len(df_train) })
观察y值分布
可以看出不违约的人居多
样本不平衡,需要后续处理。
plt.pie(df_train["SeriousDlqin2yrs"].value_counts(), labels=["Y","N"], autopct='%1.1f%%', startangle=140)
探索性数据分析
尝试观察不同自定义分箱下的违约率,对特征进行探索性分析
# For a hand-chosen binning: per-bin sample counts and per-bin default rate.
def get_compare_plot(data, feature_plot, cut_num):
    """Bin *feature_plot* of *data* by the edges in *cut_num*, then plot the
    per-bin class counts as stacked bars (green = class 0, red = class 1)
    with the per-bin default rate as an orange line on a secondary axis.

    Side effects: writes a "ycut" column onto *data*, shows the figure and
    prints the per-bin pivot table.
    """
    # Human-readable labels such as "0-3", "3-6" for consecutive edge pairs.
    bin_labels = ["{}-{}".format(lo, hi) for lo, hi in zip(cut_num[:-1], cut_num[1:])]
    # Bin the feature; the new column is written back onto the caller's frame.
    data["ycut"] = pd.cut(data[feature_plot], bins=cut_num, labels=bin_labels)
    # Pivot table: rows = bins, columns = target value (0/1), cells = counts.
    feature_data = pd.pivot_table(
        data,
        index=["ycut"],
        values=feature_plot,
        aggfunc=lambda x: len(x),
        columns=["SeriousDlqin2yrs"],
        fill_value=0,
    ).reset_index(drop=False)
    # Default rate within each bin.
    feature_data["r"] = feature_data[1] / (feature_data[0] + feature_data[1])
    # Stacked bars plus a rate line on a twin y-axis.
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot()
    fig.autofmt_xdate()
    ax.bar(feature_data["ycut"], feature_data[0], color="green")
    ax.bar(feature_data["ycut"], feature_data[1], color="red", bottom=feature_data[0])
    ax.set_ylabel('Feature')
    rate_ax = ax.twinx()
    rate_ax.plot(feature_data["ycut"], feature_data["r"], linestyle='-', marker='o', markersize=3, color="orange")
    rate_ax.set_ylim([0, 1])
    rate_ax.set_ylabel('RATE')
    plt.title(feature_plot)
    plt.show()
    print(feature_data)
数据清洗
通过不同分箱观察数据特征,结合业务逻辑,数据走向
选择性剔除异常数据、合并/删除某些特征、特征衍生以及处理缺失值
少量缺失值(5%)以下,可以根据数据含义用众数/中位数/平均数填充
缺失大于15%的数据连续性数据可以尝试使用随机森林进行填充。
样本不平衡处理
数据处理中发现,正负样本比例严重不均衡
通过SMOTE算法对数据进行过采样
# Oversample the minority (default) class with SMOTE to balance the target.
import imblearn
from imblearn.over_sampling import SMOTE
# NOTE(review): X and Y are only assigned further down in this chunk; in a
# notebook this cell presumably ran after they existed -- confirm execution
# order.
train_x = X.reset_index(drop=True)
train_y = Y.reset_index(drop=True)
sm = SMOTE(random_state=42) ## instantiate the oversampler
train_x,train_y = sm.fit_resample(train_x,train_y) ## returns the oversampled feature matrix and labels
train_y.value_counts()
使得违约和不违约的样本数量一致
数据分集
from sklearn.model_selection import train_test_split
# Wrap the resampled arrays back into DataFrames.
X = pd.DataFrame(train_x)
Y = pd.DataFrame(train_y)
# Split into training and validation sets (70/30, fixed seed).
X_train,X_vali,Y_train,Y_vali = train_test_split(X,Y,test_size = 0.3,random_state = 200)
train_data = pd.concat([Y_train,X_train],axis = 1)
# Reset to a clean 0..n-1 index.
train_data.index = range(train_data.shape[0])
vali_data = pd.concat([Y_vali,X_vali],axis = 1)
# Reset to a clean 0..n-1 index.
vali_data.index = range(vali_data.shape[0])
toad分箱
import toad
# Chi-square (ChiMerge) binning via toad; WOE encoding happens later.
combiner = toad.transform.Combiner()
# Fit on the training data: target 'SeriousDlqin2yrs', chi-square merging,
# each bin holding at least 5% of the samples, no excluded columns.
combiner.fit(pd.concat([Y_train,X_train], axis=1), y='SeriousDlqin2yrs',method= 'chi',min_samples = 0.05,exclude=[])
train_adj = combiner.transform(pd.concat([Y_train,X_train], axis=1))
bins = combiner. export()
# Pad each feature's cut points with -inf/+inf so pd.cut covers all values.
hand_bins = {k:[-np.inf,*v[:-1],v[-1],np.inf] for k,v in bins.items()}
woe值计算
## Helper: compute the WOE value of each bin of one feature.
def get_woe(df, col, y, bins):
    """Return a Series of per-bin WOE values for feature *col*.

    Each value is log(good_share / bad_share + 1e-6), where good_share and
    bad_share are the bin's fraction of the class-0 and class-1 totals.
    The small epsilon guards against a zero ratio inside the log.
    """
    frame = df[[col, y]].copy()
    frame["cut"] = pd.cut(frame[col], bins)
    counts = frame.groupby("cut")[y].value_counts().unstack()
    good_share = counts[0] / counts[0].sum()
    bad_share = counts[1] / counts[1].sum()
    counts["woe"] = np.log(good_share / bad_share + 1e-6)
    return counts["woe"]
# Compute every feature's per-bin WOE and cache the Series in a dict keyed
# by feature name.
woeall = {
    col: get_woe(train_data, col, "SeriousDlqin2yrs", edges)
    for col, edges in hand_bins.items()
}
# Notebook-style display of all per-bin WOE values.
woeall
RevolvingUtilizationOfUnsecuredLines,DebtRatio,NumberRealEstateLoansOrLines,NumberOfDependents,NumberOfTimePastDueNotWorse不完全单调
DebtRatio,NumberRealEstateLoansOrLines,NumberOfDependents,NumberOfTimePastDueNotWorse更加严重
toad分箱绘图观察不单调的特征
from toad.plot import badrate_plot, proportion_plot
# NOTE(review): badrate_plot is imported twice (here and below); harmless.
from toad.plot import bin_plot,badrate_plot
# Visualize the per-bin bad-sample rate of RevolvingUtilizationOfUnsecuredLines.
adj_var = 'RevolvingUtilizationOfUnsecuredLines'
# Bars show each bin's share of samples; the red line shows the bad-client rate.
bin_plot(train_adj, target='SeriousDlqin2yrs', x=adj_var)
通过合并,判断特征数值等等,对有问题的特征进行重新分箱
## Manually override the bin edges for features whose bad rate was not
## monotonic under the automatic chi-square binning.
combiner.set_rules({'RevolvingUtilizationOfUnsecuredLines': [0.0396239,
0.115524,
0.18701,
0.307072,
0.470554,
0.696275,
0.882243,
0.999999],'DebtRatio':[0.345,0.49,2.0],
'NumberRealEstateLoansOrLines':[1, 2],
'NumberOfDependents': [0.5,1.5,2.5],
'NumberOfTimePastDueNotWorse': [2.6232e-05,0.66671]})
# Re-bin the training data under the adjusted rules.
data_adj = combiner.transform(pd.concat([Y_train,X_train], axis=1))
# Re-visualize the bad-sample rate after adjustment; iterate until monotone.
adj_var = 'RevolvingUtilizationOfUnsecuredLines'
bin_plot(data_adj, target='SeriousDlqin2yrs', x=adj_var)
# Same check for DebtRatio.
adj_var = 'DebtRatio'
bin_plot(data_adj, target='SeriousDlqin2yrs', x=adj_var)
最终确定分箱,更新模型
# Final adjusted bin edges for the non-monotonic features.
adj_bin ={'RevolvingUtilizationOfUnsecuredLines': [0.0396239,
0.115524,
0.18701,
0.307072,
0.470554,
0.696275,
0.882243,
0.999999],'DebtRatio':[0.345,0.49,2.0],
'NumberRealEstateLoansOrLines':[1, 2],
'NumberOfDependents': [0.5,1.5,2.5],
'NumberOfTimePastDueNotWorse': [2.6232e-05,0.66671]}
# Apply the adjusted binning to the combiner.
combiner.set_rules(adj_bin)
bins = combiner.export()
# Pad each feature's cut points with -inf/+inf so pd.cut covers all values.
hand_bins_ = {k:[-np.inf,*v[:-1],v[-1],np.inf] for k,v in bins. items()}
# Recompute WOE under the adjusted binning and store every feature's WOE
# values in a dict keyed by feature name.
woeall_ = {}
# Iterate the adjusted dict itself. The original looped over the old
# `hand_bins` while indexing `hand_bins_`, which only worked because the two
# dicts happen to share the same keys; iterating `hand_bins_` removes that
# hidden coupling.
for col in hand_bins_:
    woeall_[col] = get_woe(train_data, col, "SeriousDlqin2yrs", hand_bins_[col])
# Notebook-style display of the adjusted per-bin WOE values.
woeall_
训练集woe转化
# Compute WOE on the training set only, to avoid target leakage.
# Initialize the WOE transformer.
transer = toad.transform.WOETransformer()
# Assign every training row to its bin via the fitted/adjusted combiner.
binned_data = combiner.transform(pd.concat([Y_train,X_train], axis=1))
# Map bins to WOE values. Use fit_transform on the training set and plain
# transform on the test/validation set; the target column is excluded.
data_tr_woe = transer.fit_transform(binned_data, binned_data['SeriousDlqin2yrs'], exclude=['SeriousDlqin2yrs'])
data_tr_woe.head()
验证集woe转化
# Bin the validation features first.
binned_data = combiner.transform(X_vali)
# Then map them to the WOE values learned on the training set (transform only).
data_test_woe = transer.transform(binned_data)
data_test_woe.head()
逻辑回归
# Train a logistic regression model.
from sklearn.linear_model import LogisticRegression
# Instantiate.
lr = LogisticRegression(solver='liblinear')
# Fit on the WOE-encoded training set (target column dropped from X).
lr.fit(data_tr_woe.drop(['SeriousDlqin2yrs'],axis=1), data_tr_woe['SeriousDlqin2yrs'])
# Accuracy on the validation set.
# NOTE(review): this relies on data_test_woe's columns matching the order of
# the training features -- confirm the combiner/transformer preserve order.
lr.score(data_test_woe,Y_vali)
from sklearn import metrics
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score,roc_curve,auc,roc_auc_score,mean_squared_error
# 10-fold cross-validation on the training set (column 0 is the target).
from sklearn.model_selection import cross_val_score
score = cross_val_score(lr,data_tr_woe.iloc[:,1:],data_tr_woe.iloc[:,0],cv=10).mean()
print(score)
分别使用L1,L2正则项以及不同的系数计算准确率
# Compare validation accuracy under L1 vs L2 regularization over a grid of C.
c_1 = np.linspace(0.01, 1, 20)
score_1 = []
score_2 = []
for i in c_1:
    # Fit one model per penalty at this C and record its validation accuracy.
    for penalty, bucket in (("l1", score_1), ("l2", score_2)):
        fitted = LogisticRegression(penalty=penalty, solver='liblinear', C=i).fit(
            data_tr_woe.drop(['SeriousDlqin2yrs'], axis=1),
            data_tr_woe['SeriousDlqin2yrs'],
        )
        bucket.append(fitted.score(data_test_woe, Y_vali))
# L1 in blue, L2 in green.
plt.figure()
plt.plot(c_1, score_1, color="blue")
plt.plot(c_1, score_2, color="green")
plt.show()
寻找最佳迭代次数
# Validation accuracy as a function of max_iter, to pick an iteration cap.
score = []
iter_grid = np.arange(1, 20, 1)
for i in iter_grid:
    lr = LogisticRegression(solver='liblinear', max_iter=i).fit(
        data_tr_woe.drop(['SeriousDlqin2yrs'], axis=1),
        data_tr_woe['SeriousDlqin2yrs'],
    )
    score.append(lr.score(data_test_woe, Y_vali))
plt.figure()
plt.plot(iter_grid, score)
plt.show()
绘制ROC曲线
def model_metrics(model, X, y):
    """Plot the ROC curve of *model* on (X, y) and return a metrics dict.

    Returns a dict with accuracy, precision, recall, F1, ROC AUC and the KS
    statistic. Relies on the sklearn.metrics names imported earlier in this
    file and on *model* exposing predict / predict_proba.
    """
    y_pred = model.predict(X)
    # Accuracy: correctly predicted samples / all samples, (TP+TN)/all.
    accuracy = accuracy_score(y, y_pred)
    # Precision: positives predicted positive / all predicted positive,
    # P = TP/(TP+FP).
    precision = precision_score(y, y_pred)
    # Recall: positives predicted positive / all actual positives,
    # R = TP/(TP+FN).
    recall = recall_score(y, y_pred)
    # F1: harmonic mean of precision and recall, 2*P*R/(P+R).
    f1 = f1_score(y, y_pred)
    # ROC curve from the positive-class probabilities.
    # fpr: false positive rate; tpr: true positive rate; thresholds: cutoffs.
    fpr, tpr, thresholds = roc_curve(y, model.predict_proba(X)[:, 1])
    # AUC computed from the same curve that is plotted. (The original passed
    # the hard 0/1 predictions to roc_auc_score, which yields the AUC of a
    # single-threshold classifier and did not match the plotted curve.)
    roc_auc = auc(fpr, tpr)
    # KS statistic: maximum separation between TPR and FPR.
    ks = max(tpr - fpr)
    plt.plot(fpr, tpr, label='ROC Curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
    return {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1': f1, 'ROC AUC': roc_auc,'KS': ks}
# Re-train with the best iteration cap and penalty found above (max_iter=7).
lr = LogisticRegression(solver='liblinear',max_iter=7).fit(data_tr_woe.drop(['SeriousDlqin2yrs'],axis=1), data_tr_woe['SeriousDlqin2yrs'])
# Print the metrics dict and plot the ROC curve for train and validation sets.
print('train ',model_metrics(lr,data_tr_woe.drop(['SeriousDlqin2yrs'],axis=1), data_tr_woe['SeriousDlqin2yrs']))
print('test ',model_metrics(lr,data_test_woe,Y_vali))
评分卡建立
$score = A - B\ln(odds)$

$odds=\dfrac{p}{1-p}$,其中 p 为违约的概率。

在逻辑回归模型中,

$p=y=\dfrac{1}{1+e^{-(wx+b)}}$

因此

$e^{wx+b}=\dfrac{p}{1-p}$

$\ln(odds)=wx+b$

两个假设(一般由公司规定):
1. 假设在某个特定的比率设定特定的预期分值(设定比率为 $\theta$ 的特定点的分值为 P):
   $P=A-B\log(\theta)$
2. 假设比率翻番时分数变化 PDO(比率为 $2\theta$ 的点的分值为 $P-PDO$):
   $P-PDO=A-B\log(2\theta)$

联立方程即可解出 A 和 B。

$score = A-B(wx+b) = (A-Bb)-Bwx$

即 $score = base\_score - Bwx$,其中 $base\_score = A-Bb$。

在建立特征分箱的评分卡时,只需计算该分箱下的 $-Bw \cdot woe$。
# Build the scorecard: each bin of each feature scores -B * w_i * WOE.
# NOTE(review): B (the scaling factor) and lr.coef_ come from earlier cells
# not shown in this chunk -- confirm B was solved from the two score
# assumptions above.
lc = []
for i, col in enumerate(X.columns):
    # Per-bin scores for this feature: WOE Series scaled by -B * coefficient.
    score = woeall_[col] * (-B * lr.coef_[0][i])
    p = pd.DataFrame(score)
    # One scorecard row per bin: (feature, bin label, score).
    for bin_label, bin_score in zip(p.index, p.iloc[:, 0]):
        lc.append([col, str(bin_label), bin_score])
scorecard = pd.DataFrame(lc, columns=["feature", "bins", "score"])