数据导入及预览
# Load the Kaggle "Give Me Some Credit" training and test sets.
# Assumes pandas was imported earlier as `pd` -- no import is visible in
# this chunk; TODO confirm.
df_train = pd.read_csv("cs-training.csv")
df_test = pd.read_csv("cs-test.csv")
# Quick preview: first rows, dtypes/non-null counts, summary statistics.
df_train.head()
df_train.info()
df_train.describe()
查找缺失值
对于缺失大于15%的数据连续性数据可以尝试使用随机森林进行填充(本次暂时省略)
也可根据特征含义,用中位数,平均数,众数填充
# Per-column missing-value counts, shown both as absolute numbers and as a
# proportion of the training rows.
null_val_sums = df_train.isnull().sum()
pd.DataFrame({"Column": null_val_sums.index, "Number of Null Values": null_val_sums.values,
"Proportion": null_val_sums.values / len(df_train) })
观察y值分布
可以看出不违约的人居多
样本不平衡,需要后续处理。
plt.pie(df_train["SeriousDlqin2yrs"].value_counts(), labels=["Y","N"], autopct='%1.1f%%', startangle=140)
探索性数据分析
尝试观察不同自定义分箱下的违约率,对特征进行探索性分析
# For a hand-chosen binning: per-bin sample counts and per-bin default rate.
def get_compare_plot(data, feature_plot, cut_num):
    """Bin *feature_plot* of *data* by the edges in *cut_num*, then plot the
    per-bin class counts as stacked bars (green = class 0, red = class 1)
    with the per-bin default rate as an orange line on a secondary axis.

    Side effects: writes a "ycut" column onto *data*, shows the figure and
    prints the per-bin pivot table.
    """
    # Human-readable labels such as "0-3", "3-6" for consecutive edge pairs.
    bin_labels = ["{}-{}".format(lo, hi) for lo, hi in zip(cut_num[:-1], cut_num[1:])]
    # Bin the feature; the new column is written back onto the caller's frame.
    data["ycut"] = pd.cut(data[feature_plot], bins=cut_num, labels=bin_labels)
    # Pivot table: rows = bins, columns = target value (0/1), cells = counts.
    feature_data = pd.pivot_table(
        data,
        index=["ycut"],
        values=feature_plot,
        aggfunc=lambda x: len(x),
        columns=["SeriousDlqin2yrs"],
        fill_value=0,
    ).reset_index(drop=False)
    # Default rate within each bin.
    feature_data["r"] = feature_data[1] / (feature_data[0] + feature_data[1])
    # Stacked bars plus a rate line on a twin y-axis.
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot()
    fig.autofmt_xdate()
    ax.bar(feature_data["ycut"], feature_data[0], color="green")
    ax.bar(feature_data["ycut"], feature_data[1], color="red", bottom=feature_data[0])
    ax.set_ylabel('Feature')
    rate_ax = ax.twinx()
    rate_ax.plot(feature_data["ycut"], feature_data["r"], linestyle='-', marker='o', markersize=3, color="orange")
    rate_ax.set_ylim([0, 1])
    rate_ax.set_ylabel('RATE')
    plt.title(feature_plot)
    plt.show()
    print(feature_data)
数据清洗
通过不同分箱观察数据特征,结合业务逻辑,数据走向
选择性剔除异常数据、合并/删除某些特征、特征衍生以及处理缺失值
少量缺失值(5%)以下,可以根据数据含义用众数/中位数/平均数填充
缺失大于15%的数据连续性数据可以尝试使用随机森林进行填充。
样本不平衡处理
数据处理中发现,正负样本比例严重不均衡
通过SMOTE算法对数据进行过采样
# Oversample the minority (default) class with SMOTE to balance the target.
import imblearn
from imblearn.over_sampling import SMOTE
# NOTE(review): X and Y are only assigned further down in this chunk; in a
# notebook this cell presumably ran after they existed -- confirm execution
# order.
train_x = X.reset_index(drop=True)
train_y = Y.reset_index(drop=True)
sm = SMOTE(random_state=42) ## instantiate the oversampler
train_x,train_y = sm.fit_resample(train_x,train_y) ## returns the oversampled feature matrix and labels
train_y.value_counts()
使得违约和不违约的样本数量一致
数据分集
from sklearn.model_selection import train_test_split
# Wrap the resampled arrays back into DataFrames.
X = pd.DataFrame(train_x)
Y = pd.DataFrame(train_y)
# Split into training and validation sets (70/30, fixed seed).
X_train,X_vali,Y_train,Y_vali = train_test_split(X,Y,test_size = 0.3,random_state = 200)
train_data = pd.concat([Y_train,X_train],axis = 1)
# Reset to a clean 0..n-1 index.
train_data.index = range(train_data.shape[0])
vali_data = pd.concat([Y_vali,X_vali],axis = 1)
# Reset to a clean 0..n-1 index.
vali_data.index = range(vali_data.shape[0])
toad分箱
import toad
# Chi-square (ChiMerge) binning via toad; WOE encoding happens later.
combiner = toad.transform.Combiner()
# Fit on the training data: target 'SeriousDlqin2yrs', chi-square merging,
# each bin holding at least 5% of the samples, no excluded columns.
combiner.fit(pd.concat([Y_train,X_train], axis=1), y='SeriousDlqin2yrs',method= 'chi',min_samples = 0.05,exclude=[])
train_adj = combiner.transform(pd.concat([Y_train,X_train], axis=1))
bins = combiner. export()
# Pad each feature's cut points with -inf/+inf so pd.cut covers all values.
hand_bins = {k:[-np.inf,*v[:-1],v[-1],np.inf] for k,v in bins.items()}
woe值计算
## Helper: compute the WOE value of each bin of one feature.
def get_woe(df, col, y, bins):
    """Return a Series of per-bin WOE values for feature *col*.

    Each value is log(good_share / bad_share + 1e-6), where good_share and
    bad_share are the bin's fraction of the class-0 and class-1 totals.
    The small epsilon guards against a zero ratio inside the log.
    """
    frame = df[[col, y]].copy()
    frame["cut"] = pd.cut(frame[col], bins)
    counts = frame.groupby("cut")[y].value_counts().unstack()
    good_share = counts[0] / counts[0].sum()
    bad_share = counts[1] / counts[1].sum()
    counts["woe"] = np.log(good_share / bad_share + 1e-6)
    return counts["woe"]
# Compute every feature's per-bin WOE and cache the Series in a dict keyed
# by feature name.
woeall = {
    col: get_woe(train_data, col, "SeriousDlqin2yrs", edges)
    for col, edges in hand_bins.items()
}
# Notebook-style display of all per-bin WOE values.
woeall
RevolvingUtilizationOfUnsecuredLines,DebtRatio,NumberRealEstateLoansOrLines,NumberOfDependents,NumberOfTimePastDueNotWorse不完全单调
DebtRatio,NumberRealEstateLoansOrLines,NumberOfDependents,NumberOfTimePastDueNotWorse更加严重
toad分箱绘图观察不单调的特征
from toad.plot import badrate_plot, proportion_plot
# NOTE(review): badrate_plot is imported twice (here and below); harmless.
from toad.plot import bin_plot,badrate_plot
# Visualize the per-bin bad-sample rate of RevolvingUtilizationOfUnsecuredLines.
adj_var = 'RevolvingUtilizationOfUnsecuredLines'
# Bars show each bin's share of samples; the red line shows the bad-client rate.
bin_plot(train_adj, target='SeriousDlqin2yrs', x=adj_var)
通过合并,判断特征数值等等,对有问题的特征进行重新分箱
## Manually override the bin edges for features whose bad rate was not
## monotonic under the automatic chi-square binning.
combiner.set_rules({'RevolvingUtilizationOfUnsecuredLines': [0.0396239,
0.115524,
0.18701,
0.307072,
0.470554,
0.696275,
0.882243,
0.999999],'DebtRatio':[0.345,0.49,2.0],
'NumberRealEstateLoansOrLines':[1, 2],
'NumberOfDependents': [0.5,1.5,2.5],
'NumberOfTimePastDueNotWorse': [2.6232e-05,0.66671]})
# Re-bin the training data under the adjusted rules.
data_adj = combiner.transform(pd.concat([Y_train,X_train], axis=1))
# Re-visualize the bad-sample rate after adjustment; iterate until monotone.
adj_var = 'RevolvingUtilizationOfUnsecuredLines'
bin_plot(data_adj, target='SeriousDlqin2yrs', x=adj_var)
# Same check for DebtRatio.
adj_var = 'DebtRatio'
bin_plot(data_adj, target='SeriousDlqin2yrs', x=adj_var)
最终确定分箱,更新模型
# Final adjusted bin edges for the non-monotonic features.
adj_bin ={'RevolvingUtilizationOfUnsecuredLines': [0.0396239,
0.115524,
0.18701,
0.307072,
0.470554,
0.696275,
0.882243,
0.999999],'DebtRatio':[0.345,0.49,2.0],
'NumberRealEstateLoansOrLines':[1, 2],
'NumberOfDependents': [0.5,1.5,2.5],
'NumberOfTimePastDueNotWorse': [2.6232e-05,0.66671]}
# Apply the adjusted binning to the combiner.
combiner.set_rules(adj_bin)
bins = combiner.export()
# Pad each feature's cut points with -inf/+inf so pd.cut covers all values.
hand_bins_ = {k:[-np.inf,*v[:-1],v[-1],np.inf] for k,v in bins. items()}
# Recompute WOE under the adjusted binning and store every feature's WOE
# values in a dict keyed by feature name.
woeall_ = {}
# Iterate the adjusted dict itself. The original looped over the old
# `hand_bins` while indexing `hand_bins_`, which only worked because the two
# dicts happen to share the same keys; iterating `hand_bins_` removes that
# hidden coupling.
for col in hand_bins_:
    woeall_[col] = get_woe(train_data, col, "SeriousDlqin2yrs", hand_bins_[col])
# Notebook-style display of the adjusted per-bin WOE values.
woeall_
训练集woe转化
# Compute WOE on the training set only, to avoid target leakage.
# Initialize the WOE transformer.
transer = toad.transform.WOETransformer()
# Assign every training row to its bin via the fitted/adjusted combiner.
binned_data = combiner.transform(pd.concat([Y_train,X_train], axis=1))
# Map bins to WOE values. Use fit_transform on the training set and plain
# transform on the test/validation set; the target column is excluded.
data_tr_woe = transer.fit_transform(binned_data, binned_data['SeriousDlqin2yrs'], exclude=['SeriousDlqin2yrs'])
data_tr_woe.head()
验证集woe转化
# Bin the validation features first.
binned_data = combiner.transform(X_vali)
# Then map them to the WOE values learned on the training set (transform only).
data_test_woe = transer.transform(binned_data)
data_test_woe.head()
逻辑回归
# Train a logistic regression model.
from sklearn.linear_model import LogisticRegression
# Instantiate.
lr = LogisticRegression(solver='liblinear')
# Fit on the WOE-encoded training set (target column dropped from X).
lr.fit(data_tr_woe.drop(['SeriousDlqin2yrs'],axis=1), data_tr_woe['SeriousDlqin2yrs'])
# Accuracy on the validation set.
# NOTE(review): this relies on data_test_woe's columns matching the order of
# the training features -- confirm the combiner/transformer preserve order.
lr.score(data_test_woe,Y_vali)
from sklearn import metrics
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score,roc_curve,auc,roc_auc_score,mean_squared_error
# 10-fold cross-validation on the training set (column 0 is the target).
from sklearn.model_selection import cross_val_score
score = cross_val_score(lr,data_tr_woe.iloc[:,1:],data_tr_woe.iloc[:,0],cv=10).mean()
print(score)
分别使用L1,L2正则项以及不同的系数计算准确率
# Compare validation accuracy under L1 vs L2 regularization over a grid of C.
c_1 = np.linspace(0.01, 1, 20)
score_1 = []
score_2 = []
for i in c_1:
    # Fit one model per penalty at this C and record its validation accuracy.
    for penalty, bucket in (("l1", score_1), ("l2", score_2)):
        fitted = LogisticRegression(penalty=penalty, solver='liblinear', C=i).fit(
            data_tr_woe.drop(['SeriousDlqin2yrs'], axis=1),
            data_tr_woe['SeriousDlqin2yrs'],
        )
        bucket.append(fitted.score(data_test_woe, Y_vali))
# L1 in blue, L2 in green.
plt.figure()
plt.plot(c_1, score_1, color="blue")
plt.plot(c_1, score_2, color="green")
plt.show()
寻找最佳迭代次数
# Validation accuracy as a function of max_iter, to pick an iteration cap.
score = []
iter_grid = np.arange(1, 20, 1)
for i in iter_grid:
    lr = LogisticRegression(solver='liblinear', max_iter=i).fit(
        data_tr_woe.drop(['SeriousDlqin2yrs'], axis=1),
        data_tr_woe['SeriousDlqin2yrs'],
    )
    score.append(lr.score(data_test_woe, Y_vali))
plt.figure()
plt.plot(iter_grid, score)
plt.show()
绘制ROC曲线
def model_metrics(model, X, y):
    """Plot the ROC curve of *model* on (X, y) and return a metrics dict.

    Returns a dict with accuracy, precision, recall, F1, ROC AUC and the KS
    statistic. Relies on the sklearn.metrics names imported earlier in this
    file and on *model* exposing predict / predict_proba.
    """
    y_pred = model.predict(X)
    # Accuracy: correctly predicted samples / all samples, (TP+TN)/all.
    accuracy = accuracy_score(y, y_pred)
    # Precision: positives predicted positive / all predicted positive,
    # P = TP/(TP+FP).
    precision = precision_score(y, y_pred)
    # Recall: positives predicted positive / all actual positives,
    # R = TP/(TP+FN).
    recall = recall_score(y, y_pred)
    # F1: harmonic mean of precision and recall, 2*P*R/(P+R).
    f1 = f1_score(y, y_pred)
    # ROC curve from the positive-class probabilities.
    # fpr: false positive rate; tpr: true positive rate; thresholds: cutoffs.
    fpr, tpr, thresholds = roc_curve(y, model.predict_proba(X)[:, 1])
    # AUC computed from the same curve that is plotted. (The original passed
    # the hard 0/1 predictions to roc_auc_score, which yields the AUC of a
    # single-threshold classifier and did not match the plotted curve.)
    roc_auc = auc(fpr, tpr)
    # KS statistic: maximum separation between TPR and FPR.
    ks = max(tpr - fpr)
    plt.plot(fpr, tpr, label='ROC Curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
    return {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1': f1, 'ROC AUC': roc_auc,'KS': ks}
# Re-train with the best iteration cap and penalty found above (max_iter=7).
lr = LogisticRegression(solver='liblinear',max_iter=7).fit(data_tr_woe.drop(['SeriousDlqin2yrs'],axis=1), data_tr_woe['SeriousDlqin2yrs'])
# Print the metrics dict and plot the ROC curve for train and validation sets.
print('train ',model_metrics(lr,data_tr_woe.drop(['SeriousDlqin2yrs'],axis=1), data_tr_woe['SeriousDlqin2yrs']))
print('test ',model_metrics(lr,data_test_woe,Y_vali))
评分卡建立
$score = A - B\ln(odds)$

$odds=\dfrac{p}{1-p}$,其中 p 为违约的概率。

在逻辑回归模型中,

$p=y=\dfrac{1}{1+e^{-(wx+b)}}$

因此

$e^{wx+b}=\dfrac{p}{1-p}$

$\ln(odds)=wx+b$

两个假设(一般由公司规定):
1. 假设在某个特定的比率设定特定的预期分值(设定比率为 $\theta$ 的特定点的分值为 P):
   $P=A-B\log(\theta)$
2. 假设比率翻番时分数变化 PDO(比率为 $2\theta$ 的点的分值为 $P-PDO$):
   $P-PDO=A-B\log(2\theta)$

联立方程即可解出 A 和 B。

$score = A-B(wx+b) = (A-Bb)-Bwx$

即 $score = base\_score - Bwx$,其中 $base\_score = A-Bb$。

在建立特征分箱的评分卡时,只需计算该分箱下的 $-Bw \cdot woe$。
# Build the scorecard: each bin of each feature scores -B * w_i * WOE.
# NOTE(review): B (the scaling factor) and lr.coef_ come from earlier cells
# not shown in this chunk -- confirm B was solved from the two score
# assumptions above.
lc = []
for i, col in enumerate(X.columns):
    # Per-bin scores for this feature: WOE Series scaled by -B * coefficient.
    score = woeall_[col] * (-B * lr.coef_[0][i])
    p = pd.DataFrame(score)
    # One scorecard row per bin: (feature, bin label, score).
    for bin_label, bin_score in zip(p.index, p.iloc[:, 0]):
        lc.append([col, str(bin_label), bin_score])
scorecard = pd.DataFrame(lc, columns=["feature", "bins", "score"])