# Setup imports (the notebook's first cell was lost when this was scraped)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the data and run a descriptive analysis: look for missing values and
# outliers, and use mean / median / min / max to get a feel for the
# distributions (head / shape / info are also useful here).
data = pd.read_csv('D:/database/training.csv')
data.describe()
# Drop the clearly-broken leftover index column
data = data.drop(columns=['Unnamed: 0'])
# Remove duplicate rows
data.drop_duplicates(inplace=True)
data.info()
# Habitually rebuild a contiguous 0..n-1 index after dropping rows
data.index = range(data.shape[0])
data.info()
# NumberOfDependents clusters heavily on 0-4: mean/median/mode would all skew
# low, so fill the missing entries by sampling uniformly from {0,1,2,3,4}.
import seaborn as sns
sns.countplot(x='NumberOfDependents', data=data)
Dependents = pd.Series([0, 1, 2, 3, 4])
for i in data['NumberOfDependents'][data['NumberOfDependents'].isnull()].index:
    # .loc with a scalar: the original chained assignment
    # (data[col][i] = Series) writes through a copy and misaligns the
    # one-element Series by index in modern pandas.
    data.loc[i, 'NumberOfDependents'] = Dependents.sample(1).iloc[0]
data.info()
# Random-forest imputation: suited to the case where one feature has many
# missing values while the other features are mostly complete.
def fill_miss_rf(x, y, to_fill):
    """Impute one column's missing values with a random-forest regressor.

    x        feature matrix
    y        label column (complete, used as an extra predictor)
    to_fill  name of the column to impute

    Returns the predicted values for the rows where ``to_fill`` is null,
    in the order of those rows.
    """
    df = x.copy()
    # The column to be filled becomes the regression target
    fill = df.loc[:, to_fill]
    # New design matrix: every other feature plus the (complete) label
    df = pd.concat([df.loc[:, df.columns != to_fill], pd.DataFrame(y)], axis=1)
    # Rows with a known target train the model; rows with NaN are predicted
    Ytrain = fill[fill.notnull()]
    Ytext = fill[fill.isnull()]
    # .loc (label-based) instead of the original .iloc: correct even when the
    # index is not a clean 0..n-1 range at call time
    Xtrain = df.loc[Ytrain.index, :]
    Xtext = df.loc[Ytext.index, :]
    from sklearn.ensemble import RandomForestRegressor
    # n_estimators=30 keeps the original memory-constrained setting; larger
    # (e.g. 100) is generally better if the machine can handle it.
    # (renamed from `rfr`, which shadowed the imported class in the original)
    rf = RandomForestRegressor(n_estimators=30).fit(Xtrain, Ytrain)
    return rf.predict(Xtext)
# Split off features / label and impute MonthlyIncome with the forest
x = data.iloc[:, 1:]
y = data.iloc[:, 0]
y_pred = fill_miss_rf(x, y, "MonthlyIncome")
# Overwrite exactly the null MonthlyIncome cells with the predictions
income_missing = data.loc[:, "MonthlyIncome"].isnull()
data.loc[income_missing, "MonthlyIncome"] = y_pred
data.info()
# Random-forest imputation works well when one feature is largely missing
# while the remaining features are mostly intact.
# Outlier handling starts here.
# The original file only imports matplotlib much later (L236 of the scrape),
# so this cell would raise NameError on `plt`; import it here.
import matplotlib.pyplot as plt

fig = plt.figure()
ax1 = plt.subplot()
ax1.boxplot(data['age'])
ax1.set_xticklabels(['age'])
plt.show()
# By convention keep only independent adults: ages strictly between 16 and 100
data = data[data['age'] > 16]
data = data[data['age'] < 100]
# Inspect six features at once with a 2x3 grid of boxplots
box_cols = ['RevolvingUtilizationOfUnsecuredLines',
            'DebtRatio',
            'MonthlyIncome',
            'NumberOfDependents',
            'NumberOfOpenCreditLinesAndLoans',
            'NumberRealEstateLoansOrLines']
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
# ravel() walks the axes row-major, matching the original ax1..ax6 order
for ax, col in zip(axes.ravel(), box_cols):
    ax.boxplot(data[col])
# Count how many rows fall outside a plausible range for each feature
x1 = (data['RevolvingUtilizationOfUnsecuredLines'] > 1).sum()
x2 = (data['DebtRatio'] > 1).sum()
x3 = (data['MonthlyIncome'] > 500000).sum()
x4 = (data['NumberOfDependents'] > 5).sum()
x5 = (data['NumberOfOpenCreditLinesAndLoans'] > 20).sum()
# One count per line, same order as the original prints
for count in (x1, x2, x3, x4, x5):
    print(count)
# RevolvingUtilizationOfUnsecuredLines and DebtRatio are ratios and must be
# below 1; rows above 1 are dropped outright.
data = data[data['RevolvingUtilizationOfUnsecuredLines'] < 1]
data = data[data['DebtRatio'] < 1]
# Very high income / dependents are implausible-but-possible, so cap instead
# of dropping: income at 500k, dependents at 5.
# Use .loc instead of the original chained assignment (data[col][mask] = v),
# which triggers SettingWithCopyWarning and silently stops working on a
# copy-on-write pandas (3.x).
data.loc[data["MonthlyIncome"] > 500000, "MonthlyIncome"] = 500000
data.loc[data['NumberOfDependents'] > 5, 'NumberOfDependents'] = 5
# Business meaning of NumberOfOpenCreditLinesAndLoans is unclear; cap at 20
data.loc[data['NumberOfOpenCreditLinesAndLoans'] > 20,
         'NumberOfOpenCreditLinesAndLoans'] = 20
# Look at NumberRealEstateLoansOrLines in more detail with a custom-bin hist
x6 = data['NumberRealEstateLoansOrLines']
plt.figure(figsize=(15, 5))
nums, bins, patches = plt.hist(x6, [0, 2, 4, 6, 8, 10, 20, 40, 60],
                               range=(0, 60), edgecolor='k')
plt.xticks(bins, bins)
# Cap at 4; .loc avoids the original chained assignment
# (SettingWithCopyWarning / silent no-op under pandas copy-on-write)
data.loc[data['NumberRealEstateLoansOrLines'] > 4,
         'NumberRealEstateLoansOrLines'] = 4
# The three past-due counters belong together; values >= 30 "times late"
# defy common sense and are treated as outliers.
late_cols = ['NumberOfTime30-59DaysPastDueNotWorse',
             'NumberOfTime60-89DaysPastDueNotWorse',
             'NumberOfTimes90DaysLate']
fig = plt.figure(figsize=(12, 4))
ax = fig.add_subplot(111)
ax.boxplot([data[c] for c in late_cols])
ax.set_xticklabels(['M1', 'M2', 'M3'])
ax.grid(True)
# Keep only rows where every past-due counter is below 30
for c in late_cols:
    data = data[data[c] < 30]
# Outlier handling done: rebuild the index and check how many rows remain
data.index = range(data.shape[0])
data.info()
# Pull features (all but first column) and label (first column) back out
x = data.iloc[:, 1:]
y = data.iloc[:, 0]
# First-pass filtering: variance threshold, then mutual information.
# With no argument, VarianceThreshold only removes zero-variance features.
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold()
x_vare = selector.fit_transform(x, y)
x_vare.shape
# Mutual information: check whether any feature carries zero information
from sklearn.feature_selection import mutual_info_classif as MIC
result = MIC(x_vare, y)
# Number of features with strictly positive MI
k = (result > 0).sum()
# Embedded feature selection with a logistic regression.
# The scrape lost the cell that imported these names and built LR_; restore
# the imports and a plain estimator so this cell runs.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# NOTE(review): LR_ was defined in a lost notebook cell — presumably a
# LogisticRegression with tuned hyper-parameters; confirm against the source.
LR_ = LogisticRegression()
# norm_order=1: rank features by the L1 norm of their coefficients
x_embedded = SelectFromModel(LR_, norm_order=1)
X_embedded = x_embedded.fit_transform(x, y)
# Cross-validated score on the reduced matrix — barely affected by selection
cross_val_score(LR_, X_embedded, y, cv=10).mean()
df = pd.DataFrame(X_embedded)
df
# The selected matrix has lost its column names; recover them from the mask
all_name = x.columns.values.tolist()
select_name_index = x_embedded.get_support(indices=True)
select_name = []
for i in select_name_index:
    select_name.append(all_name[i])
print(select_name)
# These are the most important surviving variables
# Cross-check importances with a decision tree
from sklearn import tree

clf = tree.DecisionTreeClassifier().fit(x, y)
clf.feature_importances_
# Pair every feature name with its importance score
feature_name = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]
list(zip(feature_name, clf.feature_importances_))
# IV-based filtering via scorecardpy's var_filter: drop variables whose
# information value is below 0.02.
# (The original had a duplicate call BEFORE data_aa was defined, which would
# raise NameError; it is removed here.)
# NOTE(review): `sc` (scorecardpy) was imported in a notebook cell lost
# during scraping — confirm `import scorecardpy as sc` upstream.
data_aa = data.copy()
dt_s = sc.var_filter(data_aa, y="SeriousDlqin2yrs", iv_limit=0.02)
# Two variables end up being kicked out
dt_s.info()
# Correlation analysis — the features turn out to be only weakly correlated
corr = data.corr()
fig = plt.figure(figsize=(10, 7))
ax1 = fig.add_subplot(1, 1, 1)
sns.heatmap(corr, annot=True, cmap='rainbow', ax=ax1)
# The three weakest features can be dropped later during binning/encoding.
# Class balance check — the sample is highly imbalanced
n_sample = x.shape[0]
label_counts = y.value_counts()
n_0_sample = label_counts[0]
n_1_sample = label_counts[1]
print("不良客户占比:", (n_1_sample / n_sample) * 100, "%")
# Split into a 70/30 train / validation pair and rebuild labelled frames
from sklearn.model_selection import train_test_split

x = pd.DataFrame(x)
y = pd.DataFrame(y)
x_train, x_vali, y_train, y_vali = train_test_split(x, y, test_size=0.3)
# Label first, features after — matching the original column order of `data`
model_data = pd.concat([y_train, x_train], axis=1)
vali_data = pd.concat([y_vali, x_vali], axis=1)
for part in (model_data, vali_data):
    part.index = range(part.shape[0])
    part.columns = data.columns
# Count the 0/1 labels inside every bin and carve out each bin's bounds.
# NOTE(review): the "qcut" column on model_data and the `updown` bin-edge
# list are created in a notebook cell lost during scraping — presumably via
# pd.qcut(model_data["age"], q=20, retbins=True); confirm before running.
count_y0 =model_data[model_data['SeriousDlqin2yrs']==0].groupby(by="qcut").count()['SeriousDlqin2yrs']
count_y1 =model_data[model_data['SeriousDlqin2yrs']==1].groupby(by="qcut").count()['SeriousDlqin2yrs']
# zip the edge list against itself shifted by one to pair (lower, upper)
# bounds with each bin's 0-count and 1-count
num_of_bins=[*zip(updown,updown[1:],count_y0,count_y1)]
# Assemble the bins into a frame and derive the WOE quantities step by step
columns = ["min","max","count_0","count_1"]
df=pd.DataFrame(num_of_bins,columns=columns)
# total rows per bin, bin share of the sample, and bad rate within the bin
df["total"]=df.count_0+df.count_1
df["percent"]=df.total/df.total.sum()
df["badrate"]=df.count_1/df.total
# each bin's share of all goods (label 0) and of all bads (label 1)
df["rate_good%"]=df.count_0/df.count_0.sum()
df["rate_bad%"]=df.count_1/df.count_1.sum()
# WOE = ln(good share / bad share)
df["woe"]=np.log(df["rate_good%"]/df["rate_bad%"])
df.head()
# Information Value: sum over bins of (good share - bad share) * WOE
rate= df["rate_good%"] - df["rate_bad%"]
iv = np.sum(rate * df.woe)
print(rate)
print(df.woe)
print(iv)
# Wrap the WOE bookkeeping above into a reusable function.
# (The scraped source had lost the function-body indentation; restored here.)
def get_woe(num_of_bins):
    """Build a WOE table from bins given as (min, max, count_0, count_1).

    Returns a DataFrame with per-bin totals, sample share, bad rate,
    good/bad distribution shares, and WOE = ln(good share / bad share).
    """
    columns = ["min", "max", "count_0", "count_1"]
    df = pd.DataFrame(num_of_bins, columns=columns)
    df["total"] = df.count_0 + df.count_1
    df["percent"] = df.total / df.total.sum()
    df["badrate"] = df.count_1 / df.total
    df["rate_good%"] = df.count_0 / df.count_0.sum()
    df["rate_bad%"] = df.count_1 / df.count_1.sum()
    df["woe"] = np.log(df["rate_good%"] / df["rate_bad%"])
    return df
def get_iv(df):
    """Information Value of a WOE table produced by get_woe:
    IV = sum over bins of (good share - bad share) * WOE."""
    rate = df["rate_good%"] - df["rate_bad%"]
    iv = np.sum(rate * df.woe)
    return iv
# Copy the bins so the original list is kept for later use
n_bins = num_of_bins.copy()
import matplotlib.pyplot as plt
# import the submodule explicitly: `import scipy` alone does not guarantee
# that scipy.stats is loadable as an attribute
import scipy.stats

# Sanity check before running: the (count_0, count_1) tail of each tuple
x1 = n_bins[0][2:]
x2 = n_bins[1][2:]
print(x1)
print(x2)

# Chi-square bin merging: repeatedly merge the adjacent pair whose label
# distributions are most similar (largest chi2 p-value), recording the IV at
# each bin count. (Loop structure restored; the scrape lost all indentation.)
IV = []
axisx = []
while len(n_bins) > 2:
    pvalue = []
    for i in range(len(n_bins) - 1):
        x1 = n_bins[i][2:]      # counts of 0s and 1s in bin i
        x2 = n_bins[i + 1][2:]  # counts of 0s and 1s in bin i+1
        # chi2_contingency returns (chi2, p, dof, expected); index 1 is p
        pv = scipy.stats.chi2_contingency([x1, x2])[1]
        pvalue.append(pv)
    # Merge the pair with the highest p-value into one bin
    i = pvalue.index(max(pvalue))
    n_bins[i:i + 2] = [(
        n_bins[i][0],                     # lower bound of bin i
        n_bins[i + 1][1],                 # upper bound of bin i+1
        n_bins[i][2] + n_bins[i + 1][2],  # merged count_0
        n_bins[i][3] + n_bins[i + 1][3])] # merged count_1
    bins_df = get_woe(n_bins)
    axisx.append(len(n_bins))
    IV.append(get_iv(bins_df))
print(bins_df)
print(pvalue)
print(axisx)
print(IV)
# Run the packaged auto-binning helper.
# NOTE(review): auto_bins is defined in a notebook cell lost during scraping;
# from its use it presumably qcuts into q initial bins and chi-merges down to
# n, optionally plotting IV vs bin count — confirm against the source.
auto_bins(model_data, "age", "SeriousDlqin2yrs", n=2, q=20, graph=True)
# Inspect every feature column (first column is the label; the original also
# skipped the last column via [1:-1])
model_data.columns
for i in model_data.columns[1:-1]:
    print(i)
    auto_bins(model_data, i, "SeriousDlqin2yrs", n=2, q=20, graph=True)
# Variables found to contribute little in the earlier feature engineering are
# simply left out of these dicts.
# Continuous variables, mapped to the number of bins each should end up with
auto_col_bins = {"RevolvingUtilizationOfUnsecuredLines": 4,
                 "age": 5,
                 "DebtRatio": 4,
                 "MonthlyIncome": 3,
                 "NumberOfOpenCreditLinesAndLoans": 4}
# Variables that cannot be auto-binned: hand-made bin edges
hand_bins = {"NumberOfTime30-59DaysPastDueNotWorse": [-np.inf, 1, 2, np.inf],
             "NumberOfTimes90DaysLate": [-np.inf, 1, 2, np.inf]
             }
bins_of_col = {}
# Generate the auto-binned intervals (and IV) for each continuous variable.
# (Loop indentation restored; the scrape had flattened it.)
for col in auto_col_bins:
    bins_df = auto_bins(model_data, col,
                        "SeriousDlqin2yrs",
                        n=auto_col_bins[col],  # per-feature bin count
                        q=20,
                        graph=False)
    # Collect all bin edges, sorted and de-duplicated
    bins_list = sorted(set(bins_df["min"]).union(bins_df["max"]))
    # Guarantee full coverage: replace the extreme edges with +/- infinity
    bins_list[0], bins_list[-1] = -np.inf, np.inf
    bins_of_col[col] = bins_list
    print(auto_col_bins[col])
    print(bins_df)
    print(bins_list)
# Merge in the hand-made bins
bins_of_col.update(hand_bins)
bins_of_col
# pd.cut slots data into bins given known interval edges:
# pd.cut(values, list_of_bin_edges).
# NOTE(review): the original overwrote `data` with a two-column copy here,
# destroying the cleaned dataset for anything downstream; a separate demo
# frame is used instead (nothing after this cell reads `data`).
age_demo = data[["age", "SeriousDlqin2yrs"]].copy()
age_demo["cut"] = pd.cut(age_demo["age"], [-np.inf, 33.0, 53.0, 60.0, 73.0, np.inf])
age_demo
# Aggregate by bin and pull out the per-bin label counts
age_demo.groupby("cut")["SeriousDlqin2yrs"].value_counts().unstack()
# Same aggregation, kept this time, with WOE = ln(good share / bad share)
bins_df = age_demo.groupby("cut")["SeriousDlqin2yrs"].value_counts().unstack()
bins_df["woe"] = np.log((bins_df[0] / bins_df[0].sum()) / (bins_df[1] / bins_df[1].sum()))
bins_df.head()
def new_woe(n_df, col, y, bins):
    """Per-bin WOE of feature `col` against binary label `y` (0 = good,
    1 = bad), using the explicit bin edges in `bins`.

    Returns a Series of WOE values indexed by bin interval.
    (Function-body indentation restored; the scrape had flattened it.)
    """
    n_df = n_df[[col, y]].copy()
    n_df["cut"] = pd.cut(n_df[col], bins)
    # Rows: bins; columns: label values 0 and 1
    bins_df = n_df.groupby("cut")[y].value_counts().unstack()
    # WOE = ln(share of all goods in this bin / share of all bads in this bin)
    woe = bins_df["woe"] = np.log((bins_df[0] / bins_df[0].sum()) / (bins_df[1] / bins_df[1].sum()))
    return woe
# Store every feature's WOE Series in a dict keyed by column name.
# (Loop indentation restored; the scrape had flattened it.)
woeall = {}
for col in bins_of_col:
    woeall[col] = new_woe(model_data, col, "SeriousDlqin2yrs", bins_of_col[col])
# Final check — indeed only 7 features remain
woeall
# Don't overwrite the original data: build a fresh DataFrame whose index is
# identical to model_data's.
# (The scrape left a stray half-line of blog prose here that broke syntax.)
model_woe = pd.DataFrame(index=model_data.index)
# Bin the raw values, then map each bin to its WOE value
model_woe["age"] = pd.cut(model_data["age"], bins_of_col["age"]).map(woeall["age"])
model_woe.head()
# The same transformation for every binned feature.
# (Loop indentation restored; the scrape had flattened it.)
for col in bins_of_col:
    model_woe[col] = pd.cut(model_data[col], bins_of_col[col]).map(woeall[col])
# Append the label column
model_woe["SeriousDlqin2yrs"] = model_data["SeriousDlqin2yrs"]
# This is the modelling dataset
model_woe.head()
# Transform the validation set with the same bins and WOE mapping.
# (Loop indentation restored; the original also repeated the vali_x/vali_y
# split twice — it is done once here.)
vali_woe = pd.DataFrame(index=vali_data.index)
for col in bins_of_col:
    vali_woe[col] = pd.cut(vali_data[col], bins_of_col[col]).map(woeall[col])
vali_woe["SeriousDlqin2yrs"] = vali_data["SeriousDlqin2yrs"]

# Features are every column but the last; the label is the last column
x = model_woe.iloc[:, :-1]
y = model_woe.iloc[:, -1]
vali_x = vali_woe.iloc[:, :-1]
vali_y = vali_woe.iloc[:, -1]

# Fit the scorecard's logistic regression and score it on the hold-out set
from sklearn.linear_model import LogisticRegression as LR
lr = LR().fit(x, y)
lr.score(vali_x, vali_y)
# ROC curve of the validation predictions (scikit-plot wants per-class
# probabilities as a DataFrame)
import scikitplot as skplt
vali_proba_df = pd.DataFrame(lr.predict_proba(vali_x))
skplt.metrics.plot_roc(vali_y,vali_proba_df,
plot_micro=False,figsize=(6,6),
plot_macro=False )
# Scorecard scaling: Score = A - B * ln(odds).
# B = 20 / ln(2): every 20 points doubles the odds (PDO = 20).
# A is chosen so that log-odds of ln(1/60) maps to exactly 600 points.
B = 20/np.log(2)
A = 600 + B*np.log(1/60)
# Base score: the A - B * intercept part shared by every applicant
base_score = A-B*lr.intercept_
print(B)
print(A)
print(base_score)
file = "D:/database/Score2.csv"
# open(path, "w") creates/truncates the file for writing ("r" would be for
# reading). Write the base score first, then append one block of
# (bin, score) rows per feature.
# (with/loop structure restored; the scrape had flattened the indentation.
# The loop sits AFTER the `with` so the file is closed before to_csv
# re-opens it in append mode.)
with open(file, "w") as fdata:
    fdata.write("base_score,{}\n".format(base_score))
for i, col in enumerate(x.columns):
    # Per-bin score: WOE times -B * this feature's LR coefficient
    score = woeall[col] * (-B * lr.coef_[0][i])
    score.name = "Score"
    score.index.name = col
    score.to_csv(file, header=True, mode="a")
# 构建信用分级卡 分箱法 (blog title: building a credit scorecard via binning)
# 最新推荐文章于 2023-11-06 08:44:57 发布 (scraped blog footer, kept as a comment)