模型开发流程
获取数据→数据清洗,特征工程→模型开发→模型检验与评估→模型上线→检测与报告
案例
1.导库,获取数据
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
data = pd.read_csv(r"D:\Sklearn\rankingcard.csv",index_col=0)
2.探索数据与数据预处理
#观察数据类型
data.head()
#观察数据结构
data.shape()
data.info()
2.1去除重复值
#去除重复值
data.drop_duplicates(inplace=True)
data.info()
#删除之后千万不要忘记,恢复索引
data.index = range(data.shape[0])
data.info()
2.2 填补缺失值
#探索缺失值
data.info()
data.isnull().sum()/data.shape[0]
#data.isnull().mean()
data["NumberOfDependents"].fillna(int(data["NumberOfDependents"].mean()),inplace=True)
data.info()
data.isnull().sum()/data.shape[0]
#写一个能填补任何列的函数
def fill_missing_rf(X,y,to_fill):
"""
使用随机森林填补一个特征的缺失值的函数
参数:
X:要填补的特征矩阵
y:完整的,没有缺失值的标签
to_fill:字符串,要填补的那一列的名称
"""
#构建我们的新特征矩阵和新标签
df = X.copy()
fill = df.loc[:,to_fill]
df = pd.concat([df.loc[:,df.columns != to_fill],pd.DataFrame(y)],axis=1)
#找出我们的训练集和测试集
Ytrain = fill[fill.notnull()]
Ytest = fill[fill.isnull()]
Xtrain = df.iloc[Ytrain.index,:]
Xtest = df.iloc[Ytest.index,:]
#用随机森林回归来填补缺失值
from sklearn.ensemble import RandomForestRegressor as rfr
rfr = rfr(n_estimators=100)
rfr = rfr.fit(Xtrain, Ytrain)
Ypredict = rfr.predict(Xtest)
return Ypredict
#将参数导入函数
X = data.iloc[:,1:]
y = data["SeriousDlqin2yrs"]
X.shape
#=====【TIME WARNING:1 min】=====#
y_pred = fill_missing_rf(X,y,"MonthlyIncome")
#确认我们的结果合理之后,我们就可以将数据覆盖了
data.loc[data.loc[:,"MonthlyIncome"].isnull(),"MonthlyIncome"] = y_pred
2.3描述性统计处理异常值
#描述性统计
data.describe([0.01,0.1,0.25,.5,.75,.9,.99]).T
2.4样本不均衡问题,采用上采样方法
import imblearn
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42) #实例化
X,y = sm.fit_sample(X,y)
n_sample_ = X.shape[0]
pd.Series(y).value_counts()
n_1_sample = pd.Series(y).value_counts()[1]
n_0_sample = pd.Series(y).value_counts()[0]
print('样本个数:{}; 1占{:.2%}; 0占{:.2%}'.format(n_sample,n_1_sample/n_sample,n_0_sample/n_sample))
2.5分训练集和测试集
from sklearn.model_selection import train_test_split
X = pd.DataFrame(X)
y = pd.DataFrame(y)
X_train, X_vali, Y_train, Y_vali = train_test_split(X,y,test_size=0.3,random_state=420)
model_data = pd.concat([Y_train, X_train], axis=1)
model_data.index = range(model_data.shape[0])
model_data.columns = data.columns
vali_data = pd.concat([Y_vali, X_vali], axis=1)
vali_data.index = range(vali_data.shape[0])
vali_data.columns = data.columns
model_data.to_csv(r"file_path")
vali_data.to_csv(r"file_path")
3.分箱
4.计算各箱的WOE并映射到数据中
data = model_data.copy()
#函数pd.cut,可以根据已知的分箱间隔把数据分箱
#参数为 pd.cut(数据,以列表表示的分箱间隔)
data = data[["age","SeriousDlqin2yrs"]].copy()
data["cut"] = pd.cut(data["age"],[-np.inf, 48.49986200790144, 58.757170160044694, 64.0,
74.0, np.inf])
data
#将数据按分箱结果聚合,并取出其中的标签值
data.groupby("cut")["SeriousDlqin2yrs"].value_counts()
#使用unstack()来将树状结构变成表状结构
data.groupby("cut")["SeriousDlqin2yrs"].value_counts().unstack()
bins_df = data.groupby("cut")["SeriousDlqin2yrs"].value_counts().unstack()
bins_df["woe"] = np.log((bins_df[0]/bins_df[0].sum())/(bins_df[1]/bins_df[1].sum()))
def get_woe(df,col,y,bins):
df = df[[col,y]].copy()
df["cut"] = pd.cut(df[col],bins)
bins_df = df.groupby("cut")[y].value_counts().unstack()
woe = bins_df["woe"] =
np.log((bins_df[0]/bins_df[0].sum())/(bins_df[1]/bins_df[1].sum()))
return woe
#将所有特征的WOE存储到字典当中
woeall = {}
for col in bins_of_col:
woeall[col] = get_woe(model_data,col,"SeriousDlqin2yrs",bins_of_col[col])
woeall
#把所有woe映射到原始数据上
#不希望覆盖掉原本的数据,创建一个新的DataFrame,索引和原始数据model_data一模一样
model_woe = pd.DataFrame(index=model_data.index)
#将原数据分箱后,按箱的结果把WOE结构用map函数映射到数据中
model_woe["age"] = pd.cut(model_data["age"],bins_of_col["age"]).map(woeall["age"])
#对所有特征操作可以写成:
for col in bins_of_col:
model_woe[col] = pd.cut(model_data[col],bins_of_col[col]).map(woeall[col])
#将标签补充到数据中
model_woe["SeriousDlqin2yrs"] = model_data["SeriousDlqin2yrs"]
#这就是我们的建模数据了
model_woe.head()
5.建模与验证
#处理测试集
vali_woe = pd.DataFrame(index=vali_data.index)
for col in bins_of_col:
vali_woe[col] = pd.cut(vali_data[col],bins_of_col[col]).map(woeall[col])
vali_woe["SeriousDlqin2yrs"] = vali_data["SeriousDlqin2yrs"]
vali_X = vali_woe.iloc[:,:-1]
vali_y = vali_woe.iloc[:,-1]
X = model_woe.iloc[:,:-1]
y = model_woe.iloc[:,-1]
from sklearn.linear_model import LogisticRegression as LR
lr = LR().fit(X,y)
lr.score(vali_X,vali_y)
c_1 = np.linspace(0.01,1,20)
c_2 = np.linspace(0.01,0.2,20)
score = []
for i in c_2:
lr = LR(solver='liblinear',C=i).fit(X,y)
score.append(lr.score(vali_X,vali_y))
plt.figure()
plt.plot(c_2,score)
plt.show()
lr.n_iter_
score = []
for i in [1,2,3,4,5,6]:
lr = LR(solver='liblinear',C=0.025,max_iter=i).fit(X,y)
score.append(lr.score(vali_X,vali_y))
plt.figure()
plt.plot([1,2,3,4,5,6],score)
plt.show()
6.制作评分卡
B = 20/np.log(2)
A = 600 + B*np.log(1/60)
B,A
base_score = A - B*lr.intercept_
base_score
score_age = woeall["age"] * (-B*lr.coef_[0][0])
score_age
file = "file_path"
with open(file,"w") as fdata:
fdata.write("base_score,{}\n".format(base_score))
for i,col in enumerate(X.columns):
score = woeall[col] * (-B*lr.coef_[0][i])
score.name = "Score"
score.index.name = col
score.to_csv(file,header=True,mode="a"
结果