import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_csv('D:/database/training.csv')
data.describe()
data=data.drop(columns=['Unnamed: 0'])
data.drop_duplicates(inplace=True)
data.info()
data.index=range(data.shape[0])
data.info()
import seaborn as sns
sns.countplot(x='NumberOfDependents',data=data)
# Fill missing NumberOfDependents by sampling uniformly from the common values 0-4
Dependents = pd.Series([0, 1, 2, 3, 4])
for i in data[data['NumberOfDependents'].isnull()].index:
    data.loc[i, 'NumberOfDependents'] = Dependents.sample(1).values[0]
data.info()
def fill_miss_rf(x, y, to_fill):
    """
    x        feature matrix
    y        target labels
    to_fill  name of the column whose missing values should be filled
    """
    df = x.copy()
    fill = df.loc[:, to_fill]
    # Use every other feature plus the label as predictors for the filled column
    df = pd.concat([df.loc[:, df.columns != to_fill], pd.DataFrame(y)], axis=1)
    Ytrain = fill[fill.notnull()]
    Ytest = fill[fill.isnull()]
    Xtrain = df.iloc[Ytrain.index, :]
    Xtest = df.iloc[Ytest.index, :]
    from sklearn.ensemble import RandomForestRegressor as rfr
    rfr = rfr(n_estimators=30).fit(Xtrain, Ytrain)
    Ypredict = rfr.predict(Xtest)
    return Ypredict
x=data.iloc[:,1:]
y=data.iloc[:,0]
y_pred=fill_miss_rf(x,y,"MonthlyIncome")
data.loc[data.loc[:,"MonthlyIncome"].isnull(),"MonthlyIncome"]=y_pred
data.info()
fig = plt.figure()
ax1 = plt.subplot()
ax1.boxplot(data['age'])
ax1.set_xticklabels(['age'])
plt.show()
data = data[data['age']>16]
data = data[data['age']<100]
x1 = data['RevolvingUtilizationOfUnsecuredLines']
x2 = data['DebtRatio']
x3 = data['MonthlyIncome']
x4 = data['NumberOfDependents']
x5 = data['NumberOfOpenCreditLinesAndLoans']
x6 = data['NumberRealEstateLoansOrLines']
fig,axes= plt.subplots(2,3,figsize=(12,8))
ax1=axes[0,0]
ax1.boxplot(x1)
ax2=axes[0,1]
ax2.boxplot(x2)
ax3=axes[0,2]
ax3.boxplot(x3)
ax4=axes[1,0]
ax4.boxplot(x4)
ax5=axes[1,1]
ax5.boxplot(x5)
ax6=axes[1,2]
ax6.boxplot(x6)
x1=(data['RevolvingUtilizationOfUnsecuredLines']>1).sum()
x2=(data['DebtRatio']>1).sum()
x3=(data['MonthlyIncome']>500000).sum()
x4=(data['NumberOfDependents']>5).sum()
x5=(data['NumberOfOpenCreditLinesAndLoans']>20).sum()
print(x1)
print(x2)
print(x3)
print(x4)
print(x5)
data=data[data['RevolvingUtilizationOfUnsecuredLines']<1]
data=data[data['DebtRatio']<1]
data["MonthlyIncome"][data["MonthlyIncome"]>500000] = 500000
data['NumberOfDependents'][data['NumberOfDependents']>5]=5
data['NumberOfOpenCreditLinesAndLoans'][data['NumberOfOpenCreditLinesAndLoans']>20]=20
x6 = data['NumberRealEstateLoansOrLines']
plt.figure(figsize=(15,5))
nums,bins,patches=plt.hist(x6,[0,2,4,6,8,10,20,40,60],range=(0,60),edgecolor='k')
plt.xticks(bins,bins)
data.loc[data['NumberRealEstateLoansOrLines'] > 4, 'NumberRealEstateLoansOrLines'] = 4
fig = plt.figure(figsize=(12,4))
x1 = data['NumberOfTime30-59DaysPastDueNotWorse']
x2 = data['NumberOfTime60-89DaysPastDueNotWorse']
x3 = data['NumberOfTimes90DaysLate']
ax = fig.add_subplot(111)
ax.boxplot([x1,x2,x3])
ax.set_xticklabels(['M1','M2','M3'])
ax.grid(True)
data=data[data['NumberOfTime30-59DaysPastDueNotWorse']<30]
data=data[data['NumberOfTime60-89DaysPastDueNotWorse']<30]
data=data[data['NumberOfTimes90DaysLate']<30]
data.index=range(data.shape[0])
data.info()
x=data.iloc[:,1:]
y=data.iloc[:,0]
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold()
x_vare= selector.fit_transform(x,y)
x_vare.shape
from sklearn.feature_selection import mutual_info_classif as MIC
result = MIC(x_vare,y)
k = result.shape[0] - sum(result <= 0)
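# k counts the features whose mutual information with y is positive; a
# natural follow-up (an assumption, not in the original) is to keep exactly
# those k features:
from sklearn.feature_selection import SelectKBest
x_mic = SelectKBest(MIC, k=k).fit_transform(x_vare, y)
x_mic.shape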
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
LR_ = LogisticRegression(max_iter=1000)  # LR_ was never defined above; a plain logistic regression is assumed
x_embedded = SelectFromModel(LR_, norm_order=1)
X_embedded = x_embedded.fit_transform(x, y)
cross_val_score(LR_, X_embedded, y, cv=10).mean()
df = pd.DataFrame(X_embedded)
df
all_name = x.columns.values.tolist()
select_name_index = x_embedded.get_support(indices=True)
select_name = []
for i in select_name_index:
    select_name.append(all_name[i])
print(select_name)
from sklearn import tree
clf = tree.DecisionTreeClassifier().fit(x,y)
clf.feature_importances_
feature_name=['RevolvingUtilizationOfUnsecuredLines',
'age',
'NumberOfTime30-59DaysPastDueNotWorse',
'DebtRatio',
'MonthlyIncome',
'NumberOfOpenCreditLinesAndLoans',
'NumberOfTimes90DaysLate',
'NumberRealEstateLoansOrLines',
'NumberOfTime60-89DaysPastDueNotWorse',
'NumberOfDependents',
]
[*zip(feature_name,clf.feature_importances_)]
import scorecardpy as sc
data_aa = data.copy()
dt_s = sc.var_filter(data_aa, y="SeriousDlqin2yrs", iv_limit=0.02)
dt_s.info()
corr = data.corr()
fig = plt.figure(figsize=(10,7))
ax1 = fig.add_subplot(1,1,1)
sns.heatmap(corr,annot = True,cmap = 'rainbow',ax = ax1)
n_sample=x.shape[0]
n_0_sample=y.value_counts()[0]
n_1_sample=y.value_counts()[1]
print("不良客户占比:",(n_1_sample/n_sample)*100,"%")
from sklearn.model_selection import train_test_split
x=pd.DataFrame(x)
y=pd.DataFrame(y)
x_train,x_vali,y_train,y_vali = train_test_split(x,y,test_size=0.3)
model_data = pd.concat([y_train,x_train],axis=1)
model_data.index =range(model_data.shape[0])
model_data.columns = data.columns
vali_data = pd.concat([y_vali,x_vali],axis=1)
vali_data.index =range(vali_data.shape[0])
vali_data.columns = data.columns
# Equal-frequency pre-binning of age into 20 bins; the qcut column and the
# updown bin edges used below were not defined above, so this step is assumed
model_data["qcut"], updown = pd.qcut(model_data["age"], retbins=True, q=20)
count_y0 = model_data[model_data['SeriousDlqin2yrs'] == 0].groupby(by="qcut").count()['SeriousDlqin2yrs']
count_y1 = model_data[model_data['SeriousDlqin2yrs'] == 1].groupby(by="qcut").count()['SeriousDlqin2yrs']
# Each pre-bin as (lower edge, upper edge, good count, bad count)
num_of_bins = [*zip(updown, updown[1:], count_y0, count_y1)]
columns = ["min","max","count_0","count_1"]
df=pd.DataFrame(num_of_bins,columns=columns)
df["total"]=df.count_0+df.count_1
df["percent"]=df.total/df.total.sum()
df["badrate"]=df.count_1/df.total
df["rate_good%"]=df.count_0/df.count_0.sum()
df["rate_bad%"]=df.count_1/df.count_1.sum()
df["woe"]=np.log(df["rate_good%"]/df["rate_bad%"])
df.head()
rate= df["rate_good%"] - df["rate_bad%"]
iv = np.sum(rate * df.woe)
print(rate)
print(df.woe)
print(iv)
def get_woe(num_of_bins):
    """Build a WOE table from (min, max, count_0, count_1) tuples."""
    columns = ["min", "max", "count_0", "count_1"]
    df = pd.DataFrame(num_of_bins, columns=columns)
    df["total"] = df.count_0 + df.count_1
    df["percent"] = df.total / df.total.sum()
    df["badrate"] = df.count_1 / df.total
    df["rate_good%"] = df.count_0 / df.count_0.sum()
    df["rate_bad%"] = df.count_1 / df.count_1.sum()
    df["woe"] = np.log(df["rate_good%"] / df["rate_bad%"])
    return df

def get_iv(df):
    """IV = sum over bins of (good% - bad%) * WOE."""
    rate = df["rate_good%"] - df["rate_bad%"]
    iv = np.sum(rate * df.woe)
    return iv
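# Rule-of-thumb IV thresholds (Siddiqi): below 0.02 unpredictive, 0.02-0.1
# weak, 0.1-0.3 medium, 0.3-0.5 strong; this convention is the basis for
# the iv_limit=0.02 passed to sc.var_filter above.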
n_bins=num_of_bins.copy()
import scipy.stats  # must be imported explicitly for scipy.stats.chi2_contingency
x1=n_bins[0][2:]
x2=n_bins[1][2:]
print(x1)
print(x2)
IV = []
axisx = []
# Repeatedly merge the two adjacent bins whose class distributions are most
# similar (largest chi-square p-value), recording IV at each bin count
while len(n_bins) > 2:
    pvalue = []
    for i in range(len(n_bins) - 1):
        x1 = n_bins[i][2:]
        x2 = n_bins[i + 1][2:]
        pv = scipy.stats.chi2_contingency([x1, x2])[1]
        pvalue.append(pv)
    i = pvalue.index(max(pvalue))
    n_bins[i:i + 2] = [(
        n_bins[i][0],
        n_bins[i + 1][1],
        n_bins[i][2] + n_bins[i + 1][2],
        n_bins[i][3] + n_bins[i + 1][3])]
    bins_df = get_woe(n_bins)
    axisx.append(len(n_bins))
    IV.append(get_iv(bins_df))
print(bins_df)
print(pvalue)
print(axisx)
print(IV)
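# auto_bins is called below but never defined in this section. The sketch
# here is an assumed implementation wrapping the chi-square merging loop
# above: equal-frequency pre-binning with pd.qcut, a guard that merges any
# bin missing one of the classes, then repeated merging of the most similar
# adjacent bins down to n bins, optionally plotting IV per bin count.
def auto_bins(DF, X, Y, n=5, q=20, graph=True):
    DF = DF[[X, Y]].copy()
    DF["qcut"], bins = pd.qcut(DF[X], retbins=True, q=q, duplicates="drop")
    count_y0 = DF.loc[DF[Y] == 0].groupby(by="qcut").count()[Y]
    count_y1 = DF.loc[DF[Y] == 1].groupby(by="qcut").count()[Y]
    num_of_bins = [*zip(bins, bins[1:], count_y0, count_y1)]
    # Merge any bin that contains only one class, since the chi-square
    # contingency test needs non-zero counts in both columns
    for _ in range(q):
        if 0 in num_of_bins[0][2:]:
            num_of_bins[0:2] = [(
                num_of_bins[0][0], num_of_bins[1][1],
                num_of_bins[0][2] + num_of_bins[1][2],
                num_of_bins[0][3] + num_of_bins[1][3])]
            continue
        for i in range(len(num_of_bins)):
            if 0 in num_of_bins[i][2:]:
                num_of_bins[i - 1:i + 1] = [(
                    num_of_bins[i - 1][0], num_of_bins[i][1],
                    num_of_bins[i - 1][2] + num_of_bins[i][2],
                    num_of_bins[i - 1][3] + num_of_bins[i][3])]
                break
        else:
            break
    IV, axisx = [], []
    bins_df = get_woe(num_of_bins)
    while len(num_of_bins) > n:
        pvalues = []
        for i in range(len(num_of_bins) - 1):
            pv = scipy.stats.chi2_contingency([num_of_bins[i][2:],
                                               num_of_bins[i + 1][2:]])[1]
            pvalues.append(pv)
        i = pvalues.index(max(pvalues))
        num_of_bins[i:i + 2] = [(
            num_of_bins[i][0], num_of_bins[i + 1][1],
            num_of_bins[i][2] + num_of_bins[i + 1][2],
            num_of_bins[i][3] + num_of_bins[i + 1][3])]
        bins_df = get_woe(num_of_bins)
        axisx.append(len(num_of_bins))
        IV.append(get_iv(bins_df))
    if graph:
        plt.figure()
        plt.xticks(axisx)
        plt.plot(axisx, IV)
        plt.xlabel("number of bins")
        plt.ylabel("IV")
        plt.show()
    return bins_df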
auto_bins(model_data,"age","SeriousDlqin2yrs",n=2,q=20,graph=True)
model_data.columns
for i in model_data.columns[1:-1]:
    print(i)
    auto_bins(model_data, i, "SeriousDlqin2yrs", n=2, q=20, graph=True)
auto_col_bins = {"RevolvingUtilizationOfUnsecuredLines":4,
"age":5,
"DebtRatio":4,
"MonthlyIncome":3,
"NumberOfOpenCreditLinesAndLoans":4}
hand_bins = {"NumberOfTime30-59DaysPastDueNotWorse":[-np.inf,1,2,np.inf],
"NumberOfTimes90DaysLate":[-np.inf,1,2,np.inf]
}
bins_of_col={}
for col in auto_col_bins:
    bins_df = auto_bins(model_data, col,
                        "SeriousDlqin2yrs",
                        n=auto_col_bins[col],
                        q=20,
                        graph=False)
    # Collect the unique bin edges and open both ends to +/- infinity
    bins_list = sorted(set(bins_df["min"]).union(bins_df["max"]))
    bins_list[0], bins_list[-1] = -np.inf, np.inf
    bins_of_col[col] = bins_list
    print(auto_col_bins[col])
    print(bins_df)
    print(bins_list)
bins_of_col.update(hand_bins)
bins_of_col
# Worked example on age before generalizing: cut with fixed edges, count
# each class per bin, then compute WOE (use a separate frame so the full
# dataset is not overwritten)
age_demo = data[["age", "SeriousDlqin2yrs"]].copy()
age_demo["cut"] = pd.cut(age_demo["age"], [-np.inf, 33.0, 53.0, 60.0, 73.0, np.inf])
age_demo
age_demo.groupby("cut")["SeriousDlqin2yrs"].value_counts().unstack()
bins_df = age_demo.groupby("cut")["SeriousDlqin2yrs"].value_counts().unstack()
bins_df["woe"] = np.log((bins_df[0] / bins_df[0].sum()) / (bins_df[1] / bins_df[1].sum()))
bins_df.head()
def new_woe(n_df, col, y, bins):
    """Map a feature to per-bin WOE values using fixed bin edges."""
    n_df = n_df[[col, y]].copy()
    n_df["cut"] = pd.cut(n_df[col], bins)
    bins_df = n_df.groupby("cut")[y].value_counts().unstack()
    woe = bins_df["woe"] = np.log((bins_df[0] / bins_df[0].sum()) / (bins_df[1] / bins_df[1].sum()))
    return woe
woeall ={}
for col in bins_of_col:
    woeall[col] = new_woe(model_data, col, "SeriousDlqin2yrs", bins_of_col[col])
woeall
# Build an empty frame indexed exactly like model_data, then fill it with
# the WOE value of the bin each observation falls into
model_woe = pd.DataFrame(index=model_data.index)
model_woe["age"] = pd.cut(model_data["age"], bins_of_col["age"]).map(woeall["age"])
model_woe.head()
for col in bins_of_col:
    model_woe[col] = pd.cut(model_data[col], bins_of_col[col]).map(woeall[col])
model_woe["SeriousDlqin2yrs"] = model_data["SeriousDlqin2yrs"]
model_woe.head()
vali_woe = pd.DataFrame(index=vali_data.index)
for col in bins_of_col:
    vali_woe[col] = pd.cut(vali_data[col], bins_of_col[col]).map(woeall[col])
vali_woe["SeriousDlqin2yrs"] = vali_data["SeriousDlqin2yrs"]
vali_x = vali_woe.iloc[:, :-1]
vali_y = vali_woe.iloc[:, -1]
x = model_woe.iloc[:, :-1]
y = model_woe.iloc[:, -1]
from sklearn.linear_model import LogisticRegression as LR
lr = LR().fit(x,y)
lr.score(vali_x,vali_y)
import scikitplot as skplt
vali_proba_df = pd.DataFrame(lr.predict_proba(vali_x))
skplt.metrics.plot_roc(vali_y,vali_proba_df,
plot_micro=False,figsize=(6,6),
plot_macro=False )
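# Supplementary check (not in the original flow): the same ROC summarized
# as a single AUC number
from sklearn.metrics import roc_auc_score
print(roc_auc_score(vali_y, vali_proba_df.iloc[:, 1]))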
# Scorecard scaling: score = A - B*log(odds), with a 20-point drop per
# doubling of the default odds (PDO = 20), anchored at 600 points for
# odds of 1:60
B = 20 / np.log(2)
A = 600 + B * np.log(1 / 60)
base_score = A - B * lr.intercept_[0]  # take the scalar intercept
print(B)
print(A)
print(base_score)
file="D:/database/Score2.csv"
with open(file,"w") as fdata:
fdata.write("base_score,{}\n".format(base_score))
for i,col in enumerate(x.columns):
score = woeall[col] * (-B*lr.coef_[0][i])
score.name = "Score"
score.index.name = col
score.to_csv(file,header=True, mode="a")
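# Sketch of applying the exported scorecard (a hypothetical helper built on
# the woeall bins and lr coefficients above, not part of the original): an
# applicant's total score is the base score plus the per-bin score of each feature.
def score_applicant(row):
    total = base_score
    for i, col in enumerate(x.columns):
        bin_label = pd.cut([row[col]], bins_of_col[col])[0]  # locate the bin
        total += woeall[col][bin_label] * (-B * lr.coef_[0][i])
    return total

print(score_applicant(vali_data.iloc[0]))  # score the first validation applicant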