利用随机函数产生100个测试样本的0/1标签(label),并随机产生分类模型对应的预测结果,即样本被判定为阳性的概率。
目标:请用以上数据,完成以下内容。
(1)计算阈值为0.5时的混淆矩阵并绘制其热力图
(2)绘制P-R曲线
(3)绘制ROC曲线
(4)模拟随意两个模型的10折交叉验证,并利用T检验判断模型优劣
代码如下:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
# ---------------------------------------------------------------------------
# Simulated data: 100 binary labels plus the positive-class probabilities
# "predicted" by two imaginary classifiers.
# ---------------------------------------------------------------------------
np.random.seed(1)
label = np.random.randint(2, size=(100, 1))


def _simulate_scores(labels, positive_low):
    """Draw one fake positive-class probability per label.

    Positives are drawn from U(positive_low, 1) and negatives from U(0, 0.7),
    so positives lean towards 1 and negatives towards 0 and the resulting
    curves look plausible.  One ``np.random.uniform`` call is made per
    sample, in sample order, so the random stream is consumed sequentially.

    Args:
        labels: 1-D sequence of 0/1 labels.
        positive_low: lower bound of the uniform range used for positives.

    Returns:
        1-D ``numpy.ndarray`` with one score per label.
    """
    return np.array([
        np.random.uniform(positive_low, 1) if y == 1
        else np.random.uniform(0, 0.7)
        for y in labels
    ])


# Model 1: positives in [0.25, 1).
classifier_score1 = _simulate_scores(label[:, 0], 0.25)

# Model 2 (intended to classify better): positives in [0.36, 1).
# The generator is re-seeded before these draws.
np.random.seed(1)
classifier_score2 = _simulate_scores(label[:, 0], 0.36)

# Put labels and scores in one frame.  Resulting column order is
# ['label', 0, 'classifier_score1', 'classifier_score2']: the unnamed column 0
# is the copy of the labels created by the constructor, and the code further
# down reads the frame by position (0 = label, 2 = model 1, 3 = model 2), so
# this layout must not change.
df = pd.DataFrame(data=label)
df.insert(1, 'classifier_score1', classifier_score1)
df.insert(2, 'classifier_score2', classifier_score2)
# Insert the labels as a 1-D column instead of relying on pandas to squeeze a
# (100, 1) array.
df.insert(0, 'label', label[:, 0])
# ---------------------------------------------------------------------------
# Mission 1: confusion matrices at threshold 0.5, drawn as heat maps.
# ---------------------------------------------------------------------------


def _confusion_counts(labels, scores, threshold=0.5):
    """Count the four confusion-matrix cells for one classifier.

    A sample is predicted positive when its score is strictly greater than
    ``threshold`` and predicted negative otherwise, so every sample with a
    0/1 label lands in exactly one cell.

    Args:
        labels: 0/1 ground-truth labels.
        scores: predicted positive-class probabilities, same length.
        threshold: decision threshold.

    Returns:
        Tuple ``(TP, FN, FP, TN)`` of Python ints.
    """
    labels = np.asarray(labels).ravel()
    predicted_positive = np.asarray(scores, dtype=float).ravel() > threshold
    actual_positive = labels == 1
    actual_negative = labels == 0
    tp = int(np.sum(actual_positive & predicted_positive))
    fn = int(np.sum(actual_positive & ~predicted_positive))
    fp = int(np.sum(actual_negative & predicted_positive))
    tn = int(np.sum(actual_negative & ~predicted_positive))
    return tp, fn, fp, tn


TP, FN, FP, TN = _confusion_counts(df['label'], df['classifier_score1'])
TP1, FN1, FP1, TN1 = _confusion_counts(df['label'], df['classifier_score2'])

# Rows are the true class, columns the predicted class:
#                   predicted positive   predicted negative
#   true positive          TP                   FN
#   true negative          FP                   TN
name_co = ['predicted positive', 'predicted negative']
name_idx = ['true positive', 'true negative']
# Confusion matrix of model 1.
confusion_matrix = pd.DataFrame(
    [[TP, FN], [FP, TN]], columns=name_co, index=name_idx)
# Confusion matrix of model 2.
confusion_matrix2 = pd.DataFrame(
    [[TP1, FN1], [FP1, TN1]], columns=name_co, index=name_idx)

# Use a font with CJK glyphs so the Chinese titles render.
plt.rcParams['font.sans-serif'] = 'SimHei'
f = plt.figure(figsize=(12, 12))
ax1 = f.add_subplot(2, 2, 1)
# Integer format: the cells are counts, and '.2g' would print 100 as 1e+02.
sns.heatmap(confusion_matrix, annot=True, fmt='d', ax=ax1)
ax1.set_title('模型1')
ax2 = f.add_subplot(2, 2, 2)
sns.heatmap(confusion_matrix2, annot=True, fmt='d', ax=ax2)
ax2.set_title('模型2')
# ---------------------------------------------------------------------------
# Missions 2-3: P-R curve and ROC curve of both models.
# ---------------------------------------------------------------------------


def _pr_roc_points(labels, scores):
    """Compute precision, recall and false-positive rate at every threshold.

    Each sample's own score is used once as the threshold.  At threshold t a
    sample is predicted positive when ``score >= t`` -- the same rule for
    both classes -- so the sample that supplies the threshold is always
    predicted positive and ``TP + FP`` is at least 1 for 0/1 labels.

    Args:
        labels: 0/1 ground-truth labels.
        scores: predicted positive-class probabilities, same length and
            order as ``labels``.

    Returns:
        Tuple ``(precision, recall, false_positive_rate)``; each is an
        ``(n, 1)`` float array whose row k belongs to threshold
        ``scores[k]``.  If one class is absent the corresponding rate is
        NaN (numpy division by zero).
    """
    labels = np.asarray(labels).ravel()
    scores = np.asarray(scores, dtype=float).ravel()
    # Row k of this matrix applies threshold scores[k] to every sample.
    predicted_positive = scores[np.newaxis, :] >= scores[:, np.newaxis]
    actual_positive = labels == 1
    actual_negative = labels == 0
    tp = (predicted_positive & actual_positive).sum(axis=1)
    fp = (predicted_positive & actual_negative).sum(axis=1)
    precision = tp / (tp + fp)
    recall = tp / actual_positive.sum()
    false_positive_rate = fp / actual_negative.sum()
    return (precision.reshape(-1, 1),
            recall.reshape(-1, 1),
            false_positive_rate.reshape(-1, 1))


# Sort by descending score so that successive thresholds trace each curve
# from low recall to high recall.
# Model 1, sorted.
df1 = df.sort_values(by="classifier_score1", ascending=False)
df1.index = range(len(df1))
# Model 2, sorted.
df2 = df.sort_values(by="classifier_score2", ascending=False)
df2.index = range(len(df2))

vara_P, vara_R, fp_rate = _pr_roc_points(
    df1['label'], df1['classifier_score1'])
vara_P1, vara_R1, fp_rate1 = _pr_roc_points(
    df2['label'], df2['classifier_score2'])

# P-R curves of both models in one panel for easy comparison.
ax3 = f.add_subplot(2, 2, 3)
ax3.plot(vara_R.ravel(), vara_P.ravel(), 'b-', label='模型1')
ax3.plot(vara_R1.ravel(), vara_P1.ravel(), 'r-', label='模型2')
ax3.set_ylabel('precision')
ax3.set_xlabel('recall')
ax3.set_title('分类模型1,分类模型2的P-R曲线')
ax3.legend()

# ROC curves of both models in one panel for easy comparison.
ax4 = f.add_subplot(2, 2, 4)
ax4.plot(fp_rate.ravel(), vara_R.ravel(), 'b-', label='模型1')
ax4.plot(fp_rate1.ravel(), vara_R1.ravel(), 'r-', label='模型2')
ax4.set_ylabel('true positive rate')
ax4.set_xlabel('false positive rate')
ax4.set_title('分类模型1,分类模型2的ROC曲线')
ax4.legend()
# ---------------------------------------------------------------------------
# Mission 4: simulated 10-fold cross-validation, then a t-test per model.
# The model-2 score is used as the single feature and `label` as the target.
# Model 1 here is logistic regression, model 2 a decision tree.
# ---------------------------------------------------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold


def _t_statistic(fold_scores, mu):
    """Return the one-sample t statistic of ``fold_scores`` against ``mu``.

    t = (mean - mu) / (s / sqrt(n)), where s is the sample standard
    deviation (n - 1 in the denominator) computed around the actual sample
    mean.  When all scores are identical (s == 0) the statistic is 0.0 if
    the mean equals ``mu`` and +/-infinity otherwise, instead of dividing by
    zero.
    """
    fold_scores = np.asarray(fold_scores, dtype=float)
    mean = float(fold_scores.mean())
    std = float(fold_scores.std(ddof=1))
    if std == 0:
        diff = mean - mu
        return 0.0 if diff == 0 else math.copysign(math.inf, diff)
    return (mean - mu) / (std / math.sqrt(fold_scores.size))


clf1 = LogisticRegression(random_state=1)
clf2 = DecisionTreeClassifier(random_state=1)
# Stratified folds keep the class ratio of every fold as even as possible.
sfolder = StratifiedKFold(n_splits=10, random_state=2, shuffle=True)

_features = np.asarray(classifier_score2).reshape(-1, 1)
_targets = np.asarray(label).reshape(-1)
fold_scores1 = []  # per-fold accuracy of logistic regression
fold_scores2 = []  # per-fold accuracy of the decision tree
for train_index, test_index in sfolder.split(_features, _targets):
    x_train, x_test = _features[train_index], _features[test_index]
    y_train, y_test = _targets[train_index], _targets[test_index]
    fold_scores1.append(clf1.fit(x_train, y_train).score(x_test, y_test))
    fold_scores2.append(clf2.fit(x_train, y_train).score(x_test, y_test))

# Sample means of the fold accuracies.
xhat_model1 = float(np.mean(fold_scores1))
xhat_model2 = float(np.mean(fold_scores2))
print('逻辑回归—10折交叉验证分数:', xhat_model1,
      "\n"'决策树—10折交叉验证分数:', xhat_model2)

# Student's t-test of each model's mean accuracy against a reference mean.
μ = 0.85
# Sample standard deviations, measured around the real fold means rather
# than around hard-coded constants.
S = float(np.std(fold_scores1, ddof=1))
S1 = float(np.std(fold_scores2, ddof=1))
T_1 = _t_statistic(fold_scores1, μ)
T_2 = _t_statistic(fold_scores2, μ)
print('t1:', T_1, 't2:', T_2)
# Hypotheses (one-sided, α = 0.05, 10 folds -> 9 degrees of freedom):
#   H0: μ >= 0.85
#   H1: μ <  0.85, rejection region t <= -1.833 (from the t table).
# NOTE(review): the original notes state that model 1 rejected H0 and model 2
# kept it; that was obtained with hard-coded means in the variance, so
# re-check the outcome on the printed t values.
# NOTE(review): two separate one-sample tests is the design of this exercise;
# a paired t-test on the per-fold differences is the more usual way to
# compare two models -- confirm which one is wanted.
def model_comparison(T1, T2):
    """Judge two models from their one-sample t statistics and print why.

    Each statistic is compared with the one-sided critical value -1.833
    (the value this file uses for α = 0.05): ``t <= -1.833`` rejects H0
    (μ >= 0.85) for that model, anything larger keeps H0.  The model that
    keeps H0 while the other rejects it is reported as the better one.  When
    both models reach the same decision, or when a statistic is NaN (the
    test is undefined), no ranking is possible.

    Args:
        T1: t statistic of model 1 (logistic regression).
        T2: t statistic of model 2 (decision tree).

    Returns:
        1 if model 1 is judged better, 2 if model 2 is judged better,
        0 if the models cannot be ranked.
    """
    critical_value = -1.833
    print('α取0.05,作两个假设H0:μ⩾0.85 H1:μ<0.85')

    decisions = []  # per model: True = rejects H0, False = keeps H0, None = NaN
    for number, t_value in ((1, T1), (2, T2)):
        if math.isnan(t_value):
            # NaN compares false with everything; treat it explicitly as
            # "no decision" so it can never produce a verdict.
            decisions.append(None)
        elif t_value <= critical_value:
            print('模型' + str(number) + '接受备择假设')
            decisions.append(True)
        else:
            print('模型' + str(number) + '接受原假设')
            decisions.append(False)

    rejects1, rejects2 = decisions
    # Rank the models.
    if rejects1 is None or rejects2 is None or rejects1 == rejects2:
        print('无法判断模型优劣,请修改总体均值μ')
        return 0
    if rejects1:
        # Model 1 fell below the reference mean, model 2 did not.
        print("决策树优于逻辑回归")
        return 2
    print("逻辑回归优于决策树")
    return 1
# Print the verdict for the two t statistics T_1 and T_2 computed above.
model_comparison(T_1,T_2)
# If anything is unclear, please get in touch with me. Thanks!