from sklearn.datasets import make_classification
from sklearn.calibration import calibration_curve # 校准曲线
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
import matplotlib.pyplot as plt
# Synthetic binary-classification dataset: 10,000 samples with 20 features,
# 10 of which carry useful information.
x, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=10,
    n_classes=2,
    random_state=42,
)

# Hold out 20% of the samples as the test split; the seed is fixed so the
# split is the same on every run.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
# Fit each classifier on the training split, then print how long the fit
# took and the accuracy of its predictions on the test split.
for name, model in {
    '随机森林': RandomForestClassifier(),
    '高斯贝叶斯': GaussianNB(),
    '逻辑回归': LogisticRegression(C=0.1),
    '支持向量机': SVC(),
}.items():
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short intervals; differences of time.time() can be skewed
    # by system clock adjustments.
    start = time.perf_counter()
    model.fit(xtrain, ytrain)
    end = time.perf_counter()
    print(f'{name}用时{end-start};正确率{accuracy_score(ytest,model.predict(xtest))}')
# Draw a 2x4 grid of calibration curves: one row per redundant-feature
# count, one column per classifier.
# NOTE(review): the Chinese title/legend text presumably needs a CJK-capable
# font configured in matplotlib to render correctly -- confirm for the
# target environment.
fig, axs = plt.subplots(2, 4)
fig.suptitle('不同算法下的校准曲线')
for row, n_redundant in enumerate([5, 15]):
    # Regenerate the dataset with the requested number of redundant features.
    x, y = make_classification(
        n_samples=10000,
        n_features=20,
        n_redundant=n_redundant,
        n_classes=2,
        random_state=42,
    )
    # Split the freshly generated data. Previously the models were fitted on
    # the train/test split created before this loop, so the regenerated data
    # was never used and both rows showed the same curves.
    x_fit, x_eval, y_fit, y_eval = train_test_split(
        x, y, test_size=0.2, random_state=42
    )
    # Fresh, unfitted estimators for every dataset. SVC needs
    # probability=True so that predict_proba is available.
    classifiers = {
        '随机森林': RandomForestClassifier(),
        '高斯贝叶斯': GaussianNB(),
        '逻辑回归': LogisticRegression(C=0.1),
        '支持向量机': SVC(probability=True),
    }
    for col, (name, model) in enumerate(classifiers.items()):
        model.fit(x_fit, y_fit)
        # Column 1 of predict_proba is the predicted probability of class 1.
        prob_true, prob_pred = calibration_curve(
            y_eval, model.predict_proba(x_eval)[:, 1], n_bins=10
        )
        ax = axs[row, col]
        ax.plot([0, 1], [0, 1])  # diagonal reference line for comparison
        ax.plot(
            prob_pred,
            prob_true,
            linestyle='dashdot',
            label=f'{name}(冗余特征{n_redundant})',
        )
        ax.legend(loc='upper center')
plt.show()
# The results are shown below: