import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
# Use the SimHei font so that Chinese labels render correctly.
# The font name must be a string; a bare SimHei is an undefined name.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Keep the minus sign rendering correctly on axes.
plt.rcParams['axes.unicode_minus'] = False
# Default size of the output figures.
plt.rcParams['figure.figsize'] = (10, 6)
任务:加载数据并分割测试集和训练集
# Read the two input tables from the working directory.
data = pd.read_csv('clear_data.csv')
train = pd.read_csv('train.csv')

# Features are the whole `data` table; labels are the 'Survived' column of
# `train`.
# NOTE(review): this assumes both files list the same rows in the same
# order -- confirm against the step that wrote clear_data.csv.
x, y = data, train['Survived']

# Stratify on the label so both splits keep the same class ratio; the fixed
# seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=42)
任务一:交叉验证
用10折交叉验证来评估之前的逻辑回归模型,并计算交叉验证精度的平均值。
from sklearn.model_selection import cross_val_score

# Evaluate a default logistic regression with 10-fold cross-validation on
# the training split; `scores` holds one score per fold.
lr = LogisticRegression()
scores = cross_val_score(estimator=lr, X=x_train, y=y_train, cv=10)

# Average of the per-fold scores (shown as the cell output in a notebook).
scores.mean()
任务二:混淆矩阵
计算二分类问题的混淆矩阵,计算精确率、召回率以及 F 分数。【思考】什么是二分类问题的混淆矩阵?理解这个概念,并了解它主要运用在什么任务中。
from sklearn.metrics import classification_report, confusion_matrix

# Fit on the training split and predict those same rows back, so every
# metric below is a training-set evaluation, not a held-out one.
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_train)

# Confusion matrix with rows/columns ordered as class 0, then class 1.
confusion_matrix(y_true=y_train, y_pred=y_pred, labels=[0, 1])

# NOTE(review): the literal sums look like confusion-matrix rows copied from
# an earlier run, compared by eye with the class counts -- they will not
# track the data if it changes; confirm.
y_train.value_counts(), 358 + 54, 78 + 178

# Per-class precision, recall and F-score as a text table.
print(classification_report(y_train, y_pred))

# Harmonic mean of 0.82 and 0.87.
# NOTE(review): presumably a hand check of an F-score using precision/recall
# read off the report above -- confirm which class these belong to.
2/(1/0.82 + 1/0.87)
任务三:ROC曲线
绘制ROC曲线。【思考】什么是ROC曲线?ROC曲线的存在是为了解决什么问题?
from sklearn.metrics import roc_curve

# ROC of the already-fitted `lr`, computed from its decision-function scores
# on the test split.
decision_scores = lr.decision_function(x_test)
fpr, tpr, thresholds = roc_curve(y_test, decision_scores)

plt.plot(fpr, tpr, label="ROC Curve")
plt.xlabel("FPR")
plt.ylabel("TPR(recall)")

# Find the threshold closest to zero and mark that point on the curve with
# a hollow black circle.
close_zero = np.abs(thresholds).argmin()
plt.plot(
    fpr[close_zero],
    tpr[close_zero],
    'o',
    markersize=10,
    label="threshold zero",
    fillstyle="none",
    color='k',
    markeredgewidth=2,
)

# loc=4 places the legend in the lower-right corner.
plt.legend(loc=4)
plt.title('LR ROC')
# `plot_roc_curve` was removed in scikit-learn 1.2. When it is missing, bind
# the same name to its replacement so the calls below work on old and new
# versions alike.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

# Three logistic-regression variants fitted on the same training split:
# defaults, C=1000 (much weaker regularization), and balanced class weights.
lr = LogisticRegression().fit(x_train, y_train)
lr1 = LogisticRegression(C=1000).fit(x_train, y_train)
lr2 = LogisticRegression(class_weight='balanced').fit(x_train, y_train)

# Draw the first curve on a fresh axes, then overlay the others on that same
# axes so the three models can be compared directly.
lr_display = plot_roc_curve(lr, x_test, y_test, name='LR', response_method='decision_function')
lr1_display = plot_roc_curve(lr1, x_test, y_test, name='LR1', response_method='decision_function', ax=lr_display.ax_)
# Plot lr2 here: the original passed lr1 again, so the 'LR2' curve was just a
# duplicate of 'LR1' and the balanced model was never shown.
lr2_display = plot_roc_curve(lr2, x_test, y_test, name='LR2', response_method='decision_function', ax=lr_display.ax_)