1. 数据导入
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size=14) # default font size for all plots
import seaborn as sns
sns.set(style="white") # white background for seaborn plots
sns.set(style="whitegrid", color_codes=True) # overrides the previous call: gridded style + color codes
import warnings
warnings.simplefilter(action='ignore')
# Load the training set
train_df = pd.read_csv("./input/train.csv")
# Load the test set
test_df = pd.read_csv("./input/test.csv")
# Preview the training set
train_df.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Hdoh11HZ-1678272362728)(./image/1.png)]
# Report the number of rows in each split.
n_train_rows = train_df.shape[0]
n_test_rows = test_df.shape[0]  # the test split has no target column ("Survived" is absent)
print('训练集数据量 {}.'.format(n_train_rows))
print('测试集数据量 {}.'.format(n_test_rows))
训练集数据量 891.
测试集数据量 418.
2. 数据预处理
# Count missing values per column (isna is an alias of isnull).
train_df.isna().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
2.1. Age - 缺失数据处理
# Percent of missing "Age" values in the training set.
miss_age = train_df['Age'].isnull().sum() # number of missing Age entries
train_count = train_df.shape[0] # number of training rows
print('缺失age 数据占比: %.2f%%' %((miss_age/train_count)*100))
# Histogram of Age. Parameters:
#   bins    - number of histogram bins
#   density - True normalizes the counts so the y-axis shows frequencies
#   color   - bar color
#   alpha   - transparency (0 = fully transparent)
# (The original held these notes in a bare triple-quoted string, which is a
# no-op expression statement, not a comment — replaced with real comments.)
ax = train_df["Age"].hist(bins=15, density=True, color='teal', alpha=0.8)
train_df["Age"].plot(kind='density', color='red') # overlay a KDE curve
ax.set(xlabel='Age') # x-axis label
plt.xlim(-10,85) # x-axis range
plt.show()
缺失age 数据占比: 19.87%
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-qItxLcek-1678272362729)(titanic-lr_files/titanic-lr_8_1.png)]
# Compare mean and median of Age (NaNs skipped) to pick an imputation value.
age_mean = train_df["Age"].mean(skipna=True)
age_median = train_df["Age"].median(skipna=True)
print('平均值 "Age" is %.2f' % age_mean)
print('中位数 "Age" is %.2f' % age_median)
平均值 "Age" is 29.70
中位数 "Age" is 28.00
因为age 数据集中在20~40 比较集中,平均值稍高于中位数, 可以选择中位数作为填充值
2.2. Cabin - 客舱号
# "Cabin" 缺失百分比
print('Cabin 缺失率 %.2f%%' %((train_df['Cabin'].isnull().sum()/train_df.shape[0])*100))
Cabin 缺失率 77.10%
缺失率较高,即使使用固定值进行填充,也会导致该维度方差较低、数据区分不明显,因此直接删除该列
2.3. Embarked - Missing Values
# "Embarked" 缺失率
print('Embarked 缺失率 %.2f%%' %((train_df['Embarked'].isnull().sum()/train_df.shape[0])*100))
Embarked 缺失率 0.22%
缺失率较低,可以使用多数登船口进行缺失值替换
# Distribution of embarkation ports, as counts and as a bar chart.
print('登船口 (C = Cherbourg, Q = Queenstown, S = Southampton):')
port_counts = train_df['Embarked'].value_counts()
print(port_counts)
sns.countplot(x='Embarked', data=train_df, palette='Set2') # per-port observation counts
plt.show()
print('最多登船口 %s.' % port_counts.idxmax())
登船口 (C = Cherbourg, Q = Queenstown, S = Southampton):
S 644
C 168
Q 77
Name: Embarked, dtype: int64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-cFZXxUTb-1678272362729)(titanic-lr_files/titanic-lr_17_1.png)]
最多登船口 S.
可以使用S, 替换缺失登船口的用户
2.4. 缺失值替换
# Build a cleaned copy of the training data.
train_data = train_df.copy()
# NOTE: column-level `fillna(..., inplace=True)` operates on an intermediate
# Series — it raises FutureWarning in pandas 2.x and becomes a no-op under
# copy-on-write (pandas 3.0) — so assign the result back instead.
# Fill missing Age with the training-set median.
train_data["Age"] = train_data["Age"].fillna(train_df["Age"].median(skipna=True))
# Fill missing Embarked with the most frequent port ("S").
train_data["Embarked"] = train_data["Embarked"].fillna(train_df['Embarked'].value_counts().idxmax())
# Drop Cabin: ~77% missing, too sparse to be informative.
train_data = train_data.drop('Cabin', axis=1)
# Re-check: no missing values should remain in the adjusted data.
train_data.isnull().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Embarked 0
dtype: int64
# Preview the adjusted (cleaned) training data
train_data.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S |
# Collapse SibSp (siblings/spouse) and Parch (parents/children) into a single
# "travelling alone" indicator to reduce model complexity.
companions = train_data["SibSp"] + train_data["Parch"]
train_data['TravelAlone'] = np.where(companions > 0, 0, 1)
train_data.drop(['SibSp', 'Parch'], axis=1, inplace=True)
# One-hot encode the categorical columns.
training = pd.get_dummies(train_data, columns=["Pclass","Embarked","Sex"])
# Sex_male alone carries the sex information, so Sex_female is redundant;
# also drop identifier-like columns with no predictive signal
# (passenger id, name, ticket number).
training.drop(['Sex_female', 'PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
final_train = training
final_train.head()
Survived | Age | Fare | TravelAlone | Pclass_1 | Pclass_2 | Pclass_3 | Embarked_C | Embarked_Q | Embarked_S | Sex_male | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 22.0 | 7.2500 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
1 | 1 | 38.0 | 71.2833 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
2 | 1 | 26.0 | 7.9250 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
3 | 1 | 35.0 | 53.1000 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 0 | 35.0 | 8.0500 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
测试集与训练集(train)对齐
test_df.isna().sum()  # missing values per column (isna is an alias of isnull)
PassengerId 0
Pclass 0
Name 0
Sex 0
Age 86
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 327
Embarked 0
dtype: int64
# Apply the same cleaning to the test set, imputing with TRAINING-set
# statistics (median Age/Fare) so no information leaks from the test split.
test_data = test_df.copy()
# Column-level `fillna(..., inplace=True)` is deprecated (FutureWarning in
# pandas 2.x, no-op under copy-on-write) — assign the result back instead.
test_data["Age"] = test_data["Age"].fillna(train_df["Age"].median(skipna=True))
test_data["Fare"] = test_data["Fare"].fillna(train_df["Fare"].median(skipna=True))
test_data = test_data.drop('Cabin', axis=1)
# Travelling-alone indicator, mirroring the training-set feature.
test_data['TravelAlone'] = np.where((test_data["SibSp"] + test_data["Parch"]) > 0, 0, 1)
test_data = test_data.drop(['SibSp', 'Parch'], axis=1)
# One-hot encode and drop the same redundant/identifier columns as in training.
testing = pd.get_dummies(test_data, columns=["Pclass","Embarked","Sex"])
testing = testing.drop(['Sex_female', 'PassengerId', 'Name', 'Ticket'], axis=1)
final_test = testing
final_test.head()
Age | Fare | TravelAlone | Pclass_1 | Pclass_2 | Pclass_3 | Embarked_C | Embarked_Q | Embarked_S | Sex_male | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 34.5 | 7.8292 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
1 | 47.0 | 7.0000 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
2 | 62.0 | 9.6875 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
3 | 27.0 | 8.6625 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
4 | 22.0 | 12.2875 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
# Add a minor-age (Age <= 16) indicator to both splits.
final_train['IsMinor'] = (final_train['Age'] <= 16).astype(int)
final_test['IsMinor'] = (final_test['Age'] <= 16).astype(int)
final_test.head()
Age | Fare | TravelAlone | Pclass_1 | Pclass_2 | Pclass_3 | Embarked_C | Embarked_Q | Embarked_S | Sex_male | IsMinor | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 34.5 | 7.8292 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
1 | 47.0 | 7.0000 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
2 | 62.0 | 9.6875 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
3 | 27.0 | 8.6625 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 |
4 | 22.0 | 12.2875 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
3. Logistic Regression
3.1. 特征选择
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
# Feature selection with RFE (recursive feature elimination): repeatedly fit
# the model and remove the weakest feature(s) until the requested count
# remains, ranking features by their contribution to the model.
cols = ["Age","Fare","TravelAlone","Pclass_1","Pclass_2","Embarked_C","Embarked_S","Sex_male","IsMinor"]
X = final_train[cols] # feature matrix
y = final_train['Survived'] # target labels
# LogisticRegression parameters (the original kept these notes in bare
# triple-quoted strings — no-op expression statements — with some wrong facts;
# corrected real comments instead):
#   penalty - regularization type, 'l1' or 'l2' (default 'l2')
#   C       - inverse of the regularization strength; smaller = stronger penalty
#   solver  - optimizer: 'liblinear' (coordinate descent), 'lbfgs'/'newton-cg'
#             (quasi-Newton / Newton methods using the Hessian), 'sag'/'saga'
#             (stochastic average gradient variants, suited to large datasets).
#             The default is 'lbfgs' in scikit-learn >= 0.22 ('liblinear' before).
#   max_iter - maximum solver iterations; default is 100 (not 10, and it is an
#             iteration cap, not a learning rate). Relevant for the iterative
#             solvers (newton-cg, sag, saga, lbfgs).
lr_model = LogisticRegression(penalty='l2')
# Keep the 8 strongest features, eliminating 1 feature per iteration.
rfe = RFE(lr_model, n_features_to_select= 8, step = 1)
rfe = rfe.fit(X, y)
# Report which features survived elimination.
print('选择的特征: %s' % list(X.columns[rfe.support_]))
选择的特征: ['Age', 'TravelAlone', 'Pclass_1', 'Pclass_2', 'Embarked_C', 'Embarked_S', 'Sex_male', 'IsMinor']
Selected_features = ['Age', 'TravelAlone', 'Pclass_1', 'Pclass_2', 'Embarked_C',
       'Embarked_S', 'Sex_male', 'IsMinor']
X = final_train[Selected_features]  # training feature matrix
# Pearson correlation heatmap of the selected features.
corr_matrix = X.corr()
plt.subplots(figsize=(8, 5))
sns.heatmap(corr_matrix, annot=True, cmap="RdYlGn")
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Fm4Lm1Sf-1678272362730)(titanic-lr_files/titanic-lr_31_0.png)]
模型训练
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc, log_loss
# Train logistic regression on the full training set and score the test set.
Selected_features = ['Age', 'TravelAlone', 'Pclass_1', 'Pclass_2', 'Embarked_C',
       'Embarked_S', 'Sex_male', 'IsMinor']
X = final_train[Selected_features] # training features
y = final_train['Survived'] # training labels
# Test features and labels.
# NOTE(review): gender_submission.csv is Kaggle's sample submission, not real
# ground truth, so the metrics below are optimistic — confirm intent.
X_test = final_test[Selected_features]
y_test_df = pd.read_csv("./input/gender_submission.csv")
y_test = y_test_df['Survived'] # test labels
# Fit the model.
logreg = LogisticRegression()
logreg.fit(X, y)
y_pred = logreg.predict(X_test) # hard class predictions on the test set
y_pred_proba = logreg.predict_proba(X_test)[:, 1] # P(Survived = 1)
fpr, tpr, thr = roc_curve(y_test, y_pred_proba) # false / true positive rates
# Accuracy, recall, AUC.
print("accuracy is %2.3f" % accuracy_score(y_test, y_pred))
print("recall is %2.3f" % recall_score(y_test, y_pred))
print("auc is %2.3f" % auc(fpr, tpr))
# First threshold index where sensitivity (TPR) exceeds 0.95.
# np.argmax on the boolean mask returns the first True position directly;
# the original np.min(np.where(...)) built an index array just to take its min.
idx = np.argmax(tpr > 0.95)
plt.figure()
plt.plot(fpr, tpr, color='coral', label='ROC curve (area = %0.3f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], 'k--')
# Guide lines at the chosen operating point. The original passed both the fmt
# string 'k--' (black) and color='blue', a conflicting spec matplotlib warns
# about; use explicit linestyle/color instead.
plt.plot([0, fpr[idx]], [tpr[idx], tpr[idx]], linestyle='--', color='blue')
plt.plot([fpr[idx], fpr[idx]], [0, tpr[idx]], linestyle='--', color='blue')
plt.xlim([0.0, 1.0]) # x-axis range
plt.ylim([0.0, 1.05]) # y-axis range
plt.xlabel('fpr', fontsize=14)
plt.ylabel('tpr(recall)', fontsize=14)
plt.title('ROC curve')
plt.legend(loc="lower right")
plt.show()
accuracy is 0.947
recall is 0.928
auc is 0.989
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-z52bPzLX-1678272362731)(titanic-lr_files/titanic-lr_33_1.png)]
交叉验证
from sklearn.model_selection import cross_validate
# 10-fold cross-validation over several metrics. scikit-learn reports
# 'neg_log_loss' negated (so that larger is always better); flip its sign
# back when printing so log-loss reads as a positive quantity.
scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}
modelCV = LogisticRegression()
results = cross_validate(modelCV, X, y, cv=10, scoring=list(scoring.values()),
                         return_train_score=False)
print('K-fold cross-validation results:')
# Iterate the dict directly instead of indexing rebuilt key/value lists
# (the original looped over range(len(scoring)) and called
# list(scoring.keys())/list(scoring.values()) repeatedly per iteration).
for display_name, scorer in scoring.items():
    scores = results['test_%s' % scorer]
    mean_score = -scores.mean() if scorer == 'neg_log_loss' else scores.mean()
    print("average %s: %.3f (+/-%.3f)" % (display_name, mean_score, scores.std()))
K-fold cross-validation results:
average accuracy: 0.795 (+/-0.025)
average log_loss: 0.454 (+/-0.037)
average auc: 0.850 (+/-0.028)