1. 提出问题
泰坦尼克号共2224个人,沉船后只有772人存活,生存率仅有32%。尝试建立机器学习模型,通过分析乘客的个人信息,预测个人的存活率,并用测试数据评测模型的预测准确率。
2. 数据处理
首先从网络上下载泰坦尼克号的训练数据与测试数据v。利用pd.read_csv将数据导入。
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import warnings
import matplotlib as mpl
data_train=pd.read_csv("C:/Users/28555/Desktop/train.csv")
data_test=pd.read_csv("C:/Users/28555/Desktop/test.csv")
3. 存活率可视化的实现
乘客的可用信息包括性别,年龄,客舱等,而我们的可视化分析主要就分析这三个方面,代码如下:
data_train_age = data_train[data_train['Age'].notnull()]
plt.figure(figsize=(8,3))
data_train_age['Age'].hist(bins=70)
plt.xlabel('Age')
plt.ylabel('Num')
plt.show()
bins = [0,6, 12, 20,39,59,100]
group_names = ['infant', 'child', 'teen',"prime","middle","old"]
data_train['categories'] = pd.cut(data_train['Age'], bins, labels = group_names)
mpl.rcParams['font.family']='DFKai-SB' # 修改了全局变量
plt.style.use('grayscale')
s_pclass= data_train['Survived'].groupby(data_train['categories'])
s_pclass = s_pclass.value_counts().unstack()
fig = s_pclass.plot(kind='bar',stacked = True, colormap='tab20c',title='mortality rate of age',fontsize=20)
fig.axes.title.set_size(20)
plt.show()
mpl.rcParams['font.family']='DFKai-SB' # 修改了全局变量
plt.style.use('grayscale')
s_pclass= data_train['Survived'].groupby(data_train['Pclass'])
s_pclass = s_pclass.value_counts().unstack()
s_sex = data_train['Survived'].groupby(data_train['Sex'])
s_sex = s_sex.value_counts().unstack()
fig = s_sex.plot(kind='bar',stacked = True, colormap='tab20c',title=' mortality rate of sex',fontsize=20)
plt.show()
fig = s_pclass.plot(kind='bar',stacked = True, colormap='tab20c',title='mortality rate of pclass',fontsize=20)
fig.axes.title.set_size(20)
fig,ax = plt.subplots(1,2, figsize = (9,4))
sns.violinplot("Pclass","Age",hue="Survived",data=data_train_age,split=True,ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0,110,10))
sns.violinplot("Sex","Age",hue="Survived",data=data_train_age,split=True,ax=