泰坦尼克号数据之生还率分析

韩立 •

已于 2022-07-15 21:34:39 修改

阅读量1.6k

点赞数 2

分类专栏：数据分析项目练手　文章标签：python pandas 机器学习

于 2022-06-20 21:46:12 首次发布

本文链接：https://blog.csdn.net/qq_44386182/article/details/125381098

版权

生存率 船舱等级 性别 年龄 家庭成员

关键词由CSDN通过智能技术生成

数据分析项目练手专栏收录该内容

5 篇文章

订阅专栏

1 首先统计总的死亡人数和幸存人数对比（柱状图和饼图）

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei'] # SimHei是黑体的意思


# Load the Titanic training set; 'train.csv' is read from the working directory.
df = pd.read_csv('train.csv')

# DataFrame.info is a method: it must be called, and it prints its own
# summary, so it is not wrapped in print().
df.info()
print(df.head())

# Overall head-counts. The code below treats Survived == 1 as a survivor and
# Survived == 0 as a victim, so summing the column counts the survivors.
total_survived_sum = df['Survived'].sum()
total_nosurvived_sum = df['Survived'][df['Survived'] == 0].count()
print("幸存者为%d,遇难者为%d" % (total_survived_sum, total_nosurvived_sum))

# Bar chart of the two totals, each bar annotated with its value.
plt.bar([1, 0], [total_survived_sum, total_nosurvived_sum], width=0.5, alpha=0.5)
plt.xticks([1, 0], ['survived', 'no_survived'])
plt.title('总体生存和死亡人数')
for i, j in zip([1, 0], [total_survived_sum, total_nosurvived_sum]):
    print(i, j)
    print('%i' % j)
    plt.text(i, j, '%i' % j, ha='center', va='bottom')
plt.show()

# Pie chart of the same two totals as percentages.
plt.pie([total_nosurvived_sum, total_survived_sum], labels=['no survived', 'survived'], autopct='%1.0f%%')
plt.title('Survival rate')
plt.show()

2 船舱等级因素对幸存的影响

a 首先获得各船舱人数（船舱等级为1，2，3（数字越小，档次越高））

# Passenger head-count for each cabin class (Pclass).
# groupby(...).count() yields a DataFrame; its 'Survived' column holds the
# number of non-null rows in each class.
number = df[['Pclass', 'Survived']].groupby(['Pclass']).count()
number.index = ['1', '2', '3']
print(number)

class_positions = [1, 2, 3]
class_counts = number['Survived']

# Left panel: bar chart annotated with the count above each bar.
plt.subplot(121)
plt.bar(class_positions, class_counts, width=0.5, alpha=0.5)
plt.xticks(class_positions, number.index)
plt.title('1,2,3各舱人数')
for xpos, count in zip(class_positions, class_counts):
    print(xpos, count)
    print('%i' % count)
    plt.text(xpos, count, '%i' % count, ha='center', va='bottom')

# Right panel: the same counts as shares of all passengers.
plt.subplot(122)
plt.pie(class_counts, labels=['1', '2', '3'], autopct='%1.0f%%')
plt.title('1,2,3各舱人数比例')
plt.show()

再分析一下各舱幸存人数，以及各舱幸存人数占幸存总人数的比例

# Survivors per cabin class.
# Keep only the rows with Survived == 1; summing 'Survived' per class then
# gives the number of survivors in that class.
survived_df = df.loc[df['Survived'] == 1]
number2 = survived_df[['Pclass', 'Survived']].groupby('Pclass').sum()
print(number2)  # number2 is a DataFrame indexed by Pclass

class_positions = [1, 2, 3]
survivor_counts = number2['Survived']

plt.figure(figsize=(10, 5))

# Left panel: survivors per class, annotated with the count.
plt.subplot(121)
plt.bar(class_positions, survivor_counts, width=0.5, alpha=0.5)
plt.xticks(class_positions, number2.index)
plt.title('各舱幸存总人数')
for xpos, count in zip(class_positions, survivor_counts):
    print(xpos, count)
    print('%i' % count)
    plt.text(xpos, count, '%i' % count, ha='center', va='bottom')

# Right panel: each class's share of all survivors.
plt.subplot(122)
plt.pie(survivor_counts, labels=['1', '2', '3'], autopct='%1.0f%%')
plt.title('各舱幸存人数占总生存人数比例')
plt.show()

再看一下各舱各自生存率状况

# Survival breakdown inside each cabin class: one row of plots per class,
# with a bar chart of died/survived counts on the left and the matching
# percentage pie on the right.
# A 3x2 grid is used because there are exactly three classes; the figure
# height is scaled so each panel keeps the same size as before.
plt.figure(figsize=(10, 15))
for row, pclass in enumerate([1, 2, 3]):
    class_outcomes = df.loc[df['Pclass'] == pclass, 'Survived']
    # Reindex so both outcomes (0 and 1) are always present, in that order,
    # even if a class happens to have no rows for one of them.
    outcome_counts = class_outcomes.value_counts().reindex([0, 1], fill_value=0)

    plt.subplot(3, 2, 2 * row + 1)
    plt.bar([0, 1], outcome_counts, width=0.5, alpha=0.5)
    plt.xticks([0, 1], outcome_counts.index)
    plt.title('%d号舱生存与死亡人数对比' % pclass)
    for i, j in zip([0, 1], outcome_counts):
        print(i, j)
        print('%i' % j)
        plt.text(i, j, '%i' % j, ha='center', va='bottom')

    plt.subplot(3, 2, 2 * row + 2)
    plt.pie(outcome_counts, labels=['No Survived', 'Survived'], autopct='%1.0f%%')
plt.show()

接下来探究性别对生还情况的影响

首先绘制船上男女比例图

# Influence of sex: start with how many men and women were on board.
# Summing a boolean mask counts the rows that match.
male = (df['Sex'] == 'male').sum()
female = (df['Sex'] == 'female').sum()
print('船上男性为%d人,女性为%d人' % (male, female))

bar_positions = [0, 1]
sex_totals = [male, female]

# Bar chart of the two totals, annotated with the count above each bar.
plt.figure(figsize=(10, 5))
plt.bar(bar_positions, sex_totals, width=0.5, alpha=0.5)
plt.xticks(bar_positions, ['男性', '女性'])
plt.title('男女人数比例')
for xpos, count in zip(bar_positions, sex_totals):
    print(xpos, count)
    print('%i' % count)
    plt.text(xpos, count, '%i' % count, ha='center', va='bottom')
plt.show()

可以看出船上男性 577 人，女性 314 人。

男女饼状图

# Share of men and women among all passengers; `male` and `female` are the
# head-counts computed earlier in the script.
sex_totals = [male, female]
sex_labels = ['male', 'female']
plt.pie(sex_totals, labels=sex_labels, autopct='%1.0f%%')
plt.show()

接下来统计幸存者中男女比例

# How the survivors split between men and women.
survived_df = df.loc[df['Survived'] == 1]
survivor_sex = survived_df['Sex']
Survived_male = (survivor_sex == 'male').sum()
Survived_female = (survivor_sex == 'female').sum()
print('幸存者中男性为%d人，女性为%d人' % (Survived_male, Survived_female))

bar_positions = [0, 1]
survivor_totals = [Survived_male, Survived_female]

plt.figure(figsize=(10, 5))

# Left panel: surviving men vs. surviving women, annotated with counts.
plt.subplot(121)
plt.bar(bar_positions, survivor_totals, width=0.5, alpha=0.5)
plt.xticks(bar_positions, ['幸存男性', '幸存女性'])
plt.title('幸存男性和幸存女性数量对比')
for xpos, count in zip(bar_positions, survivor_totals):
    print(xpos, count)
    print('%i' % count)
    plt.text(xpos, count, '%i' % count, ha='center', va='bottom')

# Right panel: the same split as percentages of all survivors.
plt.subplot(122)
plt.pie(survivor_totals, labels=['male', 'female'], autopct='%1.0f%%')
plt.title('幸存者中幸存男性和幸存女性比例')
plt.show()

再接下来，分析男性群体内部的生还率状况（幸存男性加死亡男性等于事故前船上男性总数）

# Survival among male passengers only.
male_df = df[df['Sex'] == 'male']
male_outcomes = male_df['Survived']
nu1 = (male_outcomes == 1).sum()  # men who survived
nu2 = (male_outcomes == 0).sum()  # men who died
print('幸存男性为%d人，死亡男性为%d人' % (nu1, nu2))

bar_positions = [0, 1]

plt.figure(figsize=(10, 5))

# Left panel: survived vs. died, annotated with counts.
plt.subplot(121)
plt.bar(bar_positions, [nu1, nu2], width=0.5, alpha=0.5)
plt.xticks(bar_positions, ['幸存男性', '死亡男性'])
plt.title('比例')
for xpos, count in zip(bar_positions, [nu1, nu2]):
    print(xpos, count)
    print('%i' % count)
    plt.text(xpos, count, '%i' % count, ha='center', va='bottom')

# Right panel: percentage pie. Note the order is died first, then survived.
plt.subplot(122)
plt.pie([nu2, nu1], labels=['no Survived', 'Survived'], autopct='%1.0f%%')
plt.show()

如法炮制，分析女性群体内部的生还率状况（幸存女性加死亡女性等于事故前船上女性总数）

# Survival among female passengers only.
female_df = df[df['Sex'] == 'female']
female_outcomes = female_df['Survived']
num1 = (female_outcomes == 1).sum()  # women who survived
num2 = (female_outcomes == 0).sum()  # women who died
print('幸存女性为%d人，死亡女性为%d人' % (num1, num2))

bar_positions = [0, 1]

plt.figure(figsize=(10, 5))

# Left panel: survived vs. died, annotated with counts.
plt.subplot(121)
plt.bar(bar_positions, [num1, num2], width=0.5, alpha=0.5)
plt.xticks(bar_positions, ['幸存女性', '死亡女性'])
plt.title('女性生存死亡比例')
for xpos, count in zip(bar_positions, [num1, num2]):
    print(xpos, count)
    print('%i' % count)
    plt.text(xpos, count, '%i' % count, ha='center', va='bottom')

# Right panel: percentage pie. Note the order is died first, then survived.
plt.subplot(122)
plt.pie([num2, num1], labels=['no Survived', 'Survived'], autopct='%1.0f%%')
plt.show()

年龄因素的影响

因为年龄一栏存在空值，所以先进行填充

#  年龄因素分析
# Fill the missing ages with random integers drawn from
# [mean - std, mean + std) of the ages that are present.
average_age = df['Age'].mean()  # mean of the known ages
std_age = df['Age'].std()  # standard deviation (not variance) of the known ages
count_nan_age = df['Age'].isnull().sum()  # number of missing ages

# randint expects integer bounds, so the float bounds are truncated explicitly.
rand1 = np.random.randint(
    int(average_age - std_age),
    int(average_age + std_age),
    size=count_nan_age,
)

# Assign through .loc in a single step. The chained form
# df['Age'][mask] = ... may write to a temporary copy, which raises
# SettingWithCopyWarning and is a no-op under pandas Copy-on-Write.
df.loc[df['Age'].isnull(), 'Age'] = rand1

然后粗略看一下年龄分布直方图和箱线图

# Quick look at the age distribution: histogram on the left, box plot on
# the right (outliers hidden via showfliers=False).
plt.figure(figsize=(12, 5))

hist_ax = plt.subplot(121)
df['Age'].hist(bins=100, ax=hist_ax)
hist_ax.set_xlabel('Age')
hist_ax.set_ylabel('Num')

box_ax = plt.subplot(122)
df.boxplot(column='Age', showfliers=False, ax=box_ax)
plt.show()

首先将年龄分为四类

统计四类人群下的生还人数以及生还率

# Split passengers into four age bands:
#   children: Age <= 12, juvenile: 12 < Age < 18,
#   adults: 18 <= Age < 65, agedness: Age >= 65.
children_df = df[df['Age'] <= 12]
juvenile_df = df[(df['Age'] > 12) & (df['Age'] < 18)]
adults_df = df[(df['Age'] >= 18) & (df['Age'] < 65)]
agedness_df = df[df['Age'] >= 65]

# Number of survivors in each band (sum of the 0/1 'Survived' column).
children_survived_sum = children_df['Survived'].sum()
juvenile_survived_sum = juvenile_df['Survived'].sum()
adults_survived_sum = adults_df['Survived'].sum()
agedness_survived_sum = agedness_df['Survived'].sum()
print('儿童生还人数为%d人，少年生还人数为%d人，成年人生还人数为%d人，老年人生还人数为%d人' % (
    children_survived_sum, juvenile_survived_sum, adults_survived_sum, agedness_survived_sum))

# Survival rate in each band (mean of the 0/1 'Survived' column).
children_survived_rate = children_df['Survived'].mean()
juvenile_survived_rate = juvenile_df['Survived'].mean()
adults_survived_rate = adults_df['Survived'].mean()
agedness_survived_rate = agedness_df['Survived'].mean()
print('儿童生还率为%f，少年生还率为%f，成年人生还率为%f，老年人生还率为%f' % (
    children_survived_rate, juvenile_survived_rate, adults_survived_rate, agedness_survived_rate))

# Plot data. The survivor counts come from the sums computed above rather
# than from hard-coded numbers: missing ages are filled with random values
# earlier in the script, so the counts can differ from run to run.
x = ['children', 'juvenile', 'adults', 'agedness']
b = [children_survived_sum, juvenile_survived_sum, adults_survived_sum, agedness_survived_sum]
y = [children_survived_rate, juvenile_survived_rate, adults_survived_rate, agedness_survived_rate]

plt.figure(figsize=(12, 5))

# Left panel: survivors per age band, annotated with the count.
plt.subplot(121)
plt.bar([0, 1, 2, 3], b, width=0.5, alpha=0.5)
plt.xticks([0, 1, 2, 3], x)
plt.title('各年龄段生还人数')
for i, j in zip([0, 1, 2, 3], b):
    plt.text(i, j, '%i' % j, ha='center', va='bottom')

# Right panel: survival rate per age band, labelled to two decimals so the
# annotations stay readable.
plt.subplot(122)
plt.bar([0, 1, 2, 3], y, width=0.5, alpha=0.5)
plt.xticks([0, 1, 2, 3], x)
plt.title('各年龄段生还率')
for i, j in zip([0, 1, 2, 3], y):
    plt.text(i, j, '%.2f' % j, ha='center', va='bottom')
plt.show()

最后看一下父母因素的影响（以 Parch 字段是否为 0 区分），分别统计有父母和无父母这两种情况下的生还状况

有父母情况下幸存和死亡状况

# Split passengers by the 'Parch' column: non-zero vs. zero.
# NOTE(review): the chart labels call these groups "with parents" / "without
# parents"; in the usual Titanic data dictionary Parch counts parents AND
# children aboard - confirm the intended meaning.
parch_df = df[df['Parch'] != 0]
no_parch_df = df[df['Parch'] == 0]

parch_outcomes = parch_df['Survived']
sur_parch_df_sum = parch_outcomes.sum()  # survivors in the Parch != 0 group
nosur_parch_df_sum = (parch_outcomes == 0).sum()  # victims in the Parch != 0 group

bar_positions = [0, 1]
parch_totals = [sur_parch_df_sum, nosur_parch_df_sum]

plt.figure(figsize=(10, 5))

# Left panel: survived vs. died, annotated with counts.
plt.subplot(121)
plt.bar(bar_positions, parch_totals, width=0.5, alpha=0.5)
plt.xticks(bar_positions, ['幸存', '死亡'])
plt.title('有父母人群生还和死亡情况')
for xpos, count in zip(bar_positions, parch_totals):
    plt.text(xpos, count, '%i' % count, ha='center', va='bottom')

# Right panel: percentage pie. Note the order is died first, then survived.
plt.subplot(122)
plt.pie(
    [nosur_parch_df_sum, (parch_outcomes == 1).sum()],
    labels=['No Survived', 'Survived'],
    autopct='%1.0f%%',
)
plt.show()

再看一下无父母的情况

# Survival in the Parch == 0 group (`no_parch_df` is built earlier in the
# script). The two counts are computed once and shared by both panels.
no_parch_outcomes = no_parch_df['Survived']
died_count = (no_parch_outcomes == 0).sum()
survived_count = (no_parch_outcomes == 1).sum()

bar_positions = [0, 1]
outcome_totals = [died_count, survived_count]  # died first, then survived

plt.figure(figsize=(10, 5))

# Left panel: died vs. survived, annotated with counts.
plt.subplot(121)
plt.bar(bar_positions, outcome_totals, width=0.5, alpha=0.5)
plt.xticks(bar_positions, ['死亡', '幸存'])
plt.title('无父母人群幸存和死亡人数')
for xpos, count in zip(bar_positions, outcome_totals):
    plt.text(xpos, count, '%i' % count, ha='center', va='bottom')

# Right panel: the same two counts as percentages.
plt.subplot(122)
plt.pie(outcome_totals, labels=['No Survived', 'Survived'], autopct='%1.0f%%')
plt.title('无父母人群幸存和死亡率')
plt.show()

结束（主要用于巩固基础学习）