前几天帮同学进行了有关泰坦尼克号乘客数据的分析,重新回忆了自己对python数据分析方面的知识。果然,学了不用,就忘记了,真是太难了(留下没知识的眼泪)。
题目如下:
能力有限,只对里面简单的部分进行了分析
话不多说,代码如下:
#导入处理数据包
import pandas as pd
import matplotlib.pyplot as plt
#导入数据
train = pd.read_csv('train.csv')
#打印列信息
#print(train.info())
#进行年龄清洗
train['Age']=train['Age'].fillna(train['Age'].mean())
#各乘客等级的乘客的平均年龄
Pclass_1=train[train['Pclass']==1].mean()['Age']
Pclass_2=train[train['Pclass']==2].mean()['Age']
Pclass_3=train[train['Pclass']==3].mean()['Age']
rects=plt.bar(['1','2','3'],[Pclass_1,Pclass_2,Pclass_3])
plt.xlabel("Pclass")
plt.ylabel("averageAge")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height, str(round(height,2)), ha='center', va='bottom')
plt.show()
#各乘客等级的乘客的生存率
Total=len(train['Pclass'])
Survived_1=train[train['Pclass']==1]['Survived'].sum()
Survived_2=train[train['Pclass']==2]['Survived'].sum()
Survived_3=train[train['Pclass']==3]['Survived'].sum()
rects=plt.bar(['1','2','3'],[Survived_1/Total,Survived_2/Total,Survived_3/Total])
plt.xlabel("Pclass")
plt.ylabel("survivalRate")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height, str(round(height*100,2))+'%', ha='center', va='bottom')
plt.show()
#各性别的乘客的生存率
Total=len(train['Sex'])
Survived_male=train[train['Sex']=='male']['Survived'].sum()
Survived_female=train[train['Sex']=='female']['Survived'].sum()
rects=plt.bar(['male','female'],[Survived_male/Total,Survived_female/Total])
plt.xlabel("Sex")
plt.ylabel("survivalRate")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height, str(round(height*100,2))+'%', ha='center', va='bottom')
plt.show()
#乘客等级和性别交叉维度的生存率
Pclass_1=train[train['Pclass']==1]
#等级1男性存活人数
Pclass_1_male=Pclass_1[Pclass_1['Sex']=='male']['Survived'].sum()
#等级1女性存活人数
Pclass_1_female=Pclass_1[Pclass_1['Sex']=='female']['Survived'].sum()
Pclass_2=train[train['Pclass']==2]
#等级2男性存活人数
Pclass_2_male=Pclass_2[Pclass_2['Sex']=='male']['Survived'].sum()
#等级2女性存活人数
Pclass_2_female=Pclass_2[Pclass_2['Sex']=='female']['Survived'].sum()
Pclass_3=train[train['Pclass']==3]
#等级1男性存活人数
Pclass_3_male=Pclass_3[Pclass_3['Sex']=='male']['Survived'].sum()
#等级1女性存活人数
Pclass_3_female=Pclass_3[Pclass_3['Sex']=='female']['Survived'].sum()
rects=plt.bar(['1_male','1_female','2_male','2_female','3_male','3_female'],
[Pclass_1_male/Total,Pclass_1_female/Total,
Pclass_2_male / Total, Pclass_2_female / Total,
Pclass_3_male / Total, Pclass_3_female / Total])
plt.xlabel("PclassAndSex")
plt.ylabel("survivalRate")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height, str(round(height*100,2))+'%', ha='center', va='bottom')
plt.show()
结果如下: