生存情况信息统计分析:
案例题目:查看数据信息,求平均年龄。求男女生存率,未成年人获救的概率。
数据来源:https://www.kaggle.com/c/titanic/data
# coding=utf-8
import numpy as np
import pandas as pd
# Hand pandas an already-open file object instead of a path: with Chinese
# (non-ASCII) paths this avoids the "Initializing from file failed" error.
# The `with` block closes the handle once the whole CSV has been read.
with open("./data/train.csv") as f:
    titanic = pd.read_csv(f)
titanic.head(10)
 | survived | sex | age | n_siblings_spouses | parch | fare | class | deck | embark_town | alone |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | male | 22.0 | 1 | 0 | 7.2500 | Third | unknown | Southampton | n |
1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | First | C | Cherbourg | n |
2 | 1 | female | 26.0 | 0 | 0 | 7.9250 | Third | unknown | Southampton | y |
3 | 1 | female | 35.0 | 1 | 0 | 53.1000 | First | C | Southampton | n |
4 | 0 | male | 28.0 | 0 | 0 | 8.4583 | Third | unknown | Queenstown | y |
5 | 0 | male | 2.0 | 3 | 1 | 21.0750 | Third | unknown | Southampton | n |
6 | 1 | female | 27.0 | 0 | 2 | 11.1333 | Third | unknown | Southampton | n |
7 | 1 | female | 14.0 | 1 | 0 | 30.0708 | Second | unknown | Cherbourg | n |
8 | 1 | female | 4.0 | 1 | 1 | 16.7000 | Third | G | Southampton | n |
9 | 0 | male | 20.0 | 0 | 0 | 8.0500 | Third | unknown | Southampton | y |
# Average passenger age, computed two ways.
age = titanic["age"]
age_is_null = age.isnull()  # boolean mask: True where the age is missing
# Missing ages have to be filtered out first, otherwise the hand-rolled
# average below cannot be computed.
good_age = age[~age_is_null]
print(sum(good_age) / len(good_age))  # mean age computed by hand
print(age.mean())  # calling .mean() yields the mean age directly
29.631307814992027
29.631307814992027
# Split the target column off the table; pop() removes it from `titanic` in place.
titanicsur = titanic.pop("survived")
print(titanic.head())
print(titanicsur.head())
# Put the two pieces side by side again and average `survived` per sex.
survival_by_sex = (
    pd.concat([titanic, titanicsur], axis=1)
    .groupby("sex")["survived"]
    .mean()
)
print(survival_by_sex)
sex age n_siblings_spouses parch fare class deck \
0 male 22.0 1 0 7.2500 Third unknown
1 female 38.0 1 0 71.2833 First C
2 female 26.0 0 0 7.9250 Third unknown
3 female 35.0 1 0 53.1000 First C
4 male 28.0 0 0 8.4583 Third unknown
embark_town alone
0 Southampton n
1 Cherbourg n
2 Southampton y
3 Southampton n
4 Queenstown y
0 0
1 1
2 1
3 1
4 0
Name: survived, dtype: int64
sex
female 0.778802
male 0.180488
Name: survived, dtype: float64
def generate_age_label(row):
    """Classify one passenger row into an age group.

    Args:
        row: Mapping-like object (for example a DataFrame row) that can be
            indexed with the key "age".

    Returns:
        "unknown" when the age is missing, "minor" when it is below 18,
        and "adult" otherwise.
    """
    age = row["age"]
    # Check for a missing value first: comparing NaN with 18 is always False,
    # which would otherwise label missing ages as "adult".
    if pd.isnull(age):
        return "unknown"
    if age < 18:
        return "minor"
    return "adult"
# Label every passenger row with its age group and store it as a new column.
age_labels = titanic.apply(generate_age_label, axis="columns")
titanic["age_labels"] = age_labels
# Survival rate within each age group.
pd.concat([titanic, titanicsur], axis="columns").groupby("age_labels")["survived"].mean()
age_labels
adult 0.371841
minor 0.506849
Name: survived, dtype: float64