%matplotlib inline
import pandas as pandas
import matplotlib.pyplot as plt
# Load the Titanic passenger data from a CSV file. Per the original note,
# Age and some other columns contain missing values.
csv = pandas.read_csv('titanic-data.csv')
# Print a summary of the loaded DataFrame.
csv.info()
In [62]:
def influence(survived, element):
    """Print and plot the survival rate for each distinct value of ``element``.

    Reads the module-level ``csv`` DataFrame, keeps only the ``survived`` and
    ``element`` columns, drops rows with a missing value in either of them,
    and groups the remainder by ``element``.  The per-group sum of
    ``survived``, the per-group row count and their ratio are printed, and
    the ratio is drawn as a bar chart.

    Args:
        survived: Name of the outcome column.  It is summed per group to get
            the survivor count, so it is presumably a 0/1 indicator --
            confirm against the data.
        element: Name of the column whose values define the groups.
    """
    survived_element = csv[[survived, element]].dropna()
    groupby_element = survived_element.groupby(element)
    total_groupby_element = groupby_element.count()
    survived_groupby_element = groupby_element.sum()

    # Heading text: "number of survivors".
    print("生还人数")
    print(survived_groupby_element)

    # Heading text: "total number of people".
    print("总人数")
    print(total_groupby_element)

    # Heading text: "survival rate".
    print("生还率")
    survived_rate_element = survived_groupby_element / total_groupby_element
    print(survived_rate_element)

    # Bar chart of the per-group survival rate.
    survived_rate_element.plot(kind='bar')
    # Separate the column name from the caption with a space, as
    # double_influence does; plain concatenation gave "PclassSurvival rate".
    plt.title(element + ' Survival rate')
    plt.xlabel(element)
    plt.ylabel(survived)
    plt.show()
In [63]:
def section_influence(survived, element, lim):
    """Print and plot the survival rate per fixed-width bin of ``element``.

    Reads the module-level ``csv`` DataFrame, keeps only the ``survived`` and
    ``element`` columns, drops rows with a missing value in either of them,
    and assigns every remaining row to the half-open bin
    ``[k * lim, (k + 1) * lim)`` that contains its ``element`` value.  The
    per-bin sum of ``survived``, the per-bin row count and their ratio are
    printed; the ratio is then drawn as a bar chart and as a pie chart.

    Args:
        survived: Name of the outcome column.  It is summed per bin to get
            the survivor count, so it is presumably a 0/1 indicator --
            confirm against the data.
        element: Name of the numeric column to bin.
        lim: Bin width; must be positive.

    Raises:
        ValueError: If ``lim`` is not positive.
    """
    if lim <= 0:
        raise ValueError("lim must be a positive bin width, got %r" % (lim,))

    survived_section = csv[[survived, element]].dropna()

    # Lower edge of the bin each row falls into.  It is computed directly on
    # the filtered frame so it keeps that frame's index; rebuilding the bins
    # from a plain list would renumber the rows and pair outcome values with
    # the wrong bins whenever dropna() removed something.
    bin_start = (survived_section[element] // lim) * lim

    # Grouping on the numeric lower edge (rather than on a text label) keeps
    # the bins in numeric order, and a genuine value of 0 lands in [0, lim).
    groupby_element = survived_section[[survived]].groupby(bin_start)
    total_groupby_element = groupby_element.count()
    survived_groupby_element = groupby_element.sum()
    survived_rate_element = survived_groupby_element / total_groupby_element

    def with_bin_labels(frame):
        # Swap each numeric lower edge for its "[low, high)" display label;
        # row order is left as it is.
        return frame.rename(
            index=lambda start: '[%d, %d)' % (start, start + lim)
        )

    total_groupby_element = with_bin_labels(total_groupby_element)
    survived_groupby_element = with_bin_labels(survived_groupby_element)
    survived_rate_element = with_bin_labels(survived_rate_element)

    # Survivors per bin.
    print(survived_groupby_element)
    # Total rows per bin.
    print(total_groupby_element)
    # Survival rate per bin.
    print(survived_rate_element)

    # Bar chart of the per-bin survival rate.
    survived_rate_element.plot(kind='bar')
    # Separate the column name from the caption with a space, as
    # double_influence does.
    plt.title(element + ' Survival rate')
    plt.xlabel(element)
    plt.ylabel(survived)
    # The same rates again as a pie chart; subplots=True is needed because
    # the rates are held in a one-column DataFrame.
    survived_rate_element.plot(kind='pie', subplots=True)
    plt.show()
In [64]:
def double_influence(element1, element2, survived):
    """Print and plot the survival rate for each (element1, element2) pair.

    Groups the module-level ``csv`` DataFrame by the two given columns and,
    for the ``survived`` column, prints the per-group sum, the per-group
    count and their ratio.  The ratio is then unstacked on ``element2`` and
    drawn as a grouped bar chart.

    Args:
        element1: Name of the first grouping column (x axis of the chart).
        element2: Name of the second grouping column (one bar per value).
        survived: Name of the outcome column.  It is summed per group to get
            the survivor count, so it is presumably a 0/1 indicator --
            confirm against the data.
    """
    outcome_by_pair = csv.groupby([element1, element2])[survived]
    survivors = outcome_by_pair.sum()
    headcount = outcome_by_pair.count()

    # Headings read "number of survivors" and "total number of people".
    for heading, table in (("生还人数", survivors), ("总人数", headcount)):
        print(heading)
        print(table)

    # Heading reads "survival rate".
    print("生还率")
    rate = survivors / headcount
    print(rate)

    # One cluster of bars per element1 value, one bar per element2 value.
    rate.unstack().plot(kind='bar')
    pair_label = "+".join((element1, element2))
    plt.title(pair_label + " " + 'Survival rate')
    plt.xlabel(pair_label)
    plt.ylabel(survived)
    plt.show()
In [65]:
# Survival statistics grouped by the 'Pclass' column.
influence('Survived', 'Pclass')
In [66]:
# Survival statistics grouped by the 'Sex' column.
influence('Survived', 'Sex')
In [67]:
# Survival statistics for the 'Age' column, binned with a width of 20.
section_influence('Survived', 'Age', 20)
In [68]:
# Survival statistics grouped by the 'SibSp' column.
influence('Survived', 'SibSp')
In [69]:
# Survival statistics grouped by the 'Parch' column.
influence('Survived', 'Parch')
In [70]:
# Survival statistics for the 'Fare' column, binned with a width of 20.
section_influence('Survived', 'Fare', 20)
In [71]:
# Survival statistics grouped by the 'Embarked' column.
influence('Survived', 'Embarked')
In [72]:
# Survival statistics grouped jointly by the 'Pclass' and 'Sex' columns.
double_influence('Pclass', 'Sex', 'Survived')
建议使用更具有实际意义的变量名,例如 titanic_data、titanic_df 等(其中 df 代表 DataFrame 数据类型)。