# Check the Python version: this project targets Python 2.7 only.
from sys import version_info

# The interpreter is acceptable only when BOTH components match, so the
# rejection test must fire when EITHER one differs. (Joining the two
# inequalities with `and` would let versions such as 3.7 or 2.6 through.)
if version_info.major != 2 or version_info.minor != 7:
    raise Exception('请使用Python 2.7来完成此项目')
In [25]:
import numpy as np
import pandas as pdfull
# 数据可视化代码
from titanic_visualizations import survival_stats
from IPython.display import display
%matplotlib inline
in_file = 'titanic_data.csv'
full_data = pd.read_csv(in_file)
display(full_data.head())
Out[25]:
In [26]:
# Keep the 'Survived' column as the prediction target, and build a feature
# table that no longer contains it.
outcomes = full_data['Survived']
data = full_data.drop(['Survived'], axis=1)

# Preview the feature table first, then the target column.
for preview in (data.head(), outcomes.head()):
    display(preview)
Out[26]:
Out[26]:
In [27]:
def accuracy_score(truth, pred):
    """Report how closely `pred` matches `truth` as a readable message.

    Returns a sentence giving the percentage of element-wise matches, or a
    mismatch notice when the two inputs have different lengths.
    """
    # Guard: the inputs are only comparable when they have equal length.
    if len(truth) != len(pred):
        return "Number of predictions does not match number of outcomes!"

    # Mean of the element-wise equality mask, scaled to a percentage.
    percent_correct = (truth == pred).mean() * 100
    return "Predictions have an accuracy of {:.2f}%.".format(percent_correct)
# Try out `accuracy_score`: all five predictions are 1 (i.e. survived),
# compared against the first five recorded outcomes.
predictions = pd.Series(np.ones(5, dtype=int))
print(accuracy_score(outcomes[:5], predictions))
In [28]:
def predictions_0(data):
    """Baseline model: ignore every feature and predict that nobody survived."""
    # Emit one 0 ("did not survive") per row produced by iterrows().
    return pd.Series([0 for _ in data.iterrows()])
# Run the baseline model (predictions_0) over the feature table.
predictions = predictions_0(data)
In [29]:
# Report how the current predictions compare with the recorded outcomes.
print(accuracy_score(outcomes, predictions))
In [30]:
# Inspect survival against the 'Sex' feature.
# NOTE(review): survival_stats is imported from titanic_visualizations;
# presumably it draws a chart — confirm against that module.
survival_stats(data, outcomes, 'Sex')
In [50]:
def predictions_2(data):
    """Two-feature model.

    A passenger is predicted to survive (1) when female, or when male and
    younger than 10; everyone else is predicted not to survive (0).
    """

    def _survived(passenger):
        # 'Age' is only consulted for males, as in the rule description.
        is_female = passenger['Sex'] == 'female'
        return is_female or (passenger['Sex'] == 'male' and passenger['Age'] < 10)

    # One 0/1 label per row, in row order.
    return pd.Series([1 if _survived(row) else 0 for _, row in data.iterrows()])
# Run the two-feature model (predictions_2) over the feature table.
predictions = predictions_2(data)
In [51]:
# Report how the current predictions compare with the recorded outcomes.
print(accuracy_score(outcomes, predictions))
In [42]:
# Inspect survival against 'Age'.
# NOTE(review): the fourth argument looks like a list of filter conditions
# (male passengers, Age < 18) — confirm against survival_stats.
survival_stats(data, outcomes, 'Age', ["Sex == 'male'", "Age < 18"])
In [127]:
# Explore two more views before writing predictions_3: 'Pclass' and 'Age'.
# NOTE(review): the list arguments look like filter conditions (females with
# Age < 80; males with Pclass == 1) — confirm against survival_stats.
survival_stats(data, outcomes, 'Pclass',["Sex == 'female'", "Age < 80"])
survival_stats(data, outcomes, 'Age',["Sex == 'male'","Pclass == 1"])
In [133]:
def predictions_3(data):
    """Multi-feature model, intended to reach at least 80% accuracy.

    Rules, per passenger:
      - female: survives (1) unless 40 < Age < 60 and Pclass == 3
      - otherwise: survives (1) only when male and Age < 10
    """

    def _classify(passenger):
        if passenger['Sex'] == 'female':
            age = passenger['Age']
            # The only females predicted to die: middle-aged, third class.
            if age > 40 and age < 60 and passenger['Pclass'] == 3:
                return 0
            return 1
        # Non-female passengers: only young boys are predicted to survive.
        if passenger['Sex'] == 'male' and passenger['Age'] < 10:
            return 1
        return 0

    # One 0/1 label per row, in row order.
    return pd.Series([_classify(row) for _, row in data.iterrows()])
# Run the multi-feature model (predictions_3) over the feature table.
predictions = predictions_3(data)
In [134]:
# Report how the current predictions compare with the recorded outcomes.
print(accuracy_score(outcomes, predictions))