In [21]:
import numpy as np
import pandas as pd
# RMS Titanic data visualization code
# 数据可视化代码
from titanic_visualizations import survival_stats
from IPython.display import display
%matplotlib inline
# Load the dataset
# 加载数据集
in_file = 'titanic_data.csv'
full_data = pd.read_csv(in_file)
# Print the first few entries of the RMS Titanic data
# 显示数据列表中的前几项乘客数据
display(full_data.head())
In [22]:
# Store the 'Survived' feature in a new variable and remove it from the dataset
# 从数据集中移除 'Survived' 这个特征,并将它存储在一个新的变量中。
outcomes = full_data['Survived']
data = full_data.drop('Survived', axis = 1)
# Show the new dataset with 'Survived' removed
# 显示已移除 'Survived' 特征的数据集
display(data.head())
In [23]:
def accuracy_score(truth, pred):
""" Returns accuracy score for input truth and predictions. """
# Ensure that the number of predictions matches number of outcomes
# 确保预测的数量与结果的数量一致
if len(truth) == len(pred):
# Calculate and return the accuracy as a percent
# 计算预测准确率(百分比)
return "Predictions have an accuracy of {:.2f}%.".format((truth == pred).mean()*100)
else:
return "Number of predictions does not match number of outcomes!"
# Test the 'accuracy_score' function
# 测试 'accuracy_score' 函数
predictions = pd.Series(np.ones(5, dtype = int))
print accuracy_score(outcomes[:5], predictions)
In [24]:
def predictions_0(data):
""" Model with no features. Always predicts a passenger did not survive. """
predictions = []
for _, passenger in data.iterrows():
# Predict the survival of 'passenger'
# 预测 'passenger' 的生还率
predictions.append(0)
# Return our predictions
# 返回预测结果
return pd.Series(predictions)
# Make the predictions
# 进行预测
predictions = predictions_0(data)
In [25]:
print accuracy_score(outcomes, predictions)
In [26]:
survival_stats(data, outcomes, 'Sex')
In [27]:
def predictions_1(data):
""" Model with one feature:
- Predict a passenger survived if they are female. """
predictions = []
for _, passenger in data.iterrows():
# Remove the 'pass' statement below
# 移除下方的 'pass' 声明
# and write your prediction conditions here
# 输入你自己的预测条件
if passenger['Sex'] == 'female':
predictions.append(1)
else:
predictions.append(0)
# Return our predictions
# 返回预测结果
return pd.Series(predictions)
# Make the predictions
# 进行预测
predictions = predictions_1(data)
In [28]:
print accuracy_score(outcomes, predictions)
In [29]:
survival_stats(data, outcomes, 'Age', ["Sex == 'male'"])
In [30]:
def predictions_2(data):
""" Model with two features:
- Predict a passenger survived if they are female.
- Predict a passenger survived if they are male and younger than 10. """
predictions = []
for _, passenger in data.iterrows():
# Remove the 'pass' statement below
# 移除下方的 'pass' 声明
# and write your prediction conditions here
# 输入你自己的预测条件
if passenger['Sex'] == 'male' and passenger['Age'] < 10:
predictions.append(1)
elif passenger['Sex'] == 'female':
predictions.append(1)
else:
predictions.append(0)
# Return our predictions
# 返回预测结果
return pd.Series(predictions)
# Make the predictions
# 进行预测
predictions = predictions_2(data)
In [31]:
print accuracy_score(outcomes, predictions)
In [32]:
survival_stats(data, outcomes, 'Age', ["Sex == 'male'", "Age < 18"])
In [36]:
survival_stats(data, outcomes, 'Age',["Pclass == 3", "Sex == 'female'"])
In [38]:
survival_stats(data, outcomes, 'Age',["Pclass == 1", "Sex == 'male'"])
In [34]:
def predictions_3(data):
""" Model with multiple features. Makes a prediction with an accuracy of at least 80%. """
predictions = []
for _, passenger in data.iterrows():
# Remove the 'pass' statement below
# and write your prediction conditions here
if passenger['Sex'] == 'female':
if passenger['Age'] > 40 and passenger['Age'] < 60 and passenger['Pclass'] == 3:
predictions.append(0)
else:
predictions.append(1)
else:
if passenger['Age'] <= 10:
predictions.append(1)
elif passenger['Age'] < 40 and passenger['Pclass'] == 1:
predictions.append(1)
else:
predictions.append(0)
# Return our predictions
return pd.Series(predictions)
# Make the predictions
predictions = predictions_3(data)
In [35]:
print accuracy_score(outcomes, predictions)