再刷泰坦尼克号
数据探索和可视化:形成假设的第一步
导入需要的包
#忽略警告提示
import warnings
warnings.filterwarnings('ignore')
#数据处理
import pandas as pd
import numpy as np
import random
#可视化
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
数据读入
path='C:/Users/Titanic/'
p1=open(path+'train.csv')
p2=open(path+'test.csv')
train=pd.read_csv(p1)
test=pd.read_csv(p2)
数据概览
train.head(3)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th… | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
train.info()
train.describe()
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
train.drop('PassengerId',axis=1).corr()
Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|
Survived | 1.000000 | -0.338481 | -0.077221 | -0.035322 | 0.081629 | 0.257307 |
Pclass | -0.338481 | 1.000000 | -0.369226 | 0.083081 | 0.018443 | -0.549500 |
Age | -0.077221 | -0.369226 | 1.000000 | -0.308247 | -0.189119 | 0.096067 |
SibSp | -0.035322 | 0.083081 | -0.308247 | 1.000000 | 0.414838 | 0.159651 |
Parch | 0.081629 | 0.018443 | -0.189119 | 0.414838 | 1.000000 | 0.216225 |
Fare | 0.257307 | -0.549500 | 0.096067 | 0.159651 | 0.216225 | 1.000000 |
#图来观察更直观
sns.set(context='paper',font='monospace')
sns.set(style='white')
f,ax=plt.subplots(figsize=(10,6))
train_corr=train.drop('PassengerId',axis=1).corr()
sns.heatmap(train_corr,ax=ax,vmax=.9,square=True,cmap=plt.cm.get_cmap('RdYlBu'))
ax.set_xticklabels(train_corr.index,size=15)
ax.set_yticklabels(train_corr.columns,size=15)
ax.set_title('train feature corr',fontsize=20)
- Pclass与获救情况负相关
- Fare和获救情况正相关
- Pclass和Fare负相关
Age
#深入观察一下年龄
fig,axes=plt.subplots(2,1,figsize=(8,6))
sns.set_style('white')
sns.distplot(train.Age.fillna(-20),rug=True,color='b',ax=axes[0])
ax0=axes[0]
ax0.set_title('Age distribution')
ax0.set_xlabel('')
ax1=axes[1]
ax1.set_title('Age survived distribution&#