import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib. pyplot as plt
% matplotlib inline
# Read the training and test CSV files from the current directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)
# Hold both frames in one list (presumably so later steps can loop over
# them -- confirm against the rest of the notebook).
combine = [train_df, test_df]
# Show the column labels of the training frame.
print(train_df.columns)
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
# Preview the first five rows of the training frame (5 is head()'s default).
train_df.head(n=5)
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
0
1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.2500
NaN
S
1
2
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th...
female
38.0
1
0
PC 17599
71.2833
C85
C
2
3
1
3
Heikkinen, Miss. Laina
female
26.0
0
0
STON/O2. 3101282
7.9250
NaN
S
3
4
1
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.0
1
0
113803
53.1000
C123
S
4
5
0
3
Allen, Mr. William Henry
male
35.0
0
0
373450
8.0500
NaN
S
# Print the info() summary of the training frame, then the test frame,
# with a 40-underscore rule between the two reports.
for position, frame in enumerate((train_df, test_df)):
    if position:
        print('_' * 40)
    frame.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null object
Age 332 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 417 non-null float64
Cabin 91 non-null object
Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
# Summary statistics with describe()'s default column selection
# (include/exclude left at None, spelled out explicitly).
train_df.describe(include=None, exclude=None)
PassengerId
Survived
Pclass
Age
SibSp
Parch
Fare
count
891.000000
891.000000
891.000000
714.000000
891.000000
891.000000
891.000000
mean
446.000000
0.383838
2.308642
29.699118
0.523008
0.381594
32.204208
std
257.353842
0.486592
0.836071
14.526497
1.102743
0.806057
49.693429
min
1.000000
0.000000
1.000000
0.420000
0.000000
0.000000
0.000000
25%
223.500000
0.000000
2.000000
20.125000
0.000000
0.000000
7.910400
50%
446.000000
0.000000
3.000000
28.000000
0.000000
0.000000
14.454200
75%
668.500000
1.000000
3.000000
38.000000
1.000000
0.000000
31.000000
max
891.000000
1.000000
3.000000
80.000000
8.000000
6.000000
512.329200
# Summary (count / unique / top / freq) restricted to object-dtype columns.
train_df.describe(include=['O'])
Name
Sex
Ticket
Cabin
Embarked
count
891
891
891
204
889
unique
891
2
681
147
3
top
Beane, Mrs. Edward (Ethel Clarke)
male
CA. 2343
B96 B98
S
freq
1
577
7
4
644
# Mean of Survived for each Pclass value, highest mean first.
(
    train_df[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
Pclass
Survived
0
1
0.629630
1
2
0.472826
2
3
0.242363
# Mean of Survived for each Sex value, with Sex as the index of the result.
# The column is selected with a list ([['Survived']]): indexing a GroupBy with
# a bare tuple of labels ('Sex', 'Survived') was deprecated and raises in
# pandas >= 2.0. The grouping key 'Sex' does not need to be selected, because
# it already forms the index of the aggregated frame.
train_df.groupby(['Sex'])[['Survived']].mean()
Survived
Sex
female
0.742038
male
0.188908
# Mean of Survived for each Sex value, highest mean first.
(
    train_df[['Sex', 'Survived']]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
Sex
Survived
0
female
0.742038
1
male
0.188908
# Mean of Survived for each SibSp value, highest mean first.
(
    train_df[['SibSp', 'Survived']]
    .groupby(['SibSp'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
SibSp
Survived
1
1
0.535885
2
2
0.464286
0
0
0.345395
3
3
0.250000
4
4
0.166667
5
5
0.000000
6
8
0.000000
# Mean of Survived for each Parch value, highest mean first.
(
    train_df[['Parch', 'Survived']]
    .groupby(['Parch'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
Parch
Survived
3
3
0.600000
1
1
0.550847
2
2
0.500000
0
0
0.343658
5
5
0.200000
4
4
0.000000
6
6
0.000000
# One facet per distinct Survived value, laid out as columns.
g = sns.FacetGrid(data=train_df, col='Survived')
# Draw a 20-bin histogram of Age in every facet; the call is left as the
# final expression so the notebook echoes the returned object.
g.map(plt.hist, 'Age', bins=20)
<seaborn.axisgrid.FacetGrid at 0x1a12933d30>
grid = sns. FacetGrid( train_df, col= 'Survived' , row= 'Pclass' , size