import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib. pyplot as plt
% matplotlib inline
# Read the training and test CSVs from the current working directory.
train_df, test_df = (pd.read_csv(path) for path in ('./train.csv', './test.csv'))

# Keep both frames in one list so later cleaning steps can loop over them.
combine = [train_df, test_df]

print(train_df.columns)
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
# Preview the first five rows of the raw training frame.
train_df.head(5)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Column dtypes and non-null counts for both frames, separated by a rule.
train_df.info()
print(40 * '_')
test_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null object
Age 332 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 417 non-null float64
Cabin 91 non-null object
Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
# Summary statistics of the numeric columns (default quartiles spelled out).
train_df.describe(percentiles=[0.25, 0.5, 0.75])
PassengerId Survived Pclass Age SibSp Parch Fare count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000 mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208 std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429 min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000 25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400 50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200 75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000 max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
# Summary (count / unique / top / freq) of the object-dtype columns.
train_df.describe(include=['O'])
Name Sex Ticket Cabin Embarked count 891 891 891 204 889 unique 891 2 681 147 3 top Beckwith, Mrs. Richard Leonard (Sallie Monypeny) male 1601 G6 S freq 1 577 7 4 644
# Mean of Survived per Pclass, highest survival rate first.
(
    train_df[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
Pclass Survived 0 1 0.629630 1 2 0.472826 2 3 0.242363
# Mean of Survived per Sex, highest survival rate first.
(
    train_df[['Sex', 'Survived']]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
Sex Survived 0 female 0.742038 1 male 0.188908
# Mean of Survived per SibSp value, highest survival rate first.
(
    train_df[['SibSp', 'Survived']]
    .groupby(['SibSp'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
SibSp Survived 1 1 0.535885 2 2 0.464286 0 0 0.345395 3 3 0.250000 4 4 0.166667 5 5 0.000000 6 8 0.000000
# Mean of Survived per Parch value, highest survival rate first.
(
    train_df[['Parch', 'Survived']]
    .groupby(['Parch'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
Parch Survived 3 3 0.600000 1 1 0.550847 2 2 0.500000 0 0 0.343658 5 5 0.200000 4 4 0.000000 6 6 0.000000
# Age histograms (20 bins), one panel per value of Survived.
g = sns.FacetGrid(data=train_df, col='Survived')
g.map(plt.hist, 'Age', bins=20)
<seaborn.axisgrid.FacetGrid at 0x1a08bea550>
# Age histograms faceted by Pclass (rows) and Survived (columns).
# `height` replaces the old `size` keyword: seaborn renamed it in 0.9 and
# current releases reject `size=` on FacetGrid.
grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend()
<seaborn.axisgrid.FacetGrid at 0x102855978>
# Survival rate against Pclass, split by Sex, one row per port of embarkation.
# `height` replaces the old `size` keyword (renamed in seaborn 0.9).
grid = sns.FacetGrid(train_df, row='Embarked', height=2.2, aspect=1.6)

# Pin the category orders explicitly. FacetGrid calls pointplot once per facet
# on that facet's subset of rows, so without a fixed `order` / `hue_order`
# each panel can place or colour the categories differently (this is what the
# seaborn UserWarning is about). Sex still holds the raw 'female' / 'male'
# labels at this point in the notebook.
grid.map(
    sns.pointplot,
    'Pclass',
    'Survived',
    'Sex',
    palette='deep',
    order=[1, 2, 3],
    hue_order=['female', 'male'],
)
grid.add_legend()
/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:703: UserWarning: Using the pointplot function without specifying `order` is likely to produce an incorrect plot.
warnings.warn(warning)
/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:708: UserWarning: Using the pointplot function without specifying `hue_order` is likely to produce an incorrect plot.
warnings.warn(warning)
<seaborn.axisgrid.FacetGrid at 0x1a145b2ba8>
# Mean Fare per Sex, faceted by Embarked (rows) and Survived (columns).
# `height` replaces the old `size` keyword (renamed in seaborn 0.9).
grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', height=2.2, aspect=1.6)

# Fix the bar order so every facet puts the same Sex in the same position;
# otherwise each panel uses the order found in its own subset of rows.
grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None, order=['female', 'male'])
grid.add_legend()
/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:703: UserWarning: Using the barplot function without specifying `order` is likely to produce an incorrect plot.
warnings.warn(warning)
<seaborn.axisgrid.FacetGrid at 0x1a1457ecf8>
# Shapes before dropping columns, both directly and via the `combine` list.
print("Before", train_df.shape, test_df.shape, *(frame.shape for frame in combine))
Before (891, 12) (418, 11) (891, 12) (418, 11)
# Remove the Ticket, Cabin and Name columns from both frames.
train_df = train_df.drop(columns=['Ticket', 'Cabin', 'Name'])
test_df = test_df.drop(columns=['Ticket', 'Cabin', 'Name'])

# drop() returned new frames, so rebuild the list that points at them.
combine = [train_df, test_df]

print("After", train_df.shape, test_df.shape, *(frame.shape for frame in combine))
After (891, 9) (418, 8) (891, 9) (418, 8)
# Encode Sex as an integer in both frames: female -> 1, male -> 0.
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map({'female': 1, 'male': 0}).astype(int)

train_df.head()
PassengerId Survived Pclass Sex Age SibSp Parch Fare Embarked 0 1 0 3 0 22.0 1 0 7.2500 S 1 2 1 1 1 38.0 1 0 71.2833 C 2 3 1 3 1 26.0 0 0 7.9250 S 3 4 1 1 1 35.0 1 0 53.1000 S 4 5 0 3 0 35.0 0 0 8.0500 S
# 2 x 3 table of age guesses, indexed [Sex code, Pclass - 1]; starts at zero.
guess_ages = np.zeros(shape=(2, 3))
guess_ages
array([[0., 0., 0.],
[0., 0., 0.]])
# Fill missing ages with the median age of each (Sex, Pclass) group, computed
# separately for every frame in `combine`.
for dataset in combine:
    # Pass 1: compute the guess for each of the 2 x 3 groups.
    for sex in range(2):
        for pclass in range(1, 4):
            in_group = (dataset['Sex'] == sex) & (dataset['Pclass'] == pclass)
            median_age = dataset[in_group]['Age'].dropna().median()
            # Round the median to the nearest 0.5.
            guess_ages[sex, pclass - 1] = int(median_age / 0.5 + 0.5) * 0.5

    # Pass 2: write each group's guess into the rows whose Age is missing.
    for sex in range(2):
        for pclass in range(1, 4):
            missing_in_group = (
                dataset.Age.isnull()
                & (dataset.Sex == sex)
                & (dataset.Pclass == pclass)
            )
            dataset.loc[missing_in_group, ['Age']] = guess_ages[sex, pclass - 1]

train_df.head()
PassengerId Survived Pclass Sex Age SibSp Parch Fare Embarked 0 1 0 3 0 22.0 1 0 7.2500 S 1 2 1 1 1 38.0 1 0 71.2833 C 2 3 1 3 1 26.0 0 0 7.9250 S 3 4 1 1 1 35.0 1 0 53.1000 S 4 5 0 3 0 35.0 0 0 8.0500 S
# Cut Age into five equal-width bands and show the survival rate per band.
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
(
    train_df[['AgeBand', 'Survived']]
    .groupby(['AgeBand'], as_index=False)
    .mean()
    .sort_values(by='AgeBand', ascending=True)
)
AgeBand Survived 0 (0.34, 16.336] 0.550000 1 (16.336, 32.252] 0.336714 2 (32.252, 48.168] 0.412844 3 (48.168, 64.084] 0.434783 4 (64.084, 80.0] 0.090909
# Replace Age with an ordinal band code (0-4) in both frames, using cut
# points at 16, 32, 48 and 64.
#
# The rules are applied top to bottom on the same column. That is safe here
# because every code written (0-4) is <= 16, and no later rule matches
# values <= 16, so an already-coded row is never re-coded.
for dataset in combine:
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    # Fix: the original only selected these rows and never assigned, which
    # left passengers older than 64 with their raw age instead of band 4.
    dataset.loc[dataset['Age'] > 64, 'Age'] = 4

train_df.head()
PassengerId Survived Pclass Sex Age SibSp Parch Fare Embarked AgeBand 0 1 0 3 0 1.0 1 0 7.2500 S (16.336, 32.252] 1 2 1 1 1 2.0 1 0 71.2833 C (32.252, 48.168] 2 3 1 3 1 1.0 0 0 7.9250 S (16.336, 32.252] 3 4 1 1 1 2.0 1 0 53.1000 S (32.252, 48.168] 4 5 0 3 0 2.0 0 0 8.0500 S (32.252, 48.168]
# AgeBand was only needed to choose the cut points; remove it again.
train_df = train_df.drop(columns=['AgeBand'])

# drop() returned a new frame, so rebuild the list that points at it.
combine = [train_df, test_df]

train_df.head()
PassengerId Survived Pclass Sex Age SibSp Parch Fare Embarked 0 1 0 3 0 1.0 1 0 7.2500 S 1 2 1 1 1 2.0 1 0 71.2833 C 2 3 1 3 1 1.0 0 0 7.9250 S 3 4 1 1 1 2.0 1 0 53.1000 S 4 5 0 3 0 2.0 0 0 8.0500 S
# Most frequent embarkation port in the training data.
freq_port = train_df['Embarked'].dropna().mode()[0]

# Use that port for every missing Embarked value in both frames.
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

# Survival rate per port, highest first.
(
    train_df[['Embarked', 'Survived']]
    .groupby(['Embarked'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
Embarked Survived 0 C 0.553571 1 Q 0.389610 2 S 0.339009
# Encode Embarked as an integer in both frames: S -> 0, C -> 1, Q -> 2.
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

train_df.head()
PassengerId Survived Pclass Sex Age SibSp Parch Fare Embarked 0 1 0 3 0 1.0 1 0 7.2500 0 1 2 1 1 1 2.0 1 0 71.2833 1 2 3 1 3 1 1.0 0 0 7.9250 0 3 4 1 1 1 2.0 1 0 53.1000 0 4 5 0 3 0 2.0 0 0 8.0500 0
# Fill the missing test-set Fare with the test-set median.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form warns on pandas 2.x and leaves
# test_df unchanged once Copy-on-Write is enabled. median() skips NaN by
# default, so no explicit dropna() is needed.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

# Split the training fares into quartile bands and show survival per band.
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
(
    train_df[['FareBand', 'Survived']]
    .groupby(['FareBand'], as_index=False)
    .mean()
    .sort_values(by='FareBand', ascending=True)
)
FareBand Survived 0 (-0.001, 7.91] 0.197309 1 (7.91, 14.454] 0.303571 2 (14.454, 31.0] 0.454955 3 (31.0, 512.329] 0.581081
# Replace Fare with an ordinal band code (0-3) in both frames, using cut
# points at 7.91, 14.454 and 31, then store the codes as integers.
for dataset in combine:
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

# FareBand was only needed to choose the cut points; remove it again.
train_df = train_df.drop(columns=['FareBand'])

# drop() returned a new frame, so rebuild the list that points at it.
combine = [train_df, test_df]

train_df.head(10)
PassengerId Survived Pclass Sex Age SibSp Parch Fare Embarked 0 1 0 3 0 1.0 1 0 0 0 1 2 1 1 1 2.0 1 0 3 1 2 3 1 3 1 1.0 0 0 1 0 3 4 1 1 1 2.0 1 0 3 0 4 5 0 3 0 2.0 0 0 1 0 5 6 0 3 0 1.0 0 0 1 2 6 7 0 1 0 3.0 0 0 3 0 7 8 0 3 0 0.0 3 1 2 0 8 9 1 3 1 1.0 0 2 1 0 9 10 1 2 1 0.0 1 0 2 1