import urllib. request
import os
# Download the Titanic spreadsheet once; later runs reuse the local copy.
url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls'
filepath = './data/titanic3.xls'
if not os.path.isfile(filepath):
    # urlretrieve opens the target path for writing and does not create
    # missing parent folders, so make sure ./data exists first.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    result = urllib.request.urlretrieve(url, filepath)
    print('downloaded:', result)
downloaded: ('./data/titanic3.xls', <http.client.HTTPMessage object at 0x00000214763EAEB8>)
import numpy as np
import pandas as pd
# Columns kept from the spreadsheet; everything else is discarded.
cols = [
    'survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
    'embarked',
]
# Read the workbook and narrow it to the selected columns in one step.
all_df = pd.read_excel('./data/titanic3.xls')[cols]
# Show the first two rows as a sanity check.
all_df.head(2)
survived name pclass sex age sibsp parch fare embarked 0 1 Allen, Miss. Elisabeth Walton 1 female 29.0000 0 0 211.3375 S 1 1 Allison, Master. Hudson Trevor 1 male 0.9167 1 2 151.5500 S
# Working copy without the free-text 'name' column.
df = all_df.drop(columns=['name'])
# Number of missing values in each column of the full frame.
all_df.isna().sum()
survived 0
name 0
pclass 0
sex 0
age 263
sibsp 0
parch 0
fare 1
embarked 2
dtype: int64
# Replace missing ages and fares with the mean of their own column.
age_mean = df['age'].mean()
fare_mean = df['fare'].mean()
df = df.fillna(value={'age': age_mean, 'fare': fare_mean})

# Encode sex as an integer flag: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
df['sex'] = df['sex'].map(sex_codes).astype(int)

# Expand the port of embarkation into one indicator column per value.
x_Onehot_df = pd.get_dummies(df, columns=['embarked'])
x_Onehot_df.head(2)
survived pclass sex age sibsp parch fare embarked_C embarked_Q embarked_S 0 1 1 0 29.0000 0 0 211.3375 0 0 1 1 1 1 1 0.9167 1 2 151.5500 0 0 1
# Pull the encoded frame out as a plain NumPy matrix and report its
# (rows, columns) shape.
ndarray = x_Onehot_df.values
np.shape(ndarray)
(1309, 10)
# First two rows of the encoded matrix.
ndarray[0:2]
array([[ 1. , 1. , 0. , 29. , 0. , 0. ,
211.3375, 0. , 0. , 1. ],
[ 1. , 1. , 1. , 0.9167, 1. , 2. ,
151.55 , 0. , 0. , 1. ]])
# Column 0 is the 'survived' label; the remaining columns are the inputs.
Label, Features = ndarray[:, 0], ndarray[:, 1:]
Label[0:2]
array([1., 1.])
# First two rows of the feature matrix.
Features[0:2]
array([[ 1. , 0. , 29. , 0. , 0. , 211.3375,
0. , 0. , 1. ],
[ 1. , 1. , 0.9167, 1. , 2. , 151.55 ,
0. , 0. , 1. ]])
from sklearn import preprocessing
# Rescale every feature column to the [0, 1] range.
minmax_Scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
minmax_Scale.fit(Features)
scaledFeatures = minmax_Scale.transform(Features)
scaledFeatures[0:2]
array([[0. , 0. , 0.36116884, 0. , 0. ,
0.41250333, 0. , 0. , 1. ],
[0. , 1. , 0.00939458, 0.125 , 0.22222222,
0.2958059 , 0. , 0. , 1. ]])
# Draw one uniform random number per row; rows below 0.8 go to the
# training set, the rest to the test set (roughly an 80/20 split).
msk = np.random.rand(all_df.shape[0]) < 0.8
train_df, test_df = all_df[msk], all_df[~msk]
print('total:', len(all_df), 'train:', len(train_df), 'test:', len(test_df))
total: 1309 train: 1071 test: 238
def PreprocessData(raw_df):
    """Turn a raw passenger frame into scaled features and labels.

    Args:
        raw_df: DataFrame containing at least the 'name', 'sex', 'age',
            'fare' and 'embarked' columns. Once 'name' is dropped, the
            first remaining column is taken as the label, so the label
            column ('survived' in this file) must come first.

    Returns:
        A tuple ``(scaledFeatures, Label)``. ``scaledFeatures`` is a float
        matrix with each column min-max scaled to [0, 1]; ``Label`` is the
        1-D float array read from column 0.
    """
    df = raw_df.drop(['name'], axis=1)

    # Impute each numeric column with its own mean. Missing fares were
    # previously filled with the mean age, which is the wrong statistic.
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)

    # Encode sex as an integer flag: female -> 0, male -> 1.
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)

    # One indicator column per port of embarkation present in raw_df.
    x_Onehot_df = pd.get_dummies(data=df, columns=['embarked'])

    # Force a float matrix: newer pandas releases emit boolean dummy
    # columns, which would otherwise turn .values into an object array.
    ndarray = x_Onehot_df.values.astype('float64')
    Features = ndarray[:, 1:]
    Label = ndarray[:, 0]

    # NOTE: the scaler is fit on the frame passed in, so separate calls
    # (e.g. train vs. test) are each scaled by their own min/max.
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledFeatures = minmax_scale.fit_transform(Features)
    return scaledFeatures, Label
# Run the same preprocessing over the training and the test split.
(train_Features, train_Label), (test_Features, test_Label) = (
    PreprocessData(part) for part in (train_df, test_df)
)
train_Features[0:2]
array([[0. , 0. , 0.0229641 , 0.125 , 0.22222222,
0.2958059 , 0. , 0. , 1. ],
[0. , 1. , 0.37369494, 0.125 , 0.22222222,
0.2958059 , 0. , 0. , 1. ]])
# First two labels of the test split.
test_Label[0:2]
array([1., 1.])
from tensorflow. keras. models import Sequential
from tensorflow. keras. layers import Dense, Dropout
# Fully connected binary classifier: 9 inputs -> 40 -> 30 -> 1 (sigmoid).
model = Sequential([
    Dense(units=40,
          input_dim=9,
          kernel_initializer='uniform',
          activation='relu'),
    Dense(units=30, kernel_initializer='uniform', activation='relu'),
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
])
# Binary cross-entropy loss with the Adam optimizer; track accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
W0819 11:23:43.940761 6100 deprecation_wrapper.py:119] From E:\Anaconda3\envs\ml\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.
W0819 11:23:43.970712 6100 deprecation_wrapper.py:119] From E:\Anaconda3\envs\ml\lib\site-packages\keras\backend\tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.
W0819 11:23:43.976665 6100 deprecation.py:323] From E:\Anaconda3\envs\ml\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
# Train for 30 epochs in batches of 30, holding out 10% of the training
# rows for validation; verbose=2 prints one line per epoch.
train_history = model.fit(
    train_Features,
    train_Label,
    validation_split=0.1,
    batch_size=30,
    epochs=30,
    verbose=2,
)
Train on 963 samples, validate on 108 samples
Epoch 1/30
- 0s - loss: 0.6645 - acc: 0.6023 - val_loss: 0.5840 - val_acc: 0.7685
Epoch 2/30
- 0s - loss: 0.6062 - acc: 0.6594 - val_loss: 0.4936 - val_acc: 0.7870
Epoch 3/30
- 0s - loss: 0.5513 - acc: 0.7487 - val_loss: 0.4564 - val_acc: 0.7870
Epoch 4/30
- 0s - loss: 0.5151 - acc: 0.7747 - val_loss: 0.4487 - val_acc: 0.8056
Epoch 5/30
- 0s - loss: 0.4968 - acc: 0.7757 - val_loss: 0.4538 - val_acc: 0.8056
Epoch 6/30
- 0s - loss: 0.4882 - acc: 0.7736 - val_loss: 0.4354 - val_acc: 0.8056
Epoch 7/30
- 0s - loss: 0.4839 - acc: 0.7695 - val_loss: 0.4277 - val_acc: 0.8148
Epoch 8/30
- 0s - loss: 0.4818 - acc: 0.7788 - val_loss: 0.4254 - val_acc: 0.8148
Epoch 9/30
- 0s - loss: 0.4796 - acc: 0.7840 - val_loss: 0.4231 - val_acc: 0.8333
Epoch 10/30
- 0s - loss: 0.4766 - acc: 0.7819 - val_loss: 0.4247 - val_acc: 0.8148
Epoch 11/30
- 0s - loss: 0.4733 - acc: 0.7830 - val_loss: 0.4240 - val_acc: 0.8148
Epoch 12/30
- 0s - loss: 0.4714 - acc: 0.7840 - val_loss: 0.4174 - val_acc: 0.8333
Epoch 13/30
- 0s - loss: 0.4684 - acc: 0.7871 - val_loss: 0.4181 - val_acc: 0.8426
Epoch 14/30
- 0s - loss: 0.4666 - acc: 0.7871 - val_loss: 0.4169 - val_acc: 0.8426
Epoch 15/30
- 0s - loss: 0.4643 - acc: 0.7892 - val_loss: 0.4151 - val_acc: 0.8519
Epoch 16/30
- 0s - loss: 0.4632 - acc: 0.7892 - val_loss: 0.4134 - val_acc: 0.8426
Epoch 17/30
- 0s - loss: 0.4618 - acc: 0.7902 - val_loss: 0.4133 - val_acc: 0.8426
Epoch 18/30
- 0s - loss: 0.4618 - acc: 0.7913 - val_loss: 0.4145 - val_acc: 0.8056
Epoch 19/30
- 0s - loss: 0.4606 - acc: 0.7944 - val_loss: 0.4160 - val_acc: 0.8426
Epoch 20/30
- 0s - loss: 0.4606 - acc: 0.7934 - val_loss: 0.4155 - val_acc: 0.8148
Epoch 21/30
- 0s - loss: 0.4588 - acc: 0.7944 - val_loss: 0.4124 - val_acc: 0.8426
Epoch 22/30
- 0s - loss: 0.4568 - acc: 0.7954 - val_loss: 0.4136 - val_acc: 0.8426
Epoch 23/30
- 0s - loss: 0.4571 - acc: 0.7985 - val_loss: 0.4152 - val_acc: 0.8333
Epoch 24/30
- 0s - loss: 0.4585 - acc: 0.7923 - val_loss: 0.4190 - val_acc: 0.8056
Epoch 25/30
- 0s - loss: 0.4577 - acc: 0.7923 - val_loss: 0.4162 - val_acc: 0.8426
Epoch 26/30
- 0s - loss: 0.4610 - acc: 0.7882 - val_loss: 0.4192 - val_acc: 0.8426
Epoch 27/30
- 0s - loss: 0.4553 - acc: 0.8006 - val_loss: 0.4156 - val_acc: 0.8333
Epoch 28/30
- 0s - loss: 0.4580 - acc: 0.7902 - val_loss: 0.4186 - val_acc: 0.7963
Epoch 29/30
- 0s - loss: 0.4590 - acc: 0.7975 - val_loss: 0.4145 - val_acc: 0.8426
Epoch 30/30
- 0s - loss: 0.4550 - acc: 0.7934 - val_loss: 0.4165 - val_acc: 0.8241
# Loss and metric values of the trained model on the held-out test split.
scores = model.evaluate(test_Features, test_Label)
238/238 [==============================] - 0s 21us/step
# Second entry of the evaluation result; the model was compiled with the
# single metric 'accuracy', so this is presumably the test accuracy.
scores[ 1 ]
0.8025210089042407
# Two made-up passengers, laid out in the same column order as all_df.
Jack = pd.Series([0, 'Jack', 3, 'male', 23, 1, 0, 5.000, 'S'])
Rose = pd.Series([1, 'Rose', 1, 'female', 20, 1, 0, 100.000, 'S'])
JR_df = pd.DataFrame(
    [list(Jack), list(Rose)],
    columns=[
        'survived', 'name', 'pclass', 'sex', 'age', 'sibsp',
        'parch', 'fare', 'embarked',
    ],
)
# ignore_index renumbers the combined rows; without it the two new rows
# keep labels 0 and 1 and collide with the first two real passengers.
all_df = pd.concat([all_df, JR_df], ignore_index=True)
# Last three rows: the final real passenger followed by Jack and Rose.
all_df[-3:]
survived name pclass sex age sibsp parch fare embarked 1308 0 Zimmerman, Mr. Leo 3 male 29.0 0 0 7.875 S 0 0 Jack 3 male 23.0 1 0 5.000 S 1 1 Rose 1 female 20.0 1 0 100.000 S
# Preprocess every row (including the two appended ones) and score them
# with the trained model. Note that this rebinds the global Label.
all_Features, Label = PreprocessData(all_df)
all_probability = model.predict(x=all_Features)
all_probability[0:10]
array([[0.97387624],
[0.36760893],
[0.9653297 ],
[0.29578814],
[0.96136355],
[0.26288155],
[0.93404984],
[0.27685004],
[0.92254674],
[0.30783302]], dtype=float32)
# NOTE(review): this rebinds `pd` from the pandas module to the DataFrame,
# so pandas functions are no longer reachable through `pd` afterwards.
# The alias is kept because the cells that follow index `pd` directly.
pd = all_df
# Append the predicted survival probability as the last column (in place).
pd.insert(loc=len(pd.columns), column='probability', value=all_probability)
# Last three rows, now with their predicted probability.
pd[-3:]
survived name pclass sex age sibsp parch fare embarked probability 1308 0 Zimmerman, Mr. Leo 3 male 29.0 0 0 7.875 S 0.132631 0 0 Jack 3 male 23.0 1 0 5.000 S 0.130663 1 1 Rose 1 female 20.0 1 0 100.000 S 0.963028
# Passengers who did not survive. `all_df` is the very object the cell
# above aliases as `pd`, so it already carries the 'probability' column.
all_df[all_df['survived'] == 0]
survived name pclass sex age sibsp parch fare embarked probability 2 0 Allison, Miss. Helen Loraine 1 female 2.0 1 2 151.5500 S 0.965330 3 0 Allison, Mr. Hudson Joshua Creighton 1 male 30.0 1 2 151.5500 S 0.295788 4 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) 1 female 25.0 1 2 151.5500 S 0.961364 7 0 Andrews, Mr. Thomas Jr 1 male 39.0 0 0 0.0000 S 0.276850 9 0 Artagaveytia, Mr. Ramon 1 male 71.0 0 0 49.5042 C 0.307833 10 0 Astor, Col. John Jacob 1 male 47.0 1 0 227.5250 C 0.382211 15 0 Baumann, Mr. John D 1 male NaN 0 0 25.9250 S 0.303370 16 0 Baxter, Mr. Quigg Edmond 1 male 24.0 0 1 247.5208 C 0.568902 19 0 Beattie, Mr. Thomson 1 male 36.0 0 0 75.2417 C 0.418435 25 0 Birnbaum, Mr. Jakob 1 male 25.0 0 0 26.0000 C 0.446399 30 0 Blackwell, Mr. Stephen Weart 1 male 45.0 0 0 35.5000 S 0.271255 34 0 Borebank, Mr. John James 1 male 42.0 0 0 26.5500 S 0.275931 38 0 Brady, Mr. John Bertram 1 male 41.0 0 0 30.5000 S 0.278998 39 0 Brandeis, Mr. Emil 1 male 48.0 0 0 50.4958 C 0.364540 40 0 Brewe, Dr. Arthur Jackson 1 male NaN 0 0 39.6000 C 0.429713 45 0 Butt, Major. Archibald Willingham 1 male 45.0 0 0 26.5500 S 0.269356 46 0 Cairns, Mr. Alexander 1 male NaN 0 0 31.0000 S 0.304525 51 0 Carlsson, Mr. Frans Olof 1 male 33.0 0 0 5.0000 S 0.291429 52 0 Carrau, Mr. Francisco M 1 male 28.0 0 0 47.1000 S 0.312617 53 0 Carrau, Mr. Jose Pedro 1 male 17.0 0 0 47.1000 S 0.339315 58 0 Case, Mr. Howard Brown 1 male 49.0 0 0 26.0000 S 0.260632 60 0 Cavendish, Mr. Tyrell William 1 male 36.0 1 0 78.8500 S 0.271148 62 0 Chaffee, Mr. Herbert Fuller 1 male 46.0 1 0 61.1750 S 0.246321 70 0 Chisholm, Mr. Roderick Robert Crispin 1 male NaN 0 0 0.0000 S 0.297509 71 0 Clark, Mr. Walter Miller 1 male 27.0 1 0 136.7792 C 0.426140 74 0 Clifford, Mr. George Quincy 1 male NaN 0 0 52.0000 S 0.309330 75 0 Colley, Mr. Edward Pomeroy 1 male 47.0 0 0 25.5875 S 0.264827 77 0 Compton, Mr. Alexander Taylor Jr 1 male 37.0 1 1 83.1583 C 0.365659 80 0 Crafton, Mr. 
John Bertram 1 male NaN 0 0 26.5500 S 0.303512 81 0 Crosby, Capt. Edward Gifford 1 male 70.0 1 1 71.0000 S 0.200277 ... ... ... ... ... ... ... ... ... ... ... 1276 0 Vander Planke, Mrs. Julius (Emelia Maria Vande... 3 female 31.0 1 0 18.0000 S 0.390119 1278 0 Vendel, Mr. Olof Edvin 3 male 20.0 0 0 7.8542 S 0.144151 1279 0 Vestrom, Miss. Hulda Amanda Adolfina 3 female 14.0 0 0 7.8542 S 0.541694 1280 0 Vovk, Mr. Janko 3 male 22.0 0 0 7.8958 S 0.141521 1281 0 Waelens, Mr. Achille 3 male 22.0 0 0 9.0000 S 0.141519 1282 0 Ware, Mr. Frederick 3 male NaN 0 0 8.0500 S 0.131565 1283 0 Warren, Mr. Charles William 3 male NaN 0 0 7.5500 S 0.131566 1284 0 Webber, Mr. James 3 male NaN 0 0 8.0500 S 0.131565 1285 0 Wenzel, Mr. Linhart 3 male 32.5 0 0 9.5000 S 0.128364 1287 0 Widegren, Mr. Carl/Charles Peter 3 male 51.0 0 0 7.7500 S 0.107727 1288 0 Wiklund, Mr. Jakob Alfred 3 male 18.0 1 0 6.4958 S 0.136882 1289 0 Wiklund, Mr. Karl Johan 3 male 21.0 1 0 6.4958 S 0.133120 1291 0 Willer, Mr. Aaron ("Abi Weller") 3 male NaN 0 0 8.7125 S 0.131564 1292 0 Willey, Mr. Edward 3 male NaN 0 0 7.5500 S 0.131566 1293 0 Williams, Mr. Howard Hugh "Harry" 3 male NaN 0 0 8.0500 S 0.131565 1294 0 Williams, Mr. Leslie 3 male 28.5 0 0 16.1000 S 0.133237 1295 0 Windelov, Mr. Einar 3 male 21.0 0 0 7.2500 S 0.142832 1296 0 Wirz, Mr. Albert 3 male 27.0 0 0 8.6625 S 0.135120 1297 0 Wiseman, Mr. Phillippe 3 male NaN 0 0 7.2500 S 0.131567 1298 0 Wittevrongel, Mr. Camille 3 male 36.0 0 0 9.5000 S 0.124216 1299 0 Yasbeck, Mr. Antoni 3 male 27.0 1 0 14.4542 C 0.161984 1301 0 Youseff, Mr. Gerious 3 male 45.5 0 0 7.2250 C 0.147109 1302 0 Yousif, Mr. Wazli 3 male NaN 0 0 7.2250 C 0.169266 1303 0 Yousseff, Mr. Gerious 3 male NaN 0 0 14.4583 C 0.169295 1304 0 Zabour, Miss. Hileni 3 female 14.5 1 0 14.4542 C 0.674486 1305 0 Zabour, Miss. Thamine 3 female NaN 1 0 14.4542 C 0.603369 1306 0 Zakarian, Mr. Mapriededer 3 male 26.5 0 0 7.2250 C 0.174369 1307 0 Zakarian, Mr. 
Ortin 3 male 27.0 0 0 7.2250 C 0.173603 1308 0 Zimmerman, Mr. Leo 3 male 29.0 0 0 7.8750 S 0.132631 0 0 Jack 3 male 23.0 1 0 5.0000 S 0.130663
810 rows × 10 columns
# Passengers who died although the model gave them a survival probability
# above 0.9. `all_df` is the same object that is aliased as `pd`.
did_not_survive = all_df['survived'] == 0
predicted_to_survive = all_df['probability'] > 0.9
all_df[did_not_survive & predicted_to_survive]
survived name pclass sex age sibsp parch fare embarked probability 2 0 Allison, Miss. Helen Loraine 1 female 2.0 1 2 151.5500 S 0.965330 4 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) 1 female 25.0 1 2 151.5500 S 0.961364 105 0 Evans, Miss. Edith Corse 1 female 36.0 0 0 31.6792 C 0.973539 169 0 Isham, Miss. Ann Elizabeth 1 female 50.0 0 0 28.7125 C 0.971705 286 0 Straus, Mrs. Isidor (Rosalie Ida Blun) 1 female 63.0 1 0 221.7792 S 0.954021
# First five rows with their predicted probability. `all_df` is the same
# object that is aliased as `pd`.
all_df.head(5)
survived name pclass sex age sibsp parch fare embarked probability 0 1 Allen, Miss. Elisabeth Walton 1 female 29.0000 0 0 211.3375 S 0.973876 1 1 Allison, Master. Hudson Trevor 1 male 0.9167 1 2 151.5500 S 0.367609 2 0 Allison, Miss. Helen Loraine 1 female 2.0000 1 2 151.5500 S 0.965330 3 0 Allison, Mr. Hudson Joshua Creighton 1 male 30.0000 1 2 151.5500 S 0.295788 4 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) 1 female 25.0000 1 2 151.5500 S 0.961364