找到了一篇基于深度学习进行泰坦尼克比赛预测的代码。写得非常好,我做了一点点修改,最终准确率为0.794。
代码是基于keras写的,以后再做此类问题时可以模仿着进行迁移。原文链接:
https://www.kaggle.com/rafaelvleite/titanic-artificial-neural-network-80-score/code
代码如下:
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 7 21:34:37 2018
@author: www
"""
import numpy as np
import pandas as pd
# Load the train and test CSVs and stack them into one frame so that every
# preprocessing step below is applied identically to both splits.
dataset_test = pd.read_csv('./input/test.csv')
dataset_train = pd.read_csv('./input/train.csv')
df1 = pd.DataFrame(dataset_train)
df2 = pd.DataFrame(dataset_test)
# sort=True keeps the alphabetical column order that pandas < 1.0 produced by
# default when the two frames' columns differ, so the column layout does not
# depend on the installed pandas version. ignore_index=True gives every row a
# unique label instead of repeating 0..n-1 for each input frame.
dataset_joined = pd.concat([df1, df2], sort=True, ignore_index=True)

# Keep only the text between the first comma and the following period of
# 'Name' (the title, e.g. ' Mr'); the leading space is intentionally kept.
dataset_joined['Name'] = dataset_joined['Name'].str.split(',').str[1]
dataset_joined['Name'] = dataset_joined['Name'].str.split('.').str[0]

dataset_joined.drop(['Ticket', 'Cabin'], axis=1, inplace=True)

# Missing embarkation ports are filled with 'S'. Assign the result back:
# calling fillna(..., inplace=True) on a column selection operates on a
# temporary object under pandas Copy-on-Write and would leave the frame
# unchanged.
dataset_joined['Embarked'] = dataset_joined['Embarked'].fillna('S')
# Encode the title stored in 'Name' as a small integer code:
#   1 -> ' Mr', 0 -> ' Miss' / ' Mrs', 2 -> every other listed title.
# Keys carry a leading space because that is how the titles appear in 'Name'.
# NOTE(review): ' Mile' looks like a misspelling of ' Mlle' (which is also
# listed); it is kept so the mapping covers exactly the original titles.
title_codes = {' Mr': 1, ' Miss': 0, ' Mrs': 0}
title_codes.update(dict.fromkeys(
    [' Capt', ' Col', ' Don', ' Dona', ' Dr', ' Jonkheer', ' Lady',
     ' Major', ' Master', ' Mile', ' Mlle', ' Mme', ' Ms', ' Rev', ' Sir',
     ' the Countess'],
    2,
))
# One dictionary lookup per row instead of a whole-column replace() per row;
# titles that are not in the table are passed through unchanged.
names = dataset_joined['Name'].map(lambda title: title_codes.get(title, title))
dataset_joined['Name'] = names
from sklearn.preprocessing import LabelEncoder
# Replace the categories in 'Sex' and 'Embarked' with integer codes, keeping
# one fitted encoder per column.
labelencoder_sex = LabelEncoder()
labelencoder_embarked = LabelEncoder()
for column, encoder in (('Sex', labelencoder_sex),
                        ('Embarked', labelencoder_embarked)):
    dataset_joined[column] = encoder.fit_transform(dataset_joined[column])
# Median 'Age' and 'Fare' for every (Sex code, Pclass) combination.
# Row index = encoded Sex value (0 or 1); column index = Pclass - 1 (0..2).
median_ages = np.zeros((2, 3))
median_fares = np.zeros((2, 3))
for sex_code in range(2):
    same_sex = dataset_joined['Sex'] == sex_code
    for class_offset in range(3):
        same_class = dataset_joined['Pclass'] == class_offset + 1
        cell = dataset_joined[same_sex & same_class]
        median_ages[sex_code, class_offset] = cell['Age'].dropna().median()
        median_fares[sex_code, class_offset] = cell['Fare'].dropna().median()
# Build imputed copies of 'Age' and 'Fare': a missing value is replaced by the
# median of its (Sex code, Pclass) group, and a 0/1 flag column records which
# rows were originally missing.
age_missing = dataset_joined['Age'].isnull()
fare_missing = dataset_joined['Fare'].isnull()

dataset_joined['AgeFill'] = dataset_joined['Age'].copy()
dataset_joined['FareFill'] = dataset_joined['Fare'].copy()

for sex_code in range(2):
    for class_offset in range(3):
        in_group = ((dataset_joined['Sex'] == sex_code)
                    & (dataset_joined['Pclass'] == class_offset + 1))
        dataset_joined.loc[age_missing & in_group, 'AgeFill'] = \
            median_ages[sex_code, class_offset]
        dataset_joined.loc[fare_missing & in_group, 'FareFill'] = \
            median_fares[sex_code, class_offset]

dataset_joined['AgeIsNull'] = age_missing.astype(int)
dataset_joined['FareIsNull'] = fare_missing.astype(int)
# The raw columns are superseded by 'AgeFill' / 'FareFill'.
dataset_joined.drop(['Age', 'Fare'], axis=1, inplace=True)

# Rows without a 'Survived' value (presumably the test rows, which carry no
# label) get the placeholder -1. Assign the result back: an in-place fillna on
# a column selection acts on a temporary object under pandas Copy-on-Write and
# would leave the frame unchanged.
dataset_joined['Survived'] = dataset_joined['Survived'].fillna(-1)
dataset_joined.info()

# One-hot encode the categorical columns; the new columns are named
# '<column>_<value>'.
dataset_joined = pd.get_dummies(dataset_joined, columns=['Embarked', 'Name', 'Pclass'])
# Split the combined frame back into its train and test parts. The training
# rows come first, so the boundary is the length of the training CSV.
n_train = len(dataset_train)
dataset_train_revised = dataset_joined.iloc[:n_train, :]
dataset_test_revised = dataset_joined.iloc[n_train:, :]

# Select model inputs by column name. Positional indices only point at these
# columns when the combined frame happens to be in alphabetical column order,
# which depends on the pandas version; with a different order the label
# column would silently become a feature column such as 'Parch'.
feature_columns = [
    'Parch', 'Sex', 'SibSp', 'AgeFill', 'FareFill',
    'Embarked_0', 'Embarked_1', 'Embarked_2',
    'Name_0', 'Name_1', 'Name_2',
]
X_train = dataset_train_revised[feature_columns]
# Kept as a one-column DataFrame so that y_train.values is 2-D, shape (n, 1).
y_train = dataset_train_revised[['Survived']]
X_test = dataset_test_revised[feature_columns]
pass_index = dataset_test_revised['PassengerId']
from sklearn.preprocessing import StandardScaler
# Standardise the features: the scaler is fitted on the training features
# only, and that same fitted transform is then applied to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
import numpy
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
# Seed NumPy's global random generator.
# NOTE(review): whether this alone makes the Keras weight initialisation
# reproducible depends on the backend in use - confirm.
seed = 7
numpy.random.seed(seed)

# Network: one hidden layer of 15 tanh units followed by a single sigmoid
# output unit (binary classification).
classifier = Sequential()
# The input width is taken from the data rather than hard-coded, so it always
# matches the number of selected feature columns.
classifier.add(Dense(15, activation='tanh', input_dim=X_train.shape[1]))
classifier.add(Dense(1, activation='sigmoid'))

# The learning rate is passed positionally: older Keras releases name this
# first parameter `lr`, newer ones `learning_rate`, and each rejects the other
# spelling as a keyword.
optimizer = SGD(0.01, momentum=0.9)
classifier.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train on the training set.
classifier.fit(X_train, y_train.values, batch_size=10, epochs=50)
y_pred = classifier.predict(X_test)
# Any prediction > 0.5 counts as 1, anything <= 0.5 as 0. An explicit
# threshold gives the same labels as rounding the sigmoid output (0.5 rounds
# down to 0) without relying on np.round_, which NumPy 2.0 removed.
y_pred = (y_pred > 0.5).astype(np.int64)

# Part 4 - Generating the submission file: the passenger id column of the
# test CSV plus the predicted label.
only_final_values = pd.read_csv('./input/test.csv')
# predict() returns one column per output unit; flatten the (n, 1) array
# before storing it as a single column.
only_final_values['Survived'] = y_pred.ravel()
only_final_values = only_final_values[['PassengerId', 'Survived']]
only_final_values.to_csv('titanic_submission.csv', index=False)