# ----------------Load lib and import data---------------------
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import Perceptron
from sklearn import tree
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
In [2]:
# import data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [3]:
# show the data
train.head()
Out[3]:
In [4]:
test.head()
Out[4]:
In [5]:
# -----------------Understand each feature of your data------------------
train.describe()
Out[5]:
In [6]:
train.info()  # info() prints directly and returns None, so don't pass it to print()
print(train.isnull().sum())
# Age, Cabin and Embarked have missing values; Cabin is missing for most rows
In [7]:
# check the class balance (survived vs. not survived)
train_suv = train[train['Survived']==1]
train_unsuv = train[train['Survived']==0]
suv_num = len(train_suv)
unsuv_num = len(train_unsuv)
print ("survived: %i (%.1f percent) and unsurvived: %i (%.1f percent)" \
%(suv_num, float(suv_num)/len(train)*100, unsuv_num, float(unsuv_num)/len(train)*100))
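# The same balance can be read off in one line (an equivalent sketch):
print(train['Survived'].value_counts(normalize=True))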
In [8]:
plt.figure(figsize=[15, 5])
plt.subplot(111)
age_train = train[['Age','Survived']].groupby(['Age'], as_index=False)
avg_age = age_train.mean()
sns.barplot(x='Age', y='Survived', data=avg_age)
plt.figure(figsize=[15, 10])
# plt.subplot(331)
# sns.distplot(train_suv['Age'].dropna().values, color='g', label='suv', \
# axlabel='Age', kde_kws={'label': 'suv'}, bins=range(0,81,1))
# sns.distplot(train_unsuv['Age'].dropna().values, color='r', label='unsuv', \
# axlabel='Age', kde_kws={'label': 'unsuv'}, bins=range(0,81,1))
plt.subplot(231)
sns.barplot(x='Pclass', y='Survived', data=train)
plt.subplot(232)
sns.barplot(x='Sex', y='Survived', data=train, order=['female', 'male'])
plt.subplot(233)
sns.barplot(x='SibSp', y='Survived', data=train)
plt.subplot(234)
sns.barplot(x='Parch', y='Survived', data=train)
plt.subplot(235)
sns.barplot(x='Embarked', y='Survived', data=train)
plt.subplot(236)
sns.distplot(train_suv['Fare'].dropna().values, color='g', label='suv', \
axlabel='Fare', kde_kws={'label': 'suv'})
sns.distplot(train_unsuv['Fare'].dropna().values, color='r', label='unsuv', \
axlabel='Fare', kde_kws={'label': 'unsuv'})
Out[8]:
In [9]:
# special processing for Cabin, Ticket and Name
# Cabin
# keep only the first letter (the deck) as the cabin marker
for item in train['Cabin'].dropna().index:
    train.loc[item, 'Cabin'] = train.loc[item, 'Cabin'][0]
# check if cabin info useful
cabin_mean = train[['Survived', 'Cabin']].iloc[train['Cabin'].dropna().index].groupby(['Cabin'], as_index=False).mean()
plt.figure(figsize=[10, 5])
plt.subplot(121)
sns.barplot(x='Cabin', y='Survived', data=cabin_mean)
# check the difference between cabin known and cabin unknown
cabin_known_mean = train['Survived'][~pd.isnull(train['Cabin'])].mean()
cabin_unknown_mean = train['Survived'][pd.isnull(train['Cabin'])].mean()
cabin_mean = pd.DataFrame([cabin_known_mean, cabin_unknown_mean])
cabin_mean.columns = ['Survived']
# print(cabin_known_mean, cabin_unknown_mean)
plt.subplot(122)
sns.barplot(x=['Cabin_known','Cabin_unknown'], y='Survived', data=cabin_mean) # wow, can be this way
# Passengers with a known cabin were more likely to survive, but using the deck
# letter itself as a feature would be risky: the test set may contain letters
# never seen in training. Later we keep only a known/unknown flag.
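# A vectorized equivalent of the deck-letter loop above (idempotent at this
# point, since the column already holds single letters):
train['Cabin'] = train['Cabin'].str[0]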
Out[9]:
In [11]:
# Family: intuitively, a passenger travelling alone may have had a lower chance
# of survival, so collapse SibSp + Parch into a binary Family flag
train_family = pd.DataFrame(train['Parch'] + train['SibSp'])
train_family.columns = ['Family']
train_family['Survived'] = train['Survived']
train_family.loc[train_family['Family']>1, 'Family'] = 1
plt.figure(figsize=[5, 5])
plt.subplot(111)
sns.barplot(x='Family', y='Survived', data=train_family)
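# Equivalent vectorized construction (a minimal sketch): flag passengers who
# have any relatives aboard instead of clipping the count.
train_family = pd.DataFrame({'Family': ((train['Parch'] + train['SibSp']) > 0).astype(int),
                             'Survived': train['Survived']})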
Out[11]:
In [13]:
# Sex must be numeric before the correlation heatmap / pairplot below can use it
train['Sex'] = train['Sex'].map({'female': 1, 'male': 0})
#print(train.head())
#print(type(train.loc[1,'Pclass']))
#plt.figure(figsize=[15, 14])
#plt.subplot(111)
#sns.heatmap(train.drop(['PassengerId'], axis=1).corr(), annot=True)
# -----------------Fill the missing values for both train and test------------------
In [14]:
cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
r = sns.pairplot(data=train.dropna(), vars=cols, hue='Survived', palette=['r', 'b'])
r.set(xticklabels=[])
Out[14]:
In [16]:
# sex ratio per port of embarkation (Sex is already encoded: 1 = female, 0 = male)
tab = pd.crosstab(train['Embarked'], train['Sex'])
print(tab)
tab = tab.div(tab.sum(1).astype('float'), axis=0)
tab.plot(kind='bar', stacked=True)
Out[16]:
In [17]:
train.info()
print(train.isnull().sum())
# Age, Cabin and Embarked still have missing values; Cabin is missing for most rows
In [18]:
test.info()
print(test.isnull().sum())
# the test data has missing values in Age, Fare and Cabin
In [19]:
# --------------------------------Fill missing values for data------------------------------------
# fill Age for both train and test with random integers drawn from
# [mean - std, mean + std]
avg_age_train = train['Age'].mean()
std_age_train = train['Age'].std()
print(avg_age_train, std_age_train)
num_null_age_train = sum(train['Age'].isnull())
print(num_null_age_train)
rand_fill_age = np.random.randint(avg_age_train-std_age_train, avg_age_train+std_age_train,size=num_null_age_train)
train.loc[train['Age'].isnull(), 'Age'] = rand_fill_age
avg_age_test = test['Age'].mean()
std_age_test = test['Age'].std()
print(avg_age_test, std_age_test)
num_null_age_test = sum(test['Age'].isnull())
print(num_null_age_test)
rand_fill_age = np.random.randint(avg_age_test-std_age_test, avg_age_test+std_age_test,size=num_null_age_test)
test.loc[test['Age'].isnull(), 'Age'] = rand_fill_age
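# An alternative worth trying (a sketch, not what this notebook uses): a
# deterministic group-median imputation instead of the random fill, e.g.
# train['Age'] = train.groupby(['Pclass', 'Sex'])['Age'].transform(lambda s: s.fillna(s.median()))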
In [20]:
# filling fare values
test.loc[test['Fare'].isnull(), 'Fare'] = test['Fare'].median()
In [21]:
# --------------------------------Engineered features-----------------------------------------
data = pd.concat([train.drop(['Survived'], axis=1), test], ignore_index=True)  # ignore_index avoids duplicate row labels, which would break the .loc writes below
print(len(train), len(test), len(data))
In [22]:
# remove PassengerId, Name, Embarked
data= data.drop(['PassengerId', 'Name', 'Embarked'], axis=1)
print(data.columns)
# add family, remove SibSp and Parch
data['Family'] = data['SibSp'] + data['Parch']
data.loc[data['Family']>1, 'Family'] = 1
data = data.drop(['SibSp', 'Parch'], axis=1)
print(data.head())
# turn Ticket into a binary TShare flag: 1 if the ticket number is shared
data_share_index = []
data_no_share_index = []
data_ticket_group = data.groupby(['Ticket'], as_index=False)
for ticket, group in data_ticket_group:
    if len(group) > 1:
        data_share_index.extend(group.index)
    else:
        data_no_share_index.extend(group.index)
data['TShare'] = 0
data.loc[data_share_index, 'TShare'] = 1
data = data.drop(['Ticket'], axis=1)
# collapse Cabin to a known/unknown flag (0/1)
cabin_null = pd.isnull(data['Cabin'])
data.loc[cabin_null, 'Cabin'] = 0
data.loc[~cabin_null, 'Cabin'] = 1
data['Cabin'] = data['Cabin'].astype(int)
# encode Sex as 0/1 (train rows were converted earlier; only test rows still hold strings)
data.loc[data['Sex']=='female', 'Sex'] = 1
data.loc[data['Sex']=='male', 'Sex'] = 0
data['Sex'] = data['Sex'].astype(int)
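# For reference, the TShare loop collapses to one groupby/transform line if run
# before Ticket is dropped (an equivalent sketch):
# data['TShare'] = (data.groupby('Ticket')['Ticket'].transform('count') > 1).astype(int)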
In [23]:
# test removing a column
# data = data.drop(['TShare'], axis=1)
data_train = data[:len(train)].copy()  # .copy() avoids SettingWithCopyWarning on the next line
data_train['Survived'] = train['Survived']
data_test = data[len(train):]
print(len(train), data_train.head())
print('\n')
print(len(test), data_test.head())
x_data_train = data_train.drop(['Survived'], axis=1)
y_data_train = data_train['Survived']
x_data_test = data_test
In [24]:
# logistic regression
logreg = LogisticRegressionCV(max_iter=100)
logreg.fit(x_data_train, y_data_train)
print(logreg.get_params())
print(logreg.score(x_data_train, y_data_train))
logreg_score = cross_val_score(logreg, x_data_train, y_data_train, cv=5).mean()
print(logreg_score)
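# LogisticRegressionCV picks the regularization strength internally; the chosen
# C and the fitted coefficients can be inspected (a quick sanity-check sketch):
print(logreg.C_)
print(pd.Series(logreg.coef_[0], index=x_data_train.columns))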
In [25]:
# perceptron
ptron = Perceptron(max_iter=40)
ptron.fit(x_data_train, y_data_train)
print(ptron.get_params())
print(ptron.score(x_data_train, y_data_train))
ptron_score = cross_val_score(ptron, x_data_train, y_data_train, cv=5).mean()
print(ptron_score)
In [26]:
# KNN
kNN = KNeighborsClassifier(n_neighbors=4, weights='distance')
kNN.fit(x_data_train, y_data_train)
print(kNN.get_params())
print(kNN.score(x_data_train, y_data_train))
kNN_score = cross_val_score(kNN, x_data_train, y_data_train, cv=5).mean()
print(kNN_score)
In [38]:
# SVM
svm_kernel = svm.SVC(class_weight='balanced')
svm_kernel.fit(x_data_train, y_data_train)
print(svm_kernel.get_params())
print(svm_kernel.score(x_data_train, y_data_train))
svm_score = cross_val_score(svm_kernel, x_data_train, y_data_train, cv=5).mean()
print(svm_score)
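# RBF-kernel SVMs (and KNN above) are sensitive to feature scale, and Fare/Age
# dwarf the 0/1 columns here. A pipeline with standardization is worth trying
# (a sketch; StandardScaler is not used elsewhere in this notebook):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
svm_scaled = make_pipeline(StandardScaler(), svm.SVC(class_weight='balanced'))
print(cross_val_score(svm_scaled, x_data_train, y_data_train, cv=5).mean())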
In [101]:
# Decision tree
dec_tree = tree.DecisionTreeClassifier(min_weight_fraction_leaf=0.01)
dec_tree.fit(x_data_train, y_data_train)
print(dec_tree.get_params())
print(dec_tree.score(x_data_train, y_data_train))
dec_score = cross_val_score(dec_tree, x_data_train, y_data_train, cv=10).mean()
print(dec_score)
In [214]:
# random forest
rand_forest = RandomForestClassifier(n_estimators=100, min_weight_fraction_leaf=0.01)
rand_forest.fit(x_data_train, y_data_train)
print(rand_forest.get_params())
print(rand_forest.score(x_data_train, y_data_train))
rand_forest_score = cross_val_score(rand_forest, x_data_train, y_data_train, cv=10).mean()
print(rand_forest_score)
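# Which engineered features drive the forest? feature_importances_ gives a
# quick read (a sketch):
print(pd.Series(rand_forest.feature_importances_, index=x_data_train.columns).sort_values(ascending=False))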
In [181]:
# bagging
bagging = BaggingClassifier(base_estimator=tree.DecisionTreeClassifier(min_weight_fraction_leaf=0.01), n_estimators=20)
bagging.fit(x_data_train, y_data_train)
print(bagging.get_params())
print(bagging.score(x_data_train, y_data_train))
bagging_score = cross_val_score(bagging, x_data_train, y_data_train, cv=5).mean()
print(bagging_score)
In [177]:
# extra tree
extra_tree = ExtraTreesClassifier(n_estimators=20, min_samples_split=20)
extra_tree.fit(x_data_train, y_data_train)
print(extra_tree.get_params())
print(extra_tree.score(x_data_train, y_data_train))
extra_tree_score = cross_val_score(extra_tree, x_data_train, y_data_train, cv=5).mean()
print(extra_tree_score)
In [182]:
# Gradient boosting
grad_boost = GradientBoostingClassifier(n_estimators=50)
grad_boost.fit(x_data_train, y_data_train)
print(grad_boost.get_params())
print(grad_boost.score(x_data_train, y_data_train))
grad_boost_score = cross_val_score(grad_boost, x_data_train, y_data_train, cv=5).mean()
print(grad_boost_score)
In [171]:
# adaboost
adaboost = AdaBoostClassifier(n_estimators=50)
adaboost.fit(x_data_train, y_data_train)
print(adaboost.get_params())
print(adaboost.score(x_data_train, y_data_train))
adaboost_score = cross_val_score(adaboost, x_data_train, y_data_train, cv=5).mean()
print(adaboost_score)
In [34]:
# Bayes
gauss = GaussianNB()
gauss.fit(x_data_train, y_data_train)
print(gauss.get_params())
print(gauss.score(x_data_train, y_data_train))
gauss_score = cross_val_score(gauss, x_data_train, y_data_train, cv=5).mean()
print(gauss_score)
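# Collect the cross-validation means computed above to compare models side by
# side (a convenience sketch; the variable names follow the cells above):
scores = pd.Series({'logreg': logreg_score, 'perceptron': ptron_score, 'kNN': kNN_score,
                    'svm': svm_score, 'dec_tree': dec_score, 'rand_forest': rand_forest_score,
                    'bagging': bagging_score, 'extra_tree': extra_tree_score,
                    'grad_boost': grad_boost_score, 'adaboost': adaboost_score, 'gauss': gauss_score})
print(scores.sort_values(ascending=False))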
In [35]:
# voting: VotingClassifier needs an explicit `estimators` list; a minimal
# hard-voting ensemble reusing three of the classifiers fit above
voting = VotingClassifier(estimators=[('lr', logreg), ('rf', rand_forest), ('gb', grad_boost)])
voting.fit(x_data_train, y_data_train)
print(voting.score(x_data_train, y_data_train))
voting_score = cross_val_score(voting, x_data_train, y_data_train, cv=5).mean()
print(voting_score)
In [216]:
clf = rand_forest
pred_test = clf.predict(x_data_test)
submit = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': pred_test})
print(submit.head())
submit.to_csv("output.csv", index=False)