titanic prediction

# Imports

# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# read train.csv and test.csv into DataFrames, parsing the Age column as float64
titanic_df = pd.read_csv("train.csv", dtype={"Age": np.float64})
test_df = pd.read_csv("test.csv", dtype={"Age": np.float64})

# show the first rows of the training data
titanic_df.head()
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale2210A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th…female3810PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale2600STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female351011380353.1000C123S
4503Allen, Mr. William Henrymale35003734508.0500NaNS
# remove columns that are not used as features; test_df keeps PassengerId
# because the submission file is built from it at the end of the script
titanic_df = titanic_df.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
test_df = test_df.drop(['Name', 'Ticket'], axis=1)


# Embarked: fill missing values in the training set and visualise how the
# port of embarkation relates to survival.

# only titanic_df is filled here; per the original author it has two missing
# values and "S" is the most frequent value, so "S" is used as the fill value.
titanic_df["Embarked"] = titanic_df["Embarked"].fillna("S")

# point plot of Survived against Embarked (drawn on its own figure)
# NOTE(review): `factorplot` and its `size` argument are names from older
# seaborn releases - confirm the installed version still provides them.
sns.factorplot('Embarked','Survived', data=titanic_df,size=4,aspect=3)

# second figure: three side-by-side axes for the count/bar plots below
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5))

# earlier factorplot-based versions of the two count plots, kept for reference:
# sns.factorplot('Embarked',data=titanic_df,kind='count',order=['S','C','Q'],ax=axis1)
# sns.factorplot('Survived',hue="Embarked",data=titanic_df,kind='count',order=[1,0],ax=axis2)
# axis1: number of passengers per Embarked value
sns.countplot(x='Embarked', data=titanic_df, ax=axis1)
# axis2: number of passengers per Survived value (1 first, then 0), split by Embarked
sns.countplot(x='Survived', hue="Embarked", data=titanic_df, order=[1,0], ax=axis2)

# axis3: mean of Survived for each Embarked value, shown in the order S, C, Q
embark_perc = titanic_df[["Embarked", "Survived"]].groupby(['Embarked'],as_index=False).mean()
sns.barplot(x='Embarked', y='Survived', data=embark_perc,order=['S','C','Q'],ax=axis3)

# Two options were considered for Embarked:
#   1. keep it as dummy variables, dropping the "S" column and keeping "C"
#      and "Q", which seem to have a good survival rate; or
#   2. drop the column entirely, since logically it may not help prediction.
# Option 1 is implemented below.

# one indicator column per Embarked value, without the "S" column
embark_dummies_titanic = pd.get_dummies(titanic_df['Embarked']).drop(['S'], axis=1)
embark_dummies_test = pd.get_dummies(test_df['Embarked']).drop(['S'], axis=1)

# attach the indicator columns and remove the original Embarked column
titanic_df = titanic_df.join(embark_dummies_titanic).drop(['Embarked'], axis=1)
test_df = test_df.join(embark_dummies_test).drop(['Embarked'], axis=1)

/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

这里写图片描述

这里写图片描述

# print the DataFrame.info() summary of the test and training frames
test_df.info()
titanic_df.info()
# Fare
# Fill the missing "Fare" values in test_df with the column median. The result
# is assigned back to the column: calling fillna(inplace=True) on the selected
# column is a chained in-place update that pandas does not guarantee to
# propagate to test_df, and a leftover NaN would make astype(int) below fail.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
# convert from float to int
titanic_df['Fare'] = titanic_df['Fare'].astype(int)
test_df['Fare'] = test_df['Fare'].astype(int)
# split the training fares by survival outcome (0 = not survived, 1 = survived)
fare_not_survived = titanic_df['Fare'][titanic_df['Survived'] == 0]
fare_survived = titanic_df['Fare'][titanic_df['Survived'] == 1]
# display the fares of passengers who did not survive
fare_not_survived
0 7 4 8 5 8 6 51 7 21 12 8 13 31 14 7 16 29 18 18 20 26 24 21 26 7 27 263 29 7 30 27 33 10 34 82 35 52 37 8 38 18 40 9 41 21 42 7 45 8 46 15 48 21 49 17 50 39 51 7 … 844 8 845 7 846 69 847 7 848 33 850 31 851 7 852 15 854 26 859 7 860 14 861 11 863 69 864 13 867 50 868 9 870 7 872 5 873 9 876 9 877 7 878 7 881 7 882 10 883 10 884 7 885 29 886 13 888 23 890 7 Name: Fare, dtype: int64
# display the fares of passengers who survived (rows where Survived == 1)
fare_survived
1 71 2 7 3 53 8 11 9 30 10 16 11 26 15 16 17 13 19 7 21 13 22 8 23 35 25 31 28 7 31 146 32 7 36 7 39 11 43 41 44 7 47 7 52 76 53 26 55 35 56 10 58 27 61 80 65 15 66 10 … 809 53 820 93 821 8 823 12 827 37 828 7 829 80 830 14 831 18 835 83 838 56 839 29 842 31 849 89 853 39 855 9 856 164 857 26 858 19 862 25 865 13 866 13 869 11 871 52 874 24 875 7 879 83 880 26 887 30 889 30 Name: Fare, dtype: int64
# mean and standard deviation of the fare per outcome;
# row 0 = passengers who did not survive, row 1 = passengers who survived
average_fare = DataFrame([fares.mean() for fares in (fare_not_survived, fare_survived)])
std_fare = DataFrame([fares.std() for fares in (fare_not_survived, fare_survived)])
# histogram of all training fares, x-axis limited to [minimum fare, 100]
titanic_df['Fare'].plot(
    kind='hist',
    figsize=(15, 3),
    bins=100,
    xlim=(titanic_df['Fare'].min(), 100),
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f554fe37b70>

这里写图片描述

# display the mean fare per outcome (row 0: not survived, row 1: survived)
average_fare
0
021.690346
147.991228
# display the fare standard deviation per outcome (row 0: not survived, row 1: survived)
std_fare
0
031.392191
166.608344
# give both frames the same index name, 'survived'
average_fare.index.names = ['survived']
std_fare.index.names = ['survived']
# bar chart of the mean fare per outcome, with the standard deviation as error bars
average_fare.plot(kind='bar', yerr=std_fare, legend=False)
<matplotlib.axes._subplots.AxesSubplot at 0x7f554fa62208>

这里写图片描述

# Age
# Missing ages are replaced by random integers drawn between (mean - std) and
# (mean + std) of the known ages, separately for the training and test sets.
# The left axis shows the age distribution before filling, the right one after.
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
axis1.set_title('Original Age value_titanic')
axis2.set_title('New Age value_titanic')

# mean, standard deviation and number of NaN ages in the training set
average_age_titanic = titanic_df['Age'].mean()
std_age_titanic = titanic_df['Age'].std()
number_of_nan_titanic = titanic_df['Age'].isnull().sum()

# mean, standard deviation and number of NaN ages in the test set
average_age_test = test_df['Age'].mean()
std_age_test = test_df['Age'].std()
number_of_nan_test = test_df['Age'].isnull().sum()

# generate one random value between average - std and average + std
# for every missing age
rand_1 = np.random.randint(
    average_age_titanic - std_age_titanic,
    average_age_titanic + std_age_titanic,
    size=number_of_nan_titanic,
)
rand_2 = np.random.randint(
    average_age_test - std_age_test,
    average_age_test + std_age_test,
    size=number_of_nan_test,
)

# plot the original ages of the training set (missing values dropped)
titanic_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)

# fill the NaN ages with the generated values. A single .loc assignment is
# used instead of df['Age'][mask] = ...: that chained form may write to a
# temporary copy (it triggers SettingWithCopyWarning) and leave the frame
# unchanged.
titanic_df.loc[titanic_df['Age'].isnull(), 'Age'] = rand_1
test_df.loc[test_df['Age'].isnull(), 'Age'] = rand_2

# convert from float to int
titanic_df['Age'] = titanic_df['Age'].astype(int)
test_df['Age'] = test_df['Age'].astype(int)

# plot the ages of the training set after filling
titanic_df['Age'].hist(bins=70, ax=axis2)
/usr/local/lib/python3.5/dist-packages/ipykernel/__main__.py:24: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/usr/local/lib/python3.5/dist-packages/ipykernel/__main__.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy





<matplotlib.axes._subplots.AxesSubplot at 0x7f554f9ad518>

这里写图片描述

# Age (continued): density of Age for each value of Survived, drawn on one
# wide facet so the peaks of the two groups can be compared
facet=sns.FacetGrid(titanic_df,hue='Survived',aspect=4)
# one shaded kernel density curve of Age per Survived value
facet.map(sns.kdeplot,'Age',shade=True)
# x-axis runs from 0 to the largest age in the training set
facet.set(xlim=(0,titanic_df['Age'].max()))
facet.add_legend()
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))





<seaborn.axisgrid.FacetGrid at 0x7f554fac10b8>

这里写图片描述

# mean of Survived for every distinct age, drawn as one bar per age
average_survived_by_age = (
    titanic_df[["Age", "Survived"]]
    .groupby(['Age'], as_index=False)
    .mean()
)
fig, axis1 = plt.subplots(1, 1, figsize=(18, 4))
sns.barplot(x='Age', y='Survived', data=average_survived_by_age)
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))





<matplotlib.axes._subplots.AxesSubplot at 0x7f554f941a90>

这里写图片描述

# Cabin
# per the original author the column has a lot of NaN values, so it is not
# expected to have much impact on prediction and is removed from both frames
titanic_df = titanic_df.drop('Cabin', axis=1)
test_df = test_df.drop('Cabin', axis=1)
titanic_df.head()
SurvivedPclassSexAgeSibSpParchFareCQ
003male2210700
111female38107110
213female2600700
311female35105300
403male3500800
# Family
# Instead of keeping the two columns SibSp & Parch, use a single column that
# records whether the passenger has any family member on board (1) or not (0),
# to see whether having family on board changes the chance of survival.
#
# The flag is computed in one vectorised expression and assigned to the frame
# directly. The earlier df['Family'].loc[mask] = value form is chained
# assignment (it triggers SettingWithCopyWarning) and may modify a temporary
# copy instead of the frame.
titanic_df['Family'] = ((titanic_df['SibSp'] + titanic_df['Parch']) > 0).astype(int)
test_df['Family'] = ((test_df['SibSp'] + test_df['Parch']) > 0).astype(int)

# drop SibSp & Parch now that Family replaces them
titanic_df = titanic_df.drop(['SibSp', 'Parch'], axis=1)
test_df = test_df.drop(['SibSp', 'Parch'], axis=1)

# plot: passenger counts on the left, mean survival on the right
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 5))

# number of passengers with (1) and without (0) family on board
sns.countplot(x='Family', data=titanic_df, order=[1, 0], ax=axis1)

# mean of Survived for each Family value
average_survival_by_Family = titanic_df[['Family', 'Survived']].groupby(['Family'], as_index=False).mean()
sns.barplot(x='Family', y='Survived', data=average_survival_by_Family, order=[1, 0], ax=axis2)
# tick labels follow order=[1, 0]: 1 -> 'with family', 0 -> 'alone'
axis1.set_xticklabels(['with family', 'alone'], rotation=0)
/usr/lib/python3/dist-packages/pandas/core/indexing.py:117: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))





[<matplotlib.text.Text at 0x7f554f65a8d0>,
 <matplotlib.text.Text at 0x7f554f6519b0>]

这里写图片描述

# Sex

# As we can see, children (age < ~16) aboard seem to have a high chance of survival.
# So we can classify passengers as male, female, or child.
def get_person(passenger):
    """Return 'child' for passengers younger than 16, otherwise their sex.

    passenger is an (age, sex) pair, e.g. one row of the ['Age', 'Sex']
    columns. The sex value is returned unchanged when age is not below 16.
    """
    age, sex = passenger
    return 'child' if age < 16 else sex

# label every passenger as 'child', 'female' or 'male' from the Age and Sex columns
titanic_df['Person'] = titanic_df[['Age', 'Sex']].apply(get_person, axis=1)
test_df['Person'] = test_df[['Age', 'Sex']].apply(get_person, axis=1)

# Person replaces Sex, so the Sex column is no longer needed
titanic_df = titanic_df.drop(['Sex'], axis=1)
test_df = test_df.drop(['Sex'], axis=1)

# indicator columns for Person; the columns are relabelled by position and
# the 'Male' column is then left out
person_dummy = pd.get_dummies(titanic_df['Person'])
person_dummy.columns = ['Child', 'Female', 'Male']
person_dummy = person_dummy.drop(['Male'], axis=1)

person_dummy_test = pd.get_dummies(test_df['Person'])
person_dummy_test.columns = ['Child', 'Female', 'Male']
person_dummy_test = person_dummy_test.drop(['Male'], axis=1)

titanic_df = titanic_df.join(person_dummy)
test_df = test_df.join(person_dummy_test)

# left: passenger count per Person value; right: mean of Survived per Person value
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))
sns.countplot(x='Person', data=titanic_df, ax=axis1)
person_perc = (
    titanic_df[['Person', 'Survived']]
    .groupby(['Person'], as_index=False)
    .mean()
)
sns.barplot(x='Person', y='Survived', data=person_perc, ax=axis2, order=['male', 'female', 'child'])
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. warnings.warn(self.msg_depr % (key, alt_key))
# drop the Person column from both frames now that its dummy columns exist
titanic_df.drop(['Person'],axis=1,inplace=True)
test_df.drop(['Person'],axis=1,inplace=True)
# Pclass: point plot of Survived against Pclass, classes shown in the order 1, 2, 3
# NOTE(review): `factorplot` and its `size` argument are names from older
# seaborn releases - confirm the installed version still provides them.
sns.factorplot('Pclass','Survived',order=[1,2,3],data=titanic_df,size=5)
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))





<seaborn.axisgrid.FacetGrid at 0x7f554f7d8c18>

这里写图片描述

# Pclass: replace the column by indicator columns Class1 and Class2
# (the Class3 column is left out).
pclass_dummies_titanic = pd.get_dummies(titanic_df['Pclass'])
pclass_dummies_titanic.columns = ['Class1', 'Class2', 'Class3']
pclass_dummies_titanic.drop(['Class3'], axis=1, inplace=True)

pclass_dummies_test = pd.get_dummies(test_df['Pclass'])
pclass_dummies_test.columns = ['Class1', 'Class2', 'Class3']
pclass_dummies_test.drop(['Class3'], axis=1, inplace=True)

titanic_df.drop(['Pclass'], axis=1, inplace=True)
test_df.drop(['Pclass'], axis=1, inplace=True)

# DataFrame.join returns a new frame and leaves the caller untouched, so the
# result must be assigned back; otherwise Class1/Class2 never reach the
# frames the models are trained and evaluated on.
titanic_df = titanic_df.join(pclass_dummies_titanic)
test_df = test_df.join(pclass_dummies_test)
# display the resulting test frame
test_df
PassengerIdAgeFareCQFamilyChildFemaleClass1Class2
08923470100000
18934770010100
28946290100001
38952780000000
489622120010100
58971490001000
68983070100100
789926290010001
89001871000100
990121240010000
109023070000000
1190346260000010
1290423820010110
1390563260010001
1490647610010110
1590724271010101
1690835120100001
179092171000000
189102770010100
199114571000100
2091255591010010
21913930011000
2291421310000110
2391521611010010
24916482621010110
2591750140010000
2691822611010110
279192271000000
2892041300000010
2992139211010000
38812802170100000
38912816210011000
390128223930000010
391128351390010110
392128413200011000
393128547100000001
394128629220010000
395128718600010110
39612882470100000
397128948791010110
39812902270000000
39912913170100000
4001292301640000110
401129338210010001
402129422591010110
403129517470000010
404129643271010010
405129720131000001
406129823100010001
4071299502111010010
40813003970100100
40913013130011000
41013023870100100
411130337900110110
41213042870000100
41313052580000000
4141306391081000110
41513073870000000
41613083980000000
417130927221010000

418 rows × 10 columns

# training features and labels, plus the test features (PassengerId is an
# identifier, not a feature, so it is excluded)
x_train = titanic_df.drop('Survived', axis=1)
y_train = titanic_df['Survived']
x_test = test_df.drop('PassengerId', axis=1)
# logistic regression: fit on the training data, predict the test set,
# then report the accuracy on the training data
logre = LogisticRegression().fit(x_train, y_train)
y_pred = logre.predict(x_test)
logre.score(x_train, y_train)
0.77104377104377109
# Support Vector Machines: fit an SVC with default parameters on the training
# data, predict the test set and report the accuracy on the training data

svc = SVC()

svc.fit(x_train, y_train)

# NOTE(review): this stores the predictions in Y_pred (capital Y), a different
# name from the y_pred used by the other models and by the submission code,
# so these predictions are not what gets written to the submission file.
Y_pred = svc.predict(x_test)

svc.score(x_train, y_train)
0.88327721661054992
# Random forest with 100 trees: fit on the training data, predict the test
# set (overwriting y_pred), then report the accuracy on the training data
random_forest = RandomForestClassifier(n_estimators=100).fit(x_train, y_train)
y_pred = random_forest.predict(x_test)
random_forest.score(x_train, y_train)
0.96520763187429859
# pair every test PassengerId with the current y_pred and write the result
# to titanic.csv without the index column
submission = pd.DataFrame({"PassengerId": test_df["PassengerId"], "Survived": y_pred})
submission.to_csv('titanic.csv', index=False)

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

hebastast

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值