titanic prediction

最新推荐文章于 2022-08-18 16:15:38 发布

hebastast

最新推荐文章于 2022-08-18 16:15:38 发布

阅读量1.4k

点赞数 1

分类专栏： python 机器学习数据挖掘文章标签：机器学习

本文链接：https://blog.csdn.net/HE19930303/article/details/52318706

版权

机器学习同时被 3 个专栏收录

19 篇文章 0 订阅

订阅专栏

数据挖掘

14 篇文章 0 订阅

订阅专栏

python

3 篇文章 0 订阅

订阅专栏

# Imports

# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Load the training and test CSV files, forcing the Age column to float64.
age_dtype = {"Age": np.float64}
titanic_df = pd.read_csv("train.csv", dtype=age_dtype)
test_df = pd.read_csv("test.csv", dtype=age_dtype)

# Show the first rows of the training frame.
titanic_df.head()

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th…	female	38	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35	0	373450	8.0500	NaN	S

# Remove columns that are not used as features. The test frame keeps
# PassengerId because the submission file built later reads it.
train_unused = ['PassengerId', 'Name', 'Ticket']
test_unused = ['Name', 'Ticket']
titanic_df = titanic_df.drop(train_unused, axis=1)
test_df = test_df.drop(test_unused, axis=1)



# Embarked

# Per the original note, only titanic_df has missing Embarked values (two of
# them); they are filled with "S", the most frequent value.
titanic_df["Embarked"] = titanic_df["Embarked"].fillna("S")

# Survival rate per port of embarkation.
sns.factorplot('Embarked', 'Survived', data=titanic_df, size=4, aspect=3)

fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(15, 5))

# Passenger counts per port, and survived / not-survived counts split by port.
sns.countplot(x='Embarked', data=titanic_df, ax=axis1)
sns.countplot(x='Survived', hue="Embarked", data=titanic_df, order=[1, 0], ax=axis2)

# Mean of Survived for each Embarked value.
embark_perc = (
    titanic_df[["Embarked", "Survived"]]
    .groupby(['Embarked'], as_index=False)
    .mean()
)
sns.barplot(x='Embarked', y='Survived', data=embark_perc, order=['S', 'C', 'Q'], ax=axis3)

# Two options were considered by the author:
#   1. keep Embarked as dummy variables, dropping "S" and keeping "C" and "Q",
#      since those two seem to have a good survival rate; or
#   2. drop Embarked entirely, because logically it doesn't seem useful.
# Option 1 is what the code below does.
embark_dummies_titanic = pd.get_dummies(titanic_df['Embarked']).drop(['S'], axis=1)
embark_dummies_test = pd.get_dummies(test_df['Embarked']).drop(['S'], axis=1)

# Attach the C/Q indicator columns and remove the original text column.
titanic_df = titanic_df.join(embark_dummies_titanic).drop(['Embarked'], axis=1)
test_df = test_df.join(embark_dummies_test).drop(['Embarked'], axis=1)

/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

这里写图片描述

# Print the column / dtype / non-null summary of the test frame.
test_df.info()

# Print the same summary for the training frame.
titanic_df.info()

# Fare
# Fill the missing "Fare" in test_df with the test-set median. The result is
# assigned back to the column instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object and is
# not guaranteed to update test_df itself.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

# Convert from float to int.
titanic_df['Fare'] = titanic_df['Fare'].astype(int)
test_df['Fare'] = test_df['Fare'].astype(int)

# Split the training fares by outcome (Survived == 0 / Survived == 1).
fare_not_survived = titanic_df['Fare'][titanic_df['Survived'] == 0]
fare_survived = titanic_df['Fare'][titanic_df['Survived'] == 1]

# Display the fares of passengers who did not survive.
fare_not_survived

0 7 4 8 5 8 6 51 7 21 12 8 13 31 14 7 16 29 18 18 20 26 24 21 26 7 27 263 29 7 30 27 33 10 34 82 35 52 37 8 38 18 40 9 41 21 42 7 45 8 46 15 48 21 49 17 50 39 51 7 … 844 8 845 7 846 69 847 7 848 33 850 31 851 7 852 15 854 26 859 7 860 14 861 11 863 69 864 13 867 50 868 9 870 7 872 5 873 9 876 9 877 7 878 7 881 7 882 10 883 10 884 7 885 29 886 13 888 23 890 7 Name: Fare, dtype: int64

# Display the fares of passengers who survived.
fare_survived

1 71 2 7 3 53 8 11 9 30 10 16 11 26 15 16 17 13 19 7 21 13 22 8 23 35 25 31 28 7 31 146 32 7 36 7 39 11 43 41 44 7 47 7 52 76 53 26 55 35 56 10 58 27 61 80 65 15 66 10 … 809 53 820 93 821 8 823 12 827 37 828 7 829 80 830 14 831 18 835 83 838 56 839 29 842 31 849 89 853 39 855 9 856 164 857 26 858 19 862 25 865 13 866 13 869 11 871 52 874 24 875 7 879 83 880 26 887 30 889 30 Name: Fare, dtype: int64

# Mean and standard deviation of fare: row 0 = not survived, row 1 = survived.
fare_groups = [fare_not_survived, fare_survived]
average_fare = DataFrame([grp.mean() for grp in fare_groups])
std_fare = DataFrame([grp.std() for grp in fare_groups])

# Histogram of the training fares; the x-axis runs from the minimum fare to 100.
min_fare = titanic_df['Fare'].min()
titanic_df['Fare'].plot(kind='hist', figsize=(15, 3), bins=100, xlim=(min_fare, 100))

<matplotlib.axes._subplots.AxesSubplot at 0x7f554fe37b70>

这里写图片描述

# Display the mean fare per outcome (row 0 = not survived, row 1 = survived).
average_fare

	0
0	21.690346
1	47.991228

# Display the fare standard deviation per outcome (row 0 = not survived, row 1 = survived).
std_fare

	0
0	31.392191
1	66.608344

# Give the index of both frames the name 'survived'.
index_label = ['survived']
average_fare.index.names = index_label
std_fare.index.names = index_label

# Bar chart of the mean fare per outcome, with the standard deviation as error bars.
average_fare.plot(kind='bar', yerr=std_fare, legend=False)

<matplotlib.axes._subplots.AxesSubplot at 0x7f554fa62208>

这里写图片描述

# Age
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
axis1.set_title('Original Age value_titanic')
axis2.set_title('New Age value_titanic')

# Mean, standard deviation and number of NaN ages in the training frame.
average_age_titanic = titanic_df['Age'].mean()
std_age_titanic = titanic_df['Age'].std()
number_of_nan_titanic = titanic_df['Age'].isnull().sum()

# Mean, standard deviation and number of NaN ages in the test frame.
average_age_test = test_df['Age'].mean()
std_age_test = test_df['Age'].std()
number_of_nan_test = test_df['Age'].isnull().sum()

# Generate one random integer age per missing value, drawn from
# [mean - std, mean + std). The float bounds are truncated to int explicitly
# because randint works on integer limits.
rand_1 = np.random.randint(
    int(average_age_titanic - std_age_titanic),
    int(average_age_titanic + std_age_titanic),
    size=number_of_nan_titanic,
)
rand_2 = np.random.randint(
    int(average_age_test - std_age_test),
    int(average_age_test + std_age_test),
    size=number_of_nan_test,
)

# Plot the original (non-missing) training ages before they are modified.
titanic_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)

# Fill the NaN ages with the generated values. A single .loc assignment on the
# frame is used instead of chained indexing (df['Age'][mask] = ...), which
# raises SettingWithCopyWarning and may write to a temporary copy, leaving the
# NaNs in place.
titanic_df.loc[titanic_df['Age'].isnull(), 'Age'] = rand_1
test_df.loc[test_df['Age'].isnull(), 'Age'] = rand_2

# Convert from float to int (only possible once no NaN remains).
titanic_df['Age'] = titanic_df['Age'].astype(int)
test_df['Age'] = test_df['Age'].astype(int)

# Plot the training ages after imputation.
titanic_df['Age'].hist(bins=70, ax=axis2)

/usr/local/lib/python3.5/dist-packages/ipykernel/__main__.py:24: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/usr/local/lib/python3.5/dist-packages/ipykernel/__main__.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy





<matplotlib.axes._subplots.AxesSubplot at 0x7f554f9ad518>

这里写图片描述

# Density of Age for each Survived value, drawn on one wide facet.
facet = sns.FacetGrid(titanic_df, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)

# Limit the x-axis to the range from 0 to the largest age in the training frame.
oldest = titanic_df['Age'].max()
facet.set(xlim=(0, oldest))
facet.add_legend()

/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))





<seaborn.axisgrid.FacetGrid at 0x7f554fac10b8>

这里写图片描述

# Mean of Survived for every distinct Age value, shown as a bar chart.
fig, axis1 = plt.subplots(1, 1, figsize=(18, 4))
age_and_outcome = titanic_df[["Age", "Survived"]]
average_survived_by_age = age_and_outcome.groupby(['Age'], as_index=False).mean()
sns.barplot(x='Age', y='Survived', data=average_survived_by_age)

/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))





<matplotlib.axes._subplots.AxesSubplot at 0x7f554f941a90>

这里写图片描述

# Cabin
# The author drops this column because it has a lot of NaN values and so is
# not expected to have much impact on prediction.
titanic_df = titanic_df.drop('Cabin', axis=1)
test_df = test_df.drop('Cabin', axis=1)

# Preview the training frame after the drop.
titanic_df.head()

	Survived	Pclass	Sex	Age	SibSp	Fare	C
0	0	3	male	22	1	7	0
1	1	1	female	38	1	71	1
2	1	3	female	26	0	7	0
3	1	1	female	35	1	53	0
4	0	3	male	35	0	8	0

# Family
# Instead of keeping the two columns SibSp and Parch, use a single binary
# column: 1 if the passenger has any family member on board, 0 otherwise.
# The idea is to see whether having family on board changes the chance of
# survival.
#
# The flag is computed in one vectorised expression and assigned to the frame
# directly; the previous chained form (df['Family'].loc[mask] = value) raised
# SettingWithCopyWarning and could write to a temporary copy.
titanic_df['Family'] = ((titanic_df['SibSp'] + titanic_df['Parch']) > 0).astype(int)
test_df['Family'] = ((test_df['SibSp'] + test_df['Parch']) > 0).astype(int)

# Drop SibSp & Parch now that Family replaces them.
titanic_df = titanic_df.drop(['SibSp', 'Parch'], axis=1)
test_df = test_df.drop(['SibSp', 'Parch'], axis=1)

# Plot
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 5))

# Number of passengers with / without family.
sns.countplot(x='Family', data=titanic_df, order=[1, 0], ax=axis1)

# Average survival by Family.
average_survival_by_Family = titanic_df[['Family', 'Survived']].groupby(['Family'], as_index=False).mean()
sns.barplot(x='Family', y='Survived', data=average_survival_by_Family, order=[1, 0], ax=axis2)

# Tick labels follow order=[1, 0]: 1 -> 'with family', 0 -> 'alone'.
axis1.set_xticklabels(['with family', 'alone'], rotation=0)

/usr/lib/python3/dist-packages/pandas/core/indexing.py:117: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))





[<matplotlib.text.Text at 0x7f554f65a8d0>,
 <matplotlib.text.Text at 0x7f554f6519b0>]

这里写图片描述

# Sex

# As we see, children (age < ~16) aboard seem to have a high chance of survival.
# So, we can classify passengers as males, females, and children.
def get_person(passenger):
    """Classify a passenger as 'child' or by sex.

    passenger -- a two-item sequence ``(age, sex)``.

    Returns the string 'child' when ``age`` is below 16; otherwise returns
    the ``sex`` value unchanged.
    """
    age, sex = passenger
    return 'child' if age < 16 else sex

# Derive the Person category from Age and Sex, one row at a time.
titanic_df['Person'] = titanic_df[['Age', 'Sex']].apply(get_person, axis=1)
test_df['Person'] = test_df[['Age', 'Sex']].apply(get_person, axis=1)

# Sex is no longer needed now that Person exists.
titanic_df = titanic_df.drop(['Sex'], axis=1)
test_df = test_df.drop(['Sex'], axis=1)

# One-hot encode Person and keep only the Child and Female indicators.
# NOTE(review): the positional renaming assumes get_dummies yields the columns
# in the order child, female, male and that all three occur - confirm.
dummy_names = ['Child', 'Female', 'Male']

person_dummy = pd.get_dummies(titanic_df['Person'])
person_dummy.columns = dummy_names
person_dummy = person_dummy.drop(['Male'], axis=1)

person_dummy_test = pd.get_dummies(test_df['Person'])
person_dummy_test.columns = dummy_names
person_dummy_test = person_dummy_test.drop(['Male'], axis=1)

titanic_df = titanic_df.join(person_dummy)
test_df = test_df.join(person_dummy_test)

# Left: passenger count per Person value. Right: mean of Survived per Person value.
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))
sns.countplot(x='Person', data=titanic_df, ax=axis1)
person_perc = (
    titanic_df[['Person', 'Survived']]
    .groupby(['Person'], as_index=False)
    .mean()
)
sns.barplot(x='Person', y='Survived', data=person_perc, ax=axis2, order=['male', 'female', 'child'])

/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. warnings.warn(self.msg_depr % (key, alt_key))

# Remove the text column Person from both frames; the Child / Female
# indicator columns remain.
titanic_df = titanic_df.drop(['Person'], axis=1)
test_df = test_df.drop(['Person'], axis=1)

# Pclass
# Survival rate for each passenger class, in the order 1, 2, 3.
class_order = [1, 2, 3]
sns.factorplot('Pclass', 'Survived', data=titanic_df, order=class_order, size=5)

/usr/lib/python3/dist-packages/matplotlib/__init__.py:894: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))





<seaborn.axisgrid.FacetGrid at 0x7f554f7d8c18>

这里写图片描述

# One-hot encode Pclass for the training frame and keep only Class1 / Class2.
pclass_dummies_titanic = pd.get_dummies(titanic_df['Pclass'])
pclass_dummies_titanic.columns = ['Class1', 'Class2', 'Class3']
pclass_dummies_titanic.drop(['Class3'], axis=1, inplace=True)

# Same encoding for the test frame.
pclass_dummies_test = pd.get_dummies(test_df['Pclass'])
pclass_dummies_test.columns = ['Class1', 'Class2', 'Class3']
pclass_dummies_test.drop(['Class3'], axis=1, inplace=True)

# The original numeric column is replaced by the indicator columns.
titanic_df.drop(['Pclass'], axis=1, inplace=True)
test_df.drop(['Pclass'], axis=1, inplace=True)

# DataFrame.join returns a new frame and leaves the caller untouched, so the
# result must be assigned back. Without the assignment the Class1 / Class2
# columns are discarded and, with Pclass already dropped, the models below
# would be trained without any passenger-class information.
titanic_df = titanic_df.join(pclass_dummies_titanic)
test_df = test_df.join(pclass_dummies_test)

# Preview the final test frame.
test_df

	PassengerId	Age	Fare	C	Q	Family	Child	Female	Class1	Class2
0	892	34	7	0	1	0	0	0	0	0
1	893	47	7	0	0	1	0	1	0	0
2	894	62	9	0	1	0	0	0	0	1
3	895	27	8	0	0	0	0	0	0	0
4	896	22	12	0	0	1	0	1	0	0
5	897	14	9	0	0	0	1	0	0	0
6	898	30	7	0	1	0	0	1	0	0
7	899	26	29	0	0	1	0	0	0	1
8	900	18	7	1	0	0	0	1	0	0
9	901	21	24	0	0	1	0	0	0	0
10	902	30	7	0	0	0	0	0	0	0
11	903	46	26	0	0	0	0	0	1	0
12	904	23	82	0	0	1	0	1	1	0
13	905	63	26	0	0	1	0	0	0	1
14	906	47	61	0	0	1	0	1	1	0
15	907	24	27	1	0	1	0	1	0	1
16	908	35	12	0	1	0	0	0	0	1
17	909	21	7	1	0	0	0	0	0	0
18	910	27	7	0	0	1	0	1	0	0
19	911	45	7	1	0	0	0	1	0	0
20	912	55	59	1	0	1	0	0	1	0
21	913	9	3	0	0	1	1	0	0	0
22	914	21	31	0	0	0	0	1	1	0
23	915	21	61	1	0	1	0	0	1	0
24	916	48	262	1	0	1	0	1	1	0
25	917	50	14	0	0	1	0	0	0	0
26	918	22	61	1	0	1	0	1	1	0
27	919	22	7	1	0	0	0	0	0	0
28	920	41	30	0	0	0	0	0	1	0
29	921	39	21	1	0	1	0	0	0	0
…	…	…	…	…	…	…	…	…	…	…
388	1280	21	7	0	1	0	0	0	0	0
389	1281	6	21	0	0	1	1	0	0	0
390	1282	23	93	0	0	0	0	0	1	0
391	1283	51	39	0	0	1	0	1	1	0
392	1284	13	20	0	0	1	1	0	0	0
393	1285	47	10	0	0	0	0	0	0	1
394	1286	29	22	0	0	1	0	0	0	0
395	1287	18	60	0	0	1	0	1	1	0
396	1288	24	7	0	1	0	0	0	0	0
397	1289	48	79	1	0	1	0	1	1	0
398	1290	22	7	0	0	0	0	0	0	0
399	1291	31	7	0	1	0	0	0	0	0
400	1292	30	164	0	0	0	0	1	1	0
401	1293	38	21	0	0	1	0	0	0	1
402	1294	22	59	1	0	1	0	1	1	0
403	1295	17	47	0	0	0	0	0	1	0
404	1296	43	27	1	0	1	0	0	1	0
405	1297	20	13	1	0	0	0	0	0	1
406	1298	23	10	0	0	1	0	0	0	1
407	1299	50	211	1	0	1	0	0	1	0
408	1300	39	7	0	1	0	0	1	0	0
409	1301	3	13	0	0	1	1	0	0	0
410	1302	38	7	0	1	0	0	1	0	0
411	1303	37	90	0	1	1	0	1	1	0
412	1304	28	7	0	0	0	0	1	0	0
413	1305	25	8	0	0	0	0	0	0	0
414	1306	39	108	1	0	0	0	1	1	0
415	1307	38	7	0	0	0	0	0	0	0
416	1308	39	8	0	0	0	0	0	0	0
417	1309	27	22	1	0	1	0	0	0	0

418 rows × 10 columns

# Training target and features, plus the test features (PassengerId is an
# identifier, not a feature).
y_train = titanic_df['Survived']
x_train = titanic_df.drop(['Survived'], axis=1)
x_test = test_df.drop('PassengerId', axis=1)

# Logistic regression: fit on the training data, predict the test set and
# report the accuracy measured on the training data itself.
logre = LogisticRegression().fit(x_train, y_train)
y_pred = logre.predict(x_test)
logre.score(x_train, y_train)

0.77104377104377109

# Support Vector Machines
# Fit an SVC with default settings and report the accuracy on the training
# data. The test predictions are stored under the capitalised name Y_pred,
# which is distinct from the lower-case y_pred used elsewhere.
svc = SVC().fit(x_train, y_train)
Y_pred = svc.predict(x_test)
svc.score(x_train, y_train)

0.88327721661054992

# Random Forest
# Fit a forest of 100 trees, predict the test set into y_pred and report the
# accuracy on the training data.
random_forest = RandomForestClassifier(n_estimators=100).fit(x_train, y_train)
y_pred = random_forest.predict(x_test)
random_forest.score(x_train, y_train)

0.96520763187429859

# Pair each test PassengerId with its prediction in y_pred and write the
# result to titanic.csv without the index column.
submission_columns = {
    "PassengerId": test_df["PassengerId"],
    "Survived": y_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('titanic.csv', index=False)

hebastast

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
titanic prediction

# Imports# pandasimport pandas as pdfrom pandas import Series,DataFrame# numpy, matplotlib, seabornimport numpy as npimport matplotlib.pyplot as pltimport seaborn as snssns.set_style('whitegrid')
复制链接

扫一扫