泰坦尼克号沉船练习(Titanic Practice)

最新推荐文章于 2022-04-27 21:03:40 发布

木公鼠跪鱼

最新推荐文章于 2022-04-27 21:03:40 发布

阅读量1.5k

点赞数 1

分类专栏：实践练习　文章标签：机器学习

本文链接：https://blog.csdn.net/weixin_39337018/article/details/82013112

版权

实践练习专栏收录该内容

1 篇文章 0 订阅

订阅专栏

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import Imputer
%matplotlib inline

# Load the Titanic training set from the local CSV file.
train_csv_path = r"C:\Users\maokong\Desktop\data\Titanic-machine-learning-practice\train.csv"
train_df = pd.read_csv(train_csv_path)

# Print a summary of the frame: columns, non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

# 'Cabin' has only 204 non-null values, too small a share to be useful, so the
# column is removed.
# After that, 'Age' and 'Embarked' still have a few missing values. They could be
# filled with the mean or median; here every row with a missing value is dropped.
train_df = train_df.drop(columns=["Cabin"]).dropna()

# The first preprocessing step is done; next, plot each feature against survival.
# Start with the overall survival counts. Titles are written in English because
# matplotlib does not handle Chinese text well.
survival_counts = train_df["Survived"].value_counts()
survival_counts.plot.bar()
plt.title("distribution of survival,(1 = survived, 0 = did not survived)")

Text(0.5,1,'distribution of survival,(1 = survived, 0 = did not survived)')

png

# Next, the number of passengers in each ticket class.
class_counts = train_df["Pclass"].value_counts()
class_counts.plot.bar()
plt.title("Number of passengers per class")

Text(0.5,1,'Number of passengers per class')

png

# Then the number of passengers who boarded at each port of embarkation.
port_counts = train_df["Embarked"].value_counts()
port_counts.plot.bar()
plt.title("Passengers per embarked location")

Text(0.5,1,'Passengers per embarked location')

png

# Number of passengers grouped by how many siblings or spouses they had aboard.
sibsp_counts = train_df["SibSp"].value_counts()
sibsp_counts.plot.bar()
plt.title("Passengers with siblings or spouse")

Text(0.5,1,'Passengers with siblings or spouse')

png

# Survival against age. The low alpha makes overlapping points show up as darker
# regions.
plt.scatter(x=train_df["Age"], y=train_df["Survived"], alpha=0.1)
plt.title("Age distribution v/s survived")

Text(0.5,1,'Age distribution v/s survived')

png

# Now combine features: how does survival relate to sex?
# Male passengers first. sort_index() fixes the bar order to 0 then 1 instead of
# letting it depend on which count is larger, matching the female and class charts.
male_survival = train_df.loc[train_df["Sex"] == 'male', "Survived"]
male_survival.value_counts().sort_index().plot(kind='bar')
plt.title("Analyzing male passengers: survived and not survived")

Text(0.5,1,'Analyzing male passengers: survived and not survived')

png

# The same breakdown for female passengers.
female_survival = train_df.loc[train_df["Sex"] == 'female', "Survived"]
female_survival.value_counts().sort_index().plot.bar(color='pink')
plt.title("Analyzing female passengers: survived and not survived")

Text(0.5,1,'Analyzing female passengers: survived and not survived')

png

# Survival is clearly related to sex.
# Next, compare survival between ticket classes.
# 3rd class is roughly twice the size of either 1st or 2nd class, so 1st and 2nd
# are pooled together as the wealthier group.
high_class_survival = train_df.loc[train_df["Pclass"] != 3, "Survived"]
high_class_survival.value_counts().sort_index().plot.bar(color='green')
plt.title("Analyzing high class passengers: not survived and survived")

Text(0.5,1,'Analyzing high class passengers: not survived and survived')

png

# Now the same chart for the poorer, 3rd-class passengers.
low_class_survival = train_df.loc[train_df["Pclass"] == 3, "Survived"]
low_class_survival.value_counts().sort_index().plot.bar(color='green')
plt.title("Analyzing low class passengers: not survived and survived")

Text(0.5,1,'Analyzing low class passengers: not survived and survived')

png

# With the rough distributions in hand, compare survival for each combination of
# ticket class and sex in four side-by-side panels that share one y-axis.
fig = plt.figure(figsize=(18, 4), dpi=800)

# Build each boolean mask once on the full frame and combine them with `&`,
# instead of chaining two [] selections where the second mask no longer lines up
# with the already-filtered Series.
is_female = train_df.Sex == 'female'
is_male = train_df.Sex == 'male'
is_high_class = train_df.Pclass != 3
is_low_class = train_df.Pclass == 3

# Each plot call receives its target axes explicitly, so the result does not
# depend on which axes matplotlib currently treats as active.
ax1 = fig.add_subplot(141)
hightclass_female = train_df.Survived[is_female & is_high_class].value_counts().sort_index().plot(
    kind='bar', ax=ax1, label="hightclass_female", color='pink')
ax1.legend(loc='best')
ax1.set_title("Sex v/s Class")

ax2 = fig.add_subplot(142, sharey=ax1)
lowclass_female = train_df.Survived[is_female & is_low_class].value_counts().sort_index().plot(
    kind='bar', ax=ax2, alpha=0.5, label="lowclass_female", color='pink')
ax2.legend(loc='best')

ax3 = fig.add_subplot(143, sharey=ax1)
hightclass_male = train_df.Survived[is_male & is_high_class].value_counts().sort_index().plot(
    kind='bar', ax=ax3, label="hightclass_male", color='blue')
ax3.legend(loc='best')

ax4 = fig.add_subplot(144, sharey=ax1)
lowclass_male = train_df.Survived[is_male & is_low_class].value_counts().sort_index().plot(
    kind='bar', ax=ax4, alpha=0.5, label="lowclass_male", color='blue')
ax4.legend(loc='best')

<matplotlib.legend.Legend at 0x1c33c518978>

png

# Copy the cleaned frame; the remaining preprocessing is done on the copy.
train = train_df.copy()

# Encode sex and port of embarkation as integers. Series.map converts the whole
# column in one pass and produces a numeric column; a label that is not in the
# mapping becomes NaN rather than silently remaining a string.
train["Sex"] = train["Sex"].map({"female": 1, "male": 0})
train["Embarked"] = train["Embarked"].map({"S": 0, "C": 1, "Q": 2})

# Columns used as model features.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked"]

# Start modelling with the simplest option, logistic regression.
from sklearn.linear_model import LogisticRegression

# Create the model with its default settings.
logit = LogisticRegression()

# Fit on the cleaned training rows.
logit.fit(X=train[predictors], y=train["Survived"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Predict on the same rows the model was fitted on.
train_predictions = logit.predict(train[predictors])

# Training accuracy: the fraction of predictions equal to the true label.
print((train_predictions == train["Survived"]).mean())

0.797752808988764

# Reduce the feature set to the ones that looked most related to survival in the
# plots: ticket class, sex and age.
predictors_new = ["Pclass", "Sex", "Age"]
logit_new = LogisticRegression()
logit_new.fit(X=train[predictors_new], y=train["Survived"])
train_predictions_new = logit_new.predict(train[predictors_new])
print((train_predictions_new == train["Survived"]).mean())

0.7921348314606742

# Logistic regression did not improve, so try a support vector machine.
from sklearn.svm import SVC

# `gamma` is a coefficient of the rbf/poly/sigmoid kernels only and has no effect
# with kernel="linear", so it is not passed here.
classifier_liner = SVC(kernel="linear")
classifier_liner.fit(train[predictors], train.Survived)
classifier_liner_predictions = classifier_liner.predict(train[predictors])

# Training accuracy: the fraction of predictions equal to the true label.
print((classifier_liner_predictions == train.Survived).mean())

0.7794943820224719

# Accuracy is still low; try a different kernel.
classifier_rbf = SVC(gamma=3, kernel="rbf")
classifier_rbf.fit(X=train[predictors], y=train["Survived"])
classifier_rbf_predictions = classifier_rbf.predict(train[predictors])
print((classifier_rbf_predictions == train["Survived"]).mean())

0.9452247191011236

# The rbf kernel scores much higher on the training rows, so that model is chosen
# and validated next. The check is a single hold-out split: fit on the first 80%
# of the rows, in their existing order, and score on the remaining 20%.
eighty_precentile = int(.8 * train.shape[0])

train_set = train.iloc[:eighty_precentile]
cv_set = train.iloc[eighty_precentile:]

classifier_rbf_cv = SVC(kernel="rbf", gamma=3)
classifier_rbf_cv.fit(X=train_set[predictors], y=train_set["Survived"])

cv_predictions = classifier_rbf_cv.predict(cv_set[predictors])
print((cv_predictions == cv_set["Survived"]).mean())

0.6013986013986014

# The held-out score is poor, which indicates heavy overfitting.
# Switch models again, this time to a random forest (RandomForestClassifier).
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X=train[predictors], y=train["Survived"])

# Accuracy on the rows the forest was fitted on.
training_accuracy = rfc.score(train[predictors], train["Survived"])
print(training_accuracy)

0.9339887640449438

# That result looks good; finally check the forest on the held-out rows as well.
rfc_cv = RandomForestClassifier()
rfc_cv.fit(X=train_set[predictors], y=train_set["Survived"])

holdout_accuracy = rfc_cv.score(cv_set[predictors], cv_set["Survived"])
print(holdout_accuracy)

0.8041958041958042

# The forest reaches over 80% on the held-out rows, slightly better than the
# earlier models, so it is the model used. Read the test file to predict on.
titanic_test = pd.read_csv(r"C:\Users\maokong\Desktop\data\Titanic-machine-learning-practice\test.csv")

# Preprocess the test data before predicting.
titanic_test = titanic_test.drop(labels=['Cabin', 'Fare'], axis=1)

# Rows are kept here (every PassengerId goes into the submission), so missing
# ages are filled with the column median instead of being dropped.
titanic_test["Age"] = titanic_test["Age"].fillna(titanic_test["Age"].median())

# Use the same integer codes as the training frame. Series.map converts each
# column in one pass and yields a numeric column; a label outside the mapping
# becomes NaN rather than silently remaining a string.
titanic_test["Sex"] = titanic_test["Sex"].map({"male": 0, "female": 1})
titanic_test["Embarked"] = titanic_test["Embarked"].map({"S": 0, "C": 1, "Q": 2})

# Predict with `rfc`, the forest fitted on every cleaned training row, rather than
# `rfc_cv`, which was fitted on only the first 80% of the rows to measure
# held-out accuracy.
test_predictions = rfc.predict(titanic_test[predictors])

# Arrange the predictions in the two-column layout expected for upload.
submission = pd.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": test_predictions
    })

# Write the submission file without the index column.
submission.to_csv("kaggle.csv", index=False)

木公鼠跪鱼

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
泰坦尼克号沉船练习(Titanic Practice)

import pandas as pdimport matplotlib.pyplot as pltimport numpy as npfrom sklearn.preprocessing import Imputer%matplotlib inline#导入数据集train_df = pd.read_csv(r"C:\Users\maokong\Desktop\data\Tit...
复制链接

扫一扫