泰坦尼克号沉船练习(Titanic Practice)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import Imputer
%matplotlib inline
# Load the Titanic training set from a hard-coded local Windows path.
train_df = pd.read_csv(r"C:\Users\maokong\Desktop\data\Titanic-machine-learning-practice\train.csv")
# Print the column names, non-null counts and dtypes of the frame.
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
# 'Cabin' has only 204 non-null values, too few to be useful, so the column
# is dropped. 'Age' and 'Embarked' still contain a few missing values after
# that; they could be filled with the mean/median, but here the incomplete
# rows are simply removed.
train_df = train_df.drop(columns=["Cabin"]).dropna()
# The first preprocessing step is done; next, plot how each feature relates
# to survival. Start with the overall survival counts. Titles are in English
# because matplotlib does not handle Chinese text well by default.
train_df["Survived"].value_counts().plot.bar()
plt.title("distribution of survival,(1 = survived, 0 = did not survived)")
Text(0.5,1,'distribution of survival,(1 = survived, 0 = did not survived)')

png

# Number of passengers in each ticket class.
# sort_index() puts the bars in class order; value_counts() alone orders
# them by descending frequency, which scrambles the x-axis.
train_df.Pclass.value_counts().sort_index().plot(kind="bar")
plt.title("Number of passengers per class")
Text(0.5,1,'Number of passengers per class')

png

# Number of passengers that boarded at each embarkation port.
train_df["Embarked"].value_counts().plot.bar()
plt.title("Passengers per embarked location")
Text(0.5,1,'Passengers per embarked location')

png

# Number of passengers having n siblings or spouses aboard.
# sort_index() keeps the numeric x-axis in ascending order; value_counts()
# alone orders the bars by descending frequency.
train_df.SibSp.value_counts().sort_index().plot(kind='bar')
plt.title("Passengers with siblings or spouse")
Text(0.5,1,'Passengers with siblings or spouse')

png

# Relationship between age and survival. The low alpha makes overlapping
# points show up as darker regions.
plt.scatter(x=train_df["Age"], y=train_df["Survived"], alpha=0.1)
plt.title("Age distribution v/s survived")
Text(0.5,1,'Age distribution v/s survived')

png

# Now combine features: how does sex relate to survival?
# Survival counts for male passengers first.
# sort_index() fixes the bar order to 0 (did not survive), 1 (survived),
# matching the female and per-class plots instead of depending on which
# outcome happens to be more frequent.
train_df.Survived[train_df.Sex == 'male'].value_counts().sort_index().plot(kind = 'bar')
plt.title("Analyzing male passengers: survived and not survived")
Text(0.5,1,'Analyzing male passengers: survived and not survived')

png

# Survival counts for female passengers, bars ordered 0 then 1.
train_df.loc[train_df["Sex"] == "female", "Survived"].value_counts().sort_index().plot.bar(color="pink")
plt.title("Analyzing female passengers: survived and not survived")
Text(0.5,1,'Analyzing female passengers: survived and not survived')

png

# Survival is clearly related to sex.
# Next, compare survival between ticket classes. Third class is by far the
# largest group, so classes 1 and 2 are pooled into a single "high class".
train_df.loc[train_df["Pclass"] != 3, "Survived"].value_counts().sort_index().plot.bar(color="green")
plt.title("Analyzing high class passengers: not survived and survived")
Text(0.5,1,'Analyzing high class passengers: not survived and survived')

png

# Survival counts for the third-class ("low class") passengers.
train_df.loc[train_df["Pclass"] == 3, "Survived"].value_counts().sort_index().plot.bar(color="green")
plt.title("Analyzing low class passengers: not survived and survived")
Text(0.5,1,'Analyzing low class passengers: not survived and survived')

png

# With the overall distributions in view, break survival down by both
# ticket class and sex: four bar charts side by side sharing one y-axis.
fig = plt.figure(figsize=(18,4), dpi=800)

# Build each boolean mask once and combine them with '&', so every selection
# is a single filter on train_df instead of a chained [mask][mask] lookup
# that relies on pandas re-aligning the second mask to the filtered index.
is_female = train_df.Sex == 'female'
is_male = train_df.Sex == 'male'
is_high_class = train_df.Pclass != 3
is_low_class = train_df.Pclass == 3

# Each plot is drawn on its own axes explicitly (ax=...) rather than on
# whichever axes happens to be current.
ax1=fig.add_subplot(141)
hightclass_female = train_df.Survived[is_female & is_high_class].value_counts().sort_index().plot(kind = 'bar', ax=ax1, label ="hightclass_female",color='pink')
ax1.legend(loc='best')
ax1.set_title("Sex v/s Class")

ax2=fig.add_subplot(142, sharey=ax1)
lowclass_female = train_df.Survived[is_female & is_low_class].value_counts().sort_index().plot(kind = 'bar', ax=ax2, alpha = 0.5 ,label ="lowclass_female",color='pink')
ax2.legend(loc='best')

ax3=fig.add_subplot(143, sharey=ax1)
hightclass_male = train_df.Survived[is_male & is_high_class].value_counts().sort_index().plot(kind = 'bar', ax=ax3, label ="hightclass_male",color='blue')
ax3.legend(loc='best')

ax4=fig.add_subplot(144, sharey=ax1)
lowclass_male = train_df.Survived[is_male & is_low_class].value_counts().sort_index().plot(kind = 'bar', ax=ax4, alpha = 0.5,label ="lowclass_male", color='blue')
ax4.legend(loc='best')
<matplotlib.legend.Legend at 0x1c33c518978>

png

# Work on a copy so the frame used for the plots stays untouched.
train = train_df.copy()
# Encode sex and embarkation port as integers.
# Series.map converts each column in one pass and yields a numeric column,
# whereas assigning value by value through .loc leaves the column with
# dtype object. Any label missing from a mapping becomes NaN.
train["Sex"] = train["Sex"].map({"female": 1, "male": 0})
train["Embarked"] = train["Embarked"].map({"S": 0, "C": 1, "Q": 2})
# Feature columns used for prediction.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked"]
# Start modelling with the simplest candidate: logistic regression.
from sklearn.linear_model import LogisticRegression
# Create the estimator with its default settings ...
logit = LogisticRegression()
# ... and fit it on the selected features against the Survived labels.
logit.fit(X=train[predictors], y=train["Survived"])
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
# Predict on the same rows the model was fitted on.
train_predictions = logit.predict(train[predictors])
# Training accuracy: the fraction of rows whose prediction equals the label.
print((train_predictions == train["Survived"]).mean())
0.797752808988764
# Try a smaller feature set: only the features that looked most related to
# survival in the plots above (ticket class, sex) plus age.
predictors_new = ["Pclass", "Sex", "Age"]
logit_new = LogisticRegression().fit(train[predictors_new], train["Survived"])
train_predictions_new = logit_new.predict(train[predictors_new])
# Training accuracy of the reduced model.
print((train_predictions_new == train["Survived"]).mean())
0.7921348314606742
# Logistic regression did not improve with fewer features, so try an SVM.
from sklearn.svm import SVC
# Linear kernel. No gamma is passed: gamma is the coefficient of the
# rbf/poly/sigmoid kernels and has no effect on a linear one.
classifier_liner = SVC(kernel = "linear")
classifier_liner.fit(train[predictors],train.Survived)
classifier_liner_predictions = classifier_liner.predict(train[predictors])
# Training accuracy of the linear SVM.
print(sum(classifier_liner_predictions == train.Survived)*1.0/train.shape[0])
0.7794943820224719
# Accuracy is still not high; try the RBF kernel instead.
classifier_rbf = SVC(gamma=3, kernel="rbf").fit(train[predictors], train["Survived"])
classifier_rbf_predictions = classifier_rbf.predict(train[predictors])
# Training accuracy of the RBF SVM.
print((classifier_rbf_predictions == train["Survived"]).mean())
0.9452247191011236
# The RBF kernel scores much higher on the training data, so validate it.
# This is a single hold-out split: the first 80% of the rows are used for
# fitting and the remaining 20% for evaluation.
eighty_precentile = int(.8 * train.shape[0])

train_set = train.iloc[:eighty_precentile]
cv_set = train.iloc[eighty_precentile:]

classifier_rbf_cv = SVC(kernel="rbf", gamma=3).fit(train_set[predictors], train_set["Survived"])

cv_predictions = classifier_rbf_cv.predict(cv_set[predictors])
# Accuracy on the held-out rows.
print((cv_predictions == cv_set["Survived"]).mean())
0.6013986013986014
# The hold-out accuracy is poor, which points to heavy overfitting.
# Switch models again: this time a random forest (RandomForestClassifier).
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state makes the fitted forest, and therefore the printed
# score, reproducible from run to run.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(train[predictors], train.Survived)
# Training accuracy of the forest fitted on all rows.
print (rfc.score(train[predictors], train.Survived))
0.9339887640449438
# The training score looks good; check it on the hold-out split as well.
# A fixed random_state makes the hold-out score reproducible.
rfc_cv = RandomForestClassifier(random_state=0)
rfc_cv.fit(train_set[predictors],train_set.Survived)
# Accuracy on the held-out rows.
print (rfc_cv.score(cv_set[predictors], cv_set.Survived))
0.8041958041958042
# Above 80% on the hold-out rows, slightly better than the earlier models,
# so the random forest is the model of choice. Read the test file
# (hard-coded local Windows path) to predict on it.
titanic_test =pd.read_csv(r"C:\Users\maokong\Desktop\data\Titanic-machine-learning-practice\test.csv")
# The test data needs the same preparation as the training data.
# 'Cabin' and 'Fare' are not among the predictors, so they are dropped.
titanic_test = titanic_test.drop(labels = ['Cabin', 'Fare'], axis = 1)
# Rows cannot be dropped here (a prediction is written for every row), so
# missing ages are filled with the median age of the test set.
titanic_test["Age"] = titanic_test["Age"].fillna(titanic_test["Age"].median())
# Integer-encode the categorical columns with the same codes used for the
# training data. Series.map yields a numeric column in one pass instead of
# leaving an object column patched value by value; unmapped labels become NaN.
titanic_test["Sex"] = titanic_test["Sex"].map({"male": 0, "female": 1})
titanic_test["Embarked"] = titanic_test["Embarked"].map({"S": 0, "C": 1, "Q": 2})
# Predict with the forest fitted on ALL training rows (rfc). rfc_cv was
# fitted on only the first 80% of the rows and exists just to estimate
# hold-out accuracy, so it is not the model to ship predictions from.
test_predictions = rfc.predict(titanic_test[predictors])
# Shape the result for upload: one row per passenger with the columns
# PassengerId and Survived, written to CSV without the index column.
submission = titanic_test[["PassengerId"]].assign(Survived=test_predictions)

submission.to_csv("kaggle.csv", index=False)
  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值