Titanic-乘客获救预测1

最新推荐文章于 2022-05-17 07:08:56 发布

monkey_susu

最新推荐文章于 2022-05-17 07:08:56 发布

阅读量3.6k

点赞数 2

文章标签：机器学习 python

本文链接：https://blog.csdn.net/sinat_37935727/article/details/104864297

版权

代码中数据集：https://github.com/jsusu/Titanic_passenger-survival-prediction/tree/master/titanic_data

# Titanic乘客生存预测1

#数据分析库
import pandas as pd
#科学计算库
import numpy as np 
from pandas import Series,DataFrame

# 1. Load the data samples
# Both CSV files are read from the local ./titanic_data directory; the
# training file is read first, then the test file.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./titanic_data/titanic_train.csv",
        "./titanic_data/titanic_test.csv",
    )
)
# 2. Data processing
# Preview the first 10 rows of the training set.
data_train.head(10)

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
5	6	0	3	Moran, Mr. James	male	NaN	0	0	330877	8.4583	NaN	Q
6	7	0	1	McCarthy, Mr. Timothy J	male	54.0	0	0	17463	51.8625	E46	S
7	8	0	3	Palsson, Master. Gosta Leonard	male	2.0	3	1	349909	21.0750	NaN	S
8	9	1	3	Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)	female	27.0	0	2	347742	11.1333	NaN	S
9	10	1	2	Nasser, Mrs. Nicholas (Adele Achem)	female	14.0	1	0	237736	30.0708	NaN	C

# Preview the first 10 rows of the test set.
data_test.head(10)

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	892	3	Kelly, Mr. James	male	34.5	0	0	330911	7.8292	NaN	Q
1	893	3	Wilkes, Mrs. James (Ellen Needs)	female	47.0	1	0	363272	7.0000	NaN	S
2	894	2	Myles, Mr. Thomas Francis	male	62.0	0	0	240276	9.6875	NaN	Q
3	895	3	Wirz, Mr. Albert	male	27.0	0	0	315154	8.6625	NaN	S
4	896	3	Hirvonen, Mrs. Alexander (Helga E Lindqvist)	female	22.0	1	1	3101298	12.2875	NaN	S
5	897	3	Svensson, Mr. Johan Cervin	male	14.0	0	0	7538	9.2250	NaN	S
6	898	3	Connolly, Miss. Kate	female	30.0	0	0	330972	7.6292	NaN	Q
7	899	2	Caldwell, Mr. Albert Francis	male	26.0	1	1	248738	29.0000	NaN	S
8	900	3	Abrahim, Mrs. Joseph (Sophie Halaut Easu)	female	18.0	0	0	2657	7.2292	NaN	C
9	901	3	Davies, Mr. John Samuel	male	21.0	2	0	A/4 48871	24.1500	NaN	S

# Summarise column dtypes and non-null counts of the training set.
# The summary shows that Age, Cabin and Embarked contain missing values:
#   Age      has 714 non-null rows, i.e. 177 of 891 are missing;
#   Cabin    has 204 non-null rows, i.e. 687 are missing;
#   Embarked has 889 non-null rows, i.e. 2 are missing;
# every other column is complete with 891 rows.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

# Descriptive statistics of the numeric training columns.
# Reading the "mean" row: about 38.38% of passengers survived (Survived mean
# 0.383838); the Pclass mean of 2.31 (median 3) indicates more passengers in
# 2nd/3rd class than in 1st class; the average age is about 29.7 years
# (rows without a recorded age are skipped when computing it).
data_train.describe()

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

# 3. Feature selection

# 3.1 Handling missing values
# Cabin has so many missing values that imputing them directly would
# introduce a large error, so Cabin is not used as a feature for now.
# Age matters for predicting survival, so its gaps are filled with the
# median of the recorded ages.
# PassengerId is just a running sequence number, unrelated to survival,
# so it is not used as a feature.

# Fill the missing Age values with the median of the Age column.
age_median = data_train["Age"].median()
data_train["Age"] = data_train["Age"].fillna(age_median)
data_train.describe()

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	891.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.361582	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	13.019697	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	22.000000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	35.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

# 4. 线性回归算法

#线性回归
from sklearn.linear_model import LinearRegression   
#训练集交叉验证，得到平均值
#from sklearn.cross_validation import KFold 
from sklearn.model_selection import KFold
 
# Start with the simple numeric columns that can be used as model inputs as-is.
predictors = ["Pclass", "Age", "SibSp", "Parch", "Fare"]

# Initialise the linear regression model.
alg = LinearRegression()
# Split the samples into 3 folds for 3-fold cross-validation.
# random_state is deliberately not passed: it has no effect when
# shuffle=False, and scikit-learn raises an error for that combination
# from version 0.24 on. The resulting folds are unchanged.
kf = KFold(n_splits=3, shuffle=False)

predictions = []
for train, test in kf.split(data_train):
    # Predictors used to train the model; only rows of the training folds.
    train_predictors = data_train[predictors].iloc[train, :]
    # Target values for the same training rows.
    train_target = data_train["Survived"].iloc[train]
    # Fit the model on the training folds.
    alg.fit(train_predictors, train_target)
    # Predict the held-out fold and collect the per-fold arrays in fold order.
    test_predictions = alg.predict(data_train[predictors].iloc[test, :])
    predictions.append(test_predictions)

/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/model_selection/_split.py:296: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
  FutureWarning

import numpy as np
 
# The per-fold predictions are separate numpy arrays; join them into one.
# They only have a single axis, so they are concatenated along axis 0.
predictions = np.concatenate(predictions, axis=0)

# Map the continuous outputs onto the only two possible outcomes (1 and 0).
# Both masks are taken from the raw values before anything is overwritten.
above_threshold = predictions > .5
at_or_below_threshold = predictions <= .5
predictions[above_threshold] = 1
predictions[at_or_below_threshold] = 0

# Fraction of rows whose thresholded prediction equals the true label.
matches = predictions == data_train["Survived"]
accuracy = sum(matches) / len(predictions)
print ("准确率为: ", accuracy)

准确率为:  0.7037037037037037

# 5.逻辑回归算法
from sklearn import model_selection
#逻辑回归
from sklearn.linear_model import LogisticRegression   

# Initialise the logistic regression model.
LogRegAlg = LogisticRegression(random_state=1)
features, target = data_train[predictors], data_train["Survived"]
re = LogRegAlg.fit(features, target)

# Use scikit-learn's cross-validation helper to obtain one accuracy score
# per fold (3 folds).
scores = model_selection.cross_val_score(LogRegAlg, features, target, cv=3)

# Report the mean of the fold scores as the final accuracy.
print("准确率为: ",scores.mean())

准确率为:  0.7003367003367004

# 5.1 Add the Sex and Embarked columns as features and see how they affect
# the prediction.
# Both columns hold strings, so they have to be converted to numbers first.
data_train.head()

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

# Encode Sex numerically: male -> 0, female -> 1.
# Series.map builds a new integer column in one pass instead of writing
# ints into a string column through repeated boolean-mask assignments,
# which leaves a mixed object column and may be rejected by newer pandas
# string dtypes. Any unexpected label becomes NaN, so astype(int) fails
# loudly instead of silently keeping a string in the feature matrix.
data_train["Sex"] = data_train["Sex"].map({"male": 0, "female": 1}).astype(int)

# Fill the missing Embarked values with 'S' (the most frequent port),
# then encode the ports: S -> 0, C -> 1, Q -> 2.
data_train["Embarked"] = (
    data_train["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2}).astype(int)
)

# Add the two features Sex and Embarked and run logistic regression again.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# max_iter is raised from its default because lbfgs otherwise stops at the
# iteration limit on these unscaled features (ConvergenceWarning), which
# means the reported scores come from a model that has not converged.
LogRegAlg = LogisticRegression(random_state=1, max_iter=1000)
# Fit on the full training set; this fitted model is the one used later to
# predict the test set. (The return value is the estimator itself, so it is
# not bound to a separate name.)
LogRegAlg.fit(data_train[predictors], data_train["Survived"])
# Compute the accuracy score for each of the 3 cross-validation folds.
scores = model_selection.cross_val_score(LogRegAlg, data_train[predictors], data_train["Survived"], cv=3)
# Take the mean of the scores (there is one per fold).
print("准确率为: ",scores.mean())

准确率为:  0.7957351290684623


/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)

# Adding the two features raised the cross-validated accuracy of logistic
# regression from about 0.70 to about 0.796 in the recorded run, which shows
# that good features improve the model's predictive power.
# Next, look at the descriptive statistics of the test set.
data_test.describe()

	PassengerId	Pclass	Age	SibSp	Parch	Fare
count	418.000000	418.000000	332.000000	418.000000	418.000000	417.000000
mean	1100.500000	2.265550	30.272590	0.447368	0.392344	35.627188
std	120.810458	0.841838	14.181209	0.896760	0.981429	55.907576
min	892.000000	1.000000	0.170000	0.000000	0.000000	0.000000
25%	996.250000	1.000000	21.000000	0.000000	0.000000	7.895800
50%	1100.500000	3.000000	27.000000	0.000000	0.000000	14.454200
75%	1204.750000	3.000000	39.000000	1.000000	0.000000	31.500000
max	1309.000000	3.000000	76.000000	8.000000	9.000000	512.329200

# Preview the first rows of the test set before preprocessing it.
data_test.head()

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	892	3	Kelly, Mr. James	male	34.5	0	0	330911	7.8292	NaN	Q
1	893	3	Wilkes, Mrs. James (Ellen Needs)	female	47.0	1	0	363272	7.0000	NaN	S
2	894	2	Myles, Mr. Thomas Francis	male	62.0	0	0	240276	9.6875	NaN	Q
3	895	3	Wirz, Mr. Albert	male	27.0	0	0	315154	8.6625	NaN	S
4	896	3	Hirvonen, Mrs. Alexander (Helga E Lindqvist)	female	22.0	1	1	3101298	12.2875	NaN	S

# New: preprocess the test set and predict the outcome for it.
# Fill the missing Age values with the median age of the test set.
data_test["Age"] = data_test["Age"].fillna(data_test["Age"].median())
# Fill the missing Fare value with the median fare. The column maximum
# (512.33) used before is an extreme outlier compared with the median
# (14.45) and would give the affected passenger a wildly atypical fare.
data_test["Fare"] = data_test["Fare"].fillna(data_test["Fare"].median())

# Encode Sex numerically: male -> 0, female -> 1.
# map() turns unexpected labels into NaN, so astype(int) fails loudly
# rather than leaving strings in the feature matrix.
data_test["Sex"] = data_test["Sex"].map({"male": 0, "female": 1}).astype(int)
# Fill missing Embarked values with 'S' (the most frequent port), then
# encode the ports: S -> 0, C -> 1, Q -> 2.
data_test["Embarked"] = (
    data_test["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2}).astype(int)
)

# Same columns, in the same order, as the features the model was fitted on.
test_features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

test_predictors = data_test[test_features]
# Store the logistic regression predictions in a new Survived column
# (no placeholder value is needed beforehand; the assignment creates it).
data_test["Survived"] = LogRegAlg.predict(test_predictors)

data_test.head(10)

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked	Survived
0	892	3	Kelly, Mr. James	0	34.5	0	0	330911	7.8292	NaN	2	0
1	893	3	Wilkes, Mrs. James (Ellen Needs)	1	47.0	1	0	363272	7.0000	NaN	0	0
2	894	2	Myles, Mr. Thomas Francis	0	62.0	0	0	240276	9.6875	NaN	2	0
3	895	3	Wirz, Mr. Albert	0	27.0	0	0	315154	8.6625	NaN	0	0
4	896	3	Hirvonen, Mrs. Alexander (Helga E Lindqvist)	1	22.0	1	1	3101298	12.2875	NaN	0	1
5	897	3	Svensson, Mr. Johan Cervin	0	14.0	0	0	7538	9.2250	NaN	0	0
6	898	3	Connolly, Miss. Kate	1	30.0	0	0	330972	7.6292	NaN	2	1
7	899	2	Caldwell, Mr. Albert Francis	0	26.0	1	1	248738	29.0000	NaN	0	0
8	900	3	Abrahim, Mrs. Joseph (Sophie Halaut Easu)	1	18.0	0	0	2657	7.2292	NaN	1	1
9	901	3	Davies, Mr. John Samuel	0	21.0	2	0	A/4 48871	24.1500	NaN	0	0

# 6.使用随机森林算法
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
 
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Random forest with 10 decision trees. Stopping conditions: a node needs at
# least 2 samples to be split, and every leaf must keep at least 1 sample.
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)

# 3-fold cross-validation over unshuffled folds.
# random_state is deliberately not passed: it has no effect when
# shuffle=False, and scikit-learn raises an error for that combination
# from version 0.24 on. The resulting folds are unchanged.
kf = model_selection.KFold(n_splits=3, shuffle=False)

# Compute the accuracy score for each cross-validation fold.
scores = model_selection.cross_val_score(alg, data_train[predictors], data_train["Survived"], cv=kf)
print(scores)
# Take the mean of the scores (there is one per fold).
print(scores.mean())

[0.75420875 0.8013468  0.8013468 ]
0.7856341189674523


/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/model_selection/_split.py:296: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
  FutureWarning

# Increase the forest to 30 decision trees and switch to 10-fold
# cross-validation.
# Stopping conditions are unchanged: a node needs at least 2 samples to be
# split, and every leaf must keep at least 1 sample.
alg = RandomForestClassifier(random_state=1, n_estimators=30, min_samples_split=2, min_samples_leaf=1)

# 10-fold cross-validation over unshuffled folds.
# random_state is deliberately not passed: it has no effect when
# shuffle=False, and scikit-learn raises an error for that combination
# from version 0.24 on. The resulting folds are unchanged.
kf = model_selection.KFold(n_splits=10, shuffle=False)

# Compute the accuracy score for each cross-validation fold.
scores = model_selection.cross_val_score(alg, data_train[predictors], data_train["Survived"], cv=kf)

print(scores)
# Take the mean of the scores (there is one per fold).
print(scores.mean())

/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/model_selection/_split.py:296: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
  FutureWarning


[0.74444444 0.80898876 0.78651685 0.82022472 0.85393258 0.85393258
 0.7752809  0.7752809  0.84269663 0.85393258]
0.8115230961298376

monkey_susu

关注

2
点赞
踩
5

收藏

觉得还不错? 一键收藏
7
评论
Titanic-乘客获救预测1

代码中数据集：https://github.com/jsusu/Titanic_passenger-survival-prediction/tree/master/titanic_data# Titanic乘客生存预测1#数据分析库import pandas as pd#科学计算库import numpy as np from pandas import Series,DataFra...
复制链接

扫一扫