4 - Titanic Survival Prediction in Practice

Data preprocessing

import pandas as pd
titanic = pd.read_csv('./data/titanic_train.csv')
titanic.head()
   PassengerId  Survived  Pclass                                                Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1         0       3                             Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C
2            3         1       3                              Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4         1       1        Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4            5         0       3                            Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S
titanic.describe()
       PassengerId    Survived      Pclass         Age       SibSp       Parch        Fare
count   891.000000  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean    446.000000    0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std     257.353842    0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min       1.000000    0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%     223.500000    0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%     446.000000    0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%     668.500000    1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max     891.000000    1.000000    3.000000   80.000000    8.000000    6.000000  512.329200

Filling missing values and mapping string columns to numbers

titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic.describe()
       PassengerId    Survived      Pclass         Age       SibSp       Parch        Fare
count   891.000000  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000
mean    446.000000    0.383838    2.308642   29.361582    0.523008    0.381594   32.204208
std     257.353842    0.486592    0.836071   13.019697    1.102743    0.806057   49.693429
min       1.000000    0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%     223.500000    0.000000    2.000000   22.000000    0.000000    0.000000    7.910400
50%     446.000000    0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%     668.500000    1.000000    3.000000   35.000000    1.000000    0.000000   31.000000
max     891.000000    1.000000    3.000000   80.000000    8.000000    6.000000  512.329200
print(titanic['Sex'].unique())
['male' 'female']
titanic.loc[titanic['Sex']=='male', 'Sex'] = 0
titanic.loc[titanic['Sex']=='female', 'Sex'] = 1
titanic.describe()
       PassengerId    Survived      Pclass         Sex         Age       SibSp       Parch        Fare
count   891.000000  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000
mean    446.000000    0.383838    2.308642    0.352413   29.361582    0.523008    0.381594   32.204208
std     257.353842    0.486592    0.836071    0.477990   13.019697    1.102743    0.806057   49.693429
min       1.000000    0.000000    1.000000    0.000000    0.420000    0.000000    0.000000    0.000000
25%     223.500000    0.000000    2.000000    0.000000   22.000000    0.000000    0.000000    7.910400
50%     446.000000    0.000000    3.000000    0.000000   28.000000    0.000000    0.000000   14.454200
75%     668.500000    1.000000    3.000000    1.000000   35.000000    1.000000    0.000000   31.000000
max     891.000000    1.000000    3.000000    1.000000   80.000000    8.000000    6.000000  512.329200
print(titanic['Embarked'].unique())
titanic['Embarked'] = titanic['Embarked'].fillna('S')
titanic.loc[titanic['Embarked']=='S', 'Embarked'] = 0
titanic.loc[titanic['Embarked']=='C', 'Embarked'] = 1
titanic.loc[titanic['Embarked']=='Q', 'Embarked'] = 2
print(titanic['Embarked'].unique())
['S' 'C' 'Q' nan]
[0 1 2]
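
The same encoding can also be written with Series.map, which replaces the repeated .loc assignments with a single dictionary lookup. This is only an alternative sketch: it assumes the columns still hold the raw string values, and unlike the .loc version it leaves the columns with a numeric dtype.

# alternative encoding via Series.map (assumes Sex/Embarked still contain the raw strings)
titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})
titanic['Embarked'] = titanic['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})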

Implementation with linear regression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = LinearRegression()
fold = KFold(n_splits=3, shuffle=False).split(titanic[predictors])

predictions = []
for train, test in fold:
    train_predictors = (titanic[predictors].iloc[train,:])
    train_target = titanic['Survived'].iloc[train]
    alg.fit(train_predictors, train_target)
    test_predictions = alg.predict(titanic[predictors].iloc[test,:])
    predictions.append(test_predictions)
import numpy as np

predictions = np.concatenate(predictions, axis=0)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
accuracy = len(predictions[predictions == titanic['Survived']]) / len(predictions)
print(len(predictions[predictions == titanic['Survived']]))
print(len(predictions))
print(accuracy)
698
891
0.7833894500561167

Implementation with logistic regression

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
alg = LogisticRegression()
scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
print(scores.mean())
0.7878787878787877


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
(the same warning is printed once per CV fold)
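
The FutureWarning only reflects the upcoming change of scikit-learn's default solver; passing a solver explicitly silences it. A minimal tweak, assuming the liblinear solver that older releases used by default:

# specify the solver to silence the FutureWarning
alg = LogisticRegression(solver='liblinear')
scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
print(scores.mean())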

Using a random forest

titanic_test = pd.read_csv('./data/test.csv')
titanic_test['Age'] = titanic_test['Age'].fillna(titanic['Age'].median())
titanic_test['Fare'] = titanic_test['Fare'].fillna(titanic_test['Fare'].median())

titanic_test.loc[titanic_test['Sex'] == 'male', 'Sex'] = 0 
titanic_test.loc[titanic_test['Sex'] == 'female', 'Sex'] = 1

titanic_test['Embarked'] = titanic_test['Embarked'].fillna('S')
titanic_test.loc[titanic_test['Embarked'] == 'S', 'Embarked'] = 0
titanic_test.loc[titanic_test['Embarked'] == 'C', 'Embarked'] = 1
titanic_test.loc[titanic_test['Embarked'] == 'Q', 'Embarked'] = 2
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
kf = KFold(n_splits=3, shuffle=False).split(titanic[predictors])
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
print(scores.mean())
0.7856341189674523
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)
kf = KFold(n_splits=3, shuffle=False).split(titanic[predictors])
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
print(scores.mean())
0.8159371492704826
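
The preprocessed titanic_test frame is not used again in the steps above, so here is one way to turn the better-scoring forest into a Kaggle-style submission. This is a sketch rather than part of the original walkthrough; the output file name is an arbitrary choice.

# fit the tuned forest on the full training set and predict on the preprocessed test set
alg.fit(titanic[predictors], titanic['Survived'])
test_preds = alg.predict(titanic_test[predictors].astype(float))
submission = pd.DataFrame({'PassengerId': titanic_test['PassengerId'], 'Survived': test_preds})
submission.to_csv('titanic_submission.csv', index=False)  # hypothetical file name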

Further feature engineering

titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch']
# .apply returns a new Series built from the function applied to each name
titanic['NameLength'] = titanic['Name'].apply(lambda x: len(x))
import re

def get_title(name):
    # extract the title (Mr, Mrs, Miss, ...) that follows a space and ends with a period
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ''

titles = titanic['Name'].apply(get_title)
print(pd.value_counts(titles))

title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k,v in title_mapping.items():
    titles[titles == k] = v

print(pd.value_counts(titles))
titanic['Title'] = titles
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Major         2
Mlle          2
Lady          1
Mme           1
Countess      1
Jonkheer      1
Sir           1
Don           1
Ms            1
Capt          1
Name: Name, dtype: int64
1     517
2     183
3     125
4      40
5       7
6       6
7       5
10      3
8       3
9       2
Name: Name, dtype: int64
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize', 'Title', 'NameLength']
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic['Survived'])
# get the raw p-values and convert them into scores
scores = -np.log10(selector.pvalues_)

plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation = 'vertical')
plt.show()

[Figure: bar chart of the -log10(p-value) score for each candidate feature]

Selecting the four best features

predictors = ['Pclass', 'Sex', 'Title', 'NameLength']
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=8, min_samples_leaf=4)
kf = KFold(n_splits=5, shuffle=False).split(titanic[predictors])
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
print(scores.mean())
0.7946331052664617
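
Instead of trying hyperparameter combinations by hand, a grid search over the same parameters is a natural next step. The grid values below are illustrative assumptions, not taken from the original.

# sketch: search n_estimators / min_samples_split / min_samples_leaf with 5-fold CV
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [50, 100, 200],
              'min_samples_split': [4, 8, 16],
              'min_samples_leaf': [2, 4, 8]}
grid = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=5)
grid.fit(titanic[predictors], titanic['Survived'])
print(grid.best_params_, grid.best_score_)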

Using an ensemble of algorithms

from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title",]],
    [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
kf = KFold(n_splits=5, shuffle=False).split(titanic[predictors])
    
predictions = []
for train, test in kf:
    train_target = titanic['Survived'].iloc[train]
    full_test_predictions = []
    for alg, predictors in algorithms:
        alg.fit(titanic[predictors].iloc[train,:], train_target)
        # The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error.
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test,:].astype(float))[:,1]
        full_test_predictions.append(test_predictions)
    
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)
    
predictions = np.concatenate(predictions, axis=0)
accuracy = len(predictions[predictions == titanic["Survived"]]) / len(predictions)
print(accuracy)
0.8226711560044894


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
(the same warning is printed once per CV fold)
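
To apply the ensemble to the Kaggle test file, titanic_test first needs the same engineered columns that were added to the training frame. The sketch below mirrors the training-set steps; mapping any title that is absent from title_mapping to 1 ('Mr') is an assumption made here, not something the original specifies.

# rebuild FamilySize, NameLength and Title on the test frame, mirroring the training steps
titanic_test['FamilySize'] = titanic_test['SibSp'] + titanic_test['Parch']
titanic_test['NameLength'] = titanic_test['Name'].apply(lambda x: len(x))
test_titles = titanic_test['Name'].apply(get_title)
for k, v in title_mapping.items():
    test_titles[test_titles == k] = v
test_titles[~test_titles.isin(list(title_mapping.values()))] = 1  # unmapped titles fall back to 'Mr' (assumption)
titanic_test['Title'] = test_titles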
