# 机器学习项目实战——泰坦尼克号获救预测

### 导入相关库

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

# %matplotlib inline  -- IPython/Jupyter magic; only valid inside a notebook,
# commented out so this file parses as plain Python.


### Data preprocessing

# Load the dataset.  The original post showed a "read the dataset" heading
# but the actual line was lost in the scrape.
# TODO(review): confirm the CSV path/filename (Kaggle's file is train.csv).
df = pd.read_csv('train.csv')

# Fill missing numeric values with the column median — robust to the
# outliers present in Age and Fare.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Encode Sex as an integer: {'male': 0, 'female': 1}.
df['Sex'] = 1 * (df['Sex'] == 'female')

# Embarked: fill missing values with the mode 'S', then map to integers
# {'S': 0, 'C': 1, 'Q': 2} so scikit-learn estimators can consume it.
df['Embarked'] = df['Embarked'].fillna('S')
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})


### 1. Linear regression baseline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

lr = LinearRegression()
fold = KFold(n_splits=5)

# Out-of-fold predictions: fit on each fold's training split and predict its
# held-out split, so every row receives exactly one prediction.
predictions = []
for train_idx, test_idx in fold.split(df):  # was fold.split(titanic): NameError
    train_predictors = df[predictors].iloc[train_idx]
    train_target = df['Survived'].iloc[train_idx]
    lr.fit(train_predictors, train_target)
    predictions.append(lr.predict(df[predictors].iloc[test_idx]))

# KFold without shuffling yields contiguous, ordered splits, so concatenating
# the per-fold predictions restores row alignment with df.
predictions = np.concatenate(predictions, axis=0)
# Threshold the continuous regression output at 0.5 to get a 0/1 label.
predictions = 1 * (predictions > 0.5)
# Accuracy against the true labels.
scores = np.equal(predictions, df['Survived']).mean()
print(scores)  # was print(scores).mean(): print returns None -> AttributeError

# output:
# 0.7878787878787878


### 2. Logistic regression

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Logistic regression is the natural model for a binary target.
# solver='liblinear' sidesteps the ConvergenceWarning raised by the
# default solver on this small dataset.
lr = LogisticRegression(solver='liblinear', random_state=0)

# 5-fold cross-validated accuracy, averaged over the folds.
fold_accuracies = cross_val_score(lr, df[predictors], df['Survived'], cv=5)
scores = fold_accuracies.mean()
print(scores)

# output:
# 0.7890151277383717


### 3. Random forest

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# 50 trees; min_samples_split/min_samples_leaf are left at the library
# defaults (2 and 1), spelled out here to make the knobs visible.
rf = RandomForestClassifier(
    n_estimators=50,
    random_state=0,
    min_samples_split=2,
    min_samples_leaf=1,
)

# Evaluate with an explicit 5-fold splitter and report mean accuracy.
fold = KFold(n_splits=5)
scores = cross_val_score(rf, df[predictors], df['Survived'], cv=fold)
print(scores.mean())

# output:
# 0.8058753373925052


### 4. Feature importance

from sklearn.feature_selection import SelectKBest, f_classif  # univariate feature selection

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Score each feature against the target with the ANOVA F-test.
selector = SelectKBest(f_classif, k=5)
selector.fit(df[predictors], df['Survived'])

# Convert p-values to scores: smaller p-value -> larger -log10(p).
scores = -np.log10(selector.pvalues_)

# Bar chart of the per-feature scores.
positions = range(len(predictors))
plt.bar(positions, scores)
plt.xticks(positions, predictors, rotation='vertical')
plt.show()


### 5. Ensemble (gradient boosting + logistic regression)

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Each entry pairs a classifier with the feature columns it trains on.
# The original post imported GradientBoostingClassifier but never added it
# to the list, so the two-model average below crashed with an IndexError.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors],
    [LogisticRegression(random_state=1, solver='liblinear'), predictors],
]

fold = KFold(n_splits=3)
predictions = []
for train_idx, test_idx in fold.split(df):
    train_target = df['Survived'].iloc[train_idx]
    # Collect each model's P(survived) for the held-out fold.
    all_test_predictions = []
    for alg, cols in algorithms:  # 'cols', not 'predictors': avoid shadowing
        alg.fit(df[cols].iloc[train_idx], train_target)
        all_test_predictions.append(alg.predict_proba(df[cols].iloc[test_idx])[:, 1])
    # Average the models' probabilities (works for any number of models,
    # unlike the original hard-coded [0]+[1] / 2), then threshold at 0.5.
    test_predictions = sum(all_test_predictions) / len(all_test_predictions)
    predictions.append(1 * (test_predictions > 0.5))

# KFold yields ordered, contiguous splits, so concatenation restores
# row alignment with df.
predictions = np.concatenate(predictions, axis=0)
# Accuracy against the true labels.
scores = np.equal(predictions, df['Survived']).mean()
print(scores)

# output:
# 0.8058361391694725

# (CSDN page boilerplate from the web scrape removed here: comment-box text,
# recommendation dates/view counts, and paywall prompts — not tutorial content.)