机器学习项目实战——泰坦尼克号获救预测

导入相关库

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

%matplotlib inline

数据预处理

# Load the training set (adjust the path to your local data directory).
df = pd.read_csv(r'...\data\titanic_train.csv')
# df.head()

# Impute missing numeric values with each column's median.
for col in ('Age', 'Fare'):
    df[col] = df[col].fillna(df[col].median())

# Encode Sex as an integer: male -> 0, female -> 1.
df['Sex'] = (df['Sex'] == 'female').astype(int)

# Embarked: fill missing values with the mode 'S' (obtainable via
# scipy.stats.mode), then map ports to integers {'S': 0, 'C': 1, 'Q': 2}.
df['Embarked'] = df['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})

1. 使用线性回归预测

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# Feature columns used as predictors.
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

lr = LinearRegression()
fold = KFold(n_splits=5)

# Collect out-of-fold predictions over 5-fold cross-validation.
predictions = []
# BUG FIX: was `fold.split(titanic)` — `titanic` is undefined; the frame is `df`.
for train, test in fold.split(df):
    train_predictors = df[predictors].iloc[train, :]
    train_target = df['Survived'].iloc[train]
    lr.fit(train_predictors, train_target)

    test_predictions = lr.predict(df[predictors].iloc[test, :])
    predictions.append(test_predictions)

# KFold partitions rows in order, so concatenating restores row order.
predictions = np.concatenate(predictions, axis=0)
# Threshold the regression output at 0.5 to obtain class labels.
predictions = 1 * (predictions > 0.5)
# Accuracy: fraction of rows where the prediction matches the label.
scores = np.equal(predictions, df['Survived']).mean()
# BUG FIX: was `print(scores).mean()` — print() returns None, so .mean() raised.
print(scores)

# output
0.7878787878787878

2. 使用逻辑回归预测

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier.
# solver='liblinear' avoids a ConvergenceWarning on this small data set.
lr = LogisticRegression(random_state=0, solver='liblinear')
# Mean accuracy over 5-fold cross-validation.
scores = cross_val_score(lr, df[predictors], df['Survived'], cv=5).mean()
print(scores)

# output
0.7890151277383717

3. 使用随机森林算法

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Random forest: 50 trees; default split/leaf sizes written out explicitly.
rf = RandomForestClassifier(n_estimators=50, random_state=0,
                            min_samples_split=2, min_samples_leaf=1)
# Cross-validate with an explicit 5-fold splitter and report the mean accuracy.
fold = KFold(n_splits=5)
scores = cross_val_score(rf, df[predictors], df['Survived'], cv=fold)
print(scores.mean())

# output
0.8058753373925052

4. 特征重要性

from sklearn.feature_selection import SelectKBest, f_classif  # univariate feature selection

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Score each feature against the survival label with the ANOVA F-test.
selector = SelectKBest(f_classif, k=5)
selector.fit(df[predictors], df['Survived'])

# Convert p-values to scores: smaller p -> larger -log10(p) -> more important.
scores = -np.log10(selector.pvalues_)

# Bar chart of the per-feature scores.
positions = range(len(predictors))
plt.bar(positions, scores)
plt.xticks(positions, predictors, rotation='vertical')
plt.show()

（图：各特征的 F 检验重要性评分柱状图）

5. 集成算法

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Algorithms to ensemble, each paired with its feature columns.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=50, max_depth=3), predictors],
    [LogisticRegression(random_state=1, solver='liblinear'), predictors]
]

fold = KFold(n_splits=3)
predictions = []
for train, test in fold.split(df):
    train_target = df['Survived'].iloc[train]
    # Collect each classifier's P(survived) on the test fold.
    all_test_predictions = []
    # FIX: the loop variable was named `predictors`, shadowing (and after the
    # loop rebinding) the module-level feature list — renamed to `cols`.
    for alg, cols in algorithms:
        alg.fit(df[cols].iloc[train, :], train_target)
        test_predictions = alg.predict_proba(df[cols].iloc[test, :])[:, 1]
        all_test_predictions.append(test_predictions)
    # Average the two classifiers' probabilities ...
    test_predictions = (all_test_predictions[0] + all_test_predictions[1]) / 2
    # ... and threshold at 0.5 to get class labels.
    test_predictions = 1 * (test_predictions > 0.5)
    predictions.append(test_predictions)

# KFold partitions rows in order, so concatenating restores row order.
predictions = np.concatenate(predictions, axis=0)
# Accuracy of the out-of-fold ensemble predictions.
scores = np.equal(predictions, df['Survived']).mean()
print(scores)

# output
0.8058361391694725
展开阅读全文

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 书香水墨 设计师: CSDN官方博客
应支付0元
点击重新获取
扫码支付

支付成功即可阅读