[Case Study] Titanic Survival Prediction

Background

  • Classification task: predict what kinds of passengers survived
  • 891 samples, 7 features
  • Some features have missing values that need filling; others carry little information and are dropped before modeling
  • Analyze each feature's importance to the final survival outcome

Data Analysis Workflow

Data Overview

Plot frequency histograms for the categorical columns to check whether any of them are heavily imbalanced.

They can be shown directly as histograms, or you can compute the frequencies and plot them as bar charts.

count_classes = data_in['Sex'].value_counts().sort_index()  # pd.value_counts() is deprecated; use the Series method
count_classes.plot(kind='bar')
plt.title("Sex class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")


# One bar chart per categorical column, laid out on a 2x3 grid
fig, ((ax11, ax12, ax13), (ax21, ax22, ax23)) = plt.subplots(2, 3)
a = data_in.Survived.value_counts()
b = data_in.Pclass.value_counts()
c = data_in.Sex.value_counts()
d = data_in.SibSp.value_counts()
e = data_in.Parch.value_counts()
f = data_in.Embarked.value_counts()
ax11.bar(list(a.index.values), a.values, align='center', width=0.35)
ax12.bar(list(b.index.values), b.values, align='center', width=0.35)
ax13.bar(list(c.index.values), c.values, align='center', width=0.35)
ax21.bar(list(d.index.values), d.values, align='center', width=0.35)
ax22.bar(list(e.index.values), e.values, align='center', width=0.35)
ax23.bar(list(f.index.values), f.values, align='center', width=0.35)

# Plot a single column on its own
plt.hist(data_in.Survived)

# Add a count label above each bar
for idx, v in enumerate(a.values):
    print(v)
    plt.text(x=a.index[idx], y=v + 0.5, s=str(v))

plt.show()
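
As a quick numeric complement to the plots, the class ratio can also be printed directly; a small optional check on the same data_in frame:

# Share of each Survived class; values far from 0.5 indicate imbalance
print(data_in['Survived'].value_counts(normalize=True))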

Run a correlation analysis on the numeric columns and visualize it with scatter plots or a heatmap.

data_in['Age'] = data_in['Age'].fillna(data_in['Age'].mean())
data_in['Fare'] = data_in['Fare'].fillna(data_in['Fare'].mean())

# Correlation between two columns; the continuous variables must not contain NaNs
r1 = np.corrcoef(data_in['Age'], data_in['Fare'])
# np.corrcoef needs an all-numeric matrix, so restrict it to the numeric columns
r2 = np.corrcoef(data_in[['Age', 'Fare', 'SibSp', 'Parch']], rowvar=False)

# Scatter plots to show the correlation
r3 = ss.pearsonr(data_in['Age'], data_in['Fare'])  # correlation coefficient and p-value for "no correlation" (assumes: import scipy.stats as ss)
r4 = data_in['Age'].corr(data_in['Fare'])
sns.pairplot(data_in[['Age', 'Fare']])
pd.plotting.scatter_matrix(data_in[['Age', 'Fare']], figsize=(12, 12), range_padding=0.5)
# Heatmap display
figure, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(data_in[['Age', 'Fare']].corr(), square=True, annot=True, ax=ax)
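
The heatmap above only covers two columns. Extending it to all of the numeric columns (the same four that the complete script below selects) is a one-line change; a sketch:

# Heatmap over all numeric feature columns
num_cols = ['Age', 'Fare', 'SibSp', 'Parch']
sns.heatmap(data_in[num_cols].corr(), square=True, annot=True)
plt.show()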

Data Preprocessing

Drop columns with more than 90% of their values missing

# Drop columns that are more than 90% missing
def missing_percent(df):
    nan_percent = 100 * (df.isnull().sum() / len(df))
    # df.isnull().sum() counts the missing values in each column;
    # dividing by len(df) gives the missing fraction per column,
    # and *100 turns it into a percentage
    nan_percent = nan_percent[nan_percent > 0].sort_values()
    # Keep only the columns that actually have missing values (> 0),
    # sorted in ascending order
    return nan_percent


miss_data = missing_percent(data_in)
data_in.drop(columns=miss_data[miss_data > 90].index.values, inplace=True)

For the remaining missing values, fill with the mean or median

data_in['Age'] = data_in['Age'].fillna(data_in['Age'].mean())
data_in['Fare'] = data_in['Fare'].fillna(data_in['Fare'].mean())
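
The heading above mentions the median as well, which is more robust for skewed columns; a one-line sketch for Fare:

# The median is less sensitive to outliers than the mean (Fare is heavily right-skewed)
data_in['Fare'] = data_in['Fare'].fillna(data_in['Fare'].median())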

Encode the categorical variables

# Encode the categorical variables
class_encoder = LabelEncoder()
# The column must not contain NaNs when encoding
data_in['Sex'] = class_encoder.fit_transform(data_in['Sex'].values)
# Mapping via a dictionary
Embarked_mapping = {'S': 3, 'C': 2, 'Q': 1}
data_in['Embarked'] = data_in['Embarked'].map(Embarked_mapping)
data_in['Embarked'] = data_in['Embarked'].fillna(0)
data_in['Embarked'] = data_in['Embarked'].astype(int)
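
The dictionary mapping above imposes an artificial order on Embarked. For nominal features, one-hot encoding is a common alternative; a minimal sketch with pandas, applied to the raw 'S'/'C'/'Q' column before the mapping above (the Embarked_* column names are generated by get_dummies):

# One indicator column per port; avoids implying an order such as S > C > Q
dummies = pd.get_dummies(data_in['Embarked'], prefix='Embarked')
data_in = pd.concat([data_in, dummies], axis=1)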

Bin the Age feature

# Bin Age; pandas offers two functions, cut and qcut: equal-width and equal-frequency binning
data_in['Age_new'] = pd.cut(data_in['Age'], 4, labels=(1, 2, 3, 4))
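
pd.cut gives equal-width bins; the qcut variant mentioned in the comment gives equal-frequency (quantile) bins instead. A sketch, where Age_q is a hypothetical column name used only for illustration:

# Each bin holds roughly 25% of the rows; labels=False returns bin indices 0-3
data_in['Age_q'] = pd.qcut(data_in['Age'], 4, labels=False)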

Add a name-length feature

# New feature: length of the surname (the part of Name before the comma)
data_in['name_len'] = data_in['Name'].apply(lambda x: len(x.split(',')[0]))

Model Building

Split the dataset

# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
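
Because the survival classes are unbalanced, a stratified split is a common refinement that keeps the same class ratio in both sets; an optional variant of the call above:

# stratify=y preserves the survived/died ratio in the train and test sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)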

Random Forest Modeling

# Build the model
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=4, min_samples_leaf=2)
# Train
alg.fit(x_train, y_train)

Model Evaluation

# Predict
y_pred = alg.predict(x_test)
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))  # confusion matrix of predictions vs. true labels
print("Classification Report:")
print(classification_report(y_test, y_pred))  # per-class precision, recall, and F1
print("Accuracy:")
print(accuracy_score(y_test, y_pred))
print(alg.score(x_test, y_test))  # same as accuracy_score on the test set
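
Accuracy alone can be misleading on unbalanced classes, so ROC-AUC is a common addition for binary classifiers; a sketch using the fitted alg from above:

from sklearn.metrics import roc_auc_score

# Score on predicted probabilities of the positive class, not on hard labels
y_proba = alg.predict_proba(x_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, y_proba))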

Output the feature importances

# Compute feature importances
importances = alg.feature_importances_
print(importances)
# Plot the feature importances
plt.barh(range(len(importances)), importances)
# Add a title
plt.title("Feature Importances")
# Add the feature names
plt.yticks(range(len(importances)), predictors)
# Show the figure
plt.show()
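
An optional readability tweak: sort the importances before plotting so the ranking is obvious at a glance. A sketch:

# Sort ascending so the most important feature ends up at the top of the barh plot
order = np.argsort(importances)
plt.barh(np.array(predictors)[order], importances[order])
plt.title("Feature Importances (sorted)")
plt.show()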

Cross-Validation

# Cross-validation; modern sklearn's KFold takes n_splits (not the sample count),
# and random_state only applies when shuffle=True
kf = model_selection.KFold(n_splits=3, shuffle=True, random_state=1)
scores = model_selection.cross_val_score(alg, X, y, cv=kf)
print(scores.mean())
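
For classification, StratifiedKFold keeps the class ratio in every fold and is generally preferred over plain KFold; a sketch:

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
scores = model_selection.cross_val_score(alg, X, y, cv=skf)
print(scores.mean())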

Complete Code

import re
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier

titanic = pd.read_csv("E:\\ai\\main\\titanic_train.csv")
# Fill missing ages with the median
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
# Convert the categorical features to numeric codes
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1
titanic["Embarked"] = titanic["Embarked"].fillna('S')
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2

# Features used for modeling
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Linear regression baseline
alg = LinearRegression()
kf = KFold(3, shuffle=False)
kf.get_n_splits(titanic)

predictions = []
for train, test in kf.split(titanic):
    # The predictors we're using to train the algorithm.  Note how we only take the rows in the train folds.
    train_predictors = (titanic[predictors].iloc[train, :])
    # The target we're using to train the algorithm.
    train_target = titanic["Survived"].iloc[train]
    # Training the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

# The predictions are in three separate numpy arrays.  Concatenate them into one.
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)

# Map predictions to outcomes (only possible outcomes are 1 and 0)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
# Fraction of predictions that match the true labels
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)

# Logistic regression
alg = LogisticRegression(random_state=1, max_iter=1000)
# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())

# Random forest
alg = RandomForestClassifier(random_state=1, n_estimators=1000, min_samples_split=4, min_samples_leaf=2)
kf = KFold(3)
kf.get_n_splits(titanic)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())

# Add new features
# Generating a familysize column
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]

# The .apply method generates a new series
titanic["NameLength"] = titanic["Name"].apply(lambda x: len(x))


# Use a regular expression to extract the title from a name
def get_title(name):
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""


titles = titanic["Name"].apply(get_title)

# Map each title to a numeric code
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8,
                 "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k, v in title_mapping.items():
    titles[titles == k] = v

titanic["Title"] = titles

# Use the new features
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title", "NameLength"]

# Show each feature's importance via univariate feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])
# Get the raw p-values for each feature, and transform from p-values into scores
scores = -np.log10(selector.pvalues_)
# Plot the scores.  See how "Pclass", "Sex", "Title", and "Fare" are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

# Predict using only the best features
# Pick only the four best features.
predictors = ["Pclass", "Sex", "Fare", "Title"]

alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=8, min_samples_leaf=4)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
print(scores.mean())
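
# Optional (not part of the original script): the hyperparameters above were
# picked by hand; a small grid search is one way to tune them instead
from sklearn.model_selection import GridSearchCV

param_grid = {"n_estimators": [50, 100], "min_samples_split": [4, 8], "min_samples_leaf": [2, 4]}
grid = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=3)
grid.fit(titanic[predictors], titanic["Survived"])
print(grid.best_params_, grid.best_score_)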

# Ensemble: average gradient boosting with logistic regression
# (model, feature list) pairs
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title", ]],
    [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]

# Initialize the cross validation folds
kf = KFold(3)
kf.get_n_splits(titanic)

predictions = []
for train, test in kf.split(titanic):
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # Make predictions for each algorithm on each fold
    for alg, predictors in algorithms:
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)

predictions = np.concatenate(predictions, axis=0)

# Fraction of predictions that match the true labels
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
# ----------------------------------------------------------------------
# Second script: exploratory analysis, preprocessing, and random forest
# ----------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

data_in = pd.read_csv('titanic_train.csv')


# Drop columns that are more than 90% missing
def missing_percent(df):
    nan_percent = 100 * (df.isnull().sum() / len(df))
    # df.isnull().sum() counts the missing values in each column;
    # dividing by len(df) gives the missing fraction per column,
    # and *100 turns it into a percentage
    nan_percent = nan_percent[nan_percent > 0].sort_values()
    # Keep only the columns that actually have missing values (> 0),
    # sorted in ascending order
    return nan_percent


miss_data = missing_percent(data_in)
data_in.drop(columns=miss_data[miss_data > 90].index.values, inplace=True)

# Handle missing data: mean filling
data_in['Age'] = data_in['Age'].fillna(data_in['Age'].mean())
data_in['Fare'] = data_in['Fare'].fillna(data_in['Fare'].mean())

# Correlation analysis over the continuous numeric features
data_col = ['Age', 'Fare', 'SibSp', 'Parch']
data_cor = data_in[data_col]
data_cor = data_cor.fillna(data_cor.mean())
# Correlation matrix over all selected columns
r5 = data_cor.corr()

# Encode the categorical variables
class_encoder = LabelEncoder()
# The column must not contain NaNs when encoding
data_in['Sex'] = class_encoder.fit_transform(data_in['Sex'].values)
# Mapping via a dictionary
Embarked_mapping = {'S': 3, 'C': 2, 'Q': 1}
data_in['Embarked'] = data_in['Embarked'].map(Embarked_mapping)
data_in['Embarked'] = data_in['Embarked'].fillna(0)
data_in['Embarked'] = data_in['Embarked'].astype(int)

# Bin Age; pandas offers two functions, cut and qcut: equal-width and equal-frequency binning
data_in['Age_new'] = pd.cut(data_in['Age'], 4, labels=(1, 2, 3, 4))
# New feature: length of the surname (the part of Name before the comma)
data_in['name_len'] = data_in['Name'].apply(lambda x: len(x.split(',')[0]))

# Drop columns that are not useful for modeling
data_in.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

# Random forest modeling


predictors = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
              'Fare', 'Embarked', 'Age_new', 'name_len']
X = data_in[predictors]
y = data_in["Survived"]

from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Build the model
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=4, min_samples_leaf=2)

# Train
alg.fit(x_train, y_train)

# Predict
y_pred = alg.predict(x_test)
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))  # confusion matrix of predictions vs. true labels
print("Classification Report:")
print(classification_report(y_test, y_pred))  # per-class precision, recall, and F1
print("Accuracy:")
print(accuracy_score(y_test, y_pred))
print(alg.score(x_test, y_test))  # same as accuracy_score on the test set

# Compute feature importances
importances = alg.feature_importances_
print(importances)
# Plot the feature importances
plt.barh(range(len(importances)), importances)
# Add a title
plt.title("Feature Importances")
# Add the feature names
plt.yticks(range(len(importances)), predictors)
# Show the figure
plt.show()

# Cross-validation; modern sklearn's KFold takes n_splits (not the sample count),
# and random_state only applies when shuffle=True
kf = model_selection.KFold(n_splits=3, shuffle=True, random_state=1)
scores = model_selection.cross_val_score(alg, X, y, cv=kf)
print(scores.mean())

Data Download

Link: https://pan.baidu.com/s/1N0PJ9mr-ZaBnWOxN0FKUFA  Extraction code: 1234
 
