[Case Study] Titanic Survival Prediction

Background

  • Classification task: predict what kinds of passengers survived
  • 891 samples, 7 features
  • Some features have missing values that need filling; others carry little information and are dropped before modeling
  • Analyze each feature's importance to the final survival outcome

Data Analysis Workflow

Data Overview

Plot frequency histograms for the categorical columns to check whether any of them are heavily imbalanced.

They can be shown directly as histograms, or you can compute the frequencies and plot them as bar charts.

count_classes = data_in['Sex'].value_counts().sort_index()  # pd.value_counts() is deprecated; use the Series method
count_classes.plot(kind='bar')
plt.title("Sex class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")


# One bar chart per categorical column, laid out on a 2x3 grid
fig, ((ax11, ax12, ax13), (ax21, ax22, ax23)) = plt.subplots(2, 3)
a = data_in.Survived.value_counts()
b = data_in.Pclass.value_counts()
c = data_in.Sex.value_counts()
d = data_in.SibSp.value_counts()
e = data_in.Parch.value_counts()
f = data_in.Embarked.value_counts()
ax11.bar(list(a.index.values), a.values, align='center', width=0.35)
ax12.bar(list(b.index.values), b.values, align='center', width=0.35)
ax13.bar(list(c.index.values), c.values, align='center', width=0.35)
ax21.bar(list(d.index.values), d.values, align='center', width=0.35)
ax22.bar(list(e.index.values), e.values, align='center', width=0.35)
ax23.bar(list(f.index.values), f.values, align='center', width=0.35)

# Plot a single column on its own
plt.hist(data_in.Survived)

# Add a count label above each bar
for idx, v in enumerate(a.values):
    print(v)
    plt.text(x=a.index[idx], y=v + 0.5, s=str(v))

plt.show()
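
As a quick numeric complement to the plots, the class ratio can also be printed directly; a small optional check on the same data_in frame:

# Share of each Survived class; values far from 0.5 indicate imbalance
print(data_in['Survived'].value_counts(normalize=True))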

Run a correlation analysis on the numeric columns and visualize it with scatter plots or a heatmap.

data_in['Age'] = data_in['Age'].fillna(data_in['Age'].mean())
data_in['Fare'] = data_in['Fare'].fillna(data_in['Fare'].mean())

# Correlation between two columns; the continuous variables must not contain NaNs
r1 = np.corrcoef(data_in['Age'], data_in['Fare'])
# np.corrcoef needs an all-numeric matrix, so restrict it to the numeric columns
r2 = np.corrcoef(data_in[['Age', 'Fare', 'SibSp', 'Parch']], rowvar=False)

# Scatter plots to show the correlation
r3 = ss.pearsonr(data_in['Age'], data_in['Fare'])  # correlation coefficient and p-value for "no correlation" (assumes: import scipy.stats as ss)
r4 = data_in['Age'].corr(data_in['Fare'])
sns.pairplot(data_in[['Age', 'Fare']])
pd.plotting.scatter_matrix(data_in[['Age', 'Fare']], figsize=(12, 12), range_padding=0.5)
# Heatmap display
figure, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(data_in[['Age', 'Fare']].corr(), square=True, annot=True, ax=ax)
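
The heatmap above only covers two columns. Extending it to all of the numeric columns (the same four that the complete script below selects) is a one-line change; a sketch:

# Heatmap over all numeric feature columns
num_cols = ['Age', 'Fare', 'SibSp', 'Parch']
sns.heatmap(data_in[num_cols].corr(), square=True, annot=True)
plt.show()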

Data Preprocessing

Drop columns with more than 90% of their values missing

# Drop columns that are more than 90% missing
def missing_percent(df):
    nan_percent = 100 * (df.isnull().sum() / len(df))
    # df.isnull().sum() counts the missing values in each column;
    # dividing by len(df) gives the missing fraction per column,
    # and *100 turns it into a percentage
    nan_percent = nan_percent[nan_percent > 0].sort_values()
    # Keep only the columns that actually have missing values (> 0),
    # sorted in ascending order
    return nan_percent


miss_data = missing_percent(data_in)
data_in.drop(columns=miss_data[miss_data > 90].index.values, inplace=True)

For the remaining missing values, fill with the mean or median

data_in['Age'] = data_in['Age'].fillna(data_in['Age'].mean())
data_in['Fare'] = data_in['Fare'].fillna(data_in['Fare'].mean())
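
The heading above mentions the median as well, which is more robust for skewed columns; a one-line sketch for Fare:

# The median is less sensitive to outliers than the mean (Fare is heavily right-skewed)
data_in['Fare'] = data_in['Fare'].fillna(data_in['Fare'].median())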

Encode the categorical variables

# Encode the categorical variables
class_encoder = LabelEncoder()
# The column must not contain NaNs when encoding
data_in['Sex'] = class_encoder.fit_transform(data_in['Sex'].values)
# Mapping via a dictionary
Embarked_mapping = {'S': 3, 'C': 2, 'Q': 1}
data_in['Embarked'] = data_in['Embarked'].map(Embarked_mapping)
data_in['Embarked'] = data_in['Embarked'].fillna(0)
data_in['Embarked'] = data_in['Embarked'].astype(int)
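
The dictionary mapping above imposes an artificial order on Embarked. For nominal features, one-hot encoding is a common alternative; a minimal sketch with pandas, applied to the raw 'S'/'C'/'Q' column before the mapping above (the Embarked_* column names are generated by get_dummies):

# One indicator column per port; avoids implying an order such as S > C > Q
dummies = pd.get_dummies(data_in['Embarked'], prefix='Embarked')
data_in = pd.concat([data_in, dummies], axis=1)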

Bin the Age feature

# Bin Age; pandas offers two functions, cut and qcut: equal-width and equal-frequency binning
data_in['Age_new'] = pd.cut(data_in['Age'], 4, labels=(1, 2, 3, 4))
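
pd.cut gives equal-width bins; the qcut variant mentioned in the comment gives equal-frequency (quantile) bins instead. A sketch, where Age_q is a hypothetical column name used only for illustration:

# Each bin holds roughly 25% of the rows; labels=False returns bin indices 0-3
data_in['Age_q'] = pd.qcut(data_in['Age'], 4, labels=False)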

Add a name-length feature

# New feature: length of the surname (the part of Name before the comma)
data_in['name_len'] = data_in['Name'].apply(lambda x: len(x.split(',')[0]))

Model Building

Split the dataset

# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
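
Because the survival classes are unbalanced, a stratified split is a common refinement that keeps the same class ratio in both sets; an optional variant of the call above:

# stratify=y preserves the survived/died ratio in the train and test sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)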

Random Forest Modeling

# Build the model
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=4, min_samples_leaf=2)
# Train
alg.fit(x_train, y_train)

Model Evaluation

# Predict
y_pred = alg.predict(x_test)
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))  # confusion matrix of predictions vs. true labels
print("Classification Report:")
print(classification_report(y_test, y_pred))  # per-class precision, recall, and F1
print("Accuracy:")
print(accuracy_score(y_test, y_pred))
print(alg.score(x_test, y_test))  # same as accuracy_score on the test set
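
Accuracy alone can be misleading on unbalanced classes, so ROC-AUC is a common addition for binary classifiers; a sketch using the fitted alg from above:

from sklearn.metrics import roc_auc_score

# Score on predicted probabilities of the positive class, not on hard labels
y_proba = alg.predict_proba(x_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, y_proba))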

Output the feature importances

# Compute feature importances
importances = alg.feature_importances_
print(importances)
# Plot the feature importances
plt.barh(range(len(importances)), importances)
# Add a title
plt.title("Feature Importances")
# Add the feature names
plt.yticks(range(len(importances)), predictors)
# Show the figure
plt.show()
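
An optional readability tweak: sort the importances before plotting so the ranking is obvious at a glance. A sketch:

# Sort ascending so the most important feature ends up at the top of the barh plot
order = np.argsort(importances)
plt.barh(np.array(predictors)[order], importances[order])
plt.title("Feature Importances (sorted)")
plt.show()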

Cross-Validation

# Cross-validation; modern sklearn's KFold takes n_splits (not the sample count),
# and random_state only applies when shuffle=True
kf = model_selection.KFold(n_splits=3, shuffle=True, random_state=1)
scores = model_selection.cross_val_score(alg, X, y, cv=kf)
print(scores.mean())
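
For classification, StratifiedKFold keeps the class ratio in every fold and is generally preferred over plain KFold; a sketch:

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
scores = model_selection.cross_val_score(alg, X, y, cv=skf)
print(scores.mean())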

Complete Code

import re
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier

titanic = pd.read_csv("E:\\ai\\main\\titanic_train.csv")
# Fill missing ages with the median
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
# Convert the categorical features to numeric codes
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1
titanic["Embarked"] = titanic["Embarked"].fillna('S')
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2

# Features used for modeling
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Linear regression baseline
alg = LinearRegression()
kf = KFold(3, shuffle=False)
kf.get_n_splits(titanic)

predictions = []
for train, test in kf.split(titanic):
    # The predictors we're using to train the algorithm.  Note how we only take the rows in the train folds.
    train_predictors = (titanic[predictors].iloc[train, :])
    # The target we're using to train the algorithm.
    train_target = titanic["Survived"].iloc[train]
    # Training the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

# The predictions are in three separate numpy arrays.  Concatenate them into one.
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)

# Map predictions to outcomes (only possible outcomes are 1 and 0)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
# Fraction of predictions that match the true labels
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)

# Logistic regression
alg = LogisticRegression(random_state=1, max_iter=1000)
# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())

# Random forest
alg = RandomForestClassifier(random_state=1, n_estimators=1000, min_samples_split=4, min_samples_leaf=2)
kf = KFold(3)
kf.get_n_splits(titanic)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())

# Add new features
# Generating a familysize column
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]

# The .apply method generates a new series
titanic["NameLength"] = titanic["Name"].apply(lambda x: len(x))


# Use a regular expression to extract the title from a name
def get_title(name):
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""


titles = titanic["Name"].apply(get_title)

# Map each title to a numeric code
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8,
                 "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k, v in title_mapping.items():
    titles[titles == k] = v

titanic["Title"] = titles

# Use the new features
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title", "NameLength"]

# Show each feature's importance via univariate feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])
# Get the raw p-values for each feature, and transform from p-values into scores
scores = -np.log10(selector.pvalues_)
# Plot the scores.  See how "Pclass", "Sex", "Title", and "Fare" are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

# Predict using only the best features
# Pick only the four best features.
predictors = ["Pclass", "Sex", "Fare", "Title"]

alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=8, min_samples_leaf=4)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
print(scores.mean())
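
# Optional (not part of the original script): the hyperparameters above were
# picked by hand; a small grid search is one way to tune them instead
from sklearn.model_selection import GridSearchCV

param_grid = {"n_estimators": [50, 100], "min_samples_split": [4, 8], "min_samples_leaf": [2, 4]}
grid = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=3)
grid.fit(titanic[predictors], titanic["Survived"])
print(grid.best_params_, grid.best_score_)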

# Ensemble: average gradient boosting with logistic regression
# (model, feature list) pairs
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title", ]],
    [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]

# Initialize the cross validation folds
kf = KFold(3)
kf.get_n_splits(titanic)

predictions = []
for train, test in kf.split(titanic):
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # Make predictions for each algorithm on each fold
    for alg, predictors in algorithms:
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)

predictions = np.concatenate(predictions, axis=0)

# Fraction of predictions that match the true labels
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
# ----------------------------------------------------------------------
# Second script: exploratory analysis, preprocessing, and random forest
# ----------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

data_in = pd.read_csv('titanic_train.csv')


# Drop columns that are more than 90% missing
def missing_percent(df):
    nan_percent = 100 * (df.isnull().sum() / len(df))
    # df.isnull().sum() counts the missing values in each column;
    # dividing by len(df) gives the missing fraction per column,
    # and *100 turns it into a percentage
    nan_percent = nan_percent[nan_percent > 0].sort_values()
    # Keep only the columns that actually have missing values (> 0),
    # sorted in ascending order
    return nan_percent


miss_data = missing_percent(data_in)
data_in.drop(columns=miss_data[miss_data > 90].index.values, inplace=True)

# Handle missing data: mean filling
data_in['Age'] = data_in['Age'].fillna(data_in['Age'].mean())
data_in['Fare'] = data_in['Fare'].fillna(data_in['Fare'].mean())

# Correlation analysis over the continuous numeric features
data_col = ['Age', 'Fare', 'SibSp', 'Parch']
data_cor = data_in[data_col]
data_cor = data_cor.fillna(data_cor.mean())
# Correlation matrix over all selected columns
r5 = data_cor.corr()

# Encode the categorical variables
class_encoder = LabelEncoder()
# The column must not contain NaNs when encoding
data_in['Sex'] = class_encoder.fit_transform(data_in['Sex'].values)
# Mapping via a dictionary
Embarked_mapping = {'S': 3, 'C': 2, 'Q': 1}
data_in['Embarked'] = data_in['Embarked'].map(Embarked_mapping)
data_in['Embarked'] = data_in['Embarked'].fillna(0)
data_in['Embarked'] = data_in['Embarked'].astype(int)

# Bin Age; pandas offers two functions, cut and qcut: equal-width and equal-frequency binning
data_in['Age_new'] = pd.cut(data_in['Age'], 4, labels=(1, 2, 3, 4))
# New feature: length of the surname (the part of Name before the comma)
data_in['name_len'] = data_in['Name'].apply(lambda x: len(x.split(',')[0]))

# Drop columns that are not useful for modeling
data_in.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

# Random forest modeling


predictors = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
              'Fare', 'Embarked', 'Age_new', 'name_len']
X = data_in[predictors]
y = data_in["Survived"]

from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Build the model
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=4, min_samples_leaf=2)

# Train
alg.fit(x_train, y_train)

# Predict
y_pred = alg.predict(x_test)
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))  # confusion matrix of predictions vs. true labels
print("Classification Report:")
print(classification_report(y_test, y_pred))  # per-class precision, recall, and F1
print("Accuracy:")
print(accuracy_score(y_test, y_pred))
print(alg.score(x_test, y_test))  # same as accuracy_score on the test set

# Compute feature importances
importances = alg.feature_importances_
print(importances)
# Plot the feature importances
plt.barh(range(len(importances)), importances)
# Add a title
plt.title("Feature Importances")
# Add the feature names
plt.yticks(range(len(importances)), predictors)
# Show the figure
plt.show()

# Cross-validation; modern sklearn's KFold takes n_splits (not the sample count),
# and random_state only applies when shuffle=True
kf = model_selection.KFold(n_splits=3, shuffle=True, random_state=1)
scores = model_selection.cross_val_score(alg, X, y, cv=kf)
print(scores.mean())

Data Download

Link: https://pan.baidu.com/s/1N0PJ9mr-ZaBnWOxN0FKUFA  Extraction code: 1234
 
