【教学赛】金融数据分析赛题1：银行客户认购产品预测

纸箱sky

已于 2024-07-07 19:33:21 修改

阅读量919

点赞数 7

文章标签：金融数据分析、人工智能

于 2024-06-19 01:20:20 首次发布

本文链接：https://blog.csdn.net/sky6803/article/details/139787378

版权

一、背景介绍

分类预测是机器学习领域的一个重要分支，旨在根据已有的数据特征将样本划分为不同的类别。在实际应用中，这种分类可以是二分类，如判断用户是否会订阅某项服务，也可以是多分类，如识别图像中的物体类型。本文将以一个二分类任务为例，即预测用户是否会订阅某项服务，来展示整个建模和分析过程。

比赛链接：https://tianchi.aliyun.com/competition/entrance/531993/introduction?spm=a2c22.12281925.0.0.61c271376biLa1

源代码：

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
from lightgbm.sklearn import LGBMClassifier

def bin_duration(df, bins, column):
    """Discretise a continuous column into bins and plot the share of each bin.

    Adds (or overwrites) a ``duration_group`` column on ``df`` in place, then
    draws a bar chart of the per-bin counts with a percentage label above
    each bar.

    Args:
        df: DataFrame holding ``column``; it is modified in place.
        bins: Bin edges passed straight to ``pd.cut``.
        column: Name of the continuous column to discretise.

    Returns:
        None. The result is the new ``duration_group`` column and the figure.
    """
    # right=False makes every interval left-closed: [edge_i, edge_i+1).
    df['duration_group'] = pd.cut(df[column], bins, right=False)
    counts = df['duration_group'].value_counts().sort_index()
    # Loop-invariant: compute the denominator once instead of once per bar.
    total = counts.sum()

    plt.figure(figsize=(6, 2), dpi=120)
    sns.barplot(x=counts.index, y=counts, color='royalblue')
    for x_loc, count in enumerate(counts):
        # Guard against 0/0 when no value falls inside any bin.
        share = count / total * 100 if total else 0.0
        plt.text(x_loc, count + 2, '{:.1f}%'.format(share), ha='center', va='bottom', fontsize=8)
    plt.xticks(fontsize=8)
    plt.yticks([])
    plt.ylabel('')
    plt.title('duration_group Distribution', size=8)
    sns.despine(left=True)
    plt.show()

def plot_feature_distribution(df, test, Nu_feature, Ca_feature):
    """Plot train-vs-test distributions for numeric and categorical features.

    Numeric features are drawn as overlaid KDE curves in a 4-column grid;
    categorical features are drawn as overlaid count plots, one row each.

    Args:
        df: Training DataFrame.
        test: Test DataFrame; must contain every plotted column.
        Nu_feature: Names of the numeric columns to plot.
        Ca_feature: Names of the categorical columns to plot. The target
            column ``'subscribe'`` is ignored if present.

    Returns:
        None. Two figures are shown.
    """
    # The target column only exists in the training data, so never plot it.
    Ca_feature = [col for col in Ca_feature if col != 'subscribe']

    # Keep the original 4x4 layout for up to 16 features, and grow the grid
    # (and the figure height with it) instead of failing beyond that.
    n_cols = 4
    n_rows = max(4, -(-len(Nu_feature) // n_cols))
    plt.figure(figsize=(20, 15 * n_rows / 4))
    for i, col in enumerate(Nu_feature, start=1):
        ax = plt.subplot(n_rows, n_cols, i)
        sns.kdeplot(df[col], color='red', label='Train', ax=ax)
        sns.kdeplot(test[col], color='cyan', label='Test', ax=ax)
        ax.set_xlabel(col)
        ax.set_ylabel('Density')
        ax.legend()  # the Train/Test labels are invisible without a legend
    plt.tight_layout()
    plt.show()

    plt.figure(figsize=(20, 10))
    n_cat = len(Ca_feature)
    for j, col in enumerate(Ca_feature, start=1):
        ax = plt.subplot(n_cat, 1, j)
        # Use one shared category order for both frames so the overlaid bars
        # line up regardless of the order values first appear in each frame.
        order = pd.concat([df[col], test[col]]).dropna().unique().tolist()
        sns.countplot(x=df[col], color='red', label='Train', order=order, ax=ax)
        sns.countplot(x=test[col], color='cyan', label='Test', order=order, ax=ax)
        ax.set_xlabel(col)
        ax.set_ylabel('Count')
        ax.legend()
    plt.tight_layout()
    plt.show()

def encode_categorical_features(df, cols):
    """Label-encode the given columns of ``df`` in place.

    Args:
        df: DataFrame whose columns are overwritten with integer codes.
        cols: Names of the columns to encode.

    Returns:
        The same ``df`` object, after modification.
    """
    for name in cols:
        # Each column is fitted independently, so codes are only meaningful
        # within the frame that was passed in.
        encoder = preprocessing.LabelEncoder()
        df[name] = encoder.fit_transform(df[name])
    return df

def prepare_data(df, test, Ca_feature):
    """Build the feature matrix, target vector and aligned test matrix.

    The ``id`` column, the ``subscribe`` target and every column listed in
    ``Ca_feature`` are dropped from the training features, so only the
    remaining columns are used. The test frame is reduced to exactly the
    same columns in the same order.

    Args:
        df: Training DataFrame containing ``id`` and ``subscribe``.
        test: Test DataFrame containing ``id`` and every retained feature.
        Ca_feature: List of categorical column names to exclude.

    Returns:
        A tuple ``(X, Y, test)``: training features, training target, and
        the test features restricted to ``X.columns``. Neither input frame
        is modified.

    Raises:
        KeyError: If a column to drop or to select is missing.
    """
    X = df.drop(columns=['id', 'subscribe'] + Ca_feature)
    Y = df['subscribe']
    # The categorical columns are excluded from X, so selecting X.columns
    # removes them from the test frame as well; label-encoding them first
    # would be discarded work and is therefore not done.
    test = test.drop(columns='id')
    test = test[X.columns]
    return X, Y, test

def train_model_and_predict(X, Y, test, n_folds=7, output_path='GBM_prediction.csv'):
    """Cross-validate a LightGBM classifier, then fit on all data and predict.

    A fresh model is trained on each K-fold split and its validation AUC is
    printed. A final model is then fitted on the full training set, the test
    probabilities are thresholded at 0.5 into ``'yes'``/``'no'`` labels, and
    the labels are written to a single-column CSV.

    Args:
        X: Training feature DataFrame.
        Y: Training target Series, aligned with ``X``.
        test: Test feature DataFrame with the same columns as ``X``.
        n_folds: Number of cross-validation folds.
        output_path: Destination of the prediction CSV.

    Returns:
        The mean validation AUC across the folds.
    """
    gbm = LGBMClassifier(n_estimators=600, learning_rate=0.01, boosting_type='gbdt',
                         objective='binary', max_depth=-1,
                         random_state=2022, metric='auc')
    params = gbm.get_params()
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=2022)

    fold_scores = []
    for train_index, val_index in kf.split(X):
        x_train_fold, x_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = Y.iloc[train_index], Y.iloc[val_index]

        # Train an independent model per fold so no state leaks between folds.
        gbm_fold = LGBMClassifier(**params)
        gbm_fold.fit(x_train_fold, y_train_fold)

        # Column 1 is taken as the positive-class probability.
        # NOTE(review): assumes the second fitted class is 'yes' - confirm.
        y_pred = gbm_fold.predict_proba(x_val_fold)[:, 1]
        fold_auc = roc_auc_score(y_val_fold, y_pred)  # computed once, reused
        print(f'Validation AUC: {fold_auc}')
        fold_scores.append(fold_auc)

    mean_score = sum(fold_scores) / n_folds

    # The submitted predictions come from a model fitted on all training rows.
    y_pred_final = gbm.fit(X, Y).predict_proba(test)[:, 1]
    ret = pd.DataFrame({'subscribe': np.where(y_pred_final > 0.5, 'yes', 'no')})
    ret.to_csv(output_path, index=False)
    print(f'Mean Validation AUC: {mean_score}')
    return mean_score

if __name__ == "__main__":
    # Example bin edges for duration grouping; not consumed by the steps below.
    bins = [0, 143, 353, 1873, 5149]

    # Load the competition data from the working directory.
    df = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')

    # Split the training columns by dtype: everything that is not
    # object/category counts as numeric, and the target never counts as a
    # categorical feature.
    Nu_feature = list(df.select_dtypes(exclude=['object', 'category']).columns)
    Ca_feature = [
        name
        for name in df.select_dtypes(include=['object', 'category']).columns
        if name != 'subscribe'
    ]

    # Visual comparison of train and test distributions.
    plot_feature_distribution(df, test, Nu_feature, Ca_feature)

    # Build the model inputs, then cross-validate, fit and write predictions.
    X, Y, test = prepare_data(df, test, Ca_feature)
    train_model_and_predict(X, Y, test)