机器学习，信用卡项目（二分类问题）完整代码 + 详细注释

波尔德
已于 2022-06-03 19:46:33 修改
阅读量539
点赞数
分类专栏：笔记、学习生活　文章标签：python、机器学习、人工智能、数据分析
于 2022-06-03 19:40:04 首次发布
本文链接：https://blog.csdn.net/weixin_44025103/article/details/125115193
版权
学习生活同时被 2 个专栏收录
129 篇文章 0 订阅
订阅专栏
笔记
112 篇文章 7 订阅
订阅专栏
有需要的小伙伴直接复制就行：
# 下采样完整版代码，加注释，超详细，直接复制就行

# 信用卡问题。在当前的数据集中有正常的数据(用0来表示)，也有异常的数据(用1来表示)
# 我们要做的就是通过训练数据集将测试集中正常的数据和异常的数据进行分类，这是一个典型的通过建模解决二分类任务。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the raw credit-card transactions table.
# The 'Class' column is the label: Class == 0 is a normal sample,
# Class == 1 is an abnormal (fraud) sample.
data = pd.read_csv("creditcard.csv")
# print('直接读取credicard表\n',data.head(5))

# # Optional data visualisation: bar chart of the class frequencies.
# count_classes = pd.value_counts(data['Class'], sort = True).sort_index()
# count_classes.plot(kind = 'bar')
# plt.title("Fraud category histogram")
# plt.xlabel("Class")
# plt.ylabel("Frequency")
# plt.show()

from sklearn.preprocessing import StandardScaler

# Standardise 'Amount' and store the result in a new 'normAmount' column.
# reshape(-1, 1) produces a single column and lets NumPy infer the row count,
# which is the 2-D layout the scaler is given here.
amount_as_column = data['Amount'].values.reshape(-1, 1)
amount_scaler = StandardScaler()
scaled_amount = amount_scaler.fit_transform(amount_as_column)
data['normAmount'] = scaled_amount.ravel()

# 'Time' and the raw 'Amount' are dropped from the table.
data = data.drop(columns=['Time', 'Amount'])
print('标准化后的creditcard表：\n', data.head(5))

# Under-sampling: keep every fraud row plus an equally sized random subset of
# the normal rows, so both classes end up with the same number of samples.
X = data.loc[:, data.columns != 'Class']  # features: every column except the label
y = data.loc[:, data.columns == 'Class']  # label, kept as a one-column DataFrame

# Row labels and count of the fraud (Class == 1) samples.
fraud_indices = np.array(data[data.Class == 1].index)
number_records_fraud_len = len(fraud_indices)
# print('获取样本==1（异常样本）的下标', fraud_indices)

# Row labels of the normal (Class == 0) samples.
normal_indices = data[data.Class == 0].index

# Draw as many normal rows as there are fraud rows, without replacement
# (replace=False means no row can be picked twice).
# The generator is seeded so the sample is reproducible, in line with the
# random_state=0 passed to train_test_split elsewhere in this script.
random_normal_indices = np.random.RandomState(0).choice(
    normal_indices, number_records_fraud_len, replace=False)

# Fraud rows first, followed by the sampled normal rows.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# These are row *labels* taken from data.index, so they must be looked up
# with .loc; .iloc is positional and would only give the same rows while the
# index happens to be the default 0..n-1 range.
under_sample_data = data.loc[under_sample_indices, :]

# Features and label of the under-sampled table.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Show the class ratio (printed as a fraction of the under-sampled total).
print("class=0的样本占总样本的百分比 ", len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print("class=1的样本占总样本的百分比 ", len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print("下采样数据后的样本总数： ", len(under_sample_data))

from sklearn.model_selection import train_test_split

# Hold-out splits used by the modelling steps below.

# Whole dataset: test_size=0.3 keeps 30% of the rows for testing and leaves
# the remaining 70% for training.
whole_split = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = whole_split

# Sizes of the whole-dataset split.
whole_train_count = len(X_train)
whole_test_count = len(X_test)
print("Number transactions(交易) train dataset: ", whole_train_count)
print("Number transactions(交易) test dataset: ", whole_test_count)
print("Total number of transactions: ", whole_train_count + whole_test_count)

# The under-sampled data is split the same way.
# Author's note: the very final evaluation is meant to be done on the test
# part of the original (whole) dataset.
undersample_split = train_test_split(
    X_undersample,
    y_undersample,
    test_size=0.3,
    random_state=0,
)
(X_train_undersample, X_test_undersample,
 y_train_undersample, y_test_undersample) = undersample_split

# Sizes of the under-sampled split.
undersample_train_count = len(X_train_undersample)
undersample_test_count = len(X_test_undersample)
print("")
print("Number transactions train dataset: ", undersample_train_count)
print("Number transactions test dataset: ", undersample_test_count)
print("Total number of transactions: ", undersample_train_count + undersample_test_count)

# Recall = TP/(TP+FN)
# recall 召回率。是一种模型评估标准。
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
# KFold 在交叉验证时可以选择进行几倍的交叉验证(train切分成几份)
from sklearn.metrics import recall_score, confusion_matrix


def printing_Kfold_scores(x_train_data, y_train_data,
                          c_param_range=(0.01, 0.1, 1, 10, 100), n_splits=5):
    """Select the LogisticRegression ``C`` with the best mean K-fold recall.

    For every candidate ``C`` an L2-penalised LogisticRegression is trained
    and scored with ``recall_score`` on each fold of an unshuffled K-fold
    split; the per-fold and mean recalls are printed along the way.

    Args:
        x_train_data: Training features as a pandas DataFrame (rows are
            selected positionally with ``.iloc``).
        y_train_data: Training labels; a one-column DataFrame, a Series or
            an array, flattened to 1-D internally.
        c_param_range: Candidate values for the ``C`` regularisation
            parameter. Defaults to the values the script always used.
        n_splits: Number of folds. Defaults to 5.

    Returns:
        The candidate ``C`` (as a float) with the highest mean recall; on a
        tie the earliest candidate wins.

    Raises:
        ValueError: If ``c_param_range`` is empty.
    """
    c_values = list(c_param_range)
    if not c_values:
        raise ValueError("c_param_range must contain at least one value")

    # Split into n_splits consecutive folds without shuffling.
    fold = KFold(n_splits=n_splits, shuffle=False)

    # Flatten the labels once; fold.split yields positional indices, which
    # index this array directly.
    y_all = np.asarray(y_train_data).ravel()

    mean_recalls = []
    for c_param in c_values:
        print('-------------------------------------------')
        print('正则化惩罚力度: ', c_param)
        print('-------------------------------------------\n')
        print('')

        recall_accs = []

        # fold.split yields (training positions, validation positions).
        for iteration, (train_idx, valid_idx) in enumerate(fold.split(x_train_data)):
            # Keep the penalty in sync with the final model fitted by the
            # caller when the confusion matrix is drawn.
            lr = LogisticRegression(C=c_param, penalty='l2', dual=False)
            # lr = LogisticRegression(C = c_param, penalty = 'l1',solver='liblinear')

            # Train on the training part of the fold only.
            lr.fit(x_train_data.iloc[train_idx, :], y_all[train_idx])

            # Predict on the validation part. A DataFrame is passed, as in
            # fit, so the model sees the same column names both times.
            y_pred_undersample = lr.predict(x_train_data.iloc[valid_idx, :])

            # recall_score takes the true labels first, then the predictions.
            recall_acc = recall_score(y_all[valid_idx], y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration, ': 召回率 = ', recall_acc)

        # Mean recall over all folds for this C.
        mean_recall = np.mean(recall_accs)
        mean_recalls.append(mean_recall)
        print('')
        print('平均召回率 ', mean_recall)
        print('')

    # One row per candidate C. The previous index=range(len(...), 2) was an
    # empty range and only worked through pandas' implicit re-indexing.
    results_table = pd.DataFrame({
        'C_parameter': c_values,
        'Mean recall score': mean_recalls,
    })

    # The best parameter is the one with the highest mean recall.
    best_row = results_table['Mean recall score'].astype('float64').idxmax()
    best_c = float(results_table.loc[best_row, 'C_parameter'])

    print('*********************************************************************************')
    print('效果最好的模型所选参数 = ', best_c)
    print('*********************************************************************************')

    return best_c

# Run printing_Kfold_scores on the under-sampled training split and keep the
# C value it returns.
best_c = printing_Kfold_scores(
    x_train_data=X_train_undersample,
    y_train_data=y_train_undersample,
)

# Plot a confusion matrix
def plot_confusion_matrix(cm, classes,
                          title,
                          theme=plt.cm.Blues  # colormap, e.g. Blues / Greens
                         ):
    """Draw the confusion matrix ``cm`` on the current matplotlib figure.

    Args:
        cm: 2-D array-like with ``.max()`` and ``.shape``; rows are drawn
            along the y axis ('True label') and columns along the x axis
            ('Predicted label').
        classes: Tick labels, used for both axes.
        title: Figure title.
        theme: Matplotlib colormap used for the cells.

    The function only draws; the caller is responsible for ``plt.figure()``
    and ``plt.show()``.
    """
    # interpolation='nearest' is one of the available interpolation modes;
    # cmap selects the colour theme (Blues / Oranges / Greens / Reds ...).
    plt.imshow(cm, interpolation='nearest', cmap=theme)
    plt.title(title)
    # Add the colour bar next to the image.
    plt.colorbar()
    # One tick per class on both axes.
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)  # no rotation
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text, the rest black text,
    # so the number contrasts with the cell colour.
    thresh = cm.max() / 2.

    # np.ndindex walks the (row, column) pairs in row-major order and keeps
    # the function independent of an itertools import placed after it.
    for i, j in np.ndindex(cm.shape[0], cm.shape[1]):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Adjust the layout last so the axis labels set above are included.
    plt.tight_layout()




import itertools
# Build the final model with the selected C.
# NOTE: the penalty (l1 / l2) must match the one used in printing_Kfold_scores.
lr = LogisticRegression(C=best_c, penalty='l2')
# Fit on the under-sampled training split.
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# Predict on the under-sampled test split. The DataFrame itself is passed
# (not .values) so prediction sees the same column names as fit did.
y_pred_undersample = lr.predict(X_test_undersample)


# Compute the confusion matrix (true labels first, predictions second).
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)

# Print the evaluation score: recall = TP / (FN + TP)
np.set_printoptions(precision=2)  # two decimals when printing arrays
true_positives = cnf_matrix[1, 1]
false_negatives = cnf_matrix[1, 0]
print("Recall metric(指标) in the testing dataset: ", true_positives / (false_negatives + true_positives))


# Plot the non-normalised confusion matrix.
class_names = [0, 1]  # the two class labels
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Confusion matrix')

plt.show()