机器学习项目实战——信用卡欺诈检测(过采样代码)

最新推荐文章于 2024-07-15 18:42:33 发布

韭浪

最新推荐文章于 2024-07-15 18:42:33 发布

阅读量557

点赞数

文章标签： python 机器学习 逻辑回归 数据分析 深度学习

本文链接：https://blog.csdn.net/weixin_43326122/article/details/106264721

版权

import pandas as pd
from imblearn.over_sampling import SMOTE  # pip install imblearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import numpy as np


def printing_Kfold_scores(x_train_data, y_train_data):
    """Pick the L1 logistic-regression penalty ``C`` with the best mean recall.

    Runs an unshuffled 5-fold cross-validation for every candidate ``C`` in
    ``[0.01, 0.1, 1, 10, 100]``, prints the recall of each fold and the mean
    recall per candidate, and returns the candidate with the highest mean.

    Args:
        x_train_data: Feature matrix (DataFrame or 2-D array-like), one row
            per sample.
        y_train_data: Binary labels aligned row-by-row with ``x_train_data``
            (Series, single-column DataFrame or 1-D array-like).

    Returns:
        float: The ``C`` value whose mean recall over the folds is highest.
    """
    from sklearn.model_selection import KFold

    # KFold yields *positional* indices, so work on plain arrays. Label-based
    # ``y[indices]`` lookups pick the wrong rows (or raise) once the index is
    # not 0..n-1, and fitting on a DataFrame while predicting on ``.values``
    # makes sklearn warn about missing feature names.
    features = np.asarray(x_train_data)
    targets = np.ravel(y_train_data)

    fold = KFold(5, shuffle=False)  # 5 consecutive, unshuffled folds

    # Candidate regularisation strengths
    c_param_range = [0.01, 0.1, 1, 10, 100]

    # One row per candidate C; the score column is filled in below and stays
    # numeric so idxmax() works without a dtype cast.
    result_table = pd.DataFrame({
        'C_parameter': c_param_range,
        'Mean recall score': np.nan,
    })

    for row, c_param in enumerate(c_param_range):
        print('-------------------------------------------')
        print('C parameter:', c_param)
        print('-------------------------------------------\n')

        recall_accs = []
        for iteration, (train_idx, valid_idx) in enumerate(fold.split(features)):
            # liblinear is used with the L1 penalty; max_iter is raised
            # because the solver emitted ConvergenceWarning otherwise.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear', max_iter=10000)
            lr.fit(features[train_idx], targets[train_idx])

            # Score the held-out fold by recall for the current C
            y_pred = lr.predict(features[valid_idx])
            recall_acc = recall_score(targets[valid_idx], y_pred)
            recall_accs.append(recall_acc)
            print('Iteration: {}, recall score = {}'.format(iteration, recall_acc))

        # Mean recall across the folds for this candidate
        mean_recall = np.mean(recall_accs)
        result_table.loc[row, 'Mean recall score'] = mean_recall
        print('')
        print('Mean recall score ', mean_recall)
        print('')

    best_row = result_table['Mean recall score'].idxmax()
    best_c = float(result_table.loc[best_row, 'C_parameter'])
    # Finally, we can check which C parameter is the best amongst the chosen.
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter', best_c)
    print('*********************************************************************************')

    return best_c


def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    The function only draws; it does not create a figure or call
    ``plt.show()``.

    Args:
        cm: Confusion matrix as a 2-D NumPy array (indexed ``cm[i, j]``).
        classes: Class labels used as tick labels on both axes, e.g. ``[0, 1]``.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the cells.
    """
    import itertools

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half the maximum count get white text, the rest black, so
    # the numbers stay readable on both ends of the (default blue) colormap.
    threshold = cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        color = "white" if cm[i, j] > threshold else "black"
        # The computed colour must actually be handed to text(); without it
        # every annotation is drawn in the default colour.
        plt.text(j, i, cm[i, j], horizontalalignment="center", color=color)

    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()


# Load the data set
credit_cards = pd.read_csv('creditcard.csv')
# Split into features (every column but the last) and the 'Class' label
# NOTE(review): assumes 'Class' is the last column of the CSV - confirm.
features = credit_cards.iloc[:, :-1]
labels = credit_cards['Class']

# Hold out 20% of the rows as the test set
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)

# Oversample the training split only, using SMOTE.
# fit_resample is the current imbalanced-learn API; the older fit_sample
# alias has been removed from the library.
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)


# Cross-validate on the oversampled data to choose C
best_c = printing_Kfold_scores(os_features, os_labels)
# Refit on the full oversampled training data with the chosen C
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear', max_iter=10000)
lr.fit(os_features, os_labels)
# Predict on the test features in the same container type used for fitting,
# so sklearn does not warn about mismatched feature names.
y_pred = lr.predict(features_test)

# Confusion matrix on the untouched test split
cnf_matrix = confusion_matrix(labels_test, y_pred)
# recall = TP / (FN + TP)
np.set_printoptions(precision=2)  # NumPy print precision: two decimals
recall_acc = cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])
print("Recall metric in the testing dataset:", recall_acc)

# Plot the confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names)
plt.show()