python 过采样-机器学习项目实战——信用卡欺诈检测(过采样代码)

最新推荐文章于 2024-03-04 20:46:55 发布

weixin_39812046

最新推荐文章于 2024-03-04 20:46:55 发布

阅读量472

点赞数

import pandas as pd

from imblearn.over_sampling import SMOTE # pip install imblearn

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, recall_score

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

import numpy as np

def printing_Kfold_scores(x_train_data, y_train_data):
    """Select the regularisation strength C by 5-fold cross-validated recall.

    For each candidate C an L1-penalised ``LogisticRegression`` (liblinear
    solver) is fitted on the training part of every fold and scored with
    ``recall_score`` on the held-out part. Per-fold and mean recall are
    printed, and the C with the highest mean recall is returned.

    Args:
        x_train_data: Feature table; must support positional ``.iloc``
            indexing (a pandas DataFrame).
        y_train_data: Labels aligned row-for-row with ``x_train_data``
            (a pandas Series or a single-column DataFrame).

    Returns:
        The candidate C whose mean recall over the folds is highest; on
        ties the first such candidate in the list wins.
    """
    from sklearn.model_selection import KFold

    # Five consecutive, unshuffled folds.
    fold = KFold(5, shuffle=False)

    # Candidate regularisation strengths.
    c_param_range = [0.01, 0.1, 1, 10, 100]

    # One row per candidate C. The index must have len(c_param_range) rows;
    # the previous range(len(c_param_range), 2) was an empty range.
    result_table = pd.DataFrame(
        index=range(len(c_param_range)),
        columns=["C_parameter", "Mean recall score"],
    )
    result_table["C_parameter"] = c_param_range

    for row, c_param in enumerate(c_param_range):
        print("-------------------------------------------")
        print("C parameter:", c_param)
        print("------------------------------------------- ")

        recall_accs = []
        # fold.split yields (train_indices, validation_indices) as positions.
        for iteration, (train_idx, valid_idx) in enumerate(fold.split(x_train_data)):
            # solver="liblinear" supports the L1 penalty; max_iter is raised
            # to avoid "Liblinear failed to converge" warnings.
            lr = LogisticRegression(
                C=c_param, penalty="l1", solver="liblinear", max_iter=10000
            )

            # Slice features and labels positionally so both stay aligned
            # regardless of the index labels carried by the inputs.
            lr.fit(
                x_train_data.iloc[train_idx, :],
                y_train_data.iloc[train_idx].values.ravel(),
            )

            # Predict on the same kind of object the model was fitted on so
            # the feature names match.
            y_pred = lr.predict(x_train_data.iloc[valid_idx, :])

            recall_acc = recall_score(
                y_train_data.iloc[valid_idx].values.ravel(), y_pred
            )
            recall_accs.append(recall_acc)
            print("Iteration: {}, recall score = {}".format(iteration, recall_acc))

        # Mean recall over the folds is the score for this C.
        mean_recall = np.mean(recall_accs)
        result_table.loc[row, "Mean recall score"] = mean_recall

        print("")
        print("Mean recall score ", mean_recall)
        print("")

    # The score column has object dtype, so cast before idxmax.
    best_row = result_table["Mean recall score"].astype("float64").idxmax()
    best_c = result_table.loc[best_row]["C_parameter"]

    # Report which C parameter is the best amongst the chosen.
    print("*********************************************************************************")
    print("Best model to choose from cross validation is with C parameter", best_c)
    print("*********************************************************************************")

    return best_c

def plot_confusion_matrix(cm, classes, title="Confusion matrix", cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map with pyplot.

    Args:
        cm: 2-D array of counts as returned by ``confusion_matrix``;
            rows are labelled as true classes, columns as predicted classes.
        classes: Tick labels for both axes, e.g. ``[0, 1]``.
        title: Title shown above the plot.
        cmap: Matplotlib colormap used for the cells.
    """
    import itertools

    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half the maximum count are drawn dark, so their label is
    # white; lighter cells get a black label.
    threshold = cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        color = "white" if cm[i, j] > threshold else "black"
        # The colour must be passed to text(); previously it was computed
        # but never used, leaving every label in the default colour.
        plt.text(j, i, cm[i, j], horizontalalignment="center", color=color)

    plt.tight_layout()
    plt.ylabel("True label")
    plt.xlabel("Predicted label")

# Load the data set.
credit_cards = pd.read_csv("creditcard.csv")

# Split into features and labels; assumes "Class" is the last column.
features = credit_cards.iloc[:, :-1]
labels = credit_cards["Class"]

# Hold out 20% of the rows as the test set.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)

# Oversample the training set only, using SMOTE.
# fit_resample replaces fit_sample, which was removed from imbalanced-learn.
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)

# Cross-validate on the oversampled training data to choose C.
# NOTE(review): the folds are drawn from data that was oversampled beforehand,
# so the cross-validated recall is likely optimistic - confirm whether
# resampling inside each fold is wanted.
best_c = printing_Kfold_scores(os_features, os_labels)

# Refit on the whole oversampled training set with the selected C.
lr = LogisticRegression(C=best_c, penalty="l1", solver="liblinear", max_iter=10000)
lr.fit(os_features, os_labels)

# Predict on the untouched test set; pass the DataFrame so the feature names
# match those seen during fit.
y_pred = lr.predict(features_test)

# Confusion matrix on the test set.
cnf_matrix = confusion_matrix(labels_test, y_pred)

# Recall = TP / (FN + TP).
np.set_printoptions(precision=2)  # two decimals for NumPy array output
recall_acc = cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])
print("Recall metric in the testing dataset:", recall_acc)

# Plot the confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names)
plt.show()

weixin_39812046

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
python 过采样-机器学习项目实战——信用卡欺诈检测(过采样代码)

import pandas as pd
from imblearn.over_sampling import SMOTE # pip install imblearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, recall_score
from skl...
复制链接

扫一扫

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。