import pandas as pd
from imblearn.over_sampling import SMOTE # pip install imblearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import numpy as np
def printing_Kfold_scores(x_train_data, y_train_data):
    """Select the best logistic-regression C by 5-fold cross-validated recall.

    For every candidate C an L1-penalised ``LogisticRegression`` is fitted on
    four folds and scored with ``recall_score`` on the remaining fold; the
    per-fold scores and their mean are printed.

    Args:
        x_train_data: pandas DataFrame of training features (rows are
            selected positionally with ``.iloc``).
        y_train_data: pandas Series (or single-column DataFrame) of labels,
            row-aligned with ``x_train_data``.

    Returns:
        The C value from the candidate list with the highest mean recall.
    """
    from sklearn.model_selection import KFold

    # Split into 5 contiguous folds.
    # NOTE(review): shuffle=False keeps the row order; if the caller's data is
    # grouped by class, folds may be class-skewed -- confirm this is intended.
    fold = KFold(5, shuffle=False)

    # Candidate inverse-regularisation strengths.
    c_param_range = [0.01, 0.1, 1, 10, 100]

    # One row per candidate.  The original used range(len(...), 2), which is
    # an empty range and only worked because pandas rebuilt the index when the
    # first column was assigned.
    result_table = pd.DataFrame(
        index=range(len(c_param_range)),
        columns=['C_parameter', 'Mean recall score'],
    )
    result_table['C_parameter'] = c_param_range

    # Evaluate every candidate C.
    for j, c_param in enumerate(c_param_range):
        print('-------------------------------------------')
        print('C parameter:', c_param)
        print('-------------------------------------------\n')

        recall_accs = []
        for iteration, (train_idx, valid_idx) in enumerate(fold.split(x_train_data)):
            # liblinear with a high max_iter avoids the ConvergenceWarning
            # ("Liblinear failed to converge") noted by the original author.
            lr = LogisticRegression(
                C=c_param, penalty='l1', solver='liblinear', max_iter=10000)

            # KFold yields positional indices, so select rows with .iloc on
            # both features and labels (label-based lookup would break on a
            # non-default index).
            lr.fit(
                x_train_data.iloc[train_idx, :],
                y_train_data.iloc[train_idx].values.ravel(),
            )

            # Predict on the same container type used for fitting so the
            # estimator sees consistent feature names.
            y_pred = lr.predict(x_train_data.iloc[valid_idx, :])

            # Recall on the held-out fold for the current C.
            recall_acc = recall_score(
                y_train_data.iloc[valid_idx].values.ravel(), y_pred)
            recall_accs.append(recall_acc)
            print('Iteration: {}, recall score = {}'.format(iteration, recall_acc))

        # Mean recall across the folds for this C.
        mean_recall = np.mean(recall_accs)
        result_table.loc[j, 'Mean recall score'] = mean_recall
        print('')
        print('Mean recall score ', mean_recall)
        print('')

    # The score column has object dtype, so cast to float before idxmax.
    best_row = result_table['Mean recall score'].astype('float64').idxmax()
    best_c = result_table.loc[best_row, 'C_parameter']

    # Finally, we can check which C parameter is the best amongst the chosen.
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter', best_c)
    print('*********************************************************************************')
    return best_c
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated image on the current figure.

    Args:
        cm: 2-D array of counts (indexed as ``cm[i, j]``, must support
            ``.max()`` and ``.shape``), e.g. the output of
            ``confusion_matrix``.
        classes: Sequence of class labels used as tick labels, e.g. ``[0, 1]``.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the cells.
    """
    import itertools

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum count are drawn dark by the colormap,
    # so their text is white; lighter cells get black text.
    threshold = cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        color = "white" if cm[i, j] > threshold else "black"
        # The original computed `color` but never passed it to plt.text.
        plt.text(j, i, cm[i, j], horizontalalignment="center", color=color)

    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    # Called after the labels are set so they are included in the layout.
    plt.tight_layout()
# Load the data set.
credit_cards = pd.read_csv('creditcard.csv')

# Separate features from the label.  Dropping 'Class' by name (instead of
# slicing off the last column) keeps the label out of the features regardless
# of column order.
features = credit_cards.drop(columns=['Class'])
labels = credit_cards['Class']

# Hold out 20% of the rows as the test set.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)

# Oversample the training set with SMOTE.
# `fit_sample` was deprecated and later removed from imbalanced-learn;
# `fit_resample` is the supported name.
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)

# Cross-validate on the oversampled training data to choose C.
best_c = printing_Kfold_scores(os_features, os_labels)

# Fit the final model with the selected C on the full oversampled training set.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear', max_iter=10000)
lr.fit(os_features, os_labels)

# Predict on the untouched test set; pass the DataFrame so the feature names
# match those seen during fit.
y_pred = lr.predict(features_test)

# Confusion matrix on the test set.
cnf_matrix = confusion_matrix(labels_test, y_pred)

# Report recall = TP / (FN + TP).
np.set_printoptions(precision=2)  # print two decimal places
recall_acc = cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])
print("Recall metric in the testing dataset:", recall_acc)

# Plot the confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names)
plt.show()
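# Source article: 机器学习项目实战——信用卡欺诈检测(过采样代码)
# (Machine-learning project in practice: credit-card fraud detection, oversampling code)
# 最新推荐文章于 2024-07-15 18:42:33 发布 (page footer: latest recommended article published 2024-07-15 18:42:33)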