creditcard.csv文件可以在CSDN下载,
https://download.csdn.net/download/bbqqlover/10179806
两个common func的代码,
_common_func.py
'''
Created on 2020年4月8日
@author: Lenovo
'''
import numpy as np
import pandas as pd
# 引入逻辑回归模型
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score
# 引入pyplot模块
import matplotlib.pyplot as plt
import itertools
# 交叉训练,并返回最佳C值的函数
def print_KFold_scores(x_train_data, y_train_data):
    """Select the regularisation parameter C with the best mean CV recall.

    For every C in [0.01, 0.1, 1, 10, 100] an L1-penalised logistic
    regression (liblinear solver) is evaluated with an unshuffled 5-fold
    split. The recall of each fold and the mean recall per C are printed.

    :param x_train_data: feature DataFrame; rows are selected with ``.iloc``
    :param y_train_data: label DataFrame, row-aligned with ``x_train_data``;
        it is flattened with ``ravel()``, so a single label column is expected
    :return: the C value with the highest mean recall (the first one on ties)
    """
    fold = KFold(n_splits=5, shuffle=False)
    c_param_range = [0.01, 0.1, 1, 10, 100]

    # One row per candidate C. The index must cover every row up front so the
    # ``.loc[row, ...]`` writes below target existing rows.
    results_table = pd.DataFrame(
        index=range(len(c_param_range)),
        columns=['C_parameter', 'Mean recall score'],
    )
    results_table['C_parameter'] = c_param_range

    # Cross-validate each penalty strength in turn.
    for row, c_param in enumerate(c_param_range):
        print("-------------------------------------------------------")
        print('C Parameter:', c_param)
        print("-------------------------------------------------------")
        print('')

        recall_accs = []
        for it, (train_idx, test_idx) in enumerate(fold.split(x_train_data)):
            # Logistic regression with L1 regularisation.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            lr.fit(
                x_train_data.iloc[train_idx, :],
                y_train_data.iloc[train_idx, :].values.ravel(),
            )
            # Predict on the same container type used for fitting so the
            # estimator sees consistent feature names.
            y_pred_undersample = lr.predict(x_train_data.iloc[test_idx, :])
            # Recall for this fold of the cross-validation.
            recall_acc = recall_score(
                y_train_data.iloc[test_idx, :].values.ravel(),
                y_pred_undersample,
            )
            recall_accs.append(recall_acc)
            print(f"Iteration: {it}, recall = {recall_acc}")

        # Mean recall for this penalty strength.
        mean_recall = np.mean(recall_accs)
        results_table.loc[row, "Mean recall score"] = mean_recall
        print('')
        print('Mean recall score ', mean_recall)
        print('')

    # The score column holds Python objects, so cast before taking the argmax.
    best_row = results_table["Mean recall score"].astype(float).idxmax()
    best_c = results_table.loc[best_row]['C_parameter']
    print("*******************************************************************************")
    print("最佳C参数模型 = ", best_c)
    print("*******************************************************************************")
    return best_c
# 打印混淆矩阵的函数
def plot_confusion_matrix(cm, classes, title="Confusion Matrix", cmap=plt.cm.Blues):
    '''
    Draw a confusion matrix as an annotated heat map on the current axes.

    :param cm: confusion matrix, indexed as ``cm[true, predicted]``
    :param classes: class labels used as tick labels on both axes
    :param title: title of the plot
    :param cmap: matplotlib colour map used for the cells
    '''
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class on both axes.
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the largest count get white text, the rest black.
    thresh = cm.max() / 2.

    # Write each count into its cell: x is the column (predicted label),
    # y is the row (true label).
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out only after the axis labels exist so they are taken into account.
    plt.tight_layout()
加载数据,1_loaddata.py
'''
Created on 2020年4月7日
@author: Lenovo
'''
from sklearn.preprocessing import StandardScaler
from const import CSV_PATH
import pandas as pd
# Load the data set.
data = pd.read_csv(CSV_PATH)

# Quick overview: shape and first rows.
print(data.shape)
print(data.head())

# Number of rows per class, ordered by class label.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_class = data['Class'].value_counts(sort=True).sort_index()
print(count_class)

# Standardise Amount into a new normAmount column.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

# Drop the Time column and the original, unscaled Amount column.
data = data.drop(['Time', 'Amount'], axis=1)
print(data.head())
程序输出如下,0有28万个,1只有不到500个,数据不均衡。
下采样训练代码,2_downsample_train.py
'''
Created on 2020年4月8日
@author: Lenovo
'''
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from card._common_func import print_KFold_scores
from const import CSV_PATH
import pandas as pd
# 忽略SKLearn警告
# import warnings
# warnings.filterwarnings('ignore')
# Undersampling example: balance the classes, then search for the best C.

# Fixed seed so every run draws the same undersample; it matches the
# random_state passed to train_test_split below.
RANDOM_STATE = 0

# Load the data set.
data = pd.read_csv(CSV_PATH)
# Standardise Amount into a new normAmount column.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
# Drop the Time column and the original, unscaled Amount column.
data = data.drop(['Time', 'Amount'], axis=1)

# Features: every column except Class. Labels: the Class column only.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Number of fraud rows (Class == 1).
num_records_fraud = len(data[data.Class == 1])
print(num_records_fraud)

# Index labels of the fraud rows.
fraud_indexes = np.array(data[data.Class == 1].index)
# Draw as many normal rows as there are fraud rows, without replacement.
normal_indexes = data[data.Class == 0].index
rng = np.random.default_rng(RANDOM_STATE)
random_normal_indexes = rng.choice(normal_indexes.to_numpy(), num_records_fraud, replace=False)

# Combine both groups. The values are index labels, so select with .loc.
under_sample_index = np.concatenate([fraud_indexes, random_normal_indexes])
under_sample_data = data.loc[under_sample_index, :]
X_under_sample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_under_sample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Share of fraud rows in the undersample.
print(len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))

# Split the undersample 7:3 into training and test sets.
X_undersample_train, X_undersample_test, y_undersample_train, y_undersample_test = train_test_split(
    X_under_sample, y_under_sample, test_size=0.3, random_state=RANDOM_STATE)

# Cross-validate logistic regression on the undersampled training set and
# report the best penalty parameter C.
best_c = print_KFold_scores(X_undersample_train, y_undersample_train)
程序输出如下,
可见,对5个候选C值 [0.01, 0.1, 1, 10, 100] 分别进行5折交叉验证后,最佳C参数是 0.01
使用下采样得到的最佳C参数,预测下采样测试集,绘制混淆矩阵
3_downsample_predict.py
'''
Created on 2020年4月8日
@author: Lenovo
'''
from sklearn.linear_model import LogisticRegression
from sklearn.metrics._classification import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from card._common_func import plot_confusion_matrix
from const import CSV_PATH
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# 引入逻辑回归模型
# Fixed seed so every run draws the same undersample; it matches the
# random_state passed to train_test_split below.
RANDOM_STATE = 0

# Load the data set.
data = pd.read_csv(CSV_PATH)
# Standardise Amount into a new normAmount column.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
# Drop the Time column and the original, unscaled Amount column.
data = data.drop(['Time', 'Amount'], axis=1)

# Number of fraud rows (Class == 1).
num_records_fraud = len(data[data.Class == 1])
print(num_records_fraud)

# Index labels of the fraud rows.
fraud_indexes = np.array(data[data.Class == 1].index)
# Draw as many normal rows as there are fraud rows, without replacement.
normal_indexes = data[data.Class == 0].index
rng = np.random.default_rng(RANDOM_STATE)
random_normal_indexes = rng.choice(normal_indexes.to_numpy(), num_records_fraud, replace=False)

# Combine both groups. The values are index labels, so select with .loc.
under_sample_index = np.concatenate([fraud_indexes, random_normal_indexes])
under_sample_data = data.loc[under_sample_index, :]
X_under_sample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_under_sample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Split the undersample 7:3 into training and test sets.
X_undersample_train, X_undersample_test, y_undersample_train, y_undersample_test = train_test_split(
    X_under_sample, y_under_sample, test_size=0.3, random_state=RANDOM_STATE)

# Fit on the undersampled training set, then predict the undersampled test set.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_undersample_train, y_undersample_train.values.ravel())
y_pred_undersample = lr.predict(X_undersample_test)

# Confusion matrix from the true and the predicted labels.
cnf_matrix = confusion_matrix(y_undersample_test, y_pred_undersample)
# Print numpy values with two decimals.
np.set_printoptions(precision=2)
# Recall = TP / (TP + FN), read from the Class == 1 row of the matrix.
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot the confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
程序输出如下,可以看出,此次预测,数据量较小,虽然recall达到了 0.91,但是由于FP值较多,精度不高,不足0.9
使用下采样最佳C参数 预测全测试集,4_downsample_predict_all.py
'''
Created on 2020年4月8日
@author: Lenovo
'''
'''
Created on 2020年4月8日
@author: Lenovo
'''
from sklearn.linear_model import LogisticRegression
from sklearn.metrics._classification import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from card._common_func import plot_confusion_matrix
from const import CSV_PATH
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Train on the undersample, then predict the test split of the full data set.

# Fixed seed so every run draws the same undersample; it matches the
# random_state passed to train_test_split below.
RANDOM_STATE = 0

# Load the data set.
data = pd.read_csv(CSV_PATH)
# Standardise Amount into a new normAmount column.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
# Drop the Time column and the original, unscaled Amount column.
data = data.drop(['Time', 'Amount'], axis=1)

# Features: every column except Class. Labels: the Class column only.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Number of fraud rows (Class == 1).
num_records_fraud = len(data[data.Class == 1])
print(num_records_fraud)

# Index labels of the fraud rows.
fraud_indexes = np.array(data[data.Class == 1].index)
# Draw as many normal rows as there are fraud rows, without replacement.
normal_indexes = data[data.Class == 0].index
rng = np.random.default_rng(RANDOM_STATE)
random_normal_indexes = rng.choice(normal_indexes.to_numpy(), num_records_fraud, replace=False)

# Combine both groups. The values are index labels, so select with .loc.
under_sample_index = np.concatenate([fraud_indexes, random_normal_indexes])
under_sample_data = data.loc[under_sample_index, :]
X_under_sample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_under_sample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# 7:3 split of the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE)
# 7:3 split of the undersample.
X_undersample_train, X_undersample_test, y_undersample_train, y_undersample_test = train_test_split(
    X_under_sample, y_under_sample, test_size=0.3, random_state=RANDOM_STATE)

# Fit on the undersampled training set.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_undersample_train, y_undersample_train.values.ravel())
# Predict the full test set. Pass the DataFrame, as in fit, so the
# estimator sees consistent feature names.
y_pred = lr.predict(X_test)

# Confusion matrix from the true and the predicted labels.
cnf_matrix = confusion_matrix(y_test, y_pred)
# Print numpy values with two decimals.
np.set_printoptions(precision=2)
# Recall = TP / (TP + FN), read from the Class == 1 row of the matrix.
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot the confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
程序输出如下,Recall 0.93,精度惨不忍睹, FP太多
下采样模型的threshold(限值)设置。之前是直接输出0,1类别,此处输出概率,概率大于限值,认为是正类,否则认为是负类。
5_downsample_threshold.py
'''
Created on 2020年4月8日
@author: Lenovo
'''
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics._classification import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from card._common_func import plot_confusion_matrix
from const import CSV_PATH
# Fixed seed so every run draws the same undersample; it matches the
# random_state passed to train_test_split below.
RANDOM_STATE = 0

# Load the data set.
data = pd.read_csv(CSV_PATH)
# Standardise Amount into a new normAmount column.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
# Drop the Time column and the original, unscaled Amount column.
data = data.drop(['Time', 'Amount'], axis=1)

# Number of fraud rows (Class == 1).
num_records_fraud = len(data[data.Class == 1])
print(num_records_fraud)

# Index labels of the fraud rows.
fraud_indexes = np.array(data[data.Class == 1].index)
# Draw as many normal rows as there are fraud rows, without replacement.
normal_indexes = data[data.Class == 0].index
rng = np.random.default_rng(RANDOM_STATE)
random_normal_indexes = rng.choice(normal_indexes.to_numpy(), num_records_fraud, replace=False)

# Combine both groups. The values are index labels, so select with .loc.
under_sample_index = np.concatenate([fraud_indexes, random_normal_indexes])
under_sample_data = data.loc[under_sample_index, :]
X_under_sample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_under_sample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Split the undersample 7:3 into training and test sets.
X_undersample_train, X_undersample_test, y_undersample_train, y_undersample_test = train_test_split(
    X_under_sample, y_under_sample, test_size=0.3, random_state=RANDOM_STATE)

# Threshold demonstration on the undersampled data.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_undersample_train, y_undersample_train.values.ravel())
# Class probabilities instead of hard 0/1 labels. Pass the DataFrame, as in
# fit, so the estimator sees consistent feature names.
y_pred_undersample_proba = lr.predict_proba(X_undersample_test)
np.set_printoptions(precision=2)

# A sample counts as positive when its class-1 probability exceeds the
# threshold, and as negative otherwise.
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
class_names = [0, 1]
for j, threshold in enumerate(thresholds, start=1):
    # Convert the probabilities into 0/1 predictions for this threshold.
    y_test_predictions = (y_pred_undersample_proba[:, 1] > threshold).astype(int)
    # j-th panel of a 3x3 grid.
    plt.subplot(3, 3, j)
    cnf_matrix = confusion_matrix(y_undersample_test, y_test_predictions)
    # Recall = TP / (TP + FN) at this threshold.
    print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
    # The title states the strict comparison that is actually applied.
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold > %s' % threshold)
plt.show()
程序输出如下,可以看出,将threshold设置成0.5, 0.6时,FP和FN都比较小,此时precision和recall达到平衡。再更大时,FN增加,recall降低。
全集训练,寻找best-c,不做采样
6_all_sample_train.py
'''
Created on 2020年4月8日
@author: Lenovo
'''
'''
Created on 2020年4月8日
@author: Lenovo
'''
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from card._common_func import print_KFold_scores
from const import CSV_PATH
import numpy as np
import pandas as pd
# Search for the best C on the full, unbalanced training set (no resampling).

# Load the data set.
data = pd.read_csv(CSV_PATH)
# Standardise Amount into a new normAmount column.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
# Drop the Time column and the original, unscaled Amount column.
data = data.drop(['Time', 'Amount'], axis=1)

# Features: every column except Class. Labels: the Class column only.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Number of fraud rows (Class == 1), printed for reference.
num_records_fraud = len(data[data.Class == 1])
print(num_records_fraud)

# 7:3 split of the full data set. No undersample is built here because this
# script only ever trains on the full training split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Cross-validate logistic regression on the full training set and report the
# best penalty parameter C.
best_c = print_KFold_scores(X_train, y_train)
程序输出如下,
可以看出,因为样本不均衡,全量训练时,recall较低,不可采用
使用smote算法过采样,寻找最佳C参数
7_smote_train.py
'''
Created on 2020年4月8日
@author: Lenovo
'''
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from card._common_func import print_KFold_scores
from imblearn.over_sampling import SMOTE
from const import CSV_PATH
import pandas as pd
# Oversampling: generate extra training samples with SMOTE, then search for
# the best C on the oversampled training set.

# Load the data set.
data = pd.read_csv(CSV_PATH)

# Standardise Amount into a new normAmount column.
amount_column = data['Amount'].values.reshape(-1, 1)
data['normAmount'] = StandardScaler().fit_transform(amount_column)

# Drop the Time column and the original, unscaled Amount column.
data = data.drop(columns=['Time', 'Amount'])

# Labels are the Class column; features are every other column.
is_label_column = data.columns == 'Class'
features = data.loc[:, ~is_label_column]
labels = data.loc[:, is_label_column]

# Hold out 20% of the rows as the test set.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
)

# Oversample the training split only, using SMOTE.
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)

# NOTE(review): print_KFold_scores splits with KFold(shuffle=False). If SMOTE
# appends its synthetic rows after the original ones, the folds would have
# very different class ratios - confirm, and consider shuffling first.
best_c = print_KFold_scores(os_features, os_labels)
输出如下,可以看出过采样最佳C参数为100,
使用过采样训练得到的模型预测原始(未过采样的)测试集,
8_smote_predict.py
'''
Created on 2020年4月8日
@author: Lenovo
'''
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics._classification import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from card._common_func import plot_confusion_matrix
from const import CSV_PATH
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Oversampling: train with the chosen C on SMOTE data, then predict the test set.

# Load the data set.
data = pd.read_csv(CSV_PATH)
# Standardise Amount into a new normAmount column.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
# Drop the Time column and the original, unscaled Amount column.
data = data.drop(['Time', 'Amount'], axis=1)

# Features: every column except Class. Labels: the Class column only.
features = data.loc[:, data.columns != 'Class']
labels = data.loc[:, data.columns == 'Class']

# Hold out 20% of the rows as the test set.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)

# Oversample the training split only, using SMOTE.
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)

# Fit on the complete oversampled training set.
lr = LogisticRegression(C=100, penalty='l1', solver='liblinear')
lr.fit(os_features, os_labels.values.ravel())
# Predict the held-out test set. Pass the DataFrame, as in fit, so the
# estimator sees consistent feature names.
y_pred = lr.predict(features_test)

# Confusion matrix from the true and the predicted labels.
cnf_matrix = confusion_matrix(labels_test, y_pred)
np.set_printoptions(precision=2)
# Recall = TP / (TP + FN), read from the Class == 1 row of the matrix.
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot the confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
程序输出如下,
可以看到,使用过采样准确率和召回率都达到了一个比较好的值。