import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report
from imblearn.over_sampling import SMOTE
# Load creditcard.csv and print the table's shape and column names.
data = pd.read_csv('creditcard.csv')
print(data.shape)
print(data.columns)
# print(data.head(100))
# Count the rows per 'Class' value (most frequent first) and draw the counts
# as a bar chart. Series.value_counts is used because the top-level
# pd.value_counts function is deprecated in recent pandas releases.
count_classes = data['Class'].value_counts(sort=True)
count_classes.plot(kind='bar')
plt.title('Fraud class histogram')
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.show()
# Standardize the 'Amount' column with StandardScaler and store the result
# as 'new_Amount'. The double-bracket selection yields the (n_rows, 1)
# array that the scaler expects.
data['new_Amount'] = StandardScaler().fit_transform(
    data[['Amount']].values
)
# Drop the columns that are not used from here on.
data = data.drop(columns=['Time', 'Amount'])
# Features: every column except 'Class'. Target: a one-column frame
# holding only 'Class'.
X = data.drop(columns='Class')
y = data[['Class']]
# Index labels and count of the fraud rows (Class == 1).
fraud_index = np.array(data[data.Class == 1].index)
number_records_fraud = len(fraud_index)
# Index labels and count of the normal rows (Class == 0).
normal_index = data[data.Class == 0].index
number_records_normal = len(normal_index)
# Under-sampling: draw, without replacement, as many normal rows as there
# are fraud rows so that both classes end up the same size.
# NOTE(review): the draw uses NumPy's global RNG without a seed, so the
# selected rows differ from run to run.
random_normal_index = np.random.choice(
    normal_index, number_records_fraud, replace=False
)
# Original author's note: len(random_normal_index) was 492 on their data.
# Join the sampled normal labels with the fraud labels into one array.
under_sample_index = np.concatenate([random_normal_index, fraud_index])
# Original author's note: len(under_sample_index) was 984 on their data.
# The array holds index *labels*, so rows are selected with .loc. Positional
# .iloc only gives the same rows while the index is the default 0..n-1 range.
under_sample_data = data.loc[under_sample_index]
# Features: every column except 'Class'. Target: a one-column frame
# holding only 'Class'.
X_under_sample_data = under_sample_data.drop(columns='Class')
y_under_sample_data = under_sample_data[['Class']]
# Equivalent boolean-mask spelling (the column mask must go in the second
# .loc position, otherwise it is applied to rows):
# X_under_sample_data = under_sample_data.loc[:, under_sample_data.columns != 'Class']
# y_under_sample_data = under_sample_data.loc[:, under_sample_data.columns == 'Class']
# Whole dataset: hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
# Under-sampled dataset: same 70/30 split and seed.
(
    X_train_undersample,
    X_test_undersample,
    y_train_undersample,
    y_test_undersample,
) = train_test_split(
    X_under_sample_data, y_under_sample_data, test_size=0.3, random_state=0
)
def printing_Kfold_scores(x_train_data, y_train_data):
    """Print 5-fold cross-validated recall of an L1 logistic regression.

    For each C value in a fixed grid, the model is fitted on the training
    part of every fold and scored with recall on the held-out part. The
    per-fold recall and the mean recall for that C are printed.

    Args:
        x_train_data: pandas DataFrame of features; rows are selected
            positionally with ``.iloc``.
        y_train_data: pandas DataFrame of labels whose rows line up with
            ``x_train_data``.

    Returns:
        None. Results are only printed.
    """
    # Unshuffled 5-fold splitter; every split yields a pair of positional
    # index arrays: (train positions, test positions).
    kfold = KFold(n_splits=5, shuffle=False)
    # Candidate values for the regularization parameter C.
    c_param_range = [0.01, 0.1, 1, 10, 100]
    for c_param in c_param_range:
        print('-----------------------------------')
        print('C Parameter:', c_param)
        print('-----------------------------------')
        print('')
        recall_accs = []
        for iteration, (train_idx, test_idx) in enumerate(
            kfold.split(x_train_data), start=1
        ):
            # The solver is set explicitly: the L1 penalty is not supported
            # by the default 'lbfgs' solver of newer scikit-learn releases.
            lr = LogisticRegression(
                C=c_param, penalty='l1', solver='liblinear'
            )
            # Fit on the training part of this fold.
            lr.fit(
                x_train_data.iloc[train_idx, :].values,
                y_train_data.iloc[train_idx, :].values.ravel(),
            )
            # Predict the held-out part of this fold and score recall.
            y_predicted_undersample = lr.predict(
                x_train_data.iloc[test_idx, :].values
            )
            # Labels are flattened to 1-D to match the prediction array.
            recall_acc = recall_score(
                y_train_data.iloc[test_idx, :].values.ravel(),
                y_predicted_undersample,
            )
            recall_accs.append(recall_acc)
            print('Iteration:', iteration, ': Recall Score = ', recall_acc)
        print('Mean Recall Score:', np.mean(recall_accs))
# y_predicted_undersample = printing_Kfold_scores(X_train_undersample, y_train_undersample)
# y_predicted_undersample = printing_Kfold_scores(X, y_train_undersample)
# Fit an L1 logistic regression (C=0.01) on the training part of each of
# 5 unshuffled folds of the under-sampled training set, and measure recall
# on the test split of the whole dataset.
# NOTE(review): only the training positions of each fold are used; the
# held-out positions are ignored. X_test comes from a separate split of the
# full data, so it may contain rows that are also in the under-sampled
# training set. Confirm this evaluation is intended.
kfold = KFold(n_splits=5, shuffle=False)
recall_accs = []
for iteration, indexs in enumerate(kfold.split(X_train_undersample), start=1):
    train_positions = indexs[0]
    # The solver is set explicitly: the L1 penalty is not supported by the
    # default 'lbfgs' solver of newer scikit-learn releases.
    lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
    lr.fit(
        X_train_undersample.iloc[train_positions, :].values,
        y_train_undersample.iloc[train_positions, :].values.ravel(),
    )
    # Alternative: predict the under-sampled test split.
    # y_predicted_labels = lr.predict(X_test_undersample.values)
    # recall_acc = recall_score(y_test_undersample, y_predicted_labels)
    # Active variant: predict the test split of the whole dataset.
    y_predicted_labels = lr.predict(X_test.values)
    # Labels are flattened to 1-D to match the prediction array.
    recall_acc = recall_score(y_test.values.ravel(), y_predicted_labels)
    # Alternative: over-sampled data.
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    # over_sampler = SMOTE(random_state=0)
    # os_X, os_y = over_sampler.fit_sample(X_train, y_train)
    # y_predicted_labels = lr.predict(X_test.values)
    # recall_acc = recall_score(y_test, y_predicted_labels)
    print('Recall:', recall_acc)
    recall_accs.append(recall_acc)
print('Recall Means:', np.mean(recall_accs))