import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the credit-card transaction data set.
DATA = pd.read_csv("creditcard.csv")
# Check whether the classes are balanced ---------------------------------------
# Series.value_counts replaces the deprecated top-level pd.value_counts;
# sort_index orders the bars by class label instead of by frequency.
count_classes = DATA["Class"].value_counts().sort_index()
count_classes.plot(kind="bar")  # pandas can draw simple charts directly
plt.show()
# Feature standardisation ------------------------------------------------------
from sklearn.preprocessing import StandardScaler

# The scaler works on 2-D input, so the Amount column is reshaped into a
# single-column matrix; -1 lets NumPy infer the number of rows.
amount_column = DATA["Amount"].values.reshape(-1, 1)
amount_scaler = StandardScaler()
DATA["normAmount"] = amount_scaler.fit_transform(amount_column)
# The raw Time and Amount columns are removed once normAmount exists.
DATA = DATA.drop(columns=["Time", "Amount"])
# Under-sampling ---------------------------------------------------------------
# Under-sampling keeps every row of the smaller class (Class == 1) and draws an
# equally sized random subset from the larger class (Class == 0).
DATA_matrix = DATA.values
X = DATA_matrix[:, DATA.columns != "Class"]
y = DATA_matrix[:, DATA.columns == "Class"]  # boolean column mask keeps y 2-D

fraud_indices = np.array(DATA[DATA["Class"] == 1].index)
norm_indices = np.array(DATA[DATA["Class"] == 0].index)
number_records_fraud = len(fraud_indices)

# A seeded generator makes the drawn subset reproducible, in line with the
# random_state=0 used by the other random steps of this script.
# replace=False samples without replacement.
rng = np.random.default_rng(0)
random_norm_indics = rng.choice(norm_indices, size=number_records_fraud, replace=False)

under_sample_indices = np.concatenate([fraud_indices, random_norm_indics])
# The collected values are index *labels*, so rows are selected with .loc
# (label-based) rather than .iloc (position-based).
under_sample = DATA.loc[under_sample_indices, :]
x_under_sample = under_sample.values[:, DATA.columns != "Class"]
y_under_sample = under_sample.values[:, DATA.columns == "Class"]
# Train/test split -------------------------------------------------------------
from sklearn.model_selection import train_test_split

# train_test_split takes the feature matrix and the label matrix and returns
# four pieces: train features, test features, train labels, test labels, with
# rows kept aligned between features and labels.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Hold out 30% of the under-sampled rows for testing, matching the split of the
# full data set above (train_size=0.3 would train on only 30% of the rows).
x_train_undersample, x_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    x_under_sample,
    y_under_sample,
    test_size=0.3,
    random_state=0,
)
# The original data set is split as well because the final evaluation is run
# on original (not resampled) data.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, recall_score
def printing_kfold_score(x_train_data, y_train_data):
    """Pick the regularisation strength C with the best cross-validated recall.

    Each candidate C is evaluated with an L1-penalised logistic regression
    under 5-fold cross-validation (folds are not shuffled). The recall of every
    fold, the mean recall per C and the summary table are printed.

    Args:
        x_train_data: Training features; anything ``pd.DataFrame`` accepts.
        y_train_data: Training labels, aligned row-for-row with
            ``x_train_data``.

    Returns:
        The candidate C value whose mean recall over the folds is highest.
    """
    fold = KFold(5, shuffle=False)
    c_param_range = [0.01, 0.1, 1, 10, 100]
    results_table = pd.DataFrame(columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range

    # Convert once, up front, so positional .iloc indexing works whether the
    # caller passed arrays or DataFrames.
    features = pd.DataFrame(x_train_data)
    labels = pd.DataFrame(y_train_data)

    for j, c_param in enumerate(c_param_range):
        recall_accs = []
        # split() yields one (train positions, test positions) pair per fold.
        for train_indices, test_indices in fold.split(features):
            # liblinear supports the L1 penalty; the default lbfgs solver
            # rejects it.
            lr = LogisticRegression(C=c_param, penalty="l1", solver="liblinear")
            # Train on exactly the training fold: shifting the positions by
            # one would pull held-out rows into the training data.
            lr.fit(
                features.iloc[train_indices, :].values,
                labels.iloc[train_indices, :].values.ravel(),
            )
            y_pred_undersample = lr.predict(features.iloc[test_indices, :].values)
            # recall_score takes (y_true, y_pred).
            recall_acc = recall_score(
                labels.iloc[test_indices, :].values.ravel(),
                y_pred_undersample,
            )
            recall_accs.append(recall_acc)
            print("第%s次" % (j + 1))
            print(recall_acc)
        mean_recall = np.mean(recall_accs)
        results_table.loc[j, "Mean recall score"] = mean_recall
        print(mean_recall)

    print(results_table.head())
    # The score column has object dtype, so cast to float before idxmax.
    best_row = results_table["Mean recall score"].astype("float").idxmax()
    best_c = results_table.loc[best_row]["C_parameter"]
    print(best_c)
    return best_c
# Choose C by running printing_kfold_score on the under-sampled training split.
best_c = printing_kfold_score(x_train_undersample, y_train_undersample)
# 混淆矩阵-----------------------------------------------------------------------------------------------------------------
import itertools#itertools中的product()函数可以返回两个参数的笛卡尔积的元组
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current axes.

    Args:
        cm: 2-D array of counts, indexed as ``cm[row, column]``.
        classes: Tick labels used for both axes.
        title: Title placed above the plot.
        cmap: Matplotlib colour map used for the cells.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    # Cells whose value exceeds half the maximum get white text, the rest black.
    thresh = cm.max() / 2.
    print(thresh)
    # product() yields every (row, column) pair, e.g. (0,0) (0,1) (1,0) (1,1).
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        print(i, j)
        # plt.text takes the x position (column) first, then y (row).
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()  # automatically adjust the outer margins of the figure
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Fit on the under-sampled training split with the selected C and evaluate on
# the under-sampled test split.
# liblinear supports the L1 penalty; the default lbfgs solver rejects it.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(x_train_undersample, y_train_undersample.ravel())
y_pred_undersample = lr.predict(x_test_undersample)
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)  # (y_true, y_pred)
np.set_printoptions(precision=2)  # controls the precision of printed arrays
# Recall = cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]), i.e. the
# share of true class-1 rows that were predicted as class 1.
print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
# Fit on the under-sampled training split, then evaluate on the test split of
# the original (not resampled) data.
# liblinear supports the L1 penalty; the default lbfgs solver rejects it.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(x_train_undersample, y_train_undersample.ravel())
y_pred = lr.predict(x_test)
cnf_matrix = confusion_matrix(y_test, y_pred)  # (y_true, y_pred)
np.set_printoptions(precision=2)
print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
# Choosing a decision threshold ------------------------------------------------
# liblinear supports the L1 penalty; the default lbfgs solver rejects it.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(x_train_undersample, y_train_undersample.ravel())
y_pred_undersample_proba = lr.predict_proba(x_test_undersample)
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
plt.figure(figsize=(10, 10))
class_names = [0, 1]
np.set_printoptions(precision=2)
# One subplot per threshold, laid out on a 3 x 3 grid (positions start at 1).
for position, threshold in enumerate(thresholds, start=1):
    # Column 1 of predict_proba is compared against the threshold; note the
    # comparison is strict (>) although the subplot title reads ">=".
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > threshold
    plt.subplot(3, 3, position)
    # confusion_matrix is given the boolean prediction array directly.
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold >= %s' % threshold)
plt.show()
# Over-sampling ----------------------------------------------------------------
# Over-sampling generates additional rows for the smaller class so that both
# classes end up balanced.
from imblearn.over_sampling import SMOTE  # SMOTE generates the new samples

DATA = pd.read_csv("creditcard.csv")
columns = DATA.columns
# Remove the label column by name rather than by position, so the features can
# never contain "Class" regardless of the column order in the file.
features_columns = columns.drop("Class")
features = DATA[features_columns]
label = DATA["Class"]
feature_train, feature_test, label_train, label_test = train_test_split(features, label, test_size=0.2, random_state=0)
# Only the training split is resampled; fit_resample takes the training
# features and training labels and returns the resampled pair.
Oversample = SMOTE(random_state=0)
os_features, os_labels = Oversample.fit_resample(feature_train, label_train)
print(len(os_labels[os_labels == 1]))
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)
best_c = printing_kfold_score(os_features, os_labels)
# liblinear supports the L1 penalty; the default lbfgs solver rejects it.
lr = LogisticRegression(C=best_c, penalty="l1", solver="liblinear")
# Fit and predict on plain arrays so both calls see the same kind of input.
lr.fit(os_features.values, os_labels.values.ravel())
y_pred = lr.predict(feature_test.values)
conf_matrix = confusion_matrix(label_test, y_pred)  # (y_true, y_pred)
np.set_printoptions(precision=2)
print("recall = %s " % (conf_matrix[1, 1] / (conf_matrix[1, 0] + conf_matrix[1, 1])))
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(conf_matrix, classes=class_names, title="Confusion matrix")
plt.show()
解决数据不均衡问题的两个方法:下采样和过采样。使用下采样虽然recall值比较高,但是会有较高的误杀率。使用过采样recall值可能会降低但是accuracy会上升,误杀会下降。
SMOTE算法的原理是:
1.对于少数类中每一个样本x,以欧氏距离为标准计算它到少数类样本集中所有样本的距离,得到其k近邻
2.根据采样倍率n,从x的k近邻中随机选择n个样本,将选中的近邻记为$\widetilde{x}$(n为生成的倍数)
3.套用公式 $x_{new}=x+\operatorname{rand}(0,1) \times(\widetilde{x}-x)$ 生成新的样本
正则惩罚项: 在机器学习特别是深度学习中,模型越复杂,对当前训练集的拟合效果往往越好,但是泛化能力可能会很差。所以我们在损失函数中加入一个正则化惩罚项,正则项的重要程度可以通过正则化项的系数体现
交叉验证的步骤:
1.将数据洗牌
2.按照比例截取一定的数据作为测试集
3.在训练集中应用交叉验证(解决数据不平衡也在train集部分,test集是不动的)
对明显和其他指标数量级不一样的指标需要做标准化处理,因为机器学习算法会自动认为数值大的指标重要一些。