In the previous posts we worked through each part separately; here we put everything together into one complete script:
1. Count the two label classes and plot a bar chart
2. Split features from labels, split train and test sets (train_test_split), and undersample the training data
3. Choose the regularization hyperparameter C with KFold cross-validation
4. Plot the confusion matrix and explain accuracy, recall, and the F1 score (see the metric sketch after this list)
5. How the probability threshold of logistic regression affects recall and accuracy
6. Oversample the data (SMOTE)
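Before walking through the full script, it helps to pin down the metrics from step 4. Below is a minimal sketch with a made-up 2x2 confusion matrix (rows are true labels, columns are predictions; the numbers are purely illustrative, not from the creditcard data):

import numpy as np

# Hypothetical confusion matrix: rows = true class, columns = predicted class
conf = np.array([[85, 15],
                 [10, 90]])
TN, FP = conf[0, 0], conf[0, 1]
FN, TP = conf[1, 0], conf[1, 1]

accuracy = (TP + TN) / conf.sum()                           # (90+85)/200 = 0.875
recall = TP / (TP + FN)                                     # 90/100 = 0.90
precision = TP / (TP + FP)                                  # 90/105 ~ 0.857
f1 = 2 * precision * recall / (precision + recall)          # ~ 0.878
print(accuracy, recall, precision, f1)

Recall is the metric we optimize below: with heavily imbalanced fraud data, accuracy alone can look high even when almost every fraud case is missed.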
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score

# Read the data and standardize the Amount column
data = pd.read_csv('creditcard.csv')
data['Normal_Amount'] = StandardScaler().fit_transform(
    np.array(data['Amount']).reshape(-1, 1))
print(data.head())
# Drop the raw Time and Amount columns, keeping the standardized amount
data = data.drop(['Time', 'Amount'], axis=1)

X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# 1. Count the two classes and plot a bar chart
count_class = pd.value_counts(data.Class, sort=True).sort_index()
count_class.plot(kind='bar')
plt.show()

# 2. Undersample the data
# Fraud (positive-class) samples
negtive_len = len(data[data.Class == 1])
negtive_index = data[data.Class == 1].index
# Normal samples
normal_len = len(data[data.Class == 0])
normal_index = data[data.Class == 0].index
# Randomly draw as many normal samples as there are fraud samples
under_normal_index = np.random.choice(normal_index, negtive_len, replace=False)
# Merge the two index sets
under_index = np.concatenate([negtive_index, under_normal_index])
under_data = data.loc[under_index, :]
under_x = under_data.loc[:, under_data.columns != 'Class']
under_y = under_data.loc[:, under_data.columns == 'Class']

# Split the full data set
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=0)
# Split the undersampled data set
under_train_x, under_test_x, under_train_y, under_test_y = train_test_split(
    under_x, under_y, test_size=0.3, random_state=0)

# 3. Use cross-validation to choose the regularization parameter C
def printing_KFold_score(train_x, train_y):
    """
    Cross-validate over a grid of C values.
    :param train_x: training features
    :param train_y: training labels
    :return: the best C value
    """
    # Split the row indices into 5 folds
    fold = KFold(n_splits=5, shuffle=False)
    # Candidate regularization parameters
    c_parameter = [0.01, 0.1, 1, 10, 100]
    # DataFrame storing each C and its mean recall score
    train_score = pd.DataFrame(index=range(len(c_parameter)),
                               columns=['c_parameter', 'F_score_mean'])
    train_score['c_parameter'] = c_parameter
    for row, c in enumerate(c_parameter):
        scores = []
        for iter, (train_idx, val_idx) in enumerate(fold.split(train_x), start=1):
            lr = LogisticRegression(C=c, penalty='l1', solver='liblinear')
            lr.fit(train_x.iloc[train_idx, :], train_y.iloc[train_idx, :].values.ravel())
            pred_y = lr.predict(train_x.iloc[val_idx, :])
            # recall_score computes the recall on the validation fold
            score = recall_score(train_y.iloc[val_idx, :], pred_y)
            print('{} {}'.format(iter, score))
            scores.append(score)
        # Store the mean recall for this C (one row per C, not the whole column)
        train_score.loc[row, 'F_score_mean'] = np.mean(scores)
    print(train_score)
    # idxmax() returns the row index of the largest mean recall
    best_parameter = train_score.loc[
        train_score['F_score_mean'].astype(float).idxmax(), 'c_parameter']
    print('the best_parameter is {}'.format(best_parameter))
    return best_parameter

best_c = printing_KFold_score(under_train_x, under_train_y)
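As a cross-check on the hand-rolled loop above, scikit-learn can run the same recall-scored grid search in a few lines. A minimal sketch, assuming the same under_train_x / under_train_y variables from the script:

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(LogisticRegression(penalty='l1', solver='liblinear'),
                    param_grid, scoring='recall', cv=5)
grid.fit(under_train_x, under_train_y.values.ravel())
print(grid.best_params_)  # should roughly agree with best_c above

The results may not match exactly, since GridSearchCV shuffles folds differently, but it is a quick sanity check on printing_KFold_score.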
import itertools
from sklearn.metrics import confusion_matrix

# 4. Plot the confusion matrix
def plot_matrix(conf, classes, title='confusion matrix', cmap=plt.cm.Blues):
    """
    :param conf: confusion matrix to plot
    :param classes: class labels of the matrix
    :param title: figure title
    :param cmap: color map
    :return:
    """
    # Show the matrix as an image
    plt.imshow(conf, cmap=cmap)
    # Figure title and color bar
    plt.title(title)
    plt.colorbar()
    # Tick positions and labels for both axes
    tick_index = np.arange(len(classes))
    plt.xticks(tick_index, classes, rotation=0)
    plt.yticks(tick_index, classes)
    conf_mean = conf.max() / 2
    # itertools.product([0, 1], [0, 1]) yields (0,0), (0,1), (1,0), (1,1)
    # Write each count into its cell of the matrix
    for i, j in itertools.product(range(conf.shape[0]), range(conf.shape[1])):
        plt.text(j, i, conf[i, j], horizontalalignment='center',
                 color='white' if conf[i, j] > conf_mean else 'black')
    # Tighten the layout
    plt.tight_layout()

# Build and train the logistic regression model
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(under_train_x, under_train_y.values.ravel())
# Predict on the undersampled test set
pred_y = lr.predict(under_test_x)
# Confusion matrix and plot
conf = confusion_matrix(under_test_y, pred_y)
plot_matrix(conf, classes=[0, 1])
# Accuracy: correct predictions over all predictions
accurracy = (conf[0, 0] + conf[1, 1]) / (conf[0, 0] + conf[0, 1] + conf[1, 0] + conf[1, 1])
# Recall: detected frauds over all actual frauds
recall = conf[1, 1] / (conf[1, 0] + conf[1, 1])
# F1 score: harmonic mean of precision and recall
precision = conf[1, 1] / (conf[0, 1] + conf[1, 1])
F1_score = 2 * precision * recall / (precision + recall)
plt.show()

# # Evaluate on the full (untouched) test set
# pred_y = lr.predict(test_x)
# conf = confusion_matrix(test_y, pred_y)
# plot_matrix(conf, classes=[0, 1])
# plt.show()

# 5. Test how different probability thresholds affect accuracy and recall
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(under_train_x, under_train_y.values.ravel())
pred_array = np.array(lr.predict_proba(under_test_x))
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
j = 1
for threshold in thresholds:
    # Predict class 1 whenever its probability exceeds the threshold
    pred_y_new = np.zeros([len(under_test_x), 1])
    pred_y_new[pred_array[:, 1] > threshold] = 1
    plt.subplot(3, 3, j)
    conf = confusion_matrix(under_test_y, pred_y_new)
    plot_matrix(conf, classes=[0, 1], title='threshold is {}'.format(threshold))
    accurracy = (conf[0, 0] + conf[1, 1]) / (conf[0, 0] + conf[0, 1] + conf[1, 0] + conf[1, 1])
    recall = conf[1, 1] / (conf[1, 0] + conf[1, 1])
    j = j + 1
plt.show()

# 6. Oversample the data with SMOTE
from imblearn.over_sampling import SMOTE

X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=0)
overstamp = SMOTE(random_state=0)
SMOTE_train_x, SMOTE_train_y = overstamp.fit_resample(train_x, train_y.values.ravel())
# Count the 0/1 labels after oversampling
print(pd.value_counts(SMOTE_train_y, sort=True).sort_index())
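The script stops after counting the resampled labels. A minimal sketch of the natural follow-up, training on the oversampled data and evaluating on the untouched test set, assuming the variables defined above (best_c, plot_matrix, test_x, test_y):

# Train on the SMOTE-balanced training data
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(SMOTE_train_x, SMOTE_train_y)
# Evaluate on the original, untouched test split
pred_y = lr.predict(test_x)
conf = confusion_matrix(test_y, pred_y)
plot_matrix(conf, classes=[0, 1])
plt.show()
print('recall on the test set: {:.3f}'.format(conf[1, 1] / (conf[1, 0] + conf[1, 1])))

Compared with undersampling, SMOTE keeps all of the normal samples, so the model typically produces far fewer false positives at a similar recall; the confusion matrix above makes that trade-off visible.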