6.逻辑回归项目实战-信用卡欺诈检测代码调试

最新推荐文章于 2024-03-09 11:52:10 发布

MaliciousSoftware

最新推荐文章于 2024-03-09 11:52:10 发布

阅读量831

点赞数 3

分类专栏：机器学习-唐宇迪老师文章标签： python 机器学习

本文链接：https://blog.csdn.net/weixin_47038938/article/details/115865718

版权

机器学习-唐宇迪老师专栏收录该内容

3 篇文章 6 订阅

订阅专栏

信用卡欺诈检测

《跟着迪哥学Python数据分析与机器学习实战》

报错一

data['normAmount']=StandardScaler().fit_transform(data['Amount'].reshape(-1,1))

AttributeError: ‘Series’ object has no attribute ‘reshape’
在这里插入图片描述
修改
data是DataFrame数据结构，data['Amount']取出DataFrame的一列，返回的是Series；新版pandas的Series没有reshape方法，所以先用.values属性（或.to_numpy()方法）把Series转换成numpy的ndarray，再调用ndarray的reshape方法

data['normAmount']=StandardScaler().fit_transform(data['Amount'].values.reshape(-1,1))

在这里插入图片描述
报错二

X=data.ix[:,data.columns!='Class']
y=data.ix[:,data.columns=='Class']

X_undersample=under_sample_data.ix[:,under_sample_data.columns!='Class']
y_undersample=under_sample_data.ix[:,under_sample_data.columns=='Class']

AttributeError: ‘DataFrame’ object has no attribute ‘ix’
在这里插入图片描述
修改
根据官方说明，.ix已在pandas 1.0中被移除：按标签或布尔掩码选择用.loc，按整数位置选择用.iloc。这里是用布尔掩码选列，.loc和.iloc都可以，下面用.iloc替代

X=data.iloc[:,data.columns!='Class']
y=data.iloc[:,data.columns=='Class']

X_undersample=under_sample_data.iloc[:,under_sample_data.columns!='Class']
y_undersample=under_sample_data.iloc[:,under_sample_data.columns=='Class']

在这里插入图片描述
报错三

from sklearn.cross_validation import train_test_split

修改

from sklearn.model_selection import train_test_split

报错四

#fold = KFold(len(y_train_data),5,shuffle=False)
...
lr = LogisticRegression(C = c_param, penalty = 'l1')
...
best_c = results_table.loc[results_table['Mean recall score'].idxmax()]['C_parameter']

修改

fold=KFold(n_splits=5,shuffle=False)
...
lr = LogisticRegression(C = c_param, penalty='l2')
...
best_c=results_table.loc[results_table['Mean recall score'].astype('float64').idxmax()]['C_parameter']

报错五

lr = LogisticRegression(C = c_param, penalty = 'l1')

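修改（新版sklearn的默认求解器lbfgs不支持l1惩罚，所以改用l2；如果想保留l1，可以写成LogisticRegression(C = c_param, penalty = 'l1', solver = 'liblinear')）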

lr = LogisticRegression(C = c_param, penalty='l2')

报错六

best_c = results_table.loc[results_table['Mean recall score'].idxmax()]['C_parameter']

修改

best_c=results_table.loc[results_table['Mean recall score'].astype('float64').idxmax()]['C_parameter']

报错七

os_features,os_labels=oversampler.fit_sample(features_train,labels_train)

修改

os_features,os_labels=oversampler.fit_resample(features_train,labels_train)

代码与注释

import pandas as pd  #数据分析处理库
import matplotlib.pyplot as plt  #数据可视化
import numpy as np  #科学计算库
%matplotlib inline

# Read the data set into a DataFrame
data=pd.read_csv("creditcard.csv")
data.head()
# head() shows the first 5 rows by default

在这里插入图片描述

#31列数据：Time、V1-V28、Amount交易金额、Class分类结果，
#最后一列是Class，这是label值，0代表交易记录正常数据，1代表欺诈数据。

# Count how many rows belong to each class (0 and 1), ordered by class label.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated in recent pandas releases.
count_classes = data['Class'].value_counts(sort=True).sort_index()
count_classes.plot(kind='bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")

在这里插入图片描述

#Class=0正常数据大概有28W，Class=1欺诈数据极少，分布不均匀
#通常有两种处理方法：
#1.过采样（让1变得和0一样多）； 
#2.下采样（在0中取出部分数据，数量与1一致）

# Amount spans a much wider range than the other columns, so standardise it first.
from sklearn.preprocessing import StandardScaler

# .values turns the Series into an ndarray (a Series has no reshape), and
# reshape(-1, 1) lays the values out as a single column for the scaler.
amount_column = data['Amount'].values.reshape(-1, 1)
amount_scaler = StandardScaler()
data['normAmount'] = amount_scaler.fit_transform(amount_column)

# Time and the raw Amount are dropped; normAmount takes Amount's place.
data = data.drop(columns=['Time', 'Amount'])
data.head()

在这里插入图片描述

# Undersampling: keep every fraud row plus an equally sized random sample of
# normal rows, so both classes end up with the same number of records.
X = data.loc[:, data.columns != 'Class']  # features
y = data.loc[:, data.columns == 'Class']  # labels (0 / 1), kept as a one-column DataFrame

# Index labels of the minority (fraud) class, and how many there are
fraud_indices = np.array(data[data.Class == 1].index)
number_records_fraud = len(fraud_indices)

# Index labels of the normal class
normal_indices = data[data.Class == 0].index

# Draw, without replacement, as many normal rows as there are fraud rows
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)

# Fraud labels first, then the sampled normal labels
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# The values collected above are index *labels*, so the rows are selected with
# .loc; .iloc would only pick the same rows while data keeps its default
# 0..n-1 index.
under_sample_data = data.loc[under_sample_indices, :]

X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Show the class ratio of the undersampled data
print("Percentage of normal transactions:",len(under_sample_data[under_sample_data.Class==0])/len(under_sample_data))
print("Percentage of fraud transactions:",len(under_sample_data[under_sample_data.Class==1])/len(under_sample_data))
print("Total number of transactions in resampled data:",len(under_sample_data))

Percentage of normal transactions: 0.5
Percentage of fraud transactions: 0.5
Total number of transactions in resampled data: 984

#正常数据有50%欺诈数据有50%
#一共有984条数据

from sklearn.model_selection import train_test_split

# Split the whole data set first: 30% is held out for testing, with a fixed
# random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

n_train_full = len(X_train)
n_test_full = len(X_test)
print("Number transactions train dataset:", n_train_full)
print("Number transactions test dataset:", n_test_full)
print("Total number of transactions:", n_train_full + n_test_full)

# Then split the undersampled data set with the same settings.
(X_train_undersample, X_test_undersample,
 y_train_undersample, y_test_undersample) = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0)

n_train_under = len(X_train_undersample)
n_test_under = len(X_test_undersample)
print("")
print("Number transactions train dataset:", n_train_under)
print("Number transactions test dataset:", n_test_under)
print("Total number of transactions:", n_train_under + n_test_under)

Number transactions train dataset: 199364
Number transactions test dataset: 85443
Total number of transactions: 284807

Number transactions train dataset: 688
Number transactions test dataset: 296
Total number of transactions: 984

#Recall=TP/(TP+FN)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix,recall_score,classification_report

def printing_Kfold_scores(x_train_data,y_train_data):
    """Choose the regularisation strength C with the best mean recall in 5-fold CV.

    For every candidate C an L2-penalised LogisticRegression is fitted on the
    training part of each fold and scored with recall on the held-out part.
    The per-fold scores and their mean are printed as the search runs.

    Parameters:
        x_train_data: feature DataFrame; rows are selected positionally.
        y_train_data: one-column label DataFrame aligned with x_train_data.

    Returns:
        The candidate C with the highest mean recall (the first one on ties).
    """
    fold = KFold(n_splits=5, shuffle=False)  # five consecutive, unshuffled folds

    # Candidate values for the C parameter
    c_param_range = [0.01, 0.1, 1, 10, 100]

    # One row per candidate. The column is named exactly 'C_parameter' so the
    # final lookup finds it; the earlier key 'C_parameter)' left the real
    # column empty and made the function return nan.
    results_table = pd.DataFrame(index=range(len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range

    for row, c_param in enumerate(c_param_range):
        print('------------------------------')
        print('C parameter:',c_param)
        print('------------------------------')
        print('')

        recall_accs = []
        # fold.split yields (train positions, validation positions) per fold
        for iteration, (train_idx, valid_idx) in enumerate(fold.split(y_train_data), start=1):
            lr = LogisticRegression(C = c_param, penalty='l2')

            # Fit on the training part of the fold; ravel() flattens the
            # one-column label frame into the 1-D array the estimator expects.
            lr.fit(x_train_data.iloc[train_idx, :],
                   y_train_data.iloc[train_idx, :].values.ravel())

            # Predict on the held-out part. A DataFrame is passed here too so
            # fit and predict receive the same kind of input.
            y_pred_undersample = lr.predict(x_train_data.iloc[valid_idx, :])

            # Recall of this fold for the current C
            recall_acc = recall_score(y_train_data.iloc[valid_idx, :].values, y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration',iteration,':recall score=',recall_acc)

        # The mean over the folds is the figure kept for this C
        mean_recall = np.mean(recall_accs)
        results_table.loc[row, 'Mean recall score'] = mean_recall
        print('')
        print('Mean recall score',mean_recall)
        print('')

    # The score column is object-typed, so cast it before asking for the maximum
    best_row = results_table['Mean recall score'].astype('float64').idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']

    # Report which candidate won
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')

    return best_c

best_c=printing_Kfold_scores(X_train_undersample,y_train_undersample)

C parameter: 0.01
------------------------------

Iteration 1 :recall score= 0.821917808219178
Iteration 2 :recall score= 0.8493150684931506
Iteration 3 :recall score= 0.9152542372881356
Iteration 4 :recall score= 0.918918918918919
Iteration 5 :recall score= 0.8787878787878788

Mean recall score 0.8768387823414523

------------------------------
C parameter: 0.1
------------------------------

Iteration 1 :recall score= 0.8493150684931506
Iteration 2 :recall score= 0.8904109589041096
Iteration 3 :recall score= 0.9661016949152542
Iteration 4 :recall score= 0.9459459459459459
Iteration 5 :recall score= 0.8939393939393939

Mean recall score 0.9091426124395708

------------------------------
C parameter: 1
------------------------------

Iteration 1 :recall score= 0.8493150684931506
Iteration 2 :recall score= 0.8904109589041096
Iteration 3 :recall score= 0.9661016949152542
Iteration 4 :recall score= 0.9459459459459459
Iteration 5 :recall score= 0.9090909090909091

Mean recall score 0.9121729154698739

------------------------------
C parameter: 10
------------------------------

Iteration 1 :recall score= 0.8493150684931506
Iteration 2 :recall score= 0.8904109589041096
Iteration 3 :recall score= 0.9830508474576272
Iteration 4 :recall score= 0.9459459459459459
Iteration 5 :recall score= 0.9242424242424242

Mean recall score 0.9185930490086515

------------------------------
C parameter: 100
------------------------------

Iteration 1 :recall score= 0.863013698630137
Iteration 2 :recall score= 0.8904109589041096
Iteration 3 :recall score= 0.9830508474576272
Iteration 4 :recall score= 0.9459459459459459
Iteration 5 :recall score= 0.9242424242424242

Mean recall score 0.9213327750360488

*********************************************************************************
Best model to choose from cross validation is with C parameter =  nan
*********************************************************************************

#原文链接：https://blog.csdn.net/weixin_45755332/article/details/113545313
# 模型评估：对模型选择较合适的参数
#recall = TP/(TP+FN)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report


def printing_Kfold_scores(x_train_data, y_train_data):
    """Choose the regularisation strength C with the best mean recall in 5-fold CV.

    The folds are shuffled without a fixed seed, so the printed scores (and
    possibly the chosen C) can differ between runs.

    Parameters:
        x_train_data: feature DataFrame; rows are selected positionally.
        y_train_data: one-column label DataFrame aligned with x_train_data.

    Returns:
        The candidate C with the highest mean recall (the first one on ties).
    """
    # Split the data into 5 shuffled folds
    fold = KFold(n_splits=5, shuffle=True)
    c_param_range = [0.01, 0.1, 1, 10, 100]  # candidate regularisation strengths

    # Two-column table with one row per candidate C. range(len(c_param_range))
    # gives the rows 0..4 directly; the former range(len(...), 2) was an empty
    # range.
    results_table = pd.DataFrame(index=range(len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range

    # Try each candidate C and score it with k-fold cross validation
    for row, c_param in enumerate(c_param_range):
        print('-----------------------------')
        print('C_parameter:', c_param)
        print('-----------------------------')
        recall_accs = []
        # Each item from fold.split is a pair of position arrays: the rows to
        # train on and the rows held out for validation. Only the training
        # split passed in is used here; the real test set is never touched.
        for iteration, (train_idx, valid_idx) in enumerate(fold.split(y_train_data), start=1):
            # Logistic regression with an L2 penalty
            lr = LogisticRegression(C = c_param, penalty='l2')

            # ravel() flattens the one-column label frame to 1-D for fit()
            lr.fit(x_train_data.iloc[train_idx, :],
                   y_train_data.iloc[train_idx, :].values.ravel())

            # Predict the held-out rows; a DataFrame is passed so fit and
            # predict receive the same kind of input.
            y_pred_undersample = lr.predict(x_train_data.iloc[valid_idx, :])

            # Recall: the share of actual positives the model found
            recall_acc = recall_score(y_train_data.iloc[valid_idx, :].values, y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration,': recall score = ', recall_acc)

        # Average the fold scores for this C
        mean_recall = np.mean(recall_accs)
        results_table.loc[row, 'Mean recall score'] = mean_recall
        print('')
        print('Mean recall score', mean_recall)
        print('')

    # Row label of the highest mean recall; the column is object-typed, hence the cast
    best_row = results_table['Mean recall score'].astype('float64').idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']

    print('*******************************************************************')
    print('Best model to choose from cross validation is with C parameter =', best_c)
    print('*******************************************************************')
    return best_c
best_c = printing_Kfold_scores(X_train_undersample, y_train_undersample)

-----------------------------
C_parameter: 0.01
-----------------------------
Iteration  1 : recall score =  0.9242424242424242
Iteration  2 : recall score =  0.921875
Iteration  3 : recall score =  0.8481012658227848
Iteration  4 : recall score =  0.9166666666666666
Iteration  5 : recall score =  0.765625

Mean recall score 0.8753020713463752

-----------------------------
C_parameter: 0.1
-----------------------------
Iteration  1 : recall score =  0.8793103448275862
Iteration  2 : recall score =  0.9473684210526315
Iteration  3 : recall score =  0.8428571428571429
Iteration  4 : recall score =  0.8787878787878788
Iteration  5 : recall score =  0.92

Mean recall score 0.8936647575050479

-----------------------------
C_parameter: 1
-----------------------------
Iteration  1 : recall score =  0.9230769230769231
Iteration  2 : recall score =  0.9014084507042254
Iteration  3 : recall score =  0.9230769230769231
Iteration  4 : recall score =  0.9166666666666666
Iteration  5 : recall score =  0.9027777777777778

Mean recall score 0.9134013482605032

-----------------------------
C_parameter: 10
-----------------------------
Iteration  1 : recall score =  0.9102564102564102
Iteration  2 : recall score =  0.9130434782608695
Iteration  3 : recall score =  1.0
Iteration  4 : recall score =  0.8873239436619719
Iteration  5 : recall score =  0.9076923076923077

Mean recall score 0.923663227974312

-----------------------------
C_parameter: 100
-----------------------------
Iteration  1 : recall score =  0.9041095890410958
Iteration  2 : recall score =  0.9482758620689655
Iteration  3 : recall score =  0.9066666666666666
Iteration  4 : recall score =  0.8918918918918919
Iteration  5 : recall score =  0.9384615384615385

Mean recall score 0.9178811096260316

*******************************************************************
Best model to choose from cross validation is with C parameter = 10.0
*******************************************************************

def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw the confusion matrix cm as a colour-coded image on the current
    axes, with each cell's value written on top of it.

    classes supplies the tick labels for both axes, title is the plot
    title and cmap is the colour map used for the image.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class on both axes
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)

    # Cells above half of the largest value get white text, the rest black
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            if cm[row, col] > half_max:
                text_colour = "white"
            else:
                text_colour = "black"
            plt.text(col, row, cm[row, col],
                     horizontalalignment="center",
                     color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

import itertools
# Refit on the whole undersampled training split with the selected C and
# predict the undersampled test split.
lr = LogisticRegression(C=best_c, penalty='l2')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample.values)

# Confusion matrix of true labels against predictions
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
np.set_printoptions(precision=2)

# Recall is taken from row 1 of the matrix: [1,1] / ([1,0] + [1,1])
missed_frauds = cnf_matrix[1, 0]
caught_frauds = cnf_matrix[1, 1]
print("Recall metric in the testing dataset: ", caught_frauds/(missed_frauds+caught_frauds))

# Plot the raw (non-normalised) confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()

在这里插入图片描述

# Train on the undersampled training split, then predict the test split of
# the whole data set.
lr = LogisticRegression(C=best_c, penalty='l2')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred = lr.predict(X_test.values)

# Confusion matrix of true labels against predictions
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Recall is taken from row 1 of the matrix: [1,1] / ([1,0] + [1,1])
missed_frauds = cnf_matrix[1, 0]
caught_frauds = cnf_matrix[1, 1]
print("Recall metric in the testing dataset:", caught_frauds/(missed_frauds+caught_frauds))

# Plot the raw (non-normalised) confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()

在这里插入图片描述

best_c=printing_Kfold_scores(X_train,y_train)

-----------------------------
C_parameter: 0.01
-----------------------------
Iteration  1 : recall score =  0.5555555555555556
Iteration  2 : recall score =  0.6290322580645161
Iteration  3 : recall score =  0.6233766233766234
Iteration  4 : recall score =  0.5131578947368421
Iteration  5 : recall score =  0.6567164179104478

Mean recall score 0.595567749928797

-----------------------------
C_parameter: 0.1
-----------------------------
Iteration  1 : recall score =  0.5857142857142857
Iteration  2 : recall score =  0.5714285714285714
Iteration  3 : recall score =  0.618421052631579
Iteration  4 : recall score =  0.6142857142857143
Iteration  5 : recall score =  0.6363636363636364

Mean recall score 0.6052426520847572

-----------------------------
C_parameter: 1
-----------------------------
Iteration  1 : recall score =  0.6666666666666666
Iteration  2 : recall score =  0.6052631578947368
Iteration  3 : recall score =  0.603448275862069
Iteration  4 : recall score =  0.5875
Iteration  5 : recall score =  0.6

Mean recall score 0.6125756200846946

-----------------------------
C_parameter: 10
-----------------------------
Iteration  1 : recall score =  0.5205479452054794
Iteration  2 : recall score =  0.6301369863013698
Iteration  3 : recall score =  0.6764705882352942
Iteration  4 : recall score =  0.5254237288135594
Iteration  5 : recall score =  0.6805555555555556

Mean recall score 0.6066269608222516

-----------------------------
C_parameter: 100
-----------------------------
Iteration  1 : recall score =  0.5526315789473685
Iteration  2 : recall score =  0.5972222222222222
Iteration  3 : recall score =  0.6507936507936508
Iteration  4 : recall score =  0.6363636363636364
Iteration  5 : recall score =  0.5964912280701754

Mean recall score 0.6067004632794106

*******************************************************************
Best model to choose from cross validation is with C parameter = 1.0
*******************************************************************

# Train on the training split of the whole data set and predict its test
# split. The result variable keeps its original name even though no
# undersampling is involved here.
lr = LogisticRegression(C=best_c, penalty='l2')
lr.fit(X_train, y_train.values.ravel())
y_pred_undersample = lr.predict(X_test.values)

# Confusion matrix of true labels against predictions
cnf_matrix = confusion_matrix(y_test, y_pred_undersample)
np.set_printoptions(precision=2)

# Recall is taken from row 1 of the matrix: [1,1] / ([1,0] + [1,1])
missed_frauds = cnf_matrix[1, 0]
caught_frauds = cnf_matrix[1, 1]
print("Recall metric in the testing dataset:", caught_frauds/(missed_frauds+caught_frauds))

# Plot the raw (non-normalised) confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()

在这里插入图片描述

# Fit on the undersampled training split and keep the predicted class
# probabilities for the undersampled test split.
lr = LogisticRegression(C = 0.01, penalty = 'l2')
lr.fit(X_train_undersample,y_train_undersample.values.ravel())
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)

thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

plt.figure(figsize=(10,10))
np.set_printoptions(precision=2)
class_names = [0,1]

# One subplot per threshold in a 3x3 grid; subplot positions start at 1
for position, threshold in enumerate(thresholds, start=1):
    # Label a sample as class 1 when its class-1 probability is above the threshold
    y_test_predictions_high_recall = y_pred_undersample_proba[:,1] > threshold

    plt.subplot(3,3,position)

    # Confusion matrix for this threshold
    cnf_matrix = confusion_matrix(y_test_undersample,y_test_predictions_high_recall)

    print("Recall metric in the testing dataset:",cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

    # The title states the comparison that is actually applied (strictly greater)
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='threshold > %s' % threshold)

在这里插入图片描述

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

credit_cards = pd.read_csv('creditcard.csv')

# The label ('Class') is the last column; every column before it is a feature.
columns = credit_cards.columns
features_columns = columns[:-1]

features = credit_cards[features_columns]
labels = credit_cards['Class']

# Hold out 20% for testing, with a fixed seed so the split is reproducible.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)

# Oversample the training split only. fit_resample is the current method
# name; it was previously called fit_sample.
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)

# Number of class-1 samples after oversampling
len(os_labels[os_labels==1])

227454

# Wrap the oversampled features and labels as DataFrames (printing_Kfold_scores
# selects rows with .iloc[rows, :]), then search for the best C on them.
os_features=pd.DataFrame(os_features)
os_labels=pd.DataFrame(os_labels)
best_c=printing_Kfold_scores(os_features,os_labels)

-----------------------------
C_parameter: 0.01
-----------------------------
Iteration  1 : recall score =  0.9630360864319404
Iteration  2 : recall score =  0.9624495525530795
Iteration  3 : recall score =  0.9630808402291534
Iteration  4 : recall score =  0.9605958142721386
Iteration  5 : recall score =  0.9627658404627328

Mean recall score 0.962385626789809

-----------------------------
C_parameter: 0.1
-----------------------------
Iteration  1 : recall score =  0.9628027108766198
Iteration  2 : recall score =  0.9434925103930669
Iteration  3 : recall score =  0.9623599439775911
Iteration  4 : recall score =  0.9626233977530066
Iteration  5 : recall score =  0.9626480346274691

Mean recall score 0.9587853195255507

-----------------------------
C_parameter: 1
-----------------------------
Iteration  1 : recall score =  0.9628398458998348
Iteration  2 : recall score =  0.9429591970898891
Iteration  3 : recall score =  0.962535198873636
Iteration  4 : recall score =  0.9628606167790428
Iteration  5 : recall score =  0.963679327848766

Mean recall score 0.9589748372982336

-----------------------------
C_parameter: 10
-----------------------------
Iteration  1 : recall score =  0.942792664490261
Iteration  2 : recall score =  0.9436078604548466
Iteration  3 : recall score =  0.9633548160042232
Iteration  4 : recall score =  0.9611077084609327
Iteration  5 : recall score =  0.9623544460127029

Mean recall score 0.9546434990845933

-----------------------------
C_parameter: 100
-----------------------------
Iteration  1 : recall score =  0.9624321294321954
Iteration  2 : recall score =  0.9624070050694582
Iteration  3 : recall score =  0.96179676318397
Iteration  4 : recall score =  0.9640471729306734
Iteration  5 : recall score =  0.9624035426166258

Mean recall score 0.9626173226465845

*******************************************************************
Best model to choose from cross validation is with C parameter = 100.0
*******************************************************************

# Train on the oversampled training data with the selected C and predict
# the held-out test split.
lr = LogisticRegression(C=best_c, penalty='l2')
lr.fit(os_features, os_labels.values.ravel())
y_pred = lr.predict(features_test.values)

# Confusion matrix of true labels against predictions
cnf_matrix = confusion_matrix(labels_test, y_pred)
np.set_printoptions(precision=2)

# Recall is taken from row 1 of the matrix: [1,1] / ([1,0] + [1,1])
missed_frauds = cnf_matrix[1, 0]
caught_frauds = cnf_matrix[1, 1]
print("Recall metric in the testing dataset: ", caught_frauds/(missed_frauds+caught_frauds))

# Plot the raw (non-normalised) confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()

在这里插入图片描述

#原文链接：https://blog.csdn.net/weixin_44507435/article/details/104944917
# Bar charts and stacked bar charts
fig, axes = plt.subplots(4, 1, figsize=(10, 10))
s = pd.Series(np.random.randint(0, 10, 16), index=list('abcdefghijklmnop'))
df = pd.DataFrame(np.random.rand(10, 3), columns=['a', 'b', 'c'])

# One axes per chart, top to bottom; the ax argument picks the subplot to draw on
single_ax, grouped_ax, stacked_ax, horizontal_ax = axes

# Single-series bar chart
s.plot(kind='bar', ax=single_ax, color='k', grid=True, alpha=0.5)
# Multi-series (grouped) bar chart
df.plot(kind='bar', ax=grouped_ax, grid=True, colormap='Reds_r')
# Multi-series stacked bar chart
df.plot(kind='bar', ax=stacked_ax, grid=True, colormap='Blues_r', stacked=True)
# Horizontal stacked bar chart
df.plot(kind='barh', ax=horizontal_ax, grid=True, stacked=True, colormap='BuGn_r')

在这里插入图片描述

#原文链接：https://blog.csdn.net/weixin_44507435/article/details/104944917
plt.figure(figsize=(10,4))
x = np.arange(10)
y1 = np.random.rand(10)    # bars above the axis
y2 = -np.random.rand(10)   # bars below the axis

# x, height: bar positions and bar heights
# width: bar width
# facecolor / edgecolor: fill colour and border colour
# yerr: error-bar lengths. They must be non-negative, so the downward bars
#       use the magnitude of y2 rather than y2 itself.
plt.bar(x, y1, width=1, facecolor='yellowgreen', edgecolor='white', yerr=y1*0.1)
plt.bar(x, y2, width=1, facecolor='lightskyblue', edgecolor='white', yerr=np.abs(y2)*0.1)

# Write each bar's value on the chart; zip pairs every x position with its height
for i, j in zip(x, y1):
    plt.text(i+0.3, j-0.15, '%.2f' % j, color='white')
for i, j in zip(x, y2):
    # y2 is negative, so -j prints the magnitude
    plt.text(i+0.3, j+0.05, '%.2f' % -j, color='white')

在这里插入图片描述

#原文链接：https://blog.csdn.net/weixin_44507435/article/details/104944917
#table(cellText=None,cellColours=None,cellLoc='right',colWidths=None,rowLabels=None,rowColours=None,rowLoc='left',
#colLabels=None, colColours=None, colLoc='center',loc='bottom', bbox=None)

# Sample values: one inner list per row label, one entry per column label
data = [
    [66386, 174296, 75131, 577908, 32015],
    [58230, 381139, 78045, 99308, 160454],
    [89135, 80552, 152558, 497981, 603535],
    [78415, 81858, 150656, 193263, 69638],
    [139361, 331509, 343164, 781380, 52269],
]
columns = ('Freeze', 'Wind', 'Flood', 'Quake', 'Hail')
rows = ['%d year' % years for years in (100, 50, 20, 10, 5)]

# The frame reuses the same labels for its columns and index
df = pd.DataFrame(data, index=rows, columns=columns)
print(df)
# Stacked bar chart of the frame
df.plot(kind='bar', grid=True, colormap='Blues_r', stacked=True, figsize=(8, 3))

# Five evenly spaced shades from the lower half of each colormap, reversed.
# BuPu and Reds can be swapped for any other colormap.
shades = np.linspace(0, 0.5, 5)
row_label_colours = plt.cm.BuPu(shades)[::-1]
col_label_colours = plt.cm.Reds(shades)[::-1]

# Table arguments:
#   cellText  - the text shown in the cells
#   cellLoc   - alignment of the text inside each cell
#   rowLabels / colLabels - row and column header labels
#   rowLoc    - alignment of the row labels
#   loc       - where the table is placed (left, right, top, bottom)
plt.table(cellText=data,
          cellLoc='center',
          cellColours=None,
          rowLabels=rows,
          rowColours=row_label_colours,
          colLabels=columns,
          colColours=col_label_colours,
          rowLoc='right',
          loc='bottom')

# Hide the x-axis tick labels
plt.xticks([])