信用卡欺诈检测
《跟着迪哥学Python数据分析与机器学习实战》
报错一
data['normAmount']=StandardScaler().fit_transform(data['Amount'].reshape(-1,1))
AttributeError: ‘Series’ object has no attribute ‘reshape’
修改
data是DataFrame数据结构,data['Amount']取DataFrame的一列,返回类型为Series;Series没有reshape方法,因此先用values属性(或to_numpy()方法)把Series转换成NumPy的ndarray,再调用ndarray的reshape方法
data['normAmount']=StandardScaler().fit_transform(data['Amount'].values.reshape(-1,1))
报错二
X=data.ix[:,data.columns!='Class']
y=data.ix[:,data.columns=='Class']
X_undersample=under_sample_data.ix[:,under_sample_data.columns!='Class']
y_undersample=under_sample_data.ix[:,under_sample_data.columns=='Class']
AttributeError: ‘DataFrame’ object has no attribute ‘ix’
修改
根据pandas官方说明,.ix已被移除:按标签或布尔掩码选取用.loc,按整数位置选取用.iloc。这里的列掩码(data.columns!='Class')是NumPy布尔数组,.loc和.iloc都能接受,下面用.iloc替代
X=data.iloc[:,data.columns!='Class']
y=data.iloc[:,data.columns=='Class']
X_undersample=under_sample_data.iloc[:,under_sample_data.columns!='Class']
y_undersample=under_sample_data.iloc[:,under_sample_data.columns=='Class']
报错三
from sklearn.cross_validation import train_test_split
修改
from sklearn.model_selection import train_test_split
报错四
#fold = KFold(len(y_train_data),5,shuffle=False)
...
lr = LogisticRegression(C = c_param, penalty = 'l1')
...
best_c = results_table.loc[results_table['Mean recall score'].idxmax()]['C_parameter']
修改(sklearn.model_selection.KFold不再接收样本总数,改用n_splits指定折数,并通过fold.split(...)取得每折的下标;另外两处改动的原因见报错五、报错六)
fold=KFold(n_splits=5,shuffle=False)
...
lr = LogisticRegression(C = c_param, penalty='l2')
...
best_c=results_table.loc[results_table['Mean recall score'].astype('float64').idxmax()]['C_parameter']
报错五
lr = LogisticRegression(C = c_param, penalty = 'l1')
修改(LogisticRegression的默认求解器lbfgs不支持l1正则,因此改用penalty='l2';若要保留l1正则,可改为指定solver='liblinear')
lr = LogisticRegression(C = c_param, penalty='l2')
报错六
best_c = results_table.loc[results_table['Mean recall score'].idxmax()]['C_parameter']
修改('Mean recall score'列是object类型,idxmax无法直接处理,先用astype('float64')转成浮点数再取最大值的下标)
best_c=results_table.loc[results_table['Mean recall score'].astype('float64').idxmax()]['C_parameter']
报错七
os_features,os_labels=oversampler.fit_sample(features_train,labels_train)
修改(新版imbalanced-learn已移除fit_sample,改用fit_resample)
os_features,os_labels=oversampler.fit_resample(features_train,labels_train)
代码与注释
import pandas as pd #数据分析处理库
import matplotlib.pyplot as plt #数据可视化
import numpy as np #科学计算库
%matplotlib inline
# Load the credit-card transaction data set.
data = pd.read_csv("creditcard.csv")
data.head()
# head() shows the first 5 rows by default.
# 31 columns: Time, V1-V28, Amount (the transaction amount) and Class.
# Class is the last column and is the label: 0 = normal transaction, 1 = fraud.

# Count how many rows belong to class 0 and class 1.
# Series.value_counts is used instead of the module-level pd.value_counts,
# which is deprecated in recent pandas releases.
count_classes = data['Class'].value_counts(sort=True).sort_index()
count_classes.plot(kind='bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
# Class 0 (normal) has roughly 280,000 rows while class 1 (fraud) is very rare,
# so the classes are heavily imbalanced. Two usual remedies:
#   1. over-sampling  - generate class-1 rows until they match class 0;
#   2. under-sampling - keep only as many class-0 rows as there are class-1 rows.
# Amount spans a much wider range than the other columns, so standardise it first.
from sklearn.preprocessing import StandardScaler

# A Series has no reshape(); take the underlying ndarray and reshape(-1, 1)
# it into a single column before handing it to the scaler.
amount_as_column = data['Amount'].values.reshape(-1, 1)
# fit_transform applies the standardisation formula (see textbook p.130).
data['normAmount'] = StandardScaler().fit_transform(amount_as_column)

# Time and the raw Amount are not used any further.
data = data.drop(columns=['Time', 'Amount'])
data.head()
# Under-sampling: keep every fraud row plus an equally sized random sample
# of normal rows.
# Boolean column masks and index labels are label-based selections, so .loc
# is used throughout; positional .iloc only gave the same rows while the
# frame still carried its default 0..n-1 index.
X = data.loc[:, data.columns != 'Class']  # feature columns
y = data.loc[:, data.columns == 'Class']  # label column (0 / 1), kept as a DataFrame

# Number of rows in the minority (fraud) class and their index labels.
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)

# Index labels of the normal (Class == 0) rows.
normal_indices = data[data.Class == 0].index

# Randomly pick, without replacement, as many normal rows as there are fraud rows.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)

# Combine both sets of labels and build the under-sampled data set.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
under_sample_data = data.loc[under_sample_indices, :]

X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Show the class ratio of the under-sampled data.
print("Percentage of normal transactions:",len(under_sample_data[under_sample_data.Class==0])/len(under_sample_data))
print("Percentage of fraud transactions:",len(under_sample_data[under_sample_data.Class==1])/len(under_sample_data))
print("Total number of transactions in resampled data:",len(under_sample_data))
Percentage of normal transactions: 0.5
Percentage of fraud transactions: 0.5
Total number of transactions in resampled data: 984
#正常数据有50%欺诈数据有50%
#一共有984条数据
from sklearn.model_selection import train_test_split

# Both splits hold out 30% of the rows for testing and use the same seed.
split_options = dict(test_size=0.3, random_state=0)

# Whole data set.
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print("Number transactions train dataset:", len(X_train))
print("Number transactions test dataset:", len(X_test))
print("Total number of transactions:", len(X_train) + len(X_test))

# Under-sampled data set.
(X_train_undersample,
 X_test_undersample,
 y_train_undersample,
 y_test_undersample) = train_test_split(X_undersample, y_undersample, **split_options)
print("")
print("Number transactions train dataset:", len(X_train_undersample))
print("Number transactions test dataset:", len(X_test_undersample))
print("Total number of transactions:", len(X_train_undersample) + len(X_test_undersample))
Number transactions train dataset: 199364
Number transactions test dataset: 85443
Total number of transactions: 284807
Number transactions train dataset: 688
Number transactions test dataset: 296
Total number of transactions: 984
#Recall=TP/(TP+FN)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix,recall_score,classification_report
def printing_Kfold_scores(x_train_data, y_train_data):
    """Select the regularisation strength C by 5-fold cross-validated recall.

    For every candidate C an L2-penalised logistic regression is fitted on
    the training part of each fold and its recall is measured on the
    held-out part. Per-fold and mean recall scores are printed, and the C
    with the highest mean recall is returned.

    Args:
        x_train_data: feature DataFrame; rows are selected with ``.iloc``.
        y_train_data: one-column label DataFrame aligned row-by-row with
            ``x_train_data``.

    Returns:
        The candidate C value with the highest mean recall.
    """
    fold = KFold(n_splits=5, shuffle=False)  # 5 consecutive folds

    # Candidate C values.
    c_param_range = [0.01, 0.1, 1, 10, 100]
    mean_recalls = []

    for c_param in c_param_range:
        print('------------------------------')
        print('C parameter:', c_param)
        print('------------------------------')
        print('')

        recall_accs = []
        # fold.split yields (train_indices, test_indices) for each fold.
        for iteration, (train_idx, test_idx) in enumerate(fold.split(y_train_data), start=1):
            lr = LogisticRegression(C=c_param, penalty='l2')

            # Fit on the training part of the fold.
            lr.fit(x_train_data.iloc[train_idx, :],
                   y_train_data.iloc[train_idx, :].values.ravel())

            # Predict on the held-out part. A DataFrame is passed (as in fit)
            # so the model sees the same feature names both times.
            y_pred_undersample = lr.predict(x_train_data.iloc[test_idx, :])

            # Recall on the held-out part for the current C.
            recall_acc = recall_score(y_train_data.iloc[test_idx, :].values,
                                      y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration', iteration, ':recall score=', recall_acc)

        # The mean recall over the folds is the metric used to compare C values.
        mean_recall = np.mean(recall_accs)
        mean_recalls.append(mean_recall)
        print('')
        print('Mean recall score', mean_recall)
        print('')

    # Build the table once all scores are known. The earlier version wrote the
    # candidates to a misspelled column ('C_parameter)'), which left
    # 'C_parameter' empty and made the function return nan.
    results_table = pd.DataFrame({'C_parameter': c_param_range,
                                  'Mean recall score': mean_recalls})
    best_c = results_table.loc[results_table['Mean recall score'].idxmax(), 'C_parameter']

    # Report which C performed best among the candidates.
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')
    return best_c
# Pick C by cross-validation on the under-sampled training split.
best_c=printing_Kfold_scores(X_train_undersample,y_train_undersample)
C parameter: 0.01
------------------------------
Iteration 1 :recall score= 0.821917808219178
Iteration 2 :recall score= 0.8493150684931506
Iteration 3 :recall score= 0.9152542372881356
Iteration 4 :recall score= 0.918918918918919
Iteration 5 :recall score= 0.8787878787878788
Mean recall score 0.8768387823414523
------------------------------
C parameter: 0.1
------------------------------
Iteration 1 :recall score= 0.8493150684931506
Iteration 2 :recall score= 0.8904109589041096
Iteration 3 :recall score= 0.9661016949152542
Iteration 4 :recall score= 0.9459459459459459
Iteration 5 :recall score= 0.8939393939393939
Mean recall score 0.9091426124395708
------------------------------
C parameter: 1
------------------------------
Iteration 1 :recall score= 0.8493150684931506
Iteration 2 :recall score= 0.8904109589041096
Iteration 3 :recall score= 0.9661016949152542
Iteration 4 :recall score= 0.9459459459459459
Iteration 5 :recall score= 0.9090909090909091
Mean recall score 0.9121729154698739
------------------------------
C parameter: 10
------------------------------
Iteration 1 :recall score= 0.8493150684931506
Iteration 2 :recall score= 0.8904109589041096
Iteration 3 :recall score= 0.9830508474576272
Iteration 4 :recall score= 0.9459459459459459
Iteration 5 :recall score= 0.9242424242424242
Mean recall score 0.9185930490086515
------------------------------
C parameter: 100
------------------------------
Iteration 1 :recall score= 0.863013698630137
Iteration 2 :recall score= 0.8904109589041096
Iteration 3 :recall score= 0.9830508474576272
Iteration 4 :recall score= 0.9459459459459459
Iteration 5 :recall score= 0.9242424242424242
Mean recall score 0.9213327750360488
*********************************************************************************
Best model to choose from cross validation is with C parameter = nan
*********************************************************************************
#原文链接:https://blog.csdn.net/weixin_45755332/article/details/113545313
# 模型评估:对模型选择较合适的参数
#recall = TP/(TP+FN)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report
def printing_Kfold_scores(x_train_data, y_train_data, random_state=None):
    """Select the regularisation strength C by shuffled 5-fold cross-validation.

    For each candidate C an L2-penalised logistic regression is trained on
    the training part of every fold and its recall is measured on the
    held-out part. Per-fold and mean recall scores are printed, and the C
    with the highest mean recall is returned.

    Args:
        x_train_data: feature DataFrame; rows are selected with ``.iloc``.
        y_train_data: one-column label DataFrame aligned row-by-row with
            ``x_train_data``.
        random_state: seed forwarded to ``KFold`` so the shuffled folds can be
            reproduced. The default ``None`` keeps the previous unseeded
            behaviour.

    Returns:
        The candidate C value with the highest mean recall.
    """
    # Split the rows into 5 shuffled folds.
    fold = KFold(n_splits=5, shuffle=True, random_state=random_state)

    c_param_range = [0.01, 0.1, 1, 10, 100]  # candidate penalty strengths
    mean_recalls = []

    # Evaluate every candidate C with the same k-fold procedure.
    for c_param in c_param_range:
        print('-----------------------------')
        print('C_parameter:', c_param)
        print('-----------------------------')

        recall_accs = []
        # Each fold gives two index arrays: the rows to train on (about 4/5 of
        # the data) and the rows held out for validation (about 1/5). Only the
        # training split is used here; the real test set is never touched.
        for iteration, (train_idx, valid_idx) in enumerate(fold.split(y_train_data), start=1):
            # Logistic regression with an L2 penalty.
            lr = LogisticRegression(C=c_param, penalty='l2')

            # ravel() flattens the one-column label frame to 1-D. Unlike
            # flatten(), which always copies, ravel() may return a view.
            lr.fit(x_train_data.iloc[train_idx, :],
                   y_train_data.iloc[train_idx, :].values.ravel())

            # Predict the held-out rows. A DataFrame is passed (as in fit) so
            # the model sees the same feature names both times.
            y_pred_undersample = lr.predict(x_train_data.iloc[valid_idx, :])

            # Recall: the share of true fraud rows that the model found.
            recall_acc = recall_score(y_train_data.iloc[valid_idx, :].values,
                                      y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration,': recall score = ', recall_acc)

        # Average the fold scores for this penalty strength.
        mean_recall = np.mean(recall_accs)
        mean_recalls.append(mean_recall)
        print('')
        print('Mean recall score', mean_recall)
        print('')

    # Build the table once all scores are known. The earlier version
    # pre-allocated it with range(len(c_param_range), 2), which is empty.
    results_table = pd.DataFrame({'C_parameter': c_param_range,
                                  'Mean recall score': mean_recalls})
    # Row with the highest mean recall.
    best_c = results_table.loc[results_table['Mean recall score'].idxmax(), 'C_parameter']

    print('*******************************************************************')
    print('Best model to choose from cross validation is with C parameter =', best_c)
    print('*******************************************************************')
    return best_c
# Pick C by cross-validation on the under-sampled training split.
best_c = printing_Kfold_scores(X_train_undersample, y_train_undersample)
-----------------------------
C_parameter: 0.01
-----------------------------
Iteration 1 : recall score = 0.9242424242424242
Iteration 2 : recall score = 0.921875
Iteration 3 : recall score = 0.8481012658227848
Iteration 4 : recall score = 0.9166666666666666
Iteration 5 : recall score = 0.765625
Mean recall score 0.8753020713463752
-----------------------------
C_parameter: 0.1
-----------------------------
Iteration 1 : recall score = 0.8793103448275862
Iteration 2 : recall score = 0.9473684210526315
Iteration 3 : recall score = 0.8428571428571429
Iteration 4 : recall score = 0.8787878787878788
Iteration 5 : recall score = 0.92
Mean recall score 0.8936647575050479
-----------------------------
C_parameter: 1
-----------------------------
Iteration 1 : recall score = 0.9230769230769231
Iteration 2 : recall score = 0.9014084507042254
Iteration 3 : recall score = 0.9230769230769231
Iteration 4 : recall score = 0.9166666666666666
Iteration 5 : recall score = 0.9027777777777778
Mean recall score 0.9134013482605032
-----------------------------
C_parameter: 10
-----------------------------
Iteration 1 : recall score = 0.9102564102564102
Iteration 2 : recall score = 0.9130434782608695
Iteration 3 : recall score = 1.0
Iteration 4 : recall score = 0.8873239436619719
Iteration 5 : recall score = 0.9076923076923077
Mean recall score 0.923663227974312
-----------------------------
C_parameter: 100
-----------------------------
Iteration 1 : recall score = 0.9041095890410958
Iteration 2 : recall score = 0.9482758620689655
Iteration 3 : recall score = 0.9066666666666666
Iteration 4 : recall score = 0.8918918918918919
Iteration 5 : recall score = 0.9384615384615385
Mean recall score 0.9178811096260316
*******************************************************************
Best model to choose from cross validation is with C parameter = 10.0
*******************************************************************
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw the confusion matrix ``cm`` as an annotated heat map.

    Args:
        cm: 2-D array of counts, indexed as ``cm[row, column]``.
        classes: tick labels used on both axes.
        title: title of the plot.
        cmap: colormap passed to ``plt.imshow``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=0)
    plt.yticks(tick_positions, classes)

    # Cells above half of the largest count get white text, the rest black.
    half_of_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            cell_value = cm[row, col]
            if cell_value > half_of_max:
                text_colour = "white"
            else:
                text_colour = "black"
            plt.text(col, row, cell_value,
                     horizontalalignment="center",
                     color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
import itertools
# Train on the under-sampled training split with the selected C and
# evaluate on the under-sampled test split.
lr = LogisticRegression(C=best_c, penalty='l2')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample.values)

# Confusion matrix of true labels against predictions.
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
np.set_printoptions(precision=2)

# Recall = TP / (FN + TP), both taken from the class-1 row of the matrix.
fraud_missed, fraud_caught = cnf_matrix[1, 0], cnf_matrix[1, 1]
print("Recall metric in the testing dataset: ", fraud_caught / (fraud_missed + fraud_caught))

# Plot the non-normalised confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
# Same model (trained on the under-sampled split), now evaluated on the
# test split of the whole data set.
lr = LogisticRegression(C=best_c, penalty='l2')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred = lr.predict(X_test.values)

# Confusion matrix of true labels against predictions.
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Recall = TP / (FN + TP), both taken from the class-1 row of the matrix.
recall_on_test = cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])
print("Recall metric in the testing dataset:", recall_on_test)

# Plot the non-normalised confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
# Repeat the cross-validation on the training split of the whole (imbalanced) data set.
best_c=printing_Kfold_scores(X_train,y_train)
-----------------------------
C_parameter: 0.01
-----------------------------
Iteration 1 : recall score = 0.5555555555555556
Iteration 2 : recall score = 0.6290322580645161
Iteration 3 : recall score = 0.6233766233766234
Iteration 4 : recall score = 0.5131578947368421
Iteration 5 : recall score = 0.6567164179104478
Mean recall score 0.595567749928797
-----------------------------
C_parameter: 0.1
-----------------------------
Iteration 1 : recall score = 0.5857142857142857
Iteration 2 : recall score = 0.5714285714285714
Iteration 3 : recall score = 0.618421052631579
Iteration 4 : recall score = 0.6142857142857143
Iteration 5 : recall score = 0.6363636363636364
Mean recall score 0.6052426520847572
-----------------------------
C_parameter: 1
-----------------------------
Iteration 1 : recall score = 0.6666666666666666
Iteration 2 : recall score = 0.6052631578947368
Iteration 3 : recall score = 0.603448275862069
Iteration 4 : recall score = 0.5875
Iteration 5 : recall score = 0.6
Mean recall score 0.6125756200846946
-----------------------------
C_parameter: 10
-----------------------------
Iteration 1 : recall score = 0.5205479452054794
Iteration 2 : recall score = 0.6301369863013698
Iteration 3 : recall score = 0.6764705882352942
Iteration 4 : recall score = 0.5254237288135594
Iteration 5 : recall score = 0.6805555555555556
Mean recall score 0.6066269608222516
-----------------------------
C_parameter: 100
-----------------------------
Iteration 1 : recall score = 0.5526315789473685
Iteration 2 : recall score = 0.5972222222222222
Iteration 3 : recall score = 0.6507936507936508
Iteration 4 : recall score = 0.6363636363636364
Iteration 5 : recall score = 0.5964912280701754
Mean recall score 0.6067004632794106
*******************************************************************
Best model to choose from cross validation is with C parameter = 1.0
*******************************************************************
# Train on the training split of the whole data set (no resampling) and
# evaluate on its test split.
lr = LogisticRegression(C=best_c, penalty='l2')
lr.fit(X_train, y_train.values.ravel())
# NOTE(review): despite its name, this holds predictions for the full test split.
y_pred_undersample = lr.predict(X_test.values)

# Confusion matrix of true labels against predictions.
cnf_matrix = confusion_matrix(y_test, y_pred_undersample)
np.set_printoptions(precision=2)

# Recall = TP / (FN + TP), both taken from the class-1 row of the matrix.
positive_row = cnf_matrix[1]
print("Recall metric in the testing dataset:", positive_row[1] / (positive_row[0] + positive_row[1]))

# Plot the non-normalised confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
# See how the decision threshold on the predicted fraud probability changes
# recall, using a model trained on the under-sampled split.
lr = LogisticRegression(C=0.01, penalty='l2')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
class_names = [0, 1]
np.set_printoptions(precision=2)

plt.figure(figsize=(10, 10))
# One subplot per threshold on a 3x3 grid; subplot positions start at 1.
for j, i in enumerate(thresholds, start=1):
    # Flag a row as fraud when its class-1 probability reaches the threshold.
    # ">=" matches the "threshold >= ..." subplot titles (the code used ">").
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] >= i

    plt.subplot(3, 3, j)

    # Confusion matrix and recall for this threshold.
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    print("Recall metric in the testing dataset:",cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

    # Plot the non-normalised confusion matrix.
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='threshold >= %s' %i)
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
# Over-sampling with SMOTE, starting again from the raw CSV.
credit_cards = pd.read_csv('creditcard.csv')
columns = credit_cards.columns

# The label ('Class') is the last column; dropping it leaves the feature columns.
features_columns = columns.delete(-1)
features = credit_cards[features_columns]
labels = credit_cards['Class']

# Hold out 20% of the rows for testing.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)

# Resample the training split only; the test split is left as it is.
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)

# Number of class-1 rows after resampling.
len(os_labels[os_labels == 1])
227454
# printing_Kfold_scores selects rows with .iloc[rows, :], so both inputs are
# wrapped as DataFrames before the cross-validation on the over-sampled data.
os_features=pd.DataFrame(os_features)
os_labels=pd.DataFrame(os_labels)
best_c=printing_Kfold_scores(os_features,os_labels)
-----------------------------
C_parameter: 0.01
-----------------------------
Iteration 1 : recall score = 0.9630360864319404
Iteration 2 : recall score = 0.9624495525530795
Iteration 3 : recall score = 0.9630808402291534
Iteration 4 : recall score = 0.9605958142721386
Iteration 5 : recall score = 0.9627658404627328
Mean recall score 0.962385626789809
-----------------------------
C_parameter: 0.1
-----------------------------
Iteration 1 : recall score = 0.9628027108766198
Iteration 2 : recall score = 0.9434925103930669
Iteration 3 : recall score = 0.9623599439775911
Iteration 4 : recall score = 0.9626233977530066
Iteration 5 : recall score = 0.9626480346274691
Mean recall score 0.9587853195255507
-----------------------------
C_parameter: 1
-----------------------------
Iteration 1 : recall score = 0.9628398458998348
Iteration 2 : recall score = 0.9429591970898891
Iteration 3 : recall score = 0.962535198873636
Iteration 4 : recall score = 0.9628606167790428
Iteration 5 : recall score = 0.963679327848766
Mean recall score 0.9589748372982336
-----------------------------
C_parameter: 10
-----------------------------
Iteration 1 : recall score = 0.942792664490261
Iteration 2 : recall score = 0.9436078604548466
Iteration 3 : recall score = 0.9633548160042232
Iteration 4 : recall score = 0.9611077084609327
Iteration 5 : recall score = 0.9623544460127029
Mean recall score 0.9546434990845933
-----------------------------
C_parameter: 100
-----------------------------
Iteration 1 : recall score = 0.9624321294321954
Iteration 2 : recall score = 0.9624070050694582
Iteration 3 : recall score = 0.96179676318397
Iteration 4 : recall score = 0.9640471729306734
Iteration 5 : recall score = 0.9624035426166258
Mean recall score 0.9626173226465845
*******************************************************************
Best model to choose from cross validation is with C parameter = 100.0
*******************************************************************
# Train on the SMOTE-resampled training data with the selected C and
# evaluate on the untouched test split.
lr = LogisticRegression(C=best_c, penalty='l2')
lr.fit(os_features, os_labels.values.ravel())
y_pred = lr.predict(features_test.values)

# Confusion matrix of true labels against predictions.
cnf_matrix = confusion_matrix(labels_test, y_pred)
np.set_printoptions(precision=2)

# Recall = TP / (FN + TP), both taken from the class-1 row of the matrix.
missed, caught = cnf_matrix[1, 0], cnf_matrix[1, 1]
print("Recall metric in the testing dataset: ", caught / (missed + caught))

# Plot the non-normalised confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
#原文链接:https://blog.csdn.net/weixin_44507435/article/details/104944917
# Bar charts and stacked bar charts, one per subplot row.
fig, axes = plt.subplots(4, 1, figsize=(10, 10))
s = pd.Series(np.random.randint(0, 10, 16), index=list('abcdefghijklmnop'))
df = pd.DataFrame(np.random.rand(10, 3), columns=['a', 'b', 'c'])

# Single-series bar chart; `ax` chooses the subplot to draw into.
s.plot.bar(ax=axes[0], color='k', alpha=0.5, grid=True)

# Multi-series bar chart.
df.plot.bar(ax=axes[1], colormap='Reds_r', grid=True)

# Multi-series stacked bar chart (stacked=True).
df.plot.bar(ax=axes[2], colormap='Blues_r', stacked=True, grid=True)

# Horizontal stacked bar chart.
df.plot(kind='barh', ax=axes[3], colormap='BuGn_r', stacked=True, grid=True)
#原文链接:https://blog.csdn.net/weixin_44507435/article/details/104944917
# Mirrored bar chart: y1 is drawn above the x axis, y2 below it.
plt.figure(figsize=(10, 4))
x = np.arange(10)
y1 = np.random.rand(10)
y2 = -np.random.rand(10)
plt.bar(x, y1, width=1, facecolor='yellowgreen', edgecolor='white', yerr=y1 * 0.1)
# y2 is negative, but error-bar sizes must be non-negative (current matplotlib
# rejects negative yerr), so the sign is flipped for the error magnitude.
plt.bar(x, y2, width=1, facecolor='lightskyblue', edgecolor='white', yerr=-y2 * 0.1)
# plt.bar arguments used above:
#   x, y       - bar positions and heights
#   width      - bar width
#   facecolor  - fill colour of the bars; edgecolor - colour of their border
#   yerr       - size of the error bars in the y direction (xerr for x)
# `bottom` sets the y coordinate each bar starts from; varying it turns the
# bars into a Gantt-style chart.
# NOTE(review): the original note described a `left` argument and an
# align='left' default; current matplotlib documents `x` and
# align='center' / 'edge' instead - confirm against the installed version.

# Label every bar with its magnitude.
for x_pos, height in zip(x, y1):
    plt.text(x_pos + 0.3, height - 0.15, '%.2f' % height, color='white')
for x_pos, height in zip(x, y2):
    plt.text(x_pos + 0.3, height + 0.05, '%.2f' % -height, color='white')
# zip() pairs up the corresponding elements of its arguments and, in
# Python 3, returns an iterator of tuples.
#原文链接:https://blog.csdn.net/weixin_44507435/article/details/104944917
# Signature of plt.table, for reference:
# table(cellText=None, cellColours=None, cellLoc='right', colWidths=None,
#       rowLabels=None, rowColours=None, rowLoc='left', colLabels=None,
#       colColours=None, colLoc='center', loc='bottom', bbox=None)
data = [
    [66386, 174296, 75131, 577908, 32015],
    [58230, 381139, 78045, 99308, 160454],
    [89135, 80552, 152558, 497981, 603535],
    [78415, 81858, 150656, 193263, 69638],
    [139361, 331509, 343164, 781380, 52269],
]
columns = ('Freeze', 'Wind', 'Flood', 'Quake', 'Hail')
rows = ['%d year' % x for x in (100, 50, 20, 10, 5)]

# The same labels serve as the DataFrame's columns and index.
df = pd.DataFrame(data, columns=columns, index=rows)
print(df)

# Stacked bar chart of the table values.
df.plot(kind='bar', grid=True, colormap='Blues_r', stacked=True, figsize=(8, 3))

# Colour ramps for the row and column headers (BuPu / Reds can be swapped
# for any other colormap); [::-1] reverses the ramp.
row_colours = plt.cm.BuPu(np.linspace(0, 0.5, 5))[::-1]
col_colours = plt.cm.Reds(np.linspace(0, 0.5, 5))[::-1]

plt.table(
    cellText=data,          # text shown in the cells
    cellLoc='center',       # alignment of the text inside each cell
    cellColours=None,
    rowLabels=rows,         # row header labels
    rowColours=row_colours,
    colLabels=columns,      # column header labels
    colColours=col_colours,
    rowLoc='right',         # alignment of the row header text
    loc='bottom',           # table position: left, right, top or bottom
)

# Hide the x-axis tick labels.
plt.xticks([])
认真是一种态度更是一种责任