信用卡欺诈预测（下采样）

糖

已于 2022-02-25 17:03:14 修改

阅读量168

点赞数 1

文章标签： python、机器学习、数据挖掘

于 2021-03-24 09:58:44 首次发布

本文链接：https://blog.csdn.net/weixin_48252774/article/details/115162441

版权

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix,recall_score,classification_report

# Inspect how many rows fall into each value of the `Class` column.
# NOTE(review): `data` is never assigned anywhere in this listing; it is
# presumably a DataFrame with `Class`, `Time` and `Amount` columns loaded
# before this point — confirm and add the load step.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
count_classes = data['Class'].value_counts()
count_classes

# Visualise the class distribution as a bar chart.
count_classes.plot(kind='bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")

# Standardise the raw `Amount` and `Time` columns into new `normAmount` /
# `normTime` columns, then drop the unscaled originals.
from sklearn.preprocessing import StandardScaler

for raw_col, scaled_col in (('Amount', 'normAmount'), ('Time', 'normTime')):
    # fit_transform is given a 2-D single-column array, hence the reshape.
    column_2d = data[raw_col].values.reshape(-1, 1)
    data[scaled_col] = StandardScaler().fit_transform(column_2d)

data = data.drop(columns=['Time', 'Amount'])
data.head()

# Under-sampling: build a balanced data set made of every Class == 1 (fraud)
# row plus an equally sized random sample of Class == 0 (normal) rows.

# Features / target of the full data set. The target is selected with a
# column mask, so it stays a DataFrame rather than a Series.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Index labels of the fraud rows, and how many there are.
fraud_indices = np.array(data[data.Class == 1].index)
number_records_fraud = len(fraud_indices)

# Index labels of the normal rows.
normal_indices = data[data.Class == 0].index

# Draw as many normal rows as there are fraud rows, without replacement.
# NOTE(review): the draw is unseeded, so every run selects different rows.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)

# Fraud labels first, then the sampled normal labels.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# These are index *labels* (taken from `.index`), so select with .loc.
# .iloc would interpret them as positions, which only returns the intended
# rows when the index happens to be 0..n-1.
under_sample_data = data.loc[under_sample_indices, :]

X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

total_undersampled = len(under_sample_data)
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/total_undersampled)
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/total_undersampled)
print("Total number of transactions in resampled data: ", total_undersampled)

from sklearn.model_selection import train_test_split    # 划分数据集


# Split the full data set 70/30 with a fixed seed. X_test / y_test is the
# hold-out set the models further down are scored on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train) + len(X_test))

# Split the under-sampled data along the SAME boundary as the full split.
# An independent train_test_split here would place rows in the under-sampled
# training set that are also part of X_test, leaking hold-out rows into
# training and inflating every score computed on X_test.
# NOTE(review): this relies on index labels identifying rows uniquely.
in_full_train = X_undersample.index.isin(X_train.index)

# Shuffle the training part with a fixed seed: the under-sampled frame lists
# all fraud rows before the normal rows. Using the same seed on frames of
# equal length applies the same permutation to features and target.
X_train_undersample = X_undersample[in_full_train].sample(frac=1, random_state=0)
y_train_undersample = y_undersample[in_full_train].sample(frac=1, random_state=0)
X_test_undersample = X_undersample[~in_full_train]
y_test_undersample = y_undersample[~in_full_train]

print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample) + len(X_test_undersample))

import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification

# Recursive feature elimination with cross-validation (RFECV), using a
# linear-kernel SVM as the estimator.
svc = SVC(kernel="linear")

# 2-fold cross-validation, scored by accuracy.
rfecv = RFECV(estimator=svc, cv=2, scoring='accuracy')
# The target is a one-column DataFrame; hand scikit-learn a flat 1-D array so
# it does not have to convert a column vector (DataConversionWarning).
rfecv.fit(X_train_undersample, y_train_undersample.values.ravel())

# `grid_scores_` was removed in scikit-learn 1.2; the mean CV score for each
# feature count is now in `cv_results_`. Fall back to the old attribute on
# releases that do not have `cv_results_` yet.
if hasattr(rfecv, "cv_results_"):
    rfecv_scores = rfecv.cv_results_["mean_test_score"]
else:
    rfecv_scores = rfecv.grid_scores_

print("RFEC挑选了几个特征 : %d" % rfecv.n_features_)
print("是否应该选择 %s" % rfecv.support_)
print("得分 %s" % rfecv_scores)

# Plot the cross-validated score against the number of selected features.

# `grid_scores_` was removed in scikit-learn 1.2; read the mean CV scores from
# `cv_results_` when it exists and fall back to the old attribute otherwise.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_
n_candidates = len(cv_scores)

plt.figure(figsize=(10, 8))
plt.xlabel("Number of features selected")
plt.ylabel("Score")
plt.plot(range(1, n_candidates + 1), cv_scores, '-o')
# Mark the feature count RFECV actually selected rather than a hard-coded 22:
# the under-sample is drawn without a seed, so the optimum can move per run.
plt.axvline(rfecv.n_features_, ls='--', color='k')
plt.grid()
# Ticks every 5 features across the whole evaluated range.
plt.xticks(range(0, n_candidates + 1, 5))

plt.title('RFECV for SVC')
plt.show()

# Keep only the columns RFECV selected, in both the under-sampled training
# features and the full hold-out features.
X_train_undersample = X_train_undersample.loc[:, rfecv.support_]
X_test = X_test.loc[:, rfecv.support_]

# Baseline random forest: default hyper-parameters, fixed seed.
rf0 = RandomForestClassifier(random_state=666)
# Flatten the one-column DataFrame target into the 1-D array fit() expects.
rf0.fit(X_train_undersample, y_train_undersample.values.ravel())
# Second probability column, i.e. the Class == 1 (fraud) probability.
y_predprob = rf0.predict_proba(X_test)[:, 1]
# Scored on X_test / y_test, so the label says "Test" (it used to say "Train").
print("AUC Score (Test): %f" % roc_auc_score(y_test, y_predprob))

"""
gridsearchcv选参数
"""
# Search 1: bootstrap sampling on vs. off (5-fold CV, scored by ROC AUC).
param_test0 = {'bootstrap': [True, False]}
gsearch0 = GridSearchCV(estimator=RandomForestClassifier(random_state=666),
                        param_grid=param_test0, scoring='roc_auc', cv=5)
# Flatten the one-column DataFrame target into the 1-D array fit() expects.
gsearch0.fit(X_train_undersample, y_train_undersample.values.ravel())
gsearch0.cv_results_, gsearch0.best_params_, gsearch0.best_score_

# Search 2: number of trees (5-fold CV, scored by ROC AUC).
# The original range(50, 100, 200) has a step larger than its span, so it
# yielded the single value 50 and the search compared nothing.
# NOTE(review): 50..200 in steps of 50 is a guess at the intended grid — confirm.
param_test1 = {'n_estimators': range(50, 201, 50)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(oob_score=True, random_state=666),
                        param_grid=param_test1, scoring='roc_auc', cv=5)
gsearch1.fit(X_train_undersample, y_train_undersample.values.ravel())
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

# Final model: random forest with fixed hyper-parameters, trained on the
# under-sampled training set and used to predict the full hold-out set.
rf = RandomForestClassifier(
    oob_score=True, random_state=666, n_estimators=50, max_depth=8,
    max_features=2, min_samples_leaf=2, min_samples_split=8, max_leaf_nodes=6,
)
# Flatten the one-column DataFrame target into the 1-D array fit() expects.
rf.fit(X_train_undersample, y_train_undersample.values.ravel())
# Second probability column, i.e. the Class == 1 (fraud) probability.
y_predprob = rf.predict_proba(X_test)[:, 1]
y_pred = rf.predict(X_test)

# Scored on X_test / y_test, so the label says "Test" (it used to say "Train").
print("AUC Score (Test): %f" % roc_auc_score(y_test, y_predprob))

import itertools
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D array of cell values; it is indexed as ``cm[i, j]`` and must
            provide ``shape`` and ``max()``.
        classes: Labels used for the ticks on both axes; ``len(classes)``
            determines how many ticks are placed.
        title: Title of the plot.
        cmap: Colormap handed to ``plt.imshow``.

    Returns:
        None. Everything is drawn through the ``plt`` state machine.
    """
    plt.title(title)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.colorbar()

    # Cells above half the maximum get white text, the rest black text.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Row index i is the y position, column index j the x position.
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out only after the axis labels exist; calling tight_layout() before
    # they were set left them out of the computed layout.
    plt.tight_layout()

# Confusion matrix of the final predictions against the hold-out labels.
cnf_matrix = confusion_matrix(y_test, y_pred)

# Recall computed from the second row of the matrix:
# cell [1, 1] over the sum of cells [1, 0] and [1, 1].
row1_col0 = cnf_matrix[1, 0]
row1_col1 = cnf_matrix[1, 1]
print("Recall metric in the testing dataset: ", row1_col1 / (row1_col0 + row1_col1))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')

from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score

# Accuracy, recall, precision and F1 of y_pred against y_test, each computed
# once and expressed as a percentage.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
recall_pct = recall_score(y_test, y_pred) * 100
precision_pct = precision_score(y_test, y_pred) * 100
f1_pct = f1_score(y_test, y_pred) * 100

print('accuracy_score:', accuracy_pct, '%')
print('recall_score:', recall_pct, '%')
print('precision_score:', precision_pct, '%')
print('f1_score:', f1_pct, '%')

# Bare expression kept from the original listing: the same four percentages
# collected in a dict.
{'accuracy': accuracy_pct,
 'recall': recall_pct,
 'precision': precision_pct,
 'f1_score': f1_pct}

# One-row summary table; '%d%%' truncates each percentage to a whole number.
summary_row = {
    'Accuracy': '%d%%' % accuracy_pct,
    'Recall': '%d%%' % recall_pct,
    'Precision': '%d%%' % precision_pct,
    'F1_score': '%d%%' % f1_pct,
    'Normal/Fraud': '%d/%d' % (count_classes[0], count_classes[1]),
    'Total transactions': len(data),
}
rfcdf = pd.DataFrame(summary_row, index=['Data'])
rfcdf.style.set_table_attributes('style="font-size:20px"')

from sklearn.model_selection import learning_curve, validation_curve

# Learning curve of the final forest with 5-fold cross-validation.
train_sizes, train_scores, valid_scores = learning_curve(
    rf, X_train_undersample, y_train_undersample, cv=5)

# Training scores: mean along axis 1, with a band of half a standard
# deviation either side, clipped to [0, 1].
train_half_std = np.std(train_scores, 1) / 2
mean_train = np.mean(train_scores, 1)
upper_train = np.clip(mean_train + train_half_std, 0, 1)
lower_train = np.clip(mean_train - train_half_std, 0, 1)

# Validation scores: same mean and band.
valid_half_std = np.std(valid_scores, 1) / 2
mean_test = np.mean(valid_scores, 1)
upper_test = np.clip(mean_test + valid_half_std, 0, 1)
lower_test = np.clip(mean_test - valid_half_std, 0, 1)

plt.figure(figsize=(8, 6))

# Mean curves first (red = train, green = test) ...
for curve, line_fmt, curve_label in ((mean_train, 'ro-', 'train'),
                                     (mean_test, 'go-', 'test')):
    plt.plot(train_sizes, curve, line_fmt, label=curve_label)

# ... then the shaded bands; alpha is the opacity of the filled area in
# [0, 1], larger meaning more opaque.
for band_top, band_bottom, band_colour in ((upper_train, lower_train, 'r'),
                                           (upper_test, lower_test, 'g')):
    plt.fill_between(train_sizes, band_top, band_bottom, alpha=0.2, color=band_colour)

plt.grid()
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy Score')
plt.legend(loc='best')
plt.title('Learning curve')
plt.show()

糖

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
信用卡欺诈预测（下采样）

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix,recall_score,classification_report
复制链接

扫一扫