import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix,recall_score,classification_report
# Under-sample the data: keep every fraud row (Class == 1) and draw an equal
# number of normal rows (Class == 0) so both classes end up the same size.
# NOTE(review): `data` is expected to be a DataFrame with a 'Class' column
# defined earlier in the file -- confirm.
X = data.loc[:, data.columns != 'Class']
# The boolean column mask keeps y as a one-column DataFrame rather than a Series.
y = data.loc[:, data.columns == 'Class']

number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
normal_indices = data[data.Class == 0].index

# Random selection without replacement, so no normal row is picked twice.
# np.random.choice already returns an ndarray, so no extra np.array() wrap is
# needed. The draw uses NumPy's global RNG and is not seeded here, so the
# sample varies between runs unless the RNG is seeded elsewhere.
random_normal_indices = np.random.choice(
    normal_indices, number_records_fraud, replace=False
)

# Merge the two sets of index labels together.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# These values are index *labels* (taken from `.index`), so select with .loc.
# .iloc would interpret them as positions, which is only equivalent when the
# DataFrame still has its default 0..n-1 index.
under_sample_data = data.loc[under_sample_indices, :]

X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Report the class balance of the resampled data.
print("Percentage of normal transactions: ",
      len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print("Percentage of fraud transactions: ",
      len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
from sklearn.model_selection import train_test_split # 划分数据集
# Split the full dataset into train/test sets: test_size is the split ratio
# (30% held out for testing) and random_state fixes the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# These prints were previously fused onto the end of a trailing comment and
# therefore never executed; they are restored as real statements.
print("Number transactions train dataset: ", len(X_train))
print("Number transcations test dataset: ", len(X_test))
print("Total number of transcations: ", len(X_train) + len(X_test))
# Split the under-sampled dataset with the same ratio (30% test) and the same
# seed as the full-dataset split.
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0
)

# Blank line to separate this report from any preceding output.
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transcations test dataset: ", len(X_test_undersample))
print("Total number of transcations: ", len(X_train_undersample) + len(X_test_undersample))
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification