#没学会的东西都是沉没成本
#不及时复习的后果就是浪费更多的时间
# 12月看的 2月又来反思了!
# 参考: https://blog.csdn.net/weixin_42108215/article/details/80721944
# 参考: https://blog.csdn.net/perfect1t/article/details/83002858
# 1. 查看数据特征 —— 一般我们认为欺诈 (Class == 1) 是少数类
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Count how many samples each class has; fraud (Class == 1) should be the
# minority. NOTE(review): `data` must already be loaded (e.g. via
# pd.read_csv) before this point — the loading step is not shown in these notes.
# `value_counts` is a Series method (there is no `pd.values_counts`), and the
# result must go into `count_class`, which is what gets plotted below.
count_class = data['Class'].value_counts(sort=True).sort_index()
count_class.plot(kind='bar')
plt.title('Fraud class histogram')
plt.xlabel('Class')
plt.ylabel('Frequency')
# 2. 数据的归一化处理 (standardize the Amount column)
from sklearn.preprocessing import StandardScaler  # module is sklearn.preprocessing, not sklearn_preprocessing

# Standardize Amount to zero mean / unit variance. The scaler expects a 2-D
# array of shape (n_samples, 1); `Series.reshape` was removed from pandas, so
# go through `.values` first. -1 lets numpy infer the row count.
data['normAmount'] = StandardScaler().fit_transform(
    data['Amount'].values.reshape(-1, 1))
# Drop the raw columns that the model should not see directly.
data = data.drop(['Time', 'Amount'], axis=1)
data.head()  # head is a method — it must be called to show the preview
# 3. 下采样 (under-sampling the majority class)
#下采样
# https://blog.csdn.net/qq1483661204/article/details/77587881
# Split features / label. `.ix` was removed from pandas; `.loc` with a boolean
# column mask is the modern equivalent.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Number of data points in the minority (fraud) class and their row indices.
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)

# Indices of the normal (majority) class — this was never defined in the notes.
normal_indices = data[data.Class == 0].index

# Randomly pick exactly as many normal rows as there are fraud rows,
# without replacement, so the resampled set is balanced 50/50.
random_normal_indices = np.random.choice(
    normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)

# Append the two index arrays (fraud + sampled normal), not normal twice.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# Build the under-sampled dataset and split it into X / y.
under_sample_data = data.iloc[under_sample_indices, :]
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Show the class ratio — filter on the Class column, not on the feature frame.
print("percentage of normal transactions:",
      len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print("percentage of fraud transactions:",
      len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print("Total number of transactions in resampled data:", len(under_sample_data))
# 4. 划分训练集和测试集 (train / test split)
# `sklearn.cross_validation` was removed in sklearn 0.20 (and "slearn" is a
# typo); train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Whole (imbalanced) dataset: 70% train / 30% test, fixed seed for
# reproducibility. X and y come from the undersampling step above.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
print("Number transactions train dataset:", len(X_train))
print("Number transactions test dataset:", len(X_test))
print("Total number of transactions:", len(X_train) + len(X_test))

# Under-sampled dataset: same split ratio and seed, so results are comparable
# with the original data. Variable casing unified to X_* throughout.
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = \
    train_test_split(X_undersample, y_undersample,
                     test_size=0.3, random_state=0)
print("*************")
print('number transactions train dataset:', len(X_train_undersample))
print('number transactions test dataset:', len(X_test_undersample))
print('total number of transactions:',
      len(X_train_undersample) + len(X_test_undersample))
# 5. 开始验证 (cross-validated model evaluation)
from sklearn.linear_model import LogisticRegression
# KFold and cross_val_score moved to sklearn.model_selection when
# sklearn.cross_validation was removed (0.20); the class is named KFold
# (not KFolds) and the function cross_val_score (not cross_value_score).
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report
'''交叉验证
KFold (新版 sklearn 在 sklearn.model_selection 中):
fold = KFold(n_splits=5, shuffle=False)  # n_splits: 分成几份(默认5); shuffle: 每次验证前是否洗牌(默认False)
fold.split(X) 返回一个可迭代对象,可以用枚举的方法获取其索引和值
for iteration, indices in enumerate(fold.split(x_train_data), start=1):
    pass
这里的 iteration 是第几次交叉验证的次数
indices 是该次交叉验证中数据集被划分成的两部分索引 (训练集索引, 验证集索引)'''
def printing_Kfold_scores(x_train_data,y_train_data):