交易数据异常检测—机器学习实战

GitHub: https://github.com/yjfiejd/transaction_data_Anomaly_Detection (可下载资料)
# coding: utf-8

# In[2]:


import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:


# Directory that holds creditcard.csv on the original author's machine.
# Only switch into it when it exists: on any other machine the notebook is
# expected to be started from the directory containing the data file, and an
# unconditional chdir would abort with FileNotFoundError before loading.
_data_dir = '/Users/a1/Downloads/百度云盘/机器学习算法配套案例实战/逻辑回归-信用卡欺诈检测'
if os.path.isdir(_data_dir):
    os.chdir(_data_dir)


# In[4]:


# Load the transaction table from the working directory and preview its
# first rows.
data = pd.read_csv("creditcard.csv")
data.head()


# In[5]:


#data.describe


# In[6]:


# Count the rows of each class to see how imbalanced the labels are.
# Series.value_counts() is used because the top-level pd.value_counts() is
# deprecated in recent pandas; sort_index() orders the counts by class label,
# ascending by default (pass ascending=False for descending order).
count_class = data['Class'].value_counts().sort_index()
count_class


# In[7]:


# Check the type of count_class: the counts of a single DataFrame column come
# back as a pandas Series.
type(count_class)


# In[8]:


# Draw the class counts as a bar chart through the pandas plotting accessor;
# the title is passed to the plot call, the axis labels are set on the
# current matplotlib axes afterwards.
count_class.plot.bar(title='Fraud class histogram')
plt.xlabel('Class')
plt.ylabel('Frequency')


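# ## Data preprocessing
# ### 1) The counts of normal samples (class 0) and fraud samples (class 1) are shown in the bar chart above. The classes are imbalanced - what can be done about it?
#     
#     # Option 1: undersampling - reduce the class-0 samples until there are as few of them as class-1 samples
#     # Option 2: oversampling - increase the class-1 samples until there are as many of them as class-0 samples
#     # Try both and see which one works better.

# ### 2) Some features have much larger values than the others, e.g. the raw Amount column (rescaled below into normAmount); make sure the features end up on comparable scales
#     # normalization
#     # standardization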

# In[9]:


# Pre-processing: standardise the transaction amount.
from sklearn.preprocessing import StandardScaler

# The scaler needs a 2-D input, so 'Amount' is selected as a one-column frame
# (equivalent to reshaping the column values to (-1, 1)); the scaled values
# are stored in the new 'normAmount' column.
data['normAmount'] = StandardScaler().fit_transform(data[['Amount']].values)
# Remove the columns that are no longer used as features (axis=1 = columns).
discarded_columns = ['Time', 'Amount']
data = data.drop(discarded_columns, axis=1)
data.head()


# ### 1.1下采样策略 : 使得0和1的样本一样少

# In[10]:


# Build the feature frame (every column except 'Class') and the label frame
# (only the 'Class' column, kept as a one-column DataFrame).
X = data.iloc[:, data.columns != 'Class']
Y = data.iloc[:, data.columns == 'Class']

# Row labels of all fraud samples (Class == 1, the minority class) and how
# many of them there are.
fraud_indices = np.array(data[data.Class == 1].index)
number_records_fraud = len(fraud_indices)
# Row labels of all normal samples (Class == 0, the majority class).
normal_indices = np.array(data[data.Class == 0].index)
# Randomly draw, without replacement, as many normal samples as there are
# fraud samples. np.random.choice already returns an ndarray, so no extra
# conversion is needed.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)

# Concatenate both label sets into the index of the balanced subset.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# The values above are row *labels* (taken from .index), so select with the
# label-based .loc. Positional .iloc only gives the same rows while the frame
# still has its default 0..n-1 index.
under_sample_data = data.loc[under_sample_indices, :]

# Feature / label frames of the undersampled subset.
X_undersample = under_sample_data.iloc[:, under_sample_data.columns != 'Class']
Y_undersample = under_sample_data.iloc[:, under_sample_data.columns == 'Class']

# Report the class shares to confirm that the subset is balanced. The second
# line reports the Class == 1 share, so it is labelled as fraud.
print("Percentage of normal transction:", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transction:", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of new dataset", len(under_sample_data))
fraud_indices  # notebook display: row labels of all Class == 1 samples


# ### 交叉验证
#     #先洗牌
#     #再切分

# In[11]:


# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Split the complete, original dataset into 70% train / 30% test.
# API reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)

print("**********我是分割线**********")
print("Number transaction train dataset:", len(X_train))
print("Number transaction test dataset:", len(X_test))
print("Total number of transaction:", len(X_train)+len(X_test))


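# #### Why is the original dataset split as well? → Its test split is used later to evaluate the model
#     # Answer: the split of the undersampled data is only used to find a suitable model. For the real evaluation the model has to be run on the test split of the original data, because the undersampled test split is (1) small and (2) not guaranteed to follow the same distribution as the original dataset.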

# In[13]:


# Split the undersampled dataset with the same 70/30 ratio and seed that were
# used for the original data.
(X_train_undersample, X_test_undersample,
 y_train_undersample, y_test_undersample) = train_test_split(
    X_undersample,
    Y_undersample,
    test_size=0.3,
    random_state=0,
)

# Report the sizes of the resulting partitions.
print("**********我是分割线**********")
print("Number transaction train dataset:", len(X_train_undersample))
print("Number transaction test dataset:", len(X_test_undersample))
print("Total number of transaction:", len(X_test_undersample) + len(X_train_undersample))


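# ### Modelling - logistic regression
#     # The model must not be judged by accuracy alone, especially when the classes are imbalanced
#     # Recall is used as the evaluation criterion for the model
#     # Recall = TP / (TP + FN)   (true positives / (true positives + false negatives))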

# In[14]:


#导入机器学习建模库,逻辑回归; 交叉验证(几份)&结果;混淆矩阵
# Modelling imports: logistic regression, K-fold cross-validation helpers and
# the evaluation metrics. KFold / cross_val_score come from
# sklearn.model_selection because sklearn.cross_validation was removed in
# scikit-learn 0.20.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report


# In[53]:


def printing_Kfold_scores(x_train_data, y_train_data):
    """Choose the logistic-regression C parameter by 5-fold cross-validated recall.

    For every candidate C an L1-penalised LogisticRegression is fitted on
    four folds and scored with recall on the remaining fold; per-fold and
    mean recall are printed along the way.

    Args:
        x_train_data: pandas DataFrame of features (rows are selected by
            position with ``.iloc``).
        y_train_data: single-column pandas DataFrame of labels aligned
            row-by-row with ``x_train_data``.

    Returns:
        The candidate C with the highest mean recall over the folds; on a
        tie the first (smallest) candidate wins.
    """
    # Unshuffled 5-fold splitter; split() yields positional train/validation
    # index arrays for each fold.
    fold = KFold(n_splits=5, shuffle=False)

    # Candidate values for C. In scikit-learn C is the *inverse* of the
    # regularisation strength, so smaller values mean a stronger penalty.
    # The penalty keeps the weights small to reduce over-fitting (L1 adds
    # the sum of |w| to the loss, L2 adds 1/2 * w**2).
    c_param_range = [0.01, 0.1, 1, 10, 100]

    mean_recalls = []
    for c_param in c_param_range:
        print("---------------------------------")
        print('C parameter:', c_param)
        print("---------------------------------")
        print('')

        recall_accs = []
        # Each iteration trains on a different set of four folds and
        # validates on the fold that was left out.
        for iteration, (train_idx, valid_idx) in enumerate(fold.split(x_train_data), start=1):
            # liblinear is requested explicitly: the L1 penalty is not
            # supported by the default solver of recent scikit-learn.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')

            # Fit on the training folds; labels are flattened to 1-D.
            lr.fit(x_train_data.iloc[train_idx, :],
                   y_train_data.iloc[train_idx, :].values.ravel())

            # Predict on the held-out fold. The DataFrame is passed as-is so
            # the feature names match those seen during fit.
            y_pred_undersample = lr.predict(x_train_data.iloc[valid_idx, :])

            # Recall on the held-out fold.
            recall_acc = recall_score(y_train_data.iloc[valid_idx, :].values.ravel(),
                                      y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration', iteration, ':recall score = ', recall_acc)

        # Mean recall over the folds for this C.
        mean_recall = np.mean(recall_accs)
        mean_recalls.append(mean_recall)
        print('')
        print('mean recall score', mean_recall)
        print('')

    # Building the table from complete lists gives numeric columns directly,
    # so idxmax() works without converting an object column first.
    results_table = pd.DataFrame({
        'C_parameter': c_param_range,
        'Mean recall score': mean_recalls,
    })

    # Row with the highest mean recall -> its C value.
    best_c = results_table.loc[results_table['Mean recall score'].idxmax(), 'C_parameter']

    # Finally, report which C parameter is the best amongst the candidates.
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')

    return best_c


# In[54]:


# Run the cross-validated search for C on the undersampled training split and
# keep the selected value for the models fitted below.
best_c = printing_Kfold_scores(X_train_undersample, y_train_undersample)


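# ### Confusion matrix
#     # It holds the predicted and the true labels, from which metrics can be computed: recall = TP / (TP + FN), accuracy = (TP + TN) / (TP + FN + TN + FP)
#     # With undersampling the recall meets the requirement, but when the model is applied to the whole dataset it flags too many normal transactions by mistake (false positives), so accuracy drops. How can that be solved?
#     # Is oversampling worth a try?
#     # And how well does the model do if nothing is resampled and the raw data is used?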

# In[55]:


# Confusion matrix plot
# Official plotting example:
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D array of counts; rows are labelled as true classes and
            columns as predicted classes.
        classes: tick labels, used for both axes.
        title: title placed above the plot.
        cmap: matplotlib colormap used for the cells.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum count get white text, the others black.
    thresh = cm.max() / 2.
    # np.ndindex walks every (row, column) pair in row-major order; it is
    # used instead of itertools.product so the function does not depend on
    # an `import itertools` executed in a later notebook cell.
    for i, j in np.ndindex(cm.shape[0], cm.shape[1]):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# In[56]:


import itertools

# Fit on the undersampled training split with the cross-validated C.
# solver='liblinear' is given explicitly because the L1 penalty is not
# supported by the default solver of recent scikit-learn releases.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# Predict on the DataFrame itself (not .values) so the features carry the
# same column names the model was fitted with.
y_pred_undersample = lr.predict(X_test_undersample)

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
np.set_printoptions(precision=2)

# Recall = TP / (FN + TP), read from the row of the true positive class.
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Confusion matrix')
plt.show()


# In[57]:


# Train on the undersampled training split, then evaluate on the test split
# of the complete, original dataset.
# solver='liblinear' is given explicitly because the L1 penalty is not
# supported by the default solver of recent scikit-learn releases.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# Predict on the DataFrame itself (not .values) so the features carry the
# same column names the model was fitted with.
y_pred = lr.predict(X_test)

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Recall = TP / (FN + TP), read from the row of the true positive class.
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Confusion matrix')
plt.show()


# In[58]:


# Repeat the cross-validated search for C on the training split of the
# original (not undersampled) data; this overwrites the earlier best_c.
best_c = printing_Kfold_scores(X_train,y_train)


# In[59]:


# Fit once with a fixed C on the undersampled training split and get class
# probabilities for the undersampled test split.
# solver='liblinear' is given explicitly because the L1 penalty is not
# supported by the default solver of recent scikit-learn releases.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
# Predict on the DataFrame itself (not .values) so the features carry the
# same column names the model was fitted with.
y_pred_undersample_proba = lr.predict_proba(X_test_undersample)

# Decision thresholds to compare, one confusion matrix per threshold.
thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

plt.figure(figsize=(10,10))

# enumerate supplies the 1-based cell number of the 3x3 subplot grid.
for position, threshold in enumerate(thresholds, start=1):
    # A sample is predicted as class 1 when its class-1 probability reaches
    # the threshold; >= keeps the comparison consistent with the plot title.
    y_test_predictions_high_recall = y_pred_undersample_proba[:,1] >= threshold

    plt.subplot(3, 3, position)

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    np.set_printoptions(precision=2)

    # Recall = TP / (FN + TP), read from the row of the true positive class.
    print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

    # Plot non-normalized confusion matrix
    class_names = [0,1]
    plot_confusion_matrix(cnf_matrix
                          , classes=class_names
                          , title='Threshold >= %s'%threshold)


# In[60]:


import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值