Logistic Regression Case Study: Credit Card Fraud Detection
1. Principles of Logistic Regression
Preface
- Is logistic regression a regression algorithm? No. Despite its name, it is the classic binary classification algorithm.
- A rule of thumb for algorithm selection: try logistic regression before anything complex, and prefer the simplest model that works, since simpler models are easier to interpret.
- The decision boundary of logistic regression can be non-linear, for example when the inputs include polynomial features (see the sketch below).
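A minimal sketch of that last point with synthetic data and scikit-learn (everything here is illustrative, not code from this tutorial): expanding the inputs with degree-2 polynomial features lets a linear classifier learn a circular boundary.
```python
# Illustrative only: a circular class boundary becomes learnable once the
# inputs are expanded with degree-2 polynomial features.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.RandomState(0)
X = rng.randn(200, 2)
y = (X[:, 0] ** 2 + X[:, 1] ** 2 < 1).astype(int)  # label points inside the unit circle

clf = make_pipeline(PolynomialFeatures(degree=2), LogisticRegression())
clf.fit(X, y)
print(clf.score(X, y))  # near 1.0: the quadratic terms capture the circle
```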
Sigmoid function
The sigmoid maps any real-valued score into (0, 1), which is how a linear combination of features becomes a probability; it is defined and plotted in Section 2.
2. Implementing Logistic Regression in Python
Task: predict whether a student will be admitted to university from the scores of two exams.
Data: LogiReg_data.txt (extraction code: p51e)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
def readFile():
path = 'LogiReg_data.txt'
pdData = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
print(pdData.head())
return pdData
def plot(pdData):
    positive = pdData[pdData['Admitted'] == 1]  # rows where Admitted == 1, i.e. the positive examples
    negative = pdData[pdData['Admitted'] == 0]  # rows where Admitted == 0, i.e. the negative examples
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(positive['Exam 1'], positive['Exam 2'], s=30, c='b', marker='o', label='Admitted')
ax.scatter(negative['Exam 1'], negative['Exam 2'], s=30, c='r', marker='x', label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
    plt.show()  # fig.show() often displays nothing in a plain script; plt.show() blocks until closed
if __name__ == '__main__':
plot(readFile())
Reading the data and visualizing it gives a scatter of admitted vs. not-admitted students over the two exam scores.
The logistic regression algorithm
- Goal: solve for $\theta_0, \theta_1, \theta_2$ (a bias term plus one weight for each of the 2 features)
- Classification threshold: 0.5
Sigmoid function
$g(z) = \frac{1}{1+e^{-z}}$
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def plotSigmoid():
    nums = np.arange(-10, 10, step=1)  # 20 integer values from -10 to 9 (arange excludes the stop value)
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(nums, sigmoid(nums), 'r')
    plt.show()
- $g: \mathbb{R} \to (0, 1)$ (the open interval: the endpoints are approached but never reached)
- $g(0) = 0.5$
- $g(-\infty) = 0$
- $g(+\infty) = 1$
Model: $h_\theta(x)$
def model(X, theta):
return sigmoid(np.dot(X, theta.T))
$\begin{pmatrix}\theta_0 & \theta_1 & \theta_2\end{pmatrix} \times \begin{pmatrix}1 \\ x_1 \\ x_2\end{pmatrix} = \theta_0 + \theta_1 x_1 + \theta_2 x_2$
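A quick check of `model` with made-up inputs: an all-zero theta gives $z = 0$ for every row, so the output is 0.5 everywhere.
```python
# Illustrative check: zero weights -> z = 0 -> sigmoid(0) = 0.5 for every row.
X_demo = np.array([[1., 34., 78.],
                   [1., 89., 96.]])     # column of ones + two exam scores (made-up values)
print(model(X_demo, np.zeros([1, 3])))  # [[0.5], [0.5]]
```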
Preprocessing
def preprocess(pdData):
    # add a column of ones so theta_0 acts as the intercept; inserting the same
    # column twice raises an error, so guard against re-running this block
    if 'Ones' not in pdData.columns:
        pdData.insert(0, 'Ones', 1)
    # set X (training data) and y (target variable)
    orig_data = pdData.values  # as_matrix() was removed from pandas; .values is the replacement
    cols = orig_data.shape[1]
    X = orig_data[:, 0:cols - 1]
    y = orig_data[:, cols - 1:cols]
    theta = np.zeros([1, 3])  # initialize one parameter per column of X
    return X, y, theta
This adds the $x_0 = 1$ column (the bias term), converts the DataFrame to NumPy arrays, and initializes theta to zeros.
Loss function
Negating the log-likelihood turns maximization into minimization and gives the per-sample loss below: for $y = 1$ it reduces to $-\log(h_\theta(x))$, for $y = 0$ to $-\log(1-h_\theta(x))$, so confident wrong predictions are penalized heavily.
$D(h_\theta(x), y) = -y\log(h_\theta(x)) - (1-y)\log(1-h_\theta(x))$
Averaging over all $n$ samples:
$J(\theta) = \frac{1}{n}\sum_{i=1}^{n} D(h_\theta(x_i), y_i)$
def cost(X, y, theta):
left = np.multiply(-y, np.log(model(X, theta)))
right = np.multiply(1 - y, np.log(1 - model(X, theta)))
return np.sum(left - right) / (len(X))
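A quick sanity check before training (not in the original tutorial): with theta all zeros, every prediction is 0.5, so the loss must be $-\log(0.5) \approx 0.693$ regardless of the labels.
```python
# With an all-zero theta every prediction is 0.5, so the loss is -log(0.5),
# independent of the data; a handy check that cost() is wired up correctly.
X, y, theta = preprocess(readFile())
print(cost(X, y, theta))  # ≈ 0.6931471805599453
```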
Computing the gradient
$\frac{\partial J}{\partial \theta_j} = -\frac{1}{n}\sum_{i=1}^{n} \left(y_i - h_\theta(x_i)\right) x_{ij}$
def gradient(X, y, theta):
grad = np.zeros(theta.shape)
    error = (model(X, theta) - y).ravel()
    for j in range(len(theta.ravel())):  # for each parameter
term = np.multiply(error, X[:,j])
grad[0, j] = np.sum(term) / len(X)
return grad
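Not part of the original tutorial, but a useful habit: verify the analytic gradient against central finite differences of the cost. The helper below is a hypothetical sketch; the reported maximum difference should be tiny.
```python
# Hypothetical helper (not in the original): numerically verify gradient()
# with central finite differences on cost(). Call once before training, e.g.
# print(gradient_check(X, y, np.zeros([1, 3]))).
def gradient_check(X, y, theta, eps=1e-5):
    analytic = gradient(X, y, theta)
    numeric = np.zeros(theta.shape)
    for j in range(theta.shape[1]):
        t_plus, t_minus = theta.copy(), theta.copy()
        t_plus[0, j] += eps
        t_minus[0, j] -= eps
        numeric[0, j] = (cost(X, y, t_plus) - cost(X, y, t_minus)) / (2 * eps)
    return np.max(np.abs(analytic - numeric))  # should be ~1e-7 or smaller
```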
STOP_ITER = 0
STOP_COST = 1
STOP_GRAD = 2
def stopCriterion(stopType, value, threshold):
    # three different stopping strategies
    if stopType == STOP_ITER:   return value > threshold                       # fixed iteration budget
    elif stopType == STOP_COST: return abs(value[-1] - value[-2]) < threshold  # cost barely changes
    elif stopType == STOP_GRAD: return np.linalg.norm(value) < threshold       # gradient norm is small
# shuffle the data so batches are drawn in a random order (np.random is already available via the numpy import)
def shuffleData(data):
np.random.shuffle(data)
cols = data.shape[1]
X = data[:, 0:cols-1]
y = data[:, cols-1:]
return X, y
import time
def descent(data, theta, batchSize, stopType, thresh, alpha):
    # gradient descent solver; reads the module-level n (total sample count) set in __main__
    init_time = time.time()
    i = 0  # iteration counter
    k = 0  # position of the current batch within the data
    X, y = shuffleData(data)
    grad = np.zeros(theta.shape)  # most recent gradient
    costs = [cost(X, y, theta)]   # history of loss values
    while True:
        grad = gradient(X[k:k + batchSize], y[k:k + batchSize], theta)
        k += batchSize  # advance by one batch
        if k >= n:
            k = 0
            X, y = shuffleData(data)  # reshuffle after a full pass over the data
        theta = theta - alpha * grad  # parameter update
        costs.append(cost(X, y, theta))  # record the new loss
        i += 1
        if stopType == STOP_ITER:   value = i
        elif stopType == STOP_COST: value = costs
        elif stopType == STOP_GRAD: value = grad
        if stopCriterion(stopType, value, thresh): break
    return theta, i - 1, costs, grad, time.time() - init_time
def runExpe(data, theta, batchSize, stopType, thresh, alpha):
    theta, n_iter, costs, grad, dur = descent(data, theta, batchSize, stopType, thresh, alpha)
name = "Original" if (data[:,1]>2).sum() > 1 else "Scaled"
name += " data - learning rate: {} - ".format(alpha)
if batchSize==n: strDescType = "Gradient"
elif batchSize==1: strDescType = "Stochastic"
else: strDescType = "Mini-batch ({})".format(batchSize)
name += strDescType + " descent - Stop: "
if stopType == STOP_ITER: strStop = "{} iterations".format(thresh)
elif stopType == STOP_COST: strStop = "costs change < {}".format(thresh)
else: strStop = "gradient norm < {}".format(thresh)
name += strStop
print ("***{}\nTheta: {} - Iter: {} - Last cost: {:03.2f} - Duration: {:03.2f}s".format(
name, theta, iter, costs[-1], dur))
fig, ax = plt.subplots(figsize=(12,4))
ax.plot(np.arange(len(costs)), costs, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title(name.upper() + ' - Error vs. Iteration')
    plt.show()
return theta
Test results
if __name__ == '__main__':
    # plot(readFile())
    # batch gradient descent: every update uses all n samples
    n = 100  # total number of samples; descent() reads this module-level value
    pdData = readFile()
    pdData.insert(0, 'Ones', 1)
    orig_data = pdData.values  # as_matrix() was removed from pandas
    theta = np.zeros([1, 3])
    runExpe(orig_data, theta, n, STOP_ITER, thresh=5000, alpha=0.000001)
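For comparison, the same harness supports the other stopping strategies and batch sizes. Extending the `__main__` block (learning rates and thresholds here are illustrative values, not tuned results from the original run):
```python
    # Illustrative follow-up experiments inside the same __main__ block:
    runExpe(orig_data, theta, n, STOP_COST, thresh=0.000001, alpha=0.001)  # stop when the cost barely changes
    runExpe(orig_data, theta, n, STOP_GRAD, thresh=0.05, alpha=0.001)      # stop when the gradient norm is small
    runExpe(orig_data, theta, 1, STOP_ITER, thresh=5000, alpha=0.001)      # stochastic descent (batchSize = 1)
    runExpe(orig_data, theta, 16, STOP_ITER, thresh=15000, alpha=0.001)    # mini-batch of 16
```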
Accuracy
# apply the 0.5 decision threshold to the predicted probabilities
def predict(X, theta):
return [1 if x >= 0.5 else 0 for x in model(X, theta)]
def printAccuracy(scaled_data, theta):
scaled_X = scaled_data[:, :3]
y = scaled_data[:, 3]
predictions = predict(scaled_X, theta)
    correct = [1 if a == b else 0 for (a, b) in zip(predictions, y)]
    accuracy = sum(correct) / len(correct) * 100  # note: the original used '%', which is modulo, not a percentage
    print('accuracy = {0}%'.format(accuracy))
Ways to improve accuracy:
Lower the learning rate, switch to mini-batch descent, and standardize the data, as in the sketch below.
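Standardizing, in particular, lets gradient descent take much larger effective steps. A sketch (hyperparameters illustrative; it assumes `orig_data` from the `__main__` block above, and `preprocessing.scale` zero-centers each column and scales it to unit variance):
```python
# Sketch: standardize the two exam-score columns, then retrain with mini-batches.
from sklearn import preprocessing as pp

scaled_data = orig_data.copy()
scaled_data[:, 1:3] = pp.scale(orig_data[:, 1:3])  # columns 1-2 are the exam scores

theta = runExpe(scaled_data, np.zeros([1, 3]), 16, STOP_GRAD, thresh=0.002, alpha=0.001)
printAccuracy(scaled_data, theta)
```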
3. Credit Card Fraud Detection
Data: creditcard.csv (extraction code: dydm)
data.head()
Class distribution
The plot shows the classes are highly imbalanced: normal transactions vastly outnumber fraudulent ones. A simple first remedy is undersampling: discard samples from the majority class until the positive and negative classes are the same size.
Result of undersampling:
Splitting into training and test sets
Full dataset: 284,807 transactions in total; 199,364 for training and 85,443 for testing.
Undersampled dataset: 984 transactions in total; 688 for training and 296 for testing.
The code below also patches a few places where newer library versions behave differently from the older ones the original tutorial used.
#!/usr/bin/env python
# encoding: utf-8
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,recall_score,classification_report
import warnings
warnings.filterwarnings("ignore")
def preprocess():
data = pd.read_csv("creditcard.csv")
# print(data.head())
    count_classes = data['Class'].value_counts(sort=True).sort_index()  # pd.value_counts() was removed in pandas 2.0
count_classes.plot(kind='bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
# plt.show()
    # standardize Amount and drop the columns the model will not use
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data = data.drop(['Time','Amount'],axis=1)
    # undersampling: keep all fraud rows, sample an equal number of normal rows
X = data.iloc[:, data.columns != 'Class']
y = data.iloc[:, data.columns == 'Class']
# Number of data points in the minority class
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
# Picking the indices of the normal classes
normal_indices = data[data.Class == 0].index
# Out of the indices we picked, randomly select "x" number (number_records_fraud)
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)
# Appending the 2 indices
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
# Under sample dataset
under_sample_data = data.iloc[under_sample_indices, :]
X_undersample = under_sample_data.iloc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.iloc[:, under_sample_data.columns == 'Class']
# Showing ratio
print("Percentage of normal transactions: ",
len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print("Percentage of fraud transactions: ",
len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
# Whole dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 0)
print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))
# Undersampled dataset
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample
,y_undersample
,test_size = 0.3
,random_state = 0)
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample)+len(X_test_undersample))
return X_train, X_test, y_train, y_test, X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample
def printing_Kfold_scores(x_train_data, y_train_data):
fold = KFold(5, shuffle=False)
# Different C parameters
c_param_range = [0.01, 0.1, 1, 10, 100]
    results_table = pd.DataFrame(index=range(len(c_param_range)), columns=['C_parameter', 'Mean recall score'])  # one row per candidate C; the original range(len(...), 2) produced an empty index
results_table['C_parameter'] = c_param_range
# the k-fold will give 2 lists: train_indices = indices[0], test_indices = indices[1]
j = 0
for c_param in c_param_range:
print('-------------------------------------------')
print('C parameter: ', c_param)
print('-------------------------------------------')
print('')
recall_accs = []
for iteration, indices in enumerate(fold.split(x_train_data)):
# Call the logistic regression model with a certain C parameter
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')  # liblinear is required for the L1 penalty in current sklearn
# Use the training data to fit the model. In this case, we use the portion of the fold to train the model
# with indices[0]. We then predict on the portion assigned as the 'test cross validation' with indices[1]
lr.fit(x_train_data.iloc[indices[0], :], y_train_data.iloc[indices[0], :].values.ravel())
# Predict values using the test indices in the training data
y_pred_undersample = lr.predict(x_train_data.iloc[indices[1], :].values)
# Calculate the recall score and append it to a list for recall scores representing the current c_parameter
recall_acc = recall_score(y_train_data.iloc[indices[1], :].values, y_pred_undersample)
recall_accs.append(recall_acc)
print('Iteration ', iteration, ': recall score = ', recall_acc)
# The mean value of those recall scores is the metric we want to save and get hold of.
results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)
j += 1
print('')
print('Mean recall score ', np.mean(recall_accs))
print('')
    best_c = results_table.loc[results_table['Mean recall score'].astype('float64').idxmax()]['C_parameter']  # .astype('float64') is required: the column holds objects and idxmax fails on object dtype
# Finally, we can check which C parameter is the best amongst the chosen.
print('*********************************************************************************')
print('Best model to choose from cross validation is with C parameter = ', best_c)
print('*********************************************************************************')
return best_c
if __name__ == '__main__':
X_train, X_test, y_train, y_test, X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = preprocess()
best_c = printing_Kfold_scores(X_train_undersample, y_train_undersample)
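With `best_c` in hand, the confusion-matrix helper defined below can be called from the same `__main__` block, for example:
```python
    # continue the __main__ block: evaluate the best C on the undersampled test set
    test_plot_confusion_matrix(best_c,
                               X_train_undersample, y_train_undersample,
                               X_test_undersample, y_test_undersample)
```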
Displaying the confusion matrix
import itertools
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
"""
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=0)
plt.yticks(tick_marks, classes)
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, cm[i, j],
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
def test_plot_confusion_matrix(best_c, X_train_undersample, y_train_undersample,
                               X_test_undersample, y_test_undersample):  # take the test split as parameters instead of relying on globals
    lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
    lr.fit(X_train_undersample, y_train_undersample.values.ravel())
    y_pred_undersample = lr.predict(X_test_undersample.values)
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test_undersample,y_pred_undersample)
np.set_printoptions(precision=2)
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
, classes=class_names
, title='Confusion matrix')
plt.show()
The recall here is 0.95. For this problem recall is more informative than accuracy, because what we care about is the fraction of all fraud cases the model actually catches.
We can also vary the decision threshold to see how it affects the model, as sketched below; the L1 regularization penalty used in the code above is another knob that can be tuned.
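A sketch of the threshold experiment (illustrative; it uses `predict_proba` instead of `predict` so we can apply our own cutoff, and reuses the undersampled splits returned by `preprocess()`):
```python
# Sketch: sweep the decision threshold and watch recall fall as the cutoff rises.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
probs = lr.predict_proba(X_test_undersample.values)[:, 1]  # P(fraud) per transaction

for t in [0.1, 0.3, 0.5, 0.7, 0.9]:
    preds = (probs > t).astype(int)
    print('threshold {:.1f}: recall = {:.3f}'.format(
        t, recall_score(y_test_undersample.values.ravel(), preds)))
```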