1.导入相关库
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pylab import mpl
2.数据预处理
共284807条数据
3.绘制图形,查看正负样本个数
# Configure matplotlib so Chinese labels and the minus sign render correctly.
mpl.rcParams['font.sans-serif'] = ['Microsoft YaHei']
mpl.rcParams['axes.unicode_minus'] = False

# Count samples per class (0 = normal, 1 = fraud).
# pd.value_counts(...) was deprecated and removed in pandas 2.0;
# Series.value_counts() is the supported equivalent.
label_count = data['Class'].value_counts()

label_count.plot(kind='bar')
plt.title("正负例样本数")
plt.xlabel("类别")
plt.ylabel("频数")
plt.show()
4.采用Z标准化对Amount进行数据标准化
# Z-score the Amount column so its magnitude matches the PCA features V1..V28.
from sklearn.preprocessing import StandardScaler

data['Amount'] = StandardScaler().fit_transform(data[['Amount']])
data.head()
5.删除无用列
# The Time column carries no useful signal for this model, so drop it.
data = data.drop(columns=['Time'])
6.逻辑回归
6.1下采样解决样本不均衡
6.1.1分离正负样本
# Split the frame by class label: 0 = normal (majority), 1 = fraud (minority).
positive_eg = data.loc[data['Class'] == 0]
negative_eg = data.loc[data['Class'] == 1]
6.1.2从0样本随机选取492条数据并进行拼接
# Fix the NumPy RNG so the undersampling below is reproducible, then draw
# as many majority-class rows as there are fraud rows and stack the two.
np.random.seed(seed=2)
minority_size = len(negative_eg)
positive_eg = positive_eg.sample(minority_size)
data_c = pd.concat([positive_eg, negative_eg])
6.2下采样数据切分
# Separate features/label on the undersampled data, then 70/30 train/test.
from sklearn.model_selection import train_test_split

X = data_c.drop('Class', axis=1)
y = data_c['Class']
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
6.3对原始数据切分
# Hold out 30% of the FULL (imbalanced) data as a realistic test set.
X_whole = data.drop('Class', axis=1)
y_whole = data['Class']
x_train_w, x_test_w, y_train_w, y_test_w = train_test_split(
    X_whole, y_whole, test_size=0.3, random_state=0)
6.4执行交叉验证选择最优惩罚因子
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Candidate inverse-regularization strengths; 10-fold CV picks the one
# with the best mean recall on the undersampled training set.
c_param_range = [0.01, 0.1, 1, 10, 100]
scores = []
for round_no, c in enumerate(c_param_range, start=1):
    start_time = time.time()
    lr = LogisticRegression(C=c, penalty='l2', solver='lbfgs')
    score = cross_val_score(lr, x_train, y_train, cv=10, scoring='recall')
    score_mean = sum(score) / len(score)
    scores.append(score_mean)
    end_time = time.time()
    print("第{}次...".format(round_no))
    print("time spend:{:.2f}".format(end_time - start_time))
    print("recall值为:{}".format(score_mean))

best_c = c_param_range[np.argmax(scores)]
print()
print("最优惩罚因子为: {}".format(best_c))
6.5采用最优惩罚因子建立最优模型进行训练
# Refit logistic regression on the full undersampled training set,
# using the CV-selected penalty strength.
lr = LogisticRegression(penalty='l2', C=best_c)
lr.fit(x_train, y_train)
6.6预测结果
# Predictions on the (undersampled) training set, used below to gauge fit.
train_predicted=lr.predict(x_train)
6.7绘制混淆矩阵
# Precision / recall / F1 per class on the training predictions.
from sklearn import metrics

train_report = metrics.classification_report(y_train, train_predicted)
print(train_report)
相当不错的召回率
6.7.1可视化混淆矩阵
def cm_plot(y, yp):
    """Plot the confusion matrix of true labels *y* vs predictions *yp*.

    Returns the matplotlib.pyplot module so callers can chain ``.show()``.
    """
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt

    cm = confusion_matrix(y, yp)
    plt.matshow(cm, cmap=plt.cm.Blues)
    plt.colorbar()
    # Annotate each cell with its count. Loop variables are i/j rather than
    # x/y: the original inner loop shadowed (and clobbered) the parameter y.
    for i in range(len(cm)):
        for j in range(len(cm)):
            plt.annotate(cm[i, j], xy=(j, i),
                         horizontalalignment='center',
                         verticalalignment='center')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return plt


cm_plot(y_train, train_predicted).show()
6.8使用大数据测试集进行测试
# Score the undersampled-trained model on the untouched, imbalanced
# 30% hold-out of the full data.
test_predicted = lr.predict(x_test_w)
print(metrics.classification_report(y_test_w, test_predicted))
cm_plot(y_test_w, test_predicted).show()
测试效果较好
SMOTE算法
# ---- SMOTE oversampling pipeline (standalone script) ----
import pandas as pd
import matplotlib.pyplot as plt
from pylab import mpl
import numpy as np
import time

# BUG FIX: the original path r".creditcard.csv" was missing the path
# separator and could never resolve; load the file from the working dir.
data = pd.read_csv(r"./creditcard.csv",
                   encoding='utf8', engine='python')
data.head()

# Z-score standardize Amount so it matches the scale of V1..V28.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data['Amount'] = scaler.fit_transform(data[['Amount']])
data.head()

# Time carries no useful signal for this model; drop it.
data = data.drop(['Time'], axis=1)

# 70/30 split of the full, imbalanced data.
from sklearn.model_selection import train_test_split
X_whole = data.drop('Class', axis=1)
y_whole = data.Class
x_train_w, x_test_w, y_train_w, y_test_w = \
    train_test_split(X_whole, y_whole, test_size=0.3, random_state=0)

# Oversample ONLY the training split so the test set stays untouched.
from imblearn.over_sampling import SMOTE
oversampler = SMOTE(random_state=0)
# BUG FIX: fit_sample() was removed from imbalanced-learn;
# fit_resample() is the supported name.
os_x_train, os_y_train = oversampler.fit_resample(x_train_w, y_train_w)
os_x_train = pd.DataFrame(os_x_train)
os_y_train = pd.Series(os_y_train)

# 10-fold CV over candidate C values, scored by ROC AUC.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
scores = []
c_param_range = [0.01, 0.1, 1, 10, 100]
z = 1
for i in c_param_range:
    start_time = time.time()
    lr = LogisticRegression(C=i, penalty='l2', solver='lbfgs')
    score = cross_val_score(lr, os_x_train, os_y_train, cv=10,
                            scoring='roc_auc')
    score_mean = sum(score) / len(score)
    scores.append(score_mean)
    end_time = time.time()
    print("第{}次...".format(z))
    print("time spend:{:.2f}".format(end_time - start_time))
    # BUG FIX: the metric here is ROC AUC, not recall — the original
    # message mislabeled the printed score.
    print("roc_auc值为:{}".format(score_mean))
    z += 1
best_c = c_param_range[np.argmax(scores)]

# Train the final model with the best C and report training-set metrics.
lr = LogisticRegression(C=best_c, penalty='l2')
lr.fit(os_x_train, os_y_train)
train_predicted = lr.predict(os_x_train)
from sklearn import metrics
print(metrics.classification_report(os_y_train, train_predicted))


def cm_plot(y, yp):
    """Plot the confusion matrix of true labels *y* vs predictions *yp*.

    Returns the matplotlib.pyplot module so callers can chain ``.show()``.
    """
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt

    cm = confusion_matrix(y, yp)
    plt.matshow(cm, cmap=plt.cm.Blues)
    plt.colorbar()
    # i/j instead of x/y: the original inner loop shadowed the parameter y.
    for i in range(len(cm)):
        for j in range(len(cm)):
            plt.annotate(cm[i, j], xy=(j, i),
                         horizontalalignment='center',
                         verticalalignment='center')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return plt
# Visualize training-set confusion, then score the SMOTE-trained model
# on the untouched, imbalanced hold-out test set.
cm_plot(os_y_train, train_predicted).show()

test_predicted = lr.predict(x_test_w)
print(metrics.classification_report(y_test_w, test_predicted))
cm_plot(y_test_w, test_predicted).show()
训练集和大数据的测试集预测效果均比较好
过采样方法的数据训练更多,相比较来说更好一些。