Task6 模型融合
模型融合方式任意,并结合Task5给出你的最优结果。
例如Stacking融合,用你目前评分最高的模型作为基准模型,和其他模型进行stacking融合,得到最终模型及评分结果
# 导入库
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV,cross_val_predict
from scipy.stats import uniform
from sklearn import metrics
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,roc_curve
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn import model_selection
# Load the feature-selected dataset produced in task 2.
import pandas as pd

data = pd.read_csv('task2_proc.csv')
# Every column except the last is a feature; the last column is the label.
x, y = data.iloc[:, :-1], data.iloc[:, -1]
print('feature shape:{}, label shape:{}'.format(x.shape, y.shape))
feature shape:(4455, 50), label shape:(4455,)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 70/30 split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)

# Standardize features: fit the scaler on the training set ONLY, then apply
# the same fitted mean/std to the test set.  The original code called
# fit_transform on the test set, which re-estimates the statistics from test
# data -- a data-leakage bug that scales train and test inconsistently.
scaler = StandardScaler()
x_train_standard = scaler.fit_transform(x_train)
x_test_standard = scaler.transform(x_test)
def get_scores(label, y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba):
    """Score one fitted model on both splits.

    Computes accuracy / precision / recall / F1 / AUC for the train and test
    splits, draws both ROC curves on one figure, prints every score, and
    returns (train_scores, test_scores), each ordered as
    [accuracy, precision, recall, f1, auc].
    """
    results = {}
    # Plot order (train curve first, then test) matches the legend order.
    for split, truth, pred, proba in (
        ('train', y_train, y_train_predict, y_train_proba),
        ('test', y_test, y_test_predict, y_test_proba),
    ):
        results[split] = [
            metrics.accuracy_score(truth, pred),
            metrics.precision_score(truth, pred),
            metrics.recall_score(truth, pred),
            metrics.f1_score(truth, pred),
            # AUC is computed from scores/probabilities, not hard labels.
            metrics.roc_auc_score(truth, proba),
        ]
        fprs, tprs, _ = metrics.roc_curve(truth, proba)
        plt.plot(fprs, tprs, label=label + ' ' + split + ' ROC', linewidth=2)
    plt.title("ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.show()
    # Print each metric for both splits (train line first, as before).
    for idx, metric_name in enumerate(["准确率", "精准率", "召回率", "F1-score", "AUC"]):
        print("训练集" + metric_name + ":", results['train'][idx])
        print("测试集" + metric_name + ":", results['test'][idx])
    return results['train'], results['test']
# Logistic regression: grid-search the penalty type and regularization
# strength C.  solver='liblinear' is pinned because the default solver in
# recent scikit-learn (lbfgs) does not support the 'l1' penalty -- without
# it the grid search raises on every l1 candidate.
param_lr = {'penalty': ['l1', 'l2'],
            'C': [0.0001, 0.001, 0.01, 0.1, 1.0]}
lr = GridSearchCV(LogisticRegression(solver='liblinear'), param_lr, cv=5, n_jobs=-1)
lr.fit(x_train_standard, y_train)
y_train_predict = lr.predict(x_train_standard)
y_test_predict = lr.predict(x_test_standard)
# Probability of the positive class, used for AUC/ROC.
y_train_proba = lr.predict_proba(x_train_standard)[:, 1]
y_test_proba = lr.predict_proba(x_test_standard)[:, 1]
Logistic_train, Logistic_test = get_scores('Logistic', y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
训练集准确率: 0.7985888389993585
测试集准确率: 0.7920718025430067
训练集精准率: 0.7161125319693095
测试集精准率: 0.6306818181818182
训练集召回率: 0.35131744040150564
测试集召回率: 0.3425925925925926
训练集F1-score: 0.47138047138047134
测试集F1-score: 0.444
训练集AUC: 0.805792077896593
测试集AUC: 0.7991755328872803
# SVM: start from an RBF one-vs-rest SVC, then grid-search C, kernel and gamma.
base_svc = svm.SVC(C=0.6, kernel='rbf', gamma=20, decision_function_shape='ovr')
# NOTE(review): gamma values of 18-22 are unusually large for standardized
# features -- confirm this search range was intentional.
param_svm = {
    'C': [0.3, 0.5, 0.6, 0.7],
    'kernel': ['rbf', 'linear'],
    'gamma': [18, 20, 22],
}
SVM = GridSearchCV(base_svc, param_svm, cv=5, n_jobs=-1)
SVM.fit(x_train_standard, y_train)
y_train_predict = SVM.predict(x_train_standard)
y_test_predict = SVM.predict(x_test_standard)
# SVC was built without probability=True, so the signed decision-function
# margin stands in for a probability; it is a valid ranking score for AUC.
y_train_proba = SVM.decision_function(x_train_standard)
y_test_proba = SVM.decision_function(x_test_standard)
SVM_train, SVM_test = get_scores('SVM', y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
训练集准确率: 0.7886465683130212
测试集准确率: 0.7920718025430067
训练集精准率: 0.776
测试集精准率: 0.7053571428571429
训练集召回率: 0.24341279799247176
测试集召回率: 0.24382716049382716
训练集F1-score: 0.3705826170009551
测试集F1-score: 0.3623853211009175
训练集AUC: 0.8048898362396254
测试集AUC: 0.8041357415329117
# Decision tree: tune only the maximum depth to control overfitting.
param_DT = {'max_depth': list(range(1, 10))}
tree = GridSearchCV(DecisionTreeClassifier(), param_DT, cv=5, n_jobs=-1)
tree.fit(x_train_standard, y_train)
y_train_predict = tree.predict(x_train_standard)
y_test_predict = tree.predict(x_test_standard)
# Positive-class probability for AUC/ROC.
y_train_proba = tree.predict_proba(x_train_standard)[:, 1]
y_test_proba = tree.predict_proba(x_test_standard)[:, 1]
DT_train, DT_test = get_scores('DT', y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
训练集准确率: 0.791853752405388
测试集准确率: 0.7778608825729244
训练集精准率: 0.6978609625668449
测试集精准率: 0.5944055944055944
训练集召回率: 0.32747804265997493
测试集召回率: 0.2623456790123457
训练集F1-score: 0.44577284372331344
测试集F1-score: 0.36402569593147743
训练集AUC: 0.7738046649515605
测试集AUC: 0.7459538347165857
# Random forest: fix the per-tree regularization (leaf/split sizes, depth,
# feature subsampling) and search only the number of trees, selecting the
# best candidate by cross-validated ROC-AUC.
base_rf = RandomForestClassifier(
    min_samples_split=100,
    min_samples_leaf=20,
    max_depth=8,
    max_features='sqrt',
    random_state=10,
)
param_rf = {'n_estimators': range(10, 71, 10)}
rf = GridSearchCV(estimator=base_rf, param_grid=param_rf, scoring='roc_auc', cv=5)
rf.fit(x_train_standard, y_train)
y_train_predict = rf.predict(x_train_standard)
y_test_predict = rf.predict(x_test_standard)
# Positive-class probability for AUC/ROC.
y_train_proba = rf.predict_proba(x_train_standard)[:, 1]
y_test_proba = rf.predict_proba(x_test_standard)[:, 1]
RM_train, RM_test = get_scores('RM', y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
训练集准确率: 0.8066067992302758
测试集准确率: 0.7958115183246073
训练集精准率: 0.8439716312056738
测试集精准率: 0.7575757575757576
训练集召回率: 0.2986198243412798
测试集召回率: 0.23148148148148148
训练集F1-score: 0.4411492122335496
测试集F1-score: 0.3546099290780142
训练集AUC: 0.8589286515514609
测试集AUC: 0.8022710930739887
# XGBoost: grid-search tree depth, learning rate and boosting rounds.
param_xgb = {
    'max_depth': [4, 5, 6, 7],
    'learning_rate': np.linspace(0.03, 0.3, 10),
    'n_estimators': [100, 200],
}
xgb = GridSearchCV(XGBClassifier(), param_xgb, cv=5, n_jobs=-1)
xgb.fit(x_train_standard, y_train)
y_train_predict = xgb.predict(x_train_standard)
y_test_predict = xgb.predict(x_test_standard)
# Positive-class probability for AUC/ROC.
y_train_proba = xgb.predict_proba(x_train_standard)[:, 1]
y_test_proba = xgb.predict_proba(x_test_standard)[:, 1]
XGBoost_train, XGBoost_test = get_scores('XGBoost', y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
训练集准确率: 0.8598460551635664
测试集准确率: 0.7958115183246073
训练集精准率: 0.8829787234042553
测试集精准率: 0.660377358490566
训练集召回率: 0.520702634880803
测试集召回率: 0.32407407407407407
训练集F1-score: 0.6550907655880033
测试集F1-score: 0.43478260869565216
训练集AUC: 0.9279309474294222
测试集AUC: 0.8091172778569949
# Assemble every model's train/test scores into one comparison table,
# indexed hierarchically by (model, split).
model_name = ['Logistic', 'SVM', 'DecisionTree', 'RandomForest', 'xgboost']
columns = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
ttype = ['train', 'test']
model_score_train = [Logistic_train, SVM_train, DT_train, RM_train, XGBoost_train]
model_score_test = [Logistic_test, SVM_test, DT_test, RM_test, XGBoost_test]
pd_list = [
    pd.DataFrame([tr, te], index=ttype, columns=columns)
    for tr, te in zip(model_score_train, model_score_test)
]
# Bare expression so the notebook renders the concatenated table.
pd.concat(pd_list, axis=0, keys=model_name)
accuracy | precision | recall | f1 | roc_auc | ||
---|---|---|---|---|---|---|
Logistic | train | 0.798589 | 0.716113 | 0.351317 | 0.471380 | 0.805792 |
test | 0.792072 | 0.630682 | 0.342593 | 0.444000 | 0.799176 | |
SVM | train | 0.788647 | 0.776000 | 0.243413 | 0.370583 | 0.804890 |
test | 0.792072 | 0.705357 | 0.243827 | 0.362385 | 0.804136 | |
DecisionTree | train | 0.791854 | 0.697861 | 0.327478 | 0.445773 | 0.773805 |
test | 0.777861 | 0.594406 | 0.262346 | 0.364026 | 0.745954 | |
RandomForest | train | 0.806607 | 0.843972 | 0.298620 | 0.441149 | 0.858929 |
test | 0.795812 | 0.757576 | 0.231481 | 0.354610 | 0.802271 | |
xgboost | train | 0.859846 | 0.882979 | 0.520703 | 0.655091 | 0.927931 |
test | 0.795812 | 0.660377 | 0.324074 | 0.434783 | 0.809117 |
Stacking模型融合
# Stacking fusion: each grid search's best_estimator_ becomes a base learner
# and the tuned logistic regression is the meta-learner.  Passing the
# GridSearchCV wrappers themselves (as the original did) re-runs the entire
# grid search inside every cross-validation fold, which is why the original
# run never finished.
from mlxtend.classifier import StackingClassifier
import warnings

warnings.filterwarnings("ignore")

base_learners = [SVM.best_estimator_, tree.best_estimator_,
                 rf.best_estimator_, xgb.best_estimator_]
sclf = StackingClassifier(classifiers=base_learners,
                          meta_classifier=lr.best_estimator_)

for clf, label in zip(base_learners + [sclf],
                      ['SVM', 'DecisionTree', 'RandomForest', 'xgboost', 'stackingClassifier']):
    scores = model_selection.cross_val_score(clf, x, y, cv=5, scoring='accuracy')
    # Bug fix: scores.mean is a bound method -- the original printed the
    # method object instead of calling it, so the %0.2f format would fail.
    print('Accuracy: %0.2f (+/- %0.2f) [%s]' % (scores.mean(), scores.std(), label))
训练了很久没有出结果:原因在于传给 cross_val_score 的是 GridSearchCV 对象本身,每一折交叉验证都会重新执行完整的网格搜索;应改用各模型调参后的 best_estimator_ 作为基学习器,即可大幅缩短训练时间。
# Plot 2-D decision regions for each model.  Fixes to the original cell:
#  * it referenced an undefined name `X` (NameError);
#  * plot_decision_regions requires numpy arrays and EXACTLY two feature
#    columns, so the models are refit on the first two standardized
#    features purely for visualization;
#  * the 2x2 grid could only hold four of the five models -- use 2x3;
#  * plt.show() was missing.
from mlxtend.plotting import plot_decision_regions
import matplotlib.gridspec as gridspec
import itertools

# First two standardized features only -- TODO confirm these two columns
# are a meaningful pair to visualize.
X2 = np.asarray(x_train_standard)[:, :2]
y2 = np.asarray(y_train)

gs = gridspec.GridSpec(2, 3)
fig = plt.figure(figsize=(10, 8))
for clf, lab, grd in zip(
        [SVM, tree, rf, xgb, sclf],
        ['SVM', 'DecisionTree', 'RandomForest', 'xgboost', 'stackingClassifier'],
        itertools.product([0, 1], [0, 1, 2])):
    clf.fit(X2, y2)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X2, y=y2, clf=clf)
    plt.title(lab)
plt.show()
参考链接:https://blog.csdn.net/github_35965351/article/details/60763606