K折交叉验证
K折交叉验证(k-fold cross validation),将初始采样(样本集X,Y)分割成K份,一份被保留作为验证模型的数据(test set),其他K-1份用来训练(train set)。交叉验证重复K次,每份验证一次,平均K次的结果或者使用其它结合方式,最终得到一个单一估测。
Grid Search
Grid Search:一种调参手段;穷举搜索:在所有候选的参数选择中,通过循环遍历,尝试每一种可能性,表现最好的参数就是最终的结果。其原理就像是在数组里找最大值。(为什么叫网格搜索?以有两个参数的模型为例,参数a有3种可能,参数b有4种可能,把所有可能性列出来,可以表示成一个3*4的表格,其中每个cell就是一个网格,循环过程就像是在每个网格里遍历、搜索,所以叫grid search)
1. 逻辑回归
# 1. Logistic regression: exhaustive search over C with 5-fold cross-validation.
best_score = 0.0
best_parameters = {}
for C in [0.001, 0.01, 0.1, 1, 10, 100]:
    log_model = LogisticRegression(C=C, random_state=2018)
    scores = cross_val_score(log_model, X_train_stand, y_train, cv=5)  # 5-fold CV
    score = scores.mean()  # mean validation score over the folds
    if score > best_score:
        best_score = score
        best_parameters = {"C": C}

# Refit on the full training set with the best C.
# BUG FIX: the original refit dropped random_state=2018, so the final model was
# not configured identically to the one selected during the search.
log_model = LogisticRegression(random_state=2018, **best_parameters)
log_model.fit(X_train_stand, y_train)
test_score = log_model.score(X_test_stand, y_test)
print("Best score on validation set:{:.2f}".format(best_score))
print("Best parameters:{}".format(best_parameters))
print("Score on testing set:{:.2f}".format(test_score))
2. SVM
# 2. SVM: exhaustive grid search over gamma and C with 5-fold cross-validation.
best_score = 0.0
best_parameters = {}
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C, random_state=2018)
        scores = cross_val_score(svm, X_train_stand, y_train, cv=5)  # 5-fold CV
        score = scores.mean()  # mean validation score over the folds
        if score > best_score:
            best_score = score
            best_parameters = {"gamma": gamma, "C": C}

# Refit on the full training set with the selected parameters.
# BUG FIX: the original refit dropped random_state=2018, so the final model was
# not configured identically to the one selected during the search.
svm = SVC(random_state=2018, **best_parameters)
svm.fit(X_train_stand, y_train)
test_score = svm.score(X_test_stand, y_test)
print("Best score on validation set:{:.2f}".format(best_score))
print("Best parameters:{}".format(best_parameters))
print("Score on testing set:{:.2f}".format(test_score))
3. 随机森林
# 3. Random forest: tune n_estimators with GridSearchCV.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    max_features=0.6,
    oob_score=True,
    random_state=2018,
)
# Candidate grid: only the number of trees is searched here.
param_grid = [{'n_estimators': range(50, 300, 10)}]

# Search with 5-fold cross-validation, scored by ROC AUC, using all cores.
grid_search4 = GridSearchCV(rf_clf, param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
grid_search4.fit(X_train_stand, y_train)
print("best para:", grid_search4.best_params_)
print("Best score on:", grid_search4.best_score_)
4. XGBoost
# 4. XGBoost: grid search over gamma and C with 5-fold cross-validation.
# NOTE(review): "C" is not a native XGBoost hyper-parameter — XGBClassifier
# accepts unknown keyword arguments, so only gamma actually varies here.
# Kept to preserve the original search space; confirm intent with the author.
best_score = 0.0
best_parameters = {}
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        xgb_model = XGBClassifier(gamma=gamma, C=C, random_state=2018)
        # BUG FIX: the original scored `svm` (copy-paste from the SVM section)
        # instead of the XGBoost model being tuned.
        scores = cross_val_score(xgb_model, X_train_stand, y_train, cv=5)  # 5-fold CV
        score = scores.mean()  # mean validation score over the folds
        if score > best_score:
            best_score = score
            best_parameters = {"gamma": gamma, "C": C}

# Refit on the full training set with the selected parameters
# (random_state kept for consistency with the search).
xgb_model = XGBClassifier(random_state=2018, **best_parameters)
xgb_model.fit(X_train_stand, y_train)
test_score = xgb_model.score(X_test_stand, y_test)
print("Best score on validation set:{:.2f}".format(best_score))
print("Best parameters:{}".format(best_parameters))
print("Score on testing set:{:.2f}".format(test_score))
5. LightGBM
# 5. LightGBM: grid search over gamma and C with 5-fold cross-validation.
# NOTE(review): "gamma"/"C" are not native LightGBM hyper-parameters —
# LGBMClassifier accepts unknown keyword arguments, so they may have no
# effect. Kept to preserve the original search space; confirm with the author.
best_score = 0.0
best_parameters = {}
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        lgb_model = LGBMClassifier(gamma=gamma, C=C, random_state=2018)
        # BUG FIX: the original scored `svm` (copy-paste from the SVM section)
        # instead of the LightGBM model being tuned.
        scores = cross_val_score(lgb_model, X_train_stand, y_train, cv=5)  # 5-fold CV
        score = scores.mean()  # mean validation score over the folds
        if score > best_score:
            best_score = score
            best_parameters = {"gamma": gamma, "C": C}

# Refit on the full training set with the selected parameters
# (random_state kept for consistency with the search).
lgb_model = LGBMClassifier(random_state=2018, **best_parameters)
lgb_model.fit(X_train_stand, y_train)
test_score = lgb_model.score(X_test_stand, y_test)
print("Best score on validation set:{:.2f}".format(best_score))
print("Best parameters:{}".format(best_parameters))
print("Score on testing set:{:.2f}".format(test_score))
6. 模型融合
import time
import pandas as pd
import numpy as np
import sys
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from mlxtend.classifier import StackingClassifier
# 6. Model stacking: load TF-IDF features, split, and assemble base + meta learners.
features_path = './feature/feature_file/data_w_tfidf.pkl'  # path to tf-idf feature pickle
with open(features_path, 'rb') as fp:
    x_train, y_train, x_test = pickle.load(fp)

# Hold out 30% of the labelled data for evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(
    x_train, y_train, test_size=0.3, random_state=2019)

# Base learners with fixed seeds for reproducibility.
lr = LogisticRegression(random_state=2019, C=0.1)
lgb = LGBMClassifier(boosting_type='GBDT', random_state=2019, silent=0)
gbdt = GradientBoostingClassifier(random_state=2019, max_depth=3, n_estimators=50)
xgbc = XGBClassifier(random_state=2019, max_depth=3, eta=0.1, subsample=0.6)
rf = RandomForestClassifier(n_estimators=500, oob_score=True, random_state=2019)
svm = SVC(random_state=2019, tol=0.01)

# Stack the five base learners under a LightGBM meta-classifier.
sclf = StackingClassifier(classifiers=[lr, gbdt, xgbc, rf, svm], meta_classifier=lgb)
def get_scores(model, X_train, X_test, Y_train, Y_test):
    """Fit `model`, plot train/test ROC curves, and print classification metrics.

    Parameters
    ----------
    model : sklearn-compatible estimator (fit/predict, and either
        decision_function or predict_proba).
    X_train, X_test : feature matrices.
    Y_train, Y_test : labels — assumed binary, since roc_auc_score/roc_curve
        are called on a single score column (TODO confirm with caller).

    Side effects: fits `model` in place, shows a matplotlib figure, and
    prints accuracy/precision/recall/F1/AUC for train and test sets.
    """
    model.fit(X_train, Y_train)
    y_train_predict = model.predict(X_train)
    y_test_predict = model.predict(X_test)
    # Prefer decision_function scores; fall back to positive-class probability.
    if hasattr(model, "decision_function"):
        y_train_proba = model.decision_function(X_train)
        y_test_proba = model.decision_function(X_test)
    else:
        y_train_proba = (model.predict_proba(X_train))[:, 1]
        y_test_proba = (model.predict_proba(X_test))[:, 1]
    # accuracy
    # BUG FIX: accuracy_score has no `average` parameter; passing it raised
    # TypeError in the original.
    train_accuracy = metrics.accuracy_score(Y_train, y_train_predict)
    test_accuracy = metrics.accuracy_score(Y_test, y_test_predict)
    # precision
    train_precision = metrics.precision_score(Y_train, y_train_predict, average="micro")
    test_precision = metrics.precision_score(Y_test, y_test_predict, average="micro")
    # recall
    train_recall = metrics.recall_score(Y_train, y_train_predict, average="micro")
    test_recall = metrics.recall_score(Y_test, y_test_predict, average="micro")
    # f1-score
    train_f1 = metrics.f1_score(Y_train, y_train_predict, average="micro")
    test_f1 = metrics.f1_score(Y_test, y_test_predict, average="micro")
    # auc
    train_auc = metrics.roc_auc_score(Y_train, y_train_proba)
    test_auc = metrics.roc_auc_score(Y_test, y_test_proba)
    # ROC curves for train and test, plus the chance diagonal.
    train_fprs, train_tprs, train_thresholds = metrics.roc_curve(Y_train, y_train_proba)
    test_fprs, test_tprs, test_thresholds = metrics.roc_curve(Y_test, y_test_proba)
    plt.plot(train_fprs, train_tprs)
    plt.plot(test_fprs, test_tprs)
    plt.plot([0, 1], [0, 1], "--")
    plt.title("ROC curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend(labels=["Train AUC:"+str(round(train_auc, 5)), "Test AUC:"+str(round(test_auc, 5))], loc="lower right")
    plt.show()
    # Print all scores.
    print("训练集准确率:", train_accuracy)
    print("测试集准确率:", test_accuracy)
    print("==================================")
    print("训练集精准率:", train_precision)
    print("测试集精准率:", test_precision)
    print("==================================")
    print("训练集召回率:", train_recall)
    print("测试集召回率:", test_recall)
    print("==================================")
    print("训练集F1-score:", train_f1)
    print("测试集F1-score:", test_f1)
    print("==================================")
    print("训练集AUC:", train_auc)
    print("测试集AUC:", test_auc)
参考
https://blog.csdn.net/weixin_40363627/article/details/85015683