常见的机器学习模型有:逻辑回归(Logistic Regression,LR)、决策树(Decision Tree,DT)、随机梯度下降(Stochastic Gradient Descent,SGD)、支持向量机(Support Vector Machines,SVM)、随机森林(Random Forest,RF)、梯度提升决策树(Gradient Boosting Decision Tree,GBDT)、极限梯度提升(XGBoost)和LightGBM(Light Gradient Boosting Machine,LGBM)。
本文介绍了以上八种机器学习模型的便捷使用方法,实现了在多个数据集上循环调用多种模型并统一计算评价指标。
1.导入所使用的包
import warnings

import numpy as np
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier

warnings.filterwarnings("ignore")
2.定义模型(参数自行设置)
# Instantiate the eight classifiers once, all with a fixed random_state
# so repeated runs are reproducible. Tune hyperparameters as needed.
LR = LogisticRegression(random_state=100, tol=1e-6)      # logistic regression
DT = DecisionTreeClassifier(random_state=100)            # decision tree
SGD = SGDClassifier(random_state=100)                    # stochastic gradient descent
SVM = SVC(probability=True, random_state=100, tol=1e-6)  # SVM; probability=True enables predict_proba
RF = RandomForestClassifier(n_estimators=100, random_state=100)  # random forest
GBDT = GradientBoostingClassifier(random_state=100)      # gradient boosting decision tree
XGB = XGBClassifier(random_state=100)                    # XGBoost
LGBM = LGBMClassifier(random_state=100)                  # LightGBM
3.定义评价指标(AUC、SN、SP、ACC、MCC)
def muti_score(model, data_x, data_y, data_X, data_Y):
    """Fit *model* on the training set and report AUC/SN/SP/ACC/MCC on the test set.

    Parameters
    ----------
    model : classifier exposing ``fit`` and ``predict_proba``
    data_x, data_y : training features and binary (0/1) labels
    data_X, data_Y : test features and binary (0/1) labels

    Returns
    -------
    tuple
        ``(auc, sn, sp, acc, mcc)``. The original version only printed the
        metrics; returning them as well is backward-compatible.
    """
    model.fit(data_x, data_y)
    # Probability of the positive class (column 1 of predict_proba).
    res = model.predict_proba(data_X)[:, 1]
    auc = roc_auc_score(data_Y, res)
    print("AUC:", auc)

    # Threshold at 0.5; ties (exactly 0.5) count as negative, matching the
    # original pred>0.5 / pred<0.5 / pred==0.5 three-step logic.
    predict_label = (np.squeeze(res) > 0.5).astype(np.float64)

    pos_label = 1
    true_label = np.asarray(data_Y)  # asarray so plain lists also work
    pos_num = np.sum(true_label == pos_label)
    print('pos_num=', pos_num)
    neg_num = true_label.shape[0] - pos_num
    print('neg_num=', neg_num)

    tp = np.sum((true_label == pos_label) & (predict_label == pos_label))
    print('tp=', tp)
    # Correct predictions minus true positives leaves the true negatives.
    tn = np.sum(true_label == predict_label) - tp
    print('tn=', tn)
    sn = tp / pos_num            # sensitivity (recall on positives)
    sp = tn / neg_num            # specificity (recall on negatives)
    acc = (tp + tn) / (pos_num + neg_num)
    fn = pos_num - tp
    fp = neg_num - tn
    print('fn=', fn)
    print('fp=', fp)

    # Cast to float64 before the MCC arithmetic to avoid integer overflow.
    tp = np.float64(tp)
    tn = np.float64(tn)
    fp = np.float64(fp)
    fn = np.float64(fn)
    denom = np.sqrt((tp + fn) * (tp + fp) * (tn + fp) * (tn + fn))
    # Guard the degenerate case (a confusion-matrix margin of 0): the
    # original divided by zero and printed nan; 0.0 is the usual convention.
    mcc = (tp * tn - fp * fn) / denom if denom != 0 else 0.0
    print(sn, sp, acc, mcc)
    return auc, sn, sp, acc, mcc
4.循环调用(可自行选择所用到的数据和模型)
# Names of the dataset variables (defined elsewhere in the file) to evaluate.
M_train_name = ["train_x_1", "train_x_2", "train_x_3", "train_x_4", "train_x_5", "train_x_6", "train_x_7"]
M_test_name = ["test_x_1", "test_x_2", "test_x_3", "test_x_4", "test_x_5", "test_x_6", "test_x_7"]
model_name = ["LR", "DT", "SGD", "SVM", "RF", "GBDT", "XGB", "LGBM"]  # models defined above

# BUG FIX: the loop originally read `train_name[i]` / `test_name[i]`, but the
# lists are called M_train_name / M_test_name, so it raised NameError.
for train_n, test_n in zip(M_train_name, M_test_name):
    # NOTE(review): eval() on these fixed literal names is safe here, but a
    # dict mapping name -> object would be the cleaner, eval-free design.
    train_X = eval(train_n)
    test_X = eval(test_n)
    print(train_n, test_n)
    for name in model_name:
        model = eval(name)
        print(name)
        muti_score(model, train_X, train_y, test_X, test_y)
        print("-" * 50)   # separates the results of different models
    print("*" * 100)      # separates the results of different datasets