Imports

```python
import pandas as pd
import warnings
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb
```
Read the dataset

```python
# on_bad_lines='skip' replaces error_bad_lines=False, which was deprecated in pandas 1.3
data_all = pd.read_csv('dataset.csv', sep=',', on_bad_lines='skip')
```
Prepare the dataset

```python
df_y = data_all['status']
df_X = data_all.drop(columns=['status'])
df_X = scale(df_X, axis=0)  # standardize each feature to zero mean and unit variance
```
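One caveat worth noting: scaling the full matrix before cross-validation lets each fold's scaler see statistics from its own test portion. A minimal leak-free sketch, wrapping a scaler and one of the models below in a sklearn Pipeline so the scaler is refit inside every training fold:

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# The scaler is fit only on each fold's training split, then applied to its test split
pipe = make_pipeline(StandardScaler(),
                     LogisticRegression(random_state=2018, tol=1e-6))
scores = cross_val_score(pipe, data_all.drop(columns=['status']),
                         data_all['status'], scoring='roc_auc', cv=5)
print(scores.mean())
```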
Build the models

```python
lr = LogisticRegression(random_state=2018, tol=1e-6)                   # logistic regression
tree = DecisionTreeClassifier(random_state=2018)                       # decision tree
svm = SVC(probability=True, random_state=2018, tol=1e-6)               # SVM
forest = RandomForestClassifier(n_estimators=100, random_state=2018)   # random forest
Gbdt = GradientBoostingClassifier(random_state=2018)                   # GBDT
Xgbc = XGBClassifier(random_state=2018)                                # XGBoost
gbm = lgb.LGBMClassifier(random_state=2018)                            # LightGBM
```
Define the scoring function, which evaluates each model with 5-fold cross-validation

```python
def muti_score(model):
    warnings.filterwarnings('ignore')
    accuracy = cross_val_score(model, df_X, df_y, scoring='accuracy', cv=5)
    precision = cross_val_score(model, df_X, df_y, scoring='precision', cv=5)
    recall = cross_val_score(model, df_X, df_y, scoring='recall', cv=5)
    f1_score = cross_val_score(model, df_X, df_y, scoring='f1', cv=5)
    auc = cross_val_score(model, df_X, df_y, scoring='roc_auc', cv=5)
    print("Accuracy:", accuracy.mean())
    print("Precision:", precision.mean())
    print("Recall:", recall.mean())
    print("F1_score:", f1_score.mean())
    print("AUC:", auc.mean())
```
Here `mean()` denotes the average of a metric's scores over the five folds.
| Model | Accuracy | Precision | Recall | F1_score | AUC |
| --- | --- | --- | --- | --- | --- |
| Logistic regression | 0.7890191148682617 | 0.6542724662896913 | 0.3377975457965613 | 0.44525012166067884 | 0.7840451024530857 |
| Decision tree | 0.6962524533638791 | 0.39920670173446693 | 0.4157413593052284 | 0.40705496051057793 | 0.6029856787858856 |
| SVM | 0.787758390223099 | 0.7351623295760905 | 0.24060335431243626 | 0.36179547264664874 | 0.7640376541388867 |
| Random forest | 0.7921756804332226 | 0.7135700690071172 | 0.2867128441334693 | 0.40835414886475174 | 0.7752164698827589 |
| GBDT | 0.7938590063951863 | 0.6604108594441386 | 0.36633732991104395 | 0.4708811551285791 | 0.7888240065764295 |
| XGBoost | 0.7982740847293591 | 0.6829783239831001 | 0.3663162336064133 | 0.47673826685376613 | 0.7914190511145234 |
| LightGBM | 0.79049080811139 | 0.6421783397519263 | 0.3730354066312717 | 0.47150438344663004 | 0.7776116341798183 |
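The table reports only fold averages, and several models sit within a few thousandths of each other, so the fold-to-fold spread is worth checking too. A small sketch, reusing the arrays that `cross_val_score` already returns:

```python
auc = cross_val_score(Xgbc, df_X, df_y, scoring='roc_auc', cv=5)
# Report the spread as well as the average; close means can hide unstable folds
print("AUC: %.4f +/- %.4f" % (auc.mean(), auc.std()))
```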
1. Analysis
The evaluation approach is 5-fold cross-validation, judging each model by its mean score across the folds. The table shows that logistic regression, random forest, GBDT, XGBoost, and LightGBM score closely on every metric and all score fairly well, so each of them fits the data reasonably and could be chosen. Overall, XGBoost scores somewhat higher than the rest, making it the strongest candidate.
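One more observation from the table: recall stays below 0.42 for every model while accuracy sits near 0.79, a pattern that usually points to class imbalance in `status`. A hedged sketch of the standard first countermeasure, class reweighting (an assumption, not part of the original experiment):

```python
# class_weight='balanced' reweights samples inversely to class frequency,
# typically trading some precision for recall on the minority class
lr_bal = LogisticRegression(random_state=2018, tol=1e-6, class_weight='balanced')
print(cross_val_score(lr_bal, df_X, df_y, scoring='recall', cv=5).mean())
```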
2. Open questions
- I have not yet learned how to print a results table directly from code (see the sketch after this list).
- I am still not familiar with each model's hyperparameters.
- I still lack experience in splitting datasets and constructing high-quality data.
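For the first point, one common approach (an assumption, not from the original write-up) is to collect the mean scores into a pandas DataFrame and render it with `DataFrame.to_markdown()`, which requires the `tabulate` package:

```python
import pandas as pd
from sklearn.model_selection import cross_validate

metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
rows = []
for name, model in [('lr', lr), ('forest', forest), ('Xgbc', Xgbc)]:
    scores = cross_validate(model, df_X, df_y, cv=5, scoring=metrics)
    rows.append({'model': name,
                 **{m: scores['test_' + m].mean() for m in metrics}})
# to_markdown() renders the DataFrame as a markdown table (needs `pip install tabulate`)
print(pd.DataFrame(rows).to_markdown(index=False))
```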
3. References
- Explanation of the `scoring` parameter values for `cross_val_score`
- The Python machine-learning library sklearn: cross-validation (K-fold, leave-one-out, leave-p-out, shuffle-split)
4. Complete code
```python
import pandas as pd
import warnings
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb
# Read the dataset
data_all = pd.read_csv('/home/infisa/wjht/project/DataWhale/data_all.csv', encoding='gbk')
# Split off the label column; cross_val_score below handles the 5-fold split
df_y=data_all['status']
df_X=data_all.drop(columns=['status'])
df_X = scale(df_X, axis=0)  # standardize each feature to zero mean and unit variance
# Build the models
lr = LogisticRegression(random_state=2018, tol=1e-6)                   # logistic regression
tree = DecisionTreeClassifier(random_state=2018)                       # decision tree
svm = SVC(probability=True, random_state=2018, tol=1e-6)               # SVM
forest = RandomForestClassifier(n_estimators=100, random_state=2018)   # random forest
Gbdt = GradientBoostingClassifier(random_state=2018)                   # GBDT
Xgbc = XGBClassifier(random_state=2018)                                # XGBoost
gbm = lgb.LGBMClassifier(random_state=2018)                            # LightGBM
# Score a model with 5-fold cross-validation on five metrics
def muti_score(model):
    warnings.filterwarnings('ignore')
    accuracy = cross_val_score(model, df_X, df_y, scoring='accuracy', cv=5)
    precision = cross_val_score(model, df_X, df_y, scoring='precision', cv=5)
    recall = cross_val_score(model, df_X, df_y, scoring='recall', cv=5)
    f1_score = cross_val_score(model, df_X, df_y, scoring='f1', cv=5)
    auc = cross_val_score(model, df_X, df_y, scoring='roc_auc', cv=5)
    print("Accuracy:", accuracy.mean())
    print("Precision:", precision.mean())
    print("Recall:", recall.mean())
    print("F1_score:", f1_score.mean())
    print("AUC:", auc.mean())
model_name = ["lr", "tree", "svm", "forest", "Gbdt", "Xgbc", "gbm"]
for name in model_name:
    model = eval(name)  # look up the model object by its variable name
    print(name)
    muti_score(model)
'''
lr
Accuracy: 0.7890191148682617
Precision: 0.6542724662896913
Recall: 0.3377975457965613
F1_score: 0.44525012166067884
AUC: 0.7840451024530857
tree
Accuracy: 0.6962524533638791
Precision: 0.39920670173446693
Recall: 0.4157413593052284
F1_score: 0.40705496051057793
AUC: 0.6029856787858856
svm
Accuracy: 0.787758390223099
Precision: 0.7351623295760905
Recall: 0.24060335431243626
F1_score: 0.36179547264664874
AUC: 0.7640376541388867
forest
Accuracy: 0.7921756804332226
Precision: 0.7135700690071172
Recall: 0.2867128441334693
F1_score: 0.40835414886475174
AUC: 0.7752164698827589
Gbdt
Accuracy: 0.7938590063951863
Precision: 0.6604108594441386
Recall: 0.36633732991104395
F1_score: 0.4708811551285791
AUC: 0.7888240065764295
Xgbc
Accuracy: 0.7982740847293591
Precision: 0.6829783239831001
Recall: 0.3663162336064133
F1_score: 0.47673826685376613
AUC: 0.7914190511145234
gbm
Accuracy: 0.79049080811139
Precision: 0.6421783397519263
Recall: 0.3730354066312717
F1_score: 0.47150438344663004
AUC: 0.7776116341798183
'''
```
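A side note on the driver loop: `eval(name)` works in a script but is fragile, since a typo in `model_name` only fails at runtime and `eval` will execute arbitrary code. A sketch of an explicit alternative using a plain dict, assuming the same model objects:

```python
# Map display names to the already-constructed model objects
models = {"lr": lr, "tree": tree, "svm": svm, "forest": forest,
          "Gbdt": Gbdt, "Xgbc": Xgbc, "gbm": gbm}
for name, model in models.items():
    print(name)
    muti_score(model)
```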