task4
记录5个模型(逻辑回归、SVM、决策树、随机森林、XGBoost)关于accuracy、precision、recall、F1-score和AUC值的评分表格,并画出ROC曲线
# 导入库
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV,cross_val_predict
from scipy.stats import uniform
from sklearn import metrics
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,roc_curve
import matplotlib.pyplot as plt
# Load the feature-selected dataset produced in task 2
import pandas as pd
data = pd.read_csv('task2_proc.csv')
x = data.iloc[:,:-1]  # every column except the last one is a feature
y = data.iloc[:,-1]   # the last column is the binary label
print('feature shape:{}, label shape:{}'.format(x.shape,y.shape))
feature shape:(4455, 50), label shape:(4455,)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 70/30 train/test split with a fixed seed for reproducibility
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state=2018)

# Standardize features: fit the scaler on the TRAINING split only, then apply
# that same transformation to the test split. The original code called
# fit_transform on the test set as well, which re-estimates mean/std from the
# test data — that leaks test-set statistics and scales the two splits with
# different parameters, making test metrics unreliable.
scaler = StandardScaler()
x_train_standard = scaler.fit_transform(x_train)
x_test_standard = scaler.transform(x_test)
def get_scores(label, y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba):
    """Evaluate one classifier on the train and test splits.

    Prints accuracy / precision / recall / F1 / AUC for both splits,
    draws the train and test ROC curves on one figure, and returns the
    metric values.

    Args:
        label: model name used in the ROC-curve legend.
        y_train, y_test: ground-truth labels per split.
        y_train_predict, y_test_predict: hard class predictions per split.
        y_train_proba, y_test_proba: positive-class scores per split
            (probabilities or decision-function margins).

    Returns:
        (train, test): two lists, each ordered as
        [accuracy, precision, recall, f1, auc].
    """
    def _score_row(truth, pred, proba):
        # One row of the score table for a single split.
        return [
            metrics.accuracy_score(truth, pred),
            metrics.precision_score(truth, pred),
            metrics.recall_score(truth, pred),
            metrics.f1_score(truth, pred),
            metrics.roc_auc_score(truth, proba),
        ]

    train = _score_row(y_train, y_train_predict, y_train_proba)
    test = _score_row(y_test, y_test_predict, y_test_proba)

    # ROC curves for both splits on a single figure.
    train_fprs, train_tprs, _ = metrics.roc_curve(y_train, y_train_proba)
    test_fprs, test_tprs, _ = metrics.roc_curve(y_test, y_test_proba)
    plt.plot(train_fprs, train_tprs, label=label+' train ROC', linewidth=2)
    plt.plot(test_fprs, test_tprs, label=label+' test ROC', linewidth=2)
    plt.title("ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.show()

    # Report every metric for both splits (train/test pairs, metric by metric).
    print("训练集准确率:", train[0])
    print("测试集准确率:", test[0])
    print("训练集精准率:", train[1])
    print("测试集精准率:", test[1])
    print("训练集召回率:", train[2])
    print("测试集召回率:", test[2])
    print("训练集F1-score:", train[3])
    print("测试集F1-score:", test[3])
    print("训练集AUC:", train[4])
    print("测试集AUC:", test[4])

    return train, test
# Logistic regression baseline (default hyper-parameters, fixed seed)
lr = LogisticRegression(random_state=2018)
lr.fit(x_train_standard, y_train)
y_train_predict = lr.predict(x_train_standard)
y_test_predict = lr.predict(x_test_standard)
# Column 1 of predict_proba is the positive-class probability, used for AUC/ROC
y_train_proba = lr.predict_proba(x_train_standard)[:, 1]
y_test_proba = lr.predict_proba(x_test_standard)[:, 1]
Logistic_train, Logistic_test = get_scores('Logistic', y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
训练集准确率: 0.79826812059
测试集准确率: 0.791323859387
训练集精准率: 0.711055276382
测试集精准率: 0.624309392265
训练集召回率: 0.355081555834
测试集召回率: 0.348765432099
训练集F1-score: 0.473640167364
测试集F1-score: 0.447524752475
训练集AUC: 0.806606203682
测试集AUC: 0.79632981122
# Linear SVM (default hyper-parameters, fixed seed)
svm_linearSVC = LinearSVC(random_state=2018)
svm_linearSVC.fit(x_train_standard, y_train)
y_train_predict = svm_linearSVC.predict(x_train_standard)
y_test_predict = svm_linearSVC.predict(x_test_standard)
# LinearSVC has no predict_proba; the signed margin from decision_function
# works as the ranking score for AUC/ROC instead.
y_train_proba = svm_linearSVC.decision_function(x_train_standard)
y_test_proba = svm_linearSVC.decision_function(x_test_standard)
SVM_train, SVM_test = get_scores('SVM', y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
训练集准确率: 0.79826812059
测试集准确率: 0.791323859387
训练集精准率: 0.737288135593
测试集精准率: 0.622950819672
训练集召回率: 0.32747804266
测试集召回率: 0.351851851852
训练集F1-score: 0.453518679409
测试集F1-score: 0.449704142012
训练集AUC: 0.807375460649
测试集AUC: 0.789623779752
# Decision tree (default hyper-parameters — no depth limit; the perfect
# 1.0 training scores in the output below show it memorizes the training set)
tree = DecisionTreeClassifier(random_state=2018)
tree.fit(x_train_standard, y_train)
y_train_predict = tree.predict(x_train_standard)
y_test_predict = tree.predict(x_test_standard)
# Positive-class probability for AUC/ROC
y_train_proba = tree.predict_proba(x_train_standard)[:, 1]
y_test_proba = tree.predict_proba(x_test_standard)[:, 1]
DT_train, DT_test = get_scores('DT', y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
训练集准确率: 1.0
测试集准确率: 0.70531039641
训练集精准率: 1.0
测试集精准率: 0.404371584699
训练集召回率: 1.0
测试集召回率: 0.456790123457
训练集F1-score: 1.0
测试集F1-score: 0.428985507246
训练集AUC: 1.0
测试集AUC: 0.620793877128
# Random forest: 1000 gini trees, out-of-bag scoring enabled, all CPU cores
rf = RandomForestClassifier(n_estimators=1000,criterion='gini',oob_score=True,
random_state=2018,verbose=0,n_jobs=-1)
rf.fit(x_train_standard, y_train)
y_train_predict = rf.predict(x_train_standard)
y_test_predict = rf.predict(x_test_standard)
# Positive-class probability for AUC/ROC
y_train_proba = rf.predict_proba(x_train_standard)[:, 1]
y_test_proba = rf.predict_proba(x_test_standard)[:, 1]
RM_train, RM_test = get_scores('RM', y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
训练集准确率: 1.0
测试集准确率: 0.800299177263
训练集精准率: 1.0
测试集精准率: 0.705035971223
训练集召回率: 1.0
测试集召回率: 0.302469135802
训练集F1-score: 1.0
测试集F1-score: 0.423326133909
训练集AUC: 1.0
测试集AUC: 0.787944986777
# XGBoost (default hyper-parameters, fixed seed)
xgb = XGBClassifier(random_state=2018)
xgb.fit(x_train_standard, y_train)
y_train_predict = xgb.predict(x_train_standard)
y_test_predict = xgb.predict(x_test_standard)
# Positive-class probability for AUC/ROC
y_train_proba = xgb.predict_proba(x_train_standard)[:, 1]
y_test_proba = xgb.predict_proba(x_test_standard)[:, 1]
XGBoost_train, XGBoost_test = get_scores('XGBoost', y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
训练集准确率: 0.846375881976
测试集准确率: 0.806282722513
训练集精准率: 0.842672413793
测试集精准率: 0.707006369427
训练集召回率: 0.490589711418
测试集召回率: 0.342592592593
训练集F1-score: 0.620142743854
测试集F1-score: 0.461538461538
训练集AUC: 0.909125506734
测试集AUC: 0.812420021206
# Side-by-side comparison of all five models' scores.
model_name = ['Logistic','SVM','DecisionTree','RandomForest','xgboost']
columns = ['accuracy','precision','recall','f1','roc_auc']
ttype = ['train','test']
model_score_train = [Logistic_train, SVM_train, DT_train, RM_train, XGBoost_train]
model_score_test = [Logistic_test, SVM_test, DT_test, RM_test, XGBoost_test]
# One two-row (train/test) frame per model, then stacked with the model
# names as the outer index level.
pd_list = [
    pd.DataFrame([train, test], index=ttype, columns=columns)
    for train, test in zip(model_score_train, model_score_test)
]
# Bare expression: its value is what the notebook cell displays.
pd.concat(pd_list, axis=0, keys=model_name)
accuracy | precision | recall | f1 | roc_auc | ||
---|---|---|---|---|---|---|
Logistic | train | 0.798268 | 0.711055 | 0.355082 | 0.473640 | 0.806606 |
test | 0.791324 | 0.624309 | 0.348765 | 0.447525 | 0.796330 | |
SVM | train | 0.798268 | 0.737288 | 0.327478 | 0.453519 | 0.807375 |
test | 0.791324 | 0.622951 | 0.351852 | 0.449704 | 0.789624 | |
DecisionTree | train | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
test | 0.705310 | 0.404372 | 0.456790 | 0.428986 | 0.620794 | |
RandomForest | train | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
test | 0.800299 | 0.705036 | 0.302469 | 0.423326 | 0.787945 | |
xgboost | train | 0.846376 | 0.842672 | 0.490590 | 0.620143 | 0.909126 |
test | 0.806283 | 0.707006 | 0.342593 | 0.461538 | 0.812420 |