from svm import *
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
#from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
#from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils.class_weight import compute_class_weight
import xgboost
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import os
# Expose only CUDA device 0 to GPU-backed libraries.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Load the dataset. The CSV is read without a header row; columns 0-84 are
# taken as the features and column 85 as the integer class label.
dataframe = pd.read_csv("Train.csv", header=None)
dataset = dataframe.values
X = dataset[:, 0:85].astype(float)
Y = dataset[:, 85].astype(int)

# Encode class values as consecutive integers.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

# Convert the integer labels to dummy variables (one-hot encoding).
dummy_y = np_utils.to_categorical(encoded_Y)

# Hold out 30% of the samples for testing. The seed is written as the
# integer 1, which is what the previous `random_state=True` evaluated to.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, dummy_y, test_size=0.3, random_state=1)

# "Balanced" class weights computed from column 1 of the one-hot training
# labels. `classes` and `y` are passed by keyword (and `classes` as an
# ndarray) because recent scikit-learn releases no longer accept them
# positionally; the keyword form is also valid on older releases.
weight = compute_class_weight(
    'balanced', classes=np.array([0, 1]), y=Y_train[:, 1])
# In[1]: Decision tree, bagging and random forest on the dataset.
# In[1-1]: Evaluate a single decision tree.
dtc_class_weight = {0: weight[0], 1: weight[1]}
tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    class_weight=dtc_class_weight,
)
# fit() returns the estimator itself, so the name does not need re-binding.
tree.fit(X_train, Y_train[:, 1])

# Hard class predictions feed the confusion matrix and the text report.
y1_test_pred_DTC = tree.predict(X_test)
dtc_confusion = confusion_matrix(Y_test[:, 1], y1_test_pred_DTC)
print("DecisionTree Confusion matrix (test):\n {0}\n".format(dtc_confusion))
dtc_report = classification_report(Y_test[:, 1], y1_test_pred_DTC)
print("DecisionTree Classification report (test):\n {0}".format(dtc_report))

# Column 1 of predict_proba is used as the score for the ROC curve.
DTC_pred = tree.predict_proba(X_test)
dtc_roc = roc_curve(Y_test[:, 1], DTC_pred[:, 1])
fpr_dtc1, tpr_dtc1, thresholds_dtc1 = dtc_roc
roc_auc_dtc1 = auc(fpr_dtc1, tpr_dtc1)
# In[1-2]: Evaluate a bagging classifier made of 500 class-weighted,
# L1-penalised logistic regressions, each fitted on a bootstrap sample
# (see the scikit-learn documentation for the full parameter list).
bag = BaggingClassifier(
    # The solver is pinned to liblinear because the L1 penalty is not
    # supported by every solver; relying on the library default makes the
    # fit fail on scikit-learn releases whose default solver is lbfgs.
    # NOTE(review): newer scikit-learn renames `base_estimator` to
    # `estimator`; the original keyword is kept for the release this file
    # appears to target - confirm before upgrading.
    base_estimator=LogisticRegression(
        penalty="l1", C=1.5, solver="liblinear",
        class_weight={0: weight[0], 1: weight[1]}),
    n_estimators=500, max_samples=1.0, max_features=1.0,
    bootstrap=True, bootstrap_features=False, n_jobs=1, random_state=1)
bag = bag.fit(X_train, Y_train[:, 1])

# Hard class predictions for the confusion matrix and the text report.
y1_test_pred_BAG = bag.predict(X_test)
print("Bagging Confusion matrix (test):\n {0}\n".format(confusion_matrix(Y_test[:, 1], y1_test_pred_BAG)))
print("Bagging Classification report (test):\n {0}".format(classification_report(Y_test[:, 1], y1_test_pred_BAG)))

# Column 1 of predict_proba is used as the score for the ROC curve.
BAG_pred = bag.predict_proba(X_test)
fpr_bag1, tpr_bag1, thresholds_bag1 = roc_curve(Y_test[:, 1], BAG_pred[:, 1])
roc_auc_bag1 = auc(fpr_bag1, tpr_bag1)
# In[1-3]: Random forest (a bagging-style ensemble of 1000 trees).
rf = RandomForestClassifier(
    n_estimators=1000, max_features='sqrt', max_depth=None,
    min_samples_split=2, bootstrap=True, n_jobs=1, random_state=1,
    class_weight={0: weight[0], 1: weight[1]})
rf = rf.fit(X_train, Y_train[:, 1])

# Predict with the random forest itself. The previous code called
# bag.predict() here, so the "RandomForest" confusion matrix and report
# were really describing the bagging model.
y1_test_pred_RFC = rf.predict(X_test)
print("RandomForest Confusion matrix (test):\n {0}\n".format(confusion_matrix(Y_test[:, 1], y1_test_pred_RFC)))
print("RandomForest Classification report (test):\n {0}".format(classification_report(Y_test[:, 1], y1_test_pred_RFC)))

# Column 1 of predict_proba is used as the score for the ROC curve.
RFC_pred = rf.predict_proba(X_test)
fpr_rfc1, tpr_rfc1, thresholds_rfc1 = roc_curve(Y_test[:, 1], RFC_pred[:, 1])
roc_auc_rfc1 = auc(fpr_rfc1, tpr_rfc1)
# In[1-plot]: ROC curves for the decision tree, bagging and random forest.
# Start a fresh figure so these curves do not share axes with any other
# plot drawn by this script; plt.legend(labels) labels the lines of the
# current axes in the order they were added.
plt.figure()
plt.plot(fpr_dtc1, tpr_dtc1, lw=2, alpha=.6)
plt.plot(fpr_bag1, tpr_bag1, lw=2, alpha=.6)
plt.plot(fpr_rfc1, tpr_rfc1, lw=2, alpha=.6)
# Diagonal reference line (TPR == FPR); it gets no legend entry.
plt.plot([0, 1], [0, 1], lw=2, linestyle="--")
plt.xlim([0, 1])
plt.ylim([0, 1.05])
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC curve for DecisionTree,Bagging,RandomForest")
# Labels follow the plotting order above. The format strings previously
# had a stray closing parenthesis and did not say the number is the AUC.
plt.legend(["DecisionTree (AUC {:.4f})".format(roc_auc_dtc1),
            "Bagging (AUC {:.4f})".format(roc_auc_bag1),
            "RandomForest (AUC {:.4f})".format(roc_auc_rfc1)], fontsize=8, loc=2)
# In[2]: AdaBoost, GBDT and XGBoost on the dataset.
# In[2-1]: AdaBoost classifier.
ada = AdaBoostClassifier(
    n_estimators=1000,
    learning_rate=0.0001,
    random_state=1,
)
# fit() returns the estimator itself, so the name does not need re-binding.
ada.fit(X_train, Y_train[:, 1])

# Hard class predictions feed the confusion matrix and the text report.
y1_test_pred_ADA = ada.predict(X_test)
ada_confusion = confusion_matrix(Y_test[:, 1], y1_test_pred_ADA)
print("AdaBoost Confusion matrix (test):\n {0}\n".format(ada_confusion))
ada_report = classification_report(Y_test[:, 1], y1_test_pred_ADA)
print("AdaBoost Classification report (test):\n {0}".format(ada_report))

# Column 1 of predict_proba is used as the score for the ROC curve.
ADA_pred = ada.predict_proba(X_test)
ada_roc = roc_curve(Y_test[:, 1], ADA_pred[:, 1])
fpr_ada1, tpr_ada1, thresholds_ada1 = ada_roc
roc_auc_ada1 = auc(fpr_ada1, tpr_ada1)
# In[2-2]: GradientBoosting classifier.
gbdt = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.0001,
    random_state=1,
)
# fit() returns the estimator itself, so the name does not need re-binding.
gbdt.fit(X_train, Y_train[:, 1])

# Hard class predictions feed the confusion matrix and the text report.
y1_test_pred_GDBT = gbdt.predict(X_test)
gbdt_confusion = confusion_matrix(Y_test[:, 1], y1_test_pred_GDBT)
print("GradientBoosting Confusion matrix (test):\n {0}\n".format(gbdt_confusion))
gbdt_report = classification_report(Y_test[:, 1], y1_test_pred_GDBT)
print("GradientBoosting Classification report (test):\n {0}".format(gbdt_report))

# Column 1 of predict_proba is used as the score for the ROC curve.
GDBT_pred = gbdt.predict_proba(X_test)
gbdt_roc = roc_curve(Y_test[:, 1], GDBT_pred[:, 1])
fpr_gdbt1, tpr_gdbt1, thresholds_gdbt1 = gbdt_roc
roc_auc_gdbt1 = auc(fpr_gdbt1, tpr_gdbt1)
# In[2-3]: XGBoost classifier.
# The positive-class scale is the ratio of the two balanced class weights.
# NOTE: random_state=1 is commented out in this call, so no seed is passed.
xgb_pos_scale = weight[1] / weight[0]
xgb = xgboost.XGBClassifier(
    n_estimators=10000,
    learning_rate=0.0001,
    scale_pos_weight=xgb_pos_scale,
)
xgb = xgb.fit(X_train, Y_train[:, 1])

# Hard class predictions feed the confusion matrix and the text report.
y1_test_pred_XGB = xgb.predict(X_test)
xgb_confusion = confusion_matrix(Y_test[:, 1], y1_test_pred_XGB)
print("XGBoost Confusion matrix (test):\n {0}\n".format(xgb_confusion))
xgb_report = classification_report(Y_test[:, 1], y1_test_pred_XGB)
print("XGBoost Classification report (test):\n {0}".format(xgb_report))

# Column 1 of predict_proba is used as the score for the ROC curve.
XGB_pred = xgb.predict_proba(X_test)
xgb_roc = roc_curve(Y_test[:, 1], XGB_pred[:, 1])
fpr_xgb1, tpr_xgb1, thresholds_xgb1 = xgb_roc
roc_auc_xgb1 = auc(fpr_xgb1, tpr_xgb1)
# In[2-plot]: ROC curves for AdaBoost, GradientBoosting and XGBoost.
# Start a fresh figure. plt.legend(labels) labels the lines of the current
# axes in the order they were added, so drawing onto axes that already hold
# other curves would attach these labels to the wrong lines.
plt.figure()
plt.plot(fpr_ada1, tpr_ada1, lw=2, alpha=.6)
plt.plot(fpr_gdbt1, tpr_gdbt1, lw=2, alpha=.6)
plt.plot(fpr_xgb1, tpr_xgb1, lw=2, alpha=.6)
# Diagonal reference line (TPR == FPR); it gets no legend entry.
plt.plot([0, 1], [0, 1], lw=2, linestyle="--")
plt.xlim([0, 1])
plt.ylim([0, 1.05])
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC curve for Boosting,GradientBoosting,XGBoost")
# Labels follow the plotting order above. The format strings previously
# had a stray closing parenthesis and did not say the number is the AUC.
plt.legend(["AdaBoost (AUC {:.4f})".format(roc_auc_ada1),
            "GradientBoosting (AUC {:.4f})".format(roc_auc_gdbt1),
            "XGBoost (AUC {:.4f})".format(roc_auc_xgb1)], fontsize=8, loc=2)
# In[3]: Blending, stacking and simple averaging on the dataset; see the
# heamy documentation for details of the parameters used below.
# In[3-1]: Blending
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline
# NOTE(review): cross_validation is not referenced in the visible code and
# this module is absent from newer scikit-learn releases - confirm whether
# the import is still needed.
from sklearn import cross_validation
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Wrap the train/test arrays for heamy (this re-binds the name `dataset`).
dataset = Dataset(X_train, Y_train, X_test)

# First-level models: a random forest and a linear regression.
model_rf = Regressor(
    dataset=dataset,
    estimator=RandomForestRegressor,
    parameters={'n_estimators': 50},
    name='rf',
)
model_lr = Regressor(
    dataset=dataset,
    estimator=LinearRegression,
    parameters={'normalize': True},
    name='lr',
)

# Blend the two models; the result is used as the dataset of the
# second-level model below.
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.blend(proportion=0.2, seed=111)

# Second level: a linear regression fitted on the blended dataset.
bending = Regressor(dataset=stack_ds, estimator=LinearRegression)
y1_test_pred_BED = bending.predict()

# Threshold column 1 of the regression output at 0.5 to get class labels.
bed_labels = np.int64(y1_test_pred_BED[:, 1] > 0.5)
print("Blending Confusion matrix (test):\n {0}\n".format(confusion_matrix(Y_test[:, 1], bed_labels)))
print("Blending Classification report (test):\n {0}".format(classification_report(Y_test[:, 1], bed_labels)))

# The raw (unthresholded) column 1 output is the score for the ROC curve.
BED_pred = bending.predict()
bed_roc = roc_curve(Y_test[:, 1], BED_pred[:, 1])
fpr_bed1, tpr_bed1, thresholds_bed1 = bed_roc
roc_auc_bed1 = auc(fpr_bed1, tpr_bed1)
# In[3-2]: Stacking
# Build the heamy dataset from the train/test arrays.
dataset = Dataset(X_train, Y_train, X_test)

# First-level models: a random forest and a linear regression.
model_rf = Regressor(
    dataset=dataset,
    estimator=RandomForestRegressor,
    parameters={'n_estimators': 50},
    name='rf',
)
model_lr = Regressor(
    dataset=dataset,
    estimator=LinearRegression,
    parameters={'normalize': True},
    name='lr',
)

# Stack the two models with k=10; the result is used as the dataset of the
# second-level model below.
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=111)

# Second level: a linear regression fitted on the stacked dataset.
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
y1_test_pred_STK = stacker.predict()

# Threshold column 1 of the regression output at 0.5 to get class labels.
stk_labels = np.int64(y1_test_pred_STK[:, 1] > 0.5)
print("Stacking Confusion matrix (test):\n {0}\n".format(confusion_matrix(Y_test[:, 1], stk_labels)))
print("Stacking Classification report (test):\n {0}".format(classification_report(Y_test[:, 1], stk_labels)))

# The raw (unthresholded) column 1 output is the score for the ROC curve.
STK_pred = stacker.predict()
stk_roc = roc_curve(Y_test[:, 1], STK_pred[:, 1])
fpr_stk1, tpr_stk1, thresholds_stk1 = stk_roc
roc_auc_stk1 = auc(fpr_stk1, tpr_stk1)
# In[3-3]: Averaging the predictions of several models
from sklearn.neighbors import KNeighborsRegressor

dataset = Dataset(X_train, Y_train, X_test)

# Three first-level regressors whose outputs are averaged.
model_rf = Regressor(
    dataset=dataset,
    estimator=RandomForestRegressor,
    parameters={'n_estimators': 151},
    name='rf',
)
model_lr = Regressor(
    dataset=dataset,
    estimator=LinearRegression,
    parameters={'normalize': True},
    name='lr',
)
model_knn = Regressor(
    dataset=dataset,
    estimator=KNeighborsRegressor,
    parameters={'n_neighbors': 15},
    name='knn',
)
pipeline = ModelsPipeline(model_rf, model_lr, model_knn)

# Build the mean step, then execute it to obtain the predictions.
mean_step = pipeline.mean()
every = mean_step.execute()

# Threshold column 1 of the averaged output at 0.5 to get class labels.
eve_labels = np.int64(every[:, 1] > 0.5)
print("Mean Confusion matrix (test):\n {0}\n".format(confusion_matrix(Y_test[:, 1], eve_labels)))
print("Mean Classification report (test):\n {0}".format(classification_report(Y_test[:, 1], eve_labels)))

# The raw (unthresholded) column 1 output is the score for the ROC curve.
eve_roc = roc_curve(Y_test[:, 1], every[:, 1])
fpr_eve1, tpr_eve1, thresholds_eve1 = eve_roc
roc_auc_eve1 = auc(fpr_eve1, tpr_eve1)
# Disabled alternative kept for reference (weighted combination):
#weights = pipeline.find_weights(mean_absolute_error)
#result = pipeline.weight(weights)
# In[3-plot]: ROC curves for blending, stacking and averaging.
# Start a fresh figure. plt.legend(labels) labels the lines of the current
# axes in the order they were added, so drawing onto axes that already hold
# other curves would attach these labels to the wrong lines.
plt.figure()
plt.plot(fpr_bed1, tpr_bed1, lw=2, alpha=.6)
plt.plot(fpr_stk1, tpr_stk1, lw=2, alpha=.6)
plt.plot(fpr_eve1, tpr_eve1, lw=2, alpha=.6)
# Diagonal reference line (TPR == FPR); it gets no legend entry.
plt.plot([0, 1], [0, 1], lw=2, linestyle="--")
plt.xlim([0, 1])
plt.ylim([0, 1.05])
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC curve for Blending,Stacking,Mean")
# Labels follow the plotting order above. The format strings previously
# had a stray closing parenthesis and did not say the number is the AUC.
plt.legend(["Blending (AUC {:.4f})".format(roc_auc_bed1),
            "Stacking (AUC {:.4f})".format(roc_auc_stk1),
            "Mean (AUC {:.4f})".format(roc_auc_eve1)], fontsize=8, loc=2)
# In[baseline]: Logistic regression baseline.
# The solver is pinned to liblinear because the L1 penalty is not supported
# by every solver; relying on the library default makes the fit fail on
# scikit-learn releases whose default solver is lbfgs.
lr_clf2 = LogisticRegression(
    penalty="l1", C=1.5, solver="liblinear",
    class_weight={0: weight[0], 1: weight[1]})
lr_clf2.fit(X_train, Y_train[:, 1])

# Hard class predictions for the confusion matrix and the text report.
y1_test_pred_LR = lr_clf2.predict(X_test)
print("LR Confusion matrix (test):\n {0}\n".format(confusion_matrix(Y_test[:, 1], y1_test_pred_LR)))
print("LR Classification report (test):\n {0}".format(classification_report(Y_test[:, 1], y1_test_pred_LR)))

# In[baseline-plot]: ROC curve of the baseline.
# Column 1 of predict_proba is used as the score for the ROC curve.
LR_pred = lr_clf2.predict_proba(X_test)
fpr_lr2, tpr_lr2, thresholds_lr2 = roc_curve(Y_test[:, 1], LR_pred[:, 1])
roc_auc_lr2 = auc(fpr_lr2, tpr_lr2)
# Start a fresh figure so the single legend entry below labels this curve
# rather than the first line of a previously drawn plot.
plt.figure()
plt.plot(fpr_lr2, tpr_lr2, lw=2, alpha=.6)
# Diagonal reference line (TPR == FPR); it gets no legend entry.
plt.plot([0, 1], [0, 1], lw=2, linestyle="--")
plt.xlim([0, 1])
plt.ylim([0, 1.05])
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC curve")
plt.legend(["Logistic Reg (AUC {:.4f})".format(roc_auc_lr2)], fontsize=8, loc=2)
# Disabled: 10-fold cross-validation of the blending model.
#results10 = bending.validate(k=10,scorer=mean_absolute_error)