数据来源:http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
开发工具:Jupyter notebook
语言版本:python 3.6
1.查看数据情况
(1)数据
import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
# Read the two ';'-delimited CSV files: bank-full.csv becomes the training
# frame `data`, bank.csv becomes the evaluation frame `test`.
data = pd.read_csv("D:\\bank-full.csv",sep = ';')
test = pd.read_csv("D:\\bank.csv",sep = ';')
# Preview the first rows of the training frame (notebook cell output).
data.head()
(2)部分列数据及关系
# Scatter plot of the 'balance' column against the 'age' column.
data.plot.scatter(x='age', y='balance')
import seaborn as sns
# Box plot of the 'duration' column grouped by the target column 'y',
# drawn on a 12x6 figure.
plt.figure(figsize = (12, 6))
sns.boxplot(x = 'y', y = 'duration', data = data)
# Rotate the x tick labels by 45 degrees; the result is bound to `xt`
# (presumably to keep the notebook from echoing the return value).
xt = plt.xticks(rotation=45)
plt.show()
# Distribution plot of the 'age' column.
sns.distplot(data['age'])
(3)空值查看
# For each column, report whether it contains at least one missing value
# (axis=0 reduces down the rows, i.e. one boolean per column).
data.isnull().any(axis=0)
test.isnull().any(axis=0)
2.特征工程
特征工程:先对非数值特征进行one-hot编码,再对所有特征进行标准化(零均值、单位方差,即preprocessing.scale的处理方式)
from sklearn import preprocessing
def get_y_train():
    """Return the training labels from the global ``data`` frame as a 0/1 array.

    The ``y`` column is encoded as ``'yes'`` -> 0 and any other value -> 1.

    Returns:
        numpy.ndarray: one integer label per row of ``data``.
    """
    # NOTE(review): 'yes' is mapped to 0, not 1 -- confirm that this inverted
    # encoding is intended before interpreting per-class metrics.
    labels = np.asarray(data['y'])
    return np.where(labels == 'yes', 0, 1)
def get_y_test():
    """Return the evaluation labels from the global ``test`` frame as a 0/1 array.

    The ``y`` column is encoded as ``'yes'`` -> 0 and any other value -> 1.

    Returns:
        numpy.ndarray: one integer label per row of ``test``.
    """
    # NOTE(review): 'yes' is mapped to 0, not 1 -- confirm that this inverted
    # encoding is intended before interpreting per-class metrics.
    labels = np.asarray(test['y'])
    return np.where(labels == 'yes', 0, 1)
def get_X_train():
    """Build the standardized training feature matrix from the global ``data`` frame.

    The target column ``y`` is removed, the remaining non-numeric columns are
    one-hot encoded, and every resulting column is standardized with
    ``preprocessing.scale``.

    Returns:
        numpy.ndarray: standardized features, one row per row of ``data``.
    """
    # Drop the target by name rather than slicing off the last two dummy
    # columns: the positional slice is only correct when ``y`` is the final
    # column and expands to exactly two indicator columns.
    features = pd.get_dummies(data.drop('y', axis=1))  # one-hot encode non-numeric columns
    return preprocessing.scale(features)  # standardize each column
def get_X_test():
    """Build the evaluation feature matrix, transformed like the training set.

    The ``test`` frame is one-hot encoded, its columns are aligned to the
    one-hot columns of the training frame ``data``, and it is standardized
    with the mean and standard deviation of the training features.

    Returns:
        numpy.ndarray: standardized features, one row per row of ``test``,
        with the same column layout as the training features.
    """
    # One-hot encode both frames without the target column.
    train_features = pd.get_dummies(data.drop('y', axis=1))
    test_features = pd.get_dummies(test.drop('y', axis=1))
    # A category that never occurs in the test file would otherwise produce
    # fewer (and shifted) columns than the model was fitted on; missing
    # indicator columns are filled with 0 and unknown ones are discarded.
    test_features = test_features.reindex(columns=train_features.columns, fill_value=0)
    # Standardize with the training statistics so both sets share one feature
    # space, instead of scaling the test set by its own mean and variance.
    scaler = preprocessing.StandardScaler().fit(train_features)
    return scaler.transform(test_features)
# Materialize the feature matrices and the label vectors once, at module level.
X_test, X_train = get_X_test(), get_X_train()
y_test, y_train = get_y_test(), get_y_train()
3.建立模型
(1)AdaBoost
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over depth-limited decision trees, using the SAMME variant.
ada = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=9, min_samples_split=20, min_samples_leaf=9),
    algorithm="SAMME",
    n_estimators=200,
    learning_rate=0.8,
)
ada.fit(X_train, y_train)
# Predict with the model fitted above (the original referenced an undefined
# name `bdt` here, which raised NameError).
y_pred = ada.predict(X_test)
# classification_report pairs these names with the sorted labels, so label 0
# is displayed as '1' and label 1 as '0'.
# NOTE(review): confirm this matches the intended 'yes'/'no' encoding.
target_names = ['1', '0']
print(classification_report(y_test, y_pred, target_names=target_names))
(2)Bagging
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
# Bagging ensemble of default decision trees; each base estimator is trained
# on 90% of the samples and 90% of the features. `fit` returns the estimator
# itself, so construction and training are chained.
bagging_tree = BaggingClassifier(
    tree.DecisionTreeClassifier(),
    max_samples=0.9,
    max_features=0.9,
).fit(X_train, y_train)
y_pred = bagging_tree.predict(X_test)
# Display names paired with the sorted labels (label 0 -> '1', label 1 -> '0').
target_names = ['1', '0']
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)
4.模型评估
上面代码中,使用classification_report对分类的情况进行评价,也可以画出ROC曲线查看模型的效果,大多数机器学习比赛也使用AUC进行评测。
(1)AdaBoost
from sklearn.metrics import roc_curve, auc
# ROC curve for AdaBoost, scored with the second column of predict_proba.
y_pred = ada.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, 'b', label='AdaBoost AUC = %0.2f' % roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.legend(loc='lower right')  # only the labelled ROC line appears in the legend
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
(2)Bagging
# ROC curve for the bagging ensemble, scored with the second column of predict_proba.
y_pred = bagging_tree.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, 'b', label='Bagging AUC = %0.2f' % roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.legend(loc='lower right')  # only the labelled ROC line appears in the legend
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()