文章目录
前言
入门机器学习,记录学习日常,如有错误请多指正。
参考书目:机器学习及Python应用
数据集可在陈强教授主页下载
一、数据预处理
1.数据介绍
案例采用葡萄牙银行市场营销(bank market)数据集。响应变量y(取值yes或no)表示在接到银行的直销电话后,客户是否会购买银行定期存款产品。特征变量包括个人特征、经济状况、营销状态等。
2.导入模块和数据文件
1)导入案例所需的全部模块
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.metrics import cohen_kappa_score
2)导入数据
# Load the semicolon-separated bank marketing data set.
# A raw string keeps every backslash of the Windows path literal; in a normal
# string '\数' is an invalid escape sequence that recent Python versions warn about.
bank=pd.read_csv(r'D:\数据集\MLPython_Data\bank-additional.csv',sep=';')
3.数据概况
# Allow up to 30 columns in printed frames so the preview is not truncated.
pd.options.display.max_columns = 30
# Report the (rows, columns) shape, then preview the first observations.
print(bank.shape)
print(bank.head())
(4119, 21)#数据框形状
#前五行观测值
age job marital education default housing loan \
0 30 blue-collar married basic.9y no yes no
1 39 services single high.school no no no
2 25 services married high.school no yes no
3 38 services married basic.9y no unknown unknown
4 47 admin. married university.degree no yes no
contact month day_of_week duration campaign pdays previous \
0 cellular may fri 487 2 999 0
1 telephone may fri 346 4 999 0
2 telephone jun wed 227 1 999 0
3 telephone jun fri 17 3 999 0
4 cellular nov mon 58 1 999 0
poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m \
0 nonexistent -1.8 92.893 -46.2 1.313
1 nonexistent 1.1 93.994 -36.4 4.855
2 nonexistent 1.4 94.465 -41.8 4.962
3 nonexistent 1.4 94.465 -41.8 4.959
4 nonexistent -0.1 93.200 -42.0 4.191
nr.employed y
0 5099.1 no
1 5191.0 no
2 5228.1 no
3 5228.1 no
4 5195.8 no
考察样本中购买金融产品意愿的人数与比例
# Drop the 'duration' column, which this walkthrough treats as an unused feature.
bank = bank.drop(columns='duration')
# Count the yes/no responses, then show the same counts as shares of the sample.
print('购买金融产品意愿人数:', bank['y'].value_counts())
print('购买金融产品意愿比例', bank['y'].value_counts(normalize=True))
购买金融产品意愿人数: y
no 3668
yes 451
Name: count, dtype: int64
购买金融产品意愿比例 y
no 0.890507
yes 0.109493
Name: proportion, dtype: float64
4.将字符型分类变量虚拟化
# Response variable: the last column of the frame.
y = bank.iloc[:, -1]
# Raw features: every column except the last one.
X_raw = bank.iloc[:, :-1]
# Expand the string-valued columns into dummy (indicator) columns.
X = pd.get_dummies(X_raw)
print(X.head(2))
#虚拟化后矩阵X前两行
age campaign pdays previous emp.var.rate cons.price.idx \
0 30 2 999 0 -1.8 92.893
1 39 4 999 0 1.1 93.994
cons.conf.idx euribor3m nr.employed job_admin. job_blue-collar \
0 -46.2 1.313 5099.1 False True
1 -36.4 4.855 5191.0 False False
job_entrepreneur job_housemaid job_management job_retired ... \
0 False False False False ...
1 False False False False ...
month_jul month_jun month_mar month_may month_nov month_oct \
0 False False False True False False
1 False False False True False False
month_sep day_of_week_fri day_of_week_mon day_of_week_thu \
0 False True False False
1 False True False False
day_of_week_tue day_of_week_wed poutcome_failure poutcome_nonexistent \
0 False False False True
1 False False False True
poutcome_success
0 False
1 False
[2 rows x 62 columns]
5.分层抽样
# Stratified hold-out split: 1000 observations go to the test set and
# stratify=y keeps the yes/no shares the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1000,
    stratify=y,
    random_state=1,
)
二、分类树
1.DecisionTreeClassifier类介绍
def __init__(self,
*,
criterion: Any = "gini",#用来衡量决策树节点分裂的标准,可以是'gini'(基尼不纯度)或'entropy'(信息熵)
splitter: Any = "best",#用于指定用来分割节点的策略,可以是'best'(基于基尼不纯度或信息增益进行选择)或'random'(随机选择)
max_depth: Any = None,#树的最大深度
min_samples_split: Any = 2,#拆分内部节点所需的最小样本数
min_samples_leaf: Any = 1,#在叶节点中所需的最小样本数
min_weight_fraction_leaf: Any = 0.0,#叶节点所需的最小权重占比(占所有输入样本权重总和的比例,而不是样本数);若某次分裂会使任一子节点低于该比例,则不进行该分裂;未提供sample_weight时各样本权重相等
max_features: Any = None,#考虑用于分割的特征的最大数量
random_state: Any = None,#指定随机数生成器的种子
max_leaf_nodes: Any = None,#最大叶节点数
min_impurity_decrease: Any = 0.0,#分裂节点后,节点不纯度的减少的最小值
class_weight: Any = None,#为不同的类别设置不同的权重
ccp_alpha: Any = 0.0,#成本复杂性参数
monotonic_cst: Any = None) -> None
2.创建实例
# Fit a shallow tree (depth capped at 2) on the training set and report
# its accuracy on the held-out test set.
model = DecisionTreeClassifier(max_depth=2, random_state=123).fit(X_train, y_train)
print('预测准确率:', model.score(X_test, y_test))
预测准确率: 0.904
3.决策树图像
# Draw the fitted tree: node ids shown, rounded boxes, values at 2 decimals.
plot_tree(
    model,
    feature_names=X.columns,
    node_ids=True,
    rounded=True,
    precision=2,
)
plt.show()
4.成本复杂性参数与叶节点总不纯度关系图
# Compute the cost-complexity pruning path of an unrestricted tree on the
# training set: the candidate alphas and the total leaf impurity at each one.
model=DecisionTreeClassifier(random_state=123)
path=model.cost_complexity_pruning_path(X_train,y_train)
# Step plot: total leaf impurity stays constant between consecutive alphas.
plt.plot(path.ccp_alphas,path.impurities,marker='o',drawstyle='steps-post')
plt.xlabel('alpha (cost-complexity parameter)')
# For a classification tree the impurity is the split criterion (Gini by
# default), not a mean squared error, so the axis is labelled accordingly.
plt.ylabel('Total Leaf Impurity')
plt.title('Total Leaf Impurity vs alpha for Training Set')
plt.show()
三、选择最优超参数
1.十折交叉验证
# Tune ccp_alpha over the candidate values from the pruning path, using
# stratified, shuffled 10-fold cross-validation on the training set.
param_grid={'ccp_alpha':path.ccp_alphas}
kfold=StratifiedKFold(n_splits=10,shuffle=True,random_state=1)
model=GridSearchCV(DecisionTreeClassifier(random_state=123),param_grid,cv=kfold)  # grid search
model.fit(X_train,y_train)
print('最佳超参数',model.best_params_)
model=model.best_estimator_  # keep only the best tree found by the search
# score() expects (X, y); the label belongs in print(), not in the score() call.
print('最优决策树预测准确率:',model.score(X_test,y_test))
最佳超参数 {'ccp_alpha': np.float64(0.0021510777681259807)}
最优决策树预测准确率:0.904
2.最优决策树图像
# Draw the selected tree; proportion=True shows shares instead of raw counts.
plot_tree(
    model,
    feature_names=X.columns,
    node_ids=True,
    proportion=True,
    rounded=True,
    precision=2,
)
plt.show()
3.变量重要性柱状图
# Order the features by importance, ascending.
sorted_index = model.feature_importances_.argsort()
# One horizontal bar per feature, labelled with the matching column name.
bar_positions = np.arange(X.shape[1])
plt.barh(bar_positions, model.feature_importances_[sorted_index])
plt.yticks(bar_positions, X.columns[sorted_index])
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Decision Tree')
plt.tight_layout()
plt.show()
4.混淆矩阵
# Cross-tabulate actual vs. predicted test labels to form the confusion matrix.
pred = model.predict(X_test)
table = pd.crosstab(
    y_test,
    pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print('混淆矩阵:', table)
混淆矩阵:
Predicted no yes
Actual
no 880 11
yes 85 24
相关预测指标
# Work on the raw counts: rows are actual classes, columns are predicted ones.
table = np.array(table)
# Accuracy: correctly classified observations (diagonal) over all observations.
Accuracy = (table[0, 0] + table[1, 1]) / table.sum()
print('预测准确率:', Accuracy)
# Sensitivity: share of the second actual class (row 1) predicted as that class.
Sensitivity = table[1, 1] / table[1].sum()
print('灵敏度:', Sensitivity)
# Cohen's kappa between the actual and predicted test labels.
print('cohen_kappa:', cohen_kappa_score(y_test, pred))
预测准确率: 0.904
灵敏度: 0.22018348623853212
cohen_kappa: 0.2960328518002493
5.降低预测概率门槛值
在作以上预测时,默认以“概率大于0.5”作为预测标准。为提高算法的灵敏度,以识别更多有潜在购买意向的客户,可降低此概率门槛值,例如将“概率不低于0.1”设置为有购买意向。
# Class-membership probabilities for the test set; column 1 is used as the
# 'yes' class (assumes model.classes_ is ordered ['no', 'yes'] — verify).
prob = model.predict_proba(X_test)
prob_yes = prob[:, 1]
# Flag a customer as a likely buyer once the 'yes' probability reaches 0.1.
pred_new = prob_yes >= 0.1
table = pd.crosstab(
    y_test,
    pred_new,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(table)
Predicted False True
Actual
no 821 70
yes 50 59
相关预测指标
# Work on the raw counts: rows are actual classes, columns are predicted flags.
table = np.array(table)
# Accuracy: correctly classified observations (diagonal) over all observations.
Accuracy = (table[0, 0] + table[1, 1]) / table.sum()
print('预测准确率:', Accuracy)
# Sensitivity: share of the second actual class (row 1) that was flagged True.
Sensitivity = table[1, 1] / table[1].sum()
print('灵敏度:', Sensitivity)
预测准确率: 0.88
灵敏度: 0.5412844036697247
预测准确率 0.904
结果显示,降低预测概率门槛值后预测准确率降低为88%,但灵敏度却提高为54.1%。
四、分类树决策边界
利用iris数据的后两个特征变量进行分类树估计,并画出相应的决策边界。
通过10折交叉验证选择最优决策树
from sklearn.datasets import load_iris
from mlxtend.plotting import plot_decision_regions
# Load iris and keep only feature columns 2 and 3 for a two-dimensional problem.
X, y = load_iris(return_X_y=True)
X2 = X[:, 2:4]
# Candidate alphas come from the pruning path of an unrestricted tree on X2.
path = DecisionTreeClassifier(random_state=123).cost_complexity_pruning_path(X2, y)
param_grid = {'ccp_alpha': path.ccp_alphas}
# Pick ccp_alpha by stratified, shuffled 10-fold cross-validation.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
model = GridSearchCV(
    DecisionTreeClassifier(random_state=123),
    param_grid,
    cv=kfold,
)
model.fit(X2, y)
# NOTE: this accuracy is measured on the same data the model was fitted on.
print('准确率:', model.score(X2, y))
准确率: 0.9933333333333333
画出决策边界
# Shade the regions the fitted classifier assigns to each class over the
# two selected features, with the observations overlaid.
plot_decision_regions(X2, y, clf=model)
plt.title('Decision Boundary for Decision Tree')
plt.xlabel('petal_length')
plt.ylabel('petal_width')
plt.show()