回归决策树
import pandas as pd

# Load the California housing data set.
# The loader is imported from the public ``sklearn.datasets`` package: the
# private ``sklearn.datasets.california_housing`` module path is not part of
# the public API and cannot be imported in current scikit-learn releases.
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
# Inspect the loaded data set (a notebook displays this expression's value)
housing
housing的具体信息如下图所示:
# Wrap the feature matrix in a DataFrame with one named column per feature
# (only housing.data is used here; housing.target is not added as a column).
df = pd.DataFrame(data=housing.data, columns=housing.feature_names)
df.head()
# Summary statistics of the housing features
df.describe()
# Build a regression decision tree with scikit-learn
from sklearn import tree

# Limit the tree to a depth of 3
dtr = tree.DecisionTreeRegressor(max_depth=3)
dtr.fit(X=housing.data, y=housing.target)
# Recorded output:
# DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
# max_leaf_nodes=None, min_impurity_decrease=0.0,
# min_impurity_split=None, min_samples_leaf=1,
# min_samples_split=2, min_weight_fraction_leaf=0.0,
# presort=False, random_state=None, splitter='best')
# Export the fitted regression tree as DOT data and render it as a PNG image
import pydotplus
from IPython.display import Image

dot_data = tree.export_graphviz(
    dtr,                                  # the decision tree model to export
    out_file=None,                        # handle or name of the output file; None is the default
    feature_names=housing.feature_names,  # names of the features used for training
    filled=True,                          # whether to fill the nodes with colours
    impurity=True,                        # show the impurity (e.g. MSE, MAE) in each node
    rounded=True,                         # draw the node boxes with rounded corners
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
回归决策树模型的可视化结果:
# Use train_test_split to divide the raw data into a training set and a
# test (validation) set
from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test = train_test_split(
    housing.data,
    housing.target,
    test_size=0.1,  # fraction of the samples placed in the test set
    random_state=42,
)
dtr = tree.DecisionTreeRegressor(random_state=42)
dtr.fit(data_train, target_train)
dtr.score(data_test, target_test)  # recorded output: 0.637355881715626
score的说明:
score(X, y, sample_weight=None) method of sklearn.tree.tree.DecisionTreeRegressor instance
Returns the coefficient of determination R^2 of the prediction.
The coefficient R^2 is defined as (1 - u/v), where u is the residual
sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
sum of squares ((y_true - y_true.mean()) ** 2).sum().
The best possible score is 1.0 and it can be negative (because the
model can be arbitrarily worse). A constant model that always
predicts the expected value of y, disregarding the input features,
would get a R^2 score of 0.0.
Parameters
----------
X : array-like, shape = (n_samples, n_features)
Test samples. For some estimators this may be a
precomputed kernel matrix instead, shape = (n_samples,
n_samples_fitted), where n_samples_fitted is the number of
samples used in the fitting for the estimator.
y : array-like, shape = (n_samples) or (n_samples, n_outputs)
True values for X.
sample_weight : array-like, shape = [n_samples], optional
Sample weights.
Returns
-------
score : float
R^2 of self.predict(X) wrt. y.
# Importance value of every feature in the fitted tree, labelled by feature name
pd.Series(data=dtr.feature_importances_, index=housing.feature_names)
使用交叉验证调整回归决策树模型中的参数(网格搜索)
from sklearn.model_selection import GridSearchCV

# Candidate values tried for each tree hyper-parameter
tree_param_grid = {
    'max_depth': [5, 10, 12],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4],
}
# 5-fold cross-validated search over every parameter combination
grid = GridSearchCV(
    tree.DecisionTreeRegressor(),
    param_grid=tree_param_grid,
    cv=5,
)
grid.fit(data_train, target_train)
grid.cv_results_['mean_test_score'], grid.best_params_, grid.best_score_
交叉验证结果:
# Retrain the regression decision tree with the "best" parameters obtained
# from cross-validation
dtr2 = tree.DecisionTreeRegressor(
    max_depth=10,
    min_samples_split=4,
    min_samples_leaf=4,
)
dtr2.fit(data_train, target_train)
dtr2.score(data_test, target_test)  # recorded output: 0.6970711600432351
分类决策树
# Load the Paipaidai loan data set from the 'Sheet1' worksheet
df = pd.read_excel('loans.xlsx', sheet_name='Sheet1')
df.head()
df左半部分数据:
df右半部分数据:
数据集中的特征说明:
- grade:贷款级别
- sub_grade:贷款细分级别
- short_emp:一年以内短期雇佣
- emp_length_num:受雇年限
- home_ownership:居住状态(自有,按揭,租住)
- dti:贷款占收入比例
- purpose:贷款用途
- term:贷款周期
- last_delinq_none:贷款申请人是否有不良记录
- last_major_derog_none:贷款申请人是否有还款逾期90天以上记录
- revol_util:透支额度占信用比例
- total_rec_late_fee:逾期罚款总额
- safe_loans:贷款是否安全
df.shape  # recorded output: (46508, 13)
# Check, column by column, whether the raw data contains missing values
df.isna().any(axis=0)  # the recorded output shows that no column has missing values
# Recorded output:
# grade False
# sub_grade False
# short_emp False
# emp_length_num False
# home_ownership False
# dti False
# purpose False
# term False
# last_delinq_none False
# last_major_derog_none False
# revol_util False
# total_rec_late_fee False
# safe_loans False
# dtype: bool
# Separate the label column from the feature columns of the raw data
y = df['safe_loans']
X = df.drop(columns='safe_loans')
# Check whether the label values are evenly distributed
y.value_counts()
# Recorded output:
# 1 23358
# -1 23150
# Map the feature values to numeric codes, keeping one fitted LabelEncoder
# per column in ``d`` (keyed by column name).
# Note: the encoder is applied to every column of X, not only categorical ones.
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

d = defaultdict(LabelEncoder)
X_trans = X.apply(lambda col: d[col.name].fit_transform(col))
X_trans.head()
转换后的特征数据:
# Divide the data into a training set and a test (validation) set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_trans,
    y,
    random_state=1,
)

# Build a decision tree classifier with scikit-learn
from sklearn import tree

clf = tree.DecisionTreeClassifier(max_depth=3, random_state=10)
clf = clf.fit(X_train, y_train)
column_names = list(X_train.columns)  # feature names used for training
# Export the fitted classification tree as DOT data and render it as a PNG image
import pydotplus
from IPython.display import Image

dot_data = tree.export_graphviz(
    clf,                         # the decision tree model to export
    out_file=None,               # handle or name of the output file; None is the default
    feature_names=column_names,  # names of the features used for training
    filled=True,                 # whether to fill the nodes with colours
    impurity=True,               # show the impurity measure in each node
    rounded=True,                # draw the node boxes with rounded corners
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
分类决策树模型的可视化结果:
# Evaluate the model on the test set
from sklearn.metrics import accuracy_score

# By default accuracy_score computes the classification accuracy
accuracy_score(y_true=y_test, y_pred=clf.predict(X_test))
# Recorded output: 0.6161520598606691
使用交叉验证调整分类决策树模型中的参数(网格搜索)
from sklearn.model_selection import GridSearchCV

# Candidate values tried for each tree hyper-parameter
tree_param_grid = {
    'max_depth': [3, 6, 9],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4],
}
# 5-fold cross-validated search over every parameter combination
grid = GridSearchCV(
    tree.DecisionTreeClassifier(),
    param_grid=tree_param_grid,
    cv=5,
)
grid.fit(X_train, y_train)
grid.cv_results_['mean_test_score'], grid.best_params_, grid.best_score_
交叉验证结果:
# Retrain the classification decision tree with the "best" parameters
# obtained from cross-validation.
clf2 = tree.DecisionTreeClassifier(max_depth=9, min_samples_leaf=2, min_samples_split=4)
# Fit on the training split only. Fitting on (X_test, y_test) and then scoring
# on that same data leaks the test labels into the model and inflates the score.
clf2 = clf2.fit(X_train, y_train)
accuracy_score(y_test, clf2.predict(X_test))
# NOTE: the previously recorded value 0.6852154468048508 was produced by a
# model fitted on the test split; re-run this cell for the held-out accuracy.
# Measure the classifier's performance with the area under the ROC curve (AUC)
from sklearn.metrics import roc_auc_score

# AUC is a ranking metric, so it needs a continuous score per sample rather
# than hard class predictions. Column 1 of predict_proba corresponds to
# clf2.classes_[1], i.e. the greater label (1, the "safe" class), which is
# the class roc_auc_score treats as positive.
auc_score = roc_auc_score(y_test, clf2.predict_proba(X_test)[:, 1])
auc_score
# NOTE: the previously recorded value 0.6852070693397126 was computed from
# hard predict() labels, which is why it nearly matched the accuracy; re-run
# this cell for the probability-based AUC.
决策树算法原理参考:
《机器学习》——周志华
《Python数据分析与机器学习实战》——唐宇迪
http://www.mamicode.com/info-detail-2412736.html
http://www.cnblogs.com/pinard/p/6050306.html
http://www.cnblogs.com/pinard/p/6053344.html
决策树算法调参参考:
https://www.cnblogs.com/pinard/p/6056319.html
https://www.cnblogs.com/wanglei5205/p/8581354.html
https://www.cnblogs.com/jiaxin359/p/8641976.html
其他参考: