决策树算法的sklearn实现及其调参

回归决策树

import pandas as pd

# Load the California housing dataset.
# fetch_california_housing is imported from the public sklearn.datasets package;
# the private sklearn.datasets.california_housing module path is not importable
# in current scikit-learn releases.
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

# Inspect the full contents of the returned dataset object
housing

housing的具体信息如下图所示:

# Wrap the feature matrix in a DataFrame, using the dataset's feature names
# as column labels (only housing.data is used here; the target is not included)
feature_columns = housing.feature_names
df = pd.DataFrame(housing.data, columns=feature_columns)
df.head()

# Summary statistics of the housing features
df.describe()

# Build a regression decision tree with sklearn and fit it on the full dataset
from sklearn import tree
dtr = tree.DecisionTreeRegressor(max_depth = 3)  # limit the tree depth to 3
dtr.fit(housing.data, housing.target)

# Output (estimator repr as recorded in the original post):
# DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
# max_leaf_nodes=None, min_impurity_decrease=0.0,
# min_impurity_split=None, min_samples_leaf=1,
# min_samples_split=2, min_weight_fraction_leaf=0.0,
# presort=False, random_state=None, splitter='best')
# Export the fitted regression tree as Graphviz DOT source for visualization
dot_data = tree.export_graphviz(
    dtr,                                   # the fitted decision tree model
    out_file=None,                         # output file handle or name (None is the default)
    feature_names=housing.feature_names,   # names of the features used for training
    filled=True,                           # colour the nodes
    impurity=True,                         # show the impurity (e.g. MSE, MAE) in each node
    rounded=True,                          # draw node boxes with rounded corners
)

# Render the DOT source to a PNG and display it inline
import pydotplus
from IPython.display import Image
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

回归决策树模型的可视化结果:

# Split the original data into a training set and a test (validation) set
from sklearn.model_selection import train_test_split
data_train, data_test, target_train, target_test = train_test_split(
    housing.data,
    housing.target,
    test_size=0.1,      # fraction of the samples placed in the test set
    random_state=42,
)
dtr = tree.DecisionTreeRegressor(random_state=42)
dtr.fit(data_train, target_train)
dtr.score(data_test, target_test)  # output recorded in the original post: 0.637355881715626

score的说明: 

score(X, y, sample_weight=None) method of sklearn.tree.tree.DecisionTreeRegressor instance
    Returns the coefficient of determination R^2 of the prediction.
    
    The coefficient R^2 is defined as (1 - u/v), where u is the residual
    sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
    sum of squares ((y_true - y_true.mean()) ** 2).sum().
    The best possible score is 1.0 and it can be negative (because the
    model can be arbitrarily worse). A constant model that always
    predicts the expected value of y, disregarding the input features,
    would get a R^2 score of 0.0.
    
    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        Test samples. For some estimators this may be a
        precomputed kernel matrix instead, shape = (n_samples,
        n_samples_fitted), where n_samples_fitted is the number of
        samples used in the fitting for the estimator.
    
    y : array-like, shape = (n_samples) or (n_samples, n_outputs)
        True values for X.
    
    sample_weight : array-like, shape = [n_samples], optional
        Sample weights.
    
    Returns
    -------
    score : float
        R^2 of self.predict(X) wrt. y.
# Show the importance value of each feature, labelled by feature name
pd.Series(dtr.feature_importances_, index = housing.feature_names)

使用交叉验证调整回归决策树模型中的参数(网格搜索)

from sklearn.model_selection import GridSearchCV

# Candidate values for each hyper-parameter; the grid search tries every combination
tree_param_grid = {
    'max_depth': [5, 10, 12],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4],
}
grid = GridSearchCV(tree.DecisionTreeRegressor(), param_grid=tree_param_grid, cv=5)
grid.fit(data_train, target_train)

# Mean CV score of every combination, the best combination, and its score
grid.cv_results_['mean_test_score'], grid.best_params_, grid.best_score_

交叉验证结果: 

# Retrain the regression decision tree with the "best" parameters found by cross-validation.
# Reading them from grid.best_params_ keeps this step in sync with the search instead of
# relying on values copied by hand from an earlier run (the original post hard-coded
# max_depth=10, min_samples_leaf=4, min_samples_split=4).
dtr2 = tree.DecisionTreeRegressor(**grid.best_params_)
dtr2.fit(data_train, target_train)
dtr2.score(data_test, target_test)   # the original post reported 0.6970711600432351

分类决策树

# Load the loan dataset (the original post describes it as PPDai loan data)
df = pd.read_excel('loans.xlsx', sheet_name='Sheet1')
df.head()

df左半部分数据:

df右半部分数据:

数据集中的特征说明:

  • grade:贷款级别
  • sub_grade: 贷款细分级别
  • short_emp:一年以内短期雇佣
  • emp_length_num:受雇年限
  • home_ownership:居住状态(自有,按揭,租住)
  • dti:贷款占收入比例
  • purpose:贷款用途
  • term:贷款周期
  • last_delinq_none:贷款申请人是否有不良记录
  • last_major_derog_none:贷款申请人是否有还款逾期90天以上记录
  • revol_util:透支额度占信用比例
  • total_rec_late_fee:逾期罚款总额
  • safe_loans:贷款是否安全
df.shape   # output: (46508, 13)
# Check whether the raw data contains any missing values
df.isnull().any()    # per the output below, no column contains missing values

# Output:
# grade                    False
# sub_grade                False
# short_emp                False
# emp_length_num           False
# home_ownership           False
# dti                      False
# purpose                  False
# term                     False
# last_delinq_none         False
# last_major_derog_none    False
# revol_util               False
# total_rec_late_fee       False
# safe_loans               False
# dtype: bool
# Split the raw data into the label column and the remaining feature columns
y = df['safe_loans']
X = df.drop(columns='safe_loans')

# Check whether the label classes are evenly distributed
y.value_counts()

# Output:
#  1    23358
# -1    23150
# Map the categorical attributes in the feature set to integer codes.
# d lazily creates one LabelEncoder per column name, so each column's fitted
# encoder remains available afterwards under that name (e.g. d['grade']).
# NOTE(review): the lambda is applied to every column of X, not only the
# categorical ones — confirm that encoding the numeric columns is intended.
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
d = defaultdict(LabelEncoder)
X_trans = X.apply(lambda x: d[x.name].fit_transform(x),axis=0)
X_trans.head()

转换后的特征数据:

# Split the encoded data into a training set and a test (validation) set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_trans,
    y,
    random_state=1,
)

# Build and fit a depth-3 classification tree with sklearn
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=3, random_state=10).fit(X_train, y_train)

# Feature names, used to label the nodes in the visualization
column_names = list(X_train.columns)

# Export the fitted classification tree as Graphviz DOT source for visualization
dot_data = tree.export_graphviz(
    clf,                           # the fitted decision tree model
    out_file=None,                 # output file handle or name (None is the default)
    feature_names=column_names,    # names of the features used for training
    filled=True,                   # colour the nodes
    impurity=True,                 # show the impurity in each node
    rounded=True,                  # draw node boxes with rounded corners
)

# Render the DOT source to a PNG and display the tree inline
import pydotplus
from IPython.display import Image
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

分类决策树模型的可视化结果:

# Evaluate model performance on the test set
from sklearn.metrics import accuracy_score
accuracy_score(y_test, clf.predict(X_test))  # by default accuracy_score computes classification accuracy

# Output recorded in the original post: 0.6161520598606691

使用交叉验证调整分类决策树模型中的参数(网格搜索)

from sklearn.model_selection import GridSearchCV

# Candidate values for each hyper-parameter; the grid search tries every combination
tree_param_grid = {
    'max_depth': [3, 6, 9],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4],
}
grid = GridSearchCV(tree.DecisionTreeClassifier(), param_grid=tree_param_grid, cv=5)
grid.fit(X_train, y_train)

# Mean CV score of every combination, the best combination, and its score
grid.cv_results_['mean_test_score'], grid.best_params_, grid.best_score_

交叉验证结果: 

# Retrain the classification decision tree with the "best" parameters found by cross-validation.
# The model is fitted on the training split: fitting on (X_test, y_test) and then scoring on
# that same data would leak the test labels into the model and inflate the reported accuracy.
clf2 = tree.DecisionTreeClassifier(max_depth=9,min_samples_leaf=2,min_samples_split=4)
clf2 = clf2.fit(X_train, y_train)
accuracy_score(y_test, clf2.predict(X_test))

# The original post reported 0.6852154468048508 here; re-run to obtain the value for this fit.
# Measure classifier performance with AUC.
# roc_auc_score ranks samples by score, so it is given the predicted probability of the
# positive class (label 1, the second of the two labels -1 and 1) instead of the hard
# -1/1 predictions, which would reduce the ROC curve to a single operating point.
from sklearn.metrics import roc_auc_score
auc_score = roc_auc_score(y_test, clf2.predict_proba(X_test)[:, 1])
auc_score

# The original post reported 0.6852070693397126, computed from clf2.predict(X_test);
# re-run to obtain the probability-based value.

决策树算法原理参考:

《机器学习》——周志华

《Python数据分析与机器学习实战》——唐宇迪

http://www.mamicode.com/info-detail-2412736.html

http://www.cnblogs.com/pinard/p/6050306.html

http://www.cnblogs.com/pinard/p/6053344.html

决策树算法调参参考:

https://www.cnblogs.com/pinard/p/6056319.html

https://www.cnblogs.com/wanglei5205/p/8581354.html

https://www.cnblogs.com/jiaxin359/p/8641976.html

其他参考:

https://www.jianshu.com/p/67a71e366516

  • 4
    点赞
  • 25
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
在Python中,可以使用scikit-learn库中的决策树算法进行调参。决策树的一些常用参数包括最大深度(max_depth)、最小样本分割数(min_samples_split)、最小叶子节点样本数(min_samples_leaf)和划分质量的衡量指标(criterion)等。

以下是一些常见的调参方法:

1. 网格搜索:通过定义一个参数网格,在给定的参数范围内进行组合搜索,找到最佳的参数组合。可以使用`GridSearchCV`来自动搜索最佳参数。

```python
from sklearn.model_selection import GridSearchCV

# 定义参数网格
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3],
    'criterion': ['gini', 'entropy']
}

# 创建决策树模型
tree_classifier = DecisionTreeClassifier()

# 使用网格搜索进行参数调优
grid_search = GridSearchCV(tree_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# 输出最佳参数组合
print(grid_search.best_params_)
```

2. 随机搜索:与网格搜索类似,但是随机搜索在给定的参数范围内随机选择参数组合,可以减少搜索空间。可以使用`RandomizedSearchCV`来进行随机搜索。

```python
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# 定义参数分布
param_dist = {
    'max_depth': randint(3, 10),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'criterion': ['gini', 'entropy']
}

# 创建决策树模型
tree_classifier = DecisionTreeClassifier()

# 使用随机搜索进行参数调优
random_search = RandomizedSearchCV(tree_classifier, param_distributions=param_dist, n_iter=10, cv=5)
random_search.fit(X_train, y_train)

# 输出最佳参数组合
print(random_search.best_params_)
```

3. 交叉验证:使用交叉验证评估不同参数组合的性能,选择性能最好的参数。可以使用`cross_val_score`

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值