%matplotlib inline import matplotlib.pyplot as plt import pandas as pd from sklearn.datasets.california_housing import fetch_california_housing housing = fetch_california_housing() # 内置的数据集
housing.data.shape # (20640,8)
from sklearn import tree
dtr = tree.DecisionTreeRegressor(max_depth = 2) # 构造决策树模型,并指定最大的深度为2
dtr.fit(housing.data[:,[6,7]],housing.target)
可视化显示(要先安装一个模块) dot_data = tree.export_graphviz(dtr,out_file=None,feature_names=housing.feature_names[6:8],filled=True,impurity=False,rounded=True)
# feature_names 特征名字
# 要先安装 import pydotplus graph = pydotplus.graph_from_dot_data(dot_data) graph.get_nodes()[7].set_fillcolor("#FFF2DD") # 满足第七个节点为True的时候显示的颜色 from IPython.display import Image Image(graph.create_png())
graph.write_png("dtr_white_background.png")
from sklearn.model_selection import train_test_split data_train,data_test,target_train,target_test = train_test_split(housing.data,housing.target,test_size=0.1,random_state=42) # random_state每次随机的测试集
# 都是一样的,它指的是以中状态 dtr = tree.DecisionTreeRegressor(random_state=42) dtr.fit(data_train,target_train) dtr.score(data_test,target_test)
from sklearn.grid_search import GridSearchCV tree_param_grid = {'min_samples_split':list((3,6,9)),'n_estimators':list((10,50,100))}
# min_samples_split:叶子节点至少为多少的时候才能进行分裂,n_estimators:? grid = GridSearchCV(RandomForestRegressor(),param_grid=tree_param_grid,cv=5) # cv交叉验证 grid.fit(dat_train,target_train) grid.grid_scores_,grid.best_param_,grid.best_score_