一、导入数据
1、库
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
2、从 sklearn 导入数据
# Import from the public ``sklearn.datasets`` namespace. The old
# ``sklearn.datasets.california_housing`` submodule was deprecated in
# scikit-learn 0.22 and removed in 0.24, so the previous import path fails
# with ImportError on current releases.
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset; the returned object exposes
# ``data``, ``target``, ``feature_names`` and ``DESCR``.
housing = fetch_california_housing()

# Print the human-readable description bundled with the dataset.
print(housing.DESCR)
3、构建决策树
from sklearn import tree

# Train a deliberately shallow regression tree (depth 2) on just the two
# feature columns at indices 6 and 7, so the result stays small enough to
# draw.
selected_columns = housing.data[:, [6, 7]]
dtr = tree.DecisionTreeRegressor(max_depth=2)
dtr.fit(selected_columns, housing.target)
4、可视化
# Visualize the fitted tree. Graphviz must be installed first.
export_options = {
    "out_file": None,  # no output file: the DOT source is returned instead
    "feature_names": housing.feature_names[6:8],  # names of the two columns used for fitting
    "filled": True,
    "impurity": False,
    "rounded": True,
}
dot_data = tree.export_graphviz(dtr, **export_options)  # dtr is the fitted estimator

# Requires: pip install pydotplus
import pydotplus
from IPython.display import Image

# Parse the DOT source into a graph, recolor one node, and render it as PNG.
graph = pydotplus.graph_from_dot_data(dot_data)
graph_nodes = graph.get_nodes()
graph_nodes[7].set_fillcolor("#FFF2DD")
Image(graph.create_png())
5、模型评估与参数选择:
from sklearn.model_selection import train_test_split

# Split features and targets into train and test partitions.
data_train, data_test, target_train, target_test = train_test_split(
    housing.data,
    housing.target,
    test_size=0.1,    # hold out 10% of the samples for testing
    random_state=42,  # seed for the random number generator
)

# Baseline: a decision tree with default settings (seeded), trained on the
# training split and scored on the held-out split.
dtr = tree.DecisionTreeRegressor(random_state=42)
dtr.fit(data_train, target_train)
dtr.score(data_test, target_test)
from sklearn.ensemble import RandomForestRegressor

# Same train/score procedure as the single tree, now with a seeded random
# forest, so the two held-out scores can be compared directly.
rfr = RandomForestRegressor(random_state=42)
rfr.fit(data_train, target_train)
rfr.score(data_test, target_test)
from sklearn.model_selection import GridSearchCV  # tries every parameter combination and keeps the best

# Candidate values for the search: a dict mapping each estimator parameter
# name to the list of values to try (3 x 3 = 9 combinations).
tree_param_grid = {
    'min_samples_split': [3, 6, 9],
    'n_estimators': [10, 50, 100],
}

# Seed the forest like every other estimator in this file; without a fixed
# random_state the cross-validation scores and the selected parameters can
# change from run to run.
grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=tree_param_grid,
    cv=5,  # 5-fold cross-validation
    return_train_score=True,
)
grid.fit(data_train, target_train)

# Full per-combination results, the winning parameters, and their mean CV score.
grid.cv_results_, grid.best_params_, grid.best_score_