1、信息熵:随机变量的不确定性的度量。
熵越大,数据的不确定性越高。熵越小,数据的不确定性越低
信息熵的计算公式:
Pi:对一个系统中可能有K类个信息,每一类信息所占的比例就叫做Pi
假如一个系统有三个类别,每一个类别所占的比例分别为1/3,1/3,1/3。此时系统的信息熵为:
再假如系统的三个类别,每个类别所占的比例分别为1/10、2/10、7/10。此时系统的信息熵为:
如果系统有两个类别,一类所占的比例是x,另一类的比例就是1-x,此时系统的信息熵为:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# Load the iris dataset; keep only the last two feature columns so the
# samples can be shown on a 2-D scatter plot (presumably the petal
# measurements — confirm against load_iris's feature order).
iris = datasets.load_iris()
x = iris.data[:, 2:]
y = iris.target

# One scatter series per class label (0, 1, 2).
for label in (0, 1, 2):
    plt.scatter(x[y == label, 0], x[y == label, 1])
plt.show()
基于信息熵训练决策树模型
from sklearn.tree import DecisionTreeClassifier

# Train a shallow decision tree that splits on information entropy.
dt_clf = DecisionTreeClassifier(criterion="entropy", max_depth=2)
dt_clf.fit(x, y)
def plot_decision_boundary(model, axis):
    """Render the decision regions of *model* over a 2-D rectangle.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` on (n, 2) inputs.
    axis : sequence [x_min, x_max, y_min, y_max]
        Bounding box of the region to render.
    """
    # Build a dense grid of query points covering the bounding box.
    # BUG FIX: the second dimension must span axis[2]..axis[3]; the
    # original reused axis[0]..axis[1] for both axes, so the rendered
    # region never matched the requested y-range.
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
    )
    x_new = np.c_[x0.ravel(), x1.ravel()]

    # Classify every grid point, then reshape back to the grid layout.
    y_predict = model.predict(x_new)
    zz = y_predict.reshape(x0.shape)

    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(["#EF9A9A", "#FFF59D", "#90CAF9"])
    # FIX: contourf has no 'linewidth' parameter (filled contours have no
    # line width); the bogus kwarg is dropped.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
# Draw the decision regions, then overlay the training samples
# (one series per class).
plot_decision_boundary(dt_clf, axis=[0.5, 7.5, 0, 3])
for label in range(3):
    plt.scatter(x[y == label, 0], x[y == label, 1])
plt.show()
2、基尼系数
基尼系数越大,不确定性越高,基尼系数越小,不确定性越低
假如一个系统有三个类别,每一个类别所占的比例分别为1/3,1/3,1/3。此时系统的基尼系数是:
基尼系数和信息熵训练出来的决策树的效果几乎没有任何差别
信息熵VS基尼系数
1)信息熵的计算比基尼系数稍慢
2)sklearn中默认为基尼系数
3)大多数时候二者训练出的模型在效果上没有特别的优劣之分
#!/usr/bin/python
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
from sklearn import datasets
import pandas as pd
import numpy as np

# Generate the two-interleaving-half-moons toy dataset.
x, y = datasets.make_moons()
print(x.shape)
print(y.shape)
for label, colour in ((0, "red"), (1, "blue")):
    plt.scatter(x[y == label, 0], x[y == label, 1], color=colour)
plt.show()

# Regenerate the data with random noise added, seeded for reproducibility.
x, y = datasets.make_moons(noise=0.25, random_state=666)
for label, colour in ((0, "red"), (1, "blue")):
    plt.scatter(x[y == label, 0], x[y == label, 1], color=colour)
plt.show()
不加入任何参数来训练决策树
from sklearn.tree import DecisionTreeClassifier

# Train a tree with no regularisation parameters at all —
# it will grow until every leaf is pure (prone to overfitting).
dt_clf = DecisionTreeClassifier()
dt_clf.fit(x, y)
def plot_decision_boundary(model, axis):
    """Render the decision regions of *model* over a 2-D rectangle.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` on (n, 2) inputs.
    axis : sequence [x_min, x_max, y_min, y_max]
        Bounding box of the region to render.
    """
    # Build a dense grid of query points covering the bounding box.
    # BUG FIX: the second dimension must span axis[2]..axis[3]; the
    # original reused axis[0]..axis[1] for both axes, so the rendered
    # region never matched the requested y-range.
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
    )
    x_new = np.c_[x0.ravel(), x1.ravel()]

    # Classify every grid point, then reshape back to the grid layout.
    y_predict = model.predict(x_new)
    zz = y_predict.reshape(x0.shape)

    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(["#EF9A9A", "#FFF59D", "#90CAF9"])
    # FIX: contourf has no 'linewidth' parameter (filled contours have no
    # line width); the bogus kwarg is dropped.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
# Visualise the (overfitted) decision boundary with the samples on top.
plot_decision_boundary(dt_clf, axis=[-1.5, 2.5, -1.0, 1.5])
for label, colour in ((0, "red"), (1, "blue")):
    plt.scatter(x[y == label, 0], x[y == label, 1], color=colour)
plt.show()
训练的结果
此时的决策边界形状相对不规则,产生了过拟合现象。接下来我们传入一些参数来抑制过拟合现象
# Limit the tree depth to regularise the model.
dt_clf = DecisionTreeClassifier(max_depth=2)
dt_clf.fit(x, y)
plot_decision_boundary(dt_clf, axis=[-1.5, 2.5, -1.0, 1.5])
for label, colour in ((0, "red"), (1, "blue")):
    plt.scatter(x[y == label, 0], x[y == label, 1], color=colour)
plt.show()
此时决策边界比较清晰,过拟合现象就被消除了
min_samples_split:对于一个样本节点来说至少有多少个样本我们才进行划分下去,此参数的值越高越不容易产生过拟合,如果过高则会产生欠拟合现象
# Also require at least 10 samples in a node before it may be split —
# a second regularisation knob on top of the depth limit.
dt_clf = DecisionTreeClassifier(max_depth=2, min_samples_split=10)
dt_clf.fit(x, y)
plot_decision_boundary(dt_clf, axis=[-1.5, 2.5, -1.0, 1.5])
for label, colour in ((0, "red"), (1, "blue")):
    plt.scatter(x[y == label, 0], x[y == label, 1], color=colour)
plt.show()
min_samples_leaf:对于一个叶子节点来说至少应该有几个样本
# Require every leaf to hold at least 6 samples.
dt_clf = DecisionTreeClassifier(min_samples_leaf=6)
dt_clf.fit(x, y)
plot_decision_boundary(dt_clf, axis=[-1.5, 2.5, -1.0, 1.5])
for label, colour in ((0, "red"), (1, "blue")):
    plt.scatter(x[y == label, 0], x[y == label, 1], color=colour)
plt.show()
max_leaf_nodes:最多有多少个叶子节点
# Cap the total number of leaves at 4.
dt_clf = DecisionTreeClassifier(max_leaf_nodes=4)
dt_clf.fit(x, y)
plot_decision_boundary(dt_clf, axis=[-1.5, 2.5, -1.0, 1.5])
for label, colour in ((0, "red"), (1, "blue")):
    plt.scatter(x[y == label, 0], x[y == label, 1], color=colour)
plt.show()
#coding:utf-8
import matplotlib.pyplot as plt
import pandas as pd
# FIX: sklearn.datasets.california_housing is a private module path that
# was removed in scikit-learn 0.22; use the public import instead.
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
print("***************************************")
# housing.data is already an (n_samples, 8) ndarray, so the original
# reshape(-1, 8) was a no-op and has been dropped.
print(housing.data)

from sklearn import tree

# Fit a shallow regression tree on feature columns 6 and 7
# (Latitude/Longitude per fetch_california_housing's feature order —
# confirm against housing.feature_names).
dtr = tree.DecisionTreeRegressor(max_depth=2)
dtr.fit(housing.data[:, [6, 7]], housing.target)

# Export the fitted tree to Graphviz dot format for visualisation.
dot_data = tree.export_graphviz(
    dtr,
    out_file=None,
    feature_names=housing.feature_names[6:8],
    filled=True,
    impurity=False,
    rounded=True,
)

import pydotplus
graph = pydotplus.graph_from_dot_data(dot_data)
# Recolour one node purely for presentation.
graph.get_nodes()[7].set_fillcolor("#FFF2DD")
from IPython.display import Image
Image(graph.create_png())
graph.write_png("dtr_white_background.png")
# FIX: sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV
# now lives in sklearn.model_selection alongside train_test_split.
from sklearn.model_selection import train_test_split, GridSearchCV

# Hold out 10% of the data as a test set, seeded for reproducibility.
data_train, data_test, target_train, target_test = train_test_split(
    housing.data, housing.target, test_size=0.1, random_state=42
)

# Baseline: a single unconstrained decision tree.
dtr = tree.DecisionTreeRegressor(random_state=42)
dtr.fit(data_train, target_train)
print(dtr.score(data_test, target_test))

# Random forest baseline with default hyper-parameters.
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(random_state=42)
rfr.fit(data_train, target_train)
print(rfr.score(data_test, target_test))

# Grid-search two forest hyper-parameters with 5-fold cross-validation.
tree_param_grid = {
    "min_samples_split": [3, 6, 9],
    "n_estimators": [10, 50, 100],
}
print("**********************************************")
print(tree_param_grid)
grid = GridSearchCV(RandomForestRegressor(), param_grid=tree_param_grid, cv=5)
grid.fit(data_train, target_train)
# FIX: grid_scores_ was removed together with the old module; cv_results_
# is the modern replacement. The original bare expression was a no-op in a
# script, so the results are now printed explicitly.
print(grid.cv_results_, grid.best_params_, grid.best_score_)

# Refit with the best hyper-parameters found above.
rfr = RandomForestRegressor(min_samples_split=3, n_estimators=100, random_state=42)
rfr.fit(data_train, target_train)
print(rfr.score(data_test, target_test))

# Rank features by importance (printed; the original bare expression
# displayed nothing when run as a script).
print(
    pd.Series(rfr.feature_importances_, index=housing.feature_names)
    .sort_values(ascending=False)
)