"""Study notes:
Classify three datasets with different structures using a decision tree.
"""
#模块导入
import os

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
#Y为标签
X, Y = make_classification(n_samples=100 #100个样本
,n_features=2 #2个特征,二维数据
,n_redundant=0
,n_informative=2
,random_state=1
,n_clusters_per_class=1
)
plt.scatter(X[:,0], X[:,1]) #X为二维数据,X[:,0]表示X中轴为0的数据
plt.show()
#使二分数据稍微疏散
rng = np.random.RandomState(2)
X += rng.uniform(size=X.shape)
linearly_separable = (X, Y)
plt.scatter(X[:,0], X[:,1])
plt.show()
#三组数据放入datasets
datasets = [make_moons(noise=0.3, random_state=0)
,make_circles(noise=0.2, factor=0.5, random_state=1)
,linearly_separable]
#创建画布,宽高比为6:9
figure = plt.figure(figsize=(6, 9))
#开始迭代datasets中的数据
i = 1
for ds_index, ds in enumerate(datasets):
X, Y = ds
X = StandardScaler().fit_transform(X)#数据标准化处理
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=42)
#创造一个比数据集更大的区间
x1_min, x1_max = X[:,0].min()-0.5, X[:,0].max()+0.5
x2_min, x2_max = X[:,1].min()-0.5, X[:,1].max()+0.5
#生成网格数据,生成两个二维数组
array1, array2 = np.meshgrid(np.arange(x1_min, x1_max, 0.1)
,np.arange(x2_min, x2_max, 0.1))
#生成彩色画布
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000','#0000FF'])
ax = plt.subplot(len(datasets), 2, i)
if ds_index == 0:
ax.set_title("Input data")
#将数据集放入坐标系
ax.scatter(X_train[:,0], X_train[:,1], c=Y_train, cmap=cm_bright, edgecolors='k')
ax.scatter(X_test[:,0], X_test[:,1], c=Y_test, cmap=cm_bright, alpha=0.6, edgecolors='k')#颜色RGBA
ax.set_xlim(array1.min(), array1.max())
ax.set_ylim(array2.min(), array2.max())
ax.set_xticks(())
ax.set_yticks(())
i += 1
ax = plt.subplot(len(datasets), 2, i)
#决策树建模
clf = DecisionTreeClassifier(max_depth = 5)
clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)
#绘制决策边界,Z为类概率
Z = clf.predict_proba(np.c_[array1.ravel(), array2.ravel()])[:,1]
Z = Z.reshape(array1.shape)
ax.contourf(array1, array2, Z, cmap=cm, alpha=0.8)
ax.scatter(X_train[:,0], X_train[:,1], c=Y_train, cmap=cm_bright, edgecolors='k')
ax.scatter(X_test[:,0], X_test[:,1], c=Y_test, cmap=cm_bright, edgecolors='k', alpha=0.6)
ax.set_xlim(array1.min(), array1.max())
ax.set_ylim(array2.min(), array2.max())
ax.set_xticks(())
ax.set_yticks(())
if ds_index == 0:
ax.set_title("Decision Tree")
#右下角添加分类效果
ax.text(array1.max() - 0.3, array2.min() + 0.3, ("{:.1f}%".format(score*100)), size=15, horizontalalignment = "right")
i += 1
plt.tight_layout()#避免相邻图的坐标轴重叠
plt.savefig('D:\\pylearn\\py_pic\\DT.jpg')
plt.show()