from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap
from sklearn import tree
import pandas as pd
# Use the SimHei font so the Chinese plot labels and titles can be rendered.
plt.rcParams['font.sans-serif'] = ['SimHei']

# Load the dataset; `datas` is a raw array taken before the labels are recoded.
data = pd.read_csv('data.csv')
datas = np.array(data)

# The two feature columns used by every model in this script.
feature_namess = ['radius_mean', 'texture_mean']
x = np.array(data[feature_namess])

# Encode the diagnosis column as integers: 'M' -> 1, 'B' -> 0.
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})
y = np.array(data['diagnosis'])

# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Create the decision tree estimator (depth limited to 2) and fit it.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X_train, y_train)

# Draw the structure of the fitted tree.
tree.plot_tree(tree_clf, filled=True)

# Print the same tree as text rules.
from sklearn.tree import export_text
tree_report = export_text(tree_clf, feature_names=feature_namess)
print(tree_report)

# Plot bounds: the observed range of each feature padded by 1 on both sides.
l = x[:, 0].min() - 1
r = x[:, 0].max() + 1
b = x[:, 1].min() - 1
t = x[:, 1].max() + 1
n = 500
# Helper that draws a classifier's decision regions over two features.
def plo(clf, x, y, axes=(l, r, b, t),
        datas=True,
        legend=False,
        plot_training=True):
    """Plot the decision regions of a fitted two-feature classifier.

    Args:
        clf: Fitted estimator exposing ``predict``; it is evaluated on a
            100 x 100 grid of two-column points spanning ``axes``.
        x: Array whose first two columns are drawn as points when
            ``plot_training`` is true.
        y: Labels aligned with the rows of ``x``; rows labelled 0 and 1
            are drawn with different markers.
        axes: Sequence ``(x_min, x_max, y_min, y_max)`` bounding the grid.
        datas: If true, label the axes with the feature names; otherwise
            also draw contour lines and use generic x_1 / x_2 labels.
        legend: If true, add a legend in the lower-right corner.
        plot_training: If true, draw the samples and apply ``axes`` as
            the axis limits.
    """
    # Evaluate the classifier on a regular grid covering the plot area.
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)

    # One two-colour map serves both the filled regions and the outlines.
    custom_cmap = ListedColormap(['#a0faa0', '#9898ff'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if not datas:
        plt.contour(x1, x2, y_pred, cmap=custom_cmap, alpha=0.8)

    if plot_training:
        plt.plot(x[:, 0][y == 0], x[:, 1][y == 0], "yo", label="良性乳腺癌")
        plt.plot(x[:, 0][y == 1], x[:, 1][y == 1], "bs", label="恶性乳腺癌")
        plt.axis(axes)

    if datas:
        plt.xlabel("radius_mean", fontsize=14)
        plt.ylabel("texture_mean", fontsize=14)
    else:
        plt.xlabel(r"$x_1$", fontsize=18)
        plt.ylabel(r"$x_2$", fontsize=18)

    if legend:
        # "lower right" is the valid location string (was misspelled "lower righr").
        plt.legend(loc="lower right", fontsize=14)
# Unpruned tree: with no pruning it is prone to overfitting.
tree_clf1 = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
# Pruned tree: requiring at least 4 samples per leaf reduces overfitting.
tree_clf2 = DecisionTreeClassifier(
    min_samples_leaf=4, random_state=42
).fit(X_train, y_train)

# Draw both decision boundaries side by side: (subplot slot, model, title).
plt.figure(figsize=(12, 4))
panels = (
    (121, tree_clf1, '剪枝前良恶性乳腺癌-决策树算法'),
    (122, tree_clf2, '剪枝后良恶性乳腺癌-决策树算法'),
)
for slot, model, heading in panels:
    plt.subplot(slot)
    plo(model, x, y, axes=[l, r, b, t], datas=False)
    plt.title(heading, fontsize=16)
    # Replace the generic axis labels set by plo with the feature names.
    plt.xlabel('radius_mean', fontsize=12)
    plt.ylabel('texture_mean', fontsize=12, rotation=90)
    plt.xlim(l, r)
    plt.ylim(b, t)
    plt.legend()
plt.show()
print('\n')

# Score each model on the held-out test split.
abb1 = tree_clf.score(X_test, y_test)
abb2_1 = tree_clf1.score(X_test, y_test)
abb2_2 = tree_clf2.score(X_test, y_test)
print('第一张图形决策树算法得分:abb1=',abb1)
print('第二张图形左边决策树算法得分:abb2_1=',abb2_1)
print('第二张图形右边决策树算法得分:abb2_2=',abb2_2)
# Sample output:
# |--- radius_mean <= 15.03
# |   |--- texture_mean <= 19.61
# |   |   |--- class: 0
# |   |--- texture_mean >  19.61
# |   |   |--- class: 0
# |--- radius_mean >  15.03
# |   |--- texture_mean <= 16.39
# |   |   |--- class: 0
# |   |--- texture_mean >  16.39
# |   |   |--- class: 1
# 第一张图形决策树算法得分:abb1= 0.9005847953216374
# 第二张图形左边决策树算法得分:abb2_1= 0.8421052631578947
# 第二张图形右边决策树算法得分:abb2_2= 0.8947368421052632
# Note: for reference only.