决策树-对威斯康星州乳腺癌(诊断)数据集分析

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap
from sklearn import tree
import pandas as pd
plt.rcParams['font.sans-serif'] = ['SimHei']
 
# Load the dataset.
data = pd.read_csv('data.csv')
# Array copy of the whole table, taken before the diagnosis column is re-encoded.
datas = np.array(data)

# The two columns used as model inputs (also passed to export_text below).
feature_namess = ['radius_mean', 'texture_mean']
x = np.array(data[feature_namess])

# Encode the target column: 'M' -> 1, 'B' -> 0.
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})
y = np.array(data['diagnosis'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)


# Create a decision tree limited to depth 2 and fit it on the training split.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X_train, y_train)
# Draw the structure of the fitted tree.
tree.plot_tree(tree_clf, filled=True)

# Print the same tree as indented text rules.
from sklearn.tree import export_text
r = export_text(tree_clf, feature_names=feature_namess)
print(r)



# Plot window: the range of each feature column padded by 1 on both sides.
l = x[:, 0].min() - 1  # left edge (first feature)
r = x[:, 0].max() + 1  # right edge (first feature)
b = x[:, 1].min() - 1  # bottom edge (second feature)
t = x[:, 1].max() + 1  # top edge (second feature)
n = 500
# Helper that draws a classifier's decision regions over a 2-D feature plane.
def plo(clf, x, y, axes=(l, r, b, t),
        datas=True,
        legend=False,
        plot_training=True):
    """Draw the decision regions of ``clf`` on the current matplotlib axes.

    The classifier is evaluated on a 100 x 100 grid spanning ``axes`` and the
    predicted classes are shown as filled contours.

    Args:
        clf: Fitted classifier exposing ``predict`` for two-column input.
        x: 2-D array of samples; column 0 is plotted on the horizontal axis
            and column 1 on the vertical axis.
        y: Array of labels aligned with ``x``; samples with label 0 and 1
            are drawn with different markers.
        axes: ``(x_min, x_max, y_min, y_max)`` extent of the grid. The
            default is taken from the module-level ``l, r, b, t`` at the
            time this function is defined.
        datas: If true, label the axes ``radius_mean`` / ``texture_mean``.
            If false, additionally draw contour lines on the region
            boundaries and label the axes ``$x_1$`` / ``$x_2$``.
        legend: If true, add a legend in the lower-right corner.
        plot_training: If true, scatter the samples in ``x`` and clamp the
            view to ``axes``.
    """
    # Build the evaluation grid and predict a class for every grid point.
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)

    # One two-colour map serves both the filled regions and the outlines.
    custom_cmap = ListedColormap(['#a0faa0', '#9898ff'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if not datas:
        plt.contour(x1, x2, y_pred, cmap=custom_cmap, alpha=0.8)

    if plot_training:
        plt.plot(x[:, 0][y == 0], x[:, 1][y == 0], "yo", label="良性乳腺癌")
        plt.plot(x[:, 0][y == 1], x[:, 1][y == 1], "bs", label="恶性乳腺癌")
        plt.axis(axes)

    if datas:
        plt.xlabel("radius_mean", fontsize=14)
        plt.ylabel("texture_mean", fontsize=14)
    else:
        plt.xlabel(r"$x_1$", fontsize=18)
        plt.ylabel(r"$x_2$", fontsize=18)

    if legend:
        # Fixed: the location string was misspelled as "lower righr".
        plt.legend(loc="lower right", fontsize=14)
        

# Unconstrained tree: no pruning is applied, so it is prone to overfitting.
tree_clf1 = DecisionTreeClassifier(random_state=42)

# Constrained tree: requiring at least 4 samples per leaf reduces overfitting.
tree_clf2 = DecisionTreeClassifier(min_samples_leaf=4, random_state=42)

tree_clf1.fit(X_train, y_train)
tree_clf2.fit(X_train, y_train)

# Side-by-side decision regions: unconstrained tree on the left, constrained
# tree on the right. Each panel gets the same labels, limits and legend.
plt.figure(figsize=(12, 4))
panels = (
    (121, tree_clf1, '剪枝前良恶性乳腺癌-决策树算法'),
    (122, tree_clf2, '剪枝后良恶性乳腺癌-决策树算法'),
)
for position, model, title in panels:
    plt.subplot(position)
    plo(model, x, y, axes=[l, r, b, t], datas=False)
    plt.title(title, fontsize=16)
    # Replace the generic axis labels set by plo with the feature names.
    plt.xlabel('radius_mean', fontsize=12)
    plt.ylabel('texture_mean', fontsize=12, rotation=90)
    plt.xlim(l, r)
    plt.ylim(b, t)
    plt.legend()
plt.show()
print('\n')

# Score each of the three fitted trees on the held-out test split.
abb1 = tree_clf.score(X_test, y_test)
abb2_1 = tree_clf1.score(X_test, y_test)
abb2_2 = tree_clf2.score(X_test, y_test)
print('第一张图形决策树算法得分:abb1=', abb1)
print('第二张图形左边决策树算法得分:abb2_1=', abb2_1)
print('第二张图形右边决策树算法得分:abb2_2=', abb2_2)

                             

|--- radius_mean <= 15.03
|   |--- texture_mean <= 19.61
|   |   |--- class: 0
|   |--- texture_mean >  19.61
|   |   |--- class: 0
|--- radius_mean >  15.03
|   |--- texture_mean <= 16.39
|   |   |--- class: 0
|   |--- texture_mean >  16.39
|   |   |--- class: 1

第一张图形决策树算法得分:abb1= 0.9005847953216374
第二张图形左边决策树算法得分:abb2_1= 0.8421052631578947
第二张图形右边决策树算法得分:abb2_2= 0.8947368421052632

注意:仅供参考

  • 19
    点赞
  • 20
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值