Python 代码如下:
from sklearn.feature_extraction import DictVectorizer
from sklearn import tree
from sklearn import preprocessing
import csv
import graphviz
# Integer encoding of the categorical columns in the CSV:
#   colour and lustre  1-3: light white, green, dark black
#   root and base      1-3: slightly curled, curled, stiff
#   knock sound        1-3: crisp, muffled, dull
#   venation (texture) 1-3: clear, slightly blurry, blurry
#   umbilical region   1-3: flat, slightly sunken, sunken
#   touch              1-2: hard and smooth, soft and sticky
#   good melon         1 = yes, 0 = no

# Feature dicts and class labels, one entry per data row.
featureList = []
labelList = []

# Read the file inside a context manager so the handle is always closed;
# newline='' is what the csv module recommends for files passed to csv.reader.
with open('西瓜数据集3.0.csv', 'r', newline='') as Dtree:
    reader = csv.reader(Dtree)
    # The first row holds the column names.
    headers = next(reader)
    print(headers)
    for row in reader:
        # Skip blank lines (e.g. a trailing newline) instead of failing on row[-1].
        if not row:
            continue
        # The last column is the class label.
        labelList.append(row[-1])
        # Keep columns 1 .. len(row)-4: column 0 and the last three columns are
        # left out. With the header shown alongside this script those are the
        # sample number, density, sugar content and the label itself.
        rowDict = {headers[i]: row[i] for i in range(1, len(row) - 3)}
        featureList.append(rowDict)
print(featureList)
# Turn the list of feature dicts into a dense numeric matrix. The values read
# from the CSV are strings, which DictVectorizer expands into one 0/1 column
# per (feature, value) pair.
vec = DictVectorizer()
vec.fit(featureList)
x_data = vec.transform(featureList).toarray()
print(f"x_data: {x_data}")
# Binarize the class labels the same way.
lb = preprocessing.LabelBinarizer()
lb.fit(labelList)
y_data = lb.transform(labelList)
print(f"y_data: {y_data}")
# Build a decision tree that splits on information gain (entropy) and fit it
# on the encoded data; fit() returns the estimator itself.
model = tree.DecisionTreeClassifier(criterion='entropy').fit(x_data, y_data)
# Sanity check: predict the class of the first training sample.
x_test = x_data[0]
# predict() expects a 2-D array, so present the single sample as one row.
single_row = x_test.reshape(1, -1)
predict = model.predict(single_row)
print(f"predict: {predict}")
# Export the fitted tree to Graphviz DOT source and render it.
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on installs that predate it.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = list(vec.get_feature_names_out())
else:
    feature_names = list(vec.get_feature_names())
dot_data = tree.export_graphviz(
    model,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=feature_names,
    # Pass a plain list of str rather than a NumPy array of labels.
    class_names=[str(name) for name in lb.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
# Writes the DOT source to 'Tree' and the rendered output next to it
# (PDF is graphviz's default format).
graph.render('Tree')
西瓜数据集 3.0 的数据如下:
number | colour and lustre | root and base | Knock | venation | umbilical region | touch | density | sugar content | good |
1 | 2 | 2 | 2 | 1 | 3 | 1 | 0.697 | 0.46 | 1 |
2 | 3 | 2 | 3 | 1 | 3 | 1 | 0.744 | 0.376 | 1 |
3 | 3 | 2 | 2 | 1 | 3 | 1 | 0.634 | 0.264 | 1 |
4 | 2 | 2 | 3 | 1 | 3 | 1 | 0.608 | 0.318 | 1 |
5 | 1 | 2 | 2 | 1 | 3 | 1 | 0.556 | 0.215 | 1 |
6 | 2 | 1 | 2 | 1 | 2 | 2 | 0.403 | 0.237 | 1 |
7 | 3 | 1 | 2 | 2 | 2 | 2 | 0.481 | 0.149 | 1 |
8 | 3 | 1 | 2 | 1 | 2 | 1 | 0.437 | 0.211 | 1 |
9 | 3 | 1 | 3 | 2 | 2 | 1 | 0.666 | 0.091 | 0 |
10 | 2 | 3 | 1 | 1 | 1 | 2 | 0.243 | 0.267 | 0 |
11 | 1 | 3 | 1 | 3 | 1 | 1 | 0.245 | 0.057 | 0 |
12 | 1 | 2 | 2 | 3 | 1 | 2 | 0.343 | 0.099 | 0 |
13 | 2 | 1 | 2 | 2 | 3 | 1 | 0.639 | 0.161 | 0 |
14 | 1 | 1 | 3 | 2 | 3 | 1 | 0.657 | 0.198 | 0 |
15 | 3 | 1 | 2 | 1 | 2 | 2 | 0.36 | 0.37 | 0 |
16 | 1 | 2 | 2 | 3 | 1 | 1 | 0.593 | 0.042 | 0 |
17 | 2 | 2 | 3 | 2 | 2 | 1 | 0.719 | 0.103 | 0 |
结果PDF如下:
随后可以进行预剪枝和后剪枝,使决策树的结构变得更加清晰、简单。