【课设,毕设,大作业】
在人工智能的浪潮中,是否有一刻,你也渴望成为其中的一员?今天,就有这样一个机会摆在你面前!
我们为你精心准备了一个特别的项目——一个逻辑清晰、代码完整的决策树实战项目,它不仅包含了完整的数据集,而且保证你拿到手后立即可运行。是的,没有冗余的配置,没有复杂的环境搭建,一切都为了让你轻松上手!
作为AI初学者的你,或许对代码的复杂性和理论的深奥感到迷茫。别担心,这个项目正是为你量身打造。它的特点在于——逻辑清晰、易于理解,让你在实践中快速把握决策树的核心原理和应用场景。
想要迈出学习人工智能的第一步吗?现在,只需扫描下方的二维码,这份专业的决策树项目资料将立刻到达你的手中。不仅如此,你还将加入一个充满活力的学习社群,与同样热爱AI的伙伴们一起成长。
在这个系列里,各个项目的难度不尽相同,
但我承诺大家拿到手就可以直接运行。
决策树
决策树是一种流行的机器学习方法,通过模拟人类决策过程来预测结果。它以树形结构呈现,由节点和分支构成。节点代表数据属性,分支代表决策规则。从根节点开始,根据特定规则,数据被分到不同的分支,直至叶节点,叶节点代表最终的决策结果。决策树易于理解和实现,广泛应用于分类和回归任务。其优点在于直观、计算效率高,但也容易过拟合,需要适当的剪枝策略。
具体代码实现
决策树模型
import sys
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from math import log
from DTNode import Node
class DecisionTree:
    """Binary decision tree classifier for continuous features.

    Splits are chosen by minimising the weighted Shannon entropy of the two
    child partitions over all feature/threshold candidates. Leaf predictions
    are the majority label of the training samples stored at the leaf.
    Relies on an externally defined ``Node`` (from ``DTNode``) exposing the
    attributes ``x``, ``y``, ``dimension``, ``value``, ``left`` and ``right``.
    """

    def __init__(self):
        # Root of the fitted tree; populated by train().
        self.root = None
        print('Initiated')
        # Deep trees on larger datasets can exceed the default recursion limit.
        sys.setrecursionlimit(3000)

    def train(self, x, y):
        """Fit the tree on features ``x`` (n_samples, n_features) and labels ``y``.

        Returns ``self`` so calls can be chained.
        """

        def entropy(labels):
            # Shannon entropy (natural log) of the label distribution.
            total = len(labels)
            ent = 0.
            for count in Counter(labels).values():
                p = count / total
                ent -= p * np.log(p)
            return ent

        def split(x, y, dimension, value):
            # Partition samples on feature `dimension` at threshold `value`.
            left = (x[:, dimension] <= value)
            right = (x[:, dimension] > value)
            return x[left], x[right], y[left], y[right]

        def find_best_split_dimension(x, y):
            # Scan every feature, and every midpoint between consecutive
            # distinct sorted values, for the split of minimal weighted entropy.
            entropy_best = float('inf')
            dimension_best = -1
            value_best = -1
            n = len(x)
            for dim in range(x.shape[1]):
                # `order`, not `sorted`: avoid shadowing the builtin.
                order = np.argsort(x[:, dim])
                for i in range(1, n):
                    if x[order[i], dim] != x[order[i - 1], dim]:
                        # Candidate threshold: midpoint between distinct values.
                        value = (x[order[i], dim] + x[order[i - 1], dim]) / 2
                        x_left, x_right, y_left, y_right = split(x, y, dim, value)
                        p_left = len(x_left) / n
                        p_right = len(x_right) / n
                        entropy_current = (p_left * entropy(y_left)
                                           + p_right * entropy(y_right))
                        if entropy_current < entropy_best:
                            entropy_best = entropy_current
                            dimension_best = dim
                            value_best = value
            return entropy_best, dimension_best, value_best

        def create_tree(x, y, dim_list):
            ent, dim, value = find_best_split_dimension(x, y)
            node = Node(x, y, dim, value)
            # Stop when the node is trivially small, already (near-)pure, or no
            # valid split exists. dim == -1 means every feature is constant in
            # this partition; recursing then (as the original code did) splits
            # nothing off and loops forever. Checking BEFORE splitting also
            # avoids a wasted partition at every leaf.
            if len(x) <= 1 or ent < 1e-5 or dim == -1:
                return node
            x_left, x_right, y_left, y_right = split(x, y, dim, value)
            dim_list.append(dim)
            node.left = create_tree(x_left, y_left, dim_list)
            node.right = create_tree(x_right, y_right, dim_list)
            return node

        self.root = create_tree(x, y, list())
        return self

    def predict(self, x):
        """Predict a label for every sample (row) of ``x``."""

        def travel(sample, node):
            # Descend while a matching child exists; at a leaf, return the
            # majority label among the training samples stored there.
            if sample[node.dimension] <= node.value and node.left:
                return travel(sample, node.left)
            if sample[node.dimension] > node.value and node.right:
                return travel(sample, node.right)
            return Counter(node.y).most_common(1)[0][0]

        return np.array([travel(sample, self.root) for sample in x])

    def evaluate(self, x, y):
        """Print and return the classification accuracy on ``(x, y)``."""
        y_predicted = self.predict(x)
        # Mean of the boolean match vector IS the accuracy; clearer than the
        # original sum-of-divided-booleans formulation.
        sco = np.mean(y_predicted == y)
        print('Accuracy is: ' + str(sco))
        return sco
import sys
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from math import log
from DTNode import Node
class DecisionTree:
    """Binary decision tree classifier for continuous features.

    Thresholded entropy-minimising splits; leaves predict the majority label
    of the samples they hold. Requires an external ``Node`` class (from
    ``DTNode``) with attributes ``x``, ``y``, ``dimension``, ``value``,
    ``left`` and ``right``.
    """

    def __init__(self):
        # Set by train(); None until the tree is fitted.
        self.root = None
        print('Initiated')
        # Tree building is recursive; raise the limit for deep trees.
        sys.setrecursionlimit(3000)

    def train(self, x, y):
        """Build the tree from ``x`` (n_samples, n_features) and labels ``y``.

        Returns ``self`` for chaining.
        """

        def entropy(labels):
            # Shannon entropy (natural log) of the labels.
            total = len(labels)
            ent = 0.
            for count in Counter(labels).values():
                p = count / total
                ent -= p * np.log(p)
            return ent

        def split(x, y, dimension, value):
            # Boolean-mask partition on one feature at one threshold.
            left = (x[:, dimension] <= value)
            right = (x[:, dimension] > value)
            return x[left], x[right], y[left], y[right]

        def find_best_split_dimension(x, y):
            # Best (entropy, dim, threshold) over all features, trying the
            # midpoint between every pair of consecutive distinct values.
            entropy_best = float('inf')
            dimension_best = -1
            value_best = -1
            n = len(x)
            for dim in range(x.shape[1]):
                # Renamed from `sorted` to avoid shadowing the builtin.
                order = np.argsort(x[:, dim])
                for i in range(1, n):
                    if x[order[i], dim] != x[order[i - 1], dim]:
                        value = (x[order[i], dim] + x[order[i - 1], dim]) / 2
                        x_left, x_right, y_left, y_right = split(x, y, dim, value)
                        p_left = len(x_left) / n
                        p_right = len(x_right) / n
                        entropy_current = (p_left * entropy(y_left)
                                           + p_right * entropy(y_right))
                        if entropy_current < entropy_best:
                            entropy_best = entropy_current
                            dimension_best = dim
                            value_best = value
            return entropy_best, dimension_best, value_best

        def create_tree(x, y, dim_list):
            ent, dim, value = find_best_split_dimension(x, y)
            node = Node(x, y, dim, value)
            # Terminate on tiny nodes, (near-)pure nodes, or when no valid
            # split was found (dim == -1: all features constant here). The
            # original recursed unconditionally in that last case, which put
            # every sample in one child and recursed forever. Checking before
            # splitting also avoids a pointless partition at each leaf.
            if len(x) <= 1 or ent < 1e-5 or dim == -1:
                return node
            x_left, x_right, y_left, y_right = split(x, y, dim, value)
            dim_list.append(dim)
            node.left = create_tree(x_left, y_left, dim_list)
            node.right = create_tree(x_right, y_right, dim_list)
            return node

        self.root = create_tree(x, y, list())
        return self

    def predict(self, x):
        """Return the predicted label for each row of ``x``."""

        def travel(sample, node):
            # Follow the branch matching the sample while a child exists;
            # a leaf answers with its majority training label.
            if sample[node.dimension] <= node.value and node.left:
                return travel(sample, node.left)
            if sample[node.dimension] > node.value and node.right:
                return travel(sample, node.right)
            return Counter(node.y).most_common(1)[0][0]

        return np.array([travel(sample, self.root) for sample in x])

    def evaluate(self, x, y):
        """Print and return accuracy of the fitted tree on ``(x, y)``."""
        y_predicted = self.predict(x)
        # np.mean over the boolean matches is exactly the accuracy; the
        # original's np.sum((pred == y) / len(pred)) said the same thing
        # obscurely.
        sco = np.mean(y_predicted == y)
        print('Accuracy is: ' + str(sco))
        return sco
写在后面
所有的代码我已经展示出来了,大家可以配合注释食用。这个注释我是按照小白的思维去写的,可能会过于详细,大佬就忽略吧!
要是有同学要源码文件和数据集,可以加微信领取!大家拿到以后基本上就可以配合完成决策树的学习了。还有人工智能,机器学习的学习路径也分享给大家。冲冲冲!生命不息,学习不止!
![图片描述](https://img-blog.csdnimg.cn/direct/86760963089e4a46915579085e48a3fd.jpeg#pic_center)