You can study the theory on your own; the code here implements the decision-tree logic from scratch, without using sklearn.
import numpy as np
import pandas as pd
from collections import Counter
class DecisionTree:
    class Node:
        def __init__(self, feature_index=None, threshold=None, value=None, left=None, right=None):
            self.feature_index = feature_index  # index of the feature used for the split
            self.threshold = threshold          # split threshold
            self.value = value                  # predicted value for a leaf node
            self.left = left                    # left subtree
            self.right = right                  # right subtree

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth                  # maximum tree depth
        self.min_samples_split = min_samples_split  # minimum number of samples required to split
        self.root = None                            # root node
    def fit(self, X, y):
        self.root = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))
        # stopping conditions: max depth reached, too few samples, or a pure node
        if depth == self.max_depth or n_samples < self.min_samples_split or n_classes == 1:
            value = self._most_common_label(y)
            return self.Node(value=value)
        best_feature_index, best_threshold = self._find_best_split(X, y)
        # stop if no valid split was found
        if best_feature_index is None or best_threshold is None:
            value = self._most_common_label(y)
            return self.Node(value=value)
        left_indices = X[:, best_feature_index] < best_threshold
        right_indices = ~left_indices
        # recursively build the subtrees
        left_branch = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_branch = self._build_tree(X[right_indices], y[right_indices], depth + 1)
        return self.Node(feature_index=best_feature_index, threshold=best_threshold,
                         left=left_branch, right=right_branch)
    def _find_best_split(self, X, y):
        n_samples, n_features = X.shape
        best_info_gain = -1
        best_feature_index = None
        best_threshold = None
        entropy_parent = self._entropy(y)  # entropy of the parent node
        for feature_index in range(n_features):
            unique_values = np.unique(X[:, feature_index])
            for threshold in unique_values:
                left_indices = X[:, feature_index] < threshold
                right_indices = ~left_indices
                # skip splits that leave one side empty
                if not left_indices.any() or not right_indices.any():
                    continue
                # information gain of this candidate split
                info_gain = self._information_gain(entropy_parent, y[left_indices], y[right_indices])
                if info_gain > best_info_gain:  # keep the split with the largest information gain
                    best_info_gain = info_gain
                    best_feature_index = feature_index
                    best_threshold = threshold
        return best_feature_index, best_threshold
    def _entropy(self, y):
        # entropy of a label array: -sum(p * log2(p)) over the class probabilities
        counter = Counter(y)
        probabilities = [count / len(y) for count in counter.values()]
        entropy = -sum(p * np.log2(p) for p in probabilities)
        return entropy

    def _information_gain(self, entropy_parent, y_left, y_right):
        # information gain = parent entropy minus the weighted average of the child entropies
        n_total = len(y_left) + len(y_right)
        p_left, p_right = len(y_left) / n_total, len(y_right) / n_total
        info_gain = entropy_parent - (p_left * self._entropy(y_left) + p_right * self._entropy(y_right))
        return info_gain

    def _most_common_label(self, y):
        # majority class, used as the leaf prediction
        counter = Counter(y)
        most_common = counter.most_common(1)
        return most_common[0][0]
    def predict(self, X):
        return [self._traverse_tree(x, self.root) for x in X]

    def _traverse_tree(self, x, node):
        # walk down the tree until a leaf is reached
        if node.value is not None:
            return node.value
        if x[node.feature_index] < node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)
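Before wiring up the data, the entropy and information-gain helpers can be sanity-checked directly on small label arrays. The snippet below is a hedged illustration, not part of the original script; the values are made up just to show the expected behaviour.

# Hypothetical sanity check of the helpers (illustrative values only).
demo = DecisionTree()
print(demo._entropy(np.array([0, 0, 1, 1])))   # 1.0 bit for a 50/50 label split
print(demo._entropy(np.array([0, 0, 0, 1])))   # about 0.811 bits for a 3:1 split
# parent entropy of 1.0 (50/50), perfectly separated children -> gain of 1.0
print(demo._information_gain(1.0, np.array([0, 0]), np.array([1, 1])))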
data = pd.read_excel("C:/Users/wxc/Desktop/xuexi/python/pythonProject/机器学习/决策树/train.xlsx")
x_train = np.array(data.iloc[:, 1:5])  # feature columns
y_train = np.array(data.iloc[:, 6])    # label column
tree = DecisionTree(max_depth=4, min_samples_split=1)
tree.fit(x_train, y_train)
data1 = pd.read_excel("C:/Users/wxc/Desktop/xuexi/python/pythonProject/机器学习/决策树/test.xlsx", header=None)  # features of the new samples
x_test = np.array(data1.iloc[:, 1:5])
y_test = np.array(data1.iloc[:, 6])
predictions = tree.predict(x_test)
print("Predictions:", predictions)
c = 0
for i in range(len(y_test)):
    if y_test[i] == predictions[i]:
        c = c + 1
print("Accuracy:")
print(c / len(y_test))
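If the Excel files above are not at hand, the class can still be exercised end to end on a tiny synthetic dataset. The data below is invented purely for illustration (it is not the train.xlsx/test.xlsx data used above), but it shows the expected workflow and output.

# Hypothetical end-to-end check on made-up data.
x_demo = np.array([[2.0, 1.0],
                   [1.5, 2.0],
                   [3.0, 0.5],
                   [6.0, 1.0],
                   [7.5, 2.5],
                   [8.0, 0.0]])
y_demo = np.array([0, 0, 0, 1, 1, 1])  # class 0 for small first feature, class 1 for large
demo_tree = DecisionTree(max_depth=2)
demo_tree.fit(x_demo, y_demo)
# should predict class 0 for the first sample and class 1 for the second
print(demo_tree.predict(np.array([[2.5, 1.0], [7.0, 1.5]])))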