"""决策树C4.5算法复现(基于数据集win分类)

C4.5 对比 ID3:
优势:①ID3仅能处理离散值,C4.5可以处理连续值(采用二分法选择最佳分类标准);
    ②C4.5可以处理缺失值,ID3无法处理缺失值
本文主要生成整棵决策树,关于剪枝处理和对缺失值的处理以后我会慢慢补充
"""
from scipy import *
from math import log
import operator
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
def load_dataSet(train_size):
    """Load the wine dataset from an Excel sheet and split it into train/test sets.

    Reads rows 0-178 and columns 1-13 of the sheet with pandas (features plus
    the class label in the last column), converts them to plain row lists, and
    uses sklearn's train_test_split so that ``train_size`` samples end up in
    the training set and the rest in the test set.

    Args:
        train_size: number of samples to place in the training set.

    Returns:
        (train_data, test_data): two lists of row lists.
    """
    fpath = r'd:win.xls'  # NOTE(review): drive-relative path — confirm r'd:\win.xls' was not intended
    # `df` instead of the original `object`, which shadowed the builtin.
    df = pd.read_excel(fpath)
    rows = df.iloc[0:179, 1:14].values.tolist()
    # Passing train_size directly replaces the original's redundant
    # element-by-element copy loop and the test_size = (n - train_size) / n
    # fraction arithmetic; the split is equivalent but exact in sample count.
    train_data, test_data = train_test_split(rows, train_size=train_size)
    return train_data, test_data
def Ent(data_set):
    """Return the Shannon entropy info(D) of the class labels in data_set.

    Each sample row is a list whose last element is the class label; the
    entropy is computed over the label frequencies, in bits (log base 2).

    Args:
        data_set: list of sample rows; row[-1] is the class label.

    Returns:
        Entropy in bits as a float (0.0 for an empty or single-class dataset).
    """
    num_entries = len(data_set)
    # Count occurrences of each label; dict.get replaces the original's
    # non-idiomatic `label in label_number.keys()` membership test.
    label_number = {}
    for entry in data_set:
        label = entry[-1]
        label_number[label] = label_number.get(label, 0) + 1
    # info(D) = -sum(p * log2(p)); iterate counts directly rather than
    # iterating keys and re-indexing the dict.
    info_D = 0.0
    for count in label_number.values():
        prob = count / num_entries
        info_D -= prob * log(prob, 2)
    return info_D
def split_data_set(data_set, index, value, continuo, part=0):
"""
本处主要是对连续值和离散值的处理
"""
rest_data_set = []
if continuo ==False:
for entry in data_set:
if entry[index] == value:
reduced_entry = entry[:index]
reduced_entry.extend(entry[index + 1:]) # 划分后去除数据中第index列的值
rest_data_set.append(reduced_entry)
else:
for entry in data_set:
if part == 0 and float(entry[index]) <= value: # 求划分点左侧的数据集
reduced_entry = entry[:index]
reduced_entry.extend(entry[index + 1:]) # 划分后去除数据中第index列的值
rest_data_set