目录
ID3 算法
使用 numpy、pandas 库实现 ID3 算法;注意 ID3 不能处理连续型数据。下面给出 ID3 实现的全部代码,关键步骤附有解释。
import operator
import numpy as np
import pandas as pd
# Path of the CSV data set to load. NOTE: '路径' is a placeholder —
# set it to a real file path before calling read_data(file_path).
file_path = '路径'
def read_data(file_path):
    """Load a CSV file and return (samples as ndarray, column-name list).

    The column headers double as the feature-label list consumed by
    create_tree.
    """
    frame = pd.read_csv(file_path)
    column_names = frame.columns.tolist()
    return np.array(frame), column_names
def calculation_Shannon_Ent(data_set):
    """Return the Shannon entropy of the class labels (last column) of data_set."""
    total = len(data_set)
    # Tally how many rows carry each class label.
    tally = {}
    for row in data_set:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    # H = -sum(p * log2 p) over the label distribution.
    entropy = 0.0
    for freq in tally.values():
        p = freq / float(total)
        entropy -= p * np.log2(p)
    return entropy
def split_data_set(data_set, axis, value):
    """Rows whose feature at column `axis` equals `value`, with that column removed.

    ID3 measures the information gain of every feature; this helper extracts
    the subset belonging to one feature value so its entropy can be computed.
    E.g. split_data_set(data_set, 1, 'dark_green') keeps only the rows where
    column 1 is 'dark_green' and drops column 1 from each returned row.

    :param axis: column index of the feature being split on
    :param value: the feature value the kept rows must match
    :return: list of rows, each missing the `axis` column
    """
    return [
        list(row[:axis]) + list(row[axis + 1:])
        for row in data_set
        if row[axis] == value
    ]
def choose_best_feature_to_split(data_set):
    """Return the column index of the feature with the highest information gain.

    ID3 criterion: gain = H(D) - H(D|A). Returns -1 if no split improves
    on zero gain.
    """
    feature_count = len(data_set[0]) - 1  # last column is the class label
    dataset_entropy = calculation_Shannon_Ent(data_set)
    best_gain = 0.0
    best_index = -1
    for col in range(feature_count):
        # Distinct values this feature takes across the data set.
        values = {row[col] for row in data_set}
        conditional_entropy = 0.0
        for v in values:
            subset = split_data_set(data_set, col, v)
            weight = len(subset) / float(len(data_set))
            # H(D|A): entropy of each value's subset, weighted by its share.
            conditional_entropy += weight * calculation_Shannon_Ent(subset)
        gain = dataset_entropy - conditional_entropy
        if gain > best_gain:
            best_gain = gain
            best_index = col
    return best_index
def majorityCnt(class_list):
    """Return the most frequent class label in class_list.

    Ties break in favor of the label encountered first, matching the
    behavior of a stable reverse sort on the counts.
    """
    tally = {}
    for label in class_list:
        tally[label] = tally.get(label, 0) + 1
    # dicts preserve insertion order, so max() returns the first label to
    # reach the top count — same winner as the original sort-and-take-first.
    return max(tally, key=tally.get)
def create_tree(data_set, labels):
    """Recursively build an ID3 decision tree.

    Recursion stops when:
      1. every sample in data_set carries the same class label, or
      2. only the class column remains (no features left to split on),
         in which case the majority class is returned.

    :param data_set: list of rows; the last element of each row is the class label
    :param labels: feature names matching the feature columns of data_set
    :return: nested dict {feature_name: {feature_value: subtree_or_class_label}}
    """
    class_list = [example[-1] for example in data_set]
    # Stop: all samples share a single class.
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # Stop: only the class column is left — vote for the majority class.
    if len(data_set[0]) == 1:
        return majorityCnt(class_list)
    best_feat = choose_best_feature_to_split(data_set)
    best_feat_label = labels[best_feat]
    my_Tree = {best_feat_label: {}}
    # Build the child label list on a copy instead of del-ing from `labels`
    # in place: the original mutated the caller's list as a side effect.
    sub_labels = labels[:best_feat] + labels[best_feat + 1:]
    feat_values = {example[best_feat] for example in data_set}
    for value in feat_values:
        # Fresh copy per branch so sibling subtrees cannot affect each other.
        my_Tree[best_feat_label][value] = create_tree(
            split_data_set(data_set, best_feat, value),
            sub_labels[:],
        )
    return my_Tree
if __name__ == '__main__':
    # To load from CSV instead, uncomment:
    # data, label = read_data(file_path)
    # If the data has an ID column, drop it first — a unique ID trivially
    # maximizes information gain and would always be chosen:
    # data = data[:, 1:]
    # label = label[1:]
    # The input must be array-like: feature values followed by a class label.
    data = [[1, 1, 'yes'],
            [1, 1, 'yes'],
            [1, 0, 'no'],
            [0, 1, 'no'],
            [0, 1, 'no']]
    label = ['no surfacing', 'flippers']
    tree = create_tree(data, label)
    print(tree)
gini(基尼指数)
(此处代码片段在原文中缺失,待补充。)
决策树(sklearn 实现)
参数介绍
DecisionTreeClassifier 类的构造参数如下:
criterion="gini",
splitter="best",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.,
max_features=None,
random_state=None,
max_leaf_nodes=None,
min_impurity_decrease=0.,
min_impurity_split=None,
class_weight=None,
presort=False):
criterion="gini" | 划分质量的衡量标准,可选 "gini"(基尼指数)或 "entropy"(信息熵)
splitter | 节点划分策略,"best" 选最优划分,"random" 选随机划分
max_depth | 树的最大深度,None 表示不限制
min_samples_split | 内部节点再划分所需的最小样本数
min_samples_leaf | 叶节点所需的最小样本数
min_weight_fraction_leaf | 叶节点样本权重之和占总权重的最小比例
max_features | 寻找最优划分时考虑的最大特征数
random_state | 随机数种子,用于保证结果可复现
max_leaf_nodes | 最大叶节点数,None 表示不限制
min_impurity_decrease | 节点划分须带来的最小不纯度减少量
min_impurity_split | 提前停止划分的不纯度阈值(新版 sklearn 已弃用)
class_weight | 各类别的权重,用于处理类别不平衡
presort | 是否对数据预排序以加速寻找最优划分(新版 sklearn 已弃用)