#!/usr/bin/python
# -*- coding:utf-8 -*-
# Author:Tom
# Date:2017/05/14
# Email:bluelovelydog@gmail.com
# Test:decisiontree
# dataset: each row's last element is the class label
dataset = [
    ['slashdot', 'USA', 'yes', 18, 'None'],
    ['google', 'France', 'yes', 23, 'Premium'],
    ['ss', 'china', 'no', 21, '']
]
# tree node
class DecisionNode:
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        """
        col: index of the column tested by the best split criterion
        value: the value that column is compared against
        results: label counts for a leaf node (None for internal nodes)
        tb: branch for rows that satisfy the test (the set1 subset)
        fb: branch for rows that fail the test (the set2 subset)
        """
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb
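# A quick illustration (values made up for this sketch, not taken from the
# dataset above): a leaf keeps label counts in `results`, while an internal
# node keeps a test plus its two branches.
example_leaf = DecisionNode(results={'Premium': 2})
example_node = DecisionNode(col=2, value='yes', tb=example_leaf,
                            fb=DecisionNode(results={'None': 1}))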
# divide function
def divide_set(rows, column, value):
    """
    rows: the dataset
    column: index of the column to test
    value: the value to test against
    Split rows into two sets: for numeric values the test is
    row[column] >= value, otherwise it is row[column] == value.
    """
    if isinstance(value, (int, float)):
        split_func = lambda row: row[column] >= value
    else:
        split_func = lambda row: row[column] == value
    set1 = [row for row in rows if split_func(row)]      # rows passing the test
    set2 = [row for row in rows if not split_func(row)]  # rows failing the test
    return (set1, set2)
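# Example (a minimal sketch using the sample dataset above): splitting on
# column 2 with value 'yes' puts the two 'yes' rows in set1 and the single
# 'no' row in set2.
print(divide_set(dataset, 2, 'yes'))  # set1: slashdot and google rows; set2: the ss row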
# count the class labels (last column) of the dataset
def unique_counts(rows):
    """
    Count how many times each class label appears. Since indexing starts
    at 0, the last element of a row is row[len(row) - 1], i.e. row[-1].
    """
    results = {}
    for row in rows:
        r = row[-1]  # the class label is the last element of the row
        if r not in results:
            results[r] = 0
        results[r] += 1
    return results
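# Example: each label in the sample dataset appears exactly once
# (key order may vary).
print(unique_counts(dataset))  # -> {'None': 1, 'Premium': 1, '': 1}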
# entropy
def entropy(rows):
    """
    Entropy of the class labels: H = -sum_i p_i * log2(p_i),
    where p_i is the fraction of rows carrying label i.
    """
    from math import log
    results = unique_counts(rows)
    ent = 0.0
    for r in results.keys():
        p = float(results[r]) / len(rows)
        ent -= p * log(p, 2)
    return ent
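# Example: the three sample rows carry three distinct labels, each with
# p = 1/3, so H = -3 * (1/3) * log2(1/3) = log2(3).
print(entropy(dataset))  # -> ~1.585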
# buildtree
def build_tree(rows, scoref=entropy):
    if len(rows) == 0:
        return DecisionNode()  # empty set: return a bare node
    current_score = scoref(rows)  # score of the current set (scoref defaults to entropy above)
    best_gain = 0.0
    best_criteria = None
    best_sets = None
    col_count = len(rows[0]) - 1  # skip the last column, which holds the label
    for col in range(0, col_count):
        # collect the distinct values appearing in this column
        col_values = {}
        for row in rows:
            col_values[row[col]] = 1
        for value in col_values.keys():
            (set1, set2) = divide_set(rows, col, value)
            p = float(len(set1)) / len(rows)
            # information gain of splitting on (col, value)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)
    if best_gain > 0:
        true_branch = build_tree(best_sets[0])
        false_branch = build_tree(best_sets[1])
        return DecisionNode(col=best_criteria[0], value=best_criteria[1],
                            tb=true_branch, fb=false_branch)
    else:
        return DecisionNode(results=unique_counts(rows))
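# Example usage (a minimal sketch; print_tree is a small helper added here
# for illustration and is not part of the original code):
def print_tree(node, indent=''):
    if node.results is not None:  # leaf: print the label counts
        print(indent + str(node.results))
    else:  # internal node: print the test, then both branches
        print(indent + 'col ' + str(node.col) + ' : ' + str(node.value) + ' ?')
        print_tree(node.tb, indent + '  ')
        print_tree(node.fb, indent + '  ')

tree = build_tree(dataset)
print_tree(tree)  # with only three rows, every leaf ends up holding one label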