Decision Tree Learning

#!/usr/bin/python
# -*- coding:utf-8 -*-
# Author:Tom
# Date:2017/05/14
# Email:bluelovelydog@gmail.com
# Test:decisiontree



# sample dataset; the last column of each row is the class label
dataset = [
	['slashdot', 'USA', 'yes', 18, 'None'],
	['google', 'France', 'yes', 23, 'Premium'],
	['ss', 'china', 'no', 21, '']
]

# tree node
class DecisionNode:
	def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
		"""
		col:保存的是最优的条件
	    value:最优的测试条件
		result:保存的是树的结果
		tb:实际就是符合条件的set1集合
		fb:实际就是不符合条件的set2集合
		"""
		self.col = col
		self.value = value
		self.results = results
		self.tb = tb
		self.fb = fb
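
# Two kinds of nodes come out of this class: leaves, which only carry
# results, and internal nodes, which carry a (col, value) test plus both
# branches. A purely illustrative construction (the values are made up):
# leaf = DecisionNode(results={'Premium': 1})
# node = DecisionNode(col=3, value=21, tb=leaf, fb=DecisionNode(results={'None': 1}))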

# divide function
def divide_set(rows, column, value):
	"""
	rows: the dataset
	column: index of the column to test
	value: the value to compare against
	Numeric values are tested with row[column] >= value;
	everything else with row[column] == value.
	"""
	if isinstance(value, (int, float)):
		split_func = lambda row: row[column] >= value
	else:
		split_func = lambda row: row[column] == value

	set1 = [row for row in rows if split_func(row)]
	set2 = [row for row in rows if not split_func(row)]
	return (set1, set2)
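
# A quick sanity check of divide_set on the sample dataset above, splitting
# on the numeric column at index 3 at the value 21 (illustrative only):
# set1, set2 = divide_set(dataset, 3, 21)
# set1 -> rows with row[3] >= 21: the 'google' and 'ss' rows
# set2 -> rows with row[3] <  21: the 'slashdot' row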

# count the class labels (last column) of the dataset
def unique_counts(rows):
	"""
	The class label is the last element of each row, so row[-1]
	(i.e. row[len(row) - 1]) picks it out; results maps each
	label to the number of rows that carry it.
	"""
	results = {}
	for row in rows:
		r = row[-1]
		if r not in results:
			results[r] = 0
		results[r] += 1
	return results
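
# On the sample dataset each label appears exactly once (illustrative check):
# print(unique_counts(dataset))  # -> {'None': 1, 'Premium': 1, '': 1}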

# entropy
def entropy(rows):
	"""
	Shannon entropy of the class labels:
	H = -sum(p * log2(p)) over the probability p of each label.
	"""
	from math import log
	results = unique_counts(rows)
	ent = 0.0
	for r in results.keys():
		p = float(results[r]) / len(rows)
		ent = ent - p * log(p, 2)
	return ent
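
# Worked example: the sample dataset has three rows and three distinct
# labels, so each p = 1/3 and H = -3 * (1/3) * log2(1/3) = log2(3):
# print(entropy(dataset))  # -> ~1.585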

# build the tree recursively; scoref measures impurity (entropy by default)
def build_tree(rows, scoref=entropy):
	if len(rows) == 0: return DecisionNode()  # empty set: return a bare node
	current_score = scoref(rows)              # impurity of the current set

	best_gain = 0.0
	best_criteria = None
	best_sets = None

	col_count = len(rows[0]) - 1  # the last column is the label, so skip it
	for col in range(0, col_count):
		# collect the distinct values appearing in this column
		col_values = {}
		for row in rows:
			col_values[row[col]] = 1
		for value in col_values.keys():
			(set1, set2) = divide_set(rows, col, value)

			# information gain of splitting on (col, value)
			p = float(len(set1)) / len(rows)
			gain = current_score - p*scoref(set1) - (1-p)*scoref(set2)
			if gain > best_gain and len(set1) > 0 and len(set2) > 0:
				best_gain = gain
				best_criteria = (col, value)
				best_sets = (set1, set2)
	if best_gain > 0:
		true_branch = build_tree(best_sets[0], scoref)
		false_branch = build_tree(best_sets[1], scoref)
		return DecisionNode(col=best_criteria[0], value=best_criteria[1], tb=true_branch, fb=false_branch)
	else:
		return DecisionNode(results=unique_counts(rows))
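
# Building the tree over the sample dataset ties everything together. The
# classify function below is not part of the code above; it is a minimal
# sketch of how the node fields drive prediction, mirroring the numeric/
# categorical test used in divide_set:
tree = build_tree(dataset)

def classify(observation, node):
	# leaf node: return the stored label counts
	if node.results is not None:
		return node.results
	v = observation[node.col]
	if isinstance(v, (int, float)):
		branch = node.tb if v >= node.value else node.fb
	else:
		branch = node.tb if v == node.value else node.fb
	return classify(observation, branch)

# e.g. {'Premium': 1}; the exact tree can vary with dict iteration order
print(classify(['google', 'UK', 'no', 24], tree))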
