#!/usr/bin/python
# -*- coding:utf-8 -*-
# Author:Tom
# Date:2017/05/14
# Email:bluelovelydog@gmail.com
# Test:decisiontree
# dataset: each row's last element is the class label
dataset = [
    ['slashdot', 'USA', 'yes', 18, 'None'],
    ['google', 'France', 'yes', 23, 'Premium'],
    ['ss', 'china', 'no', 21, '']
]
# tree node
class DecisionNode:
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        """
        col: index of the column tested by the best split criterion
        value: the value that column is compared against
        results: label counts for a leaf node (None for internal nodes)
        tb: branch for rows that satisfy the test (the set1 subset)
        fb: branch for rows that fail the test (the set2 subset)
        """
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb
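# A quick illustration (values made up for this sketch, not taken from the
# dataset above): a leaf keeps label counts in `results`, while an internal
# node keeps a test plus its two branches.
example_leaf = DecisionNode(results={'Premium': 2})
example_node = DecisionNode(col=2, value='yes', tb=example_leaf,
                            fb=DecisionNode(results={'None': 1}))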
# divide function
def divide_set(rows, column, value):
    """
    rows: the dataset
    column: index of the column to test
    value: the value to test against
    Split rows into two sets: for numeric values the test is
    row[column] >= value, otherwise it is row[column] == value.
    """
    if isinstance(value, (int, float)):
        split_func = lambda row: row[column] >= value
    else:
        split_func = lambda row: row[column] == value
    set1 = [row for row in rows if split_func(row)]      # rows passing the test
    set2 = [row for row in rows if not split_func(row)]  # rows failing the test
    return (set1, set2)
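# Example (a minimal sketch using the sample dataset above): splitting on
# column 2 with value 'yes' puts the two 'yes' rows in set1 and the single
# 'no' row in set2.
print(divide_set(dataset, 2, 'yes'))  # set1: slashdot and google rows; set2: the ss row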
# count the class labels (last column) of the dataset
def unique_counts(rows):
    """
    Count how many times each class label appears. Since indexing starts
    at 0, the last element of a row is row[len(row) - 1], i.e. row[-1].
    """
    results = {}
    for row in rows:
        r = row[-1]  # the class label is the last element of the row
        if r not in results:
            results[r] = 0
        results[r] += 1
    return results
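# Example: each label in the sample dataset appears exactly once
# (key order may vary).
print(unique_counts(dataset))  # -> {'None': 1, 'Premium': 1, '': 1}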
# entropy
def entropy(rows):
    """
    Entropy of the class labels: H = -sum_i p_i * log2(p_i),
    where p_i is the fraction of rows carrying label i.
    """
    from math import log
    results = unique_counts(rows)
    ent = 0.0
    for r in results.keys():
        p = float(results[r]) / len(rows)
        ent -= p * log(p, 2)
    return ent
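# Example: the three sample rows carry three distinct labels, each with
# p = 1/3, so H = -3 * (1/3) * log2(1/3) = log2(3).
print(entropy(dataset))  # -> ~1.585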
# buildtree
def build_tree(rows, scoref=entropy):
    if len(rows) == 0:
        return DecisionNode()  # empty set: return a bare node
    current_score = scoref(rows)  # score of the current set (scoref defaults to entropy above)
    best_gain = 0.0
    best_criteria = None
    best_sets = None
    col_count = len(rows[0]) - 1  # skip the last column, which holds the label
    for col in range(0, col_count):
        # collect the distinct values appearing in this column
        col_values = {}
        for row in rows:
            col_values[row[col]] = 1
        for value in col_values.keys():
            (set1, set2) = divide_set(rows, col, value)
            p = float(len(set1)) / len(rows)
            # information gain of splitting on (col, value)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)
    if best_gain > 0:
        true_branch = build_tree(best_sets[0])
        false_branch = build_tree(best_sets[1])
        return DecisionNode(col=best_criteria[0], value=best_criteria[1],
                            tb=true_branch, fb=false_branch)
    else:
        return DecisionNode(results=unique_counts(rows))
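# Example usage (a minimal sketch; print_tree is a small helper added here
# for illustration and is not part of the original code):
def print_tree(node, indent=''):
    if node.results is not None:  # leaf: print the label counts
        print(indent + str(node.results))
    else:  # internal node: print the test, then both branches
        print(indent + 'col ' + str(node.col) + ' : ' + str(node.value) + ' ?')
        print_tree(node.tb, indent + '  ')
        print_tree(node.fb, indent + '  ')

tree = build_tree(dataset)
print_tree(tree)  # with only three rows, every leaf ends up holding one label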