C4.5算法是机器学习和数据挖掘领域中的一整套用于处理分类问题的算法。
该算法属于有监督学习:需要使用带有类别标签的样本进行训练。
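计算公式:
设数据集 $D$ 共有 $m$ 个类别,第 $i$ 类样本所占比例为 $p_i$;属性 $A$ 将 $D$ 划分为 $v$ 个子集 $D_1,\dots,D_v$。
信息熵: $$\mathrm{Info}(D) = -\sum_{i=1}^{m} p_i \log_2 p_i$$
按属性 $A$ 划分后的期望信息: $$\mathrm{Info}_A(D) = \sum_{j=1}^{v} \frac{|D_j|}{|D|}\,\mathrm{Info}(D_j)$$
信息增益: $$\mathrm{Gain}(A) = \mathrm{Info}(D) - \mathrm{Info}_A(D)$$
分裂信息: $$\mathrm{SplitInfo}_A(D) = -\sum_{j=1}^{v} \frac{|D_j|}{|D|} \log_2 \frac{|D_j|}{|D|}$$
信息增益率: $$\mathrm{GainRatio}(A) = \frac{\mathrm{Gain}(A)}{\mathrm{SplitInfo}_A(D)}$$
C4.5 选择信息增益率最大的属性作为分裂属性。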
样本数据:
Outlook | Temperature | Humidity | Windy | PlayGolf? |
sunny | 85 | 85 | FALSE | no |
sunny | 80 | 90 | TRUE | no |
overcast | 83 | 86 | FALSE | yes |
rainy | 70 | 96 | FALSE | yes |
rainy | 68 | 80 | FALSE | yes |
rainy | 65 | 70 | TRUE | no |
overcast | 64 | 65 | TRUE | yes |
sunny | 72 | 95 | FALSE | no |
sunny | 69 | 70 | FALSE | yes |
rainy | 75 | 80 | FALSE | yes |
sunny | 75 | 70 | TRUE | yes |
overcast | 72 | 90 | TRUE | yes |
overcast | 81 | 75 | FALSE | yes |
rainy | 71 | 91 | TRUE | no |
实现:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@Author : 德意志之力
@contact :zengliang0720@gmail.com
@File : simpleC4.5.py
@Time : 17/5/16 下午10:57
'''
import numpy as np
import pandas as pd
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier
def createTrainTree():
    """Load the golf data set from ``data.csv`` and return features and labels.

    Returns:
        tuple: ``(data, target)`` where ``data`` is a list of rows
        ``[outlook_code, temperature, humidity, windy]`` and ``target`` is a
        NumPy array holding the play-golf column value of each row.

    Raises:
        KeyError: if an expected column is missing from the CSV.
        ValueError: if ``Outlook`` holds a value other than
            sunny / overcast / rainy (compared case-insensitively).
    """
    # Read the CSV with pandas.
    frame = pd.read_csv("data.csv")
    # The label header may be spelled "Play Golf?" or "PlayGolf?"; dropping
    # the spaces lets both spellings resolve to the same column.
    frame.columns = [str(name).replace(" ", "") for name in frame.columns]

    # Class labels.
    target = np.array(frame["PlayGolf?"])

    # Work on an explicit copy so the write below never lands on a temporary
    # view of ``frame`` (chained assignment can silently be lost).
    features = frame[["Outlook", "Temperature", "Humidity", "Windy"]].copy()

    # Encode the categorical Outlook column as integers, ignoring case and
    # surrounding whitespace so "sunny" and "Sunny" both map to 0.
    outlook_codes = {"sunny": 0, "overcast": 1, "rainy": 2}
    normalized = features["Outlook"].astype(str).str.strip().str.lower()
    encoded = normalized.map(outlook_codes)
    if encoded.isnull().any():
        unknown = sorted(set(normalized[encoded.isnull()]))
        raise ValueError(
            "unexpected Outlook value(s): %s" % ", ".join(unknown)
        )
    features["Outlook"] = encoded.astype(int)

    # One plain Python list per sample; replaces the per-row ``DataFrame.ix``
    # loop (``.ix`` no longer exists in current pandas).
    data = features.values.tolist()
    return data, target
def treeLearn():
    """Fit a decision tree on the golf data and print a prediction and CV scores.

    Loads the training data through ``createTrainTree``, fits a
    ``DecisionTreeClassifier`` on it, prints the predicted class for one
    hand-written sample, then prints the 5-fold cross-validation scores.

    NOTE: scikit-learn's ``DecisionTreeClassifier`` implements an optimised
    CART, not C4.5 itself; it is used here as a stand-in.
    """
    data, target = createTrainTree()
    # Instantiate the decision tree classifier.
    dt = DecisionTreeClassifier()
    # Fit the model on the training data.
    dt.fit(data, target)
    # Predict the class of one sample. ``predict`` expects a 2-D input
    # (one row per sample), so the single sample is wrapped in a list.
    print(dt.predict([[0, 65, 76, False]]))
    # Run 5-fold cross-validation and print the score of each fold.
    print(cross_val_score(dt, data, target, cv=5))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    treeLearn()
如有问题,欢迎指正。