1. Using sklearn to predict whether a customer buys a computer
import csv
from sklearn import preprocessing, tree
from sklearn.feature_extraction import DictVectorizer

featureList = []  # feature container -- note: a list of {feature: value} dicts, one per sample
labelList = []    # label container
data = []         # raw data set
header = []
try:
    # use a raw string so the backslashes in the Windows path are not treated as escapes
    with open(r'D:\machineLearning\src\decisionTree\AllElectronics.csv', 'r') as allElectronics:
        reader = csv.reader(allElectronics)
        header = next(reader)  # ['RID', 'age', 'income', 'student', 'credit_rating', 'class_buys_computer']
        data = [row for row in reader]  # list comprehension
except csv.Error as e:
    print("Error reading CSV file at line %s: %s" % (reader.line_num, e))
# a row looks like: ['1', 'youth', 'high', 'no', 'fair', 'no']
# Build featureList as a list of dicts: [{feature1: value1, feature2: value2, ...}, ...]
for row in data:
    labelList.append(row[-1])  # the last column of each row is the label
    rowDict = {}
    for i in range(1, len(row) - 1):  # skip the RID column and the label column
        rowDict[header[i]] = row[i]
    featureList.append(rowDict)
# Turn featureList into a numeric one-hot feature matrix
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()
print("dummyX: " + str(dummyX))
print(vec.get_feature_names())  # in sklearn >= 1.0 use vec.get_feature_names_out()
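# DictVectorizer sorts the one-hot columns by feature name, so (assuming the
# classic 14-row AllElectronics data) the column order is:
#   ['age=middle_aged', 'age=senior', 'age=youth',
#    'credit_rating=excellent', 'credit_rating=fair',
#    'income=high', 'income=low', 'income=medium',
#    'student=no', 'student=yes']
# This ordering matters when we edit individual columns of a row further below.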
# Turn labelList into a numeric vector
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print("dummyY: " + str(dummyY))
# Build the decision tree
clf = tree.DecisionTreeClassifier(criterion='entropy')  # entropy as the impurity measure, i.e. ID3-style splits
clf = clf.fit(dummyX, dummyY)
print("clf: " + str(clf))
# Draw the decision tree
# with open('allElectronicInformationGainOri.dot', 'w') as f:
#     f = tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)
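# Once the .dot file exists, it can be rendered with the Graphviz command-line
# tool (assuming Graphviz is installed), e.g.:
#   dot -Tpdf allElectronicInformationGainOri.dot -o allElectronics.pdf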
# Prediction experiment
oneRow = dummyX[0, :]
newRowX = oneRow.copy()  # copy, so we do not overwrite the first row of dummyX in place
newRowX[0] = 1  # column 0 is 'age=middle_aged' (see the column order above)
newRowX[2] = 0  # column 2 is 'age=youth'; the two edits change age from youth to middle_aged
print("newRowX: " + str(newRowX))
willBePredict = []
willBePredict.append(newRowX)
predictedY = clf.predict(willBePredict)
print("predictedY: " + str(predictedY[0]))  # predicts 1, i.e. the customer buys a computer
2. The underlying principle
# -*- coding: utf-8 -*-
import operator
from math import log

# Build the sample data set
def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
# Compute the Shannon entropy of the data set
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)  # number of samples
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    # entropy = -sum(p * log2(p)) over all class labels
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
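# Worked example on the sample data (2 'yes', 3 'no'):
#   H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.5288 + 0.4422 ≈ 0.971
# so calcShannonEnt(dataSet) returns about 0.971.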
# Split the data set on a given feature: keep the rows whose value at index
# axis equals value, and drop that column from each kept row.
# This is the building block for computing each feature's information gain.
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
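# For example, splitting on feature 0 with value 1:
#   splitDataSet(dataSet, 0, 1)  ->  [[1, 'yes'], [1, 'yes'], [0, 'no']]
# the three matching rows are kept, with column 0 removed.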
# Compute the information gain of every feature and return the index of the
# feature with the largest gain
def chooseBestFeatureToSplit(dataSet):
    baseEntropy = calcShannonEnt(dataSet)  # entropy of the whole data set
    numFeatures = len(dataSet[0]) - 1      # number of features (the last column is the label)
    bestInfoGain = 0.0; bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)  # the distinct values of feature i
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)  # rows where feature i equals value
            prob = len(subDataSet) / float(len(dataSet))  # relative frequency (estimated probability)
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # information gain of feature i
        if infoGain > bestInfoGain:  # remember the index of the feature with the largest gain
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
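# Worked example on the sample data (base entropy ≈ 0.971):
#   feature 0: value 1 -> {yes, yes, no} (H ≈ 0.918), value 0 -> {no, no} (H = 0)
#              newEntropy = 3/5 * 0.918 ≈ 0.551, gain ≈ 0.420
#   feature 1: value 1 -> {yes, yes, no, no} (H = 1.0), value 0 -> {no} (H = 0)
#              newEntropy = 4/5 * 1.0 = 0.8,    gain ≈ 0.171
# so chooseBestFeatureToSplit(dataSet) returns 0.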
# Majority vote: return the class label that occurs most often
def majorityCnt(classList):
    classCount = {}
    for vote in classList:  # classList is a plain list, so iterate it directly (not .keys())
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
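# e.g. majorityCnt(['yes', 'no', 'no']) -> 'no'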
# Build the decision tree recursively
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]  # e.g. ['yes', 'yes', 'no', 'no', 'no']
    if classList.count(classList[0]) == len(classList):  # case 1: all labels identical, stop splitting
        return classList[0]
    if len(dataSet[0]) == 1:  # case 2: no features left (only the label column), return the majority class
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # pick the feature with the largest information gain
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy, so recursive calls do not clobber each other's label list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
dataSet, labels = createDataSet()
# ent = calcShannonEnt(dataSet)
# print(ent)
# data_set = splitDataSet(dataSet, 1, 1)
# print(data_set)
# bestFeature = chooseBestFeatureToSplit(dataSet)
# print(bestFeature)
myTree = createTree(dataSet, labels)
print(myTree)  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
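The nested dict returned by createTree can also be used to classify new samples. A minimal sketch, not part of the original script, that walks the tree from the root until it reaches a leaf label (note that createTree deletes entries from labels in place, so a fresh label list is passed here):

def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]    # feature name stored at this node
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)  # map the feature name back to its column index
    subTree = secondDict[testVec[featIndex]]
    if isinstance(subTree, dict):           # internal node: keep descending
        return classify(subTree, featLabels, testVec)
    return subTree                          # leaf: the predicted class label

print(classify(myTree, ['no surfacing', 'flippers'], [1, 1]))  # 'yes'
print(classify(myTree, ['no surfacing', 'flippers'], [1, 0]))  # 'no'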