数据处理
数据就这样,也没啥好处理的
qwq
决策树(删减版)
删减在于,用的字典。
所以判断 `return` 的时候少了,为了方便递归。
少了启发式算法,直接用信息增益(代码里计算的是信息增益,不是增益率)
计算信息增益
def calcShannonEnt(dataSet):
    """Shannon entropy (natural log, i.e. nats) of the class labels.

    Each example's last element is taken as its class label.
    Returns 0 for an empty dataSet.
    """
    labels = [row[-1] for row in dataSet]  # last column is the class
    total = len(labels)
    entropy = 0.0
    for label in set(labels):
        frac = labels.count(label) / total
        entropy -= frac * log(frac)
    return entropy
根据信息增益找最佳划分特征
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    The last column of each row is the class label and is never considered.
    Returns -1 when no feature achieves a strictly positive gain.
    """
    featureCount = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestFeature, bestGain = -1, 0.0
    for featIdx in range(featureCount):
        # Weighted average entropy of the partitions induced by this feature.
        splitEntropy = 0.0
        for val in {row[featIdx] for row in dataSet}:
            part = splitDataSet(dataSet, featIdx, val)
            weight = len(part) / float(len(dataSet))
            splitEntropy += weight * calcShannonEnt(part)
        gain = baseEntropy - splitEntropy  # reduction in entropy
        if gain > bestGain:
            bestGain, bestFeature = gain, featIdx
    return bestFeature
生成决策树
def createTree(dataSet,labels):
    """Recursively build an ID3 decision tree as nested dicts.

    dataSet: rows whose last column is the class label.
    labels:  feature names aligned with the feature columns of dataSet.
    Returns a class label (leaf) or {featureLabel: {featureValue: subtree}}.

    Fix: the original del'd labels[bestFeat], mutating the caller's list;
    we now drop the consumed feature with slicing instead.
    """
    classList = [example[-1] for example in dataSet]  # last column is the class
    # All examples share one class: nothing left to split.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No features remain: fall back to the majority class.
    if len(labels) == 0:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # feature with max info gain
    bestFeatLabel = labels[bestFeat]
    # Remove the consumed feature without touching the caller's list.
    remainingLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    for value in set(example[bestFeat] for example in dataSet):
        # Copy per branch so sibling recursions don't share one list.
        subLabels = remainingLabels[:]
        subset = splitDataSet(dataSet, bestFeat, value)
        myTree[bestFeatLabel][value] = createTree(subset, subLabels)
    return myTree
myTree = createTree(lenses,lensesLabels)
总代码
# Load the lenses dataset: tab-separated rows, last column is the lens type.
# `with` closes the file even on error (the original leaked the handle).
with open('lenses.txt') as fr:
    lenses = [line.strip().split('\t') for line in fr]
#print(lenses)
lensesLabels = ['age','prescript','astigmatic','tearRate']
#计算原始数据的香农熵
from collections import Counter
from math import log

import numpy as np
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy (natural log, nats) of the class labels.

    dataSet: list of examples; the last element of each example is the class.
    Returns 0.0 for an empty dataSet.

    Improvement: one Counter pass instead of calling list.count once per
    distinct label (was O(k*n), now O(n)).
    """
    classList = [example[-1] for example in dataSet]  # last column is the class
    total = len(classList)
    shannonEnt = 0.0
    for count in Counter(classList).values():
        p = count / total
        shannonEnt -= p * log(p)
    return shannonEnt
#print(calcShannonEnt(lenses))
def majorityCnt(classList):
    """Return the most frequent label in classList.

    On ties the label met first while scanning the distinct labels wins,
    matching the original loop. Returns 0 for an empty list (the original
    initial value).
    """
    if not classList:
        return 0
    return max(set(classList), key=classList.count)
#classList = [example[-1] for example in lenses]#最后一个是类别
#print(majorityCnt(classList))
#划分数据集
def splitDataSet(dataSet,feature_index,feature_value):
    """Select the rows whose feature_index-th value equals feature_value,
    returning each with that column removed (the feature is consumed by
    the split).

    Slicing builds brand-new rows, so dataSet itself is never modified.
    """
    return [row[:feature_index] + row[feature_index + 1:]
            for row in dataSet
            if row[feature_index] == feature_value]
#print(splitDataSet(lenses,0,"pre"))
#选择特征划分
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    The last column of each row is the class label and is never considered.
    Returns -1 when no feature achieves a strictly positive gain.
    """
    featureCount = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestFeature, bestGain = -1, 0.0
    for featIdx in range(featureCount):
        # Weighted average entropy of the partitions induced by this feature.
        splitEntropy = 0.0
        for val in {row[featIdx] for row in dataSet}:
            part = splitDataSet(dataSet, featIdx, val)
            weight = len(part) / float(len(dataSet))
            splitEntropy += weight * calcShannonEnt(part)
        gain = baseEntropy - splitEntropy  # reduction in entropy
        if gain > bestGain:
            bestGain, bestFeature = gain, featIdx
    return bestFeature
#生成决策树
def createTree(dataSet,labels):
    """Recursively build an ID3 decision tree as nested dicts.

    dataSet: rows whose last column is the class label.
    labels:  feature names aligned with the feature columns of dataSet.
    Returns a class label (leaf) or {featureLabel: {featureValue: subtree}}.

    Fix: the original del'd labels[bestFeat], mutating the caller's list
    (which is why the script rebuilds lensesLabels before classify);
    we now drop the consumed feature with slicing instead.
    """
    classList = [example[-1] for example in dataSet]  # last column is the class
    # All examples share one class: nothing left to split.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No features remain: fall back to the majority class.
    if len(labels) == 0:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # feature with max info gain
    bestFeatLabel = labels[bestFeat]
    # Remove the consumed feature without touching the caller's list.
    remainingLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    for value in set(example[bestFeat] for example in dataSet):
        # Copy per branch so sibling recursions don't share one list.
        subLabels = remainingLabels[:]
        subset = splitDataSet(dataSet, bestFeat, value)
        myTree[bestFeatLabel][value] = createTree(subset, subLabels)
    return myTree
myTree = createTree(lenses,lensesLabels)
#构造分类器
def classify(inputTree,featLabels,testVec):
    """Walk a nested-dict decision tree and return the predicted class.

    inputTree: {featureLabel: {featureValue: subtree-or-leaf}} as produced
               by createTree.
    featLabels: feature names aligned with the values in testVec.
    testVec:   feature values of the example to classify.
    Raises ValueError if a node's feature label is not in featLabels, and
    KeyError if the example's value has no branch at a node.

    Fixes vs the original: the `while cnt < len(featLabels)` cap could
    return an inner subtree dict instead of a label, and the `val = 0`
    sentinel produced a confusing KeyError when a label was missing.
    """
    node = inputTree
    while isinstance(node, dict):
        feature = next(iter(node))  # the single feature this node splits on
        value = testVec[featLabels.index(feature)]
        node = node[feature][value]
    return node
# Rebuild the label list: the createTree call above del's entries from the
# list it was given, so lensesLabels may no longer be complete.
lensesLabels = ['age', 'prescript', 'astigmatic','tearRate']
# Classify one example; prints the predicted lens type.
print(classify(myTree, ['age','prescript','astigmatic','tearRate'],['young','myope','yes','normal']))
sklearn 实例
唯一麻烦的就是,需要实现离散化特征,字符串不能训练
需要用 LabelEncoder。
LabelEncoder实现
# Load the lenses dataset again for the sklearn experiment.
# `with` closes the file even on error (the original leaked the handle).
with open('lenses.txt') as fr:
    lenses = [line.strip().split('\t') for line in fr]
#print(lenses)
lensesLabels = ['age','prescript','astigmatic','tearRate']
import numpy as np
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

# DecisionTreeClassifier cannot train on strings, so label-encode every
# feature column to integers in place.
le = preprocessing.LabelEncoder()
x_train = np.array([x[:-1] for x in lenses])
y_train = np.array([x[-1] for x in lenses])
for i in range(x_train.shape[1]):
    x_train[:, i] = le.fit_transform(x_train[:, i])
print(x_train)
print(y_train)

tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)
# Score on the training set itself — measures fit, not generalization.
print('Train score:{:.3f}'.format(tree.score(x_train, y_train)))