不知道我是不是入行晚了(爆哭),为啥找的资料都是好久好久以前的了,算了将就着看看吧
1.第三章代码
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 21 11:27:43 2020
@author: 29033
"""
''' script 3-1 '''
from math import log
import operator
# Shannon entropy of a data set's class labels
def calcShannonEnt(dataSet):
    """Return the Shannon entropy of the class labels in dataSet.

    Each sample is a list whose last element is its class label.
    """
    total = len(dataSet)
    labelCounts = {}
    for sample in dataSet:
        label = sample[-1]
        labelCounts[label] = labelCounts.get(label, 0) + 1
    entropy = 0.0
    for count in labelCounts.values():
        p = count / float(total)  # empirical probability of this label
        entropy -= p * log(p, 2)
    return entropy
# Toy data set from chapter 3
def createDataSet():
    """Return (dataSet, labels): five fish samples and the two feature names."""
    fishSamples = [
        [1, 1, 'yes'],  # last column is the class label
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 0, 'no'],
    ]
    featureNames = ['no surfacing', 'flippers']  # names of the two feature columns
    return fishSamples, featureNames
# Filter the data set on one feature value
def splitDataSet(dataSet,axis,value)->list:
    """Return the samples whose feature `axis` equals `value`, with that
    feature column removed from each returned sample."""
    matching = []
    for sample in dataSet:
        if sample[axis] == value:
            # drop column `axis`; slicing copies, the input is not mutated
            matching.append(sample[:axis] + sample[axis + 1:])
    return matching
# Pick the split with the largest information gain
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature whose split maximises information
    gain over dataSet, or -1 when no split improves on the base entropy."""
    numFeatures = len(dataSet[0]) - 1  # last column is the label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for featIdx in range(numFeatures):
        distinctValues = {sample[featIdx] for sample in dataSet}
        weightedEntropy = 0.0
        for val in distinctValues:
            subset = splitDataSet(dataSet, featIdx, val)
            weight = len(subset) / float(len(dataSet))
            weightedEntropy += weight * calcShannonEnt(subset)  # weighted child entropy
        gain = baseEntropy - weightedEntropy
        if gain > bestInfoGain:
            bestInfoGain = gain
            bestFeature = featIdx
    return bestFeature
# Majority vote over a list of class labels
def majorityCnt(classList):
    """Return the most frequent class label in classList.

    Fixes two bugs in the original:
    * dict.iteritems() is Python 2 only and raises AttributeError on
      Python 3 -- use items();
    * the function returned the whole sorted (label, count) list, while
      its caller createTree() expects a single majority label for a leaf.
    """
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # sorted() returns a new list; sort on the count (field 1), descending
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]
# Recursively build the decision tree
def createTree(dataSet,labels):
    """Build a decision tree as nested dicts: {feature_name: {value: subtree}}.

    dataSet -- list of samples, last column is the class label
    labels  -- feature names, parallel to the feature columns

    Fix: operate on a copy of `labels`; the original `del labels[bestFeat]`
    mutated the caller's list, so a later call such as classify() with the
    same list no longer found the feature names.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # pure node: every sample has the same class
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # features exhausted: majority vote
    labels = labels[:]  # defensive copy -- never mutate the caller's list
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]  # this feature is consumed; safe, we own the copy
    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):
        subLabels = labels[:]  # each branch gets its own copy for recursion
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
# Classify one sample with a built tree
def classify(inputTree,featLabels,testVec):
    """Walk the decision tree and return the predicted class label.

    inputTree  -- nested-dict tree from createTree()
    featLabels -- feature names, used to map a node's feature to an index
    testVec    -- feature values of the sample to classify

    Fix: classLabel is now initialised, so a feature value that does not
    appear in the tree returns None instead of raising UnboundLocalError.
    """
    classLabel = None
    firstStr = list(inputTree.keys())[0]  # feature tested at this node
    secondDict = inputTree[firstStr]      # branches, keyed by feature value
    featIndex = featLabels.index(firstStr)  # map feature name -> column index
    for key in secondDict:
        if testVec[featIndex] == key:  # take the branch matching the sample
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]  # leaf reached
    return classLabel
# Persist a tree to disk
def storeTree(inputTree,filename):
    """Pickle inputTree into filename.

    Fix: use a with-block so the file handle is closed even when
    pickle.dump raises.
    """
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
def grabTree(filename):
    """Load and return a pickled tree from filename.

    Fix: the original opened the file and never closed it; a with-block
    guarantees the handle is released.
    """
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
2. treePlotter 代码
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 13 08:49:44 2020
@author: 29033
"""
import matplotlib
import matplotlib.pyplot as plt
# Shared drawing styles for the tree plot.
decisionNode = dict(boxstyle = "sawtooth",fc = "0.8")  # internal (decision) nodes: sawtooth box, grey fill
leafNode = dict(boxstyle = "round4",fc = "0.8")  # leaf nodes: rounded box
arrow_args = dict(arrowstyle = "<-")  # edge arrows drawn from child back to parent
matplotlib.rcParams['font.family'] = 'STSong'  # CJK-capable font so the Chinese node labels render
def plotNode(nodeTxt,centerPt,parentPt,nodeType):
    """Draw one annotated node with an arrow from parentPt to centerPt.

    nodeTxt  -- text shown inside the node box
    centerPt -- (x, y) of the node, in axes-fraction coordinates
    parentPt -- (x, y) the connecting arrow originates from
    nodeType -- box style dict (decisionNode or leafNode)

    NOTE(review): relies on createPlot.axl having been set by createPlot()
    before this is called.
    """
    createPlot.axl.annotate(nodeTxt,xy = parentPt,\
        xycoords = "axes fraction",
        xytext = centerPt,textcoords = "axes fraction",\
        va = "center",ha = "center",bbox = nodeType,
        arrowprops = arrow_args)
def createPlot():
    """Demo: draw one decision node and one leaf node on a blank figure.

    NOTE(review): this zero-argument version is shadowed by the later
    createPlot(inTree) definition below, so it is effectively dead code
    unless called before that definition is executed.
    """
    fig = plt.figure(1,facecolor = "white")
    fig.clf()
    createPlot.axl = plt.subplot(111,frameon = False)
    plotNode('决策结点',(0.5,0.1),(0.1,0.5),decisionNode)
    plotNode('叶结点',(0.8,0.1),(0.3,0.8),leafNode)
    plt.show()
def getNumLeafs(myTree):
    """Count the leaves (non-dict branch values) of a nested-dict tree.

    Example: {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
    has three leaves.
    """
    rootFeature = list(myTree.keys())[0]  # the single key is the split feature
    branches = myTree[rootFeature]        # {feature value: subtree-or-leaf}
    leaves = 0
    for child in branches.values():
        if type(child).__name__ == 'dict':
            leaves += getNumLeafs(child)  # internal node: recurse
        else:
            leaves += 1                   # leaf label
    return leaves
def getTreeDepth(myTree):
    """Return the depth of the tree: the number of split levels on the
    longest root-to-leaf path (a tree whose branches are all leaves has
    depth 1).

    Fixes two bugs in the original:
    * a dict branch contributed getTreeDepth(sub) without the `1 +` for
      the current level, and
    * its result overwrote maxDepth unconditionally instead of being
      compared against it.
    """
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])  # count this level too
        else:
            thisDepth = 1  # leaf directly under this node
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth
def retrieveTree(i):
    """Return the i-th of two canned trees used to test the plotting code."""
    cannedTrees = (
        {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
        {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head':{0: 'no', 1: 'yes'}}, 1: 'no'}}}},
    )
    return cannedTrees[i]
def plotMidText(cntrPt,parentPt,txtString):
    """Label the edge between a parent node and its child at the midpoint."""
    midX = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    midY = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.axl.text(midX, midY, txtString)
def plotTree(myTree,parentPt,nodeTxt):
    """Recursively draw myTree.  parentPt is the parent node's position and
    nodeTxt the edge label (the branch value that leads here).

    Uses function attributes initialised by createPlot():
      plotTree.totalW / totalD -- total leaf count / tree depth (scaling)
      plotTree.xOff / yOff     -- cursor for the next leaf / current level
    """
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)  # NOTE(review): computed but never used
    firstStr = list(myTree.keys())[0]
    # centre this subtree's root above the horizontal span of its leaves
    cntrPt = (plotTree.xOff + (1.0+float(numLeafs))/2.0/plotTree.totalW,\
        plotTree.yOff)
    plotMidText(cntrPt,parentPt,nodeTxt)
    plotNode(firstStr,cntrPt,parentPt,decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD  # descend one level
    for key in secondDict.keys():
        if type (secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key],cntrPt,str(key))  # internal node: recurse
        else:
            # leaf: advance the x cursor one slot and draw it
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            plotNode(secondDict[key],(plotTree.xOff,plotTree.yOff),\
                cntrPt,leafNode)
            plotMidText((plotTree.xOff,plotTree.yOff),cntrPt,str(key))
    plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD  # back up a level
# Main entry point for drawing a whole tree.
def createPlot(inTree):
    """Draw the decision tree inTree in a new matplotlib figure."""
    fig = plt.figure(1,facecolor = 'white')
    fig.clf()
    axprops = dict(xticks = [],yticks = [])  # hide both axes' ticks
    createPlot.axl = plt.subplot(111,frameon = False,**axprops)
    plotTree.totalW = float(getNumLeafs(inTree))  # plot width unit: leaf count
    plotTree.totalD = float(getTreeDepth(inTree))  # plot height unit: depth
    plotTree.xOff = -0.5/plotTree.totalW  # start half a slot left of origin
    plotTree.yOff = 1.0  # root sits at the top of the axes
    plotTree(inTree,(0.5,1.0),'')
    plt.show()
3. 调用函数代码
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 23 12:52:29 2020
@author: 29033
"""
# Driver script: build, persist and reload a decision tree.
import trees
import treePlotter
'''
#测试
myDat,labels = trees.createDataSet()
#myDat[0][-1] = 'maybe'
#print(trees.calcShannonEnt(myDat))
#print(trees.chooseBestFeatureToSplit(myDat))
#myTree = trees.createTree(myDat,labels)
#print(myTree)
myTree = treePlotter.retrieveTree(0)
print(trees.classify(myTree,labels,[1,0]))
#print(trees.classify(myTree,labels,[1,1]))
'''
'''
import treePlotter
treePlotter.createPlot()
myTree = treePlotter.retrieveTree(0)
treePlotter.createPlot(myTree)
#print(treePlotter.getTreeDepth(myTree))
'''
#trees.storeTree(myTree,'cl.txt')
# NOTE(review): grabTree expects cl.txt to already exist -- run the
# storeTree line above once first, otherwise this raises an IOError.
myTree = trees.grabTree('cl.txt')
print(myTree)
4. 第九章 树回归代码
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 23 10:40:34 2020
@author: 29033
"""
'''
from numpy import *
def loadDataSet(fileName):
dataMat = []
fr = open(fileName)
for line in fr.readlines():
curLine = line.strip().split('\t')
fltLine = list(map(float,curLine))
dataMat.append(fltLine)
return dataMat
def binSplitDataSet(dataSet,feature,value):
mat0 = dataSet[nonzero(dataSet[:,feature] > value)[0],:]
mat1 = dataSet[nonzero(dataSet[:,feature] <= value)[0],:]
return mat0,mat1
def regLeaf(dataSet):
return mean(dataSet[:,-1])
def regErr(dataSet):
return var(dataSet[:,-1])*shape(dataSet)[0]
def chooseBestSplit(dataSet,leafType = regLeaf,errType = regErr,ops=(1,4)):
#进行了预剪枝
tolS = ops[0];tolN = ops[1]
if len(set(dataSet[:,-1].T.tolist()[0])) == 1:
return None,leafType(dataSet)
m,n = shape(dataSet)
S = errType(dataSet)
bestS = inf;bestIndex = 0;bestValue = 0
for featIndex in range(n-1):
for splitVal in set(dataSet[:,featIndex].T.A.tolist()[0]):
mat0,mat1 = binSplitDataSet(dataSet,featIndex,splitVal)
if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
continue
newS = errType(mat0)+errType(mat1)
if newS < bestS:
bestIndex = featIndex
bestValue = splitVal
bestS = newS
if (S-bestS) < tolS:#下降的误差起码要有,不然不分类
return None,leafType(dataSet)
mat0,mat1 = binSplitDataSet(dataSet,bestIndex,bestValue)
if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):#最少应有的切分样本数目,过低容易导致过拟合
return None,leafType(dataSet)
return bestIndex,bestValue
def createTree(dataSet,leafType = regLeaf,errType = regErr,ops = (1,4)):
feat,val = chooseBestSplit(dataSet,leafType,errType,ops)
if feat == None:
return val
retTree = {}
retTree['spInd'] = feat
retTree['spVal'] = val
lSet,rSet = binSplitDataSet(dataSet,feat,val)
retTree['left'] = createTree(lSet,leafType,errType,ops)
retTree['right'] = createTree(rSet,leafType,errType,ops)
return retTree
#后剪枝
def isTree(obj):
return (type(obj).__name__ == 'dict')
def getMean(tree):
if isTree(tree['right']):
tree['right'] = getMean(tree['right'])
if isTree(tree['left']):
tree['left'] = getMean(tree['left'])
return (tree['left']+tree['right'])/2.0
def prune(tree,testData):#待剪枝的树与,实际上该是验证集
if shape(testData)[0] == 0:
return getMean(tree)
if isTree(tree['left']) or isTree(tree['right']):
lSet,rSet = binSplitDataSet(testData,tree['spInd'],tree['spVal'])
if isTree(tree['left']):
tree['left'] = prune(tree['left'],lSet)
if isTree(tree['right']):
tree['right'] = prune(tree['right'],rSet)
if not isTree(tree['left']) and not isTree(tree['right']):
lSet,rSet = binSplitDataSet(testData,tree['spInd'],tree['spVal'])
errorNoMerge = sum (power(lSet[:,-1] - tree['left'],2)) + sum(power(rSet[:,-1] - tree['right'],2))#不分支
treeMean = (tree['left'] + tree['right'])/2.0
errorMerge = sum(power(lSet[:,-1] - treeMean,2))#进行分支
if errorMerge < errorNoMerge:
print("merging")
return treeMean#合并
else:
return tree
else:
return tree#不剪枝
#模型树
def linearSolve(dataSet):
m,n = shape(dataSet)
X = mat(ones((m,n)));Y = mat(ones((m,1)))
X[:,1:n] = dataSet[:,0:n-1]#从0开始计数.第一列是常数项
Y = dataSet[:,-1]
xTx = X.T*X
if linalg.det(xTx) == 0.0:
raise NameError('This matrix is singular,cannot do inverse,\n\
try increasing the second value of ops')#不可逆
ws = xTx.I*(X.T*Y)
return ws,X,Y
def modelLeaf(dataSet):
ws,X,Y =linearSolve(dataSet)
return ws
def modelErr(dataSet):
ws,X,Y = linearSolve(dataSet)
yHat = X*ws
return sum(power(Y-yHat,2))
#用树回归进行预测
def regTreeEval(model,inDat):#对回归叶节点进行预测
return float(model)
def modelTreeEval(model,inMat):
n=inMat.shape[1]
X=mat(ones((1,n)))
X[:,1:n]=inMat[:,:-1]
return float(X*model)
def treeForeCast(tree,inData,modelEval = regTreeEval):
if not isTree(tree):
return modelEval(tree,inData)
if float(inMat[:,tree['spInd']])>tree['spVal']:
if isTree(tree['left']):
return treeForeCast(tree['left'],inData,modelEval)
else:
return modelEval(tree['left'],inData)
else:
if isTree(tree['right']):
return treeForeCast(tree['right'],inData,modelEval)
else:
return modelEval(tree['right'],inData)
def createForeCast(tree,testData,modelEval = regTreeEval):#预测
m = len(testData)
yHat = mat(zeros((m,1)))
for i in range(m):
yHat[i,0] = treeForeCast(tree,mat(testData[i]),modelEval)
return yHat
#这段是我写的,下面是大佬写的
'''
# -*- coding:utf-8 -*-
import math
from numpy import *
import matplotlib.pyplot as plt
def loadDataSet(fileName):
    """Read a tab-separated text file into a list of rows of floats.

    Fix: the original opened the file without ever closing it; a
    with-block guarantees the handle is released.
    """
    dataSet = []
    with open(fileName) as fr:
        for line in fr:  # iterating the handle streams line by line
            items = line.strip().split('\t')
            dataSet.append(list(map(float, items)))
    return dataSet
def regLeaf(dataMat):
    """Regression-tree leaf value: the mean of the target (last) column."""
    targetColumn = dataMat[:, -1]
    return mean(targetColumn)
def regErr(dataMat):
    """Total squared error of the target column: variance times row count."""
    targetColumn = dataMat[:, -1]
    return var(targetColumn) * dataMat.shape[0]
def modelLeaf(dataMat):
    """Model-tree leaf value: coefficients of a linear fit on dataMat."""
    coefficients, _, _ = linearSolve(dataMat)
    return coefficients
def modelErr(dataMat):
    """Squared error of the linear model fitted to dataMat."""
    coefficients, X, Y = linearSolve(dataMat)
    fitted = X * coefficients
    return sum(power(fitted - Y, 2))
def binSplitDataSet(dataMat,feature,value):
    """Split dataMat's rows on column `feature`: (rows > value, rows <= value)."""
    aboveIdx = nonzero(dataMat[:, feature] > value)[0]
    belowIdx = nonzero(dataMat[:, feature] <= value)[0]
    return dataMat[aboveIdx], dataMat[belowIdx]
def chooseBestFeature(dataMat,leafType,errType,ops):
    """Find the best (feature index, threshold) binary split of dataMat.

    Returns (None, leaf value) when no split is worthwhile: targets are all
    equal, the error reduction is below ops[0] (tolS), or either side of
    the best split has fewer than ops[1] (tolN) rows.

    Fix: the pre-pruning test used abs(S - bestS) < tolS; with abs() a
    candidate split that *increases* the error (bestS > S, possible with
    modelErr) could be accepted.  The plain `S - bestS < tolS` -- as in
    the book and the companion implementation above -- is the correct
    "did the error drop enough" test.
    """
    tolS = ops[0]; tolN = ops[1]
    # all target values identical -> nothing left to split
    if len(set(dataMat[:,-1].T.tolist()[0])) == 1:
        return None, leafType(dataMat)
    m, n = shape(dataMat)
    S = errType(dataMat)  # error before splitting
    bestS = inf; bestVal = 0; bestFeature = 0
    for feat in range(n - 1):  # last column is the target
        for splitValue in set(dataMat[:,feat].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataMat, feat, splitValue)
            if (mat0.shape[0] < tolN) or (mat1.shape[0] < tolN):
                continue  # split leaves too few rows on one side
            nowErr = errType(mat0) + errType(mat1)
            if nowErr < bestS:
                bestS = nowErr
                bestFeature = feat
                bestVal = splitValue
    if (S - bestS) < tolS:  # error reduction too small -> make a leaf
        return None, leafType(dataMat)
    mat0, mat1 = binSplitDataSet(dataMat, bestFeature, bestVal)
    if (mat0.shape[0] < tolN) or (mat1.shape[0] < tolN):
        return None, leafType(dataMat)
    return bestFeature, bestVal
def isTree(obj):
    """Return True when obj is an internal tree node (a dict), not a leaf."""
    typeName = type(obj).__name__
    return typeName == 'dict'
def createTree(dataMat,leafType=modelLeaf,errType=modelErr,ops=(1,4)):
    """Recursively build a CART tree over dataMat.

    Each internal node is a dict with keys:
      spInd/spVal -- split feature index / threshold
      lTree/rTree -- subtrees or leaf values
      leafN/total -- leaf count and sum of leaf values under this node,
                     precomputed at build time so post-pruning can collapse
                     subtrees quickly (a deliberate departure from the book).
    """
    feat,val=chooseBestFeature(dataMat,leafType,errType,ops)
    if feat==None:
        return val  # no worthwhile split: this is a leaf value
    retTree={}
    retTree['spInd']=feat
    retTree['spVal']=val
    leftMat,rightMat=binSplitDataSet(dataMat,feat,val)
    retTree['lTree']=createTree(leftMat,leafType,errType,ops)
    retTree['rTree']=createTree(rightMat,leafType,errType,ops)
    # Aggregate leafN/total over the four subtree/leaf shape combinations.
    # NOTE(review): with modelLeaf the leaves are coefficient matrices, so
    # 'total' becomes a matrix sum -- confirm that is intended for pruning.
    if isTree(retTree['lTree']) and isTree(retTree['rTree']):
        retTree['leafN']=retTree['lTree']['leafN']+retTree['rTree']['leafN']
        retTree['total']=retTree['lTree']['total']+retTree['rTree']['total']
    elif (not isTree(retTree['lTree'])) and isTree(retTree['rTree']):
        retTree['leafN']=1+retTree['rTree']['leafN']
        retTree['total']=retTree['lTree']+retTree['rTree']['total']
    elif isTree(retTree['lTree']) and (not isTree(retTree['rTree'])):
        retTree['leafN']=retTree['lTree']['leafN']+1
        retTree['total']=retTree['lTree']['total']+retTree['rTree']
    else:
        retTree['leafN']=2
        retTree['total']=retTree['lTree']+retTree['rTree']
    return retTree
def getMean(tree):
    """Collapse `tree` to a single leaf value: total / leafN, i.e. the
    mean of all leaf values below it (leaves pass through unchanged).

    NOTE(review): the child rebindings below store each child's *total*
    (sum), not its mean; the return value is unaffected because
    total/leafN are precomputed at this node, but the mutation itself
    looks suspect -- confirm it is intentional.
    """
    if isTree(tree):
        if isTree(tree['lTree']):
            tree['lTree']=tree['lTree']['total']
        if isTree(tree['rTree']):
            tree['rTree']=tree['rTree']['total']
        return tree['total']*1.0/tree['leafN']
    else:
        return tree
def prune(tree,testData):
    """Post-prune `tree` bottom-up against held-out testData.

    Recursively prunes both subtrees, then decides whether merging this
    node's two leaf children into one leaf lowers the test error.

    Fix: the merge test was inverted -- `errNoMerge < errMerge` merged
    exactly when keeping the split was *better*.  Merge only when the
    merged (single-leaf) error is the lower one, as in the companion
    implementation above.
    """
    if testData.shape[0]==0:
        return getMean(tree)  # no test rows reach this node: collapse it
    if isTree(tree['lTree']) or isTree(tree['rTree']):
        lSet,rSet=binSplitDataSet(testData,tree['spInd'],tree['spVal'])
        if isTree(tree['lTree']):
            tree['lTree']=prune(tree['lTree'],lSet)
        if isTree(tree['rTree']):
            tree['rTree']=prune(tree['rTree'],rSet)
    if not isTree(tree['lTree']) and not isTree(tree['rTree']):
        lSet,rSet=binSplitDataSet(testData,tree['spInd'],tree['spVal'])
        # test error when the split is kept
        errNoMerge=sum(power(lSet[:,-1]-tree['lTree'],2))+sum(power(rSet[:,-1]-tree['rTree'],2))
        treeMean=tree['total']/tree['leafN']  # precomputed collapse value
        # test error when the node is merged into a single leaf
        errMerge=sum(power(testData[:,-1]-treeMean,2))
        if errMerge<errNoMerge:
            print ("merging")
            return treeMean
        else:
            return tree
    else:
        return tree
def linearSolve(dataSet):
    """Fit y = X * ws by ordinary least squares.

    Returns (ws, X, Y): X is dataSet's feature columns prefixed with a
    column of ones (intercept term), Y is the last column of dataSet.
    Raises NameError when X^T X is singular.
    """
    m, n = shape(dataSet)
    X = mat(ones((m, n)))
    X[:, 1:n] = dataSet[:, 0:n-1]  # column 0 stays 1.0 for the intercept
    Y = dataSet[:, -1]
    xTx = X.T * X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular,cannot do inverse,\n\
try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)  # normal-equation solution
    return ws, X, Y
# Prediction at a regression-tree leaf.
def regTreeEval(model,inMat):
    """A regression-tree leaf stores a constant -- return it as a float."""
    return float(model)
# Prediction at a model-tree leaf.
def modelTreeEval(model,inMat):
    """Evaluate a model-tree leaf: prepend the intercept 1 and dot with ws."""
    n = inMat.shape[1]
    augmented = mat(ones((1, n)))
    augmented[:, 1:n] = inMat[:, :-1]  # last column (target) is excluded
    return float(augmented * model)
def treeForeCast(tree,inMat,modelEval=modelTreeEval):
    """Predict one sample by descending the tree to a leaf and evaluating it."""
    if not isTree(tree):
        return modelEval(tree, inMat)  # already at a leaf
    # pick the branch by comparing the split feature against the threshold
    branch = 'lTree' if float(inMat[:, tree['spInd']]) > tree['spVal'] else 'rTree'
    subtree = tree[branch]
    if isTree(subtree):
        return treeForeCast(subtree, inMat, modelEval)
    return modelEval(subtree, inMat)
def createForeCast(tree,testMat,modelEval=modelTreeEval):
    """Predict every row of testMat; returns an m x 1 matrix of predictions."""
    rowCount = testMat.shape[0]
    yHat = mat(zeros((rowCount, 1)))
    for rowIdx in range(rowCount):
        yHat[rowIdx] = treeForeCast(tree, testMat[rowIdx], modelEval)
    return yHat
def run():
    """Train a model tree on the bike-speed-vs-IQ data, print the test-set
    correlation matrix, and scatter-plot the training data.

    NOTE(review): requires bikeSpeedVsIq_train.txt and
    bikeSpeedVsIq_test.txt in the working directory.
    """
    dataSet=loadDataSet('bikeSpeedVsIq_train.txt')
    testSet=loadDataSet('bikeSpeedVsIq_test.txt')
    tree=createTree(mat(dataSet),ops=(1,20))
    yHat=createForeCast(tree,mat(testSet))
    print (corrcoef(yHat.T,mat(testSet)[:,1].T))  # correlation: prediction vs truth
    fig=plt.figure()
    ax=fig.add_subplot(111)
    ax.scatter(array(dataSet)[:,0],array(dataSet)[:,1],c='cyan',marker='o')
    plt.show()
# Runs immediately when this script is executed/imported.
run()