# CART (Classification And Regression Trees)
import matplotlib.pyplot as plt
import numpy as np
"""
Desc:
加载数据
Parameters:
fileName - 文件名
Returns:
dataMat - 数据矩阵
"""
def loadDataSet(fileName):
    """
    Desc:
        Load a tab-separated file of numbers.
    Parameters:
        fileName - path of the file to read
    Returns:
        dataMat - list of rows, each row a list of floats
    Raises:
        OSError - propagated from open() when the file cannot be opened
        ValueError - when a field cannot be parsed as a float
    """
    dataMat = []
    # The context manager closes the handle even if parsing fails part-way.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. a trailing newline) carries no record.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
"""
Desc:
根据特征切分数据集合
Parameters:
dataSet - 数据集合
feature - 待切分的特征
value - 该特征的值
Returns:
mat0 - 切分的数据集合0
mat1 - 切分的数据集合1
"""
def binSplitDataSet(dataSet, feature, value):
    """
    Desc:
        Split a data set in two by thresholding one feature column.
    Parameters:
        dataSet - 2-D data set whose rows are samples
        feature - index of the column to split on
        value - threshold compared against that column
    Returns:
        mat0 - rows whose feature value is greater than value
        mat1 - rows whose feature value is less than or equal to value
    """
    column = dataSet[:, feature]
    # Two explicit comparisons (rather than a negated mask) so that rows
    # satisfying neither test end up in neither half.
    above_rows = np.nonzero(column > value)[0]
    not_above_rows = np.nonzero(column <= value)[0]
    return dataSet[above_rows, :], dataSet[not_above_rows, :]
"""
Desc:
生成叶结点
Parameters:
dataSet - 数据集合
Returns:
目标变量均值
"""
def regLeaf(dataSet):
    """
    Desc:
        Build a leaf node value.
    Parameters:
        dataSet - 2-D data set; the last column is used
    Returns:
        mean of the last column
    """
    targets = dataSet[:, -1]
    return np.mean(targets)
"""
Desc:
误差估计函数
Parameters:
dataSet - 数据集合
Returns:
目标变量的总方差
"""
def regErr(dataSet):
    """
    Desc:
        Error estimate for a data set.
    Parameters:
        dataSet - 2-D data set; the last column is used
    Returns:
        variance of the last column multiplied by the number of rows
        (i.e. the total squared deviation from the mean)
    """
    row_count = np.shape(dataSet)[0]
    targets = dataSet[:, -1]
    return np.var(targets) * row_count
"""
Desc:
找到数据的最佳二元切分方式函数
Parameters:
dataSet - 数据集合
leafType - 生成叶结点的函数
errType - 误差估计函数
ops - 用户定义的参数构成的元组
Returns:
bestIndex - 最佳切分特征
bestValue - 最佳特征值
"""
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """
    Desc:
        Find the best binary split of a data set.
    Parameters:
        dataSet - 2-D data set (np.matrix or ndarray); the last column is
                  the target, every other column is a candidate feature
        leafType - function that builds a leaf value from a data set
        errType - function that measures the error of a data set
        ops - tuple (tolS, tolN): tolS is the minimum error reduction a
              split must achieve, tolN the minimum number of rows allowed
              on each side of a split
    Returns:
        bestIndex - index of the best split feature, or None when the
                    data set should become a leaf
        bestValue - split threshold, or the leaf value when bestIndex
                    is None
    """
    tolS = ops[0]
    tolN = ops[1]
    # np.asarray(...).ravel() flattens the column whether it is an (m, 1)
    # np.matrix slice or a 1-D ndarray slice, so both input types work.
    targets = np.asarray(dataSet[:, -1]).ravel().tolist()
    if len(set(targets)) == 1:
        # Every target is identical: nothing to gain from splitting.
        return None, leafType(dataSet)
    n = np.shape(dataSet)[1]
    S = errType(dataSet)
    bestS = float('inf')
    bestIndex = 0
    bestValue = 0
    for featIndex in range(n - 1):
        candidates = set(np.asarray(dataSet[:, featIndex]).ravel().tolist())
        for splitVal in candidates:
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if np.shape(mat0)[0] < tolN or np.shape(mat1)[0] < tolN:
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # bestS is still inf when no candidate passed the tolN check, which
    # makes the reduction -inf and falls through to a leaf here.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if np.shape(mat0)[0] < tolN or np.shape(mat1)[0] < tolN:
        return None, leafType(dataSet)
    return bestIndex, bestValue
"""
Desc:
树构建函数
Parameters:
dataSet - 数据集合
leafType - 生成叶结点的函数
errType - 误差估计函数
ops - 用户定义的参数构成的元组
Returns:
retTree - 构建的回归树
"""
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """
    Desc:
        Recursively build a tree.
    Parameters:
        dataSet - data set to fit
        leafType - function that builds a leaf value from a data set
        errType - function that measures the error of a data set
        ops - tuple of user-defined parameters forwarded to chooseBestSplit
    Returns:
        retTree - the leaf value when no split is chosen; otherwise a dict
                  with keys 'spInd' (split feature index), 'spVal' (split
                  value), 'left' and 'right' (sub-trees built from the two
                  halves returned by binSplitDataSet)
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # None is the "make a leaf" sentinel; test it by identity, not equality.
    if feat is None:
        return val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }
"""
Desc:
绘制数据集
Parameters:
fileName - 文件名
Returns:
None
"""
def plotDataSet(filename):
    """
    Desc:
        Scatter-plot a data file read with loadDataSet.
    Parameters:
        filename - name of the file to load
    Returns:
        None
    """
    rows = loadDataSet(filename)
    # Column 0 goes on the x axis, column 1 on the y axis.
    xcord = [row[0] for row in rows]
    ycord = [row[1] for row in rows]
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(xcord, ycord, s=20, c='blue', alpha=0.5)
    plt.title('DataSet')
    plt.xlabel('X')
    plt.show()
if __name__ == '__main__':
    # Demo: fit a tree to ex2.txt and print the resulting nested dict.
    myData = loadDataSet('ex2.txt')
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same conversion
    # under a name that exists in both old and new releases, and it keeps
    # the np.matrix type the original call produced.
    myMat = np.asmatrix(myData)
    print(createTree(myMat))
import matplotlib.pyplot as plt
import numpy as np
"""
Desc:
加载数据
Parameters:
fileName - 文件名
Returns:
dataMat - 数据矩阵
"""
def loadDataSet(fileName):
    """
    Desc:
        Load a tab-separated file of numbers.
    Parameters:
        fileName - path of the file to read
    Returns:
        dataMat - list of rows, each row a list of floats
    Raises:
        OSError - propagated from open() when the file cannot be opened
        ValueError - when a field cannot be parsed as a float
    """
    dataMat = []
    # The context manager closes the handle even if parsing fails part-way.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. a trailing newline) carries no record.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
"""
Desc:
绘制数据集
Parameters:
fileName - 文件名
Returns:
None
"""
def plotDataSet(filename):
    """
    Desc:
        Scatter-plot a data file read with loadDataSet.
    Parameters:
        filename - name of the file to load
    Returns:
        None
    """
    rows = loadDataSet(filename)
    # Column 1 goes on the x axis, column 2 on the y axis; column 0 is
    # not plotted.
    xcord = [row[1] for row in rows]
    ycord = [row[2] for row in rows]
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(xcord, ycord, s=20, c='blue', alpha=0.5)
    plt.title('DataSet')
    plt.xlabel('X')
    plt.show()
if __name__ == '__main__':
    # Script entry point: scatter-plot the data stored in ex0.txt.
    filename = 'ex0.txt'
    plotDataSet(filename)
import matplotlib.pyplot as plt
import numpy as np
import types
"""
Desc:
加载数据
Parameters:
fileName - 文件名
Returns:
dataMat - 数据矩阵
"""
def loadDataSet(fileName):
    """
    Desc:
        Load a tab-separated file of numbers.
    Parameters:
        fileName - path of the file to read
    Returns:
        dataMat - list of rows, each row a list of floats
    Raises:
        OSError - propagated from open() when the file cannot be opened
        ValueError - when a field cannot be parsed as a float
    """
    dataMat = []
    # The context manager closes the handle even if parsing fails part-way.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. a trailing newline) carries no record.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
"""
Desc:
绘制数据集
Parameters:
fileName - 文件名
Returns:
None
"""
def plotDataSet(filename):
    """
    Desc:
        Scatter-plot a data file read with loadDataSet.
    Parameters:
        filename - name of the file to load
    Returns:
        None
    """
    rows = loadDataSet(filename)
    # Column 0 goes on the x axis, column 1 on the y axis.
    xcord = [row[0] for row in rows]
    ycord = [row[1] for row in rows]
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(xcord, ycord, s=20, c='blue', alpha=0.5)
    plt.title('DataSet')
    plt.xlabel('X')
    plt.show()
"""
Desc:
根据特征切分数据集合
Parameters:
dataSet - 数据集合
feature - 待切分的特征
value - 该特征的值
Returns:
mat0 - 切分的数据集合0
mat1 - 切分的数据集合1
"""
def binSplitDataSet(dataSet, feature, value):
mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :]
mat1 = dataSet[np