# CART.py
# encoding=utf8
import numpy as np
import matplotlib.pyplot as plt
from random import randrange
from FileReader import loadDataForCART
from FileWriter import writeByJson
from collections import defaultdict, Counter
from Metric import *
DISCRETE_FLAG = 0
CONTINUOUS_FLAG = 1
class Node:
    """Binary decision-tree node.

    Internal nodes describe a split via (fea, val, flag); leaves carry the
    prediction in ``res``.  ``left``/``right`` are child subtrees (None on
    leaves).
    """

    def __init__(self, fea=None, val=None, res=None, flag=None, left=None, right=None):
        # Split definition (None on leaf nodes).
        self.fea = fea
        self.val = val
        self.flag = flag
        # Prediction stored on leaves (None on internal nodes).
        self.res = res
        # Child subtrees.
        self.left = left
        self.right = right


class CART:
    """Classification And Regression Tree.

    Supports discrete features (split by equality) and continuous features
    (split by a <= threshold); the feature type is detected automatically
    from the training data.  Post-pruning on the training set is applied
    after the tree is grown.
    """

    # Feature-type flags, mirrored from the module-level constants so the
    # class is self-contained.
    DISCRETE_FLAG = 0
    CONTINUOUS_FLAG = 1

    def __init__(self, epsilon=1e-3, minSample=1, maxDepth=20, task='classify'):
        """
        Parameters
        ----------
        epsilon : float
            Minimum impurity decrease required to keep splitting.
        minSample : int
            Minimum number of samples in a node before it becomes a leaf.
        maxDepth : int
            Maximum tree depth.
        task : str
            Either 'classify' or 'regression'.
        """
        self.modelName = 'CART'
        self.epsilon = epsilon
        self.minSample = minSample
        self.tree = None
        self.maxDepth = maxDepth
        self.task = task
        # Select the impurity / leaf-value / pruning-error functions matching
        # the requested task.
        if task == 'classify':
            self.impurityFunc = self.getGini
            self.resCalcFunc = self.calcNodeClassifyResult
            self.getNoMergeErrorFunc = self.classifyByNode
            self.getMergeErrorFunc = self.classifyByMajority
        elif task == 'regression':
            self.impurityFunc = self.getMSE
            self.resCalcFunc = self.calcNodeRegressionResult
            self.getNoMergeErrorFunc = self.regressByNode
            self.getMergeErrorFunc = self.regressByMajority

    def __discreteOrContinuous(self, xData, feaUnique, beta=0.2):
        """Heuristically decide whether a feature is continuous.

        A feature is treated as continuous when it has more than
        beta * nSamples distinct values.  Returns CONTINUOUS_FLAG (1) or
        DISCRETE_FLAG (0).
        """
        if np.shape(feaUnique)[0] > beta * (np.shape(xData)[0]):
            return self.CONTINUOUS_FLAG
        else:
            return self.DISCRETE_FLAG

    def __getSplits(self, feaArray):
        """Candidate binary split points for a continuous feature: the
        midpoints between consecutive sorted values."""
        feaArray = np.sort(feaArray)
        return (feaArray[:-1] + feaArray[1:]) / 2.0

    def getGini(self, yData):
        """Gini impurity of a label vector."""
        c = Counter(yData)
        n = float(yData.shape[0])
        return 1 - sum((cnt / n) ** 2 for cnt in c.values())

    def getMSE(self, yData):
        """Mean squared deviation (variance) of a target vector.

        BUGFIX: the original divided by a Counter object instead of the
        sample count, raising a TypeError for every regression tree.
        """
        n = float(yData.shape[0])
        return np.sum(np.power(yData, 2)) / n - (np.sum(yData) / n) ** 2

    def __getFeaGini(self, set1, set2):
        # Classification-only variant of __getFeaImpurity; kept for
        # backward compatibility.
        num = set1.shape[0] + set2.shape[0]
        return float(set1.shape[0]) / num * self.getGini(set1) + float(set2.shape[0]) / num * self.getGini(set2)

    def __getFeaImpurity(self, set1, set2):
        """Sample-weighted impurity of a binary partition."""
        num = set1.shape[0] + set2.shape[0]
        return float(set1.shape[0]) / num * self.impurityFunc(set1) + \
               float(set2.shape[0]) / num * self.impurityFunc(set2)

    def calcNodeClassifyResult(self, yData):
        """Leaf value for classification: the majority label."""
        return Counter(yData).most_common(1)[0][0]

    def calcNodeRegressionResult(self, yData):
        """Leaf value for regression: the mean target."""
        return np.mean(yData)

    def bestSplit(self, splitSets, xData, yData):
        """Pick the split in *splitSets* minimising the weighted impurity.

        Returns (bestSplit, (leftIndices, rightIndices), minImpurity, flag);
        bestSplit is None when no split reduces impurity by at least epsilon.
        """
        preImpurity = self.impurityFunc(yData)
        # Map each candidate split (featureIndex, value, flag) to the indices
        # of the samples that satisfy it.
        subdataInds = defaultdict(list)
        for split in splitSets:
            for index, sample in enumerate(xData):
                if split[2] == self.CONTINUOUS_FLAG:
                    if sample[split[0]] <= split[1]:
                        subdataInds[split].append(index)
                elif split[2] == self.DISCRETE_FLAG:
                    if sample[split[0]] == split[1]:
                        subdataInds[split].append(index)
        # BUGFIX: start from +inf instead of 1 -- regression MSE can exceed 1,
        # in which case no split would ever have been selected.
        minPurity = float('inf')
        bestSplit = None
        bestSet = None
        flag = None
        for split, index in subdataInds.items():
            set1 = yData[index]
            set2Ind = list(set(range(yData.shape[0])) - set(index))
            set2 = yData[set2Ind]
            if set1.shape[0] < 1 or set2.shape[0] < 1:
                # Degenerate split: one side empty.
                continue
            newImpurity = self.__getFeaImpurity(set1, set2)
            if newImpurity < minPurity:
                minPurity = newImpurity
                bestSplit = split
                bestSet = (index, set2Ind)
                flag = split[2]
        if abs(minPurity - preImpurity) < self.epsilon:
            # Impurity gain too small: stop splitting here.
            bestSplit = None
        # TODO(review): feature importance (preImpurity - minPurity) could be
        # accumulated here, as the original code sketched.
        return bestSplit, bestSet, minPurity, flag

    def build(self, splitSets, xData, yData, currentDepth):
        """Recursively grow the tree; returns the subtree root Node."""
        # Emit a leaf when the node is too small or the depth limit is hit.
        if (yData.shape[0] < self.minSample) or (currentDepth == self.maxDepth):
            return Node(res=self.resCalcFunc(yData))
        bestSplit, bestSet, minImpurity, flag = self.bestSplit(splitSets, xData, yData)
        if bestSplit is None:
            return Node(res=self.resCalcFunc(yData))
        else:
            # NOTE(review): splitSets is shared between siblings, so a split
            # consumed in the left subtree is unavailable in the right one --
            # historical behaviour, preserved as-is.
            splitSets.remove(bestSplit)
            left = self.build(splitSets, xData=xData[bestSet[0]], yData=yData[bestSet[0]],
                              currentDepth=currentDepth + 1)
            right = self.build(splitSets, xData=xData[bestSet[1]], yData=yData[bestSet[1]],
                               currentDepth=currentDepth + 1)
            return Node(fea=bestSplit[0], val=bestSplit[1], flag=flag, right=right, left=left)

    def preorderTraversal(self, currentNode, dictWrite, depth, verbose=False):
        """Serialise the tree into nested dicts via preorder traversal.

        BUGFIX: *verbose* is now propagated to the recursive calls so the
        whole tree (not only the root) is printed when requested.
        """
        if currentNode is None:
            return
        if verbose:
            print("---" * depth, "[depth:{} feature:{}, val:{}, res:{}]".
                  format(depth, currentNode.fea, currentNode.val, currentNode.res))
        dictWrite['fea'] = currentNode.fea
        dictWrite['val'] = currentNode.val
        dictWrite['res'] = currentNode.res
        dictWrite['flag'] = currentNode.flag
        if currentNode.left is not None:
            dictWrite['left'] = {}
            self.preorderTraversal(currentNode.left, dictWrite['left'], depth + 1, verbose)
        else:
            dictWrite['left'] = None
        if currentNode.right is not None:
            dictWrite['right'] = {}
            self.preorderTraversal(currentNode.right, dictWrite['right'], depth + 1, verbose)
        else:
            dictWrite['right'] = None

    def fit(self, xData, yData, randomFeature=False, nFeatures=None, mustReservedInd=None):
        """
        Fit the input data and train the model.

        Parameters
        ----------
        xData : numpy.ndarray
            Feature set of input samples
        yData : numpy.ndarray
            Label set of input samples
        randomFeature : bool, optional
            Whether the random jitter of the feature is adopted
        nFeatures : int, optional
            Number of randomly sampled features, which used while randomFeature is True. If nFeatures is None,
            the value of the nFeatures will be calculated according to a specific formula.
        mustReservedInd : int, optional
            The feature index must be added to the training set, which used while randomFeature is True.
        """
        if not isinstance(xData, np.ndarray):
            print("xData must be type of numpy.ndarray")
            return
        if not isinstance(yData, np.ndarray):
            print("yData must be type of numpy.ndarray")
            return
        splitsSet = []
        features = set()
        # Random feature subsampling (used by random forests).
        if randomFeature:
            if nFeatures is None:
                # Conventional default: sqrt of the feature count.
                nFeatures = int(np.round(np.sqrt(np.shape(xData)[1])))
            while len(features) < nFeatures:
                # NOTE(review): sampling starts at 1 -- feature 0 appears to be
                # reserved (the "current link" id) and is only added through
                # mustReservedInd; confirm against the data layout.
                ind = randrange(1, np.shape(xData)[1])
                features.add(ind)
            if mustReservedInd is not None:
                if not isinstance(mustReservedInd, int):
                    print("mustReservedInd must be of type int or None")
                    return
                features.add(mustReservedInd)
        else:
            features = set(range(np.shape(xData)[1]))
        for feaInd in features:
            # All distinct values taken by this feature.
            feaUnique = np.unique(xData[:, feaInd])
            # Decide automatically whether the feature is continuous.
            flag = self.__discreteOrContinuous(xData, feaUnique)
            if feaUnique.shape[0] < 2:
                # Constant feature: nothing to split on.
                continue
            elif feaUnique.shape[0] == 2:
                if flag == self.CONTINUOUS_FLAG:
                    midVal = (feaUnique[0] + feaUnique[1]) / 2.0
                    splitsSet.append((feaInd, midVal, flag))
                elif flag == self.DISCRETE_FLAG:
                    splitsSet.append((feaInd, feaUnique[0], flag))
            else:
                if flag == self.CONTINUOUS_FLAG:
                    # Continuous: candidate thresholds are midpoints; discrete
                    # features use each raw value as an equality split.
                    feaUnique = self.__getSplits(feaUnique)
                for val in feaUnique:
                    splitsSet.append((feaInd, val, flag))
        self.tree = self.build(splitsSet, xData, yData, 0)
        # Post-pruning on the (labelled) training data.
        dataSet = np.hstack((xData, yData.reshape(-1, 1)))
        self.prune(dataSet, self.tree)
        return self

    def findRes(self, data, tree):
        """Walk the tree for one sample and return the leaf prediction."""
        if tree.res is not None:
            return tree.res
        else:
            if tree.flag == self.CONTINUOUS_FLAG:
                if data[tree.fea] <= tree.val:
                    return self.findRes(data, tree.left)
                elif data[tree.fea] > tree.val:
                    return self.findRes(data, tree.right)
            elif tree.flag == self.DISCRETE_FLAG:
                if data[tree.fea] == tree.val:
                    return self.findRes(data, tree.left)
                else:
                    return self.findRes(data, tree.right)

    def predict(self, xData):
        """Predict the label / value for a single sample."""
        return self.findRes(xData, self.tree)

    # ====================== post-pruning ======================
    def __binSplit(self, dataSet, node):
        """Partition labelled rows according to *node*'s split."""
        fea = node.fea
        val = node.val
        flag = node.flag
        if flag == self.DISCRETE_FLAG:
            leftSet = dataSet[np.nonzero(dataSet[:, fea] == val)[0], :]
            rightSet = dataSet[np.nonzero(dataSet[:, fea] != val)[0], :]
        elif flag == self.CONTINUOUS_FLAG:
            leftSet = dataSet[np.nonzero(dataSet[:, fea] <= val)[0], :]
            rightSet = dataSet[np.nonzero(dataSet[:, fea] > val)[0], :]
        return leftSet, rightSet

    def classifyByNode(self, dataSet, node):
        """Number of rows of *dataSet* misclassified by the subtree *node*."""
        errorCount = 0
        for data in dataSet:
            res = self.findRes(data, node)
            if res != data[-1]:
                errorCount += 1
        return errorCount

    def regressByNode(self, dataSet, node):
        """Mean absolute error of the subtree *node* on *dataSet*."""
        sumError = 0
        dataSize = dataSet.shape[0]
        for data in dataSet:
            res = self.findRes(data, node)
            sumError += np.abs(res - data[-1])
        return sumError / dataSize

    def classifyByMajority(self, dataSet):
        """(error count, majority label) if the node were collapsed to a leaf."""
        errorCount = 0
        majLabel = Counter(dataSet[:, -1]).most_common(1)[0][0]
        for data in dataSet:
            if majLabel != data[-1]:
                errorCount += 1
        return errorCount, majLabel

    def regressByMajority(self, dataSet):
        """(mean absolute error, mean value) if collapsed to a leaf."""
        sumError = 0
        dataSize = dataSet.shape[0]
        meanValue = np.mean(dataSet[:, -1])
        for data in dataSet:
            sumError += np.abs(data[-1] - meanValue)
        return sumError / dataSize, meanValue

    def prune(self, testSet, node, verbose=False):
        """Bottom-up post-pruning; each row of *testSet* ends with its label.

        A split whose merged error is no worse than its split error on
        *testSet* is collapsed into a leaf.
        """
        if verbose:
            print("pruning now=====>")
        if len(testSet) == 0:
            return
        lSet = None
        rSet = None
        # Split the evaluation rows only when some child subtree needs them.
        if (node.left and node.left.res is None) or (node.right and node.right.res is None):
            lSet, rSet = self.__binSplit(testSet, node)
        if node.left and node.left.res is None:
            self.prune(lSet, node.left, verbose)
        if node.right and node.right.res is None:
            self.prune(rSet, node.right, verbose)
        # Both children are leaves: try to merge them into this node.
        if (node.left and node.left.res is not None) and (node.right and node.right.res is not None):
            errorNoMerge = self.getNoMergeErrorFunc(testSet, node)
            errorMerge, label = self.getMergeErrorFunc(testSet)
            if errorMerge <= errorNoMerge:
                node.res = label
                node.left = None
                node.right = None
                node.fea = None
                node.val = None
                node.flag = None
def main():
    """Train a single CART on the switch/no-switch data, dump the tree to
    JSON and classify one hand-written sample."""
    features, labels = loadDataForCART('switch.txt', 'no_switch.txt', 0)
    xData = np.array(features)
    yData = np.array(labels)
    model = CART(minSample=5)
    model.fit(np.array(xData), np.array(yData))
    treeDict = {}
    model.preorderTraversal(model.tree, treeDict, 0)
    sample = [-1.0, 363.0, 168.0, 42.0, 2.0, 187.0, 3.0, 254.0, 361.0, 54.0, 165.0, 4.0, 0.0, -55.0, 47.0, 69.0, 367.0,
              274.0, 0.0, 249.0]
    print("res is: ", model.predict(sample))
    writeByJson(treeDict)


if __name__ == "__main__":
    main()
# RandomForest.py
# encoding=utf8
import numpy as np
from random import randrange
from collections import Counter
from CART import CART
import multiprocessing as mp
class RandomForest:
    """Bagging ensemble of CART trees trained in parallel worker processes."""

    def __init__(self, nTrees, epsilon=1e-3, minSample=1, maxDepth=20, task='classify'):
        """
        Parameters
        ----------
        nTrees : int
            Number of trees in the forest.
        epsilon, minSample, maxDepth :
            Forwarded to each underlying CART.
        task : str
            'classify' (majority vote) or 'regression' (mean prediction).
        """
        self.epsilon = epsilon
        self.minSample = minSample
        self.maxDepth = maxDepth
        # the number of trees in random forest
        self.nTrees = nTrees
        # a list of random forest
        self.trees = []
        self.task = task
        # Training configuration cached on self so singleTreeFit can be
        # dispatched through multiprocessing.
        self.xData = None
        self.yData = None
        self.randomFeature = None
        self.subSampleRatio = None
        self.nFeatures = None
        self.mustReservedInd = None
        if task == 'classify':
            self.getResFunc = self.getResultForClassification
        elif task == 'regression':
            self.getResFunc = self.getResultForRegression

    def __subsample(self, features, labels, ratio):
        """Bootstrap-sample (with replacement) round(len * ratio) rows."""
        subSet = []
        subLabel = []
        numSample = round(len(features) * ratio)
        # BUGFIX: '<' keeps exactly numSample rows (the old '<=' produced
        # one extra sample).
        while len(subSet) < numSample:
            randomIndex = randrange(len(features))
            subSet.append(features[randomIndex])
            subLabel.append(labels[randomIndex])
        return np.array(subSet), np.array(subLabel)

    def getResultForClassification(self, resList):
        """Aggregate per-tree votes by majority."""
        return Counter(resList).most_common(1)[0][0]

    def getResultForRegression(self, resList):
        """Aggregate per-tree predictions by mean."""
        return np.mean(resList)

    def singleTreeFit(self, tree):
        """Fit one tree on a fresh bootstrap sample (multiprocessing target)."""
        subSet, subLabel = self.__subsample(self.xData, self.yData, self.subSampleRatio)
        return tree.fit(subSet, subLabel, randomFeature=self.randomFeature,
                        nFeatures=self.nFeatures, mustReservedInd=self.mustReservedInd)

    def fit(self, xData, yData, randomFeature=False, subSampleRatio=0.8, nFeatures_=None, mustReservedInd=None, nJobs=None):
        """Train nTrees CARTs in parallel on bootstrap samples of the data."""
        self.xData = xData
        self.yData = yData
        self.randomFeature = randomFeature
        self.subSampleRatio = subSampleRatio
        self.nFeatures = nFeatures_
        self.mustReservedInd = mustReservedInd
        try:
            workers = mp.cpu_count()
        except NotImplementedError:
            workers = 1
        if nJobs:
            workers = nJobs
        # BUGFIX: reset so repeated fit() calls don't accumulate stale trees.
        self.trees = []
        for i in range(self.nTrees):
            self.trees.append(CART(self.epsilon, self.minSample, self.maxDepth, self.task))
        pool = mp.Pool(processes=workers)
        try:
            # BUGFIX: the pool was previously never closed (leaked workers).
            self.trees = list(pool.map(self.singleTreeFit, self.trees))
        finally:
            pool.close()
            pool.join()

    def predict(self, xData):
        """Predict one sample by aggregating every tree's prediction.

        BUGFIX: uses the task-appropriate aggregator; majority voting was
        hard-coded before, which broke regression forests.
        """
        resList = [tree.predict(xData) for tree in self.trees]
        return self.getResFunc(resList)
# Metric.py
# encoding=utf8
import numpy as np
from queue import PriorityQueue
class Metric:
    """Evaluate a fitted predictor with a selectable quality criterion."""

    def __init__(self, predictor, evalType='error_rate', **kwargs):
        """
        Parameters
        ----------
        predictor: Object
            Specific predictor object, such as CART; must expose predict().
        evalType: string
            Specific criteria for measuring model quality, such as error_rate,
            accuracy, precision, recall, f1, f_beta, auc
        kwargs:
            Extra criterion parameters (beta for f_beta; posLabelValue /
            negLabelValue for auc).
        """
        self.predictor = predictor
        self.evalType = evalType
        self.kwargs = kwargs

    def __calcErrorRate(self, features, labels, verbose=False):
        """Fraction of samples the predictor misclassifies."""
        error = 0
        for index, feature in enumerate(features):
            res = self.predictor.predict(feature)
            if res != labels[index]:
                error += 1
                if verbose:
                    print("[wrong data info] {} truth:{} predict:{}".
                          format(list(feature), labels[index], res))
        errorRate = error / float(len(features))
        if verbose:
            print("error rate: %f %%" % (errorRate * 100))
        return errorRate

    def __calcPrecisionAndRecall(self, features, labels, positiveLabelValue=1, verbose=False):
        """Return [precision, recall, errorRate] w.r.t. positiveLabelValue."""
        if not isinstance(features, np.ndarray):
            print("features must be type of numpy.ndarray")
            return
        if not isinstance(labels, np.ndarray):
            print("labels must be type of numpy.ndarray")
            return
        TP = 0
        FP = 0
        error = 0
        totalCount = len(features)
        totalPositiveCount = len(labels[labels == positiveLabelValue])
        for index, feature in enumerate(features):
            res = self.predictor.predict(feature)
            if res != labels[index]:
                error += 1
                if verbose:
                    print("[wrong data info] {} truth:{} predict:{}".
                          format(list(feature), labels[index], res))
            # BUGFIX: compare against positiveLabelValue (was hard-coded 1).
            if res == positiveLabelValue:
                if res == labels[index]:
                    TP += 1
                else:
                    FP += 1
        errorRate = error / float(len(features))
        # Guard zero denominators instead of raising ZeroDivisionError.
        precision = TP / float(TP + FP) if (TP + FP) > 0 else 0.0
        recall = TP / float(totalPositiveCount) if totalPositiveCount > 0 else 0.0
        if verbose:
            print("total:{0}, pTotal:{1}, TP:{2}, FP:{3}, precision:{4}, recall:{5}, errorRate:{6}%%".
                  format(totalCount, totalPositiveCount, TP, FP, precision, recall, errorRate))
        return [precision, recall, errorRate]

    def __calcF1Score(self, features, labels, verbose=False):
        """Harmonic mean of precision and recall."""
        # BUGFIX: False was previously passed positionally and silently
        # consumed as positiveLabelValue; pass by keyword instead.
        precision, recall, errorRate = self.__calcPrecisionAndRecall(features, labels, verbose=False)
        denom = precision + recall
        f1 = 2 * precision * recall / denom if denom > 0 else 0.0
        if verbose:
            print("precision:{0}, recall:{1}, f1:{2}".format(precision, recall, f1))
        return f1

    def __calcFBeta(self, features, labels, beta=1, verbose=False):
        """Weighted F-measure; beta > 1 favours recall, beta < 1 precision."""
        precision, recall, errorRate = self.__calcPrecisionAndRecall(features, labels, verbose=False)
        beta2 = np.power(beta, 2)
        denom = beta2 * precision + recall
        fBeta = (1 + beta2) * precision * recall / denom if denom > 0 else 0.0
        if verbose:
            print("beta:{0}, precision:{1}, recall:{2}, fBeta:{3}".format(beta, precision, recall, fBeta))
        return fBeta

    def __calcAUC(self, features, labels, posLabelValue=1, negLabelValue=-1, verbose=False):
        """Rank-based AUC; the predictor must emit a non-negative score for
        the positive class of a binary problem."""
        prioQueue = PriorityQueue()
        # BUGFIX: enumerate was missing, so each sample row itself was being
        # unpacked as (index, feature).
        for index, feature in enumerate(features):
            prob = self.predictor.predict(feature)
            if prob < 0:
                print("the probability calculated by predictor{} is {}, which must be non-zero value".
                      format(self.predictor.modelName if hasattr(self.predictor, 'modelName') else 'Unknown', prob))
                return
            # PriorityQueue is a min-heap; negate so the highest score pops first.
            prioQueue.put((-prob, index))
        n0 = labels[labels == negLabelValue].shape[0]
        n1 = len(labels) - n0
        rank = len(labels)
        rankScoreSum = 0
        while not prioQueue.empty():
            negProb, index = prioQueue.get()
            if labels[index] == posLabelValue:
                rankScoreSum += rank
            rank -= 1
        auc = rankScoreSum - n1 * (n1 + 1) / 2
        auc = auc / (n0 * n1)
        if verbose:
            print("auc:{0}, rankScoreSum:{1}".format(auc, rankScoreSum))
        return auc

    def __executeAccordingToType(self, features, labels, evalType, verbose=False, **kwargs):
        """Dispatch to the metric implementation selected by evalType."""
        if evalType == 'error_rate':
            return self.__calcErrorRate(features, labels, verbose)
        elif evalType == 'accuracy':
            return 1 - self.__calcErrorRate(features, labels, verbose)
        elif evalType == 'precision' or evalType == 'recall':
            # BUGFIX: verbose was passed positionally into positiveLabelValue.
            return self.__calcPrecisionAndRecall(features, labels, verbose=verbose)
        elif evalType == 'f1':
            return self.__calcF1Score(features, labels, verbose)
        elif evalType == 'f_beta':
            beta = kwargs.get('beta', None)
            if beta is None:
                print("beta must be configured if fBeta used")
                return
            return self.__calcFBeta(features, labels, beta, verbose)
        elif evalType == 'auc':
            posLabelValue = kwargs.get('posLabelValue', 1)
            negLabelValue = kwargs.get('negLabelValue', -1)
            return self.__calcAUC(features, labels, posLabelValue=posLabelValue,
                                  negLabelValue=negLabelValue, verbose=verbose)

    def eval(self, features, labels, verbose=False):
        """Evaluate the predictor on (features, labels) using self.evalType."""
        if not isinstance(features, np.ndarray):
            print("features must be type of numpy.ndarray")
            return
        if not isinstance(labels, np.ndarray):
            print("labels must be type of numpy.ndarray")
            return
        if isinstance(self.evalType, str):
            return self.__executeAccordingToType(features, labels, self.evalType, verbose, **self.kwargs)
# Validation.py
# encoding=utf8
import numpy as np
from copy import deepcopy
def splitBigDataToKFold(samples, kFold=2, shuffle=False):
    """Generate k-fold train/test splits of a labelled sample matrix.

    The last column of *samples* is treated as the label.  Yields one
    (trainX, trainY, testX, testY) tuple per fold.  When the sample count
    is not divisible by kFold, the trailing remainder rows only ever appear
    in training sets.
    """
    if isinstance(samples, np.ndarray) is False:
        print("input samples must be type of numpy.ndarray")
        return
    sampleSetCopy = samples
    if shuffle:
        # Shuffle a copy so the caller's array is not reordered in place.
        sampleSetCopy = deepcopy(samples)
        np.random.shuffle(sampleSetCopy)
    m, n = np.shape(samples)
    if m <= 0 or n <= 0:
        print("input samples is empty")
        return
    numOfEachFold = m // kFold
    totalNum = len(samples)
    if totalNum <= numOfEachFold:
        # Only possible when kFold <= 1; the old message stated the
        # comparison backwards.
        print("kFold must be at least 2 so that each fold is smaller than the whole sample set")
        return
    startIndex = 0
    endIndex = totalNum - 1
    # Slide a test window of numOfEachFold rows over the data; everything
    # outside the window is the training set.
    while startIndex + numOfEachFold - 1 <= endIndex:
        nextIndex = startIndex + numOfEachFold
        testSet = sampleSetCopy[startIndex:nextIndex]
        if startIndex == 0:
            trainingSet = sampleSetCopy[nextIndex:]
        else:
            trainingSet = np.vstack((sampleSetCopy[:startIndex], sampleSetCopy[nextIndex:]))
        startIndex = nextIndex
        yield trainingSet[:, :-1], trainingSet[:, -1], testSet[:, :-1], testSet[:, -1]
# FileReader.py
# encoding=utf8
import os
import numpy as np
import pandas as pd
from Utils import *
def loadDataForCART(switchFileName, noSwitchFileName, mode=OUTPUT_BLEND_MODE):
    """Load the switch (label 1.0) and no-switch (label -1.0) sample files
    from the Data/ directory next to the working directory.

    In OUTPUT_BLEND_MODE the label is appended to each feature row and only
    ``features`` is returned; in OUTPUT_SEPARATE_MODE the labels are kept in
    a parallel list and (features, labels) is returned.
    """
    features = []
    labels = []
    # os.path.join keeps this portable (the old code hard-coded backslashes).
    path = os.path.join(os.getcwd(), 'Data')

    def readSampleFile(fileName, labelValue):
        # Parse one whitespace-separated sample per line and attach the
        # label according to *mode*; returns the number of rows read.
        count = 0
        # with-block guarantees the file is closed even on a parse error.
        with open(os.path.join(path, fileName)) as f:
            for line in f:
                lineArray = list(map(float, line.strip().split(" ")))
                if mode == OUTPUT_BLEND_MODE:
                    lineArray.append(labelValue)
                elif mode == OUTPUT_SEPARATE_MODE:
                    labels.append(labelValue)
                features.append(lineArray)
                count += 1
        return count

    switchFileSampleCounts = readSampleFile(switchFileName, 1.0)
    noSwitchFileSampleCounts = readSampleFile(noSwitchFileName, -1.0)
    print("sample num is {}, {}'s number is {}, {}'s number is {}".
          format(len(features), switchFileName,
                 switchFileSampleCounts, noSwitchFileName, noSwitchFileSampleCounts))
    if mode == OUTPUT_BLEND_MODE:
        return features
    elif mode == OUTPUT_SEPARATE_MODE:
        return features, labels
def readFromCSV(path, mode=OUTPUT_BLEND_MODE):
    """Load every 'mobile_export*' CSV under *path* into one array.

    The id / rx_speed / tx_speed columns are dropped.  In
    OUTPUT_SEPARATE_MODE the last column is returned separately as labels.
    Returns None (with a message) when no matching file exists, instead of
    crashing on a None DataFrame.
    """
    dropCols = ['id', 'rx_speed', 'tx_speed']
    frames = []
    for f in os.listdir(path):
        if f.startswith('mobile_export'):
            # sep passed by keyword: the positional form is deprecated.
            frames.append(pd.read_csv(os.path.join(path, f), sep=',').drop(dropCols, axis=1))
    if not frames:
        print("no 'mobile_export*' csv files found under {}".format(path))
        return None
    df = pd.concat(frames, ignore_index=True)
    if mode == OUTPUT_BLEND_MODE:
        return df.to_numpy()
    elif mode == OUTPUT_SEPARATE_MODE:
        tmp = df.to_numpy()
        return tmp[:, :-1], tmp[:, -1]
if __name__ == "__main__":
    # Quick sanity check: load the CSV exports and report the class balance.
    dataDir = '{0}\\Data\\'.format(os.getcwd())
    features, labels = readFromCSV(dataDir, mode=OUTPUT_SEPARATE_MODE)
    print("total sample count:{}, positive sample count:{}, negative sample count:{}".
          format(len(labels), len(labels[labels == 1]), len(labels[labels == -1])))
# FileWriter.py
# encoding=utf8
import os
import numpy as np
import json
class NpEncoder(json.JSONEncoder):
    """JSON encoder that understands NumPy scalars and arrays."""

    def default(self, obj):
        # np.int64 (and every other sized int) is a subclass of np.integer,
        # so one check covers them all -- the old dedicated np.int64 branch
        # was unreachable.
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)
def writeByJson(data, filename="CART.json"):
    """Serialize *data* to Data/<filename> in the working directory.

    Lists are written one JSON object per line (JSON Lines); any other
    value is written as a single JSON document.  NumPy values are handled
    via NpEncoder.
    """
    # os.path.join keeps this portable (the old code hard-coded backslashes).
    data_path = os.path.join(os.getcwd(), 'Data', filename)
    # Mode 'w' truncates any existing file, replacing the old
    # remove-then-append dance.
    with open(data_path, 'w') as f:
        if isinstance(data, list):
            for item in data:
                jsonStr = json.dumps(item, cls=NpEncoder, ensure_ascii=False)
                f.write(jsonStr + '\n')
        else:
            jsonStr = json.dumps(data, cls=NpEncoder, ensure_ascii=False)
            f.write(jsonStr)
# train.py
import math
from Validation import splitBigDataToKFold
import RandomForest as rf
import numpy as np
from FileWriter import writeByJson
from FileReader import loadDataForCART, readFromCSV
from Metric import Metric
import os
from Utils import *
def crossForRF():
    """5-fold cross-validate a random forest on the CSV export data, then
    retrain on the full set and dump the forest as JSON."""
    dataDir = '{0}\\Data\\'.format(os.getcwd())
    dataInput = readFromCSV(dataDir, mode=OUTPUT_BLEND_MODE)
    nTrees = 9
    nFeatures = 5
    trainMinSample = 6
    trainMaxDepth = 15
    epoch = 0
    totalErrorRate = 0
    # Cross-validation over shuffled 5-fold splits.
    for trainSet, trainLabel, testSet, testLabel in splitBigDataToKFold(dataInput, kFold=5, shuffle=True):
        forest = rf.RandomForest(nTrees, minSample=trainMinSample, maxDepth=trainMaxDepth)
        forest.fit(trainSet, trainLabel, randomFeature=False, nFeatures_=nFeatures)
        # Measure the error rate of this fold on its held-out test set.
        curErrorRate = Metric(forest, 'error_rate').eval(testSet, testLabel)
        totalErrorRate += curErrorRate
        epoch += 1
        print("current epoch [{}], trainNum[{}], testNum[{}] errorRate is [{}%]".
              format(epoch, len(trainSet), len(testSet), curErrorRate * 100))
    print("avgErrorRate is [{}%]".format(totalErrorRate / float(epoch) * 100))
    # Final model trained on the complete data set.
    features, labels = readFromCSV(dataDir, mode=OUTPUT_SEPARATE_MODE)
    finalForest = rf.RandomForest(nTrees, minSample=trainMinSample, maxDepth=trainMaxDepth)
    finalForest.fit(features, labels, randomFeature=False, nFeatures_=nFeatures)
    errorRate = Metric(finalForest, 'error_rate').eval(features, labels)
    print("training error rate[%d/%d]: %f%%" % (math.ceil(errorRate * len(features)), len(features), errorRate * 100))
    # Serialise every tree of the forest into nested dicts and persist them.
    Forest = []
    for item in finalForest.trees:
        Tree = {}
        item.preorderTraversal(item.tree, Tree, 0)
        Forest.append(Tree)
    writeByJson(Forest, filename="RandomForest.json")


if __name__ == "__main__":
    crossForRF()