回到C45PruneableClassifierTree.buildClassifier()中
// Grow the full tree. The second argument asks each node to keep a reference
// to its training data when subtree raising is on or cleanup is off.
buildTree(data, m_subtreeRaising || !m_cleanup);
// Fold subtrees into leaves where splitting does not lower the training error.
collapse();
if (m_pruneTheTree) {
// Optional pruning pass, controlled by m_pruneTheTree.
prune();
}
if (m_cleanup) {
// NOTE(review): cleanup() is not visible here; the argument is presumably an
// empty dataset sharing data's header (capacity 0) -- confirm against Instances.
cleanup(new Instances(data, 0));
}
这个buildTree()在父类 ClassifierTree 中
/**
 * Builds the (sub)tree rooted at this node from the given training data.
 *
 * <p>The node state is reset, a local model is chosen by the model selector,
 * and then either one child tree per subset is created (when the model
 * produces more than one subset) or the node is marked as a leaf. A leaf
 * whose data has zero total weight is additionally flagged as empty.
 *
 * @param data the training instances reaching this node
 * @param keepData if {@code true}, a reference to {@code data} is stored in
 *     {@code m_train}
 * @throws Exception if model selection, splitting or child construction fails
 */
public void buildTree(Instances data, boolean keepData) throws Exception {
  if (keepData) {
    m_train = data;
  }

  // Start from a clean, non-leaf, non-empty node without children.
  m_test = null;
  m_isLeaf = false;
  m_isEmpty = false;
  m_sons = null;

  // m_toSelectModel decides which local model (split or no split) to use.
  m_localModel = m_toSelectModel.selectModel(data);

  // A single subset means no split was made: this node becomes a leaf.
  if (m_localModel.numSubsets() <= 1) {
    m_isLeaf = true;
    m_isEmpty = Utils.eq(data.sumOfWeights(), 0);
    return;
  }

  // Partition the data according to the chosen model.
  Instances[] partitions = m_localModel.split(data);
  // Drop the local reference before recursing so it can be reclaimed early.
  data = null;

  m_sons = new ClassifierTree[m_localModel.numSubsets()];
  for (int child = 0; child < m_sons.length; child++) {
    // Each partition is used to grow its own subtree.
    m_sons[child] = getNewTree(partitions[child]);
    partitions[child] = null;
  }
}
ClassifierSplitModel.split()展开
/**
 * Splits the given dataset into {@code m_numSubsets} datasets according to
 * this model.
 *
 * <p>An instance for which {@code whichSubset} returns a non-negative index
 * is added to that subset only. Otherwise it is added to every subset whose
 * entry in {@code weights(instance)} is greater than zero, and the added
 * copy's weight is set to that entry multiplied by the instance's weight.
 *
 * @param data the dataset to split
 * @return one dataset per subset, each compactified
 * @throws Exception if computing the weights or the subset index fails
 */
public final Instances[] split(Instances data) throws Exception {
  final int rowCount = data.numInstances();

  // Allocate every subset with room for all instances; trimmed at the end.
  Instances[] parts = new Instances[m_numSubsets];
  for (int bag = 0; bag < m_numSubsets; bag++) {
    parts[bag] = new Instances(data, rowCount);
  }

  for (int row = 0; row < rowCount; row++) {
    Instance current = data.instance(row);
    double[] share = weights(current);
    int target = whichSubset(current);

    if (target > -1) {
      // The instance belongs to exactly one subset.
      parts[target].add(current);
      continue;
    }

    // No single subset: distribute the instance over all subsets that have
    // a positive weight for it.
    for (int bag = 0; bag < m_numSubsets; bag++) {
      if (Utils.gr(share[bag], 0)) {
        double scaledWeight = share[bag] * current.weight();
        parts[bag].add(current);
        parts[bag].lastInstance().setWeight(scaledWeight);
      }
    }
  }

  for (int bag = 0; bag < m_numSubsets; bag++) {
    parts[bag].compactify();
  }
  return parts;
}
C45PruneableClassifierTree中重写的getNewTree()展开
// Create a child node that reuses this node's model selector and its
// pruning-related settings (m_pruneTheTree, m_CF, m_subtreeRaising, m_cleanup).
C45PruneableClassifierTree newTree =
new C45PruneableClassifierTree(m_toSelectModel, m_pruneTheTree, m_CF,
m_subtreeRaising, m_cleanup);
// Grow the child on its subset of the data; the child keeps a reference to
// that data when subtree raising is on or cleanup is off.
newTree.buildTree((Instances)data, m_subtreeRaising || !m_cleanup);
return newTree;
}
collapse()把一棵子树坍塌成一个叶子结点,前提是这样做不会导致训练错误增加。不同于剪枝,collapse()无法提高精度。
/**
 * Collapses this subtree into a single leaf when splitting does not reduce
 * the training error, and otherwise recurses into the children.
 *
 * <p>The subtree is collapsed when its summed training errors are not
 * smaller, by more than a 1e-3 tolerance, than the errors of this node's
 * own distribution.
 */
public final void collapse() {
  if (m_isLeaf) {
    return; // nothing to collapse
  }

  // Training errors summed over the leaves below this node.
  double subtreeErrors = getTrainingErrors();
  // Training errors of this node's own distribution.
  double nodeErrors = localModel().distribution().numIncorrect();

  if (subtreeErrors >= nodeErrors - 1E-3) {
    // The split does not pay off: drop the children and become a leaf.
    m_sons = null;
    m_isLeaf = true;
    m_localModel = new NoSplit(localModel().distribution());
    return;
  }

  for (int child = 0; child < m_sons.length; child++) {
    son(child).collapse();
  }
}
获取训练错误的方法
/**
 * Computes the training errors of the subtree rooted at this node.
 *
 * @return for a leaf, {@code numIncorrect()} of the local model's
 *     distribution; otherwise the sum of the children's training errors
 */
private double getTrainingErrors() {
  if (m_isLeaf) {
    return localModel().distribution().numIncorrect();
  }

  double total = 0;
  for (int child = 0; child < m_sons.length; child++) {
    total += son(child).getTrainingErrors();
  }
  return total;
}
prune()使用C4.5剪枝法对树进行剪枝
/**
 * Prunes the subtree rooted at this node, working from the leaves towards
 * the root.
 *
 * <p>After pruning all children, three error estimates are compared (each
 * comparison allows a slack of 0.1): the estimate if this node were a leaf,
 * the estimate for the whole subtree, and, only when subtree raising is
 * enabled, the estimate for the largest branch evaluated on this node's
 * training data. The node is then turned into a leaf, replaced by its
 * largest branch (and pruned again), or left unchanged.
 *
 * @throws Exception if an error estimate or the new distribution cannot be
 *     computed
 */
public void prune() throws Exception {
  if (m_isLeaf) {
    return;
  }

  // Children first, so decisions are made bottom-up.
  for (int child = 0; child < m_sons.length; child++) {
    son(child).prune();
  }

  // Index of the largest branch according to the local distribution.
  final int largestIdx = localModel().distribution().maxBag();

  // Without subtree raising the largest branch can never win a comparison.
  final double raisedErrors = m_subtreeRaising
      ? son(largestIdx).getEstimatedErrorsForBranch((Instances) m_train)
      : Double.MAX_VALUE;

  // Estimated error if this node were a leaf.
  final double leafErrors =
      getEstimatedErrorsForDistribution(localModel().distribution());

  // Estimated error of the whole subtree as it stands.
  final double subtreeErrors = getEstimatedErrors();

  final boolean leafBeatsSubtree = Utils.smOrEq(leafErrors, subtreeErrors + 0.1);
  if (leafBeatsSubtree && Utils.smOrEq(leafErrors, raisedErrors + 0.1)) {
    // A leaf is the best choice: discard the children.
    m_sons = null;
    m_isLeaf = true;
    m_localModel = new NoSplit(localModel().distribution());
    return;
  }

  if (Utils.smOrEq(raisedErrors, subtreeErrors + 0.1)) {
    // Raise the largest branch into this node's place, recompute the
    // distributions from this node's training data, then prune the result.
    C45PruneableClassifierTree raised = son(largestIdx);
    m_sons = raised.m_sons;
    m_localModel = raised.localModel();
    m_isLeaf = raised.m_isLeaf;
    newDistribution(m_train);
    prune();
  }
}