DAY61 :决策树(准备工作)
决策树是最经典的机器学习算法. 决策树的基本介绍:
决策树是一种机器学习的方法。决策树的生成算法有ID3, C4.5和C5.0等。决策树是一种树形结构,其中每个内部节点表示一个属性上的判断,每个分支代表一个判断结果的输出,最后每个叶节点代表一种分类结果。
决策树是一种十分常用的分类方法,需要监督学习(有教师的 Supervised Learning)。监督学习就是给出一堆样本,每个样本都有一组属性和一个分类结果,也就是分类结果已知,那么通过学习这些样本得到一个决策树,这个决策树能够对新的数据给出正确的分类。
决策树的生成主要分以下两步,这两步通常通过学习已经知道分类结果的样本来实现。
- 节点的分裂:一般当一个节点所代表的属性无法给出判断时,则选择将这一节点分成2个子节点(如不是二叉树的情况会分成n个子节点)。
- 阈值的确定:选择适当的阈值使得分类错误率最小 (Training Error)。
DAY62: 决策树 (2. 建树与分类)
1、构建决策树是一个递归的过程, 参数设计是核心.
2、分类 classify() 也是一个递归的过程.
3、当前仅在训练集上测试. 设计单独的测试集也不困难.
4、toString() 也用了递归的方式.
代码编写
package machinelearning.decisiontree;
import java.io.FileReader;
import java.util.Arrays;
import weka.core.*;
/**
 * The ID3 decision tree inductive algorithm.
 * <p>
 * Every object of this class is one node of the tree. A node keeps a reference
 * to the whole dataset plus the indices of the instances and attributes that
 * are still available on its branch. Attribute values and class values are
 * cast to {@code int} and used as array indices.
 *
 * @author WeiZe 1025976860@qq.com
 */
public class ID3 {
	/**
	 * The data. All nodes of one tree share the same object.
	 */
	Instances dataset;

	/**
	 * Is this dataset pure (only one label)?
	 */
	boolean pure;

	/**
	 * The number of classes. For binary classification it is 2.
	 */
	int numClasses;

	/**
	 * Available instances. Other instances do not belong to this branch.
	 */
	int[] availableInstances;

	/**
	 * Available attributes. Other attributes have been selected in the path
	 * from the root.
	 */
	int[] availableAttributes;

	/**
	 * The selected attribute.
	 */
	int splitAttribute;

	/**
	 * The children nodes. It stays null for a leaf.
	 */
	ID3[] children;

	/**
	 * My label. Inner nodes also have a label. For example,
	 * {@code <outlook = sunny, humidity = high>} never appear in the training
	 * data, but {@code <humidity = high>} is valid in other cases.
	 */
	int label;

	/**
	 * The prediction, including queried and predicted labels.
	 */
	int[] predicts;

	/**
	 * Small block cannot be split further.
	 */
	static int smallBlockThreshold = 3;

	/**
	 ********************
	 * The constructor of the root node. The last attribute of the file is
	 * taken as the class. If the file cannot be read, a message is printed and
	 * the JVM exits.
	 *
	 * @param paraFilename
	 *            The given file.
	 ********************
	 */
	public ID3(String paraFilename) {
		dataset = null;
		// try-with-resources closes the reader even when parsing fails.
		try (FileReader fileReader = new FileReader(paraFilename)) {
			dataset = new Instances(fileReader);
		} catch (Exception ee) {
			System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
			System.exit(0);
		} // Of try

		dataset.setClassIndex(dataset.numAttributes() - 1);
		numClasses = dataset.classAttribute().numValues();

		// The root owns all instances and all conditional attributes.
		availableInstances = new int[dataset.numInstances()];
		for (int i = 0; i < availableInstances.length; i++) {
			availableInstances[i] = i;
		} // Of for i
		availableAttributes = new int[dataset.numAttributes() - 1];
		for (int i = 0; i < availableAttributes.length; i++) {
			availableAttributes[i] = i;
		} // Of for i

		// Initialize.
		children = null;
		// Determine the label by simple voting.
		label = getMajorityClass(availableInstances);
		// Determine whether or not it is pure.
		pure = pureJudge(availableInstances);
	}// Of the first constructor

	/**
	 ********************
	 * The constructor of a non-root node.
	 *
	 * @param paraDataset
	 *            The given dataset. Its class index must already be set.
	 * @param paraAvailableInstances
	 *            The indices of the instances belonging to this branch. The
	 *            array is stored by reference, not cloned.
	 * @param paraAvailableAttributes
	 *            The indices of the attributes not yet used on the path from
	 *            the root. The array is stored by reference, not cloned.
	 ********************
	 */
	public ID3(Instances paraDataset, int[] paraAvailableInstances, int[] paraAvailableAttributes) {
		// Copy its reference instead of clone the availableInstances.
		dataset = paraDataset;
		availableInstances = paraAvailableInstances;
		availableAttributes = paraAvailableAttributes;
		// Keep the field meaningful for every node, not only for the root.
		numClasses = dataset.numClasses();

		// Initialize.
		children = null;
		// Determine the label by simple voting.
		label = getMajorityClass(availableInstances);
		// Determine whether or not it is pure.
		pure = pureJudge(availableInstances);
	}// Of the second constructor

	/**
	 **********************************
	 * Is the given block pure? The fields of this node are not modified.
	 *
	 * @param paraBlock
	 *            The block, i.e., indices of instances in the dataset.
	 * @return True if all instances of the block have the same class value.
	 *         An empty block is pure.
	 **********************************
	 */
	public boolean pureJudge(int[] paraBlock) {
		for (int i = 1; i < paraBlock.length; i++) {
			if (dataset.instance(paraBlock[i]).classValue() != dataset.instance(paraBlock[0])
					.classValue()) {
				return false;
			} // Of if
		} // Of for i
		return true;
	}// Of pureJudge

	/**
	 **********************************
	 * Compute the majority class of the given block for voting.
	 *
	 * @param paraBlock
	 *            The block, i.e., indices of instances in the dataset.
	 * @return The majority class. The smallest class index wins a tie.
	 **********************************
	 */
	public int getMajorityClass(int[] paraBlock) {
		int[] tempClassCounts = new int[dataset.numClasses()];
		for (int i = 0; i < paraBlock.length; i++) {
			tempClassCounts[(int) dataset.instance(paraBlock[i]).classValue()]++;
		} // Of for i

		int resultMajorityClass = -1;
		int tempMaxCount = -1;
		for (int i = 0; i < tempClassCounts.length; i++) {
			if (tempMaxCount < tempClassCounts[i]) {
				resultMajorityClass = i;
				tempMaxCount = tempClassCounts[i];
			} // Of if
		} // Of for i
		return resultMajorityClass;
	}// Of getMajorityClass

	/**
	 **********************************
	 * Select the best attribute, i.e., the available attribute with the
	 * minimal conditional entropy. The result is also stored in
	 * splitAttribute.
	 *
	 * @return The best attribute index, or -1 if no attribute is available.
	 **********************************
	 */
	public int selectBestAttribute() {
		splitAttribute = -1;
		double tempMinimalEntropy = Double.POSITIVE_INFINITY;
		double tempEntropy;
		for (int i = 0; i < availableAttributes.length; i++) {
			tempEntropy = conditionalEntropy(availableAttributes[i]);
			if (tempMinimalEntropy > tempEntropy) {
				tempMinimalEntropy = tempEntropy;
				splitAttribute = availableAttributes[i];
			} // Of if
		} // Of for i
		return splitAttribute;
	}// Of selectBestAttribute

	/**
	 **********************************
	 * Compute the conditional entropy of an attribute on the available
	 * instances. The natural logarithm is used.
	 *
	 * @param paraAttribute
	 *            The given attribute.
	 *
	 * @return The entropy.
	 **********************************
	 */
	public double conditionalEntropy(int paraAttribute) {
		// Step 1. Statistics.
		int tempNumClasses = dataset.numClasses();
		int tempNumValues = dataset.attribute(paraAttribute).numValues();
		int tempNumInstances = availableInstances.length;
		double[] tempValueCounts = new double[tempNumValues];
		double[][] tempCountMatrix = new double[tempNumValues][tempNumClasses];

		int tempClass, tempValue;
		for (int i = 0; i < tempNumInstances; i++) {
			tempClass = (int) dataset.instance(availableInstances[i]).classValue();
			tempValue = (int) dataset.instance(availableInstances[i]).value(paraAttribute);
			tempValueCounts[tempValue]++;
			tempCountMatrix[tempValue][tempClass]++;
		} // Of for i

		// Step 2. Sum the entropy of each attribute value, weighted by the
		// fraction of instances taking that value.
		double resultEntropy = 0;
		double tempEntropy, tempFraction;
		for (int i = 0; i < tempNumValues; i++) {
			// A value that never occurs contributes nothing.
			if (tempValueCounts[i] == 0) {
				continue;
			} // Of if
			tempEntropy = 0;
			for (int j = 0; j < tempNumClasses; j++) {
				tempFraction = tempCountMatrix[i][j] / tempValueCounts[i];
				// Skip to avoid log(0).
				if (tempFraction == 0) {
					continue;
				} // Of if
				tempEntropy += -tempFraction * Math.log(tempFraction);
			} // Of for j
			resultEntropy += tempValueCounts[i] / tempNumInstances * tempEntropy;
		} // Of for i
		return resultEntropy;
	}// Of conditionalEntropy

	/**
	 **********************************
	 * Split the available instances according to the given attribute.
	 *
	 * @param paraAttribute
	 *            The given attribute.
	 * @return The blocks, one for each value of the attribute. A block is
	 *         empty (length 0) if no available instance takes that value.
	 **********************************
	 */
	public int[][] splitData(int paraAttribute) {
		int tempNumValues = dataset.attribute(paraAttribute).numValues();
		int[][] resultBlocks = new int[tempNumValues][];
		int[] tempSizes = new int[tempNumValues];

		// First scan to count the size of each block.
		int tempValue;
		for (int i = 0; i < availableInstances.length; i++) {
			tempValue = (int) dataset.instance(availableInstances[i]).value(paraAttribute);
			tempSizes[tempValue]++;
		} // Of for i

		// Allocate space.
		for (int i = 0; i < tempNumValues; i++) {
			resultBlocks[i] = new int[tempSizes[i]];
		} // Of for i

		// Second scan to fill. tempSizes is reused as the write positions.
		Arrays.fill(tempSizes, 0);
		for (int i = 0; i < availableInstances.length; i++) {
			tempValue = (int) dataset.instance(availableInstances[i]).value(paraAttribute);
			// Copy data.
			resultBlocks[tempValue][tempSizes[tempValue]] = availableInstances[i];
			tempSizes[tempValue]++;
		} // Of for i
		return resultBlocks;
	}// Of splitData

	/**
	 **********************************
	 * Build the tree recursively. The node remains a leaf if it is pure, if
	 * it has no more than smallBlockThreshold instances, or if no attribute is
	 * left to split on.
	 **********************************
	 */
	public void buildTree() {
		if (pureJudge(availableInstances)) {
			return;
		} // Of if
		if (availableInstances.length <= smallBlockThreshold) {
			return;
		} // Of if
		// Instances with identical attribute values but different labels can
		// exhaust the attributes. Without this guard splitAttribute would be -1.
		if (availableAttributes.length == 0) {
			return;
		} // Of if

		selectBestAttribute();
		int[][] tempSubBlocks = splitData(splitAttribute);
		children = new ID3[tempSubBlocks.length];

		// Construct the remaining attribute set. A separate write index is
		// used so that the order of availableAttributes does not matter.
		int[] tempRemainingAttributes = new int[availableAttributes.length - 1];
		int tempIndex = 0;
		for (int i = 0; i < availableAttributes.length; i++) {
			if (availableAttributes[i] != splitAttribute) {
				tempRemainingAttributes[tempIndex] = availableAttributes[i];
				tempIndex++;
			} // Of if
		} // Of for i

		// Construct children.
		for (int i = 0; i < children.length; i++) {
			if ((tempSubBlocks[i] == null) || (tempSubBlocks[i].length == 0)) {
				// No training instance reaches this branch.
				children[i] = null;
				continue;
			} else {
				children[i] = new ID3(dataset, tempSubBlocks[i], tempRemainingAttributes);
				// Important code: do this recursively
				children[i].buildTree();
			} // Of if
		} // Of for i
	}// Of buildTree

	/**
	 **********************************
	 * Classify an instance.
	 *
	 * @param paraInstance
	 *            The given instance.
	 * @return The prediction. It is the label of this node if this node is a
	 *         leaf or if the branch of the instance has no child.
	 **********************************
	 */
	public int classify(Instance paraInstance) {
		if (children == null) {
			return label;
		} // Of if

		ID3 tempChild = children[(int) paraInstance.value(splitAttribute)];
		if (tempChild == null) {
			return label;
		} // Of if
		return tempChild.classify(paraInstance);
	}// Of classify

	/**
	 **********************************
	 * Test on a testing set.
	 *
	 * @param paraDataset
	 *            The given testing data.
	 * @return The accuracy.
	 **********************************
	 */
	public double test(Instances paraDataset) {
		double tempCorrect = 0;
		for (int i = 0; i < paraDataset.numInstances(); i++) {
			if (classify(paraDataset.instance(i)) == (int) paraDataset.instance(i).classValue()) {
				tempCorrect++;
			} // Of if
		} // Of for i
		return tempCorrect / paraDataset.numInstances();
	}// Of test

	/**
	 **********************************
	 * Test on the training set.
	 *
	 * @return The accuracy.
	 **********************************
	 */
	public double selfTest() {
		return test(dataset);
	}// Of selfTest

	/**
	 **********************************
	 * For display. A leaf shows its class; an inner node shows one line for
	 * each value of the split attribute, followed by the subtree or, when the
	 * branch has no child, by the label of this node.
	 *
	 * @return The string describing the subtree rooted at this node.
	 **********************************
	 */
	@Override
	public String toString() {
		if (children == null) {
			return "class = " + label;
		} // Of if

		// The split attribute is only meaningful for inner nodes.
		String tempAttributeName = dataset.attribute(splitAttribute).name();
		StringBuilder resultString = new StringBuilder();
		for (int i = 0; i < children.length; i++) {
			resultString.append(tempAttributeName).append(" = ")
					.append(dataset.attribute(splitAttribute).value(i)).append(":");
			if (children[i] == null) {
				resultString.append("class = ").append(label);
			} else {
				resultString.append(children[i]);
			} // Of if
			resultString.append("\r\n");
		} // Of for i
		return resultString.toString();
	}// Of toString

	/**
	 **********************************
	 * Build a tree on the weather data, then print the tree and its accuracy
	 * on the training set.
	 **********************************
	 */
	public static void id3Test() {
		ID3 tempID3 = new ID3("D:/data/weather.arff");
		// ID3 tempID3 = new ID3("D:/data/mushroom.arff");
		ID3.smallBlockThreshold = 3;
		tempID3.buildTree();
		System.out.println("The tree is: \r\n" + tempID3);
		double tempAccuracy = tempID3.selfTest();
		System.out.println("The accuracy is: " + tempAccuracy);
	}// Of id3Test

	/**
	 *************************
	 * Test this class.
	 *
	 * @param args
	 *            Not used now.
	 *************************
	 */
	public static void main(String[] args) {
		id3Test();
	}// Of main
}// Of class ID3
DAY63:集成学习之 AdaBoosting (1. 带权数据集)
1 Adaboost的原理
**Adaboost是什么**
AdaBoost,是英文"Adaptive Boosting"(自适应增强)的缩写,由Yoav Freund和Robert Schapire在1995年提出。它的自适应在于:前一个基本分类器分错的样本会得到加强,加权后的全体样本再次被用来训练下一个基本分类器。同时,在每一轮中加入一个新的弱分类器,直到达到某个预定的足够小的错误率或达到预先指定的最大迭代次数。
具体说来,整个Adaboost 迭代算法就3步:
1. 初始化训练数据的权值分布。如果有N个样本,则每一个训练样本最开始时都被赋予相同的权值:1/N。
2. 训练弱分类器。具体训练过程中,如果某个样本点已经被准确地分类,那么在构造下一个训练集中,它的权值就被降低;相反,如果某个样本点没有被准确地分类,那么它的权值就得到提高。然后,权值更新过的样本集被用于训练下一个分类器,整个训练过程如此迭代地进行下去。
3. 将各个训练得到的弱分类器组合成强分类器。各个弱分类器的训练过程结束后,加大分类误差率小的弱分类器的权重,使其在最终的分类函数中起着较大的决定作用,而降低分类误差率大的弱分类器的权重,使其在最终的分类函数中起着较小的决定作用。换言之,误差率低的弱分类器在最终分类器中占的权重较大,否则较小。
代码编写
package machinelearning.adaboosting;
import java.io.FileReader;
import java.util.Arrays;
import weka.core.Instances;
/**
* Weighted instances.<br>
*
* @author WeiZe 10259768602qq.com
*/
public class WeightedInstances extends Instances {
/**
* Just the requirement of some classes, any number is ok.
*/
private static final long serialVersionUID = 11087456L;
/**
* Weights.
*/
private double[] weights;
/**
******************
* The first constructor.
*
* @param paraFileReader
* The given reader to read data from file.
******************
*/
public WeightedInstances(FileReader paraFileReader) throws Exception {
super(paraFileReader);
setClassIndex(numAttributes() - 1);
// Initialize weights
weights = new double[numInstances()];
double tempAverage = 1.0 / numInstances();
for (int i = 0; i < weights.length; i++) {
weights[i] = tempAverage;
} // Of for i
System.out.println("Instances weights are: " + Arrays.toString(weights));
} // Of the first constructor
/**
******************
* The second constructor.
*
* @param paraInstances
* The given instance.
******************
*/
public WeightedInstances(Instances paraInstances) {
super(paraInstances);
setClassIndex(numAttributes() - 1);
// Initialize weights
weights = new double[numInstances()];
double tempAverage = 1.0 / numInstances();
for (int i = 0; i < weights.length; i++) {
weights[i] = tempAverage;
} // Of for i
System.out.println("Instances weights are: " + Arrays.toString(weights));
} // Of the second constructor
/**
******************
* Getter.
*
* @param paraIndex
* The given index.
* @return The weight of the given index.
******************
*/
public double getWeight(int paraIndex) {
return weights[paraIndex];
} // Of getWeight
/**
******************
* Adjust the weights.
*
* @param paraCorrectArray
* Indicate which instances have been correctly classified.
* @param paraAlpha
* The weight of the last classifier.
******************
*/
public void adjustWeights(boolean[] paraCorrectArray, double paraAlpha) {
// Step 1. Calculate alpha.
double tempIncrease = Math.exp(paraAlpha);
// Step 2. Adjust.
double tempWeightsSum = 0; // For normalization.
for (int i = 0; i < weights.length; i++) {
if (paraCorrectArray[i]) {
weights[i] /= tempIncrease;
} else {
weights[i] *= tempIncrease;
} // Of if
tempWeightsSum += weights[i];
} // Of for i
// Step 3. Normalize.
for (int i = 0; i < weights.length; i++) {
weights[i] /= tempWeightsSum;
} // Of for i
System.out.println("After adjusting, instances weights are: " + Arrays.toString(weights));
} // Of adjustWeights
/**
******************
* Test the method.
******************
*/
public void adjustWeightsTest() {
boolean[] tempCorrectArray = new boolean[numInstances()];
for (int i = 0; i < tempCorrectArray.length / 2; i++) {
tempCorrectArray[i] = true;
} // Of for i
double tempWeightedError = 0.3;
adjustWeights(tempCorrectArray, tempWeightedError);
System.out.println("After adjusting");
System.out.println(toString());
} // Of adjustWeightsTest
/**
******************
* For display.
******************
*/
public String toString() {
String resultString = "I am a weighted Instances object.\r\n" + "I have " + numInstances() + " instances and "
+ (numAttributes() - 1) + " conditional attributes.\r\n" + "My weights are: " + Arrays.toString(weights)
+ "\r\n" + "My data are: \r\n" + super.toString();
return resultString;
} // Of toString
/**
******************
* For unit test.
*
* @param args
* Not provided.
******************
*/
public static void main(String args[]) {
WeightedInstances tempWeightedInstances = null;
String tempFilename = "d:/data/iris.arff";
try {
FileReader tempFileReader = new FileReader(tempFilename);
tempWeightedInstances = new WeightedInstances(tempFileReader);
tempFileReader.close();
} catch (Exception exception1) {
System.out.println("Cannot read the file: " + tempFilename + "\r\n" + exception1);
System.exit(0);
} // Of try
System.out.println(tempWeightedInstances.toString());
tempWeightedInstances.adjustWeightsTest();
} // Of main
} // Of class WeightedInstances
DAY64:集成学习之 AdaBoosting (2. 树桩分类器)
树桩分类器每次只将数据分成两堆, 与决策树相比, 简单至极.
package machinelearning.adaboosting;
import java.util.Random;
import weka.core.Instance;
/**
 * The common base of all simple (weak) classifiers used by the booster.
 * Subclasses provide the training and the classification; this class provides
 * the evaluation on the training set.
 *
 * @author WeiZe 1025976860@qq.com
 */
public abstract class SimpleClassifier {
	/** The index of the current attribute. */
	int selectedAttribute;

	/** Weighted data. */
	WeightedInstances weightedInstances;

	/** The accuracy on the training set. */
	double trainingAccuracy;

	/** The number of classes. For binary classification it is 2. */
	int numClasses;

	/** The number of instances. */
	int numInstances;

	/** The number of conditional attributes. */
	int numConditions;

	/** For random number generation. */
	Random random = new Random();

	/**
	 * The first constructor. The sizes are read from the given data once.
	 *
	 * @param paraWeightedInstances
	 *            The given instances.
	 */
	public SimpleClassifier(WeightedInstances paraWeightedInstances) {
		weightedInstances = paraWeightedInstances;
		numInstances = weightedInstances.numInstances();
		numClasses = weightedInstances.classAttribute().numValues();
		numConditions = weightedInstances.numAttributes() - 1;
	}

	/**
	 * Train the classifier.
	 */
	public abstract void train();

	/**
	 * Classify an instance.
	 *
	 * @param paraInstance
	 *            The given instance.
	 * @return Predicted label.
	 */
	public abstract int classify(Instance paraInstance);

	/**
	 * Which instances in the training set are correctly classified.
	 *
	 * @return The correctness array; the ith element is true iff the predicted
	 *         label of the ith instance equals its class value.
	 */
	public boolean[] computeCorrectnessArray() {
		int tempSize = weightedInstances.numInstances();
		boolean[] resultCorrectnessArray = new boolean[tempSize];
		for (int tempIndex = 0; tempIndex < tempSize; tempIndex++) {
			Instance tempInstance = weightedInstances.instance(tempIndex);
			resultCorrectnessArray[tempIndex] = (int) (tempInstance.classValue()) == classify(tempInstance);
		}
		return resultCorrectnessArray;
	}

	/**
	 * Compute the accuracy on the training set. The weights are ignored.
	 *
	 * @return The training accuracy.
	 */
	public double computeTrainingAccuracy() {
		boolean[] tempFlags = computeCorrectnessArray();
		int tempHits = 0;
		for (boolean tempFlag : tempFlags) {
			if (tempFlag) {
				tempHits++;
			}
		}
		return (double) tempHits / tempFlags.length;
	}

	/**
	 * Compute the weighted error on the training set, i.e., the total weight
	 * of the misclassified instances. It is at least 1e-6 to avoid NaN.
	 *
	 * @return The weighted error.
	 */
	public double computeWeightedError() {
		boolean[] tempFlags = computeCorrectnessArray();
		double tempErrorSum = 0;
		for (int tempIndex = 0; tempIndex < tempFlags.length; tempIndex++) {
			if (tempFlags[tempIndex]) {
				continue;
			}
			tempErrorSum += weightedInstances.getWeight(tempIndex);
		}
		// Lower bound of the error.
		return Math.max(tempErrorSum, 1e-6);
	}
}
树桩分类器代码.
package machinelearning.adaboosting;
import weka.core.Instance;
import java.io.FileReader;
import java.util.*;
/**
* The stump classifier.<br>
*
* @author Wei Ze 1025976860@qq.com
*/
public class StumpClassifier extends SimpleClassifier {
/**
* The best cut for the current attribute on weightedInstances.
*/
double bestCut;
/**
* The class label for attribute value less than bestCut.
*/
int leftLeafLabel;
/**
* The class label for attribute value no less than bestCut.
*/
int rightLeafLabel;
/**
******************
* The only constructor.
*
* @param paraWeightedInstances
* The given instances.
******************
*/
public StumpClassifier(WeightedInstances paraWeightedInstances) {
super(paraWeightedInstances);
}// Of the only constructor
/**
******************
* Train the classifier.
******************
*/
public void train() {
// Step 1. Randomly choose an attribute.
selectedAttribute = random.nextInt(numConditions);
// Step 2. Find all attribute values and sort.
double[] tempValuesArray = new double[numInstances];
for (int i = 0; i < tempValuesArray.length; i++) {
tempValuesArray[i] = weightedInstances.instance(i).value(selectedAttribute);
} // Of for i
Arrays.sort(tempValuesArray);
// Step 3. Initialize, classify all instances as the same with the
// original cut.
int tempNumLabels = numClasses;
double[] tempLabelCountArray = new double[tempNumLabels];
int tempCurrentLabel;
// Step 3.1 Scan all labels to obtain their counts.
for (int i = 0; i < numInstances; i++) {
// The label of the ith instance
tempCurrentLabel = (int) weightedInstances.instance(i).classValue();
tempLabelCountArray[tempCurrentLabel] += weightedInstances.getWeight(i);
} // Of for i
// Step 3.2 Find the label with the maximal count.
double tempMaxCorrect = 0;
int tempBestLabel = -1;
for (int i = 0; i < tempLabelCountArray.length; i++) {
if (tempMaxCorrect < tempLabelCountArray[i]) {
tempMaxCorrect = tempLabelCountArray[i];
tempBestLabel = i;
} // Of if
} // Of for i
// Step 3.3 The cut is a little bit smaller than the minimal value.
bestCut = tempValuesArray[0] - 0.1;
leftLeafLabel = tempBestLabel;
rightLeafLabel = tempBestLabel;
// Step 4. Check candidate cuts one by one.
// Step 4.1 To handle multi-class data, left and right.
double tempCut;
double[][] tempLabelCountMatrix = new double[2][tempNumLabels];
for (int i = 0; i < tempValuesArray.length - 1; i++) {
// Step 4.1 Some attribute values are identical, ignore them.
if (tempValuesArray[i] == tempValuesArray[i + 1]) {
continue;
} // Of if
tempCut = (tempValuesArray[i] + tempValuesArray[i + 1]) / 2;
// Step 4.2 Scan all labels to obtain their counts wrt. the cut.
// Initialize again since it is used many times.
for (int j = 0; j < 2; j++) {
for (int k = 0; k < tempNumLabels; k++) {
tempLabelCountMatrix[j][k] = 0;
} // Of for k
} // Of for j
for (int j = 0; j < numInstances; j++) {
// The label of the jth instance
tempCurrentLabel = (int) weightedInstances.instance(j).classValue();
if (weightedInstances.instance(j).value(selectedAttribute) < tempCut) {
tempLabelCountMatrix[0][tempCurrentLabel] += weightedInstances.getWeight(j);
} else {
tempLabelCountMatrix[1][tempCurrentLabel] += weightedInstances.getWeight(j);
} // Of if
} // Of for i
// Step 4.3 Left leaf.
double tempLeftMaxCorrect = 0;
int tempLeftBestLabel = 0;
for (int j = 0; j < tempLabelCountMatrix[0].length; j++) {
if (tempLeftMaxCorrect < tempLabelCountMatrix[0][j]) {
tempLeftMaxCorrect = tempLabelCountMatrix[0][j];
tempLeftBestLabel = j;
} // Of if
} // Of for i
// Step 4.4 Right leaf.
double tempRightMaxCorrect = 0;
int tempRightBestLabel = 0;
for (int j = 0; j < tempLabelCountMatrix[1].length; j++) {
if (tempRightMaxCorrect < tempLabelCountMatrix[1][j]) {
tempRightMaxCorrect = tempLabelCountMatrix[1][j];
tempRightBestLabel = j;
} // Of if
} // Of for i
// Step 4.5 Compare with the current best.
if (tempMaxCorrect < tempLeftMaxCorrect + tempRightMaxCorrect) {
tempMaxCorrect = tempLeftMaxCorrect + tempRightMaxCorrect;
bestCut = tempCut;
leftLeafLabel = tempLeftBestLabel;
rightLeafLabel = tempRightBestLabel;
} // Of if
} // Of for i
System.out.println("Attribute = " + selectedAttribute + ", cut = " + bestCut + ", leftLeafLabel = "
+ leftLeafLabel + ", rightLeafLabel = " + rightLeafLabel);
}// Of train
/**
******************
* Classify an instance.
*
* @param paraInstance
* The given instance.
* @return Predicted label.
******************
*/
public int classify(Instance paraInstance) {
int resultLabel = -1;
if (paraInstance.value(selectedAttribute) < bestCut) {
resultLabel = leftLeafLabel;
} else {
resultLabel = rightLeafLabel;
} // Of if
return resultLabel;
}// Of classify
/**
******************
* For display.
******************
*/
public String toString() {
String resultString = "I am a stump classifier.\r\n" + "I choose attribute #" + selectedAttribute
+ " with cut value " + bestCut + ".\r\n" + "The left and right leaf labels are " + leftLeafLabel
+ " and " + rightLeafLabel + ", respectively.\r\n" + "My weighted error is: " + computeWeightedError()
+ ".\r\n" + "My weighted accuracy is : " + computeTrainingAccuracy() + ".";
return resultString;
}// Of toString
/**
******************
* For unit test.
*
* @param args
* Not provided.
******************
*/
public static void main(String args[]) {
WeightedInstances tempWeightedInstances = null;
String tempFilename = "D:/data/iris.arff";
try {
FileReader tempFileReader = new FileReader(tempFilename);
tempWeightedInstances = new WeightedInstances(tempFileReader);
tempFileReader.close();
} catch (Exception ee) {
System.out.println("Cannot read the file: " + tempFilename + "\r\n" + ee);
System.exit(0);
} // Of try
StumpClassifier tempClassifier = new StumpClassifier(tempWeightedInstances);
tempClassifier.train();
System.out.println(tempClassifier);
System.out.println(Arrays.toString(tempClassifier.computeCorrectnessArray()));
}// Of main
}// Of class StumpClassifier
DAY65:集成学习之 AdaBoosting (3. 集成器)
package machinelearning.adaboosting;
import java.io.FileReader;
import weka.core.Instance;
import weka.core.Instances;
/**
 * The booster which ensembles base classifiers.
 *
 * @author WeiZe 1025976860@qq.com
 */
public class Booster {
	/**
	 * Classifiers.
	 */
	SimpleClassifier[] classifiers;

	/**
	 * Number of classifiers. After training it is the number of classifiers
	 * actually built, which may be smaller than the allocated space.
	 */
	int numClassifiers;

	/**
	 * Whether or not stop after the training error is 0.
	 */
	boolean stopAfterConverge = false;

	/**
	 * The weights of classifiers.
	 */
	double[] classifierWeights;

	/**
	 * The training data.
	 */
	Instances trainingData;

	/**
	 * The testing data.
	 */
	Instances testingData;

	/**
	 ******************
	 * The first constructor. The testing set is the same as the training set.
	 * If the file cannot be read, a message is printed and the JVM exits.
	 *
	 * @param paraTrainingFilename
	 *            The data filename.
	 ******************
	 */
	public Booster(String paraTrainingFilename) {
		// Step 1. Read training set.
		// try-with-resources closes the reader even when parsing fails.
		try (FileReader tempFileReader = new FileReader(paraTrainingFilename)) {
			trainingData = new Instances(tempFileReader);
		} catch (Exception ee) {
			System.out.println("Cannot read the file: " + paraTrainingFilename + "\r\n" + ee);
			System.exit(0);
		} // Of try

		// Step 2. Set the last attribute as the class index.
		trainingData.setClassIndex(trainingData.numAttributes() - 1);

		// Step 3. The testing data is the same as the training data.
		testingData = trainingData;

		stopAfterConverge = true;

		System.out.println("****************Data**********\r\n" + trainingData);
	}// Of the first constructor

	/**
	 ******************
	 * Set the number of base classifier, and allocate space for them.
	 *
	 * @param paraNumBaseClassifiers
	 *            The number of base classifier.
	 ******************
	 */
	public void setNumBaseClassifiers(int paraNumBaseClassifiers) {
		numClassifiers = paraNumBaseClassifiers;

		// Step 1. Allocate space (only reference) for classifiers
		classifiers = new SimpleClassifier[numClassifiers];

		// Step 2. Initialize classifier weights.
		classifierWeights = new double[numClassifiers];
	}// Of setNumBaseClassifiers

	/**
	 ******************
	 * Train the booster. Stump classifiers are built one by one on the
	 * re-weighted training data. Each classifier gets the weight
	 * 0.5 * ln(1 / error - 1), which is set to 0 if it is below 1e-6.
	 *
	 * @throws IllegalStateException
	 *             If setNumBaseClassifiers() has not been invoked.
	 * @see StumpClassifier#train()
	 ******************
	 */
	public void train() {
		if (classifiers == null) {
			throw new IllegalStateException("Invoke setNumBaseClassifiers() before train().");
		} // Of if

		// Step 1. Initialize.
		WeightedInstances tempWeightedInstances = null;
		double tempError;
		// Count the classifiers built so far, since classify() is used
		// during the training.
		numClassifiers = 0;

		// Step 2. Build other classifiers.
		for (int i = 0; i < classifiers.length; i++) {
			// Step 2.1 Key code: Construct or adjust the weightedInstances
			if (i == 0) {
				tempWeightedInstances = new WeightedInstances(trainingData);
			} else {
				// Adjust the weights of the data.
				tempWeightedInstances.adjustWeights(classifiers[i - 1].computeCorrectnessArray(),
						classifierWeights[i - 1]);
			} // Of if

			// Step 2.2 Train the next classifier.
			classifiers[i] = new StumpClassifier(tempWeightedInstances);
			classifiers[i].train();

			tempError = classifiers[i].computeWeightedError();

			// Key code: Set the classifier weight.
			double tempWeight = 0.5 * Math.log(1 / tempError - 1);
			// NaN is checked explicitly because "NaN < 1e-6" is false, and a
			// NaN weight would make the voting in classify() meaningless.
			if (Double.isNaN(tempWeight) || tempWeight < 1e-6) {
				tempWeight = 0;
			} // Of if
			classifierWeights[i] = tempWeight;

			System.out.println("Classifier #" + i + " , weighted error = " + tempError + ", weight = "
					+ classifierWeights[i] + "\r\n");

			numClassifiers++;

			// The accuracy is enough.
			if (stopAfterConverge) {
				double tempTrainingAccuracy = computeTrainingAccuray();
				System.out.println("The accuracy of the booster is: " + tempTrainingAccuracy + "\r\n");
				if (tempTrainingAccuracy > 0.999999) {
					System.out.println("Stop at the round: " + i + " due to converge.\r\n");
					break;
				} // Of if
			} // Of if
		} // Of for i
	}// Of train

	/**
	 ******************
	 * Classify an instance by weighted voting of the trained classifiers.
	 *
	 * @param paraInstance
	 *            The given instance.
	 * @return The predicted label. The smallest label wins a tie.
	 ******************
	 */
	public int classify(Instance paraInstance) {
		double[] tempLabelsCountArray = new double[trainingData.classAttribute().numValues()];
		for (int i = 0; i < numClassifiers; i++) {
			int tempLabel = classifiers[i].classify(paraInstance);
			tempLabelsCountArray[tempLabel] += classifierWeights[i];
		} // Of for i

		int resultLabel = -1;
		double tempMax = -1;
		for (int i = 0; i < tempLabelsCountArray.length; i++) {
			if (tempMax < tempLabelsCountArray[i]) {
				tempMax = tempLabelsCountArray[i];
				resultLabel = i;
			} // Of if
		} // Of for

		return resultLabel;
	}// Of classify

	/**
	 ******************
	 * Test the booster on the testing data.
	 *
	 * @return The classification accuracy.
	 ******************
	 */
	public double test() {
		System.out.println("Testing on " + testingData.numInstances() + " instances.\r\n");

		return test(testingData);
	}// Of test

	/**
	 ******************
	 * Test the booster. The accuracy is also printed.
	 *
	 * @param paraInstances
	 *            The testing set. Its class index is set to the last
	 *            attribute by this method.
	 * @return The classification accuracy.
	 ******************
	 */
	public double test(Instances paraInstances) {
		double tempCorrect = 0;
		paraInstances.setClassIndex(paraInstances.numAttributes() - 1);

		for (int i = 0; i < paraInstances.numInstances(); i++) {
			Instance tempInstance = paraInstances.instance(i);
			if (classify(tempInstance) == (int) tempInstance.classValue()) {
				tempCorrect++;
			} // Of if
		} // Of for i

		double resultAccuracy = tempCorrect / paraInstances.numInstances();
		System.out.println("The accuracy is: " + resultAccuracy);

		return resultAccuracy;
	} // Of test

	/**
	 ******************
	 * Compute the training accuracy of the booster. It is not weighted. The
	 * misspelled method name is kept since it is part of the public interface.
	 *
	 * @return The training accuracy.
	 ******************
	 */
	public double computeTrainingAccuray() {
		double tempCorrect = 0;

		for (int i = 0; i < trainingData.numInstances(); i++) {
			if (classify(trainingData.instance(i)) == (int) trainingData.instance(i).classValue()) {
				tempCorrect++;
			} // Of if
		} // Of for i

		double tempAccuracy = tempCorrect / trainingData.numInstances();

		return tempAccuracy;
	}// Of computeTrainingAccuray

	/**
	 ******************
	 * For integration test.
	 *
	 * @param args
	 *            Not provided.
	 ******************
	 */
	public static void main(String[] args) {
		System.out.println("Starting AdaBoosting...");
		Booster tempBooster = new Booster("D:/data/iris.arff");
		// Booster tempBooster = new Booster("src/data/smalliris.arff");

		tempBooster.setNumBaseClassifiers(100);
		tempBooster.train();

		System.out.println("The training accuracy is: " + tempBooster.computeTrainingAccuray());
		tempBooster.test();
	}// Of main
}// Of class Booster