300 Lines of Java a Day (Days 61-70: Decision Trees and Ensemble Learning)

Contents

Overview
Days 1-10: Basic syntax
Days 11-20: Linear data structures
Days 21-30: Trees and binary trees
Days 31-40: Graphs
Days 41-50: Search and sorting
Days 51-60: kNN and NB
Days 61-70: Decision trees and ensemble learning
Days 71-80: BP neural networks
Days 81-90: CNN convolutional neural networks

Day 61: Decision Trees (1. Preparation)

You may first read the post "Decision Trees: Quick Q&A".
The decision tree is the most classic machine learning algorithm. Actually, I am reluctant to add "one of" after that claim. It offers excellent interpretability.

  1. Today, copy the code up to line 284; copy the rest tomorrow.
  2. There is only one copy of the data. Each data subset produced by a split only needs the two arrays availableInstances and availableAttributes.
  3. There are two constructors: one reads a file to obtain the root node; the other builds a node from the data produced by a split.
  4. Check whether the dataset is pure, i.e., whether all class labels are identical. If so, no further split is needed.
  5. Every node (including internal nodes) needs a label, so that an unseen attribute-value combination can still be classified directly. The label is obtained by voting, i.e., getMajorityClass().
  6. Maximizing the information gain is equivalent to minimizing the conditional entropy; see the note after this list.
  7. A data block produced by a split may be empty. In that case, use an array of length 0 instead of null.
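
A note on item 6. The information gain of attribute $A$ on data block $D$ is $IG(D, A) = H(D) - H(D \mid A)$. Since $H(D)$ does not depend on $A$, the attribute maximizing the gain is exactly the one minimizing the conditional entropy, which is what selectBestAttribute() computes via conditionalEntropy():

$$H(D \mid A) = \sum_{v} \frac{|D_v|}{|D|} \left( -\sum_{c} p(c \mid v) \log p(c \mid v) \right),$$

where $D_v$ is the subset of $D$ taking value $v$ on $A$.
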
package machinelearning.decisiontree;

import java.io.FileReader;
import java.util.Arrays;
import weka.core.*;

/**
 * The ID3 decision tree inductive algorithm.
 * 
 * @author Fan Min minfanphd@163.com.
 */
public class ID3 {
	/**
	 * The data.
	 */
	Instances dataset;

	/**
	 * Is this dataset pure (only one label)?
	 */
	boolean pure;

	/**
	 * The number of classes. For binary classification it is 2.
	 */
	int numClasses;

	/**
	 * Available instances. Other instances do not belong to this branch.
	 */
	int[] availableInstances;

	/**
	 * Available attributes. Other attributes have been selected in the path
	 * from the root.
	 */
	int[] availableAttributes;

	/**
	 * The selected attribute.
	 */
	int splitAttribute;

	/**
	 * The children nodes.
	 */
	ID3[] children;

	/**
	 * My label. Inner nodes also have a label. For example, <outlook = sunny,
	 * humidity = high> never appears in the training data, but <humidity = high>
	 * is valid in other cases.
	 */
	int label;

	/**
	 * The prediction, including queried and predicted labels.
	 */
	int[] predicts;

	/**
	 * Small block cannot be split further.
	 */
	static int smallBlockThreshold = 3;

	/**
	 ********************
	 * The constructor.
	 * 
	 * @param paraFilename
	 *            The given file.
	 ********************
	 */
	public ID3(String paraFilename) {
		dataset = null;
		try {
			FileReader fileReader = new FileReader(paraFilename);
			dataset = new Instances(fileReader);
			fileReader.close();
		} catch (Exception ee) {
			System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
			System.exit(0);
		} // Of try

		dataset.setClassIndex(dataset.numAttributes() - 1);
		numClasses = dataset.classAttribute().numValues();

		availableInstances = new int[dataset.numInstances()];
		for (int i = 0; i < availableInstances.length; i++) {
			availableInstances[i] = i;
		} // Of for i
		availableAttributes = new int[dataset.numAttributes() - 1];
		for (int i = 0; i < availableAttributes.length; i++) {
			availableAttributes[i] = i;
		} // Of for i

		// Initialize.
		children = null;
		// Determine the label by simple voting.
		label = getMajorityClass(availableInstances);
		// Determine whether or not it is pure.
		pure = pureJudge(availableInstances);
	}// Of the first constructor

	/**
	 ********************
	 * The constructor.
	 * 
	 * @param paraDataset
	 *            The given dataset.
	 ********************
	 */
	public ID3(Instances paraDataset, int[] paraAvailableInstances, int[] paraAvailableAttributes) {
		// Copy its reference instead of clone the availableInstances.
		dataset = paraDataset;
		availableInstances = paraAvailableInstances;
		availableAttributes = paraAvailableAttributes;

		// Initialize.
		children = null;
		// Determine the label by simple voting.
		label = getMajorityClass(availableInstances);
		// Determine whether or not it is pure.
		pure = pureJudge(availableInstances);
	}// Of the second constructor

	/**
	 ********************************** 
	 * Is the given block pure?
	 * 
	 * @param paraBlock
	 *            The block.
	 * @return True if pure.
	 ********************************** 
	 */
	public boolean pureJudge(int[] paraBlock) {
		pure = true;

		for (int i = 1; i < paraBlock.length; i++) {
			if (dataset.instance(paraBlock[i]).classValue() != dataset.instance(paraBlock[0])
					.classValue()) {
				pure = false;
				break;
			} // Of if
		} // Of for i

		return pure;
	}// Of pureJudge

	/**
	 ********************************** 
	 * Compute the majority class of the given block for voting.
	 * 
	 * @param paraBlock
	 *            The block.
	 * @return The majority class.
	 ********************************** 
	 */
	public int getMajorityClass(int[] paraBlock) {
		int[] tempClassCounts = new int[dataset.numClasses()];
		for (int i = 0; i < paraBlock.length; i++) {
			tempClassCounts[(int) dataset.instance(paraBlock[i]).classValue()]++;
		} // Of for i

		int resultMajorityClass = -1;
		int tempMaxCount = -1;
		for (int i = 0; i < tempClassCounts.length; i++) {
			if (tempMaxCount < tempClassCounts[i]) {
				resultMajorityClass = i;
				tempMaxCount = tempClassCounts[i];
			} // Of if
		} // Of for i

		return resultMajorityClass;
	}// Of getMajorityClass

	/**
	 ********************************** 
	 * Select the best attribute.
	 * 
	 * @return The best attribute index.
	 ********************************** 
	 */
	public int selectBestAttribute() {
		splitAttribute = -1;
		double tempMinimalEntropy = 10000;
		double tempEntropy;
		for (int i = 0; i < availableAttributes.length; i++) {
			tempEntropy = conditionalEntropy(availableAttributes[i]);
			if (tempMinimalEntropy > tempEntropy) {
				tempMinimalEntropy = tempEntropy;
				splitAttribute = availableAttributes[i];
			} // Of if
		} // Of for i
		return splitAttribute;
	}// Of selectBestAttribute

	/**
	 ********************************** 
	 * Compute the conditional entropy of an attribute.
	 * 
	 * @param paraAttribute
	 *            The given attribute.
	 * 
	 * @return The entropy.
	 ********************************** 
	 */
	public double conditionalEntropy(int paraAttribute) {
		// Step 1. Statistics.
		int tempNumClasses = dataset.numClasses();
		int tempNumValues = dataset.attribute(paraAttribute).numValues();
		int tempNumInstances = availableInstances.length;
		double[] tempValueCounts = new double[tempNumValues];
		double[][] tempCountMatrix = new double[tempNumValues][tempNumClasses];

		int tempClass, tempValue;
		for (int i = 0; i < tempNumInstances; i++) {
			tempClass = (int) dataset.instance(availableInstances[i]).classValue();
			tempValue = (int) dataset.instance(availableInstances[i]).value(paraAttribute);
			tempValueCounts[tempValue]++;
			tempCountMatrix[tempValue][tempClass]++;
		} // Of for i

		// Step 2.
		double resultEntropy = 0;
		double tempEntropy, tempFraction;
		for (int i = 0; i < tempNumValues; i++) {
			if (tempValueCounts[i] == 0) {
				continue;
			} // Of if
			tempEntropy = 0;
			for (int j = 0; j < tempNumClasses; j++) {
				tempFraction = tempCountMatrix[i][j] / tempValueCounts[i];
				if (tempFraction == 0) {
					continue;
				} // Of if
				tempEntropy += -tempFraction * Math.log(tempFraction);
			} // Of for j
			resultEntropy += tempValueCounts[i] / tempNumInstances * tempEntropy;
		} // Of for i

		return resultEntropy;
	}// Of conditionalEntropy

	/**
	 ********************************** 
	 * Split the data according to the given attribute.
	 * 
	 * @return The blocks.
	 ********************************** 
	 */
	public int[][] splitData(int paraAttribute) {
		int tempNumValues = dataset.attribute(paraAttribute).numValues();
		// System.out.println("Dataset " + dataset + "\r\n");
		// System.out.println("Attribute " + paraAttribute + " has " +
		// tempNumValues + " values.\r\n");
		int[][] resultBlocks = new int[tempNumValues][];
		int[] tempSizes = new int[tempNumValues];

		// First scan to count the size of each block.
		int tempValue;
		for (int i = 0; i < availableInstances.length; i++) {
			tempValue = (int) dataset.instance(availableInstances[i]).value(paraAttribute);
			tempSizes[tempValue]++;
		} // Of for i

		// Allocate space.
		for (int i = 0; i < tempNumValues; i++) {
			resultBlocks[i] = new int[tempSizes[i]];
		} // Of for i

		// Second scan to fill.
		Arrays.fill(tempSizes, 0);
		for (int i = 0; i < availableInstances.length; i++) {
			tempValue = (int) dataset.instance(availableInstances[i]).value(paraAttribute);
			// Copy data.
			resultBlocks[tempValue][tempSizes[tempValue]] = availableInstances[i];
			tempSizes[tempValue]++;
		} // Of for i

		return resultBlocks;
	}// Of splitData

	/**
	 ********************************** 
	 * Build the tree recursively.
	 ********************************** 
	 */
	public void buildTree() {
		if (pureJudge(availableInstances)) {
			return;
		} // Of if
		if (availableInstances.length <= smallBlockThreshold) {
			return;
		} // Of if

		selectBestAttribute();
		int[][] tempSubBlocks = splitData(splitAttribute);
		children = new ID3[tempSubBlocks.length];

		// Construct the remaining attribute set.
		int[] tempRemainingAttributes = new int[availableAttributes.length - 1];
		for (int i = 0; i < availableAttributes.length; i++) {
			if (availableAttributes[i] < splitAttribute) {
				tempRemainingAttributes[i] = availableAttributes[i];
			} else if (availableAttributes[i] > splitAttribute) {
				tempRemainingAttributes[i - 1] = availableAttributes[i];
			} // Of if
		} // Of for i

		// Construct children.
		for (int i = 0; i < children.length; i++) {
			if ((tempSubBlocks[i] == null) || (tempSubBlocks[i].length == 0)) {
				children[i] = null;
				continue;
			} else {
				// System.out.println("Building children #" + i + " with
				// instances " + Arrays.toString(tempSubBlocks[i]));
				children[i] = new ID3(dataset, tempSubBlocks[i], tempRemainingAttributes);

				// Important code: do this recursively
				children[i].buildTree();
			} // Of if
		} // Of for i
	}// Of buildTree

	/**
	 ********************************** 
	 * Classify an instance.
	 * 
	 * @param paraInstance
	 *            The given instance.
	 * @return The prediction.
	 ********************************** 
	 */
	public int classify(Instance paraInstance) {
		if (children == null) {
			return label;
		} // Of if

		ID3 tempChild = children[(int) paraInstance.value(splitAttribute)];
		if (tempChild == null) {
			return label;
		} // Of if

		return tempChild.classify(paraInstance);
	}// Of classify

	/**
	 ********************************** 
	 * Test on a testing set.
	 * 
	 * @param paraDataset
	 *            The given testing data.
	 * @return The accuracy.
	 ********************************** 
	 */
	public double test(Instances paraDataset) {
		double tempCorrect = 0;
		for (int i = 0; i < paraDataset.numInstances(); i++) {
			if (classify(paraDataset.instance(i)) == (int) paraDataset.instance(i).classValue()) {
				tempCorrect++;
			} // Of if
		} // Of for i

		return tempCorrect / paraDataset.numInstances();
	}// Of test

	/**
	 ********************************** 
	 * Test on the training set.
	 * 
	 * @return The accuracy.
	 ********************************** 
	 */
	public double selfTest() {
		return test(dataset);
	}// Of selfTest

	/**
	 ******************* 
	 * Overrides the method claimed in Object.
	 * 
	 * @return The tree structure.
	 ******************* 
	 */
	public String toString() {
		String resultString = "";
		String tempAttributeName = dataset.attribute(splitAttribute).name();
		if (children == null) {
			resultString += "class = " + label;
		} else {
			for (int i = 0; i < children.length; i++) {
				if (children[i] == null) {
					resultString += tempAttributeName + " = "
							+ dataset.attribute(splitAttribute).value(i) + ":" + "class = " + label
							+ "\r\n";
				} else {
					resultString += tempAttributeName + " = "
							+ dataset.attribute(splitAttribute).value(i) + ":" + children[i]
							+ "\r\n";
				} // Of if
			} // Of for i
		} // Of if

		return resultString;
	}// Of toString

	/**
	 ************************* 
	 * Test this class.
	 ************************* 
	 */
	public static void id3Test() {
		ID3 tempID3 = new ID3("D:/data/weather.arff");
		// ID3 tempID3 = new ID3("D:/data/mushroom.arff");
		ID3.smallBlockThreshold = 3;
		tempID3.buildTree();

		System.out.println("The tree is: \r\n" + tempID3);

		double tempAccuracy = tempID3.selfTest();
		System.out.println("The accuracy is: " + tempAccuracy);
	}// Of id3Test

	/**
	 ************************* 
	 * Test this class.
	 * 
	 * @param args
	 *            Not used now.
	 ************************* 
	 */
	public static void main(String[] args) {
		id3Test();
	}// Of main
}// Of class ID3

Day 62: Decision Trees (2. Tree Construction and Classification)

Finish copying the code listed yesterday.

  1. Building a decision tree is a recursive process; the parameter design is the core.
  2. Classification with classify() is also recursive.
  3. Currently we only test on the training set. Designing a separate testing set is not difficult either; a sketch follows the ARFF data below.
  4. toString() is also written recursively, but the printed tree format is not pretty; a second sketch below addresses this.
  5. iris.arff and mushroom.arff can be downloaded from https://github.com/FanSmale/sampledata/. If access is slow, just copy the content below and save it as weather.arff.
@relation weather
@attribute Outlook {Sunny, Overcast, Rain}
@attribute Temperature {Hot, Mild, Cool}
@attribute Humidity {High, Normal, Low}
@attribute Windy {FALSE, TRUE}
@attribute Play {N, P}
@data
Sunny,Hot,High,FALSE,N
Sunny,Hot,High,TRUE,N
Overcast,Hot,High,FALSE,P
Rain,Mild,High,FALSE,P
Rain,Cool,Normal,FALSE,P
Rain,Cool,Normal,TRUE,N
Overcast,Cool,Normal,TRUE,P
Sunny,Mild,High,FALSE,N
Sunny,Cool,Normal,FALSE,P
Rain,Mild,Normal,FALSE,P
Sunny,Mild,Normal,TRUE,P
Overcast,Mild,High,TRUE,P
Overcast,Hot,Normal,FALSE,P
Rain,Mild,High,TRUE,N
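
As noted in item 3 above, testing on a separate set only requires loading another ARFF file and reusing test(Instances). A minimal sketch, assuming the helper is added to the ID3 class (the name testOnFile is hypothetical):

	/**
	 * Hypothetical helper: evaluate a built tree on a separate ARFF file.
	 */
	public static double testOnFile(ID3 paraTree, String paraTestFilename) throws Exception {
		FileReader tempReader = new FileReader(paraTestFilename);
		Instances tempTestingSet = new Instances(tempReader);
		tempReader.close();
		tempTestingSet.setClassIndex(tempTestingSet.numAttributes() - 1);

		// Reuse the existing test method.
		return paraTree.test(tempTestingSet);
	}// Of testOnFile

And for item 4, a minimal sketch of a prettier, indented printer (the name toIndentedString is hypothetical; it reads the same fields as toString()):

	/**
	 * Hypothetical: print the tree with indentation, one branch per line.
	 */
	public String toIndentedString(String paraPrefix) {
		if (children == null) {
			return paraPrefix + "class = " + label + "\r\n";
		} // Of if

		String resultString = "";
		String tempAttributeName = dataset.attribute(splitAttribute).name();
		for (int i = 0; i < children.length; i++) {
			resultString += paraPrefix + tempAttributeName + " = "
					+ dataset.attribute(splitAttribute).value(i) + "\r\n";
			if (children[i] == null) {
				resultString += paraPrefix + "  class = " + label + "\r\n";
			} else {
				resultString += children[i].toIndentedString(paraPrefix + "  ");
			} // Of if
		} // Of for i

		return resultString;
	}// Of toIndentedString

Call it as tempID3.toIndentedString("") after buildTree().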

Day 63: Ensemble Learning with AdaBoosting (1. The Weighted Dataset)

  1. Start by reading a post on AdaBoosting to learn its basic algorithm.
  2. In an ordinary dataset every instance is equally important. A weighted dataset assigns each object a weight.
  3. adjustWeights() implements the corresponding weight-update formula and is the core code; the formula is given after this list.
  4. There are still many simple methods; they play a foundational role.
  5. More complete code is available at https://github.com/fansmale/mfadaboosting.
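
For reference, the weight update that adjustWeights() performs: given the weight $\alpha$ of the last classifier,

$$w_i \leftarrow \begin{cases} w_i / e^{\alpha}, & \text{if instance } i \text{ was classified correctly,} \\ w_i \cdot e^{\alpha}, & \text{otherwise,} \end{cases}$$

and all weights are then normalized to sum to 1, exactly as in Steps 2 and 3 of the code below.
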
package machinelearning.adaboosting;

import java.io.FileReader;
import java.util.Arrays;

import weka.core.Instances;

/**
 * Weighted instances.<br>
 * 
 * @author Fan Min minfanphd@163.com.
 */
public class WeightedInstances extends Instances {

	/**
	 * Required by the serialization mechanism; any number is OK.
	 */
	private static final long serialVersionUID = 11087456L;

	/**
	 * Weights.
	 */
	private double[] weights;

	/**
	 ****************** 
	 * The first constructor.
	 * 
	 * @param paraFileReader
	 *            The given reader to read data from file.
	 ****************** 
	 */
	public WeightedInstances(FileReader paraFileReader) throws Exception {
		super(paraFileReader);
		setClassIndex(numAttributes() - 1);

		// Initialize weights
		weights = new double[numInstances()];
		double tempAverage = 1.0 / numInstances();
		for (int i = 0; i < weights.length; i++) {
			weights[i] = tempAverage;
		} // Of for i
		System.out.println("Instances weights are: " + Arrays.toString(weights));
	} // Of the first constructor

	/**
	 ****************** 
	 * The second constructor.
	 * 
	 * @param paraInstances
	 *            The given instances.
	 ****************** 
	 */
	public WeightedInstances(Instances paraInstances) {
		super(paraInstances);
		setClassIndex(numAttributes() - 1);

		// Initialize weights
		weights = new double[numInstances()];
		double tempAverage = 1.0 / numInstances();
		for (int i = 0; i < weights.length; i++) {
			weights[i] = tempAverage;
		} // Of for i
		System.out.println("Instances weights are: " + Arrays.toString(weights));
	} // Of the second constructor

	/**
	 ****************** 
	 * Getter.
	 * 
	 * @param paraIndex
	 *            The given index.
	 * @return The weight of the given index.
	 ****************** 
	 */
	public double getWeight(int paraIndex) {
		return weights[paraIndex];
	} // Of getWeight

	/**
	 ****************** 
	 * Adjust the weights.
	 * 
	 * @param paraCorrectArray
	 *            Indicate which instances have been correctly classified.
	 * @param paraAlpha
	 *            The weight of the last classifier.
	 ****************** 
	 */
	public void adjustWeights(boolean[] paraCorrectArray, double paraAlpha) {
		// Step 1. Calculate alpha.
		double tempIncrease = Math.exp(paraAlpha);

		// Step 2. Adjust.
		double tempWeightsSum = 0; // For normalization.
		for (int i = 0; i < weights.length; i++) {
			if (paraCorrectArray[i]) {
				weights[i] /= tempIncrease;
			} else {
				weights[i] *= tempIncrease;
			} // Of if
			tempWeightsSum += weights[i];
		} // Of for i

		// Step 3. Normalize.
		for (int i = 0; i < weights.length; i++) {
			weights[i] /= tempWeightsSum;
		} // Of for i

		System.out.println("After adjusting, instances weights are: " + Arrays.toString(weights));
	} // Of adjustWeights

	/**
	 ****************** 
	 * Test the method.
	 ****************** 
	 */
	public void adjustWeightsTest() {
		boolean[] tempCorrectArray = new boolean[numInstances()];
		for (int i = 0; i < tempCorrectArray.length / 2; i++) {
			tempCorrectArray[i] = true;
		} // Of for i

		double tempWeightedError = 0.3;

		adjustWeights(tempCorrectArray, tempWeightedError);

		System.out.println("After adjusting");

		System.out.println(toString());
	} // Of adjustWeightsTest

	/**
	 ****************** 
	 * For display.
	 ****************** 
	 */
	public String toString() {
		String resultString = "I am a weighted Instances object.\r\n" + "I have " + numInstances() + " instances and "
				+ (numAttributes() - 1) + " conditional attributes.\r\n" + "My weights are: " + Arrays.toString(weights)
				+ "\r\n" + "My data are: \r\n" + super.toString();

		return resultString;
	} // Of toString

	/**
	 ****************** 
	 * For unit test.
	 * 
	 * @param args
	 *            Not provided.
	 ****************** 
	 */
	public static void main(String args[]) {
		WeightedInstances tempWeightedInstances = null;
		String tempFilename = "d:/data/iris.arff";
		try {
			FileReader tempFileReader = new FileReader(tempFilename);
			tempWeightedInstances = new WeightedInstances(tempFileReader);
			tempFileReader.close();
		} catch (Exception exception1) {
			System.out.println("Cannot read the file: " + tempFilename + "\r\n" + exception1);
			System.exit(0);
		} // Of try

		System.out.println(tempWeightedInstances.toString());

		tempWeightedInstances.adjustWeightsTest();
	} // Of main

} // Of class WeightedInstances

Day 64: Ensemble Learning with AdaBoosting (2. The Stump Classifier)

  1. A superclass is written to support different base classifiers. To reduce the amount of code, only the stump classifier is implemented here.
  2. A stump classifier simply splits the data into two piles at a cut point, which is extremely simple compared with a decision tree. Note that it handles real-valued data, while ID3 handles symbolic data. Its decision rule is given right below.
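
In symbols, with selected attribute $a$ and cut point $c$ (bestCut in the code), the stump's decision rule restates classify() in the code below:

$$h(x) = \begin{cases} \text{leftLeafLabel}, & x_a < c, \\ \text{rightLeafLabel}, & \text{otherwise.} \end{cases}$$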

The abstract classifier code.

package machinelearning.adaboosting;

import java.util.Random;

import weka.core.Instance;

/**
 * The super class of any simple classifier.
 * 
 * @author Fan Min minfanphd@163.com.
 */

public abstract class SimpleClassifier {

	/**
	 * The index of the current attribute.
	 */
	int selectedAttribute;

	/**
	 * Weighted data.
	 */
	WeightedInstances weightedInstances;

	/**
	 * The accuracy on the training set.
	 */
	double trainingAccuracy;

	/**
	 * The number of classes. For binary classification it is 2.
	 */
	int numClasses;

	/**
	 * The number of instances.
	 */
	int numInstances;

	/**
	 * The number of conditional attributes.
	 */
	int numConditions;

	/**
	 * For random number generation.
	 */
	Random random = new Random();

	/**
	 ****************** 
	 * The first constructor.
	 * 
	 * @param paraWeightedInstances
	 *            The given instances.
	 ****************** 
	 */
	public SimpleClassifier(WeightedInstances paraWeightedInstances) {
		weightedInstances = paraWeightedInstances;

		numConditions = weightedInstances.numAttributes() - 1;
		numInstances = weightedInstances.numInstances();
		numClasses = weightedInstances.classAttribute().numValues();
	}// Of the first constructor

	/**
	 ****************** 
	 * Train the classifier.
	 ****************** 
	 */
	public abstract void train();

	/**
	 ****************** 
	 * Classify an instance.
	 * 
	 * @param paraInstance
	 *            The given instance.
	 * @return Predicted label.
	 ****************** 
	 */
	public abstract int classify(Instance paraInstance);

	/**
	 ****************** 
	 * Which instances in the training set are correctly classified.
	 * 
	 * @return The correctness array.
	 ****************** 
	 */
	public boolean[] computeCorrectnessArray() {
		boolean[] resultCorrectnessArray = new boolean[weightedInstances.numInstances()];
		for (int i = 0; i < resultCorrectnessArray.length; i++) {
			Instance tempInstance = weightedInstances.instance(i);
			if ((int) (tempInstance.classValue()) == classify(tempInstance)) {
				resultCorrectnessArray[i] = true;
			} // Of if

			// System.out.print("\t" + classify(tempInstance));
		} // Of for i
			// System.out.println();
		return resultCorrectnessArray;
	}// Of computeCorrectnessArray

	/**
	 ****************** 
	 * Compute the accuracy on the training set.
	 * 
	 * @return The training accuracy.
	 ****************** 
	 */
	public double computeTrainingAccuracy() {
		double tempCorrect = 0;
		boolean[] tempCorrectnessArray = computeCorrectnessArray();
		for (int i = 0; i < tempCorrectnessArray.length; i++) {
			if (tempCorrectnessArray[i]) {
				tempCorrect++;
			} // Of if
		} // Of for i

		double resultAccuracy = tempCorrect / tempCorrectnessArray.length;

		return resultAccuracy;
	}// Of computeTrainingAccuracy

	/**
	 ****************** 
	 * Compute the weighted error on the training set. It is at least 1e-6 to
	 * avoid NaN.
	 * 
	 * @return The weighted error.
	 ****************** 
	 */
	public double computeWeightedError() {
		double resultError = 0;
		boolean[] tempCorrectnessArray = computeCorrectnessArray();
		for (int i = 0; i < tempCorrectnessArray.length; i++) {
			if (!tempCorrectnessArray[i]) {
				resultError += weightedInstances.getWeight(i);
			} // Of if
		} // Of for i

		if (resultError < 1e-6) {
			resultError = 1e-6;
		} // Of if

		return resultError;
	}// Of computeWeightedError
} // Of class SimpleClassifier

The stump classifier code.

package machinelearning.adaboosting;

import weka.core.Instance;
import java.io.FileReader;
import java.util.*;

/**
 * The stump classifier.<br>
 * 
 * @author Fan Min minfanphd@163.com.
 */
public class StumpClassifier extends SimpleClassifier {

	/**
	 * The best cut for the current attribute on weightedInstances.
	 */
	double bestCut;

	/**
	 * The class label for attribute value less than bestCut.
	 */
	int leftLeafLabel;

	/**
	 * The class label for attribute value no less than bestCut.
	 */
	int rightLeafLabel;

	/**
	 ****************** 
	 * The only constructor.
	 * 
	 * @param paraWeightedInstances
	 *            The given instances.
	 ****************** 
	 */
	public StumpClassifier(WeightedInstances paraWeightedInstances) {
		super(paraWeightedInstances);
	}// Of the only constructor

	/**
	 ****************** 
	 * Train the classifier.
	 ****************** 
	 */
	public void train() {
		// Step 1. Randomly choose an attribute.
		selectedAttribute = random.nextInt(numConditions);

		// Step 2. Find all attribute values and sort.
		double[] tempValuesArray = new double[numInstances];
		for (int i = 0; i < tempValuesArray.length; i++) {
			tempValuesArray[i] = weightedInstances.instance(i).value(selectedAttribute);
		} // Of for i
		Arrays.sort(tempValuesArray);

		// Step 3. Initialize: with the initial cut, classify all instances to
		// the same label.
		int tempNumLabels = numClasses;
		double[] tempLabelCountArray = new double[tempNumLabels];
		int tempCurrentLabel;

		// Step 3.1 Scan all labels to obtain their counts.
		for (int i = 0; i < numInstances; i++) {
			// The label of the ith instance
			tempCurrentLabel = (int) weightedInstances.instance(i).classValue();
			tempLabelCountArray[tempCurrentLabel] += weightedInstances.getWeight(i);
		} // Of for i

		// Step 3.2 Find the label with the maximal count.
		double tempMaxCorrect = 0;
		int tempBestLabel = -1;
		for (int i = 0; i < tempLabelCountArray.length; i++) {
			if (tempMaxCorrect < tempLabelCountArray[i]) {
				tempMaxCorrect = tempLabelCountArray[i];
				tempBestLabel = i;
			} // Of if
		} // Of for i

		// Step 3.3 The cut is a little bit smaller than the minimal value.
		bestCut = tempValuesArray[0] - 0.1;
		leftLeafLabel = tempBestLabel;
		rightLeafLabel = tempBestLabel;

		// Step 4. Check candidate cuts one by one.
		// Prepare the left/right label count matrix to handle multi-class data.
		double tempCut;
		double[][] tempLabelCountMatrix = new double[2][tempNumLabels];

		for (int i = 0; i < tempValuesArray.length - 1; i++) {
			// Step 4.1 Some attribute values are identical, ignore them.
			if (tempValuesArray[i] == tempValuesArray[i + 1]) {
				continue;
			} // Of if
			tempCut = (tempValuesArray[i] + tempValuesArray[i + 1]) / 2;

			// Step 4.2 Scan all labels to obtain their counts wrt. the cut.
			// Initialize again since it is used many times.
			for (int j = 0; j < 2; j++) {
				for (int k = 0; k < tempNumLabels; k++) {
					tempLabelCountMatrix[j][k] = 0;
				} // Of for k
			} // Of for j

			for (int j = 0; j < numInstances; j++) {
				// The label of the jth instance
				tempCurrentLabel = (int) weightedInstances.instance(j).classValue();
				if (weightedInstances.instance(j).value(selectedAttribute) < tempCut) {
					tempLabelCountMatrix[0][tempCurrentLabel] += weightedInstances.getWeight(j);
				} else {
					tempLabelCountMatrix[1][tempCurrentLabel] += weightedInstances.getWeight(j);
				} // Of if
			} // Of for j

			// Step 4.3 Left leaf.
			double tempLeftMaxCorrect = 0;
			int tempLeftBestLabel = 0;
			for (int j = 0; j < tempLabelCountMatrix[0].length; j++) {
				if (tempLeftMaxCorrect < tempLabelCountMatrix[0][j]) {
					tempLeftMaxCorrect = tempLabelCountMatrix[0][j];
					tempLeftBestLabel = j;
				} // Of if
			} // Of for j

			// Step 4.4 Right leaf.
			double tempRightMaxCorrect = 0;
			int tempRightBestLabel = 0;
			for (int j = 0; j < tempLabelCountMatrix[1].length; j++) {
				if (tempRightMaxCorrect < tempLabelCountMatrix[1][j]) {
					tempRightMaxCorrect = tempLabelCountMatrix[1][j];
					tempRightBestLabel = j;
				} // Of if
			} // Of for j

			// Step 4.5 Compare with the current best.
			if (tempMaxCorrect < tempLeftMaxCorrect + tempRightMaxCorrect) {
				tempMaxCorrect = tempLeftMaxCorrect + tempRightMaxCorrect;
				bestCut = tempCut;
				leftLeafLabel = tempLeftBestLabel;
				rightLeafLabel = tempRightBestLabel;
			} // Of if
		} // Of for i

		System.out.println("Attribute = " + selectedAttribute + ", cut = " + bestCut + ", leftLeafLabel = "
				+ leftLeafLabel + ", rightLeafLabel = " + rightLeafLabel);
	}// Of train

	/**
	 ****************** 
	 * Classify an instance.
	 * 
	 * @param paraInstance
	 *            The given instance.
	 * @return Predicted label.
	 ****************** 
	 */
	public int classify(Instance paraInstance) {
		int resultLabel = -1;
		if (paraInstance.value(selectedAttribute) < bestCut) {
			resultLabel = leftLeafLabel;
		} else {
			resultLabel = rightLeafLabel;
		} // Of if
		return resultLabel;
	}// Of classify

	/**
	 ****************** 
	 * For display.
	 ****************** 
	 */
	public String toString() {
		String resultString = "I am a stump classifier.\r\n" + "I choose attribute #" + selectedAttribute
				+ " with cut value " + bestCut + ".\r\n" + "The left and right leaf labels are " + leftLeafLabel
				+ " and " + rightLeafLabel + ", respectively.\r\n" + "My weighted error is: " + computeWeightedError()
				+ ".\r\n" + "My training accuracy is: " + computeTrainingAccuracy() + ".";

		return resultString;
	}// Of toString

	/**
	 ****************** 
	 * For unit test.
	 * 
	 * @param args
	 *            Not provided.
	 ****************** 
	 */
	public static void main(String args[]) {
		WeightedInstances tempWeightedInstances = null;
		String tempFilename = "D:/data/iris.arff";
		try {
			FileReader tempFileReader = new FileReader(tempFilename);
			tempWeightedInstances = new WeightedInstances(tempFileReader);
			tempFileReader.close();
		} catch (Exception ee) {
			System.out.println("Cannot read the file: " + tempFilename + "\r\n" + ee);
			System.exit(0);
		} // Of try

		StumpClassifier tempClassifier = new StumpClassifier(tempWeightedInstances);
		tempClassifier.train();
		System.out.println(tempClassifier);

		System.out.println(Arrays.toString(tempClassifier.computeCorrectnessArray()));
	}// Of main
}// Of class StumpClassifier

Day 65: Ensemble Learning with AdaBoosting (3. The Booster)

  1. The core code is in train(); the classifier weight it computes is explained after this list.
  2. To simplify the code, the training set is used directly for testing.
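
For reference, train() sets the weight of the $i$-th base classifier with weighted error $\epsilon_i$ to

$$\alpha_i = \frac{1}{2} \ln\left(\frac{1}{\epsilon_i} - 1\right) = \frac{1}{2} \ln\left(\frac{1 - \epsilon_i}{\epsilon_i}\right),$$

and classify() predicts by weighted voting: $H(x) = \arg\max_{c} \sum_{i} \alpha_i \, [h_i(x) = c]$.
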
package machinelearning.adaboosting;

import java.io.FileReader;
import weka.core.Instance;
import weka.core.Instances;

/**
 * The booster which ensembles base classifiers.
 * 
 * @author Fan Min minfanphd@163.com.
 */
public class Booster {

	/**
	 * Classifiers.
	 */
	SimpleClassifier[] classifiers;

	/**
	 * Number of classifiers.
	 */
	int numClassifiers;

	/**
	 * Whether or not stop after the training error is 0.
	 */
	boolean stopAfterConverge = false;

	/**
	 * The weights of classifiers.
	 */
	double[] classifierWeights;

	/**
	 * The training data.
	 */
	Instances trainingData;

	/**
	 * The testing data.
	 */
	Instances testingData;

	/**
	 ****************** 
	 * The first constructor. The testing set is the same as the training set.
	 * 
	 * @param paraTrainingFilename
	 *            The data filename.
	 ****************** 
	 */
	public Booster(String paraTrainingFilename) {
		// Step 1. Read training set.
		try {
			FileReader tempFileReader = new FileReader(paraTrainingFilename);
			trainingData = new Instances(tempFileReader);
			tempFileReader.close();
		} catch (Exception ee) {
			System.out.println("Cannot read the file: " + paraTrainingFilename + "\r\n" + ee);
			System.exit(0);
		} // Of try

		// Step 2. Set the last attribute as the class index.
		trainingData.setClassIndex(trainingData.numAttributes() - 1);

		// Step 3. The testing data is the same as the training data.
		testingData = trainingData;

		stopAfterConverge = true;

		System.out.println("****************Data**********\r\n" + trainingData);
	}// Of the first constructor

	/**
	 ****************** 
	 * Set the number of base classifiers, and allocate space for them.
	 * 
	 * @param paraNumBaseClassifiers
	 *            The number of base classifiers.
	 ****************** 
	 */
	public void setNumBaseClassifiers(int paraNumBaseClassifiers) {
		numClassifiers = paraNumBaseClassifiers;

		// Step 1. Allocate space (only reference) for classifiers
		classifiers = new SimpleClassifier[numClassifiers];

		// Step 2. Initialize classifier weights.
		classifierWeights = new double[numClassifiers];
	}// Of setNumBaseClassifiers

	/**
	 ****************** 
	 * Train the booster.
	 * 
	 * @see machinelearning.adaboosting.StumpClassifier#train()
	 ****************** 
	 */
	public void train() {
		// Step 1. Initialize.
		WeightedInstances tempWeightedInstances = null;
		double tempError;
		numClassifiers = 0;

		// Step 2. Build the classifiers one by one.
		for (int i = 0; i < classifiers.length; i++) {
			// Step 2.1 Key code: Construct or adjust the weightedInstances
			if (i == 0) {
				tempWeightedInstances = new WeightedInstances(trainingData);
			} else {
				// Adjust the weights of the data.
				tempWeightedInstances.adjustWeights(classifiers[i - 1].computeCorrectnessArray(),
						classifierWeights[i - 1]);
			} // Of if

			// Step 2.2 Train the next classifier.
			classifiers[i] = new StumpClassifier(tempWeightedInstances);
			classifiers[i].train();

			tempError = classifiers[i].computeWeightedError();

			// Key code: Set the classifier weight.
			classifierWeights[i] = 0.5 * Math.log(1 / tempError - 1);
			if (classifierWeights[i] < 1e-6) {
				classifierWeights[i] = 0;
			} // Of if

			System.out.println("Classifier #" + i + ", weighted error = " + tempError + ", weight = "
					+ classifierWeights[i] + "\r\n");

			numClassifiers++;

			// The accuracy is enough.
			if (stopAfterConverge) {
				double tempTrainingAccuracy = computeTrainingAccuracy();
				System.out.println("The accuracy of the booster is: " + tempTrainingAccuracy + "\r\n");
				if (tempTrainingAccuracy > 0.999999) {
					System.out.println("Stop at the round: " + i + " due to converge.\r\n");
					break;
				} // Of if
			} // Of if
		} // Of for i
	}// Of train

	/**
	 ****************** 
	 * Classify an instance.
	 * 
	 * @param paraInstance
	 *            The given instance.
	 * @return The predicted label.
	 ****************** 
	 */
	public int classify(Instance paraInstance) {
		double[] tempLabelsCountArray = new double[trainingData.classAttribute().numValues()];
		for (int i = 0; i < numClassifiers; i++) {
			int tempLabel = classifiers[i].classify(paraInstance);
			tempLabelsCountArray[tempLabel] += classifierWeights[i];
		} // Of for i

		int resultLabel = -1;
		double tempMax = -1;
		for (int i = 0; i < tempLabelsCountArray.length; i++) {
			if (tempMax < tempLabelsCountArray[i]) {
				tempMax = tempLabelsCountArray[i];
				resultLabel = i;
			} // Of if
		} // Of for

		return resultLabel;
	}// Of classify

	/**
	 ****************** 
	 * Test the booster on the training data.
	 * 
	 * @return The classification accuracy.
	 ****************** 
	 */
	public double test() {
		System.out.println("Testing on " + testingData.numInstances() + " instances.\r\n");

		return test(testingData);
	}// Of test

	/**
	 ****************** 
	 * Test the booster.
	 * 
	 * @param paraInstances
	 *            The testing set.
	 * @return The classification accuracy.
	 ****************** 
	 */
	public double test(Instances paraInstances) {
		double tempCorrect = 0;
		paraInstances.setClassIndex(paraInstances.numAttributes() - 1);

		for (int i = 0; i < paraInstances.numInstances(); i++) {
			Instance tempInstance = paraInstances.instance(i);
			if (classify(tempInstance) == (int) tempInstance.classValue()) {
				tempCorrect++;
			} // Of if
		} // Of for i

		double resultAccuracy = tempCorrect / paraInstances.numInstances();
		System.out.println("The accuracy is: " + resultAccuracy);

		return resultAccuracy;
	} // Of test

	/**
	 ****************** 
	 * Compute the training accuracy of the booster. It is not weighted.
	 * 
	 * @return The training accuracy.
	 ****************** 
	 */
	public double computeTrainingAccuracy() {
		double tempCorrect = 0;

		for (int i = 0; i < trainingData.numInstances(); i++) {
			if (classify(trainingData.instance(i)) == (int) trainingData.instance(i).classValue()) {
				tempCorrect++;
			} // Of if
		} // Of for i

		double tempAccuracy = tempCorrect / trainingData.numInstances();

		return tempAccuracy;
	}// Of computeTrainingAccuracy

	/**
	 ****************** 
	 * For integration test.
	 * 
	 * @param args
	 *            Not provided.
	 ****************** 
	 */
	public static void main(String args[]) {
		System.out.println("Starting AdaBoosting...");
		Booster tempBooster = new Booster("D:/data/iris.arff");
		// Booster tempBooster = new Booster("src/data/smalliris.arff");

		tempBooster.setNumBaseClassifiers(100);
		tempBooster.train();

		System.out.println("The training accuracy is: " + tempBooster.computeTrainingAccuracy());
		tempBooster.test();
	}// Of main

}// Of class Booster

Day 66: Active Learning with ALEC

ALEC is the source code of our first active learning paper:
Min Wang, Fan Min, Yan-Xue Wu, Zhi-Heng Zhang, Active learning through density clustering, Expert Systems with Applications 85 (2017) 305-317.
It is linked on our website as paper No. 31.
Some implementation details are simplified here for ease of understanding. For a more standard style and stronger adaptability, see our other paper:
Fan Min, Shi-Ming Zhang, Davide Ciucci, Min Wang, Three-way active learning through clustering selection, International Journal of Machine Learning and Cybernetics (2020) 1033-1046.
It is also available at the link above, as paper No. 54.
The TACS source code, including a GUI, is available as well.

A basic understanding of active learning can be obtained from the lecture "Active Learning: From Three-Way Decision to Cost Sensitivity".

The basic idea of cluster-based active learning is as follows:
Step 1. Sort the objects in descending order of representativeness;
Step 2. Suppose the current data block has $N$ objects. Select the most representative $\sqrt{N}$ of them and query their labels (classes).
Step 3. If these $\sqrt{N}$ labels belong to the same class, consider the block pure and classify all its other objects into that class. Stop.
Step 4. Otherwise, split the current block into two sub-blocks and process each recursively from Step 2.

  1. Today, only copy up to line 280.
  2. mergeSortToIndices is a flexible application of sorting in this paper and plays a key role. Take the chance to review merge sort.
  3. distance only implements the Euclidean distance, for simplicity.
  4. computeMaximalDistance obtains the diameter of the dataset.
  5. computeDensitiesGaussian uses a Gaussian kernel. The paper uses a cutoff distance dc instead, which leaves many objects with identical densities and makes their importance hard to distinguish; the Gaussian kernel avoids this problem effectively. The kernel formula is given after this list.
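
For reference, the Gaussian density computed in computeDensitiesGaussian() is

$$\rho_i = \sum_{j} \exp\left(-\frac{d(i, j)^2}{d_c^2}\right),$$

where $d_c$ is radius (a given ratio of the maximal distance) and $d(i, j)$ is the Euclidean distance.
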
package machinelearning.activelearning;

import java.io.FileReader;
import java.util.*;
import weka.core.Instances;

/**
 * Active learning through density clustering.
 * http://www.sciencedirect.com/science/article/pii/S095741741730369X
 * 
 * @author Fan Min minfanphd@163.com.
 */
public class Alec {
	/**
	 * The whole dataset.
	 */
	Instances dataset;

	/**
	 * The maximal number of queries that can be provided.
	 */
	int maxNumQuery;

	/**
	 * The actual number of queries.
	 */
	int numQuery;

	/**
	 * The radius, also dc in the paper. It is employed for density computation.
	 */
	double radius;

	/**
	 * The densities of instances, also rho in the paper.
	 */
	double[] densities;

	/**
	 * distanceToMaster
	 */
	double[] distanceToMaster;

	/**
	 * Sorted indices, where the first element indicates the instance with the
	 * biggest density.
	 */
	int[] descendantDensities;

	/**
	 * Priority
	 */
	double[] priority;

	/**
	 * The maximal distance between any pair of points.
	 */
	double maximalDistance;

	/**
	 * Who is my master?
	 */
	int[] masters;

	/**
	 * Predicted labels.
	 */
	int[] predictedLabels;

	/**
	 * Instance status. 0 for unprocessed, 1 for queried, 2 for classified.
	 */
	int[] instanceStatusArray;

	/**
	 * The indices of instances sorted in descending order of
	 * representativeness.
	int[] descendantRepresentatives;

	/**
	 * Indicate the cluster of each instance. It is only used in
	 * clusterInTwo(int[]);
	 */
	int[] clusterIndices;

	/**
	 * Blocks with size no more than this threshold should not be split further.
	 */
	int smallBlockThreshold = 3;

	/**
	 ********************************** 
	 * The constructor.
	 * 
	 * @param paraFilename
	 *            The data filename.
	 ********************************** 
	 */
	public Alec(String paraFilename) {
		try {
			FileReader tempReader = new FileReader(paraFilename);
			dataset = new Instances(tempReader);
			dataset.setClassIndex(dataset.numAttributes() - 1);
			tempReader.close();
		} catch (Exception ee) {
			System.out.println(ee);
			System.exit(0);
		} // Of try
		computeMaximalDistance();
		clusterIndices = new int[dataset.numInstances()];
	}// Of the constructor

	/**
	 ********************************** 
	 * Merge sort in descending order to obtain an index array. The original
	 * array is unchanged. The method should be tested further. <br>
	 * Examples: input [1.2, 2.3, 0.4, 0.5], output [1, 0, 3, 2]. <br>
	 * input [3.1, 5.2, 6.3, 2.1, 4.4], output [2, 1, 4, 0, 3].
	 * 
	 * @param paraArray
	 *            the original array
	 * @return The sorted indices.
	 ********************************** 
	 */
	public static int[] mergeSortToIndices(double[] paraArray) {
		int tempLength = paraArray.length;
		int[][] resultMatrix = new int[2][tempLength];// For merge sort.

		// Initialize
		int tempIndex = 0;
		for (int i = 0; i < tempLength; i++) {
			resultMatrix[tempIndex][i] = i;
		} // Of for i

		// Merge
		int tempCurrentLength = 1;
		// The indices for current merged groups.
		int tempFirstStart, tempSecondStart, tempSecondEnd;

		while (tempCurrentLength < tempLength) {
			// Divide into a number of groups.
			// The boundary handling adapts to array lengths that are not powers of 2.
			for (int i = 0; i < Math.ceil((tempLength + 0.0) / tempCurrentLength / 2); i++) {
				// Boundaries of the group
				tempFirstStart = i * tempCurrentLength * 2;

				tempSecondStart = tempFirstStart + tempCurrentLength;

				tempSecondEnd = tempSecondStart + tempCurrentLength - 1;
				if (tempSecondEnd >= tempLength) {
					tempSecondEnd = tempLength - 1;
				} // Of if

				// Merge this group
				int tempFirstIndex = tempFirstStart;
				int tempSecondIndex = tempSecondStart;
				int tempCurrentIndex = tempFirstStart;

				if (tempSecondStart >= tempLength) {
					for (int j = tempFirstIndex; j < tempLength; j++) {
						resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex
								% 2][j];
						tempFirstIndex++;
						tempCurrentIndex++;
					} // Of for j
					break;
				} // Of if

				while ((tempFirstIndex <= tempSecondStart - 1)
						&& (tempSecondIndex <= tempSecondEnd)) {

					if (paraArray[resultMatrix[tempIndex
							% 2][tempFirstIndex]] >= paraArray[resultMatrix[tempIndex
									% 2][tempSecondIndex]]) {
						resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex
								% 2][tempFirstIndex];
						tempFirstIndex++;
					} else {
						resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex
								% 2][tempSecondIndex];
						tempSecondIndex++;
					} // Of if
					tempCurrentIndex++;
				} // Of while

				// Remaining part
				for (int j = tempFirstIndex; j < tempSecondStart; j++) {
					resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex
							% 2][j];
					tempCurrentIndex++;
				} // Of for j
				for (int j = tempSecondIndex; j <= tempSecondEnd; j++) {
					resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex
							% 2][j];
					tempCurrentIndex++;
				} // Of for j
			} // Of for i

			tempCurrentLength *= 2;
			tempIndex++;
		} // Of while

		return resultMatrix[tempIndex % 2];
	}// Of mergeSortToIndices

	/**
	 *********************
	 * The Euclidean distance between two instances. Other distance measures
	 * unsupported for simplicity.
	 * 
	 * 
	 * @param paraI
	 *            The index of the first instance.
	 * @param paraJ
	 *            The index of the second instance.
	 * @return The distance.
	 *********************
	 */
	public double distance(int paraI, int paraJ) {
		double resultDistance = 0;
		double tempDifference;
		for (int i = 0; i < dataset.numAttributes() - 1; i++) {
			tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
			resultDistance += tempDifference * tempDifference;
		} // Of for i
		resultDistance = Math.sqrt(resultDistance);

		return resultDistance;
	}// Of distance

	/**
	 ********************************** 
	 * Compute the maximal distance. The result is stored in a member variable.
	 ********************************** 
	 */
	public void computeMaximalDistance() {
		maximalDistance = 0;
		double tempDistance;
		for (int i = 0; i < dataset.numInstances(); i++) {
			for (int j = 0; j < dataset.numInstances(); j++) {
				tempDistance = distance(i, j);
				if (maximalDistance < tempDistance) {
					maximalDistance = tempDistance;
				} // Of if
			} // Of for j
		} // Of for i

		System.out.println("maximalDistance = " + maximalDistance);
	}// Of computeMaximalDistance

	/**
	 ****************** 
	 * Compute the densities using Gaussian kernel.
	 ****************** 
	 */
	public void computeDensitiesGaussian() {
		System.out.println("radius = " + radius);
		densities = new double[dataset.numInstances()];
		double tempDistance;

		for (int i = 0; i < dataset.numInstances(); i++) {
			for (int j = 0; j < dataset.numInstances(); j++) {
				tempDistance = distance(i, j);
				densities[i] += Math.exp(-tempDistance * tempDistance / radius / radius);
			} // Of for j
		} // Of for i

		System.out.println("The densities are " + Arrays.toString(densities) + "\r\n");
	}// Of computeDensitiesGaussian

	/**
	 ********************************** 
	 * Compute distanceToMaster, the distance to its master.
	 ********************************** 
	 */
	public void computeDistanceToMaster() {
		distanceToMaster = new double[dataset.numInstances()];
		masters = new int[dataset.numInstances()];
		descendantDensities = new int[dataset.numInstances()];
		instanceStatusArray = new int[dataset.numInstances()];

		descendantDensities = mergeSortToIndices(densities);
		distanceToMaster[descendantDensities[0]] = maximalDistance;

		double tempDistance;
		for (int i = 1; i < dataset.numInstances(); i++) {
			// Initialize.
			distanceToMaster[descendantDensities[i]] = maximalDistance;
			for (int j = 0; j <= i - 1; j++) {
				tempDistance = distance(descendantDensities[i], descendantDensities[j]);
				if (distanceToMaster[descendantDensities[i]] > tempDistance) {
					distanceToMaster[descendantDensities[i]] = tempDistance;
					masters[descendantDensities[i]] = descendantDensities[j];
				} // Of if
			} // Of for j
		} // Of for i
		System.out.println("First compute, masters = " + Arrays.toString(masters));
		System.out.println("descendantDensities = " + Arrays.toString(descendantDensities));
	}// Of computeDistanceToMaster

	/**
	 ********************************** 
	 * Compute priority. Element with higher priority is more likely to be
	 * selected as a cluster center. Now it is rho * distanceToMaster. It can
	 * also be rho^alpha * distanceToMaster.
	 ********************************** 
	 */
	public void computePriority() {
		priority = new double[dataset.numInstances()];
		for (int i = 0; i < dataset.numInstances(); i++) {
			priority[i] = densities[i] * distanceToMaster[i];
		} // Of for i
	}// Of computePriority

	/**
	 ************************* 
	 * The block of a node should be same as its master. This recursive method
	 * is efficient.
	 * 
	 * @param paraIndex
	 *            The index of the given node.
	 * @return The cluster index of the current node.
	 ************************* 
	 */
	public int coincideWithMaster(int paraIndex) {
		if (clusterIndices[paraIndex] == -1) {
			int tempMaster = masters[paraIndex];
			clusterIndices[paraIndex] = coincideWithMaster(tempMaster);
		} // Of if

		return clusterIndices[paraIndex];
	}// Of coincideWithMaster

	/**
	 ************************* 
	 * Cluster a block in two. According to the master tree.
	 * 
	 * @param paraBlock
	 *            The given block.
	 * @return The new blocks where the two most represent instances serve as
	 *         the root.
	 ************************* 
	 */
	public int[][] clusterInTwo(int[] paraBlock) {
		// Reinitialize. In fact, only instances in the given block are
		// considered.
		Arrays.fill(clusterIndices, -1);

		// Initialize the cluster number of the two roots.
		for (int i = 0; i < 2; i++) {
			clusterIndices[paraBlock[i]] = i;
		} // Of for i

		for (int i = 0; i < paraBlock.length; i++) {
			if (clusterIndices[paraBlock[i]] != -1) {
				// Already have a cluster number.
				continue;
			} // Of if

			clusterIndices[paraBlock[i]] = coincideWithMaster(masters[paraBlock[i]]);
		} // Of for i

		// The sub blocks.
		int[][] resultBlocks = new int[2][];
		int tempFistBlockCount = 0;
		for (int i = 0; i < clusterIndices.length; i++) {
			if (clusterIndices[i] == 0) {
				tempFistBlockCount++;
			} // Of if
		} // Of for i
		resultBlocks[0] = new int[tempFistBlockCount];
		resultBlocks[1] = new int[paraBlock.length - tempFistBlockCount];

		// Copy. You can design shorter code when the number of clusters is
		// greater than 2.
		int tempFirstIndex = 0;
		int tempSecondIndex = 0;
		for (int i = 0; i < paraBlock.length; i++) {
			if (clusterIndices[paraBlock[i]] == 0) {
				resultBlocks[0][tempFirstIndex] = paraBlock[i];
				tempFirstIndex++;
			} else {
				resultBlocks[1][tempSecondIndex] = paraBlock[i];
				tempSecondIndex++;
			} // Of if
		} // Of for i

		System.out.println("Split (" + paraBlock.length + ") instances "
				+ Arrays.toString(paraBlock) + "\r\nto (" + resultBlocks[0].length + ") instances "
				+ Arrays.toString(resultBlocks[0]) + "\r\nand (" + resultBlocks[1].length
				+ ") instances " + Arrays.toString(resultBlocks[1]));
		return resultBlocks;
	}// Of clusterInTwo

	/**
	 ********************************** 
	 * Classify instances in the block by simple voting.
	 * 
	 * @param paraBlock
	 *            The given block.
	 ********************************** 
	 */
	public void vote(int[] paraBlock) {
		int[] tempClassCounts = new int[dataset.numClasses()];
		for (int i = 0; i < paraBlock.length; i++) {
			if (instanceStatusArray[paraBlock[i]] == 1) {
				tempClassCounts[(int) dataset.instance(paraBlock[i]).classValue()]++;
			} // Of if
		} // Of for i

		int tempMaxClass = -1;
		int tempMaxCount = -1;
		for (int i = 0; i < tempClassCounts.length; i++) {
			if (tempMaxCount < tempClassCounts[i]) {
				tempMaxClass = i;
				tempMaxCount = tempClassCounts[i];
			} // Of if
		} // Of for i

		// Classify unprocessed instances.
		for (int i = 0; i < paraBlock.length; i++) {
			if (instanceStatusArray[paraBlock[i]] == 0) {
				predictedLabels[paraBlock[i]] = tempMaxClass;
				instanceStatusArray[paraBlock[i]] = 2;
			} // Of if
		} // Of for i
	}// Of vote

	/**
	 ********************************** 
	 * Cluster-based active learning. Prepare for the recursive algorithm.
	 * 
	 * @param paraRatio
	 *            The ratio of the maximal distance as the dc.
	 * @param paraMaxNumQuery
	 *            The maximal number of queries for the whole dataset.
	 * @param paraSmallBlockThreshold The small block threshold.
	 ********************************** 
	 */
	public void clusterBasedActiveLearning(double paraRatio, int paraMaxNumQuery,
			int paraSmallBlockThreshold) {
		radius = maximalDistance * paraRatio;
		smallBlockThreshold = paraSmallBlockThreshold;

		maxNumQuery = paraMaxNumQuery;
		predictedLabels = new int[dataset.numInstances()];

		for (int i = 0; i < dataset.numInstances(); i++) {
			predictedLabels[i] = -1;
		} // Of for i

		computeDensitiesGaussian();
		computeDistanceToMaster();
		computePriority();
		descendantRepresentatives = mergeSortToIndices(priority);
		System.out.println(
				"descendantRepresentatives = " + Arrays.toString(descendantRepresentatives));

		numQuery = 0;
		clusterBasedActiveLearning(descendantRepresentatives);
	}// Of clusterBasedActiveLearning

	/**
	 ********************************** 
	 * Cluster based active learning.
	 * 
	 * @param paraBlock
	 *            The given block. This block must be sorted according to the
	 *            priority in descendant order.
	 ********************************** 
	 */
	public void clusterBasedActiveLearning(int[] paraBlock) {
		System.out.println("clusterBasedActiveLearning for block " + Arrays.toString(paraBlock));

		// Step 1. How many labels are queried for this block.
		int tempExpectedQueries = (int) Math.sqrt(paraBlock.length);
		int tempNumQuery = 0;
		for (int i = 0; i < paraBlock.length; i++) {
			if (instanceStatusArray[paraBlock[i]] == 1) {
				tempNumQuery++;
			} // Of if
		} // Of for i

		// Step 2. Vote for small blocks.
		if ((tempNumQuery >= tempExpectedQueries) && (paraBlock.length <= smallBlockThreshold)) {
			System.out.println("" + tempNumQuery + " instances are queried, vote for block: \r\n"
					+ Arrays.toString(paraBlock));
			vote(paraBlock);

			return;
		} // Of if

		// Step 3. Query enough labels.
		for (int i = 0; i < tempExpectedQueries; i++) {
			if (numQuery >= maxNumQuery) {
				System.out.println("No more queries are provided, numQuery = " + numQuery + ".");
				vote(paraBlock);
				return;
			} // Of if

			if (instanceStatusArray[paraBlock[i]] == 0) {
				instanceStatusArray[paraBlock[i]] = 1;
				predictedLabels[paraBlock[i]] = (int) dataset.instance(paraBlock[i]).classValue();
				// System.out.println("Query #" + paraBlock[i] + ", numQuery = "
				// + numQuery);
				numQuery++;
			} // Of if
		} // Of for i

		// Step 4. Pure?
		int tempFirstLabel = predictedLabels[paraBlock[0]];
		boolean tempPure = true;
		for (int i = 1; i < tempExpectedQueries; i++) {
			if (predictedLabels[paraBlock[i]] != tempFirstLabel) {
				tempPure = false;
				break;
			} // Of if
		} // Of for i
		if (tempPure) {
			System.out.println("Classify for pure block: " + Arrays.toString(paraBlock));
			for (int i = tempExpectedQueries; i < paraBlock.length; i++) {
				if (instanceStatusArray[paraBlock[i]] == 0) {
					predictedLabels[paraBlock[i]] = tempFirstLabel;
					instanceStatusArray[paraBlock[i]] = 2;
				} // Of if
			} // Of for i
			return;
		} // Of if

		// Step 5. Split in two and process them independently.
		int[][] tempBlocks = clusterInTwo(paraBlock);
		for (int i = 0; i < 2; i++) {
			// Attention: recursive invoking here.
			clusterBasedActiveLearning(tempBlocks[i]);
		} // Of for i
	}// Of clusterBasedActiveLearning

	/**
	 ******************* 
	 * Show the statistics information.
	 ******************* 
	 */
	public String toString() {
		int[] tempStatusCounts = new int[3];
		double tempCorrect = 0;
		for (int i = 0; i < dataset.numInstances(); i++) {
			tempStatusCounts[instanceStatusArray[i]]++;
			if (predictedLabels[i] == (int) dataset.instance(i).classValue()) {
				tempCorrect++;
			} // Of if
		} // Of for i

		String resultString = "(unhandled, queried, classified) = "
				+ Arrays.toString(tempStatusCounts);
		resultString += "\r\nCorrect = " + tempCorrect + ", accuracy = "
				+ (tempCorrect / dataset.numInstances());

		return resultString;
	}// Of toString

	/**
	 ********************************** 
	 * The entrance of the program.
	 * 
	 * @param args
	 *            Not used now.
	 ********************************** 
	 */
	public static void main(String[] args) {
		long tempStart = System.currentTimeMillis();

		System.out.println("Starting ALEC.");
		String arffFilename = "D:/data/iris.arff";
		// String arffFilename = "D:/data/mushroom.arff";

		Alec tempAlec = new Alec(arffFilename);
		// The settings for iris
		tempAlec.clusterBasedActiveLearning(0.15, 30, 3);
		// The settings for mushroom
		// tempAlec.clusterBasedActiveLearning(0.1, 800, 3);
		System.out.println(tempAlec);

		long tempEnd = System.currentTimeMillis();
		System.out.println("Runtime: " + (tempEnd - tempStart) + "ms.");
	}// Of main
}// Of class Alec

Day 67: Active Learning with ALEC (continued)

Finish copying the program today and develop an initial understanding.

  1. computeDistanceToMaster is the core of density clustering. A node's master is the nearest node among those with higher density. The farther a node is from its master, the more independent it is.
  2. computePriority combines density (capability) and distance (independence). The larger their product, the more representative the node (object).
  3. coincideWithMaster is used in the clustering algorithm; tracing it on an example is the easiest way to understand it (a trace is given after this list). In short, a node should share the cluster number of its master.
  4. clusterInTwo splits one block into two, whose roots are the first and the second elements respectively (note that every block is sorted in descending order of representativeness).
  5. vote classifies the remaining objects of a block by voting with the labels already queried.
  6. clusterBasedActiveLearning(double, int, int) provides the initialization for the core algorithm.
  7. clusterBasedActiveLearning(int[]) is the core algorithm. It is recursive, and each case must be handled carefully.
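
The following hypothetical trace of coincideWithMaster uses a made-up masters array for illustration:

	// Assume the block {0, 1, 2, 3, 4} with masters = {0, 0, 0, 2, 2}.
	// clusterInTwo first marks the two roots: clusterIndices = {0, 1, -1, -1, -1}.
	// coincideWithMaster(4): clusterIndices[4] == -1, so recurse on masters[4] = 2.
	//   coincideWithMaster(2): clusterIndices[2] == -1, so recurse on masters[2] = 0.
	//     coincideWithMaster(0): clusterIndices[0] == 0, so return 0.
	//   Unwinding the recursion: clusterIndices[2] = 0, then clusterIndices[4] = 0.
	// Thus nodes 2 and 4 join the cluster rooted at node 0, as their masters do.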

Day 68: Active Learning with ALEC (continued)

Continue working on understanding, especially revisiting the roles of the member variables.

  1. descendantDensities sorts the objects by density in descending order. The object indicated by the first element of this array has the maximal density.
  2. descendantRepresentatives sorts the objects by representativeness in descending order. When picking samples for a block, we simply select from front to back in this order, which greatly simplifies the program.
  3. Experiment with different datasets.

Day 69: Matrix Factorization

Matrix factorization is an important algorithm for recommender systems. It can also be used in many other places.

  1. A triple is used to store one data record, similar to the approach in MBR.
  2. The subspace update code is the core; the update rule is given after this list.
  3. Testing is done on the training set, so the fit is good (MAE = 0.51).
  4. As a basic exercise, the regularization term is not considered.
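
For reference, updateNoRegular() performs one pass of stochastic gradient descent on the squared error without regularization. With prediction $\hat{r}_{ui} = \mathbf{u}_u \cdot \mathbf{v}_i$ and residual $e = r_{ui} - \hat{r}_{ui}$, each rating triggers

$$\mathbf{u}_u \leftarrow \mathbf{u}_u + \alpha \cdot 2e\,\mathbf{v}_i, \qquad \mathbf{v}_i \leftarrow \mathbf{v}_i + \alpha \cdot 2e\,\mathbf{u}_u,$$

where $\alpha$ is the learning rate (alpha in the code).
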
package machinelearning.recommendersystem;

import java.io.*;
import java.util.Random;

/**
 * Matrix factorization for recommender systems.
 * 
 * @author Fan Min minfanphd@163.com.
 */

public class MatrixFactorization {
	/**
	 * Used to generate random numbers.
	 */
	Random rand = new Random();

	/**
	 * Number of users.
	 */
	int numUsers;

	/**
	 * Number of items.
	 */
	int numItems;

	/**
	 * Number of ratings.
	 */
	int numRatings;

	/**
	 * Training data.
	 */
	Triple[] dataset;

	/**
	 * The learning rate for gradient descent.
	 */
	double alpha;

	/**
	 * The regularization coefficient. It is unused here since regularization
	 * is not considered.
	 */
	double lambda;

	/**
	 * The low rank of the small matrices.
	 */
	int rank;

	/**
	 * The user matrix U.
	 */
	double[][] userSubspace;

	/**
	 * The item matrix V.
	 */
	double[][] itemSubspace;

	/**
	 * The lower bound of the rating value.
	 */
	double ratingLowerBound;

	/**
	 * The upper bound of the rating value.
	 */
	double ratingUpperBound;

	/**
	 ************************ 
	 * The first constructor.
	 * 
	 * @param paraFilename
	 *            The data filename.
	 * @param paraNumUsers
	 *            The number of users.
	 * @param paraNumItems
	 *            The number of items.
	 * @param paraNumRatings
	 *            The number of ratings.
	 ************************ 
	 */
	public MatrixFactorization(String paraFilename, int paraNumUsers, int paraNumItems,
			int paraNumRatings, double paraRatingLowerBound, double paraRatingUpperBound) {
		numUsers = paraNumUsers;
		numItems = paraNumItems;
		numRatings = paraNumRatings;
		ratingLowerBound = paraRatingLowerBound;
		ratingUpperBound = paraRatingUpperBound;

		try {
			readData(paraFilename, paraNumUsers, paraNumItems, paraNumRatings);
			// adjustUsingMeanRating();
		} catch (Exception ee) {
			System.out.println("File " + paraFilename + " cannot be read! " + ee);
			System.exit(0);
		} // Of try
	}// Of the first constructor

	/**
	 ************************ 
	 * Set parameters.
	 * 
	 * @param paraRank
	 *            The given rank.
	 * @param paraAlpha
	 *            The learning rate.
	 * @param paraLambda
	 *            The regularization coefficient.
	 ************************ 
	 */
	public void setParameters(int paraRank, double paraAlpha, double paraLambda) {
		rank = paraRank;
		alpha = paraAlpha;
		lambda = paraLambda;
	}// Of setParameters

	/**
	 ************************ 
	 * Read the data from the file.
	 * 
	 * @param paraFilename
	 *            The given file.
	 * @throws IOException
	 ************************ 
	 */
	public void readData(String paraFilename, int paraNumUsers, int paraNumItems,
			int paraNumRatings) throws IOException {
		File tempFile = new File(paraFilename);
		if (!tempFile.exists()) {
			System.out.println("File " + paraFilename + " does not exists.");
			System.exit(0);
		} // Of if
		BufferedReader tempBufferReader = new BufferedReader(new FileReader(tempFile));

		// Allocate space.
		dataset = new Triple[paraNumRatings];
		String tempString;
		String[] tempStringArray;
		for (int i = 0; i < paraNumRatings; i++) {
			tempString = tempBufferReader.readLine();
			tempStringArray = tempString.split(",");
			dataset[i] = new Triple(Integer.parseInt(tempStringArray[0]),
					Integer.parseInt(tempStringArray[1]), Double.parseDouble(tempStringArray[2]));
		} // Of for i

		tempBufferReader.close();
	}// Of readData

	/**
	 ************************ 
	 * Initialize subspaces. Each value is in [0, 1].
	 ************************ 
	 */
	void initializeSubspaces() {
		userSubspace = new double[numUsers][rank];

		for (int i = 0; i < numUsers; i++) {
			for (int j = 0; j < rank; j++) {
				userSubspace[i][j] = rand.nextDouble();
			} // Of for j
		} // Of for i

		itemSubspace = new double[numItems][rank];
		for (int i = 0; i < numItems; i++) {
			for (int j = 0; j < rank; j++) {
				itemSubspace[i][j] = rand.nextDouble();
			} // Of for j
		} // Of for i
	}// Of initializeSubspaces

	/**
	 ************************ 
	 * Predict the rating of the user to the item
	 * 
	 * @param paraUser
	 *            The user index.
	 ************************ 
	 */
	public double predict(int paraUser, int paraItem) {
		double resultValue = 0;
		for (int i = 0; i < rank; i++) {
			// The row vector of an user and the column vector of an item
			resultValue += userSubspace[paraUser][i] * itemSubspace[paraItem][i];
		} // Of for i
		return resultValue;
	}// Of predict

	/**
	 ************************ 
	 * Train.
	 * 
	 * @param paraRounds
	 *            The number of rounds.
	 ************************ 
	 */
	public void train(int paraRounds) {
		initializeSubspaces();

		for (int i = 0; i < paraRounds; i++) {
			updateNoRegular();
			if (i % 50 == 0) {
				// Show the process
				System.out.println("Round " + i);
				System.out.println("MAE: " + mae());
			} // Of if
		} // Of for i
	}// Of train

	/**
	 ************************ 
	 * Update sub-spaces using the training data.
	 ************************ 
	 */
	public void updateNoRegular() {
		for (int i = 0; i < numRatings; i++) {
			int tempUserId = dataset[i].user;
			int tempItemId = dataset[i].item;
			double tempRate = dataset[i].rating;

			double tempResidual = tempRate - predict(tempUserId, tempItemId); // Residual

			// Update user subspace
			double tempValue = 0;
			for (int j = 0; j < rank; j++) {
				tempValue = 2 * tempResidual * itemSubspace[tempItemId][j];
				userSubspace[tempUserId][j] += alpha * tempValue;
			} // Of for j

			// Update item subspace
			for (int j = 0; j < rank; j++) {
				tempValue = 2 * tempResidual * userSubspace[tempUserId][j];

				itemSubspace[tempItemId][j] += alpha * tempValue;
			} // Of for j
		} // Of for i
	}// Of updateNoRegular

	/**
	 ************************ 
	 * Compute the RSME.
	 * 
	 * @return RSME of the current factorization.
	 ************************ 
	 */
	public double rsme() {
		double resultRsme = 0;
		int tempTestCount = 0;

		for (int i = 0; i < numRatings; i++) {
			int tempUserIndex = dataset[i].user;
			int tempItemIndex = dataset[i].item;
			double tempRate = dataset[i].rating;

			double tempPrediction = predict(tempUserIndex, tempItemIndex);// +
																			// DataInfo.mean_rating;

			if (tempPrediction < ratingLowerBound) {
				tempPrediction = ratingLowerBound;
			} else if (tempPrediction > ratingUpperBound) {
				tempPrediction = ratingUpperBound;
			} // Of if

			double tempError = tempRate - tempPrediction;
			resultRsme += tempError * tempError;
			tempTestCount++;
		} // Of for i

		return Math.sqrt(resultRsme / tempTestCount);
	}// Of rsme

	/**
	 ************************ 
	 * Compute the MAE.
	 * 
	 * @return MAE of the current factorization.
	 ************************ 
	 */
	public double mae() {
		double resultMae = 0;
		int tempTestCount = 0;

		for (int i = 0; i < numRatings; i++) {
			int tempUserIndex = dataset[i].user;
			int tempItemIndex = dataset[i].item;
			double tempRate = dataset[i].rating;

			double tempPrediction = predict(tempUserIndex, tempItemIndex);

			if (tempPrediction < ratingLowerBound) {
				tempPrediction = ratingLowerBound;
			} // Of if
			if (tempPrediction > ratingUpperBound) {
				tempPrediction = ratingUpperBound;
			} // Of if

			double tempError = tempRate - tempPrediction;

			resultMae += Math.abs(tempError);
			// System.out.println("resultMae: " + resultMae);
			tempTestCount++;
		} // Of for i

		return (resultMae / tempTestCount);
	}// Of mae

	/**
	 ************************ 
	 * Train on the given data and report the final MAE and RSME.
	 ************************ 
	 */
	public static void testTrainingTesting(String paraFilename, int paraNumUsers, int paraNumItems,
			int paraNumRatings, double paraRatingLowerBound, double paraRatingUpperBound,
			int paraRounds) {
		try {
			// Step 1. read the training and testing data
			MatrixFactorization tempMF = new MatrixFactorization(paraFilename, paraNumUsers,
					paraNumItems, paraNumRatings, paraRatingLowerBound, paraRatingUpperBound);

			tempMF.setParameters(5, 0.0001, 0.005);

			// Step 3. update and predict
			System.out.println("Begin Training ! ! !");
			tempMF.train(paraRounds);

			double tempMAE = tempMF.mae();
			double tempRSME = tempMF.rsme();
			System.out.println("Finally, MAE = " + tempMAE + ", RSME = " + tempRSME);
		} catch (Exception e) {
			e.printStackTrace();
		} // Of try
	}// Of testTrainingTesting

	/**
	 ************************ 
	 * @param args
	 ************************ 
	 */
	public static void main(String args[]) {
		testTrainingTesting("D:/data/movielens-943u1682m.txt", 943, 1682, 10000, 1, 5, 2000);
	}// Of main

	public class Triple {
		public int user;
		public int item;
		public double rating;

		/**
		 *********************
		 * The constructor.
		 *********************
		 */
		public Triple() {
			user = -1;
			item = -1;
			rating = -1;
		}// Of the first constructor

		/**
		 *********************
		 * The constructor.
		 *********************
		 */
		public Triple(int paraUser, int paraItem, double paraRating) {
			user = paraUser;
			item = paraItem;
			rating = paraRating;
		}// Of the second constructor

		/**
		 *********************
		 * Show me.
		 *********************
		 */
		public String toString() {
			return "" + user + ", " + item + ", " + rating;
		}// Of toString
	}// Of class Triple

}// Of class MatrixFactorization

Day 70: Matrix Factorization (continued)

There is actually not much to continue. Yesterday's code was a bit longer, so keep digesting it today, and write yourself a short summary.
