Day 61: Decision Trees (1. Preparation)
The decision tree is the most classical machine learning algorithm; actually I am reluctant to append "one of" to that. It has very good interpretability.
Today you can copy up to line 284; copy the rest tomorrow.
There is only one copy of the data. Each data subset produced by a split only needs to store two arrays, availableInstances and availableAttributes.
There are two constructors: one reads the data file to obtain the root node, and the other builds a child node from a data split.
Judge whether a dataset is pure, i.e., whether all its class labels are identical; if so, no further split is needed.
Every node (including non-leaf nodes) needs a label, so that an unseen attribute value can still be classified directly. This label is obtained by voting, i.e., getMajorityClass().
Maximizing the information gain is equivalent to minimizing the conditional entropy.
A block produced by a split may be empty; in that case use a length-0 array rather than null.
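For reference, if attribute $A$ splits the data $D$ into blocks $D_1, \dots, D_V$, the quantity computed by conditionalEntropy() below is
$$H(D \mid A) = \sum_{v=1}^{V} \frac{|D_v|}{|D|} \Big( -\sum_{k} p_{vk} \ln p_{vk} \Big),$$
where $p_{vk}$ is the fraction of class $k$ within block $D_v$. Since the information gain is $g(D, A) = H(D) - H(D \mid A)$ and $H(D)$ does not depend on $A$, maximizing the gain amounts to minimizing $H(D \mid A)$; this is why selectBestAttribute() only minimizes the conditional entropy.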
package machinelearning.decisiontree;
import java.io.FileReader;
import java.util.Arrays;
import weka.core.*;
/**
* The ID3 decision tree inductive algorithm.
*
* @author Fan Min minfanphd@163.com.
*/
public class ID3 {
/**
* The data.
*/
Instances dataset;
/**
* Is this dataset pure (only one label)?
*/
boolean pure;
/**
* The number of classes. For binary classification it is 2.
*/
int numClasses;
/**
* Available instances. Other instances do not belong to this branch.
*/
int[] availableInstances;
/**
* Available attributes. Other attributes have been selected in the path
* from the root.
*/
int[] availableAttributes;
/**
* The selected attribute.
*/
int splitAttribute;
/**
* The children nodes.
*/
ID3[] children;
/**
* My label. Inner nodes also have a label. For example, <outlook = sunny,
* humidity = high> never appears in the training data, but <humidity = high>
* is valid in other cases.
*/
int label;
/**
* The prediction, including queried and predicted labels.
*/
int[] predicts;
/**
* Small blocks cannot be split further.
*/
static int smallBlockThreshold = 3;
/**
********************
* The constructor.
*
* @param paraFilename
* The given file.
********************
*/
public ID3(String paraFilename) {
dataset = null;
try {
FileReader fileReader = new FileReader(paraFilename);
dataset = new Instances(fileReader);
fileReader.close();
} catch (Exception ee) {
System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
System.exit(0);
} // Of try
dataset.setClassIndex(dataset.numAttributes() - 1);
numClasses = dataset.classAttribute().numValues();
availableInstances = new int[dataset.numInstances()];
for (int i = 0; i < availableInstances.length; i++) {
availableInstances[i] = i;
} // Of for i
availableAttributes = new int[dataset.numAttributes() - 1];
for (int i = 0; i < availableAttributes.length; i++) {
availableAttributes[i] = i;
} // Of for i
// Initialize.
children = null;
// Determine the label by simple voting.
label = getMajorityClass(availableInstances);
// Determine whether or not it is pure.
pure = pureJudge(availableInstances);
}// Of the first constructor
/**
********************
* The constructor.
*
* @param paraDataset
* The given dataset.
********************
*/
public ID3(Instances paraDataset, int[] paraAvailableInstances, int[] paraAvailableAttributes) {
// Copy its reference instead of clone the availableInstances.
dataset = paraDataset;
availableInstances = paraAvailableInstances;
availableAttributes = paraAvailableAttributes;
// Initialize.
children = null;
// Determine the label by simple voting.
label = getMajorityClass(availableInstances);
// Determine whether or not it is pure.
pure = pureJudge(availableInstances);
}// Of the second constructor
/**
**********************************
* Is the given block pure?
*
* @param paraBlock
* The block.
* @return True if pure.
**********************************
*/
public boolean pureJudge(int[] paraBlock) {
pure = true;
for (int i = 1; i < paraBlock.length; i++) {
if (dataset.instance(paraBlock[i]).classValue() != dataset.instance(paraBlock[0])
.classValue()) {
pure = false;
break;
} // Of if
} // Of for i
return pure;
}// Of pureJudge
/**
**********************************
* Compute the majority class of the given block for voting.
*
* @param paraBlock
* The block.
* @return The majority class.
**********************************
*/
public int getMajorityClass(int[] paraBlock) {
int[] tempClassCounts = new int[dataset.numClasses()];
for (int i = 0; i < paraBlock.length; i++) {
tempClassCounts[(int) dataset.instance(paraBlock[i]).classValue()]++;
} // Of for i
int resultMajorityClass = -1;
int tempMaxCount = -1;
for (int i = 0; i < tempClassCounts.length; i++) {
if (tempMaxCount < tempClassCounts[i]) {
resultMajorityClass = i;
tempMaxCount = tempClassCounts[i];
} // Of if
} // Of for i
return resultMajorityClass;
}// Of getMajorityClass
/**
**********************************
* Select the best attribute.
*
* @return The best attribute index.
**********************************
*/
public int selectBestAttribute() {
splitAttribute = -1;
double tempMinimalEntropy = 10000;
double tempEntropy;
for (int i = 0; i < availableAttributes.length; i++) {
tempEntropy = conditionalEntropy(availableAttributes[i]);
if (tempMinimalEntropy > tempEntropy) {
tempMinimalEntropy = tempEntropy;
splitAttribute = availableAttributes[i];
} // Of if
} // Of for i
return splitAttribute;
}// Of selectBestAttribute
/**
**********************************
* Compute the conditional entropy of an attribute.
*
* @param paraAttribute
* The given attribute.
*
* @return The entropy.
**********************************
*/
public double conditionalEntropy(int paraAttribute) {
// Step 1. Statistics.
int tempNumClasses = dataset.numClasses();
int tempNumValues = dataset.attribute(paraAttribute).numValues();
int tempNumInstances = availableInstances.length;
double[] tempValueCounts = new double[tempNumValues];
double[][] tempCountMatrix = new double[tempNumValues][tempNumClasses];
int tempClass, tempValue;
for (int i = 0; i < tempNumInstances; i++) {
tempClass = (int) dataset.instance(availableInstances[i]).classValue();
tempValue = (int) dataset.instance(availableInstances[i]).value(paraAttribute);
tempValueCounts[tempValue]++;
tempCountMatrix[tempValue][tempClass]++;
} // Of for i
// Step 2.
double resultEntropy = 0;
double tempEntropy, tempFraction;
for (int i = 0; i < tempNumValues; i++) {
if (tempValueCounts[i] == 0) {
continue;
} // Of if
tempEntropy = 0;
for (int j = 0; j < tempNumClasses; j++) {
tempFraction = tempCountMatrix[i][j] / tempValueCounts[i];
if (tempFraction == 0) {
continue;
} // Of if
tempEntropy += -tempFraction * Math.log(tempFraction);
} // Of for j
resultEntropy += tempValueCounts[i] / tempNumInstances * tempEntropy;
} // Of for i
return resultEntropy;
}// Of conditionalEntropy
/**
**********************************
* Split the data according to the given attribute.
*
* @return The blocks.
**********************************
*/
public int[][] splitData(int paraAttribute) {
int tempNumValues = dataset.attribute(paraAttribute).numValues();
// System.out.println("Dataset " + dataset + "\r\n");
// System.out.println("Attribute " + paraAttribute + " has " +
// tempNumValues + " values.\r\n");
int[][] resultBlocks = new int[tempNumValues][];
int[] tempSizes = new int[tempNumValues];
// First scan to count the size of each block.
int tempValue;
for (int i = 0; i < availableInstances.length; i++) {
tempValue = (int) dataset.instance(availableInstances[i]).value(paraAttribute);
tempSizes[tempValue]++;
} // Of for i
// Allocate space.
for (int i = 0; i < tempNumValues; i++) {
resultBlocks[i] = new int[tempSizes[i]];
} // Of for i
// Second scan to fill.
Arrays.fill(tempSizes, 0);
for (int i = 0; i < availableInstances.length; i++) {
tempValue = (int) dataset.instance(availableInstances[i]).value(paraAttribute);
// Copy data.
resultBlocks[tempValue][tempSizes[tempValue]] = availableInstances[i];
tempSizes[tempValue]++;
} // Of for i
return resultBlocks;
}// Of splitData
/**
**********************************
* Build the tree recursively.
**********************************
*/
public void buildTree() {
if (pureJudge(availableInstances)) {
return;
} // Of if
if (availableInstances.length <= smallBlockThreshold) {
return;
} // Of if
selectBestAttribute();
int[][] tempSubBlocks = splitData(splitAttribute);
children = new ID3[tempSubBlocks.length];
// Construct the remaining attribute set.
int[] tempRemainingAttributes = new int[availableAttributes.length - 1];
for (int i = 0; i < availableAttributes.length; i++) {
if (availableAttributes[i] < splitAttribute) {
tempRemainingAttributes[i] = availableAttributes[i];
} else if (availableAttributes[i] > splitAttribute) {
tempRemainingAttributes[i - 1] = availableAttributes[i];
} // Of if
} // Of for i
// Construct children.
for (int i = 0; i < children.length; i++) {
if ((tempSubBlocks[i] == null) || (tempSubBlocks[i].length == 0)) {
children[i] = null;
continue;
} else {
// System.out.println("Building children #" + i + " with
// instances " + Arrays.toString(tempSubBlocks[i]));
children[i] = new ID3(dataset, tempSubBlocks[i], tempRemainingAttributes);
// Important code: do this recursively
children[i].buildTree();
} // Of if
} // Of for i
}// Of buildTree
/**
**********************************
* Classify an instance.
*
* @param paraInstance
* The given instance.
* @return The prediction.
**********************************
*/
public int classify(Instance paraInstance) {
if (children == null) {
return label;
} // Of if
ID3 tempChild = children[(int) paraInstance.value(splitAttribute)];
if (tempChild == null) {
return label;
} // Of if
return tempChild.classify(paraInstance);
}// Of classify
/**
**********************************
* Test on a testing set.
*
* @param paraDataset
* The given testing data.
* @return The accuracy.
**********************************
*/
public double test(Instances paraDataset) {
double tempCorrect = 0;
for (int i = 0; i < paraDataset.numInstances(); i++) {
if (classify(paraDataset.instance(i)) == (int) paraDataset.instance(i).classValue()) {
tempCorrect++;
} // Of if
} // Of for i
return tempCorrect / paraDataset.numInstances();
}// Of test
/**
**********************************
* Test on the training set.
*
* @return The accuracy.
**********************************
*/
public double selfTest() {
return test(dataset);
}// Of selfTest
/**
*******************
* Overrides the method claimed in Object.
*
* @return The tree structure.
*******************
*/
public String toString() {
String resultString = "";
String tempAttributeName = dataset.attribute(splitAttribute).name();
if (children == null) {
resultString += "class = " + label;
} else {
for (int i = 0; i < children.length; i++) {
if (children[i] == null) {
resultString += tempAttributeName + " = "
+ dataset.attribute(splitAttribute).value(i) + ":" + "class = " + label
+ "\r\n";
} else {
resultString += tempAttributeName + " = "
+ dataset.attribute(splitAttribute).value(i) + ":" + children[i]
+ "\r\n";
} // Of if
} // Of for i
} // Of if
return resultString;
}// Of toString
/**
*************************
* Test this class.
*************************
*/
public static void id3Test() {
ID3 tempID3 = new ID3("D:/data/weather.arff");
// ID3 tempID3 = new ID3("D:/data/mushroom.arff");
ID3.smallBlockThreshold = 3;
tempID3.buildTree();
System.out.println("The tree is: \r\n" + tempID3);
double tempAccuracy = tempID3.selfTest();
System.out.println("The accuracy is: " + tempAccuracy);
}// Of id3Test
/**
*************************
* Test this class.
*
* @param args
* Not used now.
*************************
*/
public static void main(String[] args) {
id3Test();
}// Of main
}// Of class ID3
Running it I hit many small problems that I could not fix, so I studied the theoretical foundations instead.
Day 62: Decision Trees (2. Tree Building and Classification)
Finish copying the code listed yesterday.
Building a decision tree is a recursive process; the design of the parameters is the core.
Classification, classify(), is also a recursive process.
For now we only test on the training set; designing a separate testing set would not be difficult either.
toString() is recursive as well, but the printed tree format is not very pretty.
iris.arff and mushroom.arff can be downloaded from https://github.com/FanSmale/sampledata/. In case access is unreliable, just copy the content below and save it as weather.arff.
@relation weather
@attribute Outlook {Sunny, Overcast, Rain}
@attribute Temperature {Hot, Mild, Cool}
@attribute Humidity {High, Normal, Low}
@attribute Windy {FALSE, TRUE}
@attribute Play {N, P}
@data
Sunny,Hot,High,FALSE,N
Sunny,Hot,High,TRUE,N
Overcast,Hot,High,FALSE,P
Rain,Mild,High,FALSE,P
Rain,Cool,Normal,FALSE,P
Rain,Cool,Normal,TRUE,N
Overcast,Cool,Normal,TRUE,P
Sunny,Mild,High,FALSE,N
Sunny,Cool,Normal,FALSE,P
Rain,Mild,Normal,FALSE,P
Sunny,Mild,Normal,TRUE,P
Overcast,Mild,High,TRUE,P
Overcast,Hot,Normal,FALSE,P
Rain,Mild,High,TRUE,N
Day 63: Ensemble Learning: AdaBoosting (1. The Weighted Dataset)
Start by reading a post on AdaBoosting to learn the basic algorithm.
In an ordinary dataset every record is equally important; a weighted dataset assigns each object a weight.
adjustWeights() is written according to the corresponding formula and is the core code.
There are still many simple methods; they play a foundational role.
More complete code is available at https://github.com/fansmale/mfadaboosting.
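Concretely, adjustWeights() below multiplies the weight of every misclassified instance by $e^{\alpha}$, divides the weight of every correctly classified instance by $e^{\alpha}$, and then normalizes:
$$w_i \leftarrow w_i \cdot e^{\alpha} \ \text{(wrong)}, \qquad w_i \leftarrow w_i / e^{\alpha} \ \text{(correct)}, \qquad w_i \leftarrow \frac{w_i}{\sum_j w_j},$$
where $\alpha$ is the weight (say) of the classifier trained in the previous round.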
package machinelearning.adaboosting;
import java.io.FileReader;
import java.util.Arrays;
import weka.core.Instances;
/**
* Weighted instances.<br>
*
* @author Fan Min minfanphd@163.com.
*/
public class WeightedInstances extends Instances {
/**
* Just the requirement of some classes, any number is ok.
*/
private static final long serialVersionUID = 11087456L;
/**
* Weights.
*/
private double[] weights;
/**
******************
* The first constructor.
*
* @param paraFileReader
* The given reader to read data from file.
******************
*/
public WeightedInstances(FileReader paraFileReader) throws Exception {
super(paraFileReader);
setClassIndex(numAttributes() - 1);
// Initialize weights
weights = new double[numInstances()];
double tempAverage = 1.0 / numInstances();
for (int i = 0; i < weights.length; i++) {
weights[i] = tempAverage;
} // Of for i
System.out.println("Instances weights are: " + Arrays.toString(weights));
} // Of the first constructor
/**
******************
* The second constructor.
*
* @param paraInstances
* The given instances.
******************
*/
public WeightedInstances(Instances paraInstances) {
super(paraInstances);
setClassIndex(numAttributes() - 1);
// Initialize weights
weights = new double[numInstances()];
double tempAverage = 1.0 / numInstances();
for (int i = 0; i < weights.length; i++) {
weights[i] = tempAverage;
} // Of for i
System.out.println("Instances weights are: " + Arrays.toString(weights));
} // Of the second constructor
/**
******************
* Getter.
*
* @param paraIndex
* The given index.
* @return The weight of the given index.
******************
*/
public double getWeight(int paraIndex) {
return weights[paraIndex];
} // Of getWeight
/**
******************
* Adjust the weights.
*
* @param paraCorrectArray
* Indicate which instances have been correctly classified.
* @param paraAlpha
* The weight of the last classifier.
******************
*/
public void adjustWeights(boolean[] paraCorrectArray, double paraAlpha) {
// Step 1. Calculate alpha.
double tempIncrease = Math.exp(paraAlpha);
// Step 2. Adjust.
double tempWeightsSum = 0; // For normalization.
for (int i = 0; i < weights.length; i++) {
if (paraCorrectArray[i]) {
weights[i] /= tempIncrease;
} else {
weights[i] *= tempIncrease;
} // Of if
tempWeightsSum += weights[i];
} // Of for i
// Step 3. Normalize.
for (int i = 0; i < weights.length; i++) {
weights[i] /= tempWeightsSum;
} // Of for i
System.out.println("After adjusting, instances weights are: " + Arrays.toString(weights));
} // Of adjustWeights
/**
******************
* Test the method.
******************
*/
public void adjustWeightsTest() {
boolean[] tempCorrectArray = new boolean[numInstances()];
for (int i = 0; i < tempCorrectArray.length / 2; i++) {
tempCorrectArray[i] = true;
} // Of for i
double tempWeightedError = 0.3;
adjustWeights(tempCorrectArray, tempWeightedError);
System.out.println("After adjusting");
System.out.println(toString());
} // Of adjustWeightsTest
/**
******************
* For display.
******************
*/
public String toString() {
String resultString = "I am a weighted Instances object.\r\n" + "I have " + numInstances() + " instances and "
+ (numAttributes() - 1) + " conditional attributes.\r\n" + "My weights are: " + Arrays.toString(weights)
+ "\r\n" + "My data are: \r\n" + super.toString();
return resultString;
} // Of toString
/**
******************
* For unit test.
*
* @param args
* Not provided.
******************
*/
public static void main(String args[]) {
WeightedInstances tempWeightedInstances = null;
String tempFilename = "d:/data/iris.arff";
try {
FileReader tempFileReader = new FileReader(tempFilename);
tempWeightedInstances = new WeightedInstances(tempFileReader);
tempFileReader.close();
} catch (Exception exception1) {
System.out.println("Cannot read the file: " + tempFilename + "\r\n" + exception1);
System.exit(0);
} // Of try
System.out.println(tempWeightedInstances.toString());
tempWeightedInstances.adjustWeightsTest();
} // Of main
} // Of class WeightedInstances
Day 64: Ensemble Learning: AdaBoosting (2. The Stump Classifier)
A superclass is written to support different base classifiers. To keep the amount of code down, only the stump classifier is implemented; a sketch of it appears after the superclass code below.
A stump classifier merely splits the data into two piles at a time, which is extremely simple compared with a decision tree. Of course, it handles real-valued data here, whereas ID3 handles symbolic data.
How to construct the first weak classifier (stump):
1. First assign every sample an initial weight of 1 / (number of samples).
2. Decide which feature to use via the Gini index: compute the impurity of each side as $1 - (\text{fraction correct})^2 - (\text{fraction wrong})^2$, then take the weighted average of the two sides.
3. The feature whose Gini index is smallest becomes the first stump.
4. Having fixed which feature the stump uses, determine how much say this stump (weak classifier) has, according to the formula
$$\text{Amount of Say} = \frac{1}{2}\ln\frac{1 - \text{Total Error}}{\text{Total Error}},$$
where Total Error is the sum of the weights of the misclassified samples.
5. The first weak classifier is now built. Next, update the sample weights so that correctly classified samples get smaller weights and misclassified ones get larger weights.
The weights of misclassified samples are changed by
$$w_{\text{new}} = w \cdot e^{\text{Amount of Say}},$$
and the weights of correctly classified samples by
$$w_{\text{new}} = w \cdot e^{-\text{Amount of Say}}.$$
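As a quick numeric check: with Total Error = 0.3, Amount of Say = ½ ln(0.7 / 0.3) ≈ 0.42; a stump with Total Error = 0.5 (random guessing) gets a say of 0, and one with error above 0.5 gets a negative say.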
package machinelearning.adaboosting;
import java.util.Random;
import weka.core.Instance;
/**
* The super class of any simple classifier.
*
* @author Fan Min minfanphd@163.com.
*/
public abstract class SimpleClassifier {
/**
* The index of the current attribute.
*/
int selectedAttribute;
/**
* Weighted data.
*/
WeightedInstances weightedInstances;
/**
* The accuracy on the training set.
*/
double trainingAccuracy;
/**
* The number of classes. For binary classification it is 2.
*/
int numClasses;
/**
* The number of instances.
*/
int numInstances;
/**
* The number of conditional attributes.
*/
int numConditions;
/**
* For random number generation.
*/
Random random = new Random();
/**
******************
* The first constructor.
*
* @param paraWeightedInstances
* The given instances.
******************
*/
public SimpleClassifier(WeightedInstances paraWeightedInstances) {
weightedInstances = paraWeightedInstances;
numConditions = weightedInstances.numAttributes() - 1;
numInstances = weightedInstances.numInstances();
numClasses = weightedInstances.classAttribute().numValues();
}// Of the first constructor
/**
******************
* Train the classifier.
******************
*/
public abstract void train();
/**
******************
* Classify an instance.
*
* @param paraInstance
* The given instance.
* @return Predicted label.
******************
*/
public abstract int classify(Instance paraInstance);
/**
******************
* Which instances in the training set are correctly classified.
*
* @return The correctness array.
******************
*/
public boolean[] computeCorrectnessArray() {
boolean[] resultCorrectnessArray = new boolean[weightedInstances.numInstances()];
for (int i = 0; i < resultCorrectnessArray.length; i++) {
Instance tempInstance = weightedInstances.instance(i);
if ((int) (tempInstance.classValue()) == classify(tempInstance)) {
resultCorrectnessArray[i] = true;
} // Of if
// System.out.print("\t" + classify(tempInstance));
} // Of for i
// System.out.println();
return resultCorrectnessArray;
}// Of computeCorrectnessArray
/**
******************
* Compute the accuracy on the training set.
*
* @return The training accuracy.
******************
*/
public double computeTrainingAccuracy() {
double tempCorrect = 0;
boolean[] tempCorrectnessArray = computeCorrectnessArray();
for (int i = 0; i < tempCorrectnessArray.length; i++) {
if (tempCorrectnessArray[i]) {
tempCorrect++;
} // Of if
} // Of for i
double resultAccuracy = tempCorrect / tempCorrectnessArray.length;
return resultAccuracy;
}// Of computeTrainingAccuracy
/**
******************
* Compute the weighted error on the training set. It is at least 1e-6 to
* avoid NaN.
*
* @return The weighted error.
******************
*/
public double computeWeightedError() {
double resultError = 0;
boolean[] tempCorrectnessArray = computeCorrectnessArray();
for (int i = 0; i < tempCorrectnessArray.length; i++) {
if (!tempCorrectnessArray[i]) {
resultError += weightedInstances.getWeight(i);
} // Of if
} // Of for i
if (resultError < 1e-6) {
resultError = 1e-6;
} // Of if
return resultError;
}// Of computeWeightedError
} // Of class SimpleClassifier
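The StumpClassifier that the Day 65 Booster instantiates is not listed in this section. The following is only a minimal sketch of what such a subclass could look like, assuming it picks a random attribute and scans the midpoints between sorted attribute values for the cut with the smallest weighted error; the actual implementation in the mfadaboosting repository may differ.
package machinelearning.adaboosting;
import weka.core.Instance;
/**
 * An illustrative stump classifier: one attribute, one cut point, two leaf
 * labels. Only a sketch; see the mfadaboosting repository for the real one.
 */
public class StumpClassifier extends SimpleClassifier {
	/**
	 * The cut point on the selected attribute.
	 */
	double bestCut;
	/**
	 * The label for instances with attribute value <= bestCut.
	 */
	int leftLeafLabel;
	/**
	 * The label for instances with attribute value > bestCut.
	 */
	int rightLeafLabel;
	/**
	 ******************
	 * The only constructor.
	 *
	 * @param paraWeightedInstances
	 *            The given instances.
	 ******************
	 */
	public StumpClassifier(WeightedInstances paraWeightedInstances) {
		super(paraWeightedInstances);
	}// Of the constructor

	public void train() {
		// Step 1. Randomly choose an attribute.
		selectedAttribute = random.nextInt(numConditions);
		// Step 2. Collect and sort the values of this attribute.
		double[] tempValues = new double[numInstances];
		for (int i = 0; i < numInstances; i++) {
			tempValues[i] = weightedInstances.instance(i).value(selectedAttribute);
		} // Of for i
		java.util.Arrays.sort(tempValues);
		// Step 3. Try every midpoint as the cut; keep the one with the
		// smallest weighted error. Each leaf label is decided by weighted voting.
		double tempMinimalError = Double.MAX_VALUE;
		for (int i = 0; i < numInstances - 1; i++) {
			double tempCut = (tempValues[i] + tempValues[i + 1]) / 2;
			// Weighted class counts on each side of the cut.
			double[] tempLeftCounts = new double[numClasses];
			double[] tempRightCounts = new double[numClasses];
			for (int j = 0; j < numInstances; j++) {
				Instance tempInstance = weightedInstances.instance(j);
				int tempLabel = (int) tempInstance.classValue();
				if (tempInstance.value(selectedAttribute) <= tempCut) {
					tempLeftCounts[tempLabel] += weightedInstances.getWeight(j);
				} else {
					tempRightCounts[tempLabel] += weightedInstances.getWeight(j);
				} // Of if
			} // Of for j
			int tempLeftLabel = argMax(tempLeftCounts);
			int tempRightLabel = argMax(tempRightCounts);
			// The weighted error is the weight not captured by the two
			// majority labels.
			double tempError = 0;
			for (int j = 0; j < numClasses; j++) {
				if (j != tempLeftLabel) {
					tempError += tempLeftCounts[j];
				} // Of if
				if (j != tempRightLabel) {
					tempError += tempRightCounts[j];
				} // Of if
			} // Of for j
			if (tempError < tempMinimalError) {
				tempMinimalError = tempError;
				bestCut = tempCut;
				leftLeafLabel = tempLeftLabel;
				rightLeafLabel = tempRightLabel;
			} // Of if
		} // Of for i
	}// Of train

	/**
	 * The index of the maximal element of the given array.
	 */
	private static int argMax(double[] paraArray) {
		int resultIndex = 0;
		for (int i = 1; i < paraArray.length; i++) {
			if (paraArray[i] > paraArray[resultIndex]) {
				resultIndex = i;
			} // Of if
		} // Of for i
		return resultIndex;
	}// Of argMax

	public int classify(Instance paraInstance) {
		if (paraInstance.value(selectedAttribute) <= bestCut) {
			return leftLeafLabel;
		} // Of if
		return rightLeafLabel;
	}// Of classify
}// Of class StumpClassifier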
Day 65: Ensemble Learning: AdaBoosting (3. The Booster)
1. The core code is in train().
2. To simplify the code, the training set is used directly for testing.
The trained weak classifiers are combined into a strong classifier. After each weak classifier is trained, the weak classifiers with small classification error are given larger weights so that they play a larger role in the final classification function, while those with large error are given smaller weights and thus play a smaller role.
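The classifier weight set in train() below is the standard AdaBoost coefficient: for a base classifier with weighted error $\epsilon$,
$$\alpha = \frac{1}{2}\ln\frac{1-\epsilon}{\epsilon} = \frac{1}{2}\ln\Big(\frac{1}{\epsilon} - 1\Big),$$
which is exactly classifierWeights[i] = 0.5 * Math.log(1 / tempError - 1) in the code.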
package machinelearning.adaboosting;
import java.io.FileReader;
import weka.core.Instance;
import weka.core.Instances;
/**
* The booster which ensembles base classifiers.
*
* @author Fan Min minfanphd@163.com.
*/
public class Booster {
/**
* Classifiers.
*/
SimpleClassifier[] classifiers;
/**
* Number of classifiers.
*/
int numClassifiers;
/**
* Whether or not stop after the training error is 0.
*/
boolean stopAfterConverge = false;
/**
* The weights of classifiers.
*/
double[] classifierWeights;
/**
* The training data.
*/
Instances trainingData;
/**
* The testing data.
*/
Instances testingData;
/**
******************
* The first constructor. The testing set is the same as the training set.
*
* @param paraTrainingFilename
* The data filename.
******************
*/
public Booster(String paraTrainingFilename) {
// Step 1. Read training set.
try {
FileReader tempFileReader = new FileReader(paraTrainingFilename);
trainingData = new Instances(tempFileReader);
tempFileReader.close();
} catch (Exception ee) {
System.out.println("Cannot read the file: " + paraTrainingFilename + "\r\n" + ee);
System.exit(0);
} // Of try
// Step 2. Set the last attribute as the class index.
trainingData.setClassIndex(trainingData.numAttributes() - 1);
// Step 3. The testing data is the same as the training data.
testingData = trainingData;
stopAfterConverge = true;
System.out.println("****************Data**********\r\n" + trainingData);
}// Of the first constructor
/**
******************
* Set the number of base classifier, and allocate space for them.
*
* @param paraNumBaseClassifiers
* The number of base classifier.
******************
*/
public void setNumBaseClassifiers(int paraNumBaseClassifiers) {
numClassifiers = paraNumBaseClassifiers;
// Step 1. Allocate space (only reference) for classifiers
classifiers = new SimpleClassifier[numClassifiers];
// Step 2. Initialize classifier weights.
classifierWeights = new double[numClassifiers];
}// Of setNumBaseClassifiers
/**
******************
* Train the booster.
*
* @see machinelearning.adaboosting.StumpClassifier#train()
******************
*/
public void train() {
// Step 1. Initialize.
WeightedInstances tempWeightedInstances = null;
double tempError;
numClassifiers = 0;
// Step 2. Build other classifiers.
for (int i = 0; i < classifiers.length; i++) {
// Step 2.1 Key code: Construct or adjust the weightedInstances
if (i == 0) {
tempWeightedInstances = new WeightedInstances(trainingData);
} else {
// Adjust the weights of the data.
tempWeightedInstances.adjustWeights(classifiers[i - 1].computeCorrectnessArray(),
classifierWeights[i - 1]);
} // Of if
// Step 2.2 Train the next classifier.
classifiers[i] = new StumpClassifier(tempWeightedInstances);
classifiers[i].train();
tempError = classifiers[i].computeWeightedError();
// Key code: Set the classifier weight.
classifierWeights[i] = 0.5 * Math.log(1 / tempError - 1);
if (classifierWeights[i] < 1e-6) {
classifierWeights[i] = 0;
} // Of if
System.out.println("Classifier #" + i + " , weighted error = " + tempError + ", weight = "
+ classifierWeights[i] + "\r\n");
numClassifiers++;
// The accuracy is enough.
if (stopAfterConverge) {
double tempTrainingAccuracy = computeTrainingAccuray();
System.out.println("The accuracy of the booster is: " + tempTrainingAccuracy + "\r\n");
if (tempTrainingAccuracy > 0.999999) {
System.out.println("Stop at the round: " + i + " due to converge.\r\n");
break;
} // Of if
} // Of if
} // Of for i
}// Of train
/**
******************
* Classify an instance.
*
* @param paraInstance
* The given instance.
* @return The predicted label.
******************
*/
public int classify(Instance paraInstance) {
double[] tempLabelsCountArray = new double[trainingData.classAttribute().numValues()];
for (int i = 0; i < numClassifiers; i++) {
int tempLabel = classifiers[i].classify(paraInstance);
tempLabelsCountArray[tempLabel] += classifierWeights[i];
} // Of for i
int resultLabel = -1;
double tempMax = -1;
for (int i = 0; i < tempLabelsCountArray.length; i++) {
if (tempMax < tempLabelsCountArray[i]) {
tempMax = tempLabelsCountArray[i];
resultLabel = i;
} // Of if
} // Of for
return resultLabel;
}// Of classify
/**
******************
* Test the booster on the training data.
*
* @return The classification accuracy.
******************
*/
public double test() {
System.out.println("Testing on " + testingData.numInstances() + " instances.\r\n");
return test(testingData);
}// Of test
/**
******************
* Test the booster.
*
* @param paraInstances
* The testing set.
* @return The classification accuracy.
******************
*/
public double test(Instances paraInstances) {
double tempCorrect = 0;
paraInstances.setClassIndex(paraInstances.numAttributes() - 1);
for (int i = 0; i < paraInstances.numInstances(); i++) {
Instance tempInstance = paraInstances.instance(i);
if (classify(tempInstance) == (int) tempInstance.classValue()) {
tempCorrect++;
} // Of if
} // Of for i
double resultAccuracy = tempCorrect / paraInstances.numInstances();
System.out.println("The accuracy is: " + resultAccuracy);
return resultAccuracy;
} // Of test
/**
******************
* Compute the training accuracy of the booster. It is not weighted.
*
* @return The training accuracy.
******************
*/
public double computeTrainingAccuray() {
double tempCorrect = 0;
for (int i = 0; i < trainingData.numInstances(); i++) {
if (classify(trainingData.instance(i)) == (int) trainingData.instance(i).classValue()) {
tempCorrect++;
} // Of if
} // Of for i
double tempAccuracy = tempCorrect / trainingData.numInstances();
return tempAccuracy;
}// Of computeTrainingAccuray
/**
******************
* For integration test.
*
* @param args
* Not provided.
******************
*/
public static void main(String args[]) {
System.out.println("Starting AdaBoosting...");
Booster tempBooster = new Booster("D:/data/iris.arff");
// Booster tempBooster = new Booster("src/data/smalliris.arff");
tempBooster.setNumBaseClassifiers(100);
tempBooster.train();
System.out.println("The training accuracy is: " + tempBooster.computeTrainingAccuray());
tempBooster.test();
}// Of main
}// Of class Booster
Day 66: Active Learning: ALEC
The idea is to find the cluster centers first: a cluster center has a density higher than its neighbours and lies comparatively far from any instance of higher density. A cluster is then built around each center instance, cluster indices are assigned to the non-center instances recursively, and a block information table is finally produced. The algorithm requires the user to supply a radius and a threshold, which can reduce the clustering accuracy, and it must locate the root nodes accurately: once a root is wrong, the resulting misclassification increases the cost.
The basic idea is as follows:
Step 1. Sort the objects by decreasing representativeness;
Step 2. Suppose the current block has N objects; query the labels (classes) of its sqrt(N) most representative ones.
Step 3. If these sqrt(N) labels all belong to the same class, regard the block as pure and classify all its other objects into that class. Done.
Step 4. Otherwise split the current block into two sub-blocks and go to Step 3 for each.
ALEC is the source code of our first active learning paper:
Min Wang, Fan Min, Yan-Xue Wu, Zhi-Heng Zhang. Active learning through density clustering. Expert Systems with Applications 85 (2017) 305–317.
Here is the link on our website, paper number 31.
Some implementation details have been simplified for ease of understanding. For a more standard style and stronger adaptability, see our other paper:
Fan Min, Shi-Ming Zhang, Davide Ciucci, Min Wang. Three-way active learning through clustering selection. International Journal of Machine Learning and Cybernetics (2020) 1033–1046.
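In the code below, the density (rho in the paper) uses the Gaussian kernel
$$\rho_i = \sum_{j} \exp\Big(-\frac{d_{ij}^2}{dc^2}\Big),$$
where $dc$ is the radius; the distance to master is $\delta_i = \min_{j:\, \rho_j > \rho_i} d_{ij}$ (set to the maximal pairwise distance for the highest-density instance); and the representativeness (priority) of instance $i$ is $\rho_i \delta_i$.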
package machinelearning.activelearning;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.*;
import weka.core.Instances;
public class Alec {
/**
* The whole dataset.
*/
Instances dataset;
/**
* A random number generator.
*/
public static final Random random = new Random();
/**
* The maximal number of queries that can be provided.
*/
int maxNumQuery;
/**
* The actual number of queries.
*/
int numQuery;
/**
* The radius, also dc in the paper. It is employed for density computation.
*/
double radius;
/**
* The densities of instances, also rho in the paper.
*/
double[] densities;
/**
* distanceToMaster
*/
double[] distanceToMaster;
/**
* Sorted indices, where the first element indicates the instance with the
* biggest density.
*/
int[] descendantDensities;
/**
* Priority
*/
double[] priority;
/**
* The maximal distance between any pair of points.
*/
double maximalDistance;
/**
* The maximal distanceToMaster
*/
double maximalDelta;
/**
* Who is my master?
*/
int[] masters;
/**
* Predicted labels.
*/
int[] predictedLabels;
/**
* Instance status. 0 for unprocessed, 1 for queried, 2 for classified.
*/
int[] instanceStatusArray;
/**
* The descendant indices to show the representativeness of instances in a
* descendant order.
*/
int[] descendantRepresentatives;
/**
* Indicate the cluster of each instance. It is only used in
* clusterInTwo(int[]);
*/
int[] clusterIndices;
/**
* Blocks with size no more than this threshold should not be split further.
*/
int smallBlockThreshold = 3;
/**
**********************************
* The constructor.
*
* @param paraFilename
* The data filename.
**********************************
*/
public Alec(String paraFilename) {
try {
FileReader tempReader = new FileReader(paraFilename);
dataset = new Instances(tempReader);
dataset.setClassIndex(dataset.numAttributes() - 1);
tempReader.close();
} catch (Exception ee) {
System.out.println(ee);
System.exit(0);
} // Of try
computeMaximalDistance();
clusterIndices = new int[dataset.numInstances()];
}// Of the constructor
/**
**********************************
* Merge sort in descendant order to obtain an index array. The original
* array is unchanged.<br>
* Examples: input [1.2, 2.3, 0.4, 0.5], output [1, 0, 3, 2].<br>
* input [3.1, 5.2, 6.3, 2.1, 4.4], output [2, 1, 4, 0, 3].<br>
* This method is equivalent to argsort() in the numpy module of the Python programming language.
*
* @param paraArray
* the original array
* @return The sorted indices.
**********************************
*/
public static int[] mergeSortToIndices(double[] paraArray) {
int tempLength = paraArray.length;
int[][] resultMatrix = new int[2][tempLength];// For merge sort.
// Initialize
int tempIndex = 0;
for (int i = 0; i < tempLength; i++) {
resultMatrix[tempIndex][i] = i;
} // Of for i
// Merge
int tempCurrentLength = 1;
// The indices for current merged groups.
int tempFirstStart, tempSecondStart, tempSecondEnd;
while (tempCurrentLength < tempLength) {
// Divide into a number of groups.
// Here the boundary is adaptive to array length not equal to 2^k.
for (int i = 0; i < Math.ceil((tempLength + 0.0) / tempCurrentLength / 2); i++) {
// Boundaries of the group
tempFirstStart = i * tempCurrentLength * 2;
tempSecondStart = tempFirstStart + tempCurrentLength;
tempSecondEnd = tempSecondStart + tempCurrentLength - 1;
if (tempSecondEnd >= tempLength) {
tempSecondEnd = tempLength - 1;
} // Of if
// Merge this group
int tempFirstIndex = tempFirstStart;
int tempSecondIndex = tempSecondStart;
int tempCurrentIndex = tempFirstStart;
if (tempSecondStart >= tempLength) {
for (int j = tempFirstIndex; j < tempLength; j++) {
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j];
tempFirstIndex++;
tempCurrentIndex++;
} // Of for j
break;
} // Of if
while ((tempFirstIndex <= tempSecondStart - 1) && (tempSecondIndex <= tempSecondEnd)) {
if (paraArray[resultMatrix[tempIndex % 2][tempFirstIndex]] >= paraArray[resultMatrix[tempIndex
% 2][tempSecondIndex]]) {
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex
% 2][tempFirstIndex];
tempFirstIndex++;
} else {
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex
% 2][tempSecondIndex];
tempSecondIndex++;
} // Of if
tempCurrentIndex++;
} // Of while
// Remaining part
for (int j = tempFirstIndex; j < tempSecondStart; j++) {
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j];
tempCurrentIndex++;
} // Of for j
for (int j = tempSecondIndex; j <= tempSecondEnd; j++) {
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j];
tempCurrentIndex++;
} // Of for j
} // Of for i
tempCurrentLength *= 2;
tempIndex++;
} // Of while
return resultMatrix[tempIndex % 2];
}// Of mergeSortToIndices
/**
*********************
* The Euclidean distance between two instances. Other distance measures
* unsupported for simplicity.
*
*
* @param paraI
* The index of the first instance.
* @param paraJ
* The index of the second instance.
* @return The distance.
*********************
*/
public double distance(int paraI, int paraJ) {
double resultDistance = 0;
double tempDifference;
for (int i = 0; i < dataset.numAttributes() - 1; i++) {
tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
resultDistance += tempDifference * tempDifference;
} // Of for i
resultDistance = Math.sqrt(resultDistance);
return resultDistance;
}// Of distance
/**
**********************************
* Compute the maximal distance. The result is stored in a member variable.
**********************************
*/
public void computeMaximalDistance() {
maximalDistance = 0;
double tempDistance;
for (int i = 0; i < dataset.numInstances(); i++) {
for (int j = 0; j < dataset.numInstances(); j++) {
tempDistance = distance(i, j);
if (maximalDistance < tempDistance) {
maximalDistance = tempDistance;
} // Of if
} // Of for j
} // Of for i
System.out.println("maximalDistance = " + maximalDistance);
}// Of computeMaximalDistance
/**
******************
* Compute the densities using the Gaussian kernel.
******************
*/
public void computeDensitiesGaussian() {
System.out.println("radius = " + radius);
densities = new double[dataset.numInstances()];
double tempDistance;
for (int i = 0; i < dataset.numInstances(); i++) {
for (int j = 0; j < dataset.numInstances(); j++) {
tempDistance = distance(i, j);
densities[i] += Math.exp(-tempDistance * tempDistance / radius / radius);
} // Of for j
} // Of for i
System.out.println("The densities are " + Arrays.toString(densities) + "\r\n");
}// Of computeDensitiesGaussian
/**
**********************************
* Compute distanceToMaster, the distance to its master.
**********************************
*/
public void computeDistanceToMaster() {
distanceToMaster = new double[dataset.numInstances()];
masters = new int[dataset.numInstances()];
descendantDensities = new int[dataset.numInstances()];
instanceStatusArray = new int[dataset.numInstances()];
descendantDensities = mergeSortToIndices(densities);
distanceToMaster[descendantDensities[0]] = maximalDistance;
double tempDistance;
for (int i = 1; i < dataset.numInstances(); i++) {
// Initialize.
distanceToMaster[descendantDensities[i]] = maximalDistance;
for (int j = 0; j <= i - 1; j++) {
tempDistance = distance(descendantDensities[i], descendantDensities[j]);
if (distanceToMaster[descendantDensities[i]] > tempDistance) {
distanceToMaster[descendantDensities[i]] = tempDistance;
masters[descendantDensities[i]] = descendantDensities[j];
} // Of if
} // Of for j
} // Of for i
System.out.println("First compute, masters = " + Arrays.toString(masters));
System.out.println("descendantDensities = " + Arrays.toString(descendantDensities));
}// Of computeDistanceToMaster
/**
**********************************
* Compute priority. Element with higher priority is more likely to be
* selected as a cluster center. Now it is rho * distanceToMaster. It can
* also be rho^alpha * distanceToMaster.
**********************************
*/
public void computePriority() {
priority = new double[dataset.numInstances()];
for (int i = 0; i < dataset.numInstances(); i++) {
priority[i] = densities[i] * distanceToMaster[i];
} // Of for i
}// Of computePriority
/**
*************************
* The block of a node should be the same as that of its master. This recursive method
* is efficient.
*
* @param paraIndex
* The index of the given node.
* @return The cluster index of the current node.
*************************
*/
public int coincideWithMaster(int paraIndex) {
if (clusterIndices[paraIndex] == -1) {
int tempMaster = masters[paraIndex];
clusterIndices[paraIndex] = coincideWithMaster(tempMaster);
} // Of if
return clusterIndices[paraIndex];
}// Of coincideWithMaster
/**
*************************
* Cluster a block in two. According to the master tree.
*
* @param paraBlock
* The given block.
* @return The new blocks where the two most representative instances serve as
* the root.
*************************
*/
public int[][] clusterInTwo(int[] paraBlock) {
// Reinitialize. In fact, only instances in the given block are
// considered.
Arrays.fill(clusterIndices, -1);
// Initialize the cluster number of the two roots.
for (int i = 0; i < 2; i++) {
clusterIndices[paraBlock[i]] = i;
} // Of for i
for (int i = 0; i < paraBlock.length; i++) {
if (clusterIndices[paraBlock[i]] != -1) {
// Already have a cluster number.
continue;
} // Of if
clusterIndices[paraBlock[i]] = coincideWithMaster(masters[paraBlock[i]]);
} // Of for i
// The sub blocks.
int[][] resultBlocks = new int[2][];
int tempFistBlockCount = 0;
for (int i = 0; i < clusterIndices.length; i++) {
if (clusterIndices[i] == 0) {
tempFistBlockCount++;
} // Of if
} // Of for i
resultBlocks[0] = new int[tempFistBlockCount];
resultBlocks[1] = new int[paraBlock.length - tempFistBlockCount];
// Copy. You can design shorter code when the number of clusters is
// greater than 2.
int tempFirstIndex = 0;
int tempSecondIndex = 0;
for (int i = 0; i < paraBlock.length; i++) {
if (clusterIndices[paraBlock[i]] == 0) {
resultBlocks[0][tempFirstIndex] = paraBlock[i];
tempFirstIndex++;
} else {
resultBlocks[1][tempSecondIndex] = paraBlock[i];
tempSecondIndex++;
} // Of if
} // Of for i
System.out.println("Split (" + paraBlock.length + ") instances " + Arrays.toString(paraBlock) + "\r\nto ("
+ resultBlocks[0].length + ") instances " + Arrays.toString(resultBlocks[0]) + "\r\nand ("
+ resultBlocks[1].length + ") instances " + Arrays.toString(resultBlocks[1]));
return resultBlocks;
}// Of clusterInTwo
/**
**********************************
* Classify instances in the block by simple voting.
*
* @param paraBlock
* The given block.
**********************************
*/
public void vote(int[] paraBlock) {
int[] tempClassCounts = new int[dataset.numClasses()];
for (int i = 0; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 1) {
tempClassCounts[(int) dataset.instance(paraBlock[i]).classValue()]++;
} // Of if
} // Of for i
int tempMaxClass = -1;
int tempMaxCount = -1;
for (int i = 0; i < tempClassCounts.length; i++) {
if (tempMaxCount < tempClassCounts[i]) {
tempMaxClass = i;
tempMaxCount = tempClassCounts[i];
} // Of if
} // Of for i
// Classify unprocessed instances.
for (int i = 0; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 0) {
predictedLabels[paraBlock[i]] = tempMaxClass;
instanceStatusArray[paraBlock[i]] = 2;
} // Of if
} // Of for i
}// Of vote
/**
**********************************
* Cluster based active learning. Prepare for the recursive version.
*
* @param paraRatio
* The ratio of the maximal distance as the dc.
* @param paraMaxNumQuery
* The maximal number of queries for the whole dataset.
* @param paraSmallBlockThreshold
*            The small block threshold.
**********************************
*/
public void clusterBasedActiveLearning(double paraRatio, int paraMaxNumQuery, int paraSmallBlockThreshold) {
radius = maximalDistance * paraRatio;
smallBlockThreshold = paraSmallBlockThreshold;
maxNumQuery = paraMaxNumQuery;
predictedLabels = new int[dataset.numInstances()];
for (int i = 0; i < dataset.numInstances(); i++) {
predictedLabels[i] = -1;
} // Of for i
computeDensitiesGaussian();
computeDistanceToMaster();
computePriority();
descendantRepresentatives = mergeSortToIndices(priority);
System.out.println("descendantRepresentatives = " + Arrays.toString(descendantRepresentatives));
numQuery = 0;
clusterBasedActiveLearning(descendantRepresentatives);
}// Of clusterBasedActiveLearning
/**
**********************************
* Cluster based active learning.
*
* @param paraBlock
* The given block. This block must be sorted according to the
* priority in descendant order.
**********************************
*/
public void clusterBasedActiveLearning(int[] paraBlock) {
System.out.println("clusterBasedActiveLearning for block " + Arrays.toString(paraBlock));
// Step 1. How many labels are queried for this block.
int tempExpectedQueries = (int) Math.sqrt(paraBlock.length);
int tempNumQuery = 0;
for (int i = 0; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 1) {
tempNumQuery++;
} // Of if
} // Of for i
// Step 2. Vote for small blocks.
if ((tempNumQuery >= tempExpectedQueries) && (paraBlock.length <= smallBlockThreshold)) {
System.out.println(
"" + tempNumQuery + " instances are queried, vote for block: \r\n" + Arrays.toString(paraBlock));
vote(paraBlock);
return;
} // Of if
// Step 3. Query enough labels.
for (int i = 0; i < tempExpectedQueries; i++) {
if (numQuery >= maxNumQuery) {
System.out.println("No more quries are provided, numQuery = " + numQuery + ".");
vote(paraBlock);
return;
} // Of if
if (instanceStatusArray[paraBlock[i]] == 0) {
instanceStatusArray[paraBlock[i]] = 1;
predictedLabels[paraBlock[i]] = (int) dataset.instance(paraBlock[i]).classValue();
// System.out.println("Query #" + paraBlock[i] + ", numQuery = "
// + numQuery);
numQuery++;
} // Of if
} // Of for i
// Step 4. Pure?
int tempFirstLabel = predictedLabels[paraBlock[0]];
boolean tempPure = true;
for (int i = 1; i < tempExpectedQueries; i++) {
if (predictedLabels[paraBlock[i]] != tempFirstLabel) {
tempPure = false;
break;
} // Of if
} // Of for i
if (tempPure) {
System.out.println("Classify for pure block: " + Arrays.toString(paraBlock));
for (int i = tempExpectedQueries; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 0) {
predictedLabels[paraBlock[i]] = tempFirstLabel;
instanceStatusArray[paraBlock[i]] = 2;
} // Of if
} // Of for i
return;
} // Of if
// Step 5. Split in two and process them independently.
int[][] tempBlocks = clusterInTwo(paraBlock);
for (int i = 0; i < 2; i++) {
// Attention: recursive invoking here.
clusterBasedActiveLearning(tempBlocks[i]);
} // Of for i
}// Of clusterBasedActiveLearning
/**
*******************
* Show the statistics information.
*******************
*/
public String toString() {
int[] tempStatusCounts = new int[3];
double tempCorrect = 0;
for (int i = 0; i < dataset.numInstances(); i++) {
tempStatusCounts[instanceStatusArray[i]]++;
if (predictedLabels[i] == (int) dataset.instance(i).classValue()) {
tempCorrect++;
} // Of if
} // Of for i
String resultString = "(unhandled, queried, classified) = " + Arrays.toString(tempStatusCounts);
resultString += "\r\nCorrect = " + tempCorrect + ", accuracy = " + (tempCorrect / dataset.numInstances());
return resultString;
}// Of toString
/**
**********************************
* The entrance of the program.
*
* @param args:
* Not used now.
**********************************
*/
public static void main(String[] args) {
long tempStart = System.currentTimeMillis();
System.out.println("Starting ALEC.");
String arffFilename = "D:/data/iris.arff";
// String arffFilename = "D:/data/mushroom.arff";
Alec tempAlec = new Alec(arffFilename);
tempAlec.clusterBasedActiveLearning(0.1, 30, 3); // For iris
// tempAlec.clusterBasedActiveLearning(0.1, 800, 3); //For mushroom
System.out.println(tempAlec);
long tempEnd = System.currentTimeMillis();
System.out.println("Runtime: " + (tempEnd - tempStart) + "ms.");
}// Of main
}// Of class Alec
Day 67: Active Learning: ALEC (continued)
Finish copying the program today and begin to understand it.
computeDistanceToMaster is the core of density clustering. A node's master is the nearest node among those with higher density; the farther a node is from its master, the more independent it is.
computePriority combines density (ability) and distance (independence): the larger their product, the more representative the node (object).
coincideWithMaster is used by the clustering algorithm; you need to trace it on an example to see what it does (see the trace after this list). Simply put, a node should carry the same cluster number as its master.
clusterInTwo splits a block into two blocks, whose roots are the block's first and second elements respectively (note that every block is sorted by decreasing representativeness).
vote classifies the remaining objects of a block by voting with the labels queried so far.
clusterBasedActiveLearning(double, int, int) performs initialization for the core algorithm.
clusterBasedActiveLearning(int[]) is the core algorithm; it is recursive, and the various cases must be handled with care.
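A tiny illustrative trace (the masters values here are made up for the example): suppose the block {0, 1, 2, 3, 4} is to be split, with masters[2] = 0, masters[3] = 1 and masters[4] = 2. clusterInTwo first sets clusterIndices[0] = 0 and clusterIndices[1] = 1 for the two roots. For node 2 it calls coincideWithMaster(masters[2]) = coincideWithMaster(0), which returns 0; node 3 follows 3 -> 1 and gets 1; node 4 follows 4 -> 2 -> 0 and gets 0. The resulting sub-blocks are {0, 2, 4} and {1, 3}.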
public void computeDistanceToMaster() {
distanceToMaster = new double[dataset.numInstances()];
masters = new int[dataset.numInstances()];
descendantDensities = new int[dataset.numInstances()];
instanceStatusArray = new int[dataset.numInstances()];
descendantDensities = mergeSortToIndices(densities);
distanceToMaster[descendantDensities[0]] = maximalDistance;
double tempDistance;
for (int i = 1; i < dataset.numInstances(); i++) {
// Initialize.
distanceToMaster[descendantDensities[i]] = maximalDistance;
for (int j = 0; j <= i - 1; j++) {
tempDistance = distance(descendantDensities[i], descendantDensities[j]);
if (distanceToMaster[descendantDensities[i]] > tempDistance) {
distanceToMaster[descendantDensities[i]] = tempDistance;
masters[descendantDensities[i]] = descendantDensities[j];
} // Of if
} // Of for j
} // Of for i
System.out.println("First compute, masters = " + Arrays.toString(masters));
System.out.println("descendantDensities = " + Arrays.toString(descendantDensities));
}// Of computeDistanceToMaster
/**
**********************************
* Compute priority. Element with higher priority is more likely to be
* selected as a cluster center. Now it is rho * distanceToMaster. It can
* also be rho^alpha * distanceToMaster.
**********************************
*/
public void computePriority() {
priority = new double[dataset.numInstances()];
for (int i = 0; i < dataset.numInstances(); i++) {
priority[i] = densities[i] * distanceToMaster[i];
} // Of for i
}// Of computePriority
/**
*************************
* The block of a node should be the same as that of its master. This recursive method
* is efficient.
*
* @param paraIndex
* The index of the given node.
* @return The cluster index of the current node.
*************************
*/
public int coincideWithMaster(int paraIndex) {
if (clusterIndices[paraIndex] == -1) {
int tempMaster = masters[paraIndex];
clusterIndices[paraIndex] = coincideWithMaster(tempMaster);
} // Of if
return clusterIndices[paraIndex];
}// Of coincideWithMaster
/**
*************************
* Cluster a block in two. According to the master tree.
*
* @param paraBlock
* The given block.
* @return The new blocks where the two most representative instances serve as
* the root.
*************************
*/
public int[][] clusterInTwo(int[] paraBlock) {
// Reinitialize. In fact, only instances in the given block are considered.
Arrays.fill(clusterIndices, -1);
// Initialize the cluster numbers of the two roots.
for (int i = 0; i < 2; i++) {
clusterIndices[paraBlock[i]] = i;
} // Of for i
for (int i = 0; i < paraBlock.length; i++) {
if (clusterIndices[paraBlock[i]] != -1) {
// Already has a cluster number.
continue;
} // Of if
clusterIndices[paraBlock[i]] = coincideWithMaster(masters[paraBlock[i]]);
} // Of for i
// The sub blocks.
int[][] resultBlocks = new int[2][];
int tempFistBlockCount = 0;
for (int i = 0; i < clusterIndices.length; i++) {
if (clusterIndices[i] == 0) {
tempFistBlockCount++;
} // Of if
} // Of for i
resultBlocks[0] = new int[tempFistBlockCount];
resultBlocks[1] = new int[paraBlock.length - tempFistBlockCount];
int tempFirstIndex = 0;
int tempSecondIndex = 0;
for (int i = 0; i < paraBlock.length; i++) {
if (clusterIndices[paraBlock[i]] == 0) {
resultBlocks[0][tempFirstIndex] = paraBlock[i];
tempFirstIndex++;
} else {
resultBlocks[1][tempSecondIndex] = paraBlock[i];
tempSecondIndex++;
} // Of if
} // Of for i
System.out.println("Split (" + paraBlock.length + ") instances " + Arrays.toString(paraBlock) + "\r\nto ("
+ resultBlocks[0].length + ") instances " + Arrays.toString(resultBlocks[0]) + "\r\nand ("
+ resultBlocks[1].length + ") instances " + Arrays.toString(resultBlocks[1]));
return resultBlocks;
}// Of clusterInTwo
/**
**********************************
* Classify instances in the block by simple voting.
*
* @param paraBlock
* The given block.
**********************************
*/
public void vote(int[] paraBlock) {
int[] tempClassCounts = new int[dataset.numClasses()];
for (int i = 0; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 1) {
tempClassCounts[(int) dataset.instance(paraBlock[i]).classValue()]++;
} // Of if
} // Of for i
int tempMaxClass = -1;
int tempMaxCount = -1;
for (int i = 0; i < tempClassCounts.length; i++) {
if (tempMaxCount < tempClassCounts[i]) {
tempMaxClass = i;
tempMaxCount = tempClassCounts[i];
} // Of if
} // Of for i
// Classify unprocessed instances.
for (int i = 0; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 0) {
predictedLabels[paraBlock[i]] = tempMaxClass;
instanceStatusArray[paraBlock[i]] = 2;
} // Of if
} // Of for i
}// Of vote
/**
**********************************
* Cluster based active learning. Prepare for the recursive version.
*
* @param paraRatio
* The ratio of the maximal distance as the dc.
* @param paraMaxNumQuery
* The maximal number of queries for the whole dataset.
* @param paraSmallBlockThreshold
*            The small block threshold.
**********************************
*/
public void clusterBasedActiveLearning(double paraRatio, int paraMaxNumQuery, int paraSmallBlockThreshold) {
radius = maximalDistance * paraRatio;
smallBlockThreshold = paraSmallBlockThreshold;
maxNumQuery = paraMaxNumQuery;
predictedLabels = new int[dataset.numInstances()];
for (int i = 0; i < dataset.numInstances(); i++) {
predictedLabels[i] = -1;
} // Of for i
computeDensitiesGaussian();
computeDistanceToMaster();
computePriority();
descendantRepresentatives = mergeSortToIndices(priority);
System.out.println("descendantRepresentatives = " + Arrays.toString(descendantRepresentatives));
numQuery = 0;
clusterBasedActiveLearning(descendantRepresentatives);
}// Of clusterBasedActiveLearning
/**
**********************************
* Cluster based active learning.
*
* @param paraBlock
* The given block. This block must be sorted according to the
* priority in descendant order.
**********************************
*/
public void clusterBasedActiveLearning(int[] paraBlock) {
System.out.println("clusterBasedActiveLearning for block " + Arrays.toString(paraBlock));
// Step 1. Count how many labels of this block have been queried.
int tempExpectedQueries = (int) Math.sqrt(paraBlock.length);
int tempNumQuery = 0;
for (int i = 0; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 1) {
tempNumQuery++;
} // Of if
} // Of for i
// Step 2. Vote for small blocks.
if ((tempNumQuery >= tempExpectedQueries) && (paraBlock.length <= smallBlockThreshold)) {
System.out.println(
"" + tempNumQuery + " instances are queried, vote for block: \r\n" + Arrays.toString(paraBlock));
vote(paraBlock);
return;
} // Of if
// Step 3. Query enough labels.
for (int i = 0; i < tempExpectedQueries; i++) {
if (numQuery >= maxNumQuery) {
System.out.println("No more quries are provided, numQuery = " + numQuery + ".");
vote(paraBlock);
return;
} // Of if
if (instanceStatusArray[paraBlock[i]] == 0) {
instanceStatusArray[paraBlock[i]] = 1;
predictedLabels[paraBlock[i]] = (int) dataset.instance(paraBlock[i]).classValue();
// System.out.println("Query #" + paraBlock[i] + ", numQuery = " + numQuery);
numQuery++;
} // Of if
} // Of for i
// Step 4. Is the block pure?
int tempFirstLabel = predictedLabels[paraBlock[0]];
boolean tempPure = true;
for (int i = 1; i < tempExpectedQueries; i++) {
if (predictedLabels[paraBlock[i]] != tempFirstLabel) {
tempPure = false;
break;
} // Of if
} // Of for i
if (tempPure) {
System.out.println("Classify for pure block: " + Arrays.toString(paraBlock));
for (int i = tempExpectedQueries; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 0) {
predictedLabels[paraBlock[i]] = tempFirstLabel;
instanceStatusArray[paraBlock[i]] = 2;
} // Of if
} // Of for i
return;
} // Of if
// Step 5. Split in two and process the sub-blocks independently.
int[][] tempBlocks = clusterInTwo(paraBlock);
for (int i = 0; i < 2; i++) {
// Attention: recursive invocation here.
clusterBasedActiveLearning(tempBlocks[i]);
} // Of for i
}// Of clusterBasedActiveLearning
/**
*******************
* Show the statistics information.
*******************
*/
public String toString() {
int[] tempStatusCounts = new int[3];
double tempCorrect = 0;
for (int i = 0; i < dataset.numInstances(); i++) {
tempStatusCounts[instanceStatusArray[i]]++;
if (predictedLabels[i] == (int) dataset.instance(i).classValue()) {
tempCorrect++;
} // Of if
} // Of for i
String resultString = "(unhandled, queried, classified) = " + Arrays.toString(tempStatusCounts);
resultString += "\r\nCorrect = " + tempCorrect + ", accuracy = " + (tempCorrect / dataset.numInstances());
return resultString;
}// Of toString
/**
**********************************
* The entrance of the program.
*
* @param args:
* Not used now.
**********************************
*/
public static void main(String[] args) {
long tempStart = System.currentTimeMillis();
System.out.println("Starting ALEC.");
String arffFilename = "D:/data/iris.arff";
// String arffFilename = "D:/data/mushroom.arff";
Alec tempAlec = new Alec(arffFilename);
tempAlec.clusterBasedActiveLearning(0.1, 30, 3); // For iris
// tempAlec.clusterBasedActiveLearning(0.1, 800, 3); //For mushroom
System.out.println(tempAlec);
long tempEnd = System.currentTimeMillis();
System.out.println("Runtime: " + (tempEnd - tempStart) + "ms.");
}// Of main
Day 68: Active Learning: ALEC (continued)
Continue to understand the code; in particular, revisit the roles of the member variables.
Experiment with different datasets.
public static void main(String[] args) {
long tempStart = System.currentTimeMillis();
System.out.println("Starting ALEC.");
//String arffFilename = "D:/data/iris.arff";
String arffFilename = "D:/data/mushroom.arff";
Alec tempAlec = new Alec(arffFilename);
//tempAlec.clusterBasedActiveLearning(0.1, 30, 3); // For iris
tempAlec.clusterBasedActiveLearning(0.1, 800, 3); //For mushroom
System.out.println(tempAlec);
long tempEnd = System.currentTimeMillis();
System.out.println("Runtime: " + (tempEnd - tempStart) + "ms.");
}// Of main
Day 69: Matrix Factorization
Matrix factorization is an important algorithm for recommender systems; it can also be used in many other places.
- A triple is used to store one data record, similar to the scheme in MBR.
- The subspace update code is the core.
- Testing is done on the training set, so the fit looks good (MAE = 0.51).
- As a basic exercise, no regularization term is considered.
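The update in updateNoRegular() below is plain stochastic gradient descent on the squared error. For a rating $r_{ui}$ with prediction $\hat{r}_{ui} = \mathbf{u}_u^{\top} \mathbf{v}_i$ and residual $e_{ui} = r_{ui} - \hat{r}_{ui}$, each pass performs, for every rank dimension $j$,
$$u_{uj} \leftarrow u_{uj} + 2\alpha\, e_{ui} v_{ij}, \qquad v_{ij} \leftarrow v_{ij} + 2\alpha\, e_{ui} u_{uj},$$
where $\alpha$ is the learning rate (alpha in the code).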
package machinelearning.recommendersystem;
import java.io.*;
import java.util.Random;
/**
* Matrix factorization for recommender systems.
*
* @author Fan Min minfanphd@163.com.
*/
public class MatrixFactorization {
/**
* Used to generate random numbers.
*/
Random rand = new Random();
/**
* Number of users.
*/
int numUsers;
/**
* Number of items.
*/
int numItems;
/**
* Number of ratings.
*/
int numRatings;
/**
* Training data.
*/
Triple[] dataset;
/**
* The learning rate, i.e., the parameter controlling the learning speed.
*/
double alpha;
/**
* The regularization coefficient; reserved, since updateNoRegular() does not use it.
*/
double lambda;
/**
* The low rank of the small matrices.
*/
int rank;
/**
* The user matrix U.
*/
double[][] userSubspace;
/**
* The item matrix V.
*/
double[][] itemSubspace;
/**
* The lower bound of the rating value.
*/
double ratingLowerBound;
/**
* The upper bound of the rating value.
*/
double ratingUpperBound;
/**
************************
* The first constructor.
*
* @param paraFilename
* The data filename.
* @param paraNumUsers
* The number of users.
* @param paraNumItems
* The number of items.
* @param paraNumRatings
*            The number of ratings.
* @param paraRatingLowerBound
*            The lower bound of the rating value.
* @param paraRatingUpperBound
*            The upper bound of the rating value.
************************
*/
public MatrixFactorization(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings,
double paraRatingLowerBound, double paraRatingUpperBound) {
numUsers = paraNumUsers;
numItems = paraNumItems;
numRatings = paraNumRatings;
ratingLowerBound = paraRatingLowerBound;
ratingUpperBound = paraRatingUpperBound;
try {
readData(paraFilename, paraNumUsers, paraNumItems, paraNumRatings);
// adjustUsingMeanRating();
} catch (Exception ee) {
System.out.println("File " + paraFilename + " cannot be read! " + ee);
System.exit(0);
} // Of try
initialize();
}// Of the first constructor
/**
************************
* Set parameters.
*
* @param paraRank
* The given rank.
* @throws IOException
************************
*/
public void setParameters(int paraRank, double paraAlpha, double paraLambda) {
rank = paraRank;
alpha = paraAlpha;
lambda = paraLambda;
}// Of setParameters
/**
************************
* Read the data from the file.
*
* @param paraFilename
* The given file.
* @throws IOException
************************
*/
public void readData(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings)
throws IOException {
File tempFile = new File(paraFilename);
if (!tempFile.exists()) {
System.out.println("File " + paraFilename + " does not exists.");
System.exit(0);
} // Of if
BufferedReader tempBufferReader = new BufferedReader(new FileReader(tempFile));
// Allocate space.
dataset = new Triple[paraNumRatings];
String tempString;
String[] tempStringArray;
for (int i = 0; i < paraNumRatings; i++) {
tempString = tempBufferReader.readLine();
tempStringArray = tempString.split(",");
dataset[i] = new Triple(Integer.parseInt(tempStringArray[0]), Integer.parseInt(tempStringArray[1]),
Double.parseDouble(tempStringArray[2]));
} // Of for i
tempBufferReader.close();
}// Of readData
/**
************************
* Initialize some variables.
************************
*/
void initialize() {
rank = 5;
alpha = 0.0001;
lambda = 0.005;
}// Of initialize
/**
************************
* Initialize subspaces. Each value is in [0, 1].
************************
*/
void initializeSubspaces() {
userSubspace = new double[numUsers][rank];
for (int i = 0; i < numUsers; i++) {
for (int j = 0; j < rank; j++) {
userSubspace[i][j] = rand.nextDouble();
} // Of for j
} // Of for i
itemSubspace = new double[numItems][rank];
for (int i = 0; i < numItems; i++) {
for (int j = 0; j < rank; j++) {
itemSubspace[i][j] = rand.nextDouble();
} // Of for j
} // Of for i
}// Of initializeSubspaces
/**
************************
* Predict the rating of the user to the item
*
* @param paraUser
*            The user index.
* @param paraItem
*            The item index.
* @return The predicted rating.
************************
*/
public double predict(int paraUser, int paraItem) {
double resultValue = 0;
for (int i = 0; i < rank; i++) {
// The row vector of an user and the column vector of an item
resultValue += userSubspace[paraUser][i] * itemSubspace[paraItem][i];
} // Of for i
return resultValue;
}// Of predict
/**
************************
* Train.
*
* @param paraRounds
* The number of rounds.
************************
*/
public void train(int paraRounds) {
initializeSubspaces();
for (int i = 0; i < paraRounds; i++) {
updateNoRegular();
if (i % 50 == 0) {
// Show the process
System.out.println("Round " + i);
System.out.println("MAE: " + mae());
} // Of if
} // Of for i
}// Of train
/**
************************
* Update sub-spaces using the training data.
************************
*/
public void updateNoRegular() {
for (int i = 0; i < numRatings; i++) {
int tempUserId = dataset[i].user;
int tempItemId = dataset[i].item;
double tempRate = dataset[i].rating;
double tempResidual = tempRate - predict(tempUserId, tempItemId); // Residual
// Update user subspace
double tempValue = 0;
for (int j = 0; j < rank; j++) {
tempValue = 2 * tempResidual * itemSubspace[tempItemId][j];
userSubspace[tempUserId][j] += alpha * tempValue;
} // Of for j
// Update item subspace
for (int j = 0; j < rank; j++) {
tempValue = 2 * tempResidual * userSubspace[tempUserId][j];
itemSubspace[tempItemId][j] += alpha * tempValue;
} // Of for j
} // Of for i
}// Of updateNoRegular
/**
************************
* Compute the RMSE.
*
* @return RMSE of the current factorization.
************************
*/
public double rsme() {
double resultRsme = 0;
int tempTestCount = 0;
for (int i = 0; i < numRatings; i++) {
int tempUserIndex = dataset[i].user;
int tempItemIndex = dataset[i].item;
double tempRate = dataset[i].rating;
double tempPrediction = predict(tempUserIndex, tempItemIndex);// +
// DataInfo.mean_rating;
if (tempPrediction < ratingLowerBound) {
tempPrediction = ratingLowerBound;
} else if (tempPrediction > ratingUpperBound) {
tempPrediction = ratingUpperBound;
} // Of if
double tempError = tempRate - tempPrediction;
resultRsme += tempError * tempError;
tempTestCount++;
} // Of for i
return Math.sqrt(resultRsme / tempTestCount);
}// Of rsme
/**
************************
* Compute the MAE.
*
* @return MAE of the current factorization.
************************
*/
public double mae() {
double resultMae = 0;
int tempTestCount = 0;
for (int i = 0; i < numRatings; i++) {
int tempUserIndex = dataset[i].user;
int tempItemIndex = dataset[i].item;
double tempRate = dataset[i].rating;
double tempPrediction = predict(tempUserIndex, tempItemIndex);
if (tempPrediction < ratingLowerBound) {
tempPrediction = ratingLowerBound;
} // Of if
if (tempPrediction > ratingUpperBound) {
tempPrediction = ratingUpperBound;
} // Of if
double tempError = tempRate - tempPrediction;
resultMae += Math.abs(tempError);
// System.out.println("resultMae: " + resultMae);
tempTestCount++;
} // Of for i
return (resultMae / tempTestCount);
}// Of mae
/**
************************
* Train on the given data, then report the MAE and RMSE.
************************
*/
public static void testTrainingTesting(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings,
double paraRatingLowerBound, double paraRatingUpperBound, int paraRounds) {
try {
// Step 1. read the training and testing data
MatrixFactorization tempMF = new MatrixFactorization(paraFilename, paraNumUsers, paraNumItems,
paraNumRatings, paraRatingLowerBound, paraRatingUpperBound);
tempMF.setParameters(5, 0.0001, 0.005);
// Step 2. Initialize the feature matrices U and V
tempMF.initializeSubspaces();
// Step 3. update and predict
System.out.println("Begin Training ! ! !");
tempMF.train(paraRounds);
double tempMAE = tempMF.mae();
double tempRSME = tempMF.rsme();
System.out.println("Finally, MAE = " + tempMAE + ", RSME = " + tempRSME);
} catch (Exception e) {
e.printStackTrace();
} // Of try
}// Of testTrainingTesting
/**
************************
* @param args
************************
*/
public static void main(String args[]) {
testTrainingTesting("D:/data/movielens-943u1682m.txt", 943, 1682, 10000, 1, 5, 2000);
}// Of main
public class Triple {
public int user;
public int item;
public double rating;
/**
*********************
* The constructor.
*********************
*/
public Triple() {
user = -1;
item = -1;
rating = -1;
}// Of the first constructor
/**
*********************
* The constructor.
*********************
*/
public Triple(int paraUser, int paraItem, double paraRating) {
user = paraUser;
item = paraItem;
rating = paraRating;
}// Of the second constructor
/**
*********************
* Show me.
*********************
*/
public String toString() {
return "" + user + ", " + item + ", " + rating;
}// Of toString
}// Of class Triple
}// Of class MatrixFactorization
Day 70: Matrix Factorization (continued)
LU decomposition of a matrix:
$$A = L U$$
where:
L is a lower triangular matrix, the product of a sequence of elementary matrices, with all 1's on its main diagonal;
U is an upper triangular matrix, the product of the inverses of those elementary matrices, with no requirement on its main diagonal.
Feasibility: applying Gauss-Jordan elimination to the matrix gives $A = L D U$, similar to the situation above, except that here U is normalized so that its main diagonal is all 1's, and D is a diagonal matrix (non-zero entries only on the main diagonal).
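A tiny worked instance of the LU decomposition:
$$\begin{pmatrix} 4 & 3 \\ 6 & 3 \end{pmatrix} = \begin{pmatrix} 1 & 0 \\ 1.5 & 1 \end{pmatrix} \begin{pmatrix} 4 & 3 \\ 0 & -1.5 \end{pmatrix},$$
where the multiplier 1.5 = 6/4 is exactly the factor used by elimination to clear the entry below the first pivot.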
QR decomposition of a matrix:
$$A = Q R$$
where:
Q is an orthonormal matrix;
R is an upper triangular matrix. The QR decomposition is very convenient for solving the linear problem Ax = b.
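To see why: since Q is orthonormal ($Q^{\top} Q = I$), $Ax = b$ becomes $Q R x = b$, hence $R x = Q^{\top} b$, and because R is upper triangular, x is then obtained by simple back substitution.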