/**
 ********************
 * Compute, for every instance, its master and the distance to that master.
 * <p>
 * Instances are visited in the order returned by
 * mergeSortToIndices(densities) (presumably descending density; confirm
 * against that helper). The master of an instance is the nearest instance
 * that precedes it in this order. The recorded distance is capped at
 * maximalDistance. The first instance in the order has no master: its
 * distance is maximalDistance and its masters entry keeps the array
 * default 0.
 * <p>
 * Side effects: reallocates distanceToMaster, masters and
 * instanceStatusArray (all entries zero), replaces descendantDensities,
 * and prints two debugging lines to standard output.
 ********************
 */
public void computeDistanceToMaster() {
    final int tempNumInstances = dataset.numInstances();
    distanceToMaster = new double[tempNumInstances];
    masters = new int[tempNumInstances];
    instanceStatusArray = new int[tempNumInstances];
    descendantDensities = mergeSortToIndices(densities);

    // The first instance in the order is the root of the master tree.
    if (tempNumInstances > 0) {
        distanceToMaster[descendantDensities[0]] = maximalDistance;
    }//of if

    for (int i = 1; i < tempNumInstances; i++) {
        int tempCurrent = descendantDensities[i];

        // Track the nearest predecessor independently of maximalDistance,
        // so that a master is always recorded, even when every candidate
        // is exactly maximalDistance away.
        double tempNearestDistance = Double.POSITIVE_INFINITY;
        int tempNearestMaster = descendantDensities[0];
        for (int j = 0; j < i; j++) {
            double tempDistance = distance(tempCurrent, descendantDensities[j]);
            if (tempDistance < tempNearestDistance) {
                tempNearestDistance = tempDistance;
                tempNearestMaster = descendantDensities[j];
            }//of if
        }//of for j

        masters[tempCurrent] = tempNearestMaster;
        // Keep the stored distance bounded by maximalDistance.
        distanceToMaster[tempCurrent] = Math.min(tempNearestDistance, maximalDistance);
    }//of for i

    System.out.println("First compute, masters = " + Arrays.toString(masters));
    System.out.println("descendantDensities = " + Arrays.toString(descendantDensities));
}//of computeDistanceToMaster
/**
 ********************
 * Compute the priority of every instance as density times distance to
 * master. According to the original note, instances with a higher priority
 * are more likely to be selected as cluster centers, and
 * rho^alpha * distanceToMaster would be an alternative formula.
 * <p>
 * Side effect: replaces the priority array.
 ********************
 */
public void computePriority() {
    final int tempCount = dataset.numInstances();
    priority = new double[tempCount];
    for (int tempIndex = 0; tempIndex < tempCount; tempIndex++) {
        double tempDensity = densities[tempIndex];
        double tempDistance = distanceToMaster[tempIndex];
        priority[tempIndex] = tempDensity * tempDistance;
    }
}//of computePriority
/**
 ********************
 * Give a node the same cluster index as its master. If the node has no
 * cluster index yet (-1), the index is obtained recursively from its master
 * and stored, so every node on the path is labelled once.
 *
 * @param paraIndex
 *            The index of the given node.
 * @return The cluster index of the given node.
 ********************
 */
public int coincideWithMaster(int paraIndex) {
    // Already labelled: nothing to resolve.
    if (clusterIndices[paraIndex] != -1) {
        return clusterIndices[paraIndex];
    }

    // Inherit the label from the master and remember it.
    int tempInherited = coincideWithMaster(masters[paraIndex]);
    clusterIndices[paraIndex] = tempInherited;
    return tempInherited;
}//of coincideWithMaster
/**
 *************************
 * Split a block in two according to the master tree.
 * <p>
 * The first two instances of the block serve as the roots of cluster 0 and
 * cluster 1. Every other instance receives the cluster index found by
 * following its chain of masters (see coincideWithMaster). A block with
 * fewer than two instances is tolerated: only the available roots are
 * used, and the second result block may be empty.
 * <p>
 * Side effects: resets the whole clusterIndices array to -1 before
 * labelling, and prints a summary of the split to standard output.
 *
 * @param paraBlock
 *            The given block. Its first two entries become the two roots.
 * @return Two blocks. Block 0 holds the instances labelled 0 and block 1
 *         holds all remaining instances of paraBlock. Both keep the
 *         relative order of paraBlock.
 *************************
 */
public int[][] clusterInTwo(int[] paraBlock) {
    // Reinitialize. Only instances in the given block are considered.
    Arrays.fill(clusterIndices, -1);

    // Initialize the cluster number of the (at most two) roots.
    int tempNumRoots = Math.min(2, paraBlock.length);
    for (int i = 0; i < tempNumRoots; i++) {
        clusterIndices[paraBlock[i]] = i;
    }//of for i

    // Label the rest. Instances that already have a cluster number are
    // returned unchanged by coincideWithMaster.
    for (int tempInstance : paraBlock) {
        coincideWithMaster(tempInstance);
    }//of for tempInstance

    // Count over the block itself so that the array sizes always agree
    // with the copy loop below.
    int tempFirstBlockCount = 0;
    for (int tempInstance : paraBlock) {
        if (clusterIndices[tempInstance] == 0) {
            tempFirstBlockCount++;
        }//of if
    }//of for tempInstance

    int[][] resultBlocks = new int[2][];
    resultBlocks[0] = new int[tempFirstBlockCount];
    resultBlocks[1] = new int[paraBlock.length - tempFirstBlockCount];

    // Copy, preserving the order of the given block.
    int tempFirstIndex = 0;
    int tempSecondIndex = 0;
    for (int tempInstance : paraBlock) {
        if (clusterIndices[tempInstance] == 0) {
            resultBlocks[0][tempFirstIndex++] = tempInstance;
        } else {
            resultBlocks[1][tempSecondIndex++] = tempInstance;
        }//of if
    }//of for tempInstance

    System.out.println("Split (" + paraBlock.length + ") instances " + Arrays.toString(paraBlock) + "\r\n to ("
            + resultBlocks[0].length + " ) instances" + Arrays.toString(resultBlocks[0]) + "\r\n and ("
            + resultBlocks[1].length + ") instances " + Arrays.toString(resultBlocks[1]));
    return resultBlocks;
}//of clusterInTwo
/**
 ********************
 * Classify the unprocessed instances of a block by simple voting.
 * <p>
 * Only instances with status 1 (queried) cast a vote, using their class
 * value from the dataset. The class with the most votes wins, and the
 * lowest class index wins a tie. Every instance of the block with status 0
 * then receives the winning class as its predicted label and gets status 2.
 *
 * @param paraBlock
 *            The given block.
 ********************
 */
public void vote(int[] paraBlock) {
    // Tally the labels of the queried instances.
    int[] tempVotes = new int[dataset.numClasses()];
    for (int tempInstance : paraBlock) {
        if (instanceStatusArray[tempInstance] != 1) {
            continue;
        }
        int tempLabel = (int) dataset.instance(tempInstance).classValue();
        tempVotes[tempLabel]++;
    }

    // Pick the first class with the strictly largest tally.
    int tempWinner = -1;
    int tempWinnerVotes = -1;
    for (int tempClass = 0; tempClass < tempVotes.length; tempClass++) {
        if (tempVotes[tempClass] > tempWinnerVotes) {
            tempWinnerVotes = tempVotes[tempClass];
            tempWinner = tempClass;
        }
    }

    // Assign the winner to every unprocessed instance.
    for (int tempInstance : paraBlock) {
        if (instanceStatusArray[tempInstance] == 0) {
            predictedLabels[tempInstance] = tempWinner;
            instanceStatusArray[tempInstance] = 2;
        }
    }
}//of vote
/**
 ********************
 * Cluster based active learning, entry point for the whole dataset.
 * <p>
 * Stores the settings, resets all predicted labels to -1, computes
 * densities, masters and priorities, sorts the instances by priority, and
 * then starts the recursive learning process on the complete sorted block
 * with a query counter of zero.
 *
 * @param paraRatio
 *            The ratio of the maximal distance used as the dc (radius).
 * @param paraMaxNumQuery
 *            The maximal number of queries for the whole dataset.
 * @param paraSmallBlockThreshold
 *            The small block threshold.
 ********************
 */
public void clusterBasedActiveLearning(double paraRatio, int paraMaxNumQuery, int paraSmallBlockThreshold) {
    // Settings.
    radius = maximalDistance * paraRatio;
    smallBlockThreshold = paraSmallBlockThreshold;
    maxNumQuery = paraMaxNumQuery;

    // No instance has a label yet.
    predictedLabels = new int[dataset.numInstances()];
    Arrays.fill(predictedLabels, -1);

    // Build the master tree and rank the instances.
    computeDensitiesGaussian();
    computeDistanceToMaster();
    computePriority();
    descendantRepresentatives = mergeSortToIndices(priority);
    System.out.println("descendantRepresentatives = " + Arrays.toString(descendantRepresentatives));

    // Learn recursively, starting from the whole dataset.
    numQuery = 0;
    clusterBasedActiveLearning(descendantRepresentatives);
}//of clusterBasedActiveLearning
/**
 ********************
 * Cluster based active learning on one block.
 * <p>
 * The number of labels expected for a block is the integer part of the
 * square root of its size. A small block that already has enough queried
 * instances is classified by voting. Otherwise the leading instances of
 * the block are queried; if the query budget runs out the block is voted
 * on. If all leading instances share one label, the block is considered
 * pure and the remaining unprocessed instances receive that label.
 * Otherwise the block is split in two and both parts are processed
 * recursively.
 *
 * @param paraBlock
 *            The given block. This block must be sorted according to the
 *            priority in descending order.
 ********************
 */
public void clusterBasedActiveLearning(int[] paraBlock) {
    System.out.println("clusterBasedActiveLearning for block " + Arrays.toString(paraBlock));

    // Step 1. Expected number of labels, and how many are already queried.
    final int tempBlockSize = paraBlock.length;
    final int tempQuota = (int) Math.sqrt(tempBlockSize);
    int tempAlreadyQueried = 0;
    for (int tempInstance : paraBlock) {
        if (instanceStatusArray[tempInstance] == 1) {
            tempAlreadyQueried++;
        }
    }

    // Step 2. Small blocks with enough labels are settled by voting.
    boolean tempSmallEnough = tempBlockSize <= smallBlockThreshold;
    if (tempAlreadyQueried >= tempQuota && tempSmallEnough) {
        System.out.println(tempAlreadyQueried +
                " instances are queried, vote for block: \r\n" + Arrays.toString(paraBlock));
        vote(paraBlock);
        return;
    }

    // Step 3. Query the leading instances until the quota is reached.
    for (int tempPosition = 0; tempPosition < tempQuota; tempPosition++) {
        if (numQuery >= maxNumQuery) {
            System.out.println("No more queries are provided, numQuery = " + numQuery +".");
            vote(paraBlock);
            return;
        }

        int tempInstance = paraBlock[tempPosition];
        if (instanceStatusArray[tempInstance] != 0) {
            // Not an unhandled instance, so no query is spent on it.
            continue;
        }
        instanceStatusArray[tempInstance] = 1;
        predictedLabels[tempInstance] = (int) dataset.instance(tempInstance).classValue();
        numQuery++;
    }

    // Step 4. The block is pure if all leading instances agree with the
    // first one.
    final int tempFirstLabel = predictedLabels[paraBlock[0]];
    int tempCursor = 1;
    while (tempCursor < tempQuota && predictedLabels[paraBlock[tempCursor]] == tempFirstLabel) {
        tempCursor++;
    }
    boolean tempPure = tempCursor >= tempQuota;

    if (tempPure) {
        System.out.println("Classify for pure block: " + Arrays.toString(paraBlock));
        for (int tempPosition = tempQuota; tempPosition < tempBlockSize; tempPosition++) {
            int tempInstance = paraBlock[tempPosition];
            if (instanceStatusArray[tempInstance] == 0) {
                predictedLabels[tempInstance] = tempFirstLabel;
                instanceStatusArray[tempInstance] = 2;
            }
        }
        return;
    }

    // Step 5. Split in two and process the parts independently.
    // Attention: recursive invoking here.
    int[][] tempHalves = clusterInTwo(paraBlock);
    clusterBasedActiveLearning(tempHalves[0]);
    clusterBasedActiveLearning(tempHalves[1]);
}//of clusterBasedActiveLearning
/**
 ********************
 * Show the statistics information: how many instances have status 0
 * (unhandled), 1 (queried) and 2 (classified), how many predicted labels
 * equal the class value in the dataset, and the resulting accuracy over
 * all instances.
 *
 * @return The statistics as a two-line string.
 ********************
 */
@Override
public String toString() {
    int[] tempStatusCounts = new int[3];
    // Kept as double so that the accuracy below is a floating point division.
    double tempCorrect = 0;
    for (int i = 0; i < dataset.numInstances(); i++) {
        tempStatusCounts[instanceStatusArray[i]]++;
        if (predictedLabels[i] == (int) dataset.instance(i).classValue()) {
            tempCorrect++;
        }//of if
    }//of for i

    String resultString = "(unhandled, queried, classified) = " + Arrays.toString(tempStatusCounts);
    resultString += "\r\nCorrect = " + tempCorrect + ", accuracy = " + (tempCorrect / dataset.numInstances());
    return resultString;
}//of toString
/**
 ********************
 * The entrance of the program. Loads a dataset, runs cluster based active
 * learning with ratio 0.1, at most 30 queries and small block threshold 3,
 * then prints the statistics and the runtime.
 *
 * @param args
 *            Optional. If present, args[0] is the path of the ARFF file to
 *            load; otherwise the built-in default path is used.
 ********************
 */
public static void main(String[] args) {
    long tempStart = System.currentTimeMillis();
    System.out.println("Starting ALEC.");

    // Default dataset. The original notes also tried mushroom.arff with a
    // query budget of 800.
    String arffFilename = "D:/mitchelles/data/iris.arff";
    if (args != null && args.length > 0) {
        arffFilename = args[0];
    }//of if

    Alec tempAlec = new Alec(arffFilename);
    tempAlec.clusterBasedActiveLearning(0.1, 30, 3);
    System.out.println(tempAlec);

    long tempEnd = System.currentTimeMillis();
    System.out.println("Runtime: " + (tempEnd - tempStart) + "ms.");
}//of main
}//of class Alec
day67
最新推荐文章于 2023-12-17 23:41:26 发布