原博文:minfanphd
任务计划
第61天:决策树 (1. 准备工作)
决策树是最经典的机器学习算法. 其实我不想在后面加上"之一". 它有非常好的可解释性.
- 数据仅有一份. 分裂后的数据子集仅需要保存 availableInstances 和 availableAttributes 两个数组.
- 两个构造方法, 一个读入文件获得根节点, 另一个建立根据数据分裂的获得.
- 判断数据集是否纯, 即所有的类标签是否相同, 如果是就不用分裂了.
- 每个节点 (包括非叶节点) 都需要一个标签, 这样, 遇到未见过的属性就可以直接分类了. 为获得该标签, 可以通过投票的方式, 即 getMajorityClass().
- 最大化信息增益, 与最小化条件信息熵, 两者是等价的.
因为计算信息增益时,经验熵都是一样的,那么信息增益=经验熵-条件信息熵,所以条件信息熵越小,信息增益就越大。
- 分裂的数据块有可能是空的, 这时使用长度为 0 的数组而不是 null.
“熵”(entropy) 是表示随机变量不确定性的度量。熵越大,随机变量的不确定性就越大。
信息增益(information gain) 表示得知特征 X X X的信息而使得类 Y Y Y的信息的不确定性减少的程度。
ID3 算法的核心是在决策树各个节点上应用信息增益准则选择特征,递归地构建决策树。具体方法是:从根节点开始,对节点计算所有可能的特征的信息增益,然后选择信息增益最大的特征作为节点的特征,由该特征的不同取值建立子节点,再对子节点递归调用上述方法,构成决策树。
package MachineLearning.decisiontree;
import weka.core.*;
import java.io.FileReader;
import java.util.Arrays;
/**
* @description:ID3决策树归纳算法
* @author: Qing Zhang
* @time: 2021/7/11
*/
public class ID3 {
//数据集
Instances dataset;
//数据集是否纯(所有的类标签是否相同)
boolean pure;
//类的数量
int numClasses;
//可用的实例。其它实例不属于这个分支
int[] availableInstances;
//可用的属性。其它属性已经在到根节点的路径里被选择了
int[] availableAttributes;
//当前被选择的分割属性
int splitAttribute;
//子节点
ID3[] children;
//我的标签。内部节点同样有标签。
//例如<outlook = sunny, humidity = high>从未出现在训练集中,
//但是<humidity = high>在其它情况中有效
int label;
//预测结果,包括查询和预测的标签
int[] predicts;
//小块不能进一步划分
static int smallBlockThreshold = 3;
/**
* @Description: 构造函数
* @Param: [paraFilename]
* @return:
*/
public ID3(String paraFilename) {
dataset = null;
try {
FileReader fileReader = new FileReader(paraFilename);
dataset = new Instances(fileReader);
fileReader.close();
} catch (Exception ee) {
System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
System.exit(0);
}
dataset.setClassIndex(dataset.numAttributes() - 1);
numClasses = dataset.classAttribute().numValues();
availableInstances = new int[dataset.numInstances()];
for (int i = 0; i < availableInstances.length; i++) {
availableInstances[i] = i;
}
availableAttributes = new int[dataset.numAttributes() - 1];
for (int i = 0; i < availableAttributes.length; i++) {
availableAttributes[i] = i;
}
//初始化
children = null;
//通过投票判断标签
label = getMajorityClass(availableInstances);
//判断实例是否纯
pure = pureJudge(availableInstances);
}
/**
* @Description: 构造函数
* @Param: [paraDataset, paraAvailableInstances, paraAvailableAttributes]
* @return:
*/
public ID3(Instances paraDataset, int[] paraAvailableInstances, int[] paraAvailableAttributes) {
//复制其引用,而不是克隆availableInstances
dataset = paraDataset;
availableInstances = paraAvailableInstances;
availableAttributes = paraAvailableAttributes;
//初始化
children = null;
//通过投票判断标签
label = getMajorityClass(availableInstances);
//判断实例是否纯
pure = pureJudge(availableInstances);
}
/**
* @Description: 判断是否纯
* @Param: [paraBlock]
* @return: boolean
*/
public boolean pureJudge(int[] paraBlock) {
pure = true;
for (int i = 1; i < paraBlock.length; i++) {
if (dataset.instance(paraBlock[i]).classValue() != dataset.instance(paraBlock[0])
.classValue()) {
pure = false;
break;
}
}
return pure;
}
/**
* @Description: 通过投票计算给定块主要的类
* @Param: [paraBlock]
* @return: int
*/
public int getMajorityClass(int[] paraBlock) {
int[] tempClassCounts = new int[dataset.numClasses()];
for (int i = 0; i < paraBlock.length; i++) {
tempClassCounts[(int) dataset.instance(paraBlock[i]).classValue()]++;
}
int resultMajorityClass = -1;
int tempMaxCount = -1;
for (int i = 0; i < tempClassCounts.length; i++) {
if (tempMaxCount < tempClassCounts[i]) {
resultMajorityClass = i;
tempMaxCount = tempClassCounts[i];
}
}
return resultMajorityClass;
}
/**
* @Description: 选择最优属性
* @Param: []
* @return: int
*/
public int selectBestAttribute() {
splitAttribute = -1;
double tempMinimalEntropy = 10000;
double tempEntropy;
//选择条件熵最小的作为最优属性
for (int i = 0; i < availableAttributes.length; i++) {
tempEntropy = conditionalEntropy(availableAttributes[i]);
if (tempMinimalEntropy > tempEntropy) {
tempMinimalEntropy = tempEntropy;
splitAttribute = availableAttributes[i];
}
}
return splitAttribute;
}
/**
* @Description: 计算条件熵
* @Param: [paraAttribute]
* @return: double
*/
public double conditionalEntropy(int paraAttribute) {
// Step 1. 统计,统计出该特征下的类别分布.
//数据集的类别数量
int tempNumClasses = dataset.numClasses();
//当前属性被分类的类别数量
int tempNumValues = dataset.attribute(paraAttribute).numValues();
int tempNumInstances = availableInstances.length;
double[] tempValueCounts = new double[tempNumValues];
double[][] tempCountMatrix = new double[tempNumValues][tempNumClasses];
int tempClass, tempValue;
for (int i = 0; i < tempNumInstances; i++) {
tempClass = (int) dataset.instance(availableInstances[i]).classValue();
tempValue = (int) dataset.instance(availableInstances[i]).value(paraAttribute);
tempValueCounts[tempValue]++;
tempCountMatrix[tempValue][tempClass]++;
}
// Step 2.
double resultEntropy = 0;
double tempEntropy, tempFraction;
for (int i = 0; i < tempNumValues; i++) {
if (tempValueCounts[i] == 0) {
continue;
}
tempEntropy = 0;
for (int j = 0; j < tempNumClasses; j++) {
//该特征下其中一种情况的概率
tempFraction = tempCountMatrix[i][j] / tempValueCounts[i];
if (tempFraction == 0) {
continue;
}
tempEntropy += -tempFraction * Math.log(tempFraction);
}
resultEntropy += tempValueCounts[i] / tempNumInstances * tempEntropy;
}
return resultEntropy;
}
/**
* @Description: 根据给定的属性分割数据
* @Param: [paraAttribute]
* @return: int[][]
*/
public int[][] splitData(int paraAttribute) {
int tempNumValues = dataset.attribute(paraAttribute).numValues();
// System.out.println("Dataset " + dataset + "\r\n");
// System.out.println("Attribute " + paraAttribute + " has " +
// tempNumValues + " values.\r\n");
int[][] resultBlocks = new int[tempNumValues][];
int[] tempSizes = new int[tempNumValues];
//第一次扫描来计算每个块的大小
int tempValue;
for (int i = 0; i < availableInstances.length; i++) {
tempValue = (int) dataset.instance(availableInstances[i]).value(paraAttribute);
tempSizes[tempValue]++;
}
//分配空间
for (int i = 0; i < tempNumValues; i++) {
resultBlocks[i] = new int[tempSizes[i]];
}
//第二轮扫描来填充,先将tempSizes填充为零再依次将实例分割到相应的块中
Arrays.fill(tempSizes, 0);
for (int i = 0; i < availableInstances.length; i++) {
tempValue = (int) dataset.instance(availableInstances[i]).value(paraAttribute);
//拷贝数据
resultBlocks[tempValue][tempSizes[tempValue]] = availableInstances[i];
tempSizes[tempValue]++;
}
return resultBlocks;
}
/**
* @Description: 建立回归树
* @Param: []
* @return: void
*/
public void buildTree() {
//如果可用实例都是纯的,就不用再划分树了
if (pureJudge(availableInstances)) {
return;
}
//剩余的实例已经少于小块阈值也可以停止划分
if (availableInstances.length <= smallBlockThreshold) {
return;
}
selectBestAttribute();
int[][] tempSubBlocks = splitData(splitAttribute);
children = new ID3[tempSubBlocks.length];
//构造剩余的属性集
int[] tempRemainingAttributes = new int[availableAttributes.length - 1];
for (int i = 0; i < availableAttributes.length; i++) {
if (availableAttributes[i] < splitAttribute) {
tempRemainingAttributes[i] = availableAttributes[i];
} else if (availableAttributes[i] > splitAttribute) {
tempRemainingAttributes[i - 1] = availableAttributes[i];
}
}
// 构建儿子
for (int i = 0; i < children.length; i++) {
if ((tempSubBlocks[i] == null) || (tempSubBlocks[i].length == 0)) {
children[i] = null;
continue;
} else {
// System.out.println("Building children #" + i + " with
// instances " + Arrays.toString(tempSubBlocks[i]));
children[i] = new ID3(dataset, tempSubBlocks[i], tempRemainingAttributes);
//重要代码:递归执行此操作
children[i].buildTree();
}
}
}
/**
* @Description: 判断实例类别
* @Param: [paraInstance]
* @return: int
*/
public int classify(Instance paraInstance) {
if (children == null) {
return label;
}
ID3 tempChild = children[(int) paraInstance.value(splitAttribute)];
if (tempChild == null) {
return label;
}
return tempChild.classify(paraInstance);
}
/**
* @Description: 在测试集上测试
* @Param: [paraDataset]
* @return: double
*/
public double test(Instances paraDataset) {
double tempCorrect = 0;
for (int i = 0; i < paraDataset.numInstances(); i++) {
if (classify(paraDataset.instance(i)) == (int) paraDataset.instance(i).classValue()) {
tempCorrect++;
}
}
return tempCorrect / paraDataset.numInstances();
}
/**
* @Description: 在训练集上测试
* @Param: []
* @return: double
*/
public double selfTest() {
return test(dataset);
}
public String toString() {
String resultString = "";
String tempAttributeName = dataset.attribute(splitAttribute).name();
if (children == null) {
resultString += "class = " + label;
} else {
for (int i = 0; i < children.length; i++) {
if (children[i] == null) {
resultString += tempAttributeName + " = "
+ dataset.attribute(splitAttribute).value(i) + ":" + "class = " + label
+ "\r\n";
} else {
resultString += tempAttributeName + " = "
+ dataset.attribute(splitAttribute).value(i) + ":" + children[i]
+ "\r\n";
}
}
}
return resultString;
}
/**
* @Description: 测试该类
* @Param: []
* @return: void
*/
public static void id3Test() {
ID3 tempID3 = new ID3("F:\\研究生\\研0\\学习\\Java_Study\\data_set\\weather.arff");
// ID3 tempID3 = new ID3("D:/data/mushroom.arff");
ID3.smallBlockThreshold = 3;
tempID3.buildTree();
System.out.println("The tree is: \r\n" + tempID3);
double tempAccuracy = tempID3.selfTest();
System.out.println("The accuracy is: " + tempAccuracy);
}
public static void main(String[] args) {
id3Test();
}
}
第62天:决策树 (2. 建树与分类)
- 构建决策树是一个递归的过程, 参数设计是核心.
- 分类 classify() 也是一个递归的过程.
- 当前仅在训练集上测试. 设计单独的测试集也不困难.
- toString() 也用了递归的方式. 但输出的树格式不太好看.
- 附 weather.arff
@relation weather
@attribute Outlook {Sunny, Overcast, Rain}
@attribute Temperature {Hot, Mild, Cool}
@attribute Humidity {High, Normal, Low}
@attribute Windy {FALSE, TRUE}
@attribute Play {N, P}
@data
Sunny,Hot,High,FALSE,N
Sunny,Hot,High,TRUE,N
Overcast,Hot,High,FALSE,P
Rain,Mild,High,FALSE,P
Rain,Cool,Normal,FALSE,P
Rain,Cool,Normal,TRUE,N
Overcast,Cool,Normal,TRUE,P
Sunny,Mild,High,FALSE,N
Sunny,Cool,Normal,FALSE,P
Rain,Mild,Normal,FALSE,P
Sunny,Mild,Normal,TRUE,P
Overcast,Mild,High,TRUE,P
Overcast,Hot,Normal,FALSE,P
Rain,Mild,High,TRUE,N
第63天:集成学习之 AdaBoosting (1. 带权数据集)
AdaBoost 是一种迭代算法,其核心思想是针对同一个训练集训练不同的分类器,即弱分类器,然后把这些弱分类器集合起来,构造一个更强的最终分类器。(通常用“三个臭皮匠赛过诸葛亮”来形容)
弱学习算法—识别错误率小于1/2(即准确率仅比随机猜测略高的学习算法)
强学习算法—识别准确率很高并能在多项式时间内完成的学习算法
package MachineLearning.adaboosting;
import java.io.FileReader;
import java.util.Arrays;
import weka.core.Instances;
/**
* @description:带权数据集
* @author: Qing Zhang
* @time: 2021/7/9
*/
public class WeightedInstances extends Instances{
//有些类别需要,任何数字都可以
private static final long serialVersionUID = 12345678L;
//权重
private double[] weights;
/**
* @Description: 第一个构造函数
* 根据文件读入
* @Param: [paraFileReader]
* @return:
*/
public WeightedInstances(FileReader paraFileReader)throws Exception {
super(paraFileReader);
setClassIndex(numAttributes()-1);
// 初始化权重
weights = new double[numInstances()];
double tempAverage = 1.0 / numInstances();
for (int i = 0; i < weights.length; i++) {
weights[i] = tempAverage;
}
System.out.println("Instances weights are: " + Arrays.toString(weights));
}
/**
* @Description: 第二个构造函数
* 根据给定的数据集读入
* @Param: [paraInstances]
* @return:
*/
public WeightedInstances(Instances paraInstances) {
super(paraInstances);
setClassIndex(numAttributes() - 1);
// 初始化权重
weights = new double[numInstances()];
double tempAverage = 1.0 / numInstances();
for (int i = 0; i < weights.length; i++) {
weights[i] = tempAverage;
}
System.out.println("Instances weights are: " + Arrays.toString(weights));
}
/**
* @Description: 读取权值
* @Param: [paraIndex]
* @return: double
*/
public double getWeight(int paraIndex) {
return weights[paraIndex];
}
/**
* @Description: 调整权值
* @Param: [paraCorrectArray:被正确分类的数据集, paraAlpha:上一个分类器的权重]
* @return: void
*/
public void adjustWeights(boolean[] paraCorrectArray, double paraAlpha) {
// Step 1. 计算alpha.
double tempIncrease = Math.exp(paraAlpha);
// Step 2. 调整.
double tempWeightsSum = 0; // 为了归一化.
for (int i = 0; i < weights.length; i++) {
if (paraCorrectArray[i]) {
weights[i] /= tempIncrease;
} else {
weights[i] *= tempIncrease;
}
tempWeightsSum += weights[i];
}
// Step 3. 归一化.
for (int i = 0; i < weights.length; i++) {
weights[i] /= tempWeightsSum;
}
System.out.println("After adjusting, instances weights are: " + Arrays.toString(weights));
}
/**
* @Description: 权重调整测试
* @Param: []
* @return: void
*/
public void adjustWeightsTest() {
boolean[] tempCorrectArray = new boolean[numInstances()];
for (int i = 0; i < tempCorrectArray.length / 2; i++) {
tempCorrectArray[i] = true;
}
double tempWeightedError = 0.3;
adjustWeights(tempCorrectArray, tempWeightedError);
System.out.println("After adjusting");
System.out.println(toString());
}
public String toString() {
String resultString = "I am a weighted Instances object.\r\n" + "I have " + numInstances() + " instances and "
+ (numAttributes() - 1) + " conditional attributes.\r\n" + "My weights are: " + Arrays.toString(weights)
+ "\r\n" + "My data are: \r\n" + super.toString();
return resultString;
}
public static void main(String args[]) {
WeightedInstances tempWeightedInstances = null;
String tempFilename = "F:\\研究生\\研0\\学习\\Java_Study\\data_set\\iris.arff";
try {
FileReader tempFileReader = new FileReader(tempFilename);
tempWeightedInstances = new WeightedInstances(tempFileReader);
tempFileReader.close();
} catch (Exception exception1) {
System.out.println("Cannot read the file: " + tempFilename + "\r\n" + exception1);
System.exit(0);
}
System.out.println(tempWeightedInstances.toString());
tempWeightedInstances.adjustWeightsTest();
}
}
第64天:集成学习之 AdaBoosting (2. 树桩分类器)
这里树桩分类器每次只将数据分成两堆,每次随机选择一个特征来分类,计算最优的裁切点,然后将待预测样本中的特征值与裁切点比较,小于它就分到左边的一类,大于就分到右边的一类。
package MachineLearning.adaboosting;
import weka.core.Instance;
import java.util.Random;
/**
* @description:分类器
* @author: Qing Zhang
* @time: 2021/7/9
*/
public abstract class SimpleClassifier {
//当前属性的索引
int selectedAttribute;
//带权值的数据
WeightedInstances weightedInstances;
//训练集的精确度
double trainingAccuracy;
//类别数量,如二分类就是2
int numClasses;
//实例数量
int numInstances;
//属性数量
int numConditions;
//随机种子,用于产生随机数
Random random = new Random();
/**
* @Description: 第一个构造函数
* @Param: [paraWeightedInstances]
* @return:
*/
public SimpleClassifier(WeightedInstances paraWeightedInstances) {
weightedInstances = paraWeightedInstances;
numConditions = weightedInstances.numAttributes() - 1;
numInstances = weightedInstances.numInstances();
numClasses = weightedInstances.classAttribute().numValues();
}
/**
* @Description: 训练分类器
* @Param: []
* @return: void
*/
public abstract void train();
/**
* @Description: 分类一个实例
* @Param: [paraInstance]
* @return: int
*/
public abstract int classify(Instance paraInstance);
/**
* @Description: 标记在训练集中的实例哪些被正确的分类
* @Param: []
* @return: boolean[]
*/
public boolean[] computeCorrectnessArray() {
boolean[] resultCorrectnessArray = new boolean[weightedInstances.numInstances()];
for (int i = 0; i < resultCorrectnessArray.length; i++) {
Instance tempInstance = weightedInstances.instance(i);
if ((int) (tempInstance.classValue()) == classify(tempInstance)) {
resultCorrectnessArray[i] = true;
}
}
return resultCorrectnessArray;
}
/**
* @Description: 计算训练集分类的准确率
* @Param: []
* @return: double
*/
public double computeTrainingAccuracy() {
double tempCorrect = 0;
boolean[] tempCorrectnessArray = computeCorrectnessArray();
for (int i = 0; i < tempCorrectnessArray.length; i++) {
if (tempCorrectnessArray[i]) {
tempCorrect++;
}
}
double resultAccuracy = tempCorrect / tempCorrectnessArray.length;
return resultAccuracy;
}
/**
* @Description: 计算错误分类的权值之和。最小值设为1e-6防止出现NaN。
* @Param: []
* @return: double
*/
public double computeWeightedError() {
double resultError = 0;
boolean[] tempCorrectnessArray = computeCorrectnessArray();
for (int i = 0; i < tempCorrectnessArray.length; i++) {
if (!tempCorrectnessArray[i]) {
resultError += weightedInstances.getWeight(i);
}
}
if (resultError < 1e-6) {
resultError = 1e-6;
}
return resultError;
}
}
树桩分类器代码:
package MachineLearning.adaboosting;
import weka.core.Instance;
import java.io.FileReader;
import java.util.Arrays;
/**
* @description:树桩分类器
* @author: Qing Zhang
* @time: 2021/7/9
*/
public class StumpClassifier extends SimpleClassifier {
//权值集上当前属性的最佳裁剪
double bestCut;
//属性值小于bestCut的类标签
int leftLeafLabel;
//属性值不小于bestCut的类标签
int rightLeafLabel;
/**
* @param paraWeightedInstances
* @Description: 第一个构造函数
* @Param: [paraWeightedInstances]
* @return:
*/
public StumpClassifier(WeightedInstances paraWeightedInstances) {
super(paraWeightedInstances);
}
/**
* @Description: 训练分类器
* @Param: []
* @return: void
*/
@Override
public void train() {
// Step 1. 随机选择一个属性.
selectedAttribute = random.nextInt(numConditions);
// Step 2. 找到该属性所有的值并排序.
double[] tempValuesArray = new double[numInstances];
for (int i = 0; i < tempValuesArray.length; i++) {
tempValuesArray[i] = weightedInstances.instance(i).value(selectedAttribute);
}
Arrays.sort(tempValuesArray);
// Step 3. 初始化, 将所有实例分类与原始的切割相匹配
int tempNumLabels = numClasses;
double[] tempLabelCountArray = new double[tempNumLabels];
int tempCurrentLabel;
// Step 3.1 扫描所有的标签以获得它们的数量
for (int i = 0; i < numInstances; i++) {
// 第i个实例的标签数量
tempCurrentLabel = (int) weightedInstances.instance(i).classValue();
tempLabelCountArray[tempCurrentLabel] += weightedInstances.getWeight(i);
}
// Step 3.2 找到具有最大计数的标签
double tempMaxCorrect = 0;
int tempBestLabel = -1;
for (int i = 0; i < tempLabelCountArray.length; i++) {
if (tempMaxCorrect < tempLabelCountArray[i]) {
tempMaxCorrect = tempLabelCountArray[i];
tempBestLabel = i;
}
}
// Step 3.3 切口比最小值小一点
bestCut = tempValuesArray[0] - 0.1;
leftLeafLabel = tempBestLabel;
rightLeafLabel = tempBestLabel;
// Step 4. 逐个检查候选的切割
// Step 4.1 处理多类数据,左和右。
double tempCut;
double[][] tempLabelCountMatrix = new double[2][tempNumLabels];
for (int i = 0; i < tempValuesArray.length - 1; i++) {
// Step 4.1 一些属性的值是相同的,忽略它们
if (tempValuesArray[i] == tempValuesArray[i + 1]) {
continue;
}
tempCut = (tempValuesArray[i] + tempValuesArray[i + 1]) / 2;
// Step 4.2 扫描所有标签以获得它们的counts wrt. the cut
// 在它使用多次后再次初始化
for (int j = 0; j < 2; j++) {
for (int k = 0; k < tempNumLabels; k++) {
tempLabelCountMatrix[j][k] = 0;
}
}
for (int j = 0; j < numInstances; j++) {
// 第j个实例的标签
tempCurrentLabel = (int) weightedInstances.instance(j).classValue();
if (weightedInstances.instance(j).value(selectedAttribute) < tempCut) {
tempLabelCountMatrix[0][tempCurrentLabel] += weightedInstances.getWeight(j);
} else {
tempLabelCountMatrix[1][tempCurrentLabel] += weightedInstances.getWeight(j);
}
}
// Step 4.3 左叶子
double tempLeftMaxCorrect = 0;
int tempLeftBestLabel = 0;
for (int j = 0; j < tempLabelCountMatrix[0].length; j++) {
if (tempLeftMaxCorrect < tempLabelCountMatrix[0][j]) {
tempLeftMaxCorrect = tempLabelCountMatrix[0][j];
tempLeftBestLabel = j;
}
}
// Step 4.4 右叶子
double tempRightMaxCorrect = 0;
int tempRightBestLabel = 0;
for (int j = 0; j < tempLabelCountMatrix[1].length; j++) {
if (tempRightMaxCorrect < tempLabelCountMatrix[1][j]) {
tempRightMaxCorrect = tempLabelCountMatrix[1][j];
tempRightBestLabel = j;
}
}
// Step 4.5 与当前最好的比较
if (tempMaxCorrect < tempLeftMaxCorrect + tempRightMaxCorrect) {
tempMaxCorrect = tempLeftMaxCorrect + tempRightMaxCorrect;
bestCut = tempCut;
leftLeafLabel = tempLeftBestLabel;
rightLeafLabel = tempRightBestLabel;
}
}
System.out.println("Attribute = " + selectedAttribute + ", cut = " + bestCut + ", leftLeafLabel = "
+ leftLeafLabel + ", rightLeafLabel = " + rightLeafLabel);
}
@Override
public int classify(Instance paraInstance) {
int resultLabel = -1;
if (paraInstance.value(selectedAttribute) < bestCut) {
resultLabel = leftLeafLabel;
} else {
resultLabel = rightLeafLabel;
}
return resultLabel;
}
@Override
public String toString() {
String resultString = "I am a stump classifier.\r\n" + "I choose attribute #" + selectedAttribute
+ " with cut value " + bestCut + ".\r\n" + "The left and right leaf labels are " + leftLeafLabel
+ " and " + rightLeafLabel + ", respectively.\r\n" + "My weighted error is: " + computeWeightedError()
+ ".\r\n" + "My weighted accuracy is : " + computeTrainingAccuracy() + ".";
return resultString;
}
public static void main(String args[]) {
WeightedInstances tempWeightedInstances = null;
String tempFilename = "F:\\研究生\\研0\\学习\\Java_Study\\data_set\\iris.arff";
try {
FileReader tempFileReader = new FileReader(tempFilename);
tempWeightedInstances = new WeightedInstances(tempFileReader);
tempFileReader.close();
} catch (Exception ee) {
System.out.println("Cannot read the file: " + tempFilename + "\r\n" + ee);
System.exit(0);
}
StumpClassifier tempClassifier = new StumpClassifier(tempWeightedInstances);
tempClassifier.train();
System.out.println(tempClassifier);
System.out.println(Arrays.toString(tempClassifier.computeCorrectnessArray()));
}
}
第65天:集成学习之 AdaBoosting (3. 集成器)
首先设置分类器的数量,然后当前分类器会根据上一个分类器的分类结果以及权重来调整自己的分类效果,依次迭代下去计算所有分类器的权值,最终形成一个集成器,那么预测样本则是根据所有分类器的结果以及它们的权值做计算,最后权值最大的那个是最终预测结果。
package MachineLearning.adaboosting;
import weka.core.Instance;
import weka.core.Instances;
import java.io.FileReader;
/**
* @description:集成器
* @author: Qing Zhang
* @time: 2021/7/9
*/
public class Booster {
//分类器们
SimpleClassifier[] classifiers;
//分类器数量
int numClassifiers;
//是否在训练错误为0后停止
boolean stopAfterConverge = false;
//分类器的权重
double[] classifierWeights;
//训练数据
Instances trainingData;
//测试数据
Instances testingData;
/**
* @Description: 第一个构造函数。测试数据集等同于训练集
* @Param: [paraTrainingFilename]
* @return:
*/
public Booster(String paraTrainingFilename) {
// Step 1. 读入训练集.
try {
FileReader tempFileReader = new FileReader(paraTrainingFilename);
trainingData = new Instances(tempFileReader);
tempFileReader.close();
} catch (Exception ee) {
System.out.println("Cannot read the file: " + paraTrainingFilename + "\r\n" + ee);
System.exit(0);
}
// Step 2. 设置最后的属性作为类索引
trainingData.setClassIndex(trainingData.numAttributes() - 1);
// Step 3. 测试数据集等同于训练集
testingData = trainingData;
stopAfterConverge = true;
System.out.println("****************Data**********\r\n" + trainingData);
}
/**
* @Description: 设置基本分类器的数量并分配空间
* @Param: [paraNumBaseClassifiers]
* @return: void
*/
public void setNumBaseClassifiers(int paraNumBaseClassifiers) {
numClassifiers = paraNumBaseClassifiers;
// Step 1. 为分类器分配空间(仅供参考)
classifiers = new SimpleClassifier[numClassifiers];
// Step 2. 初始化分类器的权重
classifierWeights = new double[numClassifiers];
}
/**
* @Description: 训练集成器
* @Param: []
* @return: void
*/
public void train() {
// Step 1. 初始化
WeightedInstances tempWeightedInstances = null;
double tempError;
numClassifiers = 0;
// Step 2. 创建其它分类器
for (int i = 0; i < classifiers.length; i++) {
// Step 2.1 关键代码: 构造并调整带权值的数据集
if (i == 0) {
tempWeightedInstances = new WeightedInstances(trainingData);
} else {
// 调整数据集的权重
tempWeightedInstances.adjustWeights(classifiers[i - 1].computeCorrectnessArray(),
classifierWeights[i - 1]);
}
// Step 2.2 训练下一个分类器
classifiers[i] = new StumpClassifier(tempWeightedInstances);
classifiers[i].train();
tempError = classifiers[i].computeWeightedError();
// 关键代码: 设置分类器的权重
classifierWeights[i] = 0.5 * Math.log(1 / tempError - 1);
if (classifierWeights[i] < 1e-6) {
classifierWeights[i] = 0;
}
System.out.println("Classifier #" + i + " , weighted error = " + tempError + ", weight = "
+ classifierWeights[i] + "\r\n");
numClassifiers++;
// 精确度足够时
if (stopAfterConverge) {
double tempTrainingAccuracy = computeTrainingAccuray();
System.out.println("The accuracy of the booster is: " + tempTrainingAccuracy + "\r\n");
if (tempTrainingAccuracy > 0.999999) {
System.out.println("Stop at the round: " + i + " due to converge.\r\n");
break;
}
}
}
}
/**
* @Description: 分类一个实例
* @Param: [paraInstance]
* @return: int
*/
public int classify(Instance paraInstance) {
double[] tempLabelsCountArray = new double[trainingData.classAttribute().numValues()];
for (int i = 0; i < numClassifiers; i++) {
int tempLabel = classifiers[i].classify(paraInstance);
tempLabelsCountArray[tempLabel] += classifierWeights[i];
}
int resultLabel = -1;
double tempMax = -1;
for (int i = 0; i < tempLabelsCountArray.length; i++) {
if (tempMax < tempLabelsCountArray[i]) {
tempMax = tempLabelsCountArray[i];
resultLabel = i;
}
}
return resultLabel;
}
/**
* @Description: 在训练集上测试集成器
* @Param: []
* @return: double
*/
public double test() {
System.out.println("Testing on " + testingData.numInstances() + " instances.\r\n");
return test(testingData);
}
/**
* @Description: 测试集成器
* @Param: [paraInstances]
* @return: double
*/
public double test(Instances paraInstances) {
double tempCorrect = 0;
paraInstances.setClassIndex(paraInstances.numAttributes() - 1);
for (int i = 0; i < paraInstances.numInstances(); i++) {
Instance tempInstance = paraInstances.instance(i);
if (classify(tempInstance) == (int) tempInstance.classValue()) {
tempCorrect++;
}
}
double resultAccuracy = tempCorrect / paraInstances.numInstances();
System.out.println("The accuracy is: " + resultAccuracy);
return resultAccuracy;
}
/**
* @Description: 计算集成器的训练精确度。该精确度未加权
* @Param: []
* @return: double
*/
public double computeTrainingAccuray() {
double tempCorrect = 0;
for (int i = 0; i < trainingData.numInstances(); i++) {
if (classify(trainingData.instance(i)) == (int) trainingData.instance(i).classValue()) {
tempCorrect++;
}
}
double tempAccuracy = tempCorrect / trainingData.numInstances();
return tempAccuracy;
}
public static void main(String args[]) {
System.out.println("Starting AdaBoosting...");
Booster tempBooster = new Booster("F:\\研究生\\研0\\学习\\Java_Study\\data_set\\iris.arff");
// Booster tempBooster = new Booster("src/data/smalliris.arff");
tempBooster.setNumBaseClassifiers(100);
tempBooster.train();
System.out.println("The training accuracy is: " + tempBooster.computeTrainingAccuray());
tempBooster.test();
}
}
第66天:主动学习之 ALEC
通过找到聚类中心,聚类中心的特点是密度高于邻居,与密度较高的实例相距较远。再为每个中心实例构建集群,以递归方式将集群索引分配给非中心实例,最终生成块信息表。该算法需要用户输入半径和阈值,这将降低聚类的准确性,并需要准确找到根节点,一旦错误将会导致分类错误,从而引起代价增加。
package MachineLearning.activelearning;
import weka.core.*;
import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;
/**
* @description:主动学习Alec
* @author: Qing Zhang
* @time: 2021/7/16
*/
public class Alec {
//数据集
Instances dataset;
//一个随机种子
public static final Random random = new Random();
//可以提供的最大查询数
int maxNumQuery;
//查询的实际数量
int numQuery;
//半径,在论文上也是dc。它用于密度计算
double radius;
//实例的密度,在论文上是 rho
double[] densities;
//distanceToMaster
double[] distanceToMaster;
//索引排序,按密度下降,第一个密度自然最大
int[] descendantDensities;
//优先级
double[] priority;
//任意两点之间的最大距离
double maximalDistance;
//distanceToMaster的最大值
double maximalDelta;
//上级
int[] masters;
//预测标签
int[] predictedLabels;
//实例状态。0:未处理,1:被查询,2:被分类
int[] instanceStatusArray;
//子孙索引,以子孙顺序显示实例的代表性
int[] descendantRepresentatives;
//表示每个实例的集群。它只在clusterInTwo(int[])中使用
int[] clusterIndices;
//大小低于此阈值的块不应进一步分割。
int smallBlockThreshold = 3;
/**
* @Description: 构造函数
* @Param: [paraFilename]
* @return:
*/
public Alec(String paraFilename) {
try {
FileReader tempReader = new FileReader(paraFilename);
dataset = new Instances(tempReader);
dataset.setClassIndex(dataset.numAttributes() - 1);
tempReader.close();
} catch (Exception ee) {
System.out.println(ee);
System.exit(0);
}
computeMaximalDistance();
clusterIndices = new int[dataset.numInstances()];
}
/**
* @Description: 按后代顺序归并排序以获得索引数组。原来的数组没有改变
* Examples: input [1.2, 2.3, 0.4, 0.5], output [1, 0, 3, 2].
* input [3.1, 5.2, 6.3, 2.1, 4.4], output [2, 1, 4, 0, 3].
* 该方法相当于Python编程语言numpy模块中的argsort()
* @Param: [paraArray]
* @return: int[]
*/
public static int[] mergeSortToIndices(double[] paraArray) {
int tempLength = paraArray.length;
int[][] resultMatrix = new int[2][tempLength];// For merge sort.
//初始化
int tempIndex = 0;
for (int i = 0; i < tempLength; i++) {
resultMatrix[tempIndex][i] = i;
}
//归并
int tempCurrentLength = 1;
//当前归并集的索引
int tempFirstStart, tempSecondStart, tempSecondEnd;
while (tempCurrentLength < tempLength) {
//分成若干组
//这里的边界是自适应的数组长度,不等于2^k。
for (int i = 0; i < Math.ceil(tempLength + 0.0 / tempCurrentLength) / 2; i++) {
//集群边界
tempFirstStart = i * tempCurrentLength * 2;
tempSecondStart = tempFirstStart + tempCurrentLength;
tempSecondEnd = tempSecondStart + tempCurrentLength - 1;
if (tempSecondEnd >= tempLength) {
tempSecondEnd = tempLength - 1;
}
//归并集群
int tempFirstIndex = tempFirstStart;
int tempSecondIndex = tempSecondStart;
int tempCurrentIndex = tempFirstStart;
if (tempSecondStart >= tempLength) {
for (int j = tempFirstIndex; j < tempLength; j++) {
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j];
tempFirstIndex++;
tempCurrentIndex++;
}
break;
}
while ((tempFirstIndex <= tempSecondStart - 1) && (tempSecondIndex <= tempSecondEnd)) {
if (paraArray[resultMatrix[tempIndex % 2][tempFirstIndex]] >= paraArray[resultMatrix[tempIndex
% 2][tempSecondIndex]]) {
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex
% 2][tempFirstIndex];
tempFirstIndex++;
} else {
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex
% 2][tempSecondIndex];
tempSecondIndex++;
}
tempCurrentIndex++;
}
//剩余部分
for (int j = tempFirstIndex; j < tempSecondStart; j++) {
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j];
tempCurrentIndex++;
}
for (int j = tempSecondIndex; j <= tempSecondEnd; j++) {
resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j];
tempCurrentIndex++;
}
}
tempCurrentLength *= 2;
tempIndex++;
}
return resultMatrix[tempIndex % 2];
}
/**
* @Description: 两个实例的欧氏距离
* @Param: [paraI, paraJ]
* @return: double
*/
public double distance(int paraI, int paraJ) {
double resultDistance = 0;
double tempDifference;
for (int i = 0; i < dataset.numAttributes() - 1; i++) {
tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
resultDistance += tempDifference * tempDifference;
}
resultDistance = Math.sqrt(resultDistance);
return resultDistance;
}
/**
* @Description: 计算最大距离。结果存储在成员变量中
* @Param: []
* @return: void
*/
public void computeMaximalDistance() {
maximalDistance = 0;
double tempDistance;
for (int i = 0; i < dataset.numInstances(); i++) {
for (int j = 0; j < dataset.numInstances(); j++) {
tempDistance = distance(i, j);
if (maximalDistance < tempDistance) {
maximalDistance = tempDistance;
}
}
}
System.out.println("maximalDistance = " + maximalDistance);
}
/**
* @Description: 利用高斯核计算密度
* @Param: []
* @return: void
*/
public void computeDensitiesGaussian() {
System.out.println("radius = " + radius);
densities = new double[dataset.numInstances()];
double tempDistance;
for (int i = 0; i < dataset.numInstances(); i++) {
for (int j = 0; j < dataset.numInstances(); j++) {
tempDistance = distance(i, j);
densities[i] += Math.exp(-tempDistance * tempDistance / radius / radius);
}
}
System.out.println("The densities are " + Arrays.toString(densities) + "\r\n");
}
/**
* @Description: 计算distanceToMaster,到父节点的距离
* 节点的父节点 (master), 是比它密度大的节点中, 距离最近那个. 到父节点的距离越远, 表示独立性越强.
* @Param: []
* @return: void
*/
public void computeDistanceToMaster() {
distanceToMaster = new double[dataset.numInstances()];
masters = new int[dataset.numInstances()];
descendantDensities = new int[dataset.numInstances()];
instanceStatusArray = new int[dataset.numInstances()];
descendantDensities = mergeSortToIndices(densities);
distanceToMaster[descendantDensities[0]] = maximalDistance;
double tempDistance;
//找到距离最近的上级
for (int i = 1; i < dataset.numInstances(); i++) {
// 初始化
distanceToMaster[descendantDensities[i]] = maximalDistance;
for (int j = 0; j <= i - 1; j++) {
tempDistance = distance(descendantDensities[i], descendantDensities[j]);
if (distanceToMaster[descendantDensities[i]] > tempDistance) {
distanceToMaster[descendantDensities[i]] = tempDistance;
//记录每个节点的上级
masters[descendantDensities[i]] = descendantDensities[j];
}
}
}
System.out.println("First compute, masters = " + Arrays.toString(masters));
System.out.println("descendantDensities = " + Arrays.toString(descendantDensities));
}
/**
* @Description: 计算优先级。
* 优先级越高的元素越有可能被选为集群中心。
* 现在的方式是 rho*distanceToMaster ,但也可以是 rho^alpha * distanceToMaster
* 综合考虑密度 (能力) 与距离 (独立性). 这两者乘积越大的节点 (对象), 代表性越强.
* @Param: []
* @return: void
*/
public void computePriority() {
priority = new double[dataset.numInstances()];
for (int i = 0; i < dataset.numInstances(); i++) {
priority[i] = densities[i] * distanceToMaster[i];
}
}
/**
* @Description: 节点的块应该与其上级节点相同。这种递归方法是高效的
* 在聚类算法中使用, 需要用例子来跟踪来能明白其作用. 简单而言, 节点应与其父节点拥有同样的簇编号.
* @Param: [paraIndex]
* @return: int
*/
public int coincideWithMaster(int paraIndex) {
if (clusterIndices[paraIndex] == -1) {
int tempMaster = masters[paraIndex];
clusterIndices[paraIndex] = coincideWithMaster(tempMaster);
}
return clusterIndices[paraIndex];
}
/**
* @Description: 根据上级树将一个块分成两部分
* 将一个块分成两块, 其根节点依次是第一个与第二个 (注意到每个块都是按节点的代表性递减排序).
* @Param: [paraBlock]
* @return: int[][]
*/
public int[][] clusterInTwo(int[] paraBlock) {
//重新初始化。事实上,只有在给定块中的实例才被考虑
Arrays.fill(clusterIndices, -1);
//初始化两部分的集群编号
for (int i = 0; i < 2; i++) {
clusterIndices[paraBlock[i]] = i;
}
for (int i = 0; i < paraBlock.length; i++) {
if (clusterIndices[paraBlock[i]] != -1) {
//已经有集群编号
continue;
}
clusterIndices[paraBlock[i]] = coincideWithMaster(masters[paraBlock[i]]);
}
//子块
int[][] resultBlocks = new int[2][];
int tempFistBlockCount = 0;
for (int i = 0; i < clusterIndices.length; i++) {
if (clusterIndices[i] == 0) {
tempFistBlockCount++;
}
}
resultBlocks[0] = new int[tempFistBlockCount];
resultBlocks[1] = new int[paraBlock.length - tempFistBlockCount];
//复制。当集群数量大于2时,可以设计更短的代码。
int tempFirstIndex = 0;
int tempSecondIndex = 0;
for (int i = 0; i < paraBlock.length; i++) {
if (clusterIndices[paraBlock[i]] == 0) {
resultBlocks[0][tempFirstIndex] = paraBlock[i];
tempFirstIndex++;
} else {
resultBlocks[1][tempSecondIndex] = paraBlock[i];
tempSecondIndex++;
}
}
System.out.println("Split (" + paraBlock.length + ") instances " + Arrays.toString(paraBlock) + "\r\nto ("
+ resultBlocks[0].length + ") instances " + Arrays.toString(resultBlocks[0]) + "\r\nand ("
+ resultBlocks[1].length + ") instances " + Arrays.toString(resultBlocks[1]));
return resultBlocks;
}
/**
* @Description: 通过简单投票来分类实例
* 根据已经查询的标签, 对一个块中其它对象进行投票分类
* @Param: [paraBlock]
* @return: void
*/
public void vote(int[] paraBlock) {
int[] tempClassCounts = new int[dataset.numClasses()];
for (int i = 0; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 1) {
tempClassCounts[(int) dataset.instance(paraBlock[i]).classValue()]++;
}
}
int tempMaxClass = -1;
int tempMaxCount = -1;
for (int i = 0; i < tempClassCounts.length; i++) {
if (tempMaxCount < tempClassCounts[i]) {
tempMaxClass = i;
tempMaxCount = tempClassCounts[i];
}
}
//分类未被处理的实例
for (int i = 0; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 0) {
predictedLabels[paraBlock[i]] = tempMaxClass;
instanceStatusArray[paraBlock[i]] = 2;
}
}
}
/**
* @Description: 基于聚类的主动学习
* 为核心算法提供初始化服务
* @Param: [paraRatio, paraMaxNumQuery, paraSmallBlockThreshold]
* @return: void
*/
public void clusterBasedActiveLearning(double paraRatio, int paraMaxNumQuery, int paraSmallBlockThreshold) {
radius = maximalDistance * paraRatio;
smallBlockThreshold = paraSmallBlockThreshold;
maxNumQuery = paraMaxNumQuery;
predictedLabels = new int[dataset.numInstances()];
for (int i = 0; i < dataset.numInstances(); i++) {
predictedLabels[i] = -1;
}
computeDensitiesGaussian();
computeDistanceToMaster();
computePriority();
descendantRepresentatives = mergeSortToIndices(priority);
System.out.println("descendantRepresentatives = " + Arrays.toString(descendantRepresentatives));
numQuery = 0;
clusterBasedActiveLearning(descendantRepresentatives);
}
/**
* @Description: 基于聚类的主动学习
* 核心算法, 它是递归的.各种情况的处理要小心
* @Param: [paraBlock]
* @return: void
*/
public void clusterBasedActiveLearning(int[] paraBlock) {
System.out.println("clusterBasedActiveLearning for block " + Arrays.toString(paraBlock));
// Step 1. 要为这个块查询多少个标签。
int tempExpectedQueries = (int) Math.sqrt(paraBlock.length);
int tempNumQuery = 0;
for (int i = 0; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 1) {
tempNumQuery++;
}
}
// Step 2. 为小块投票。
if ((tempNumQuery >= tempExpectedQueries) && (paraBlock.length <= smallBlockThreshold)) {
System.out.println(
"" + tempNumQuery + " instances are queried, vote for block: \r\n" + Arrays.toString(paraBlock));
vote(paraBlock);
return;
}
// Step 3. 查询足够的标签。
for (int i = 0; i < tempExpectedQueries; i++) {
if (numQuery >= maxNumQuery) {
System.out.println("No more quries are provided, numQuery = " + numQuery + ".");
vote(paraBlock);
return;
}
if (instanceStatusArray[paraBlock[i]] == 0) {
instanceStatusArray[paraBlock[i]] = 1;
predictedLabels[paraBlock[i]] = (int) dataset.instance(paraBlock[i]).classValue();
// System.out.println("Query #" + paraBlock[i] + ", numQuery = "
// + numQuery);
numQuery++;
}
}
// Step 4. 是否纯?
int tempFirstLabel = predictedLabels[paraBlock[0]];
boolean tempPure = true;
for (int i = 1; i < tempExpectedQueries; i++) {
if (predictedLabels[paraBlock[i]] != tempFirstLabel) {
tempPure = false;
break;
}
}
if (tempPure) {
System.out.println("Classify for pure block: " + Arrays.toString(paraBlock));
for (int i = tempExpectedQueries; i < paraBlock.length; i++) {
if (instanceStatusArray[paraBlock[i]] == 0) {
predictedLabels[paraBlock[i]] = tempFirstLabel;
instanceStatusArray[paraBlock[i]] = 2;
}
}
return;
}
// Step 5. 分成两部分并独立处理它们
int[][] tempBlocks = clusterInTwo(paraBlock);
for (int i = 0; i < 2; i++) {
//注意:此处递归调用。
clusterBasedActiveLearning(tempBlocks[i]);
}
}
/**
* @Description: 展示静态信息
* @Param: []
* @return: java.lang.String
*/
public String toString() {
int[] tempStatusCounts = new int[3];
double tempCorrect = 0;
for (int i = 0; i < dataset.numInstances(); i++) {
tempStatusCounts[instanceStatusArray[i]]++;
if (predictedLabels[i] == (int) dataset.instance(i).classValue()) {
tempCorrect++;
}
}
String resultString = "(unhandled, queried, classified) = " + Arrays.toString(tempStatusCounts);
resultString += "\r\nCorrect = " + tempCorrect + ", accuracy = " + (tempCorrect / dataset.numInstances());
return resultString;
}
public static void main(String[] args) {
long tempStart = System.currentTimeMillis();
System.out.println("Starting ALEC.");
String arffFilename = "F:\\研究生\\研0\\学习\\Java_Study\\data_set\\iris.arff";
// String arffFilename = "D:/data/mushroom.arff";
Alec tempAlec = new Alec(arffFilename);
tempAlec.clusterBasedActiveLearning(0.1, 30, 3); // For iris
// tempAlec.clusterBasedActiveLearning(0.1, 800, 3); //For mushroom
System.out.println(tempAlec);
long tempEnd = System.currentTimeMillis();
System.out.println("Runtime: " + (tempEnd - tempStart) + "ms.");
}
}
第67天:主动学习之 ALEC (续)
- computeDistanceToMaster 是密度聚类的核心. 节点的父节点 (master), 是比它密度大的节点中, 距离最近那个. 到父节点的距离越远, 表示独立性越强.
- computePriority 综合考虑密度 (能力) 与距离 (独立性). 这两者乘积越大的节点 (对象), 代表性越强.
- coincideWithMaster 在聚类算法中使用, 需要用例子来跟踪来能明白其作用. 简单而言, 节点应与其父节点拥有同样的簇编号.
- clusterInTwo 将一个块分成两块, 其根节点依次是第一个与第二个 (注意到每个块都是按节点的代表性递减排序).
- vote 根据已经查询的标签, 对一个块中其它对象进行投票分类.
- clusterBasedActiveLearning(double, double, int) 为核心算法提供初始化服务.
- clusterBasedActiveLearning(int[]) 是核心算法, 它是递归的.各种情况的处理要小心.
运行截图:
第68天:主动学习之 ALEC (续)
使用了其它的数据集进行测试:
mushroom.arff:
weather.arff:
第69天:矩阵分解
矩阵分解是推荐系统的一种重要算法。同时, 它也可以用到许多其它地方。
- 用三元组来存放一个数据. 与 MBR 里面的方式相似.
- 子矩阵更新代码是核心.
- 在训练集中进行测试, 所以拟合效果好 (MAE = 0.51).
- 作为基础训练, 没有考虑正则项.
代码:
package MachineLearning.recommendersystem;
import java.io.*;
import java.util.Random;
/**
* @description:矩阵分解
* @author: Qing Zhang
* @time: 2021/7/14
*/
public class MatrixFactorization {
public class Triple {
public int user;
public int item;
public double rating;
/**
* @Description: 构造函数
* @Param: []
* @return:
*/
public Triple() {
user = -1;
item = -1;
rating = -1;
}
/**
* @Description: 构造函数
* @Param: [paraUser, paraItem, paraRating]
* @return:
*/
public Triple(int paraUser, int paraItem, double paraRating) {
user = paraUser;
item = paraItem;
rating = paraRating;
}
public String toString() {
return "" + user + ", " + item + ", " + rating;
}
}
//随机种子
Random rand = new Random();
//用户数量
int numUsers;
//项目数量
int numItems;
//评分数量
int numRatings;
//训练集
Triple[] dataset;
//用于控制学习规律的一个参数
double alpha;
//用于控制学习速率的一个参数
double lambda;
//小矩阵的低秩
int rank;
//用户矩阵U
double[][] userSubspace;
//项目矩阵V
double[][] itemSubspace;
//评分值的下界
double ratingLowerBound;
//评分值的上界
double ratingUpperBound;
/**
* @Description: 构造函数
* @Param: [paraFilename, paraNumUsers, paraNumItems, paraNumRatings, paraRatingLowerBound, paraRatingUpperBound]
* @return:
*/
public MatrixFactorization(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings,
double paraRatingLowerBound, double paraRatingUpperBound) {
numUsers = paraNumUsers;
numItems = paraNumItems;
numRatings = paraNumRatings;
ratingLowerBound = paraRatingLowerBound;
ratingUpperBound = paraRatingUpperBound;
try {
readData(paraFilename, paraNumUsers, paraNumItems, paraNumRatings);
// adjustUsingMeanRating();
} catch (Exception ee) {
System.out.println("File " + paraFilename + " cannot be read! " + ee);
System.exit(0);
} // Of try
initialize();
}
/**
* @Description: 设置相应的参数
* @Param: [paraRank, paraAlpha, paraLambda]
* @return: void
*/
public void setParameters(int paraRank, double paraAlpha, double paraLambda) {
rank = paraRank;
alpha = paraAlpha;
lambda = paraLambda;
}
/**
* @Description: 从文件读取数据
* @Param: [paraFilename, paraNumUsers, paraNumItems, paraNumRatings]
* @return: void
*/
public void readData(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings)
throws IOException {
File tempFile = new File(paraFilename);
if (!tempFile.exists()) {
System.out.println("File " + paraFilename + " does not exists.");
System.exit(0);
}
BufferedReader tempBufferReader = new BufferedReader(new FileReader(tempFile));
//分配空间
dataset = new Triple[paraNumRatings];
String tempString;
String[] tempStringArray;
//处理文件中的字符,将其存储到三元组中
for (int i = 0; i < paraNumRatings; i++) {
tempString = tempBufferReader.readLine();
tempStringArray = tempString.split(",");
dataset[i] = new Triple(Integer.parseInt(tempStringArray[0]), Integer.parseInt(tempStringArray[1]),
Double.parseDouble(tempStringArray[2]));
}
tempBufferReader.close();
}
/**
* @Description: 初始化一些变量
* @Param: []
* @return: void
*/
void initialize() {
rank = 5;
alpha = 0.0001;
lambda = 0.005;
}
/**
* @Description: 初始化子空间。每个值的区间为[0,1].
* 将用户和项分解成两个子矩阵
* @Param: []
* @return: void
*/
void initializeSubspaces() {
userSubspace = new double[numUsers][rank];
for (int i = 0; i < numUsers; i++) {
for (int j = 0; j < rank; j++) {
userSubspace[i][j] = rand.nextDouble();
}
}
itemSubspace = new double[numItems][rank];
for (int i = 0; i < numItems; i++) {
for (int j = 0; j < rank; j++) {
itemSubspace[i][j] = rand.nextDouble();
}
}
}
/**
* @Description: 预测用户对项目的评分
* @Param: [paraUser, paraItem]
* @return: double
*/
public double predict(int paraUser, int paraItem) {
double resultValue = 0;
for (int i = 0; i < rank; i++) {
//用户的行向量和项目的列向量
resultValue += userSubspace[paraUser][i] * itemSubspace[paraItem][i];
}
return resultValue;
}
/**
* @Description: 训练
* @Param: [paraRounds]
* @return: void
*/
public void train(int paraRounds) {
initializeSubspaces();
for (int i = 0; i < paraRounds; i++) {
updateNoRegular();
if (i % 50 == 0) {
//展示此过程
System.out.println("Round " + i);
System.out.println("MAE: " + mae());
}
}
}
/**
* @Description: 使用训练集更新子空间
* @Param: []
* @return: void
*/
public void updateNoRegular() {
for (int i = 0; i < numRatings; i++) {
int tempUserId = dataset[i].user;
int tempItemId = dataset[i].item;
double tempRate = dataset[i].rating;
double tempResidual = tempRate - predict(tempUserId, tempItemId); // 误差
// 更新用户子空间
double tempValue = 0;
for (int j = 0; j < rank; j++) {
tempValue = 2 * tempResidual * itemSubspace[tempItemId][j];
userSubspace[tempUserId][j] += alpha * tempValue;
}
// 更新项的子空间
for (int j = 0; j < rank; j++) {
tempValue = 2 * tempResidual * userSubspace[tempUserId][j];
itemSubspace[tempItemId][j] += alpha * tempValue;
}
}
}
/**
* @Description: 计算均方根误差
* @Param: []
* @return: double
*/
public double rmse() {
double resultRmse = 0;
int tempTestCount = 0;
for (int i = 0; i < numRatings; i++) {
int tempUserIndex = dataset[i].user;
int tempItemIndex = dataset[i].item;
double tempRate = dataset[i].rating;
double tempPrediction = predict(tempUserIndex, tempItemIndex);// +
// DataInfo.mean_rating;
if (tempPrediction < ratingLowerBound) {
tempPrediction = ratingLowerBound;
} else if (tempPrediction > ratingUpperBound) {
tempPrediction = ratingUpperBound;
}
double tempError = tempRate - tempPrediction;
resultRmse += tempError * tempError;
tempTestCount++;
}
return Math.sqrt(resultRmse / tempTestCount);
}
/**
* @Description: 计算平均绝对误差
* @Param: []
* @return: double
*/
public double mae() {
double resultMae = 0;
int tempTestCount = 0;
for (int i = 0; i < numRatings; i++) {
int tempUserIndex = dataset[i].user;
int tempItemIndex = dataset[i].item;
double tempRate = dataset[i].rating;
double tempPrediction = predict(tempUserIndex, tempItemIndex);
if (tempPrediction < ratingLowerBound) {
tempPrediction = ratingLowerBound;
}
if (tempPrediction > ratingUpperBound) {
tempPrediction = ratingUpperBound;
}
double tempError = tempRate - tempPrediction;
resultMae += Math.abs(tempError);
// System.out.println("resultMae: " + resultMae);
tempTestCount++;
}
return (resultMae / tempTestCount);
}
/**
* @Description: 测试训练
* @Param: [paraFilename, paraNumUsers, paraNumItems, paraNumRatings, paraRatingLowerBound, paraRatingUpperBound, paraRounds]
* @return: void
*/
public static void testTrainingTesting(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings,
double paraRatingLowerBound, double paraRatingUpperBound, int paraRounds) {
try {
// Step 1. 读入训练集和测试集
MatrixFactorization tempMF = new MatrixFactorization(paraFilename, paraNumUsers, paraNumItems,
paraNumRatings, paraRatingLowerBound, paraRatingUpperBound);
tempMF.setParameters(5, 0.0001, 0.005);
// Step 2. 初始化矩阵U和V的特征
tempMF.initializeSubspaces();
// Step 3. 更新并预测
System.out.println("Begin Training ! ! !");
tempMF.train(paraRounds);
double tempMAE = tempMF.mae();
double tempRMSE = tempMF.rmse();
System.out.println("Finally, MAE = " + tempMAE + ", RMSE = " + tempRMSE);
} catch (Exception e) {
e.printStackTrace();
}
}
public static void main(String args[]) {
testTrainingTesting("F:\\研究生\\研0\\学习\\Java_Study\\data_set\\movielens-943u1682m.txt", 943, 1682, 10000, 1, 5, 2000);
}
}
运行结果:
第70天:矩阵分解 (续)
根据一篇博文了解了矩阵分解的概念,该博文链接为:推荐系统笔记(矩阵分解)
矩阵分解确实可以解决一些近邻模型无法解决的问题,近邻模型存在的问题:
- 物品之间存在相关性,信息量并不是随着向量维度增加而线性增加。
- 矩阵元素稀疏,计算结果不稳定,增减一个向量维度,导致紧邻结果差异很大的情况出现。
矩阵分解就是把原来的大矩阵,近似的分解成小矩阵的乘积,在实际推荐计算时不再使用大矩阵,而是使用分解得到的两个小矩阵。
比如在本次使用的数据集中,整个数据集可以看作一个大矩阵,用户矩阵U和项目矩阵V就是将大矩阵分解成了两个小矩阵。
U
m
×
k
V
n
×
k
T
≈
A
m
×
n
\begin{aligned} U_{m\times k}V^T_{n\times k}\approx A_{m\times n} \end{aligned}
Um×kVn×kT≈Am×n
类似这样的计算过程就是矩阵分解,还有一个更常见的名字SVD,但是SVD和矩阵分解不能划等号,因为除了SVD还有一些别的矩阵分解方法。
学习过程就是:
1)准备好用户物品的评分矩阵,每一条评分数据看作是一条训练样本
2)给分解后的U矩阵和V矩阵随机初始化元素值
3)用U和V计算预测后的分数
4) 计算预测的分数和实际的分数误差
5)按照梯度下降的方向更行U和V中的元素值
6)重复步骤3到5,直到达到停止条件