1.学习该算法之前需要掌握一些数学中概率的知识,如条件概率与正态分布的概率密度函数等知识。
2.本算法主要是根据概率计算,要先搞懂算法的思想再学习代码,可以在
NB 算法 (包括符号型与数值型, 结合 Java 程序分析)https://blog.csdn.net/minfanphd/article/details/123286049
中学习代码的思路与涉及的数学公式。
3.为防止出现零概率问题(某属性取值在训练集中从未与某类别同时出现,导致条件概率为 0、连乘后整体概率坍缩为 0),需要对数据进行 Laplacian 平滑处理。
4.符号型NB算法使用的是mushroom.arff数据集,数值型NB算法使用的是iris.arff数据集。
代码:
package machinelearning.nb;
import weka.core.Instance;
import weka.core.Instances;
import java.io.FileReader;
import java.util.Arrays;
/**
 * Naive Bayes classifier supporting both nominal data (with Laplacian
 * smoothing) and numerical data (with a Gaussian/normal assumption per
 * attribute per class). The last attribute of the dataset is the class.
 */
public class NaiveBayes {

    /**
     * Gaussian distribution parameters (mean and standard deviation) of one
     * attribute under one class. Static nested class: it never touches the
     * enclosing instance, so no hidden outer reference is kept.
     */
    private static class GaussianParameters {
        /** The mean of the attribute values. */
        double mu;

        /** The standard deviation of the attribute values. */
        double sigma;

        /**
         * The constructor.
         *
         * @param paraMu    The mean.
         * @param paraSigma The standard deviation.
         */
        public GaussianParameters(double paraMu, double paraSigma) {
            this.mu = paraMu;
            this.sigma = paraSigma;
        }// Of the constructor

        @Override
        public String toString() {
            return "(" + mu + "," + sigma + ")";
        }// Of toString
    }// Of class GaussianParameters

    /**
     * The data.
     */
    Instances dataset;

    /**
     * The number of classes. For binary classification it is 2.
     */
    int numClasses;

    /**
     * The number of instances.
     */
    int numInstances;

    /**
     * The number of conditional attributes (all attributes except the class).
     */
    int numConditions;

    /**
     * The predicted labels, one per instance.
     */
    int[] predicts;

    /**
     * Class distribution P(c), estimated by relative frequency.
     */
    double[] classDistribution;

    /**
     * Class distribution with Laplacian smoothing:
     * (count(c) + 1) / (n + numClasses).
     */
    double[] classDistributionLaplacian;

    /**
     * Co-occurrence counts indexed by [class][attribute][attribute value],
     * used to compute the conditional probabilities.
     */
    double[][][] conditionalCounts;

    /**
     * The conditional probabilities P(value | class) with Laplacian smoothing.
     */
    double[][][] conditionalProbabilitiesLaplacian;

    /**
     * The Gaussian parameters, indexed by [class][attribute]. Only used for
     * numerical data.
     */
    GaussianParameters[][] gaussianParameters;

    /**
     * Data type: NOMINAL or NUMERICAL.
     */
    int dataType;

    /**
     * Nominal data type.
     */
    public static final int NOMINAL = 0;

    /**
     * Numerical data type.
     */
    public static final int NUMERICAL = 1;

    /**
     * The constructor. Read the dataset from an ARFF file. Exits with a
     * nonzero status if the file cannot be read.
     *
     * @param paraFilename The given filename.
     */
    public NaiveBayes(String paraFilename) {
        dataset = null;
        try {
            FileReader fileReader = new FileReader(paraFilename);
            dataset = new Instances(fileReader);
            fileReader.close();
        } catch (Exception ee) {
            System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
            // Nonzero status: the program terminates because of an error.
            System.exit(1);
        }// Of try

        // The last attribute is the class.
        dataset.setClassIndex(dataset.numAttributes() - 1);
        numConditions = dataset.numAttributes() - 1;
        numInstances = dataset.numInstances();
        numClasses = dataset.attribute(numConditions).numValues();
    }// Of the constructor

    /**
     * The constructor. Use a dataset that is already in memory.
     *
     * @param paraInstances The given dataset.
     */
    public NaiveBayes(Instances paraInstances) {
        dataset = paraInstances;

        // The last attribute is the class.
        dataset.setClassIndex(dataset.numAttributes() - 1);
        numConditions = dataset.numAttributes() - 1;
        numInstances = dataset.numInstances();
        numClasses = dataset.attribute(numConditions).numValues();
    }// Of the constructor

    /**
     * Set the data type.
     *
     * @param paraDataType NOMINAL or NUMERICAL.
     */
    public void setDataType(int paraDataType) {
        this.dataType = paraDataType;
    }// Of setDataType

    /**
     * Calculate the class distribution, both plain and with Laplacian
     * smoothing.
     */
    public void calculateClassDistribution() {
        classDistribution = new double[numClasses];
        classDistributionLaplacian = new double[numClasses];

        double[] tempCounts = new double[numClasses];
        for (int i = 0; i < numInstances; i++) {
            int tempClassValue = (int) dataset.instance(i).classValue();
            tempCounts[tempClassValue]++;
        }// Of for i

        for (int i = 0; i < numClasses; i++) {
            classDistribution[i] = tempCounts[i] / numInstances;
            // Laplacian smoothing: add 1 to each class count.
            classDistributionLaplacian[i] = (tempCounts[i] + 1) / (numInstances + numClasses);
        }// Of for i

        System.out.println("Class distribution: " + Arrays.toString(classDistribution));
        System.out.println("Class distribution Laplacian: " + Arrays.toString(classDistributionLaplacian));
    }// Of calculateClassDistribution

    /**
     * Calculate the conditional probabilities with Laplacian smoothing.
     * ONLY scan the dataset once.
     */
    public void calculateConditionalProbabilities() {
        conditionalCounts = new double[numClasses][numConditions][];
        conditionalProbabilitiesLaplacian = new double[numClasses][numConditions][];

        // Step 1. Allocate space: each attribute has its own number of values.
        for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numConditions; j++) {
                int tempNumValues = dataset.attribute(j).numValues();
                conditionalCounts[i][j] = new double[tempNumValues];
                conditionalProbabilitiesLaplacian[i][j] = new double[tempNumValues];
            }// Of for j
        }// Of for i

        // Step 2. Count co-occurrences in a single pass over the data.
        int[] tempClassCounts = new int[numClasses];
        for (int i = 0; i < numInstances; i++) {
            int tempClass = (int) dataset.instance(i).classValue();
            tempClassCounts[tempClass]++;
            for (int j = 0; j < numConditions; j++) {
                int tempValue = (int) dataset.instance(i).value(j);
                conditionalCounts[tempClass][j][tempValue]++;
            }// Of for j
        }// Of for i

        // Step 3. The real probabilities with Laplacian smoothing:
        // (count + 1) / (classCount + numValues).
        for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numConditions; j++) {
                int tempNumValues = dataset.attribute(j).numValues();
                for (int k = 0; k < tempNumValues; k++) {
                    conditionalProbabilitiesLaplacian[i][j][k] = (conditionalCounts[i][j][k] + 1)
                            / (tempClassCounts[i] + tempNumValues);
                }// Of for k
            }// Of for j
        }// Of for i

        // Bug fix: the original printed conditionalCounts under the label
        // "Conditional probabilities". Print the smoothed probabilities.
        System.out.println(
                "Conditional probabilities: " + Arrays.deepToString(conditionalProbabilitiesLaplacian));
    }// Of calculateConditionalProbabilities

    /**
     * Calculate the Gaussian parameters (mu, sigma) of every attribute under
     * every class. Only meaningful for numerical data.
     */
    public void calculateGaussianParameters() {
        gaussianParameters = new GaussianParameters[numClasses][numConditions];

        double[] tempValuesArray = new double[numInstances];
        int tempNumValues = 0;
        double tempSum = 0;

        for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numConditions; j++) {
                tempSum = 0;

                // Obtain the values of attribute j for instances of class i.
                tempNumValues = 0;
                for (int k = 0; k < numInstances; k++) {
                    if ((int) dataset.instance(k).classValue() != i) {
                        continue;
                    }// Of if

                    tempValuesArray[tempNumValues] = dataset.instance(k).value(j);
                    tempSum += tempValuesArray[tempNumValues];
                    tempNumValues++;
                }// Of for k

                // Obtain parameters: sample mean and (population) standard
                // deviation.
                double tempMu = tempSum / tempNumValues;

                double tempSigma = 0;
                for (int k = 0; k < tempNumValues; k++) {
                    tempSigma += (tempValuesArray[k] - tempMu) * (tempValuesArray[k] - tempMu);
                }// Of for k
                tempSigma /= tempNumValues;
                tempSigma = Math.sqrt(tempSigma);

                gaussianParameters[i][j] = new GaussianParameters(tempMu, tempSigma);
            }// Of for j
        }// Of for i

        System.out.println("The mu and sigma: " + Arrays.deepToString(gaussianParameters));
    }// Of calculateGaussianParameters

    /**
     * Classify all instances of the dataset; the results are stored in
     * predicts.
     */
    public void classify() {
        predicts = new int[numInstances];
        for (int i = 0; i < numInstances; i++) {
            predicts[i] = classify(dataset.instance(i));
        }// Of for i
    }// Of classify

    /**
     * Classify one instance according to the data type.
     *
     * @param paraInstance The given instance.
     * @return The predicted class index, or -1 if the data type is unknown.
     */
    private int classify(Instance paraInstance) {
        if (dataType == NOMINAL) {
            return classifyNominal(paraInstance);
        } else if (dataType == NUMERICAL) {
            return classifyNumerical(paraInstance);
        }// Of if

        return -1;
    }// Of classify

    /**
     * Classify one instance with nominal data. Work in log space to avoid
     * floating-point underflow of the probability product.
     *
     * @param paraInstance The given instance.
     * @return The predicted class index.
     */
    private int classifyNominal(Instance paraInstance) {
        // Find the biggest log-pseudo-probability. NEGATIVE_INFINITY is a safe
        // lower bound; a finite sentinel such as -10000 can be exceeded by
        // real log-probabilities and misclassify.
        double tempBiggest = Double.NEGATIVE_INFINITY;
        int resultBestIndex = 0;
        for (int i = 0; i < numClasses; i++) {
            double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
            for (int j = 0; j < numConditions; j++) {
                int tempAttributeValue = (int) paraInstance.value(j);

                tempPseudoProbability += Math
                        .log(conditionalProbabilitiesLaplacian[i][j][tempAttributeValue]);
            }// Of for j

            if (tempBiggest < tempPseudoProbability) {
                tempBiggest = tempPseudoProbability;
                resultBestIndex = i;
            }// Of if
        }// Of for i

        return resultBestIndex;
    }// Of classifyNominal

    /**
     * Classify one instance with numerical data under the Gaussian assumption.
     * Uses log of the density; constant factors common to all classes are
     * dropped.
     *
     * @param paraInstance The given instance.
     * @return The predicted class index.
     */
    private int classifyNumerical(Instance paraInstance) {
        // Find the biggest log-pseudo-probability.
        double tempBiggest = Double.NEGATIVE_INFINITY;
        int resultBestIndex = 0;
        for (int i = 0; i < numClasses; i++) {
            double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
            for (int j = 0; j < numConditions; j++) {
                double tempAttributeValue = paraInstance.value(j);
                double tempSigma = gaussianParameters[i][j].sigma;
                double tempMu = gaussianParameters[i][j].mu;

                // log N(x; mu, sigma) up to an additive constant.
                tempPseudoProbability += -Math.log(tempSigma)
                        - (tempAttributeValue - tempMu) * (tempAttributeValue - tempMu)
                                / (2 * tempSigma * tempSigma);
            }// Of for j

            if (tempBiggest < tempPseudoProbability) {
                tempBiggest = tempPseudoProbability;
                resultBestIndex = i;
            }// Of if
        }// Of for i

        return resultBestIndex;
    }// Of classifyNumerical

    /**
     * Compute the classification accuracy on the (training) dataset.
     *
     * @return The fraction of correctly classified instances.
     */
    public double computeAccuracy() {
        double tempCorrect = 0;
        for (int i = 0; i < numInstances; i++) {
            if (predicts[i] == (int) dataset.instance(i).classValue()) {
                tempCorrect++;
            }// Of if
        }// Of for i

        double resultAccuracy = tempCorrect / numInstances;
        return resultAccuracy;
    }// Of computeAccuracy

    /**
     * Test nominal data with the mushroom dataset.
     */
    public static void testNominal() {
        System.out.println("Hello, Naive Bayes. I only want to test the nominal data.");
        // NOTE(review): hard-coded local path; adjust to your environment.
        String tempFilename = "D:\\研究生学习\\测试文件\\sampledata-main\\mushroom.arff";

        NaiveBayes tempLearner = new NaiveBayes(tempFilename);
        tempLearner.setDataType(NOMINAL);
        tempLearner.calculateClassDistribution();
        tempLearner.calculateConditionalProbabilities();
        tempLearner.classify();

        System.out.println("The accuracy is: " + tempLearner.computeAccuracy() + "\n");
    }// Of testNominal

    /**
     * Test numerical data with the iris dataset.
     */
    public static void testNumerical() {
        System.out.println(
                "Hello, Naive Bayes. I only want to test the numerical data with Gaussian assumption.");
        // NOTE(review): hard-coded local path; adjust to your environment.
        String tempFilename = "D:\\研究生学习\\测试文件\\sampledata-main\\iris.arff";

        NaiveBayes tempLearner = new NaiveBayes(tempFilename);
        tempLearner.setDataType(NUMERICAL);
        tempLearner.calculateClassDistribution();
        tempLearner.calculateGaussianParameters();
        tempLearner.classify();

        System.out.println("The accuracy is: " + tempLearner.computeAccuracy() + "\n");
    }// Of testNumerical

    /**
     * The entrance of the program.
     *
     * @param args Not used.
     */
    public static void main(String[] args) {
        testNominal();
        testNumerical();
    }// Of main
}// Of class NaiveBayes
运行结果:
Hello, Naive Bayes. I only want to test the nominal data.
Class distribution: [0.48202855736090594, 0.517971442639094]
Class distribution Laplacian: [0.4820329805562392, 0.5179670194437608]
Conditional probabilities: [[[48.0, 4.0, 1708.0, 1556.0, 600.0, 0.0], [760.0, 4.0, 1740.0, 1412.0], [1020.0, 120.0, 12.0, 808.0, 0.0, 88.0, 0.0, 876.0, 320.0, 672.0], [624.0, 3292.0], [0.0, 0.0, 192.0, 576.0, 2160.0, 36.0, 120.0, 256.0, 576.0], [18.0, 0.0, 3898.0, 0.0], [3804.0, 112.0, 0.0], [1692.0, 2224.0], [64.0, 112.0, 1728.0, 528.0, 504.0, 24.0, 0.0, 640.0, 48.0, 0.0, 246.0, 22.0], [1900.0, 2016.0], [1856.0, 44.0, 0.0, 256.0, 0.0, 0.0, 1760.0], [144.0, 8.0, 2228.0, 1536.0], [144.0, 76.0, 2160.0, 1536.0], [432.0, 432.0, 36.0, 0.0, 0.0, 1296.0, 0.0, 1712.0, 8.0], [448.0, 432.0, 36.0, 0.0, 0.0, 1296.0, 0.0, 1680.0, 24.0], [3916.0, 0.0], [0.0, 0.0, 3908.0, 8.0], [36.0, 3808.0, 72.0], [0.0, 1768.0, 0.0, 1296.0, 36.0, 816.0, 0.0, 0.0], [224.0, 224.0, 0.0, 1584.0, 72.0, 0.0, 0.0, 1812.0, 0.0], [0.0, 52.0, 0.0, 368.0, 2848.0, 648.0], [740.0, 592.0, 36.0, 1008.0, 272.0, 0.0, 1268.0]], [[404.0, 0.0, 1948.0, 1596.0, 228.0, 32.0], [1560.0, 0.0, 1504.0, 1144.0], [1264.0, 48.0, 32.0, 1032.0, 16.0, 56.0, 16.0, 624.0, 720.0, 400.0], [2752.0, 1456.0], [400.0, 400.0, 0.0, 0.0, 0.0, 0.0, 3408.0, 0.0, 0.0], [192.0, 0.0, 4016.0, 0.0], [3008.0, 1200.0, 0.0], [3920.0, 288.0], [344.0, 936.0, 0.0, 204.0, 248.0, 0.0, 64.0, 852.0, 444.0, 96.0, 956.0, 64.0], [1616.0, 2592.0], [1920.0, 512.0, 0.0, 864.0, 0.0, 192.0, 720.0], [408.0, 16.0, 144.0, 3640.0], [456.0, 208.0, 144.0, 3400.0], [16.0, 0.0, 0.0, 576.0, 192.0, 576.0, 96.0, 2752.0, 0.0], [64.0, 0.0, 0.0, 576.0, 192.0, 576.0, 96.0, 2704.0, 0.0], [4208.0, 0.0], [96.0, 96.0, 4016.0, 0.0], [0.0, 3680.0, 528.0], [0.0, 1008.0, 48.0, 0.0, 0.0, 3152.0, 0.0, 0.0], [1648.0, 1744.0, 48.0, 48.0, 0.0, 48.0, 48.0, 576.0, 48.0], [384.0, 288.0, 400.0, 880.0, 1192.0, 1064.0], [1408.0, 240.0, 256.0, 136.0, 96.0, 192.0, 1880.0]]]
The accuracy is: 0.9566715903495815
Hello, Naive Bayes. I only want to test the numerical data with Gaussian assumption.
Class distribution: [0.3333333333333333, 0.3333333333333333, 0.3333333333333333]
Class distribution Laplacian: [0.3333333333333333, 0.3333333333333333, 0.3333333333333333]
The mu and sigma: [[(5.005999999999999,0.348946987377739), (3.4180000000000006,0.37719490982779713), (1.464,0.17176728442867115), (0.2439999999999999,0.10613199329137278)], [(5.936,0.5109833656783752), (2.7700000000000005,0.31064449134018135), (4.26,0.4651881339845204), (1.3259999999999998,0.19576516544063702)], [(6.587999999999998,0.6294886813914925), (2.9739999999999998,0.319255383666431), (5.552,0.5463478745268441), (2.026,0.2718896835115301)]]
The accuracy is: 0.96