day58

该博客介绍了如何实现朴素贝叶斯算法,包括数据预处理、参数计算和分类预测。首先,通过读取数据文件初始化实例,然后计算类别分布及其拉普拉斯平滑形式。接着,分别计算条件概率和高斯参数。最后,实现分类功能并计算准确性。博客提供了数值型和名义型数据的测试用例。
摘要由CSDN通过智能技术生成
package machinelearning.Bayes;

import java.io.FileReader;
import java.util.Arrays;

import weka.core.*;
/**
 * ******************************************
 * The Naive Bayes algorithm.
 *
 * <p>The classifier is trained and evaluated on the same dataset, which is read
 * from a file by the constructor. The last attribute of the dataset is used as
 * the class attribute. Two modes are supported:
 * <ul>
 * <li>{@link #NOMINAL}: conditional probabilities with Laplacian smoothing;</li>
 * <li>{@link #NUMERICAL}: one Gaussian per (class, attribute) pair.</li>
 * </ul>
 *
 * <p>Expected call order (see {@link #testNominal()} and
 * {@link #testNumerical()}): construct, {@link #setDataType(int)},
 * {@link #calculateClassDistribution()}, then either
 * {@link #calculateConditionalProbabilities()} or
 * {@link #calculateGaussianParameters()}, then {@link #classify()} and
 * {@link #computeAccuracy()}.
 *
 * @author Michelle Min MitchelleMin@163.com
 * @date 2021-07-18
 * ******************************************
 */
public class NaiveBayes {
    /**
     *************************
     * An inner class to store the parameters of one Gaussian distribution.
     * It is static because it never touches the enclosing instance.
     *************************
     */
    private static class GaussianParameters {
        /** The mean. */
        double mu;

        /** The standard deviation. */
        double sigma;

        public GaussianParameters(double paraMu, double paraSigma) {
            mu = paraMu;
            sigma = paraSigma;
        }//of the constructor

        @Override
        public String toString() {
            return "(" + mu + ", " + sigma + ")";
        }//of toString
    }//of GaussianParameters

    /**
     * The smallest standard deviation stored for a Gaussian. An attribute that
     * is constant within a class has a deviation of exactly 0, which would
     * produce log(0) and 0/0 when scoring an instance.
     */
    private static final double MIN_SIGMA = 1e-9;

    /**
     * The data.
     */
    Instances dataset;

    /**
     * The number of classes. For binary classification it is 2.
     */
    int numClasses;

    /**
     * The number of instances.
     */
    int numInstances;

    /**
     * The number of conditional attributes.
     */
    int numConditions;

    /**
     * The predicted class index of every instance, filled by {@link #classify()}.
     */
    int[] predicts;

    /**
     * Class distribution.
     */
    double[] classDistribution;

    /**
     * Class distribution with Laplacian smooth.
     */
    double[] classDistributionLaplacian;

    /**
     * The conditional probabilities for all classes over all attributes on all
     * values, indexed as [class][attribute][value].
     */
    double[][][] conditionalProbabilities;

    /**
     * The conditional probabilities with Laplacian smooth, indexed as
     * [class][attribute][value].
     */
    double[][][] conditionalProbabilitiesLaplacian;

    /**
     * The Gaussian parameters, indexed as [class][attribute].
     */
    GaussianParameters[][] gaussianParameters;

    /**
     * Data type, either {@link #NOMINAL} or {@link #NUMERICAL}.
     */
    int dataType;

    /**
     * Nominal.
     */
    public static final int NOMINAL = 0;

    /**
     * Numerical.
     */
    public static final int NUMERICAL = 1;

    /**
     ********************
     * The constructor. Reads the dataset and takes the last attribute as the
     * class attribute. If the file cannot be read, a message is printed and the
     * JVM exits with a non-zero status.
     *
     * @param paraFilename
     *            The given file.
     ********************
     */
    public NaiveBayes(String paraFilename) {
        dataset = null;
        // try-with-resources closes the reader even when parsing fails.
        try (FileReader fileReader = new FileReader(paraFilename)) {
            dataset = new Instances(fileReader);
        } catch (Exception ee) {
            System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
            // A failure must not be reported to the caller as success.
            System.exit(1);
        }//of try

        dataset.setClassIndex(dataset.numAttributes() - 1);
        numConditions = dataset.numAttributes() - 1;
        numInstances = dataset.numInstances();
        numClasses = dataset.attribute(numConditions).numValues();
    }//of the constructor

    /**
     ********************
     * Set the data type.
     *
     * @param paraDataType
     *            {@link #NOMINAL} or {@link #NUMERICAL}.
     ********************
     */
    public void setDataType(int paraDataType) {
        dataType = paraDataType;
    }//of setDataType

    /**
     ********************
     * Calculate the class distribution, both raw and with Laplacian smooth,
     * and print both.
     ********************
     */
    public void calculateClassDistribution() {
        classDistribution = new double[numClasses];
        classDistributionLaplacian = new double[numClasses];

        double[] tempCounts = new double[numClasses];
        for (int i = 0; i < numInstances; i++) {
            int tempClassValue = (int) dataset.instance(i).classValue();
            tempCounts[tempClassValue]++;
        }//of for i

        for (int i = 0; i < numClasses; i++) {
            classDistribution[i] = tempCounts[i] / numInstances;
            classDistributionLaplacian[i] = (tempCounts[i] + 1) / (numInstances + numClasses);
        }//of for i

        System.out.println("Class distribution: " + Arrays.toString(classDistribution));
        System.out.println(
                "Class distribution Laplacian: " + Arrays.toString(classDistributionLaplacian));
    }//of calculateClassDistribution

    /**
     ********************
     * Calculate the conditional probabilities, both raw and with Laplacian
     * smooth. The dataset is scanned only once. Intended for nominal
     * attributes: the table size of each attribute is its number of values.
     ********************
     */
    public void calculateConditionalProbabilities() {
        conditionalProbabilities = new double[numClasses][numConditions][];
        conditionalProbabilitiesLaplacian = new double[numClasses][numConditions][];

        // Allocate space.
        for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numConditions; j++) {
                int tempNumValues = dataset.attribute(j).numValues();
                conditionalProbabilities[i][j] = new double[tempNumValues];
                conditionalProbabilitiesLaplacian[i][j] = new double[tempNumValues];
            }//of for j
        }//of for i

        // Count the numbers. The raw table temporarily holds counts.
        int[] tempClassCounts = new int[numClasses];
        for (int i = 0; i < numInstances; i++) {
            int tempClass = (int) dataset.instance(i).classValue();
            tempClassCounts[tempClass]++;
            for (int j = 0; j < numConditions; j++) {
                int tempValue = (int) dataset.instance(i).value(j);
                conditionalProbabilities[tempClass][j][tempValue]++;
            }//of for j
        }//of for i

        // Turn the counts into probabilities.
        for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numConditions; j++) {
                int tempNumValues = conditionalProbabilities[i][j].length;
                for (int k = 0; k < tempNumValues; k++) {
                    double tempCount = conditionalProbabilities[i][j][k];

                    // Add one per possible VALUE of the attribute, so that the
                    // smoothed probabilities of an attribute sum to 1.
                    conditionalProbabilitiesLaplacian[i][j][k] = (tempCount + 1)
                            / (tempClassCounts[i] + tempNumValues);

                    // A class without instances has no raw estimate; use 0
                    // instead of 0/0.
                    conditionalProbabilities[i][j][k] = (tempClassCounts[i] == 0) ? 0
                            : tempCount / tempClassCounts[i];
                }//of for k
            }//of for j
        }//of for i

        System.out.println(Arrays.deepToString(conditionalProbabilities));
    }//of calculateConditionalProbabilities

    /**
     ********************
     * Calculate the Gaussian parameters (mean and standard deviation) of every
     * attribute within every class, and print them. The standard deviation is
     * the population one (divided by n) and is never smaller than
     * {@link #MIN_SIGMA}. A class without instances gets NaN parameters.
     ********************
     */
    public void calculateGaussianParameters() {
        gaussianParameters = new GaussianParameters[numClasses][numConditions];

        double[] tempValuesArray = new double[numInstances];
        int tempNumValues = 0;
        double tempSum = 0;

        for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numConditions; j++) {
                tempSum = 0;

                // Obtain values for this class.
                tempNumValues = 0;
                for (int k = 0; k < numInstances; k++) {
                    if ((int) dataset.instance(k).classValue() != i) {
                        continue;
                    }//of if

                    tempValuesArray[tempNumValues] = dataset.instance(k).value(j);
                    tempSum += tempValuesArray[tempNumValues];
                    tempNumValues++;
                }//of for k

                // Obtain parameters.
                double tempMu = tempSum / tempNumValues;

                double tempSigma = 0;
                for (int k = 0; k < tempNumValues; k++) {
                    double tempDifference = tempValuesArray[k] - tempMu;
                    tempSigma += tempDifference * tempDifference;
                }//of for k
                tempSigma /= tempNumValues;
                tempSigma = Math.sqrt(tempSigma);

                // Avoid a zero deviation. NaN (empty class) stays NaN.
                tempSigma = Math.max(tempSigma, MIN_SIGMA);

                gaussianParameters[i][j] = new GaussianParameters(tempMu, tempSigma);
            }//of for j
        }//of for i

        System.out.println(Arrays.deepToString(gaussianParameters));
    }//of calculateGaussianParameters

    /**
     ********************
     * Classify all instances, the results are stored in predicts[].
     ********************
     */
    public void classify() {
        predicts = new int[numInstances];
        for (int i = 0; i < numInstances; i++) {
            predicts[i] = classify(dataset.instance(i));
        }//of for i
    }//of classify

    /**
     ********************
     * Classify an instance according to the current data type.
     *
     * @param paraInstance
     *            The instance to classify.
     * @return The index of the predicted class, or -1 if the data type is
     *         neither {@link #NOMINAL} nor {@link #NUMERICAL}.
     ********************
     */
    public int classify(Instance paraInstance) {
        if (dataType == NOMINAL) {
            return classifyNominal(paraInstance);
        } else if (dataType == NUMERICAL) {
            return classifyNumerical(paraInstance);
        }//of if

        return -1;
    }//of classify

    /**
     ********************
     * Classify an instance with nominal data. Requires
     * {@link #calculateClassDistribution()} and
     * {@link #calculateConditionalProbabilities()} to have been called.
     *
     * @param paraInstance
     *            The instance to classify.
     * @return The index of the class with the biggest log-score. Ties go to the
     *         smaller index.
     ********************
     */
    public int classifyNominal(Instance paraInstance) {
        // Find the biggest one. Any real score beats negative infinity.
        double tempBiggest = Double.NEGATIVE_INFINITY;
        int resultBestIndex = 0;
        for (int i = 0; i < numClasses; i++) {
            double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
            for (int j = 0; j < numConditions; j++) {
                int tempAttributeValue = (int) paraInstance.value(j);

                // Laplacian smooth: the smoothed table never contains 0, so
                // the logarithm is always finite.
                tempPseudoProbability += Math
                        .log(conditionalProbabilitiesLaplacian[i][j][tempAttributeValue]);
            }//of for j

            if (tempBiggest < tempPseudoProbability) {
                tempBiggest = tempPseudoProbability;
                resultBestIndex = i;
            }//of if
        }//of for i

        return resultBestIndex;
    }//of classifyNominal

    /**
     ********************
     * Classify an instance with numerical data. Requires
     * {@link #calculateClassDistribution()} and
     * {@link #calculateGaussianParameters()} to have been called.
     *
     * @param paraInstance
     *            The instance to classify.
     * @return The index of the class with the biggest log-score. Ties go to the
     *         smaller index. A class whose score is NaN is never selected.
     ********************
     */
    public int classifyNumerical(Instance paraInstance) {
        // Find the biggest one. Any real score beats negative infinity.
        double tempBiggest = Double.NEGATIVE_INFINITY;
        int resultBestIndex = 0;

        for (int i = 0; i < numClasses; i++) {
            double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
            for (int j = 0; j < numConditions; j++) {
                double tempAttributeValue = paraInstance.value(j);
                double tempSigma = gaussianParameters[i][j].sigma;
                double tempMu = gaussianParameters[i][j].mu;
                double tempDifference = tempAttributeValue - tempMu;

                // Logarithm of the Gaussian density. The constant term
                // -log(sqrt(2 * pi)) is the same for all classes and is omitted.
                tempPseudoProbability += -Math.log(tempSigma)
                        - tempDifference * tempDifference / (2 * tempSigma * tempSigma);
            }//of for j

            if (tempBiggest < tempPseudoProbability) {
                tempBiggest = tempPseudoProbability;
                resultBestIndex = i;
            }//of if
        }//of for i

        return resultBestIndex;
    }//of classifyNumerical

    /**
     ********************
     * Compute accuracy. {@link #classify()} must have been called.
     *
     * @return The fraction of instances whose prediction equals the class
     *         value.
     ********************
     */
    public double computeAccuracy() {
        double tempCorrect = 0;
        for (int i = 0; i < numInstances; i++) {
            if (predicts[i] == (int) dataset.instance(i).classValue()) {
                tempCorrect++;
            }//of if
        }//of for i

        return tempCorrect / numInstances;
    }//of computeAccuracy

    /**
     *************************
     * Test nominal data with the default file.
     *************************
     */
    public static void testNominal() {
        testNominal("D:/mitchelles/data/mushroom.arff");
    }//of testNominal

    /**
     *************************
     * Test nominal data.
     *
     * @param paraFilename
     *            The file holding a dataset with nominal attributes.
     *************************
     */
    public static void testNominal(String paraFilename) {
        System.out.println("Hello, Naive Bayes. I only want to test the nominal data.");

        NaiveBayes tempLearner = new NaiveBayes(paraFilename);
        tempLearner.setDataType(NOMINAL);
        tempLearner.calculateClassDistribution();
        tempLearner.calculateConditionalProbabilities();
        tempLearner.classify();

        System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
    }//of testNominal

    /**
     *************************
     * Test numerical data with the default file.
     *************************
     */
    public static void testNumerical() {
        testNumerical("D:/mitchelles/data/iris.arff");
    }//of testNumerical

    /**
     *************************
     * Test numerical data.
     *
     * @param paraFilename
     *            The file holding a dataset with numerical attributes.
     *************************
     */
    public static void testNumerical(String paraFilename) {
        System.out.println(
                "Hello, Naive Bayes. I only want to test the numerical data with Gaussian assumption.");

        NaiveBayes tempLearner = new NaiveBayes(paraFilename);
        tempLearner.setDataType(NUMERICAL);
        tempLearner.calculateClassDistribution();
        tempLearner.calculateGaussianParameters();
        tempLearner.classify();

        System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
    }//of testNumerical

    /**
     *************************
     * Test this class.
     *
     * @param args
     *            Not used now.
     *************************
     */
    public static void main(String[] args) {
        testNominal();
        testNumerical();
    }//of main

}//of class NaiveBayes

bug:

32行拼写错误

81行拼写错误

209行Gaussian拼写错误

NB算法:

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值