300 Lines of Code a Day: Day 58 (the NB algorithm for symbolic data)

The code comes from Min Fan's series "日撸 Java 三百行" (Days 51-60), link: https://blog.csdn.net/minfanphd/article/details/116975957

Naive Bayes classifies instances by means of probability distributions. For the underlying theory, see Min Fan's CSDN post [1], "NB 算法 (包括符号型与数值型, 结合 Java 程序分析)" (the NB algorithm for both symbolic and numerical data, analyzed alongside a Java program).

The NB algorithm for symbolic (nominal) data has two main steps: first, compute the class distribution of the dataset; second, compute the conditional probabilities of each attribute value given the class. Both estimates use Laplacian smoothing, as summarized below.
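As a quick reference, write $n$ for the number of instances, $k$ for the number of classes, $n_i$ for the number of instances of class $c_i$, $V_j$ for the number of distinct values of attribute $a_j$, and $n_{ijv}$ for the number of instances of class $c_i$ whose attribute $a_j$ takes value $v$ (these symbols are introduced here only for illustration and do not appear in the code). The Laplacian-smoothed estimates computed by the program are

$$P^{L}(c_i) = \frac{n_i + 1}{n + k}, \qquad P^{L}(a_j = v \mid c_i) = \frac{n_{ijv} + 1}{n_i + V_j}.$$

An instance $\mathbf{x}$ is then assigned to the class maximizing $\log P^{L}(c_i) + \sum_{j} \log P^{L}(a_j = x_j \mid c_i)$, which is what classifyNominal() evaluates.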

package datastructure.nb;

import java.io.FileReader;
import java.util.Arrays;

import weka.core.Instance;
import weka.core.Instances;

/**
 * ********************************************
 * The Naive Bayes algorithm.
 * 
 * @author WX873
 **********************************************
 */
public class NaiveBayes {
	
	/**
	 * The data.
	 */
	Instances dataset;
	
	/**
	 * The number of classes. For binary classification it is 2.
	 */
	int numClasses;
	
	/**
	 * The number of instances.
	 */
	int numInstances;
	
	/**
	 * The number of conditional attributes.
	 */
	int numConditions;
	
	/**
	 * The predicted labels of all instances.
	 */
	int[] predicts;
	
	/**
	 * Class distribution.
	 */
	double[] classDistribution;
	
	/**
	 * Class distribution with Laplacian smooth.
	 */
	double[] classDistributionLaplacian;
	
	/**
	 * The counts of each attribute value for each class, indexed as
	 * [class][attribute][attribute value]. They are used to compute the
	 * conditional probabilities.
	 */
	double[][][] conditionalCounts;
	
	/**
	 * The conditional probabilities with Laplacian smooth.
	 */
	double[][][] conditionalProbabilitiesLaplacian;
	
	/**
	 * Data type.
	 */
	int dataType;
	
	/**
	 * Nominal
	 */
	public static final int NOMINAL = 0;
	
	/**
	 * Numerical.
	 */
	public static final int NUMERICAL = 1;
	
	/**
	 * **********************************************************
	 * The first constructor.
	 * 
	 * @param paraFilename   The given file.
	 * **********************************************************
	 */
	public NaiveBayes(String paraFilename) {
		dataset = null;
		try {
			FileReader fileReader = new FileReader(paraFilename);
			dataset = new Instances(fileReader);
			fileReader.close();
		} catch (Exception e) {
			System.out.println("Cannot read the file: " + paraFilename + "\r\n" + e);
			System.exit(0);
		}//of try
		
		dataset.setClassIndex(dataset.numAttributes() - 1);
		numConditions = dataset.numAttributes() - 1;
		numInstances = dataset.numInstances();
		numClasses = dataset.attribute(numConditions).numValues();
		
	}//The first constructor
	
	/**
	 * **********************************************************
	 * The second constructor.
	 * 
	 * @param paraInstances   The given Instances object.
	 * **********************************************************
	 */
	public NaiveBayes(Instances paraInstances) {
		dataset = paraInstances;
		
		dataset.setClassIndex(dataset.numAttributes() - 1);
		numConditions = dataset.numAttributes() - 1;
		numInstances = dataset.numInstances();
		numClasses = dataset.attribute(numConditions).numValues();
	}//The second constructor
	
	/**
	 * ****************************************************
	 * Set the data type.
	 * @param paraDataType   The given data type (NOMINAL or NUMERICAL).
	 * ****************************************************
	 */
	public void setDataType(int paraDataType) {
		dataType = paraDataType;
	}//of setDataType
	
	/**
	 * **************************************************************
	 * Calculate the class distribution with Laplacian smooth.
	 * **************************************************************
	 */
	public void calculateClassDistribution() {
		classDistribution = new double[numClasses];
		classDistributionLaplacian = new double[numClasses];
		
		double[] tempCounts = new double[numClasses];
		for (int i = 0; i < numInstances; i++) {
			int tempClassValue = (int)dataset.instance(i).classValue();
			tempCounts[tempClassValue]++;
		}//of for i
		
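		// Laplacian smoothing: add 1 to each class count and numClasses to the
		// denominator, so that no class receives a zero probability estimate.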
		for (int i = 0; i < numClasses; i++) {
			classDistribution[i] = tempCounts[i]/numInstances;
			classDistributionLaplacian[i] = (tempCounts[i] + 1)/(numInstances + numClasses);
		}//of for i
		
		System.out.println("Class distribution: " + Arrays.toString(classDistribution));
		System.out.println("Class distribution Laplacian: " + Arrays.toString(classDistributionLaplacian));
	}//of calculateClassDistribution
	
	/**
	 * *****************************************************
	 * Calculate the conditional probabilities with Laplacian smooth.
	 * *****************************************************
	 */
	public void calculateConditionalProbabilities() {
		conditionalCounts = new double[numClasses][numConditions][];
		conditionalProbabilitiesLaplacian = new double[numClasses][numConditions][];
		
		//Allocate space
		for (int i = 0; i < numClasses; i++) {
			for (int j = 0; j < numConditions; j++) {
				int tempNumValues = dataset.attribute(j).numValues();  // One slot for each possible value of attribute j.
				conditionalCounts[i][j] = new double[tempNumValues];
				conditionalProbabilitiesLaplacian[i][j] = new double[tempNumValues];
			}//of for j
		}//of for i
		
		//Count the instances of each class and of each (class, attribute, value) combination.
		int[] tempClassCounts = new int[numClasses];
		for (int i = 0; i < numInstances; i++) {
			int tempClass = (int)dataset.instance(i).classValue();
			tempClassCounts[tempClass]++;
			for (int j = 0; j < numConditions; j++) {
				int tempValue = (int)dataset.instance(i).value(j);
				conditionalCounts[tempClass][j][tempValue]++;
			}//of for j
		}//of for i
		
		//Now for the real probability with Laplacian
		for (int i = 0; i < numClasses; i++) {
			for (int j = 0; j < numConditions; j++) {
				int tempNumValues = dataset.attribute(j).numValues();
				for (int k = 0; k < tempNumValues; k++) {
					conditionalProbabilitiesLaplacian[i][j][k] = (conditionalCounts[i][j][k] + 1)/(tempClassCounts[i] + tempNumValues);
				}//of for k
			}//of for j
		}//of for i
		
		System.out.println("Conditional counts: " + Arrays.deepToString(conditionalCounts));	
	}//of calculateConditionalProbabilities
	
	/**
	 * *******************************************************
	 * Classify all instances; the results are stored in predicts[].
	 * *******************************************************
	 */
	public void classify() {
		predicts = new int[numInstances];
		for (int i = 0; i < numInstances; i++) {
			predicts[i] = classifyofDataTypeChose(dataset.instance(i));
		}//of for i
	}//of classify
	
	/**
	 * *******************************************************
	 * Classify an instance according to the data type.
	 * @param paraInstance   The given instance.
	 * @return The predicted class index.
	 * *******************************************************
	 */
	public int classifyofDataTypeChose(Instance paraInstance) {
		if (dataType == NOMINAL) {
			return classifyNominal(paraInstance);
		}else if(dataType == NUMERICAL){
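			// Numerical data is not handled on this day; return a placeholder for now.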
			return 0;
		}//of if
		return -1;
	}//of classifyofDataTypeChose
	
	/**
	 * ********************************************************
	 * Classify an instance with nominal data.
	 * @param paraInstance   The given instance.
	 * @return The index of the most probable class.
	 * ********************************************************
	 */
	public int classifyNominal(Instance paraInstance) {
		// Find the biggest one.
		double tempBiggest = -10000;
		int resultBestIndex = 0;
		
		for (int i = 0; i < numClasses; i++) {
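			// Sum log-probabilities rather than multiplying raw probabilities, so that
			// the product of many small values does not underflow to zero.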
			double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
			for (int j = 0; j < numConditions; j++) {
				int tempAttributeValue = (int)paraInstance.value(j);
				
				tempPseudoProbability += Math.log(conditionalProbabilitiesLaplacian[i][j][tempAttributeValue]);
			}//of for j
			
			if (tempBiggest < tempPseudoProbability) {
				tempBiggest = tempPseudoProbability;
				resultBestIndex = i;
			}//of if
		}//of for i
		
		return resultBestIndex;
	}//of classifyNominal
	
	
	/**
	 * *****************************************
	 * Compute accuracy.
	 * @return accuracy
	 * *****************************************
	 */
	public double computeAccuracy() {
		double tempCorrect = 0;
		double resultAccuracy = 0;
		
		for (int i = 0; i < numInstances; i++) {
			if (predicts[i] == (int)dataset.instance(i).classValue()) {
				tempCorrect++;
			}//of if
		}//of for i
		
		resultAccuracy = tempCorrect/numInstances;
		return resultAccuracy;
	}//of computeAccuracy
	
	/**
	 * ******************************
	 * Test nominal data.
	 * ******************************
	 */
	public static void testNominal() {
		System.out.println("Hello, Naive Bayes. I only want to test the nominal data.");
		String tempFilename = "E:/Datasets/UCIdatasets/mushroom.arff";
		
		NaiveBayes tempLearner = new NaiveBayes(tempFilename);
		tempLearner.setDataType(NOMINAL);
		tempLearner.calculateClassDistribution();
		tempLearner.calculateConditionalProbabilities();
		tempLearner.classify();
		
		System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
	}//of testNominal
	
	
	
	/**
	 * **************************************
	 * The entrance of the program.
	 * @param args
	 * **************************************
	 */
	public static void main(String[] args) {
		testNominal();
	}//of main
	
	
}//of NaiveBayes
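
The second constructor, which takes an already-loaded Instances object, is not exercised by testNominal(). Below is a minimal sketch of how it could be used; the class name NaiveBayesDemo is an assumption introduced only for illustration, and the ARFF path is the same local one used in testNominal().

package datastructure.nb;

import java.io.FileReader;

import weka.core.Instances;

public class NaiveBayesDemo {
	public static void main(String[] args) throws Exception {
		// Load the ARFF file ourselves, then hand the Instances object to the learner.
		FileReader fileReader = new FileReader("E:/Datasets/UCIdatasets/mushroom.arff");
		Instances tempData = new Instances(fileReader);
		fileReader.close();

		// The second constructor sets the class index and the counts internally.
		NaiveBayes tempLearner = new NaiveBayes(tempData);
		tempLearner.setDataType(NaiveBayes.NOMINAL);
		tempLearner.calculateClassDistribution();
		tempLearner.calculateConditionalProbabilities();
		tempLearner.classify();
		System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
	}//of main
}//of NaiveBayesDemo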
