Weka_NaiveBayesSimple

最近开始读weka部分算法源码

NaiveBayesSimple是最简单的朴素贝叶斯实现:对离散属性,用带拉普拉斯平滑的频率来估计各类别下的条件概率;对连续属性,则估计各类别下的均值和标准差。核心逻辑在buildClassifier函数中。


/**
 * Trains the classifier: fills m_Counts, m_Means, m_Devs and m_Priors from
 * the training data.
 *
 * Nominal attributes end up with smoothed per-class value frequencies in
 * m_Counts; numeric attributes end up with a per-class mean (m_Means) and
 * sample standard deviation (m_Devs). m_Priors receives the smoothed class
 * frequencies.
 *
 * @param instances the training data
 * @throws Exception if the capabilities test fails, if a numeric attribute
 *         has fewer than two observed values for some class, or if its
 *         summed squared deviation for some class is not positive
 */
public void buildClassifier(Instances instances) throws Exception {

    // Reject data this classifier cannot handle before doing any work.
    getCapabilities().testWithFail(instances);

    // Work on a copy so the caller's data set is left alone, then drop
    // every instance that has no class label.
    instances = new Instances(instances);
    instances.deleteWithMissingClass();

    // NOTE(review): presumably an empty, header-only copy -- confirm.
    m_Instances = new Instances(instances, 0);

    final int numClasses = instances.numClasses();
    // One slot per attribute returned by enumerateAttributes(), which is
    // expected to skip the class attribute.
    final int numPredictors = instances.numAttributes() - 1;

    // Allocate the statistics tables, indexed [class][attribute].
    m_Counts = new double[numClasses][numPredictors][0];
    m_Means = new double[numClasses][numPredictors];
    m_Devs = new double[numClasses][numPredictors];
    m_Priors = new double[numClasses];

    // Nominal attributes need one counter per value; any other attribute
    // needs a single counter for the number of observed values.
    int attIndex = 0;
    for (Enumeration atts = instances.enumerateAttributes();
         atts.hasMoreElements(); attIndex++) {
        Attribute att = (Attribute) atts.nextElement();
        int slots = att.isNominal() ? att.numValues() : 1;
        for (int c = 0; c < numClasses; c++) {
            m_Counts[c][attIndex] = new double[slots];
        }
    }

    // Pass 1: value counts for nominal attributes, running sums and
    // observation counts for the others, and raw class counts.
    for (Enumeration insts = instances.enumerateInstances();
         insts.hasMoreElements(); ) {
        Instance inst = (Instance) insts.nextElement();
        if (inst.classIsMissing()) {
            continue;
        }
        int cls = (int) inst.classValue();
        attIndex = 0;
        for (Enumeration atts = instances.enumerateAttributes();
             atts.hasMoreElements(); attIndex++) {
            Attribute att = (Attribute) atts.nextElement();
            if (inst.isMissing(att)) {
                continue;
            }
            if (att.isNominal()) {
                m_Counts[cls][attIndex][(int) inst.value(att)]++;
            } else {
                // m_Means holds the plain sum until it is divided below.
                m_Means[cls][attIndex] += inst.value(att);
                m_Counts[cls][attIndex][0]++;
            }
        }
        m_Priors[cls]++;
    }

    // Turn the sums into means. At least two observations per class are
    // required because the deviation below divides by (count - 1).
    attIndex = 0;
    for (Enumeration atts = instances.enumerateAttributes();
         atts.hasMoreElements(); attIndex++) {
        Attribute att = (Attribute) atts.nextElement();
        if (!att.isNumeric()) {
            continue;
        }
        for (int c = 0; c < numClasses; c++) {
            double observed = m_Counts[c][attIndex][0];
            if (observed < 2) {
                throw new Exception("attribute " + att.name() +
                                    ": less than two values for class " +
                                    instances.classAttribute().value(c));
            }
            m_Means[c][attIndex] /= observed;
        }
    }

    // Pass 2: accumulate squared deviations from the per-class mean.
    for (Enumeration insts = instances.enumerateInstances();
         insts.hasMoreElements(); ) {
        Instance inst = (Instance) insts.nextElement();
        if (inst.classIsMissing()) {
            continue;
        }
        int cls = (int) inst.classValue();
        attIndex = 0;
        for (Enumeration atts = instances.enumerateAttributes();
             atts.hasMoreElements(); attIndex++) {
            Attribute att = (Attribute) atts.nextElement();
            if (inst.isMissing(att) || !att.isNumeric()) {
                continue;
            }
            double diff = m_Means[cls][attIndex] - inst.value(att);
            m_Devs[cls][attIndex] += diff * diff;
        }
    }

    // Convert the summed squares into sample standard deviations.
    attIndex = 0;
    for (Enumeration atts = instances.enumerateAttributes();
         atts.hasMoreElements(); attIndex++) {
        Attribute att = (Attribute) atts.nextElement();
        if (!att.isNumeric()) {
            continue;
        }
        for (int c = 0; c < numClasses; c++) {
            if (m_Devs[c][attIndex] <= 0) {
                throw new Exception("attribute " + att.name() +
                                    ": standard deviation is 0 for class " +
                                    instances.classAttribute().value(c));
            }
            double variance =
                m_Devs[c][attIndex] / (m_Counts[c][attIndex][0] - 1);
            m_Devs[c][attIndex] = Math.sqrt(variance);
        }
    }

    // Turn nominal value counts into add-one (Laplace) smoothed
    // frequencies: (count + 1) / (total + number of values).
    attIndex = 0;
    for (Enumeration atts = instances.enumerateAttributes();
         atts.hasMoreElements(); attIndex++) {
        Attribute att = (Attribute) atts.nextElement();
        if (!att.isNominal()) {
            continue;
        }
        for (int c = 0; c < numClasses; c++) {
            double total = Utils.sum(m_Counts[c][attIndex]);
            double denominator = total + (double) att.numValues();
            for (int v = 0; v < att.numValues(); v++) {
                m_Counts[c][attIndex][v] =
                    (m_Counts[c][attIndex][v] + 1) / denominator;
            }
        }
    }

    // Class priors get the same add-one smoothing.
    double priorTotal = Utils.sum(m_Priors);
    for (int c = 0; c < numClasses; c++) {
        m_Priors[c] = (m_Priors[c] + 1) / (priorTotal + (double) numClasses);
    }
}


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
目录 Weka 开发[1]-Instances类 1 Weka开发[2]-分类器类 2 Weka开发[3]-Evaluation类 3 Weka开发[4]-特征选择 4 Weka开发[5]-半监督算法 6 Weka开发[0]-导入Weka包 8 半监督算法工具SVMlin使用 12 半监督算法工具SVMlin读取数据代码介绍 14 Weka开发[6]-参数设置 16 Weka开发[7]-LibSVM 17 Weka开发[8]-ID3源码介绍 18 Weka开发[9]—KMeans源码介绍 21 Weka开发[10]—NBTree源码介绍 25 Weka开发[11]—J48源代码介绍 31 Weka开发[13]-Ensemble 39 Weka开发[14]-AdaBoost源代码介绍 42 Weka开发[15]-ZeroR源代码介绍(入门篇) 45 Multi-Label Classification(多标签分类) 介绍 47 Weka开发[16]-OneR源代码介绍 47 Weka开发[-1]——在你的代码中使用Weka 51 挖掘多标签数据综述(multi-label data mining)[Available] 62 数据流-移动超平面(HyperPlane)构造 63 Weka开发[17]——关联规则之Apriori 66 Weka开发[18]——寻找K个邻居 67 Weka开发[19]——NaiveBayes源代码分析 69 Weka开发[20]——IB1源代码分析 74 Weka开发[21]——IBk(KNN)源代码分析 77 Weka开发[22]——REPTree源代码分析(1) 81 Weka开发[23]——PART源代码分析 94 Weka开发[24]——Apriori源代码分析(1) 101 Weka开发[24]——Apriori源代码分析(2) 106 Weka开发[25]——Bagging源代码分析 112 Weka开发[26]——Voting源代码分析 116 Weka开发[27]——SMO源代码分析[1] 122 Weka开发[27]——SMO源代码分析[2] 127 weka开发[27]——SMO源代码分析[3] 132 Weka开发[27]——SMO源代码分析[4] 138

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值