Implementing the Naive Bayes Algorithm on Hadoop

    A Bayesian classifier works by taking an object's prior probability and using Bayes' theorem to compute its posterior probability, i.e., the probability that the object belongs to each class; the class with the largest posterior probability is chosen as the object's class.
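
    Written out, with the additional "naive" assumption that the attributes a1, ..., an are conditionally independent given the class, the classifier picks the class c that maximizes

P(c | a1, ..., an) ∝ P(c) * P(a1 | c) * P(a2 | c) * ... * P(an | c)

    (the denominator P(a1, ..., an) is the same for every class, so it can be dropped).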

    A simple example follows:

    Data: a log of the weather and whether football was played on each day

Date    Play     Weather       Temperature  Humidity    Wind
Day 1   No (0)   Sunny (0)     Hot (0)      High (0)    Low (0)
Day 2   No (0)   Sunny (0)     Hot (0)      High (0)    High (1)
Day 3   Yes (1)  Overcast (1)  Hot (0)      High (0)    Low (0)
Day 4   Yes (1)  Rainy (2)     Mild (1)     High (0)    Low (0)
Day 5   Yes (1)  Rainy (2)     Cool (2)     Normal (1)  Low (0)
Day 6   No (0)   Rainy (2)     Cool (2)     Normal (1)  High (1)
Day 7   Yes (1)  Overcast (1)  Cool (2)     Normal (1)  High (1)
Day 8   No (0)   Sunny (0)     Mild (1)     High (0)    Low (0)
Day 9   Yes (1)  Sunny (0)     Cool (2)     Normal (1)  Low (0)
Day 10  Yes (1)  Rainy (2)     Mild (1)     Normal (1)  Low (0)
Day 11  Yes (1)  Sunny (0)     Mild (1)     Normal (1)  High (1)
Day 12  Yes (1)  Overcast (1)  Mild (1)     High (0)    High (1)
Day 13  Yes (1)  Overcast (1)  Hot (0)      Normal (1)  Low (0)
Day 14  No (0)   Rainy (2)     Mild (1)     High (0)    High (1)
Day 15  ?        Sunny (0)     Cool (2)     High (0)    High (1)
    We want to predict whether football is played on Day 15 ("?" above) under these weather conditions.

    Assume Day 15 is a "play" day; the score for playing is computed as follows:

    P(play) = 9/14

    P(sunny | play) = (play days that were sunny) / (play days) = 2/9

    P(cool | play) = (play days that were cool) / (play days) = 3/9

    P(humidity high | play) = (play days with high humidity) / (play days) = 3/9

    P(wind high | play) = (play days with high wind) / (play days) = 3/9

    So the score for playing on Day 15 is P = 9/14 * 2/9 * 3/9 * 3/9 * 3/9 ≈ 0.00529. (This is proportional to the posterior; the shared denominator P(features) is dropped because it is the same for both classes.)

    Following the same steps, the score for not playing on Day 15 is P = 5/14 * 3/5 * 1/5 * 4/5 * 3/5 ≈ 0.02057.

    Since the score for not playing exceeds the score for playing, we predict that no football is played on Day 15.
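
    As a quick sanity check of the arithmetic above, the following stand-alone snippet (the class PlayBallExample is illustrative only, not part of the MapReduce job) reproduces both scores:

public class PlayBallExample {
	public static void main(String[] args) {
		// score for "play": P(play) * P(sunny|play) * P(cool|play) * P(humidity high|play) * P(wind high|play)
		double play = 9.0 / 14 * 2.0 / 9 * 3.0 / 9 * 3.0 / 9 * 3.0 / 9;
		// score for "no play": P(no) * P(sunny|no) * P(cool|no) * P(humidity high|no) * P(wind high|no)
		double noPlay = 5.0 / 14 * 3.0 / 5 * 1.0 / 5 * 4.0 / 5 * 3.0 / 5;
		System.out.printf("play: %.5f  no play: %.5f%n", play, noPlay); // 0.00529  0.02057
	}
}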

    Once the naive Bayes workflow is understood, we can design the MapReduce program. In the Mapper, each training record is split into its class label and its attribute values (each input line is a comma-separated list of integers, with the label first); the attribute values are stored in a custom value type and passed to the Reducer.

    Mapper:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class BayesMapper extends Mapper<Object, Text, IntWritable, MyWritable> {
	Logger log = LoggerFactory.getLogger(BayesMapper.class);
	private IntWritable myKey = new IntWritable();
	private MyWritable myValue = new MyWritable();

	@Override
	protected void map(Object key, Text value, Context context)
			throws IOException, InterruptedException {
		log.info("***" + value.toString());
		int[] values = getIntData(value);
		int label = values[0];                      // the class label
		int[] result = new int[values.length - 1];  // the attribute values
		for (int i = 1; i < values.length; i++) {
			result[i - 1] = values[i];
		}
		myKey.set(label);
		myValue.setValue(result);
		context.write(myKey, myValue);
	}

	// Parse one comma-separated line of integers.
	private int[] getIntData(Text value) {
		String[] values = value.toString().split(",");
		int[] data = new int[values.length];
		for (int i = 0; i < values.length; i++) {
			data[i] = Integer.parseInt(values[i]);
		}
		return data;
	}
}
    MyWritable:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class MyWritable implements Writable {
	private int[] value;

	public MyWritable() {
		// no-arg constructor required by Hadoop for deserialization
	}

	public MyWritable(int[] value) {
		this.setValue(value);
	}

	@Override
	public void write(DataOutput out) throws IOException {
		// write the length first so readFields knows how many ints to read back
		out.writeInt(value.length);
		for (int i = 0; i < value.length; i++) {
			out.writeInt(value[i]);
		}
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		int vLength = in.readInt();
		value = new int[vLength];
		for (int i = 0; i < vLength; i++) {
			value[i] = in.readInt();
		}
	}

	public int[] getValue() {
		return value;
	}

	public void setValue(int[] value) {
		this.value = value;
	}
}


    In the Reducer, the test data is loaded in setup(). Because every attribute in both the training and the test data takes only the two values 0 and 1, reduce() sums the attribute values within each class, which counts how often each attribute equals 1 (the count of 0s is the class's record total minus the count of 1s). A CountAll object stores the current class k, the probability of each attribute being 1 in class k, and the number of records in class k. Finally, cleanup() determines, for each test record, which class yields the highest score and assigns that class to the record.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class BayesReducer extends Reducer<IntWritable, MyWritable, IntWritable, IntWritable> {
	Logger log = LoggerFactory.getLogger(BayesReducer.class);
	private String testFilePath;
	// the test data
	private ArrayList<int[]> testData = new ArrayList<>();
	// one entry per class k, aggregated over all of that class's records
	private ArrayList<CountAll> allData = new ArrayList<>();

	@Override
	protected void setup(Context context)
			throws IOException, InterruptedException {
		Configuration conf = context.getConfiguration();
		testFilePath = conf.get("TestFilePath");
		Path path = new Path(testFilePath);
		FileSystem fs = path.getFileSystem(conf);
		readTestData(fs, path);
	}

	/***
	 * Input: key = class label k, values = all attribute vectors of that class,
	 * e.g. k,v => 0  {{0,1,0,1,0,0,1,...},{},{},...}
	 */
	@Override
	protected void reduce(IntWritable key, Iterable<MyWritable> values,
			Context context)
			throws IOException, InterruptedException {
		Double[] myTest = new Double[testData.get(0).length - 1];
		// Laplace smoothing: start each "attribute == 1" count at 1 ...
		for (int i = 0; i < myTest.length; i++) {
			myTest[i] = 1.0;
		}
		// ... and the record count at 2 (one pseudo-count per possible attribute value)
		Long sum = 2L;
		// count, per attribute, how many records of this class have the value 1
		for (MyWritable myWritable : values) {
			int[] myvalue = myWritable.getValue();
			for (int i = 0; i < myvalue.length; i++) {
				myTest[i] += myvalue[i];
			}
			sum += 1;
		}
		// turn counts into P(attribute == 1 | class)
		for (int i = 0; i < myTest.length; i++) {
			myTest[i] = myTest[i] / sum;
		}
		allData.add(new CountAll(sum, myTest, key.get()));
	}
	private IntWritable myKey = new IntWritable();
	private IntWritable myValue = new IntWritable();

	@Override
	protected void cleanup(Context context)
			throws IOException, InterruptedException {
		// the prior of each class in the training data, e.g.
		// k,v  0,0.4
		// k,v  1,0.6
		HashMap<Integer, Double> labelG = new HashMap<>();
		Long allSum = getSum(allData); // total number of training records
		for (int i = 0; i < allData.size(); i++) {
			labelG.put(allData.get(i).getK(),
					allData.get(i).getSum().doubleValue() / allSum);
		}
		// a test record is one element longer than the training vectors seen
		// in reduce(): its first element is the true label
		int sum = 0;
		int yes = 0;
		for (int[] test : testData) {
			int value = getClassify(test, labelG);
			if (test[0] == value) {
				yes += 1;
			}
			sum += 1;
			myKey.set(test[0]);
			myValue.set(value);
			context.write(myKey, myValue);
		}
		System.out.println("Accuracy: " + (double) yes / sum);
	}
	/***
	 * Total number of training records across all classes.
	 * @param allData2
	 * @return
	 */
	private Long getSum(ArrayList<CountAll> allData2) {
		Long allSum = 0L;
		for (CountAll countAll : allData2) {
			log.info("class: " + countAll.getK() + " data: " + myString(countAll.getValue()) + " total: " + countAll.getSum());
			allSum += countAll.getSum();
		}
		return allSum;
	}
	/***
	 * Classify one test record: return the class with the highest log-score.
	 * @param test
	 * @param labelG
	 * @return
	 */
	private int getClassify(int[] test, HashMap<Integer, Double> labelG) {
		double[] result = new double[allData.size()]; // one score per class
		for (int i = 0; i < allData.size(); i++) {
			double count = 0.0;
			CountAll ca = allData.get(i);
			Double[] pdata = ca.getValue();
			for (int j = 1; j < test.length; j++) {
				if (test[j] == 1) {
					// probability that this attribute is 1 in this class
					count += Math.log(pdata[j - 1]);
				} else {
					// probability that this attribute is 0 in this class
					count += Math.log(1 - pdata[j - 1]);
				}
			}
			count += Math.log(labelG.get(ca.getK())); // add the class prior
			result[i] = count;
		}
		// pick the class with the largest log-score; log-scores are negative,
		// so the running maximum must start at negative infinity,
		// not Double.MIN_VALUE (which is a tiny positive number)
		int index = 0;
		double maxValue = Double.NEGATIVE_INFINITY;
		for (int i = 0; i < result.length; i++) {
			if (result[i] > maxValue) {
				maxValue = result[i];
				index = i;
			}
		}
		return allData.get(index).getK();
	}
	/***
	 * Read the test data from HDFS.
	 * @param fs
	 * @param path
	 * @throws NumberFormatException
	 * @throws IOException
	 */
	private void readTestData(FileSystem fs, Path path) throws NumberFormatException, IOException {
		FSDataInputStream data = fs.open(path);
		BufferedReader bf = new BufferedReader(new InputStreamReader(data));
		String line = "";
		while ((line = bf.readLine()) != null) {
			String[] str = line.split(",");
			int[] myData = new int[str.length];
			for (int i = 0; i < str.length; i++) {
				myData[i] = Integer.parseInt(str[i]);
			}
			testData.add(myData);
		}
		bf.close();
		data.close();
	}
	// Join an array of probabilities into a comma-separated string for logging.
	public static String myString(Double[] arr) {
		StringBuilder num = new StringBuilder();
		for (int i = 0; i < arr.length; i++) {
			if (i > 0) {
				num.append(',');
			}
			num.append(arr[i]);
		}
		return num.toString();
	}
}
    CountAll:

public class CountAll {
	private Long sum;       // number of records in this class (including smoothing pseudo-counts)
	private Double[] value; // P(attribute == 1 | this class) for each attribute
	private int k;          // the class label

	public CountAll() {}

	public CountAll(Long sum, Double[] value, int k) {
		this.sum = sum;
		this.value = value;
		this.k = k;
	}
	public Double[] getValue() {
		return value;
	}
	public void setValue(Double[] value) {
		this.value = value;
	}
	public Long getSum() {
		return sum;
	}
	public void setSum(Long sum) {
		this.sum = sum;
	}
	public int getK() {
		return k;
	}
	public void setK(int k) {
		this.k = k;
	}
}
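
    The original post does not include a driver class. Below is a minimal sketch of how the job could be wired together; the class name BayesDriver and the argument order are assumptions, while the "TestFilePath" key matches what BayesReducer.setup() reads.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver: args are <training input> <output dir> <test file>.
public class BayesDriver {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		conf.set("TestFilePath", args[2]); // read by BayesReducer.setup()
		Job job = Job.getInstance(conf, "naive bayes");
		job.setJarByClass(BayesDriver.class);
		job.setMapperClass(BayesMapper.class);
		job.setReducerClass(BayesReducer.class);
		job.setMapOutputKeyClass(IntWritable.class);
		job.setMapOutputValueClass(MyWritable.class);
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(IntWritable.class);
		// cleanup() compares the scores of all classes, so every class's
		// statistics must end up in the same reduce task
		job.setNumReduceTasks(1);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}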

    As the getClassify method in the Reducer shows, the case P(ai|c) = 0 must be handled (where ai is an attribute and c a class). Laplace smoothing takes care of this: when reduce() initializes the attribute-count array myTest, every element starts at 1, and the per-class record count sum starts at 2 (one pseudo-count for each of the two possible attribute values).
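
    With those initial values, the estimate computed in reduce() works out to

P(ai = 1 | c) = (number of class-c records with ai = 1, plus 1) / (Nc + 2)

    where Nc is the number of training records in class c, so no conditional probability can ever be exactly 0 or 1.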

    When computing P(a1|c) * P(a2|c) * ... * P(an|c), each factor can be quite small, and with many attributes the product underflows, so every class's score comes out as 0. Taking logarithms, ln(P(a1|c) * P(a2|c) * ... * P(an|c)) = ln(P(a1|c)) + ln(P(a2|c)) + ... + ln(P(an|c)), avoids this loss of precision.
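
    A tiny stand-alone snippet (illustrative only, not part of the job) demonstrates the underflow:

public class UnderflowDemo {
	public static void main(String[] args) {
		double product = 1.0;
		double logSum = 0.0;
		for (int i = 0; i < 400; i++) {
			product *= 0.1;          // underflows to 0.0 after roughly 323 factors
			logSum += Math.log(0.1); // stays a representable negative number
		}
		System.out.println(product); // 0.0
		System.out.println(logSum);  // about -921.03
	}
}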

    

    A portion of the training data:

1,0,0,0,1,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0
1,0,0,1,1,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,1
1,1,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1
1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,1
1,1,0,1,1,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,1,1
1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0,1,1
1,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,0,1,0,0,1,1,1,1,0,0,1,1,1,1,1,0,1
1,1,1,0,0,1,1,1,0,1,1,1,1,0,1,0,0,1,0,1,1,0,0
1,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1
1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,1,1
1,1,0,1,1,0,0,1,1,1,0,1,1,1,1,1,1,0,1,1,0,1,1
1,0,1,1,0,0,1,1,1,0,0,0,1,1,0,0,1,1,1,0,1,1,1
1,0,0,1,1,0,0,0,1,1,0,0,0,1,1,0,1,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,1,0,1,0,1,1,0,1,0,1,1,0,0,0,1,0,0,1,1,0
1,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,1,1
1,1,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0
1,1,1,0,0,1,1,1,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
    A portion of the validation data:

1,1,0,0,1,1,0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,0,0
1,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,1,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1
1,0,1,1,1,0,0,1,0,1,0,0,1,1,1,0,1,0,0,0,0,1,0
1,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,1,0,0,0,0,0,1
1,0,0,1,1,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0,0,1,1
1,1,0,0,1,0,0,1,1,1,1,0,1,1,1,0,1,0,0,0,1,0,1
1,1,0,0,1,0,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,1,0,0,1,1,1,0,0,1,1,1,0,0,1,0,1,1,0,1,0,0,0
1,1,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0,0,0,1,0,0,0
1,1,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1
1,1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0
1,1,0,0,1,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,0,0
1,1,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,0
1,1,1,0,0,1,1,1,1,0,1,1,1,1,0,0,0,1,0,0,0,1,1
1,1,0,0,0,0,1,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,1,1,1,0,1,0,1,1,0,1,0,1,1,0,0,1,0,0,0,1,1,0
1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0
1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,1,1,1
1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,1,0,1,0,1,1,0
1,1,1,0,1,1,1,1,0,0,0,1,1,0,0,0,1,1,0,0,1,0,0
1,1,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
1,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0
1,1,1,1,1,0,1,1,1,0,1,0,0,1,1,1,1,0,0,1,1,0,0

    Since the training set has only 86 records while the validation set has 184, the final prediction accuracy is 0.8. Swapping the training and validation sets yields a higher accuracy.
