Naive Bayes Classification of Symbolic Data with MapReduce (Final Part)

The previous post, Naive Bayes Classification of Symbolic Data with MapReduce (Part 1), briefly introduced the theory behind the Naive Bayes algorithm. This post shows how to implement the algorithm with MapReduce.

Input Data

As before, we use the example data from the book Machine Learning, shown below.

Training data

Sunny Hot High Weak No
Sunny Hot High Strong No
Overcast Hot High Weak Yes
Rain Mild High Weak Yes
Rain Cool Normal Weak Yes
Rain Cool Normal Strong No
Overcast Cool Normal Strong Yes
Sunny Mild High Weak No
Sunny Cool Normal Weak Yes
Rain Mild Normal Weak Yes
Sunny Mild Normal Strong Yes
Overcast Mild High Strong Yes
Overcast Hot Normal Weak Yes
Rain Mild High Strong No  

Prediction data

Sunny Hot High Weak
Sunny Cool Normal Strong
Overcast Hot High Weak
Sunny Hot High Weak
Sunny Hot High Strong

MapReduce Solution

The overall solution has two parts, run as two chained jobs:

  • 1. Build the classifier with one MapReduce job
  • 2. Classify the input data with a second MapReduce job

Stage 1: Building the Classifier from Symbolic Training Data

Mapper-stage task

This stage takes a training instance $X=(X_1=u_1,\dots,X_m=u_m)$ together with its class label and emits key-value pairs of the form <attribute,class>. For the record (Sunny, Hot, High, Weak, No) it produces the output below; the extra <CLASS,No> pair counts the record toward its class total, so the second job can recover the class priors:
<Sunny,No>,<1>
<Hot,No>,<1>
<High,No>,<1>
<Weak,No>,<1>
<CLASS,No>,<1>

Mapper-stage code
package com.deng.NaiveBayes;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class trainMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private Text reduceKey = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] tokens = value.toString().split(" ");
        int classIndex = tokens.length - 1;    // the class label is the last token
        String theClass = tokens[classIndex];
        // emit <attribute,class> -> 1 for every attribute of this record
        for (int i = 0; i < classIndex; i++) {
            reduceKey.set(tokens[i] + "," + theClass);
            context.write(reduceKey, ONE);
        }
        // emit the special <CLASS,class> -> 1 pair to count records per class
        reduceKey.set("CLASS," + theClass);
        context.write(reduceKey, ONE);
    }
}

Reducer-stage task

This stage acts as a simple counter: it sums up the frequency of each key.

Reducer-stage code
package com.deng.NaiveBayes;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class trainReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // sum the counts for this <attribute,class> (or <CLASS,class>) key
        int total = 0;
        for (IntWritable value : values) {
            total += value.get();
        }
        context.write(key, new IntWritable(total));
    }
}
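
For the training data above, the first job writes one tab-separated count per key to output/part-r-00000. A few representative lines (counts tallied by hand from the 14 training records):

CLASS,No	5
CLASS,Yes	9
Cool,No	1
Cool,Yes	3
Sunny,No	3
Sunny,Yes	2

Since this reducer only sums counts, it could also be registered as a combiner (job.setCombinerClass(trainReducer.class)) to cut shuffle traffic; the driver below does not do this.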

Stage 2: Classifying New Symbolic Data with the Classifier

Once the classifier has been built, new data can be classified.

Mapper-stage task

This stage mainly deduplicates the input so that identical records are not classified more than once; for example, Sunny Hot High Weak appears twice in the prediction data above, but both lines become the same key and are classified only once.

Mapper-stage code
package com.deng.NaiveBayes;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class ClassifierMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // emit the whole record as the key; identical records collapse
        // into a single reduce group, which deduplicates the input
        context.write(value, ONE);
    }
}

Reducer-stage task

This stage builds a probability table from the output of the first MapReduce job, then, for each distinct record, multiplies the class prior by the conditional probability of every attribute to decide which class the record belongs to.
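
For example, for the prediction record Sunny Cool Normal Strong the reducer compares (all counts taken from the training data above):

P(Yes) · P(Sunny|Yes) · P(Cool|Yes) · P(Normal|Yes) · P(Strong|Yes) = 9/14 · 2/9 · 3/9 · 6/9 · 3/9 ≈ 0.0106
P(No) · P(Sunny|No) · P(Cool|No) · P(Normal|No) · P(Strong|No) = 5/14 · 3/5 · 1/5 · 1/5 · 3/5 ≈ 0.0051

The Yes posterior is larger, so the record is classified as Yes.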

Reducer-stage code
package com.deng.NaiveBayes;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.List;

public class ClassifierReducer extends Reducer<Text, IntWritable, Text, Text> {
    private List<String> classifications = null;
    private ProbabilityTable theProbabilityTable;

    @Override
    public void setup(Context context) {
        try {
            // load the first job's output into memory and collect the class labels
            theProbabilityTable = new ProbabilityTable();
            classifications = theProbabilityTable.buildClassifications();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        String[] attributes = key.toString().split(" ");
        String selectClass = null;
        double maxPosterior = 0.0;
        // compute P(C) * prod_i P(attribute_i | C) for each class C
        // and keep the class with the largest posterior
        for (String aClass : classifications) {
            double posterior = theProbabilityTable.getClassProbability(aClass);
            for (String attribute : attributes) {
                posterior *= theProbabilityTable.getConditionalProbability(attribute, aClass);
            }
            if (selectClass == null || posterior > maxPosterior) {
                selectClass = aClass;
                maxPosterior = posterior;
            }
        }
        context.write(key, new Text(selectClass + "," + maxPosterior));
    }
}
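
One practical caveat: with many attributes, multiplying small conditional probabilities can underflow toward zero. A common alternative, not used in the original code, is to compare log-posteriors instead; a minimal sketch of how the inner loop could be rewritten:

        // sketch: accumulate log-probabilities instead of a raw product
        double logPosterior = Math.log(theProbabilityTable.getClassProbability(aClass));
        for (String attribute : attributes) {
            double p = theProbabilityTable.getConditionalProbability(attribute, aClass);
            if (p == 0.0) {                      // unseen attribute/class pair
                logPosterior = Double.NEGATIVE_INFINITY;
                break;
            }
            logPosterior += Math.log(p);
        }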

The ProbabilityTable class is shown below. Note that it reads output/part-r-00000 and input/NaiveBayes.txt through plain FileReaders, so the code as written assumes the jobs run in local mode against the local filesystem.
package com.deng.NaiveBayes;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ProbabilityTable {
    // cache the first job's output in memory so the file
    // is not re-read for every lookup
    private static Map<String, Double> classifierMap = new HashMap<>();
    private static Double tot;   // total number of training records

    public ProbabilityTable() {
        try {
            tot = 0.0;
            conditionProbabilityBuild();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // load the reducer output (<key>\t<count>) into classifierMap
    public void conditionProbabilityBuild() throws IOException {
        BufferedReader br = new BufferedReader(new FileReader("output/part-r-00000"));
        String str;
        while ((str = br.readLine()) != null) {
            String[] line = str.split("\t");
            classifierMap.put(line[0], Double.parseDouble(line[1]));
        }
        br.close();
    }

    // collect the class labels, recounting them from the training file
    // (the CLASS,<class> entries written by the first job would serve as well)
    public List<String> buildClassifications() throws IOException {
        BufferedReader br = new BufferedReader(new FileReader("input/NaiveBayes.txt"));
        String str;
        List<String> classifications = new ArrayList<>();
        while ((str = br.readLine()) != null) {
            String[] line = str.split(" ");
            String theClass = line[line.length - 1];
            if (!classifications.contains(theClass)) {
                classifications.add(theClass);
                classifierMap.put(theClass, 1.0);
            } else {
                classifierMap.put(theClass, classifierMap.get(theClass) + 1.0);
            }
        }
        br.close();
        for (String classification : classifications) {
            tot += classifierMap.get(classification);
        }
        return classifications;
    }

    // class prior P(C) = count(C) / total records
    public double getClassProbability(String aClass) {
        return classifierMap.get(aClass) / tot;
    }

    // conditional probability P(attribute | C) = count(attribute, C) / count(C)
    public double getConditionalProbability(String attribute, String aClass) {
        double aClassTotal = classifierMap.get(aClass);
        try {
            return classifierMap.get(attribute + "," + aClass) / aClassTotal;
        } catch (NullPointerException e) {
            return 0;   // unseen attribute/class pair: probability 0 (no smoothing)
        }
    }
}
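
Note that getConditionalProbability returns 0 for any attribute/class pair never seen in training, and a single zero wipes out the whole posterior product. A standard remedy is Laplace (add-one) smoothing; a minimal sketch of a smoothed variant (the method name and the distinctValues parameter are illustrative additions, not part of the original class):

    // add-one smoothing: every (attribute, class) pair gets a pseudo-count of 1;
    // distinctValues is the number of distinct values this attribute can take
    public double getSmoothedConditionalProbability(String attribute, String aClass,
                                                    int distinctValues) {
        Double joint = classifierMap.get(attribute + "," + aClass);
        double count = (joint == null) ? 0.0 : joint;
        double classTotal = classifierMap.get(aClass);
        return (count + 1.0) / (classTotal + distinctValues);
    }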

Driver code
package com.deng.NaiveBayes;

import com.deng.util.FileUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class NaiveBayesDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // clear previous outputs so the jobs can be re-run
        FileUtil.deleteDirs("output");
        FileUtil.deleteDirs("output2");
        Configuration conf = new Configuration();
        String[] otherArgs = new String[]{"input/NaiveBayes.txt", "output", "output2"};

        // job 1: build the classifier by counting <attribute,class> pairs
        Job job = Job.getInstance(conf, "NaiveBayes");
        job.setJarByClass(NaiveBayesDriver.class);
        job.setMapperClass(trainMapper.class);
        job.setReducerClass(trainReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        // job 2: classify the prediction data, run only if job 1 succeeded
        if (job.waitForCompletion(true)) {
            Job classifierJob = Job.getInstance(conf, "NaiveBayes");
            classifierJob.setJarByClass(NaiveBayesDriver.class);
            classifierJob.setMapperClass(ClassifierMapper.class);
            classifierJob.setReducerClass(ClassifierReducer.class);
            classifierJob.setMapOutputKeyClass(Text.class);
            classifierJob.setMapOutputValueClass(IntWritable.class);
            classifierJob.setOutputKeyClass(Text.class);
            classifierJob.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(classifierJob, new Path("input/NaiveBayesPre.txt"));
            FileOutputFormat.setOutputPath(classifierJob, new Path(otherArgs[2]));
            System.exit(classifierJob.waitForCompletion(true) ? 0 : 1);
        }
    }
}
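
The helper com.deng.util.FileUtil is the author's own utility and is not shown in the post. A minimal sketch of a compatible deleteDirs, assuming it just removes a local output directory recursively (a guess, not the original implementation):

package com.deng.util;

import java.io.File;

public class FileUtil {
    // recursively delete a local directory and its contents, if it exists
    public static void deleteDirs(String path) {
        File dir = new File(path);
        if (!dir.exists()) return;
        File[] children = dir.listFiles();
        if (children != null) {
            for (File child : children) {
                if (child.isDirectory()) {
                    deleteDirs(child.getPath());
                } else {
                    child.delete();
                }
            }
        }
        dir.delete();
    }
}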

The run results are as follows (shown as a screenshot in the original post): each distinct prediction record, followed by the selected class and its unnormalized posterior, written to output2/part-r-00000.
