MapReduce Case Studies: MapJoin and ReduceJoin

MapJoin Case Study

1. Use Case

A map join suits scenarios where one table is very small and the other is very large.

2. Advantages

Question: joining multiple tables on the Reduce side easily produces data skew. What can be done about it?
Cache the small tables on the Map side and apply the join logic there in advance. This shifts work to the Map side, reduces the pressure on the Reduce side, and minimizes data skew.

3. Concrete Approach: DistributedCache
(1) In the Mapper's setup phase, read the cached file into an in-memory collection.
(2) In the driver class, register the file with the cache.

// Cache a regular file so it is shipped to each task's node.
job.addCacheFile(new URI("file:///e:/cache/pd.txt"));

// When running on a cluster, use an HDFS path instead.
job.addCacheFile(new URI("hdfs://hadoop102:9820/cache/pd.txt"));
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private HashMap<String, String> pidMap = new HashMap<>();
    private Text outK = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the cache file registered in the driver.
        URI[] cacheFiles = context.getCacheFiles();
        FileSystem fileSystem = FileSystem.get(context.getConfiguration());
        FSDataInputStream open = fileSystem.open(new Path(cacheFiles[0]));
        // Wrap in a character stream and read line by line.
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(open, "UTF-8"));
        String line;
        // Read until EOF, skipping blank lines.
        while ((line = bufferedReader.readLine()) != null) {
            if (line.isEmpty()) {
                continue;
            }
            String[] split = line.split("\t");
            pidMap.put(split[0], split[1]); // pid -> pname
        }
        bufferedReader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Custom counter tracking how many lines were processed.
        context.getCounter("line", "linenumber").increment(1);
        String line = value.toString();
        String[] split = line.split("\t");
        // order line: id \t pid \t amount  ->  output: id \t pname \t amount
        outK.set(split[0] + "\t" + pidMap.get(split[1]) + "\t" + split[2]);
        context.write(outK, NullWritable.get());
    }
}
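With the sample files above, the map output (and, since this is a map-only job, the final output) would be one id \t pname \t amount line per order:

1001	Xiaomi	1
1002	Huawei	2
1003	Lenovo	3

The driver wires the job as a map-only job and registers the cache file: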
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MapJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(MapJoinDriver.class);

        job.setMapperClass(MapJoinMapper.class);

        // In a map-only job the output classes below also cover the map output,
        // so these two calls are redundant:
//        job.setMapOutputKeyClass(Text.class);
//        job.setMapOutputValueClass(NullWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        job.setNumReduceTasks(0);   // Map-only job: no reduce phase.
        job.addCacheFile(new URI(args[2]));

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean b = job.waitForCompletion(true);
        System.out.println(b ? "Job succeeded!" : "Job failed!");
    }
}
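A cluster launch might look like the following; the jar and package names are hypothetical, and the three arguments land in args[0] (input), args[1] (output), and args[2] (cache file URI):

hadoop jar mapjoin.jar com.example.MapJoinDriver /input/order /output/mapjoin hdfs://hadoop102:9820/cache/pd.txt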
ReduceJoin Case Study

The idea: the mapper tags each record with the table it came from and emits pid as the key, so the shuffle delivers all records for a given pid to the same reduce call; the reducer then fills each order record with the matching product name.


import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ReduceJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(ReduceJoinDriver.class);

        job.setMapperClass(ReduceJoinMapper.class);
        job.setReducerClass(ReduceJoinReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);

        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        // Local paths for testing; on a cluster, use HDFS paths or CLI arguments.
        FileInputFormat.setInputPaths(job, new Path("I:/input/inputtable2/"));
        FileOutputFormat.setOutputPath(job, new Path("I:/output/reduceOutput55/"));

        boolean b = job.waitForCompletion(true);
        System.out.println(b ? "Job succeeded!" : "Job failed!");
    }
}
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class ReduceJoinMapper extends Mapper<LongWritable, Text, Text, TableBean> {

    private String filename;
    private Text outK = new Text();
    private TableBean outV = new TableBean();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the input split so we know which file this mapper is reading.
        InputSplit inputSplit = context.getInputSplit();
        FileSplit fileSplit = (FileSplit) inputSplit;
        filename = fileSplit.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Emit pid as the key and a TableBean tagged with its source table as the value.
        String line = value.toString();
        String[] split = line.split("\t");
        if (filename.contains("order")) {
            // order line: id \t pid \t amount
            outK.set(split[1]);
            outV.setId(split[0]);
            outV.setPid(split[1]);
            outV.setPname("");
            outV.setAmount(Integer.parseInt(split[2]));
            outV.setFlag("order");
        } else {
            // pd line: pid \t pname
            outK.set(split[0]);
            outV.setId("");
            outV.setPid(split[0]);
            outV.setPname(split[1]);
            outV.setAmount(0);
            outV.setFlag("pd");
        }
        context.write(outK, outV);
    }
}
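Assuming the same hypothetical pd.txt and order.txt as in the MapJoin example, the shuffle groups everything by pid, so for key 01 the reducer receives both a pd-flagged bean (pname Xiaomi) and an order-flagged bean (id 1001, amount 1); the flag field is how the reducer tells the two sources apart. The reducer below then performs the actual join: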
import java.io.IOException;
import java.util.ArrayList;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ReduceJoinReduce extends Reducer<Text, TableBean, TableBean, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {
        ArrayList<TableBean> orderBeans = new ArrayList<>(); // order records for this pid
        TableBean pdBean = new TableBean();                  // the matching pd record

        // The values Iterable is special in Hadoop: to cut memory overhead it
        // reuses a single object, refilling it with each record's data as you iterate.
        for (TableBean value : values) {
            if ("order".equals(value.getFlag())) {
                // So to keep a record beyond the current iteration, it must be
                // copied into a fresh TableBean before being added to the list.
                TableBean tmp = new TableBean();
                try {
                    BeanUtils.copyProperties(tmp, value);
                } catch (Exception e) {
                    e.printStackTrace();
                }
                orderBeans.add(tmp);
            } else {
                try {
                    BeanUtils.copyProperties(pdBean, value);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        // Fill in the product name and emit one joined record per order.
        for (TableBean orderBean : orderBeans) {
            orderBean.setPname(pdBean.getPname());
            context.write(orderBean, NullWritable.get());
        }
    }
}
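Because of this object reuse, every order record has to be deep-copied before it is stored. If pulling in commons-beanutils just for this is undesirable, a plain field-by-field copy does the same job; a minimal sketch (copyOf is a helper name introduced here, using the TableBean getters and setters defined below):

private static TableBean copyOf(TableBean src) {
    // Copy every field into a fresh instance so the reused value object
    // can be refilled by the iterator without clobbering stored records.
    TableBean dst = new TableBean();
    dst.setId(src.getId());
    dst.setPid(src.getPid());
    dst.setPname(src.getPname());
    dst.setAmount(src.getAmount());
    dst.setFlag(src.getFlag());
    return dst;
}

With it, orderBeans.add(copyOf(value)) replaces the BeanUtils call and the try/catch disappears. The TableBean used throughout is defined as follows: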
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class TableBean implements Writable {

    private String id;
    private String pid;
    private String pname;
    private int amount;
    private String flag;

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getPid() {
        return pid;
    }

    public void setPid(String pid) {
        this.pid = pid;
    }

    public String getPname() {
        return pname;
    }

    public void setPname(String pname) {
        this.pname = pname;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }


    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    @Override
    public String toString() {
        return id + "\t" + pname + "\t" + amount;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.getId());
        out.writeUTF(this.getPid());
        out.writeInt(this.getAmount());
        out.writeUTF(this.getPname());
        out.writeUTF(this.getFlag());
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.id = in.readUTF();
        this.pid = in.readUTF();
        this.amount = in.readInt();
        // A field serialized with writeInt() must be deserialized with readInt();
        // mismatched order or types here cause an EOFException.
        this.pname = in.readUTF();
        this.flag = in.readUTF();
    }
}
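
To sanity-check that write() and readFields() stay in sync, the bean can be round-tripped through an in-memory stream; a minimal sketch assuming the TableBean above (the sample values are illustrative):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class TableBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        TableBean in = new TableBean();
        in.setId("1001");
        in.setPid("01");
        in.setPname("Xiaomi");
        in.setAmount(1);
        in.setFlag("order");

        // Serialize with write() into an in-memory buffer.
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buf));

        // Deserialize with readFields() and print the result.
        TableBean out = new TableBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
        System.out.println(out); // expected: 1001	Xiaomi	1
    }
}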