需求2:Map端表合并(DistributedCache)
1)分析
适用于关联表中有小表的情形;
可以将小表分发到所有的map节点,这样,map节点就可以在本地对自己所读到的大表数据进行合并并输出最终结果,可以大大提高合并操作的并发度,加快处理速度。
Mapperjion类:
package com.itstar.mr.wc0908.mapperjoin;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.mortbay.util.StringUtil;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
/**
 * Map-side join mapper: loads the small product table (pid -> pname) into
 * memory once per task in {@link #setup}, then joins each order record of the
 * big table against it and emits "orderId \t pname \t amount" as the key with
 * a NullWritable value (map-only job, no reducer).
 */
public class Mapperjion extends Mapper<LongWritable, Text, Text, NullWritable> {

    /** In-memory copy of the small table: product id -> product name. */
    private final HashMap<String, String> hashMap = new HashMap<String, String>();

    /** Reused output key to avoid per-record allocation. */
    private final Text k = new Text();

    /**
     * Loads the small (product) table into {@link #hashMap} before any map calls.
     *
     * @throws IOException if the product file cannot be read
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // NOTE(review): hard-coded local path only works with the LocalJobRunner;
        // on a real cluster this should open the DistributedCache file by its
        // local symlink name (e.g. "pd.txt") — confirm before deploying.
        // try-with-resources fixes the original reader leak; UTF-8 is pinned so
        // the load does not depend on the platform default charset.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream("D:\\大数据\\08-19压缩\\pd.txt"),
                java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            // Read to EOF. The original stopped at the first EMPTY line
            // (StringUtils.isNotEmpty), silently truncating the table.
            while ((line = reader.readLine()) != null) {
                if (line.isEmpty()) {
                    continue; // skip blank lines instead of aborting the load
                }
                String[] fields = line.split("\t");
                if (fields.length >= 2) { // guard against malformed rows
                    hashMap.put(fields[0], fields[1]);
                }
            }
        }
    }

    /**
     * Joins one big-table (order) record with the cached product name.
     * Input line format: orderId \t pid \t amount.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 1. One line of the big (order) table.
        String line = value.toString();
        // 2. Split into orderId / pid / amount.
        String[] strings = line.split("\t");
        // 3. Product id from the order record.
        String pid = strings[1];
        // 4. Look up the product name; null if the pid is missing from the
        //    small table (emitted literally as "null", same as the original).
        String pname = hashMap.get(pid);
        // 5. Concatenate the joined record.
        k.set(strings[0] + "\t" + pname + "\t" + strings[2]);
        // 6. Emit; value carries no information in this map-only join.
        context.write(k, NullWritable.get());
    }
}
MapperJoinDrive类:
package com.itstar.mr.wc0908.mapperjoin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import javax.xml.soap.Text;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class MapperJoinDrive {
public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
args=new String[]{"D:\\大数据中高期\\08-19压缩\\order.txt","D:\\大数据中高期\\08-19压缩\\mapperjoinout","file:///D:/大数据中高期/08-19压缩/pd.txt"};
//实例化配置文件
Configuration conf = new Configuration();
//配置job信息
Job job = Job.getInstance(conf);
job.setJarByClass(MapperJoinDrive.class);
//配置mapper
job.setMapperClass(Mapperjion.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
//reduce的个数
job.setNumReduceTasks(0);
//输入路径写大表的路径
FileInputFormat.setInputPaths(job,new Path(args[0]));
//输出路径
FileOutputFormat.setOutputPath(job,new Path(args[1]));
//加载小表到缓存中
job.addCacheFile(new URI(args[2]));
//提交
job.waitForCompletion(true);
}
}
结果:打印成功
输出生成的文件: