(一) Join operations in MapReduce
The goal of a MapReduce join is to combine the contents of two files into one file, matching records according to a join condition.
There are two ways to perform a join in MapReduce:
1. Map-side join
No data needs to be sent to the reduce side.
Suitable when one of the two tables is small.
Steps: i. Read the smaller file into memory (a Map collection)
ii. Read the larger file, extract the join field, and use it to look up the corresponding value in the Map collection
iii. Perform the join
2. Reduce-side join
Take advantage of the fact that records with the same key are grouped together: use the join condition as the key.
Build an entity class that contains the fields of both tables, then use an isXXX flag to tell which table a record came from (a sketch of such a class is given after the reduce-side code below).
On the reduce side, read the values, check the flag, and merge.
Map-side join
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
/*
Map-side join: the smaller phoneinfo table is loaded into memory in setup(),
then each userinfo record is joined against it in map().
*/
public class ForMapJoin {
    public static class ForMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        private Map<String, String> cacheMap = new HashMap<String, String>();
        private Text oKey = new Text();
        private NullWritable oValue = NullWritable.get();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Get the URI of the cached file
            URI uri = context.getCacheFiles()[0];
            // Open the cached (smaller) file and load it into the in-memory map
            File file = new File(uri);
            FileReader fr = new FileReader(file);
            BufferedReader br = new BufferedReader(fr);
            String temp;
            while ((temp = br.readLine()) != null) {
                String[] strs = temp.split("\t");
                cacheMap.put(strs[0], strs[1]);
            }
            br.close();
            // Debug output: dump the cached table
            for (Map.Entry<String, String> entry : cacheMap.entrySet()) {
                System.err.println(entry.getKey() + "\t" + entry.getValue());
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] strs = value.toString().split("\t");
            // Log join keys that have no match in the cached table
            if (cacheMap.get(strs[2]) == null) {
                System.out.println(strs[2]);
            }
            // Join: append the looked-up value to the current record
            String phoneInfo = value.toString() + "\t" + cacheMap.get(strs[2]);
            oKey.set(phoneInfo);
            context.write(oKey, oValue);
        }
    }

    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance();
        job.setMapperClass(ForMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Delete the output directory if it already exists
        FileSystem fileSystem = FileSystem.get(new URI("file:E://output"), new Configuration());
        Path path = new Path("E://output");
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        // Register the file to be loaded into memory (the distributed cache)
        job.addCacheFile(new URI("file:///E:/forTestData/jionData/phoneinfo1.txt"));
        // Or: job.setCacheFiles(URI[] files);
        FileInputFormat.addInputPath(job, new Path("E:\\forTestData\\jionData\\map\\userinfo.txt"));
        FileOutputFormat.setOutputPath(job, path);
        job.waitForCompletion(true);
    }
}
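Because the map-side join emits the joined record straight from the mapper, the job above does not actually need a reduce phase; calling job.setNumReduceTasks(0) in the driver (not present in the original code) would write the map output directly and skip the shuffle.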
Reduce-side join
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
/*
Joins the product table and the order table on product id and writes the result.
*/
public class ForReducerJoin {
    public static class ForMapper extends Mapper<LongWritable, Text, Text, ProductAndOrder> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            Text oKey = new Text();
            ProductAndOrder oValue = new ProductAndOrder();
            String line = value.toString();
            String[] strs = line.split("\t");
            if (strs.length < 4) return;
            // Get the current input split
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            // Get the name of the file currently being read
            String fileName = fileSplit.getPath().getName();
            if ("product.txt".equals(fileName)) {
                oValue.setProductId(strs[0]);
                oValue.setProductName(strs[1]);
                oValue.setProductPrice(Integer.parseInt(strs[3]));
                oValue.setProductType(strs[2]);
                oValue.setPruduct(true);
                oKey.set(strs[0]);
            } else if ("order.txt".equals(fileName)) {
                oValue.setOrderDate(strs[1]);
                oValue.setOrderId(strs[0]);
                oValue.setOrderNum(Integer.parseInt(strs[2]));
                oValue.setPruduct(false);
                oKey.set(strs[3]);
            }
            // The key is the product id, so matching product and order records meet in the same reduce call
            context.write(oKey, oValue);
        }
    }

    public static class ForReducer extends Reducer<Text, ProductAndOrder, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<ProductAndOrder> values, Context context) throws IOException, InterruptedException {
            List<ProductAndOrder> orders = new ArrayList<ProductAndOrder>();
            ProductAndOrder product = new ProductAndOrder();
            for (ProductAndOrder entity : values) {
                if (entity.isPruduct()) {
                    // Copy the product fields; Hadoop reuses the value object while iterating,
                    // so we must not keep a reference to it
                    product.setProductId(entity.getProductId());
                    product.setProductName(entity.getProductName());
                    product.setProductType(entity.getProductType());
                    product.setProductPrice(entity.getProductPrice());
                } else {
                    ProductAndOrder order = new ProductAndOrder();
                    order.setOrderId(entity.getOrderId());
                    order.setOrderDate(entity.getOrderDate());
                    order.setOrderNum(entity.getOrderNum());
                    orders.add(order);
                }
            }
            // Attach the product fields to every order with this product id and emit the joined record
            for (ProductAndOrder order : orders) {
                order.setProductId(product.getProductId());
                order.setProductName(product.getProductName());
                order.setProductType(product.getProductType());
                order.setProductPrice(product.getProductPrice());
                String info = order.toString();
                context.write(new Text(info), NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance();
        job.setMapperClass(ForMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ProductAndOrder.class);
        job.setReducerClass(ForReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Delete the output directory if it already exists
        FileSystem fileSystem = FileSystem.get(new URI("file:E://output"), new Configuration());
        Path path = new Path("E://output");
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        FileInputFormat.addInputPath(job, new Path("E:\\forTestData\\jionData\\reduce"));
        FileOutputFormat.setOutputPath(job, path);
        job.waitForCompletion(true);
    }
}
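The ProductAndOrder entity class used above is not included in the original code. The sketch below is a minimal reconstruction: the field names and accessors are taken from the calls made in ForReducerJoin, while the Writable serialization order and the toString() layout are assumptions.

import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class ProductAndOrder implements Writable {
    // product.txt fields
    private String productId = "";
    private String productName = "";
    private String productType = "";
    private int productPrice;
    // order.txt fields
    private String orderId = "";
    private String orderDate = "";
    private int orderNum;
    // true if the record came from product.txt, false if from order.txt
    private boolean pruduct;
    @Override
    public void write(DataOutput out) throws IOException {
        // Serialization order is an assumption; it only has to match readFields()
        out.writeUTF(productId);
        out.writeUTF(productName);
        out.writeUTF(productType);
        out.writeInt(productPrice);
        out.writeUTF(orderId);
        out.writeUTF(orderDate);
        out.writeInt(orderNum);
        out.writeBoolean(pruduct);
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        productId = in.readUTF();
        productName = in.readUTF();
        productType = in.readUTF();
        productPrice = in.readInt();
        orderId = in.readUTF();
        orderDate = in.readUTF();
        orderNum = in.readInt();
        pruduct = in.readBoolean();
    }
    public boolean isPruduct() { return pruduct; }
    public void setPruduct(boolean pruduct) { this.pruduct = pruduct; }
    public String getProductId() { return productId; }
    public void setProductId(String productId) { this.productId = productId; }
    public String getProductName() { return productName; }
    public void setProductName(String productName) { this.productName = productName; }
    public String getProductType() { return productType; }
    public void setProductType(String productType) { this.productType = productType; }
    public int getProductPrice() { return productPrice; }
    public void setProductPrice(int productPrice) { this.productPrice = productPrice; }
    public String getOrderId() { return orderId; }
    public void setOrderId(String orderId) { this.orderId = orderId; }
    public String getOrderDate() { return orderDate; }
    public void setOrderDate(String orderDate) { this.orderDate = orderDate; }
    public int getOrderNum() { return orderNum; }
    public void setOrderNum(int orderNum) { this.orderNum = orderNum; }
    @Override
    public String toString() {
        // Assumed output layout: order fields followed by the joined product fields
        return orderId + "\t" + orderDate + "\t" + orderNum + "\t"
                + productId + "\t" + productName + "\t" + productType + "\t" + productPrice;
    }
}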
(二) Custom grouping rules for keys
Approach: write a class that extends WritableComparator and override its compare method.
It must also define a no-argument constructor that calls super(KeyType.class, true); otherwise a NullPointerException may be thrown.
Then register the custom grouping rule with job.setGroupingComparatorClass(), as sketched below.
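A minimal sketch, assuming a hypothetical composite key class MyKey (a WritableComparable with a getGroupField() accessor; both names are illustrative, not from the original):

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class MyGroupingComparator extends WritableComparator {
    // No-argument constructor: pass the key class and true so key instances
    // are created; otherwise compare() may see nulls and throw a NullPointerException.
    public MyGroupingComparator() {
        super(MyKey.class, true);
    }
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MyKey k1 = (MyKey) a;
        MyKey k2 = (MyKey) b;
        // Records whose group field compares equal are sent to the same reduce() call
        return k1.getGroupField().compareTo(k2.getGroupField());
    }
}

It is then registered in the driver with job.setGroupingComparatorClass(MyGroupingComparator.class).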