(三) MapReduce Joins and Custom Key Grouping Rules

(一) MapReduce join operations

The goal of a MapReduce join is to combine the contents of two files into a single output file according to a join condition.

There are two ways to perform a join in MapReduce:

    1. Map-side join

        No data needs to be sent to the reduce side.

        Suitable when one of the tables is small enough to fit in memory.

        Steps:  i    Load the smaller file into memory (a Map collection).

                ii   Read the larger file, extract the join field, and use it to look up the value in the Map.

                iii  Perform the join.

    2. Reduce-side join

        Relies on the fact that records with the same key are grouped together, so the join condition is used as the key.

        Build an entity that holds all fields of both tables, and use an isXXX flag to mark which table a record came from (see the ProductAndOrder sketch after the reduce-side code below).

        In the reducer, read the values, check the flag, and merge them.

Map-side join

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

/*
Map-side join: cache the smaller file (phoneinfo1.txt) in memory and append the
matching value to each line of userinfo.txt, joining on the third field.
 */
public class ForMapJoin {
    public static class ForMapper extends Mapper<LongWritable,Text,Text,NullWritable>{
        private Map<String,String> cacheMap=new HashMap<String,String>();
        private Text oKey=new Text();
        private NullWritable oValue=NullWritable.get();
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            //get the URI of the cached file
            URI uri=context.getCacheFiles()[0];
            //open the cached file and load it into memory line by line
            File file=new File(uri);
            FileReader fr=new FileReader(file);
            BufferedReader br=new BufferedReader(fr);
            String temp;
            while((temp=br.readLine())!=null){
                String []strs=temp.split("\t");
                cacheMap.put(strs[0],strs[1]);
            }
            br.close();
            for(Map.Entry<String,String> entry:cacheMap.entrySet()){
                System.err.println(entry.getKey()+"\t"+entry.getValue());
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String []strs=value.toString().split("\t");
            //debug: report join keys that have no match in the cached file
            if(cacheMap.get(strs[2])==null){
                System.out.println(strs[2]);
            }
            //join: append the cached value for the join field (strs[2]) to the line
            String phoneInfo=value.toString()+"\t"+cacheMap.get(strs[2]);
            oKey.set(phoneInfo);
            context.write(oKey,oValue);
        }
    }
    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Job job=Job.getInstance();
        job.setMapperClass(ForMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        //map-only job: nothing needs to be sent to the reduce side
        job.setNumReduceTasks(0);

        FileSystem fileSystem=FileSystem.get(new URI("file:///E:/output"),new Configuration());
        Path path=new Path("E:/output");
        if(fileSystem.exists(path)){
            fileSystem.delete(path,true);
        }

        //register the smaller file to be loaded into memory (distributed cache)
        job.addCacheFile(new URI("file:///E:/forTestData/jionData/phoneinfo1.txt"));
        //or: job.setCacheFiles(URI[] files);

        FileInputFormat.addInputPath(job,new Path("E:\\forTestData\\jionData\\map\\userinfo.txt"));
        FileOutputFormat.setOutputPath(job,path);

        job.waitForCompletion(true);
    }
}


Reduce-side join

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;

/*
Join the product table and the order table on the product id and write the result.
 */
public class ForReducerJoin {
    public static class ForMapper extends Mapper<LongWritable,Text,Text,ProductAndOrder>{
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            Text oKey=new Text();
            ProductAndOrder oValue=new ProductAndOrder();
            String line=value.toString();
            String []strs=line.split("\t");
            if(strs.length<4) return;
            //get the current input split
            FileSplit fileSplit= (FileSplit) context.getInputSplit();
            //get the name of the file this record comes from
            String fileName=fileSplit.getPath().getName();

            if("product.txt".equals(fileName)){
                oValue.setProductId(strs[0]);
                oValue.setProductName(strs[1]);
                oValue.setProductPrice(Integer.parseInt(strs[3]));
                oValue.setProductType(strs[2]);
                oValue.setPruduct(true);
                oKey.set(strs[0]);
            }else if("order.txt".equals(fileName)){
                oValue.setOrderDate(strs[1]);
                oValue.setOrderId(strs[0]);
                oValue.setOrderNum(Integer.parseInt(strs[2]));
                oValue.setPruduct(false);
                oKey.set(strs[3]);
            }
            context.write(oKey,oValue);
        }
    }
    public static class ForReducer extends Reducer<Text,ProductAndOrder,Text,NullWritable>{
        @Override
        protected void reduce(Text key, Iterable<ProductAndOrder> values, Context context) throws IOException, InterruptedException {
            List<ProductAndOrder> orders= new ArrayList<ProductAndOrder>();
            ProductAndOrder product=new ProductAndOrder();
            //separate the single product record from the order records for this key
            for(ProductAndOrder entity:values){
                if(entity.isPruduct()){
                    product.setProductId(entity.getProductId());
                    product.setProductName(entity.getProductName());
                    product.setProductType(entity.getProductType());
                    product.setProductPrice(entity.getProductPrice());
                }else{
                    ProductAndOrder order=new ProductAndOrder();
                    order.setOrderId(entity.getOrderId());
                    order.setOrderDate(entity.getOrderDate());
                    order.setOrderNum(entity.getOrderNum());
                    orders.add(order);
                }
            }
            //attach the product fields to every order and emit the joined record
            for(ProductAndOrder order:orders){
                order.setProductId(product.getProductId());
                order.setProductName(product.getProductName());
                order.setProductType(product.getProductType());
                order.setProductPrice(product.getProductPrice());
                String info=order.toString();
                context.write(new Text(info),NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Job job= Job.getInstance();
        job.setMapperClass(ForMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ProductAndOrder.class);

        job.setReducerClass(ForReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileSystem fileSystem=FileSystem.get(new URI("file:///E:/output"),new Configuration());
        Path path=new Path("E:/output");
        if(fileSystem.exists(path)){
            fileSystem.delete(path,true);
        }
        FileInputFormat.addInputPath(job,new Path("E:\\forTestData\\jionData\\reduce"));
        FileOutputFormat.setOutputPath(job,path);
        job.waitForCompletion(true);

    }
}
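
The code above relies on a ProductAndOrder entity that is not shown in this post. Below is a minimal sketch inferred from the getter/setter calls above (the accessor names, including the original "Pruduct" spelling, follow that usage); it implements Writable so it can be used as the map output value. String fields default to "" so write() never passes null to writeUTF, and the field order in toString is an assumption.

import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class ProductAndOrder implements Writable {
    //fields of the product table
    private String productId="";
    private String productName="";
    private String productType="";
    private int productPrice;
    //fields of the order table
    private String orderId="";
    private String orderDate="";
    private int orderNum;
    //true when the record comes from product.txt, false when it comes from order.txt
    private boolean isPruduct;

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(productId);
        out.writeUTF(productName);
        out.writeUTF(productType);
        out.writeInt(productPrice);
        out.writeUTF(orderId);
        out.writeUTF(orderDate);
        out.writeInt(orderNum);
        out.writeBoolean(isPruduct);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        productId=in.readUTF();
        productName=in.readUTF();
        productType=in.readUTF();
        productPrice=in.readInt();
        orderId=in.readUTF();
        orderDate=in.readUTF();
        orderNum=in.readInt();
        isPruduct=in.readBoolean();
    }

    public boolean isPruduct(){ return isPruduct; }
    public void setPruduct(boolean pruduct){ this.isPruduct=pruduct; }
    public String getProductId(){ return productId; }
    public void setProductId(String productId){ this.productId=productId; }
    public String getProductName(){ return productName; }
    public void setProductName(String productName){ this.productName=productName; }
    public String getProductType(){ return productType; }
    public void setProductType(String productType){ this.productType=productType; }
    public int getProductPrice(){ return productPrice; }
    public void setProductPrice(int productPrice){ this.productPrice=productPrice; }
    public String getOrderId(){ return orderId; }
    public void setOrderId(String orderId){ this.orderId=orderId; }
    public String getOrderDate(){ return orderDate; }
    public void setOrderDate(String orderDate){ this.orderDate=orderDate; }
    public int getOrderNum(){ return orderNum; }
    public void setOrderNum(int orderNum){ this.orderNum=orderNum; }

    //tab-separated joined record: order fields followed by product fields
    @Override
    public String toString() {
        return orderId+"\t"+orderDate+"\t"+orderNum+"\t"
                +productId+"\t"+productName+"\t"+productType+"\t"+productPrice;
    }
}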

(二) Custom grouping rules for keys

        Method: write a class that extends WritableComparator and override its compare method.

                  The class must also have a no-argument constructor that calls super(KeyClass.class, true); otherwise a NullPointerException may occur, because the comparator needs to create key instances for comparison.

                  Then register the rule with job.setGroupingComparatorClass(), as in the sketch below.
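
A minimal sketch, assuming the map output key is a Text whose part before the first tab is the grouping field (ForGroupingComparator is a hypothetical class name, not one used elsewhere in this post):

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class ForGroupingComparator extends WritableComparator {
    //the no-arg constructor must call super(keyClass,true) so that
    //WritableComparator creates key instances; without it compare()
    //may throw a NullPointerException
    public ForGroupingComparator() {
        super(Text.class,true);
    }

    //two keys belong to the same reduce group when the part before
    //the first tab compares equal
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        String left=a.toString().split("\t")[0];
        String right=b.toString().split("\t")[0];
        return left.compareTo(right);
    }
}

In the driver, register it with:

        job.setGroupingComparatorClass(ForGroupingComparator.class);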

