Requirement: join the order table with the product table and display the merged result.
orders.txt (order data):
Order_0000001,pd001,222.8
Order_0000001,pd005,25.8
Order_0000002,pd005,325.8
Order_0000002,pd003,522.8
Order_0000002,pd004,122.4
Order_0000003,pd001,222.8
Order_0000003,pd001,322.8
products.txt (product data):
pd001,apple
pd002,banana
pd003,orange
pd004,xiaomi
pd005,meizu
Approach: the join can be implemented in two ways.
1: Merge the data on the Reduce side
Records are sent to Reduce tasks keyed by product, so all orders for one product are merged in the same reducer. In practice, hot-selling products carry a much larger share of the order records, which leads to data skew (a minimal reduce-side sketch follows this list).
2: Merge on the Mapper side
This relies on the MapReduce distributed cache: the small product table is added to the cache so that every map task can load it into memory and perform the join locally.
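For reference, a reduce-side join might look roughly like the sketch below. The class name ReduceJoinReducer and the "P"/"O" source tags are hypothetical and not part of this post's example; the idea is that the mapper keys every record by product id and tags it with its source table, so all order records for a hot product arrive at a single reduce call, which is exactly where the skew comes from.
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ReduceJoinReducer extends Reducer<Text, Text, Text, NullWritable> {
    @Override
    protected void reduce(Text productId, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        String productName = null;
        List<String> orders = new ArrayList<String>();
        for (Text v : values) {
            // Values were tagged in the mapper: "P,<productName>" for product rows,
            // "O,<orderId>,<amount>" for order rows.
            String[] fields = v.toString().split(",", 2);
            if ("P".equals(fields[0])) {
                productName = fields[1];   // product row: remember the product name
            } else {
                orders.add(fields[1]);     // order row: buffer until the name is known
            }
        }
        // Emit one joined line per order record of this product; a hot product
        // produces a very large "orders" list in one reducer, hence the skew.
        for (String order : orders) {
            context.write(new Text(productId + "\t" + productName + "\t" + order), NullWritable.get());
        }
    }
}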
The following code shows how the map-side join is implemented.
package com.mapreduce.joinCacheFile;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
/**
 * MapReduce map-side join using a distributed cache file
 */
public class MapJoinDistributedCacheFile {

    public static class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        FileReader in = null;
        BufferedReader reader = null;
        // Product table held in memory: product id -> {product id, product name}
        HashMap<String, String[]> b_tab = new HashMap<String, String[]>();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Load the product table; the cached file is localized into the task's
            // working directory under its original name (pdts.txt)
            in = new FileReader("pdts.txt");
            reader = new BufferedReader(in);
            String line = null;
            while (StringUtils.isNotBlank((line = reader.readLine()))) {
                String[] split = line.split(",");
                String[] products = { split[0], split[1] };
                b_tab.put(split[0], products);
            }
            IOUtils.closeStream(reader);
            IOUtils.closeStream(in);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Each order line looks like: Order_0000001,pd001,222.8
            String line = value.toString();
            String[] orderFields = line.split(",");
            String pdt_id = orderFields[1];
            // Look up the product in the in-memory table and emit the joined record:
            // order id, product id, product name, amount
            String[] pdtFields = b_tab.get(pdt_id);
            String ll = orderFields[0] + "\t" + pdtFields[0] + "\t" + pdtFields[1] + "\t" + orderFields[2];
            context.write(new Text(ll), NullWritable.get());
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration con = new Configuration();
        Job job = Job.getInstance(con);
        job.setJarByClass(MapJoinDistributedCacheFile.class);
        job.setMapperClass(MapJoinMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("d://bigDataJob/mapJoin/orders"));
        FileOutputFormat.setOutputPath(job, new Path("d://bigDataJob/mapJoin/output"));
        // Map-only job: no aggregation is needed, so run with zero reducers
        job.setNumReduceTasks(0);
        // Ship the product table (the products data above, stored here as D:/pdts.txt)
        // to every map task via the distributed cache
        job.addCacheFile(new URI("file:/D:/pdts.txt"));
        job.waitForCompletion(true);
    }
}
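Assuming D:/pdts.txt contains the product data shown above, running this map-only job on the sample orders should produce tab-separated lines like the following in d://bigDataJob/mapJoin/output (part-m-00000):
Order_0000001    pd001    apple     222.8
Order_0000001    pd005    meizu     25.8
Order_0000002    pd005    meizu     325.8
Order_0000002    pd003    orange    522.8
Order_0000002    pd004    xiaomi    122.4
Order_0000003    pd001    apple     222.8
Order_0000003    pd001    apple     322.8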