Requirement: join the order table with the product table and display the merged result.
orders.txt (order data):
Order_0000001,pd001,222.8
Order_0000001,pd005,25.8
Order_0000002,pd005,325.8
Order_0000002,pd003,522.8
Order_0000002,pd004,122.4
Order_0000003,pd001,222.8
Order_0000003,pd001,322.8
products.txt (product data):
pd001,apple
pd002,banana
pd003,orange
pd004,xiaomi
pd005,meizu
Approach: the join can be implemented in two ways.
1: Merge the data on the Reduce side
Records are sent to Reduce tasks keyed by product, so all orders for one product are merged in the same reducer. In practice, hot-selling products carry a much larger share of the order records, which leads to data skew (a minimal reduce-side sketch follows this list).
2: Merge on the Mapper side
This relies on the MapReduce distributed cache: the small product table is added to the cache so that every map task can load it into memory and perform the join locally.
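For reference, a reduce-side join might look roughly like the sketch below. The class name ReduceJoinReducer and the "P"/"O" source tags are hypothetical and not part of this post's example; the idea is that the mapper keys every record by product id and tags it with its source table, so all order records for a hot product arrive at a single reduce call, which is exactly where the skew comes from.
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ReduceJoinReducer extends Reducer<Text, Text, Text, NullWritable> {
    @Override
    protected void reduce(Text productId, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        String productName = null;
        List<String> orders = new ArrayList<String>();
        for (Text v : values) {
            // Values were tagged in the mapper: "P,<productName>" for product rows,
            // "O,<orderId>,<amount>" for order rows.
            String[] fields = v.toString().split(",", 2);
            if ("P".equals(fields[0])) {
                productName = fields[1];   // product row: remember the product name
            } else {
                orders.add(fields[1]);     // order row: buffer until the name is known
            }
        }
        // Emit one joined line per order record of this product; a hot product
        // produces a very large "orders" list in one reducer, hence the skew.
        for (String order : orders) {
            context.write(new Text(productId + "\t" + productName + "\t" + order), NullWritable.get());
        }
    }
}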
The following code shows how the map-side join is implemented.
package com.mapreduce.joinCacheFile;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
/**
 * MapReduce map-side join using a distributed cache file
 */
public class MapJoinDistributedCacheFile {

    public static class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        FileReader in = null;
        BufferedReader reader = null;
        // Product table held in memory: product id -> {product id, product name}
        HashMap<String, String[]> b_tab = new HashMap<String, String[]>();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Load the product table; the cached file is localized into the task's
            // working directory under its original name (pdts.txt)
            in = new FileReader("pdts.txt");
            reader = new BufferedReader(in);
            String line = null;
            while (StringUtils.isNotBlank((line = reader.readLine()))) {
                String[] split = line.split(",");
                String[] products = { split[0], split[1] };
                b_tab.put(split[0], products);
            }
            IOUtils.closeStream(reader);
            IOUtils.closeStream(in);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Each order line looks like: Order_0000001,pd001,222.8
            String line = value.toString();
            String[] orderFields = line.split(",");
            String pdt_id = orderFields[1];
            // Look up the product in the in-memory table and emit the joined record:
            // order id, product id, product name, amount
            String[] pdtFields = b_tab.get(pdt_id);
            String ll = orderFields[0] + "\t" + pdtFields[0] + "\t" + pdtFields[1] + "\t" + orderFields[2];
            context.write(new Text(ll), NullWritable.get());
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration con = new Configuration();
        Job job = Job.getInstance(con);
        job.setJarByClass(MapJoinDistributedCacheFile.class);
        job.setMapperClass(MapJoinMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("d://bigDataJob/mapJoin/orders"));
        FileOutputFormat.setOutputPath(job, new Path("d://bigDataJob/mapJoin/output"));
        // Map-only job: no aggregation is needed, so run with zero reducers
        job.setNumReduceTasks(0);
        // Ship the product table (the products data above, stored here as D:/pdts.txt)
        // to every map task via the distributed cache
        job.addCacheFile(new URI("file:/D:/pdts.txt"));
        job.waitForCompletion(true);
    }
}
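Assuming D:/pdts.txt contains the product data shown above, running this map-only job on the sample orders should produce tab-separated lines like the following in d://bigDataJob/mapJoin/output (part-m-00000):
Order_0000001    pd001    apple     222.8
Order_0000001    pd005    meizu     25.8
Order_0000002    pd005    meizu     325.8
Order_0000002    pd003    orange    522.8
Order_0000002    pd004    xiaomi    122.4
Order_0000003    pd001    apple     222.8
Order_0000003    pd001    apple     322.8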