需求2:Map端表合并(DistributedCache)
1)分析
适用于关联表中有小表的情形;
可以将小表分发到所有的map节点,这样,map节点就可以在本地对自己所读到的大表数据进行合并并输出最终结果,可以大大提高合并操作的并发度,加快处理速度。
Mapperjion类:
package com.itstar.mr.wc0908.mapperjoin;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.mortbay.util.StringUtil;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
/**
 * Map-side join mapper: loads the small product table (pid -> pname) into
 * memory once per task in {@link #setup}, then joins each order record of the
 * big table against it and emits "orderId \t pname \t amount" as the key with
 * a NullWritable value (map-only job, no reducer).
 */
public class Mapperjion extends Mapper<LongWritable, Text, Text, NullWritable> {

    /** In-memory copy of the small table: product id -> product name. */
    private final HashMap<String, String> hashMap = new HashMap<String, String>();

    /** Reused output key to avoid per-record allocation. */
    private final Text k = new Text();

    /**
     * Loads the small (product) table into {@link #hashMap} before any map calls.
     *
     * @throws IOException if the product file cannot be read
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // NOTE(review): hard-coded local path only works with the LocalJobRunner;
        // on a real cluster this should open the DistributedCache file by its
        // local symlink name (e.g. "pd.txt") — confirm before deploying.
        // try-with-resources fixes the original reader leak; UTF-8 is pinned so
        // the load does not depend on the platform default charset.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream("D:\\大数据\\08-19压缩\\pd.txt"),
                java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            // Read to EOF. The original stopped at the first EMPTY line
            // (StringUtils.isNotEmpty), silently truncating the table.
            while ((line = reader.readLine()) != null) {
                if (line.isEmpty()) {
                    continue; // skip blank lines instead of aborting the load
                }
                String[] fields = line.split("\t");
                if (fields.length >= 2) { // guard against malformed rows
                    hashMap.put(fields[0], fields[1]);
                }
            }
        }
    }

    /**
     * Joins one big-table (order) record with the cached product name.
     * Input line format: orderId \t pid \t amount.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 1. One line of the big (order) table.
        String line = value.toString();
        // 2. Split into orderId / pid / amount.
        String[] strings = line.split("\t");
        // 3. Product id from the order record.
        String pid = strings[1];
        // 4. Look up the product name; null if the pid is missing from the
        //    small table (emitted literally as "null", same as the original).
        String pname = hashMap.get(pid);
        // 5. Concatenate the joined record.
        k.set(strings[0] + "\t" + pname + "\t" + strings[2]);
        // 6. Emit; value carries no information in this map-only join.
        context.write(k, NullWritable.get());
    }
}
MapperJoinDrive类:
package com.itstar.mr.wc0908.mapperjoin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import javax.xml.soap.Text;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class MapperJoinDrive {
public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
args=new String[]{"D:\\大数据中高期\\08-19压缩\\order.txt","D:\\大数据中高期\\08-19压缩\\mapperjoinout","file:///D:/大数据中高期/08-19压缩/pd.txt"};
//实例化配置文件
Configuration conf = new Configuration();
//配置job信息
Job job = Job.getInstance(conf);
job.setJarByClass(MapperJoinDrive.class);
//配置mapper
job.setMapperClass(Mapperjion.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
//reduce的个数
job.setNumReduceTasks(0);
//输入路径写大表的路径
FileInputFormat.setInputPaths(job,new Path(args[0]));
//输出路径
FileOutputFormat.setOutputPath(job,new Path(args[1]));
//加载小表到缓存中
job.addCacheFile(new URI(args[2]));
//提交
job.waitForCompletion(true);
}
}
结果:打印成功
输出生成的文件: