MapReduce: MapJoin
Preface
There is a product file and an order file. Each order record carries a product id, and the product file maps each product id to the product's (Chinese) name.
Requirement: produce an output file with one record per product, containing the product name, the total quantity, and the total amount.
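To make the requirement concrete, the code below assumes both files are tab-separated: each order line holds order_id, product_id, quantity, unit_price, and each product line holds product_id, product_name. The sample rows below are purely illustrative (they are not the actual data from my run):

order.txt
1001	p01	2	10
1002	p02	3	20
1003	p01	1	10

products.txt
p01	Apple
p02	Banana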
I. Implementation Approach
Read the product file into an in-memory Map (product id -> product name); then, for every order record, look up the product name by the order's product id in that Map, and finish with a word-count-style aggregation of quantity and amount per product name.
II. Code
1. The MapJoinOrderBean class
package com.hadoop.mapreduce.mapJoin;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * @author LengQing
 * @date 2020/5/3 - 15:20
 * Bean holding the joined fields of an order record and a product record.
 */
public class MapJoinOrderBean implements WritableComparable<MapJoinOrderBean> {
    private String o_id;     // order table: order id
    private String p_id;     // order table / product table: product id
    private Integer o_price; // order table: unit price of the product
    private Integer o_num;   // order table: quantity ordered
    private String p_name;   // product table: product name

    public MapJoinOrderBean() {
        super();
    }

    public MapJoinOrderBean(String o_id, String p_id, Integer o_price, Integer o_num, String p_name) {
        this.o_id = o_id;
        this.p_id = p_id;
        this.o_price = o_price;
        this.o_num = o_num;
        this.p_name = p_name;
    }

    @Override
    public String toString() {
        return o_id + '\t' + p_name + '\t' + o_num + '\t' + o_price;
    }

    public String getO_id() {
        return o_id;
    }

    public void setO_id(String o_id) {
        this.o_id = o_id;
    }

    public String getP_id() {
        return p_id;
    }

    public void setP_id(String p_id) {
        this.p_id = p_id;
    }

    public Integer getO_price() {
        return o_price;
    }

    public void setO_price(Integer o_price) {
        this.o_price = o_price;
    }

    public Integer getO_num() {
        return o_num;
    }

    public void setO_num(Integer o_num) {
        this.o_num = o_num;
    }

    public String getP_name() {
        return p_name;
    }

    public void setP_name(String p_name) {
        this.p_name = p_name;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(o_id);
        out.writeUTF(p_id);
        out.writeInt(o_price);
        out.writeInt(o_num);
        out.writeUTF(p_name);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        o_id = in.readUTF();
        p_id = in.readUTF();
        o_price = in.readInt();
        o_num = in.readInt();
        p_name = in.readUTF();
    }

    @Override
    public int compareTo(MapJoinOrderBean o) {
        // The bean is only used as a map output value here, so no ordering is needed.
        return 0;
    }
}
2. The MapJoinMapper class
package com.hadoop.mapreduce.mapJoin;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
/**
* @author LengQing
* @date 2020/5/3 - 15:07
*/
public class MapJoinMapper extends Mapper<LongWritable, Text, Text, MapJoinOrderBean> {
    private Text outputKey = new Text();
    private MapJoinOrderBean orderBean = new MapJoinOrderBean();
    private Map<String, String> pdMap = new HashMap<>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Read the (small) product file from HDFS once per map task and cache it in memory
        String cacheFile = "hdfs://mycluster:9000/datas/products.txt";
        FileSystem fileSystem = FileSystem.get(URI.create(cacheFile), context.getConfiguration());
        FSDataInputStream fsDataInputStream = fileSystem.open(new Path(cacheFile));
        BufferedReader reader = new BufferedReader(new InputStreamReader(fsDataInputStream));
        // 1 Alternative: read the product file via the distributed cache (see the sketch after this class)
        //URI[] cacheFiles = context.getCacheFiles();
        //String path = cacheFiles[0].getPath().toString();
        //FileInputStream fileInputStream = new FileInputStream(path);
        //InputStreamReader inputStreamReader = new InputStreamReader(fileInputStream, "UTF-8");
        //BufferedReader reader = new BufferedReader(inputStreamReader);
        String line;
        while (StringUtils.isNotEmpty(line = reader.readLine())) {
            // 2 Split the line
            String[] fields = line.split("\t");
            // 3 Cache product id -> product name
            pdMap.put(fields[0], fields[1]);
        }
        // 4 Close the stream
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Each order line: order_id \t product_id \t quantity \t unit_price
        String[] line = value.toString().split("\t");
        orderBean.setO_id(line[0]);
        orderBean.setP_id(line[1]);
        orderBean.setO_num(Integer.parseInt(line[2]));
        orderBean.setO_price(Integer.parseInt(line[3]));
        // Map-side join: look up the product name by product id
        orderBean.setP_name(pdMap.get(line[1]));
        outputKey.set(pdMap.get(line[1]));
        context.write(this.outputKey, orderBean);
    }
}
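The commented-out lines in setup() above, together with the commented-out job.addCacheFile(...) line in the driver below, hint at an alternative: ship the product file with the job through the distributed cache instead of opening a hardcoded HDFS path. Here is a minimal sketch of that variant; it reuses the URI from the commented-out driver line, needs extra java.net.URI and java.io.FileInputStream imports, and reads the cache path the same way the commented-out code does, which works for local/pseudo-distributed runs (on a real cluster the localized file is usually opened by its plain file name instead).

// Driver (inside run()), before submitting the job:
job.addCacheFile(new URI("hdfs://mycluster:8020/datas/order/product.txt"));

// Mapper.setup(): read the cached copy instead of opening HDFS directly
URI[] cacheFiles = context.getCacheFiles();
String path = cacheFiles[0].getPath();
BufferedReader cacheReader = new BufferedReader(
        new InputStreamReader(new FileInputStream(path), "UTF-8"));
String cacheLine;
while ((cacheLine = cacheReader.readLine()) != null) {
    String[] fields = cacheLine.split("\t");
    pdMap.put(fields[0], fields[1]);
}
cacheReader.close();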
3. The MapJoinReduce class
package com.hadoop.mapreduce.mapJoin;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
* @author LengQing
* @date 2020/5/3 - 15:08
*/
public class MapJoinReduce extends Reducer<Text, MapJoinOrderBean, Text, Text> {
    private Text outputValue = new Text();

    @Override
    protected void reduce(Text key, Iterable<MapJoinOrderBean> values, Context context) throws IOException, InterruptedException {
        int num = 0;
        long amount = 0L;
        for (MapJoinOrderBean value : values) {
            // Total quantity of this product across all orders
            num += value.getO_num();
            // Total amount = quantity * unit price, summed over all order records
            amount += (long) value.getO_num() * value.getO_price();
        }
        outputValue.set(num + "\t" + amount);
        context.write(key, this.outputValue);
    }
}
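As a sanity check, with the illustrative sample rows from the Preface (again, not my real data), the join plus the aggregation above would emit lines like:

Apple	3	30
Banana	3	60

i.e. product name, total quantity, and total amount (quantity times unit price, summed per product).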
4. The MapJoinDriver class
package com.hadoop.mapreduce.mapJoin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * @author LengQing
 * @date 2020/5/3 - 14:29
 * Produce an output file whose fields are: product name, total quantity, total amount.
 */
public class MapJoinDriver extends Configured implements Tool {
    private Configuration conf = new Configuration();

    @Override
    public int run(String[] args) throws Exception {
        // Create the Job instance
        Job job = Job.getInstance(conf, "mapJoin");
        job.setJarByClass(MapJoinDriver.class);
        //job.addCacheFile(new URI("hdfs://mycluster:8020/datas/order/product.txt"));
        // 1 Input stage
        Path inputPath = new Path(args[0]);
        FileInputFormat.setInputPaths(job, inputPath);
        // 2 Map stage
        job.setMapperClass(MapJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MapJoinOrderBean.class);
        // 3 Shuffle stage
        //job.setGroupingComparatorClass(mapJoinGroup.class);
        // 4 Reduce stage
        job.setReducerClass(MapJoinReduce.class);
        job.setOutputKeyClass(Text.class); // the reducer emits a Text key (the product name)
        job.setOutputValueClass(Text.class);
        //job.setNumReduceTasks(0);
        // 5 Output stage
        Path outputPath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outputPath);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) {
        try {
            int status = ToolRunner.run(new MapJoinDriver(), args);
            System.exit(status);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
III. Notes on Running
Mind where the cache file (the product file) lives, and keep it small: every map task loads the whole file into an in-memory HashMap in setup().
Arguments for running on the cluster: 1) the input path of the order file, 2) the output path.
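For example, assuming the project has been packaged into a jar named mapjoin.jar (a hypothetical name) and the input lives under /datas as in the code above, the job could be launched like this:

hadoop jar mapjoin.jar com.hadoop.mapreduce.mapJoin.MapJoinDriver /datas/order.txt /datas/output_mapjoin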
IV. My Run Results
1. Input file order.txt

2. The product.txt file

3. Exported result

Summary
This is an interview question I was asked three years ago. The implementation is simple, but the idea matters.