MapReduce连接查询入门
MapReduce连接查询分为reduce连接查询和map连接查询。
reduce阶段连接查询
缺点:容易出现数据倾斜
解决方法:使用map阶段连接查询
Map阶段连接查询
记录商品信息文件link/shop(商品id,商品名称)
1 小米
2 华为
3 联想
记录订单信息文件link/order(订单id,商品id,商品数量)
1 1 1
2 2 2
3 3 3
4 1 4
5 2 5
6 3 6
将order文件和shop文件合并,输出格式为订单编号、商品名称、商品数量的数据。
实体类
实现Writable接口,使对象可以在map和reduce之间进行序列化和反序列化传输
public class Goods implements Writable{
//订单编号
private int oid;
//商品编号
private int nid;
//商品名称
private String name;
//商品数量
private int num;
public Goods(int oid, int nid, String name, int num) {
this.oid = oid;
this.nid = nid;
this.name = name;
this.num = num;
}
@Override
public String toString() {
return "Goods{" +
"oid=" + oid +
", nid=" + nid +
", name='" + name + '\'' +
", num=" + num +
'}';
}
public int getOid() {
return oid;
}
public void setOid(int oid) {
this.oid = oid;
}
public int getNid() {
return nid;
}
public void setNid(int nid) {
this.nid = nid;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public int getNum() {
return num;
}
public void setNum(int num) {
this.num = num;
}
public Goods() {
}
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeInt(this.oid);
dataOutput.writeInt(this.nid);
dataOutput.writeUTF(this.name);
dataOutput.writeInt(this.num);
}
public void readFields(DataInput dataInput) throws IOException {
this.oid=dataInput.readInt();
this.nid=dataInput.readInt();
this.name=dataInput.readUTF();
this.num=dataInput.readInt();
}
}
Mapper
在自定义mapper中,根据输入文件名区分商品数据和订单数据,将每行数据放入Goods对象中,以两者的共同属性商品id为输出key,Goods对象为输出value。
/**
 * Tags every input line with its product id so that shop and order records
 * for the same product meet in one reduce call.
 *
 * <p>The record kind is decided by the input file name: files whose name
 * starts with {@code "shop"} are parsed as {@code productId productName};
 * every other file is parsed as {@code orderId productId quantity}.
 */
public class LinkMapper extends Mapper<LongWritable, Text, IntWritable, Goods> {
    /** Number of fields a shop line must have: product id, product name. */
    private static final int SHOP_FIELD_COUNT = 2;
    /** Number of fields an order line must have: order id, product id, quantity. */
    private static final int ORDER_FIELD_COUNT = 3;

    /**
     * Parses one line and emits {@code (productId, Goods)}.
     *
     * @param key     offset key supplied by the input format (unused)
     * @param value   one line of the shop or order file
     * @param context task context used to read the split and emit output
     * @throws IOException          if the line has too few fields or the write fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString().trim();
        if (line.isEmpty()) {
            // A blank line (e.g. a trailing newline) carries no record; skip it
            // instead of failing the task in Integer.parseInt.
            return;
        }

        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        String fileName = fileSplit.getPath().getName();
        boolean fromShop = fileName.startsWith("shop");

        // Split on any run of whitespace so tabs or repeated spaces do not
        // produce empty fields.
        String[] fields = line.split("\\s+");
        int required = fromShop ? SHOP_FIELD_COUNT : ORDER_FIELD_COUNT;
        if (fields.length < required) {
            throw new IOException("Malformed line in " + fileName + ": expected at least "
                    + required + " fields but got " + fields.length + ": '" + line + "'");
        }

        Goods goods = new Goods();
        if (fromShop) {
            // Shop record: no order information, so order id and quantity are 0.
            goods.setOid(0);
            goods.setNum(0);
            goods.setNid(Integer.parseInt(fields[0]));
            goods.setName(fields[1]);
        } else {
            // Order record: the name is left empty; the reducer uses the empty
            // name to tell order records apart from shop records.
            goods.setOid(Integer.parseInt(fields[0]));
            goods.setNid(Integer.parseInt(fields[1]));
            goods.setName("");
            goods.setNum(Integer.parseInt(fields[2]));
        }
        context.write(new IntWritable(goods.getNid()), goods);
    }
}
Reduce
Reduce得到的数据中,key相同的Goods对象里,从order中获取的对象缺少商品名称,而从shop中获取的对象含有商品名称。将shop中的商品名称填入order对象中,输出即可得到结果。
/**
 * Joins the shop record and the order records that share one product id and
 * writes one line per order in the form {@code orderId productName quantity}.
 */
public class LinkReducer extends Reducer<IntWritable, Goods, Text, NullWritable> {
    /**
     * Buffers the order records of one product, picks up the product name from
     * the shop record, then emits every buffered order with that name.
     *
     * <p>A record with an empty name is treated as an order; any other record
     * is treated as the shop record. If no shop record arrives for the key, the
     * orders are still written, with an empty product name.
     *
     * @param key     product id shared by all values
     * @param values  shop and order records for this product
     * @param context task context used to emit output
     * @throws IOException          if the write fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void reduce(IntWritable key, Iterable<Goods> values, Context context) throws IOException, InterruptedException {
        ArrayList<Goods> orders = new ArrayList<Goods>();
        String name = "";
        for (Goods goods : values) {
            if (StringUtils.isEmpty(goods.getName())) {
                // The framework reuses the value object between iterations, so
                // a copy must be buffered. Copy the fields explicitly rather
                // than through reflection, so a failed copy cannot be silently
                // swallowed and emitted as an all-zero row.
                orders.add(new Goods(goods.getOid(), goods.getNid(), goods.getName(), goods.getNum()));
            } else {
                name = goods.getName();
            }
        }
        for (Goods order : orders) {
            String str = order.getOid() + " " + name + " " + order.getNum();
            context.write(new Text(str), NullWritable.get());
        }
    }
}
Driver
/**
 * Driver that configures and runs the shop/order join job: reads every file
 * under {@code link} and writes the joined lines to {@code linkout}.
 */
public class LinkWork {
    /**
     * Submits the job and waits for it to finish. The process exits with
     * status 1 when the job fails or an exception is raised, so that calling
     * scripts can detect the failure.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Configuration config = new Configuration();
        boolean succeeded = false;
        try {
            Job link = Job.getInstance(config);
            try {
                link.setJobName("link");
                link.setJarByClass(LinkWork.class);
                link.setMapperClass(LinkMapper.class);
                link.setReducerClass(LinkReducer.class);
                link.setMapOutputKeyClass(IntWritable.class);
                link.setMapOutputValueClass(Goods.class);
                link.setOutputKeyClass(Text.class);
                link.setOutputValueClass(NullWritable.class);
                FileInputFormat.setInputPaths(link, "link");

                // The job refuses to start if the output directory already
                // exists, so remove the result of any previous run first.
                Path output = new Path("linkout");
                if (output.getFileSystem(config).exists(output)) {
                    output.getFileSystem(config).delete(output, true);
                }
                FileOutputFormat.setOutputPath(link, output);

                succeeded = link.waitForCompletion(true);
            } finally {
                // Release the job's resources on the failure path as well.
                link.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        if (!succeeded) {
            System.exit(1);
        }
    }
}