1. Reduce-side join algorithm implementation
Mechanism:
The join condition (the product id) is used as the map output key, so records from both tables that satisfy the join condition, each tagged with the file they came from, are sent to the same reduce task, where the two sides are stitched together.
Drawback:
With this approach the join itself happens in the reduce phase, so the reduce side carries most of the processing load while the map tasks do very little work, leaving resources underutilized. The reduce phase is also very prone to data skew: for example, if one hot product id accounts for most of the orders, the single reduce task that receives that key ends up processing most of the records.
Requirement:
Order table t_order:
id | date | pid | amount |
1001 | 20150710 | P0001 | 2 |
1002 | 20150710 | P0001 | 3 |
1002 | 20150710 | P0002 | 3 |
Product table t_product:
id | pname | category_id | price |
P0001 | 小米5 | 1000 | 2000 |
P0002 | 锤子T1 | 1000 | 3000 |
Assuming the data volume is huge and both tables are stored as files on HDFS, write a MapReduce program that implements the following SQL query:
select a.id, a.date, b.pname, b.category_id, b.price from t_order a join t_product b on a.pid = b.id
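Since the mappers below split each input line on commas, assume both tables are exported to HDFS as comma-separated text files along the following lines (the file names are assumptions; note that the order file's name must contain "orders" for the file-name check in the mapper of step 2 to work):
orders.txt
1001,20150710,P0001,2
1002,20150710,P0001,3
1002,20150710,P0002,3
product.txt
P0001,小米5,1000,2000
P0002,锤子T1,1000,3000
With this input the query should produce three joined rows, one per order: (1001, 20150710, 小米5, 1000, 2000), (1002, 20150710, 小米5, 1000, 2000) and (1002, 20150710, 锤子T1, 1000, 3000).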
Step 1: Define the JoinBean
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class JoinBean implements Writable {
private String id;
private String date;
private String pid;
private String amount;
private String name;
private String categoryId;
private String price;
@Override
public void write(DataOutput out) throws IOException {
// Appending "" converts null fields into the literal string "null", so writeUTF never receives a real null
out.writeUTF(id+"");
out.writeUTF(date+"");
out.writeUTF(pid+"");
out.writeUTF(amount+"");
out.writeUTF(name+"");
out.writeUTF(categoryId+"");
out.writeUTF(price+"");
}
@Override
public void readFields(DataInput in) throws IOException {
this.id = in.readUTF();
this.date = in.readUTF();
this.pid = in.readUTF();
this.amount = in.readUTF();
this.name = in.readUTF();
this.categoryId = in.readUTF();
this.price = in.readUTF();
}
public JoinBean() {
}
public JoinBean(String id, String date, String pid, String amount, String name, String categoryId, String price) {
this.id = id;
this.date = date;
this.pid = pid;
this.amount = amount;
this.name = name;
this.categoryId = categoryId;
this.price = price;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getDate() {
return date;
}
public void setDate(String date) {
this.date = date;
}
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = pid;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getCategoryId() {
return categoryId;
}
public void setCategoryId(String categoryId) {
this.categoryId = categoryId;
}
public String getPrice() {
return price;
}
public void setPrice(String price) {
this.price = price;
}
public String getAmount() {
return amount;
}
public void setAmount(String amount) {
this.amount = amount;
}
@Override
public String toString() {
return "JoinBean{" +
"id='" + id + '\'' +
", date='" + date + '\'' +
", pid='" + pid + '\'' +
", amount='" + amount + '\'' +
", name='" + name + '\'' +
", categoryId='" + categoryId + '\'' +
", price='" + price + '\'' +
'}';
}
}
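A detail worth noting: because write() appends an empty string to each field (id+""), null fields are serialized as the literal string "null" rather than throwing a NullPointerException in writeUTF, so after deserialization the reducer sees the string "null", not a real null. A minimal local sketch of that round trip (a hypothetical check, not part of the MapReduce job):
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
public class JoinBeanRoundTrip {
    public static void main(String[] args) throws Exception {
        // Build a bean the way the order-side mapper does: product fields left null
        JoinBean before = new JoinBean("1001", "20150710", "P0001", "2", null, null, null);
        DataOutputBuffer out = new DataOutputBuffer();
        before.write(out); // null fields are written as the string "null"
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        JoinBean after = new JoinBean();
        after.readFields(in);
        System.out.println(after.getName()); // prints "null" — a four-character string, not a real null
    }
}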
Step 2: Define the Mapper class
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
public class OrderJoinMapper extends Mapper<LongWritable, Text, Text, JoinBean> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        JoinBean joinBean = new JoinBean();
        String[] data = value.toString().split(",");
        // Determine which table the current record belongs to from the name of the file being read
        FileSplit split = (FileSplit) context.getInputSplit();
        String name = split.getPath().getName();
        if (name.contains("orders")) {
            // Order record: id, date, pid, amount — the join key is the product id (pid)
            joinBean.setId(data[0]);
            joinBean.setDate(data[1]);
            joinBean.setPid(data[2]);
            joinBean.setAmount(data[3]);
            context.write(new Text(data[2]), joinBean);
        } else {
            // Product record: id, pname, category_id, price — the join key is the product id
            joinBean.setName(data[1]);
            joinBean.setCategoryId(data[2]);
            joinBean.setPrice(data[3]);
            context.write(new Text(data[0]), joinBean);
        }
    }
}
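For example, with the assumed sample files above the mapper emits pairs like the following (values rendered by JoinBean.toString(); fields that were never set print as null):
P0001 -> JoinBean{id='1001', date='20150710', pid='P0001', amount='2', name='null', categoryId='null', price='null'} (from orders.txt)
P0001 -> JoinBean{id='null', date='null', pid='null', amount='null', name='小米5', categoryId='1000', price='2000'} (from product.txt)
Because both sides share the same key, they arrive at the same reduce call, where they can be stitched together.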
Step 3: Define the Reducer class
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class OrderJoinReducer extends Reducer<Text, JoinBean, JoinBean, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<JoinBean> values, Context context) throws IOException, InterruptedException {
        // Collect every order record for this product id; there is at most one product record per key
        List<JoinBean> orderBeans = new ArrayList<>();
        String name = null;
        String categoryId = null;
        String price = null;
        for (JoinBean value : values) {
            // Order records carry a real id; product records were serialized with a null id,
            // which readFields turns into the literal string "null"
            if (value.getId() != null && !"null".equals(value.getId())) {
                // Hadoop reuses the value object across iterations, so copy the fields into a new bean
                orderBeans.add(new JoinBean(value.getId(), value.getDate(), value.getPid(), value.getAmount(), null, null, null));
            } else {
                name = value.getName();
                categoryId = value.getCategoryId();
                price = value.getPrice();
            }
        }
        // Emit one joined record per order, enriched with the product fields
        for (JoinBean orderBean : orderBeans) {
            orderBean.setName(name);
            orderBean.setCategoryId(categoryId);
            orderBean.setPrice(price);
            context.write(orderBean, NullWritable.get());
        }
    }
}
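With the sample data, each reduce call now emits one joined record per order, e.g. for the key P0001:
JoinBean{id='1001', date='20150710', pid='P0001', amount='2', name='小米5', categoryId='1000', price='2000'}
JoinBean{id='1002', date='20150710', pid='P0001', amount='3', name='小米5', categoryId='1000', price='2000'}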
Step 4: Develop the main entry point (the driver)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class OrderJoinDriver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // job.setJarByClass(OrderJoinDriver.class);
        job.setMapperClass(OrderJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(JoinBean.class);
        job.setReducerClass(OrderJoinReducer.class);
        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);
        TextInputFormat.addInputPath(job, new Path("F:\\3term\\task_201911118_customInputFormat\\src\\main\\java\\com\\czxy\\demo05_map_join\\input"));
        TextOutputFormat.setOutputPath(job, new Path("F:\\3term\\task_201911118_customInputFormat\\src\\main\\java\\com\\czxy\\demo05_map_join\\output"));
        return job.waitForCompletion(true) ? 0 : 1;
    }
    public static void main(String[] args) throws Exception {
        ToolRunner.run(new OrderJoinDriver(), args);
    }
}
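The hard-coded F:\ paths mean this driver is intended to be run in local mode from the IDE. As a sketch of a common variation (not part of the original code), the locations can instead be read from the command line so the same driver also works on a cluster:
TextInputFormat.addInputPath(job, new Path(args[0]));
TextOutputFormat.setOutputPath(job, new Path(args[1]));
The packaged job could then be submitted with hadoop jar <your-jar> <fully-qualified driver class> <input dir> <output dir>.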
2. Map-side join algorithm implementation
How it works:
This approach suits joins where one of the tables is small.
The small table can be distributed to every map node, so each map task joins the big-table records it reads locally against the in-memory copy and writes the final result directly. Because no shuffle to a reducer is needed, this greatly increases the parallelism of the join and speeds up processing.
Implementation outline:
--Pre-load the small table in the mapper class (in setup()), then perform the join in map().
--In a real-world scenario the small side is typically loaded once from a database, or shipped to every map task with the DistributedCache, which is the approach taken below.
Step 1: Define the MapJoinMapper
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
public class MapJoinMapper extends Mapper<LongWritable, Text, Text, Text> {
    HashMap<String, String> map = new HashMap<>();
    String line = null;
    // Read the small file from the distributed cache into an in-memory map, keyed by product id
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the files registered in the distributed cache
        URI[] cacheFiles = DistributedCache.getCacheFiles(context.getConfiguration());
        // Get the file system the cached file lives on
        FileSystem fileSystem = FileSystem.get(cacheFiles[0], context.getConfiguration());
        // Open an input stream on the cached file
        FSDataInputStream open = fileSystem.open(new Path(cacheFiles[0]));
        // Wrap it in a buffered reader for line-by-line access
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(open));
        // Each product line looks like: p0001,xiaomi,1000,2
        while ((line = bufferedReader.readLine()) != null) {
            String[] split = line.split(",");
            // key:   p0001
            // value: p0001,xiaomi,1000,2
            map.put(split[0], line);
        }
        bufferedReader.close();
    }
    // Join each big-file (order) record against the cached small table on the map side
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // value is an order line such as: 1001,20150710,p0001,2
        String[] split = value.toString().split(",");
        // Look up the cached product line for this order's pid
        String s = map.get(split[2]);
        context.write(new Text(s), new Text(split[0] + "\t" + split[1] + "\t" + split[3]));
    }
}
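For example, given the cached product line p0001,xiaomi,1000,2 and the order line 1001,20150710,p0001,2, map() emits the key p0001,xiaomi,1000,2 with the tab-separated value 1001 20150710 2, so each output line already contains the fully joined record and no reduce phase is needed.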
Step 2: Define the driver with the main method
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
public class MapJoinDriver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Add the small table (the product file) to the distributed cache.
        // On a real cluster the URI must point to a file on HDFS; a local path only works in local mode.
        DistributedCache.addCacheFile(new URI("F:\\3term\\task_201911118_customInputFormat\\src\\main\\java\\com\\czxy\\demo05_map_join\\input\\product.txt"), conf);
        Job job = Job.getInstance(conf, "MapJoinDriver");
        // job.setJarByClass(MapJoinDriver.class);
        job.setMapperClass(MapJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Map-only job: the join happens entirely on the map side, so no reducers are needed
        job.setNumReduceTasks(0);
        job.setInputFormatClass(TextInputFormat.class);
        // The big table (the order file) is read as the normal job input
        TextInputFormat.addInputPath(job, new Path("F:\\3term\\task_201911118_customInputFormat\\src\\main\\java\\com\\czxy\\demo05_map_join\\input\\order.txt"));
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("F:\\3term\\task_201911118_customInputFormat\\src\\main\\java\\com\\czxy\\demo05_map_join\\output"));
        return job.waitForCompletion(true) ? 0 : 1;
    }
    public static void main(String[] args) throws Exception {
        ToolRunner.run(new MapJoinDriver(), args);
    }
}
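Note that org.apache.hadoop.filecache.DistributedCache is deprecated in current Hadoop releases. A minimal sketch of the equivalent calls on the newer Job / Mapper.Context API (assuming Hadoop 2.x or later; the HDFS path is illustrative):
// In the driver, register the small file on the Job rather than on the Configuration:
job.addCacheFile(new URI("hdfs://namenode:8020/input/product.txt"));
// In the Mapper's setup(), retrieve the registered URIs from the context:
URI[] cacheFiles = context.getCacheFiles();
The rest of the setup() logic (opening the file and loading it into the HashMap) stays the same.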