Hadoop Distributed Computing Framework: MapReduce
I. Implementing Join Operations with MapReduce
Using the MapReduce API to implement a join between two tables.
1. Reduce Join
1.1. Requirement
Suppose the data volume is huge and both tables are stored as files in HDFS. We need a MapReduce program to implement the following SQL query:
select c.customer_id, c.customer_name, o.order_id, o.order_status from customer c join order o on c.customer_id = o.customer_id
1.2. Principle
Map side: tag each key/value pair with a flag that identifies which table (file) the record comes from, then emit the join field as the key and the remaining fields plus the flag as the value.
Reduce side: grouping by the join key has already been done by the framework, so within each group we only need to separate the records by source file (using the flag added in the map phase) and then merge them; see the implementation in 1.5.
1.3. Drawback
Both tables are transferred in full from the map side to the reduce side, i.e. through the shuffle phase, which causes heavy data traffic and low efficiency.
1.4. Basic Analysis
1. Input paths:
customer.csv
order.csv
2. Map side: check the number of fields per line; 4 fields means an order record, otherwise (customer records have more fields) a customer record.
key value
(customer_id, (customer_id, customer_name, order_id, order_status, flag))
3. Reduce side:
For each customer_id key, process its group and concatenate the values.
Input data:
Customer table record:
"256","David","Rodriguez","XXXXXXXXX","XXXXXXXXX","7605 Tawny Horse Falls","Chicago","IL","60625"
Corresponding order table records:
"2","2013-07-25 00:00:00","256","PENDING_PAYMENT"
"9467","2013-09-22 00:00:00","256","CLOSED"
Map output (i.e. what the reduce side receives):
key value
("256", CustomerOrder("256","David",null,null,"0")
        CustomerOrder("256",null,"2","PENDING_PAYMENT","1")
        CustomerOrder("256",null,"9467","CLOSED","1")
        …)
// The collection of order records
ArrayList(
    CustomerOrder("256",null,"2","PENDING_PAYMENT","1")
    CustomerOrder("256",null,"9467","CLOSED","1")
)
// The customer object
CustomerOrder("256","David",null,null,"0")
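Following this example through the reducer: it fills the customer name into each order bean and writes the bean out, so the joined output for customer 256 (in the bean's toString format shown later) would be:
256,David,2,PENDING_PAYMENT
256,David,9467,CLOSED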
1.5. Example Implementation
Mapper
package cn.kgc.kb09.reducejoin;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class CustomerOrderMapper extends Mapper<LongWritable, Text, Text, CustomerOreders> {
    // Reused across map() calls; safe because context.write() serializes the object immediately
    CustomerOreders customerOreders = new CustomerOreders();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line into a field array
        // (note: a plain split(",") assumes no field contains an embedded comma)
        String[] fields = value.toString().split(",");
        // 4 fields means an order record; otherwise it is a customer record
        if (fields.length == 4) {
            // Fill in the fields available from the order table
            customerOreders.setCustomer_id(fields[2]);
            customerOreders.setCustomer_name("");
            customerOreders.setOrder_id(fields[0]);
            customerOreders.setOrder_status(fields[3]);
            customerOreders.setFlag("1");
        } else {
            // Fill in the fields available from the customer table
            customerOreders.setCustomer_id(fields[0]);
            customerOreders.setCustomer_name(fields[1]);
            customerOreders.setOrder_id("");
            customerOreders.setOrder_status("");
            customerOreders.setFlag("0");
        }
        // Emit with the join key (customer_id) as the map output key
        context.write(new Text(customerOreders.getCustomer_id()), customerOreders);
    }
}
Reducer
package cn.kgc.kb09.reducejoin;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

public class CustomerOrderReduce extends Reducer<Text, CustomerOreders, CustomerOreders, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<CustomerOreders> values, Context context) throws IOException, InterruptedException {
        // 1. Collection for the order records of this customer_id
        ArrayList<CustomerOreders> orderBeans = new ArrayList<>();
        // Prepare one customer bean object
        CustomerOreders cusbean = new CustomerOreders();
        // 2. Sort the incoming records into the collection and the customer bean.
        //    Hadoop reuses the value object across iterations, so each record must be copied out.
        for (CustomerOreders bean : values) {
            if ("1".equals(bean.getFlag())) { // order record
                CustomerOreders orderBean = new CustomerOreders();
                try {
                    // Copy this order record into a fresh bean before storing it
                    BeanUtils.copyProperties(orderBean, bean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
                orderBeans.add(orderBean);
            } else { // customer record
                try {
                    // Copy the customer record into the customer bean
                    BeanUtils.copyProperties(cusbean, bean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }
        // 3. Fill the blank customer_name field of every order record
        for (CustomerOreders bean : orderBeans) {
            bean.setCustomer_name(cusbean.getCustomer_name());
            // 4. Write out the joined record
            context.write(bean, NullWritable.get());
        }
    }
}
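Note: BeanUtils.copyProperties copies fields via reflection, which is relatively slow inside a tight reduce loop. Since the bean below already defines an all-args constructor, a reflection-free copy is possible; this is an alternative sketch, not what the code above uses:

// Copy the reused value object into a fresh bean via the all-args constructor;
// a copy is still required because Hadoop reuses the value object across iterations.
CustomerOreders orderBean = new CustomerOreders(
        bean.getCustomer_id(),
        bean.getCustomer_name(),
        bean.getOrder_id(),
        bean.getOrder_status(),
        bean.getFlag());
orderBeans.add(orderBean);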
CustomerOreders class (the Writable bean)
package cn.kgc.kb09.reducejoin;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class CustomerOreders implements Writable {
    private String customer_id;
    private String customer_name;
    private String order_id;
    private String order_status;
    // Flag marking the source table: "0" = customer, "1" = order
    private String flag;

    public CustomerOreders() {
    }

    public CustomerOreders(String customer_id, String customer_name, String order_id, String order_status, String flag) {
        this.customer_id = customer_id;
        this.customer_name = customer_name;
        this.order_id = order_id;
        this.order_status = order_status;
        this.flag = flag;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        // Serialize the fields in a fixed order
        dataOutput.writeUTF(customer_id);
        dataOutput.writeUTF(customer_name);
        dataOutput.writeUTF(order_id);
        dataOutput.writeUTF(order_status);
        dataOutput.writeUTF(flag);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        // Deserialize the fields in the same order they were written
        this.customer_id = dataInput.readUTF();
        this.customer_name = dataInput.readUTF();
        this.order_id = dataInput.readUTF();
        this.order_status = dataInput.readUTF();
        this.flag = dataInput.readUTF();
    }

    @Override
    public String toString() {
        return customer_id + "," + customer_name + "," + order_id + "," + order_status;
    }

    public String getCustomer_id() {
        return customer_id;
    }

    public void setCustomer_id(String customer_id) {
        this.customer_id = customer_id;
    }

    public String getCustomer_name() {
        return customer_name;
    }

    public void setCustomer_name(String customer_name) {
        this.customer_name = customer_name;
    }

    public String getOrder_id() {
        return order_id;
    }

    public void setOrder_id(String order_id) {
        this.order_id = order_id;
    }

    public String getOrder_status() {
        return order_status;
    }

    public void setOrder_status(String order_status) {
        this.order_status = order_status;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }
}
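write() and readFields() must mirror each other exactly; a field read out of order silently corrupts every record. A minimal local round-trip check, assuming it lives in the same package as the bean (not part of the original code):

package cn.kgc.kb09.reducejoin;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class CustomerOredersCheck {
    public static void main(String[] args) throws IOException {
        CustomerOreders in = new CustomerOreders("256", "David", "2", "PENDING_PAYMENT", "1");
        // Serialize the bean to a byte array, as Hadoop would during the shuffle
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));
        // Deserialize into a fresh bean and compare the printed forms
        CustomerOreders out = new CustomerOreders();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(in);  // 256,David,2,PENDING_PAYMENT
        System.out.println(out); // should print the same line
    }
}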
Driver
package cn.kgc.kb09.reducejoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class CustomerOrderDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Create the configuration and the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "reducejoin");
        // 2. Set the jar location
        job.setJarByClass(CustomerOreders.class);
        // 3. Set the mapper and reducer classes
        job.setMapperClass(CustomerOrderMapper.class);
        job.setReducerClass(CustomerOrderReduce.class);
        // 4. Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CustomerOreders.class);
        // 5. Set the reduce output key/value types
        job.setOutputKeyClass(CustomerOreders.class);
        job.setOutputValueClass(NullWritable.class);
        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path("file:///C:/Users/86188/Desktop/7/data"));
        FileOutputFormat.setOutputPath(job, new Path("file:///F:/test/6")); // the output directory must not already exist
        // 7. Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
2. Map Join
2.1. Use Case
One table is very small and the other is very large.
2.2. Approach
When submitting the job, first place the small table file in the job's DistributedCache. Each mapper then reads the small table out of the DistributedCache into an in-memory container such as a HashMap, and scans the big table: for each record it looks up the join key in memory, and if a record with the same join key is found, the joined result is output directly (implemented in 2.3 below). Because the join happens entirely on the map side, no shuffle is needed.
2.3. Example Implementation
Mapper
package cn.kgc.kb09.mapjoin;

import cn.kgc.kb09.reducejoin.CustomerOreders;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

public class MapJoinMapper extends Mapper<LongWritable, Text, CustomerOreders, NullWritable> {
    HashMap<String, String> customerMap = new HashMap<>();
    CustomerOreders customerOreders = new CustomerOreders();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the URIs of the cached files
        URI[] cacheFiles = context.getCacheFiles();
        if (null != cacheFiles && cacheFiles.length > 0) {
            // Get the path of the cached (small) customer file
            String fileName = cacheFiles[0].getPath().toString();
            // Open a buffered reader over the file
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "UTF-8"));
            String line;
            // Read the file, using the first column (customer_id) as the HashMap key
            // and the second column (customer_name) as the value
            while (StringUtils.isNotEmpty(line = bufferedReader.readLine())) {
                String[] split = line.split(",");
                customerMap.put(split[0], split[1]);
            }
            // Close the reader
            bufferedReader.close();
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the current order line into fields
        String[] fields = value.toString().split(",");
        // Fill in the order fields
        customerOreders.setCustomer_id(fields[2]);
        customerOreders.setOrder_id(fields[0]);
        customerOreders.setOrder_status(fields[3]);
        // Look up the customer name in the in-memory HashMap
        customerOreders.setCustomer_name(customerMap.get(fields[2]));
        // Write out the joined record; with no reducer this goes straight to the output
        context.write(customerOreders, NullWritable.get());
    }
}
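With the same sample records as in 1.4, each order line is joined directly inside map(), so the output for customer 256 would again be (in the bean's toString format):
256,David,2,PENDING_PAYMENT
256,David,9467,CLOSED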
Driver
Set the cache file (the small file):
job.addCacheFile(new URI("file:///C:/Users/86188/Desktop/7/data/customers.csv"));
package cn.kgc.kb09.mapjoin;

import cn.kgc.kb09.reducejoin.CustomerOreders;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class MapJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        // 1. Create the configuration and the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "mapjoin");
        // 2. Set the jar location
        job.setJarByClass(MapJoinDriver.class);
        // 3. Set the mapper class (no reducer is used)
        job.setMapperClass(MapJoinMapper.class);
        // 4. Set the map output key/value types
        job.setMapOutputKeyClass(CustomerOreders.class);
        job.setMapOutputValueClass(NullWritable.class);
        // 5. Set the number of reduce tasks to 0, i.e. a map-only job
        job.setNumReduceTasks(0);
        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path("file:///C:/Users/86188/Desktop/7/mapjoin"));
        FileOutputFormat.setOutputPath(job, new Path("file:///F:/test/8")); // the output directory must not already exist
        // Set the cache file (the small customer table); on a cluster this would normally be an HDFS path
        job.addCacheFile(new URI("file:///C:/Users/86188/Desktop/7/data/customers.csv"));
        // 7. Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
II. Speculative Execution
Problem
- When the program has bugs or the load is unbalanced, a few tasks become stragglers that hold back the whole job
- e.g., 99 of 100 map tasks have finished while the last one sits at 10%
Speculative execution launches backup attempts for such tasks
- Whichever attempt finishes first provides the final result
- An optimization strategy that trades extra resources for shorter run time
- Not suitable when cluster resources are already tight (see the configuration sketch below)
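Speculative execution can be toggled per job through the standard Hadoop 2.x configuration properties. A minimal sketch, set on the same Configuration object a driver would pass to Job.getInstance:

// Enable backup attempts for slow map tasks (usually on by default)
conf.setBoolean("mapreduce.map.speculative", true);
// Disable backup attempts for reduce tasks, e.g. when resources are tight
conf.setBoolean("mapreduce.reduce.speculative", false);
Job job = Job.getInstance(conf, "reducejoin");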