Hadoop Distributed Computing Framework: MapReduce (Part 2)

I. Implementing joins with MapReduce

Implement joins using the MapReduce API.

1. Reduce join

1.1. Requirement

Suppose the data volume is huge and the two tables are stored as files in HDFS, and we need a MapReduce program to implement the following SQL query:

 select c.customer_id,c.customer_name,o.orderId,o.order_status from customer c join order o on c.customer_id=o.customer_id

1.2. How it works

Map side: tag each key/value pair with the table (file) it came from so that records from different sources can be told apart. Use the join field as the key, and the remaining fields plus the tag as the value, then emit the pair.
Reduce side: by the time records reach the reducer, grouping by the join key is already done. Within each group we only need to separate the records that came from different source files (using the tag added in the map phase) and then merge them.

1.3. Drawback

All the join work happens after the shuffle, so a large amount of data is transferred between the map and reduce sides (the shuffle phase), which makes this approach inefficient.

1.4. Basic analysis

1. Input paths:
customer.csv
order.csv

2. Map side: check the number of fields. A record with 4 fields comes from the order table; otherwise it comes from the customer table.

key value

(customer_id, (customer_id, customer_name, order_id, order_status, flag))

3. Reduce side:

For each customer_id key, process the group of values and stitch them together.

Sample input data:
Customer table row:

"256","David","Rodriguez","XXXXXXXXX","XXXXXXXXX","7605 Tawny Horse Falls","Chicago","IL","60625"

Matching rows in the order table:

"2","2013-07-25 00:00:00","256","PENDING_PAYMENT"

"9467","2013-09-22 00:00:00","256","CLOSED"

Map output (i.e., what the reducer receives):

key value

("256", CustomerOrder("256","David",null,null,"0")

        CustomerOrder("256",null,"2","PENDING_PAYMENT","1")

        CustomerOrder("256",null,"9467","CLOSED","1")

…)

// the collection of order records

ArrayList(

        CustomerOrder("256",null,"2","PENDING_PAYMENT","1")

        CustomerOrder("256",null,"9467","CLOSED","1")
)

// the customer object

CustomerOrder("256","David",null,null,"0")
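After the reducer copies the customer's name into each order record in the list, the expected joined output for key "256" would be:

256,David,2,PENDING_PAYMENT
256,David,9467,CLOSED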

1.5. Example implementation

Mapper

package cn.kgc.kb09.reducejoin;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class CustomerOrderMapper extends Mapper<LongWritable, Text,Text,CustomerOreders> {
    CustomerOreders customerOreders=new CustomerOreders();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // split the input line on commas into a field array
        String[] fields = value.toString().split(",");
        // if there are 4 fields this is an order record, otherwise a customer record
        if(fields.length==4){
            // fill in the fields available from the order record
            customerOreders.setCustomer_id(fields[2]);
            customerOreders.setCustomer_name("");
            customerOreders.setOrder_id(fields[0]);
            customerOreders.setOrder_status(fields[3]);
            customerOreders.setFlag("1");
        }else {
            // fill in the fields available from the customer record
            customerOreders.setCustomer_id(fields[0]);
            customerOreders.setCustomer_name(fields[1]);
            customerOreders.setOrder_id("");
            customerOreders.setOrder_status("");
            customerOreders.setFlag("0");
        }
        context.write(new Text(customerOreders.getCustomer_id()),customerOreders);
    }
}

Reducer

package cn.kgc.kb09.reducejoin;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

public class CustomerOrderReduce extends Reducer<Text,CustomerOreders,CustomerOreders, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<CustomerOreders> values, Context context) throws IOException, InterruptedException {
        // 1. prepare a list to collect the order records
        ArrayList<CustomerOreders> orderBeans=new ArrayList<>();
        // prepare one customer bean object
        CustomerOreders cusbean=new CustomerOreders();
        // 2. sort the incoming values into the list / the customer bean so they can be merged
        for (CustomerOreders bean : values) {
            if ("1".equals(bean.getFlag())){//订单表
                CustomerOreders orderBean=new CustomerOreders();
                try {
                    // copy the incoming order record into a fresh bean; Hadoop reuses the same
                    // value object across the values iterator, so it must be copied before being kept
                    BeanUtils.copyProperties(orderBean,bean);
                } catch (IllegalAccessException e) {
                    e.printStackTrace();
                } catch (InvocationTargetException e) {
                    e.printStackTrace();
                }

                orderBeans.add(orderBean);
            }else { // customer record
                try {
                    // copy the customer record into the customer bean
                    BeanUtils.copyProperties(cusbean,bean);
                } catch (IllegalAccessException e) {
                    e.printStackTrace();
                } catch (InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }
        // 3. walk through the order records and fill in the missing customer name
        for (CustomerOreders bean : orderBeans) {
            bean.setCustomer_name(cusbean.getCustomer_name());
            // 4. write out the joined record
            context.write(bean,NullWritable.get());
        }
    }
}

The CustomerOreders class

package cn.kgc.kb09.reducejoin;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class CustomerOreders implements Writable {
    private  String customer_id;
    private String customer_name;
    private String order_id;
    private String order_status;
    // flag marking the source table: "0" = customer record, "1" = order record
    private String flag;

    public CustomerOreders() {
    }

    public CustomerOreders(String customer_id, String customer_name, String order_id, String order_status, String flag) {
        this.customer_id = customer_id;
        this.customer_name = customer_name;
        this.order_id = order_id;
        this.order_status = order_status;
        this.flag = flag;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(customer_id);
        dataOutput.writeUTF(customer_name);
        dataOutput.writeUTF(order_id);
        dataOutput.writeUTF(order_status);
        dataOutput.writeUTF(flag);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        // read the fields in the same order they were written
        this.customer_id=dataInput.readUTF();
        this.customer_name=dataInput.readUTF();
        this.order_id=dataInput.readUTF();
        this.order_status=dataInput.readUTF();
        this.flag=dataInput.readUTF();
    }

    @Override
    public String toString() {
        return customer_id + "," + customer_name +
                "," + order_id +
                "," + order_status;

    }

    public String getCustomer_id() {
        return customer_id;
    }

    public void setCustomer_id(String customer_id) {
        this.customer_id = customer_id;
    }

    public String getCustomer_name() {
        return customer_name;
    }

    public void setCustomer_name(String customer_name) {
        this.customer_name = customer_name;
    }

    public String getOrder_id() {
        return order_id;
    }

    public void setOrder_id(String order_id) {
        this.order_id = order_id;
    }

    public String getOrder_status() {
        return order_status;
    }

    public void setOrder_status(String order_status) {
        this.order_status = order_status;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }
}

Driver

package cn.kgc.kb09.reducejoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class CustomerOrderDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. create the configuration and the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "reducejoin");
        // 2. set the jar location
        job.setJarByClass(CustomerOreders.class);
        // 3. set the mapper and reducer classes
        job.setMapperClass(CustomerOrderMapper.class);
        job.setReducerClass(CustomerOrderReduce.class);
        // 4. set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CustomerOreders.class);
        // 5. set the reduce output key/value types
        job.setOutputKeyClass(CustomerOreders.class);
        job.setOutputValueClass(NullWritable.class);
        // 6. set the input and output paths
        FileInputFormat.setInputPaths(job,new Path("file:///C:/Users/86188/Desktop/7/data"));
        FileOutputFormat.setOutputPath(job,new Path("file:///F:/test/6")); // the output directory must not already exist
        // 7. submit the job and wait for completion
        boolean result=job.waitForCompletion(true);
        System.exit(result?0:1);
    }
}
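The input and output paths above are hard-coded for a local Windows test run. A minimal sketch, assuming the two directories are instead passed in as program arguments (hypothetical args usage, so the same driver can be submitted to a cluster unchanged), would replace step 6 with:

        // 6. take the paths from the command line:
        // args[0] = input directory, args[1] = output directory (must not already exist)
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));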

2. Map Join

2.1. When to use it

One table is very small and the other is very large.

2.2. How it works

When submitting the job, add the small table's file to the job's DistributedCache. In the mapper, read the small table out of the DistributedCache into an in-memory container such as a HashMap, then scan the big table: for each record, look up its join key in the in-memory container, and if a matching key is found, output the joined result directly.

2.3. Example implementation

Mapper

package cn.kgc.kb09.mapjoin;

import cn.kgc.kb09.reducejoin.CustomerOreders;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

public class MapJoinMapper extends Mapper<LongWritable, Text, CustomerOreders, NullWritable> {
   HashMap<String,String>customerMap=new HashMap<>();
   CustomerOreders customerOreders=new CustomerOreders();
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // get the URIs of the cached files
        URI[] cacheFiles = context.getCacheFiles();
        if(null!=cacheFiles&&cacheFiles.length>0){
            // get the path (file name) of the first cached file
            String fileName = cacheFiles[0].getPath().toString();
            // open a buffered reader on the cached file
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "UTF-8"));
            String line;
            // read the file, using the first column (customer_id) as the HashMap key and the second column (customer_name) as the value
            while(StringUtils.isNotEmpty(line=bufferedReader.readLine())){
                String[] split = line.split(",");
                customerMap.put(split[0],split[1]);
            }
            // close the reader
            bufferedReader.close();
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // split each input line (an order record) into fields
        String[] fields = value.toString().split(",");
        // fill in the customerOreders bean
        customerOreders.setCustomer_id(fields[2]);
        customerOreders.setOrder_id(fields[0]);
        customerOreders.setOrder_status(fields[3]);
        // look up the customer's name in the HashMap built in setup()
        customerOreders.setCustomer_name(customerMap.get(fields[2]));
        // write out the joined record
        context.write(customerOreders,NullWritable.get());

    }
}
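The mapper above assumes every line under the input directory is a 4-field order record. If other files might end up in that directory, a small guard (hypothetical, not part of the original code) at the top of map() could skip anything else:

        // hypothetical guard: ignore lines that are not 4-field order records
        String[] fields = value.toString().split(",");
        if (fields.length != 4) {
            return;
        }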

Driver
Set the cache file (the small file):

  job.addCacheFile(new URI("file:///C:/Users/86188/Desktop/7/data/customers.csv"));
package cn.kgc.kb09.mapjoin;

import cn.kgc.kb09.reducejoin.CustomerOreders;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class MapJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        // 1. create the configuration and the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "mapjoin");
        // 2. set the jar location
        job.setJarByClass(MapJoinDriver.class);
        // 3. set the mapper class (this job has no reducer)
        job.setMapperClass(MapJoinMapper.class);

        // 4. set the map output key/value types
        job.setMapOutputKeyClass(CustomerOreders.class);
        job.setMapOutputValueClass(NullWritable.class);
        // 5. set the number of reduce tasks to 0, i.e. run a map-only job
        job.setNumReduceTasks(0);
        // 6. set the input and output paths
        FileInputFormat.setInputPaths(job,new Path("file:///C:/Users/86188/Desktop/7/mapjoin"));
        FileOutputFormat.setOutputPath(job,new Path("file:///F:/test/8")); // the output directory must not already exist
        // set the cache file (the small customer table)
        job.addCacheFile(new URI("file:///C:/Users/86188/Desktop/7/data/customers.csv"));

        // 7. submit the job and wait for completion
        boolean result=job.waitForCompletion(true);
        System.exit(result?0:1);
    }
}

II. Speculative execution

The problem

  • When the program has a bug or the load is unbalanced, a few tasks become the bottleneck of the whole job
  • For example: 99 of 100 map tasks have finished, but the last one is stuck at 10%

Speculative execution launches a backup attempt for such a task

  • The output of whichever attempt finishes first is taken as the final result
  • It is an optimization that trades extra resources for shorter job completion time (see the configuration sketch below)
  • It is not appropriate when cluster resources are already tight
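Speculative execution is enabled by default and can be toggled per job through the standard MapReduce configuration keys. A minimal sketch, assuming a Job object built the same way as in the drivers above:

        // disable speculative execution for both the map and reduce tasks of this job
        job.getConfiguration().setBoolean("mapreduce.map.speculative", false);
        job.getConfiguration().setBoolean("mapreduce.reduce.speculative", false);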