Background
There are two files, customer.txt and orders.txt, which hold customer information and customer order information respectively. We want to implement a database-style join on them, similar to "select a.*,b.* from A a,B b where a.cid = b.cid;".
//customer.txt holds the customer information
//customer id, name, age
//cid,name,age
1,tom1,12
2,tom2,13
3,tom3,14
4,tom4,15
//orders.txt holds the customer order information
//order id, price, customer id
//oid,price,cid
1,12.23,1
2,12.48,1
3,12.23,2
4,12.56,2
5,12.23,2
6,15.55,3
7,16.23,3
8,18.78,3
9,20.23,3
Approach
The map method reads both files, builds a custom composite key from each line, and sends it to the reduce side for sorting and grouping. Within each group that reaches a reduce function, the first record is the customer's information and the remaining records are that customer's orders.
We take cid modulo numPartitions for the records from both files, so the customer information read from customer.txt and the order information read from orders.txt that share a cid end up in the same partition and are sent to the same reduce task.
On the reduce side we group the records by cid, so a customer's information and all of that customer's orders are handled by a single reduce function, which then iterates over the group to perform the join.
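For example (a sketch based on the sample data above, with the two reduce tasks configured later in App): cid % 2 sends cid=2 and cid=4 to partition 0, and cid=1 and cid=3 to partition 1. Inside partition 1, the sort comparator arranges the composite keys of the cid=3 group as
(cid=3, type=0) "3,tom3,14"         <- the customer record comes first
(cid=3, type=1, oid=6) "6,15.55,3"
(cid=3, type=1, oid=7) "7,16.23,3"
(cid=3, type=1, oid=8) "8,18.78,3"
(cid=3, type=1, oid=9) "9,20.23,3"
and the grouping comparator hands all five keys to a single reduce() call.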
Implementation
The custom composite key, populated from the lines read out of customer.txt and orders.txt.
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Composite key for a customer and that customer's orders.
 * One customer can have many orders.
 * A composite key must be serializable and comparable, so it implements WritableComparable.
 */
public class CustomerOrdersComboKey implements WritableComparable<CustomerOrdersComboKey> {
// customer id
private int cid;
// order id
private int oid;
/**
 * The composite key is built from either customer.txt or orders.txt,
 * so keys come in two types:
 * type == 0 means this key carries customer information
 * type == 1 means this key carries order information
 */
private int type;
// customer info string
private String customerInfo = "";
// order info string
private String orderInfo = "";
/**
 * Sort comparison between two composite keys.
 * Compare the cids first to decide whether the two keys belong to the same customer
 * (each key carries either a customer record from customer.txt or an order record from orders.txt).
 * Same cid and same type: sort by order oid ascending.
 * Same cid but different type: the customer record (type == 0) sorts before the order records (type == 1),
 * i.e. return -1 for type == 0 and 1 for type == 1.
 * Different cids: sort by cid ascending.
 */
public int compareTo(CustomerOrdersComboKey key2) {
int cid2 = key2.getCid();
int oid2 = key2.getOid();
int type2 = key2.getType();
if(cid == cid2){ // do the two keys belong to the same customer?
if(type == type2){
return oid - oid2; // same type: sort by order oid ascending
}else{
if(type == 0){
return -1;
}else{
return 1;
}
}
}else{
return cid - cid2;
}
}
// serialization
public void write(DataOutput out) throws IOException {
out.writeInt(cid);
out.writeInt(oid);
out.writeInt(type);
out.writeUTF(customerInfo);
out.writeUTF(orderInfo);
}
// deserialization
public void readFields(DataInput in) throws IOException {
this.cid = in.readInt();
this.oid = in.readInt();
this.type = in.readInt();
this.customerInfo = in.readUTF();
this.orderInfo = in.readUTF();
}
public int getCid() {
return cid;
}
public void setCid(int cid) {
this.cid = cid;
}
public int getOid() {
return oid;
}
public void setOid(int oid) {
this.oid = oid;
}
public int getType() {
return type;
}
public void setType(int type) {
this.type = type;
}
public String getCustomerInfo() {
return customerInfo;
}
public void setCustomerInfo(String customerInfo) {
this.customerInfo = customerInfo;
}
public String getOrderInfo() {
return orderInfo;
}
public void setOrderInfo(String orderInfo) {
this.orderInfo = orderInfo;
}
}
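A minimal local check (a sketch; the class name ComboKeySerializationCheck and the sample values are assumptions, not part of the original post) that round-trips a composite key through write() and readFields() to confirm the two methods read and write the fields in the same order:
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
public class ComboKeySerializationCheck {
    public static void main(String[] args) throws Exception {
        CustomerOrdersComboKey before = new CustomerOrdersComboKey();
        before.setCid(3);
        before.setOid(6);
        before.setType(1);
        before.setOrderInfo("6,15.55,3");
        // serialize into a byte buffer
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        before.write(new DataOutputStream(bytes));
        // deserialize into a fresh key
        CustomerOrdersComboKey after = new CustomerOrdersComboKey();
        after.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        // expected output: 3 6 1 6,15.55,3
        System.out.println(after.getCid() + " " + after.getOid() + " " + after.getType() + " " + after.getOrderInfo());
    }
}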
The custom partitioner: the customer record and the order records that share a cid all go to the same partition.
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Partition by customer cid so that a customer's record and that customer's orders land in the same partition.
 */
public class PartionByCid extends Partitioner<CustomerOrdersComboKey,NullWritable> {
public int getPartition(CustomerOrdersComboKey customerOrdersComboKey, NullWritable nullWritable, int numPartitions) {
return customerOrdersComboKey.getCid() % numPartitions;
}
}
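The partition index returned by getPartition must be non-negative. With the sample data cid is always positive, so getCid() % numPartitions is enough; if negative ids were ever possible, a defensive variant (a sketch, the class name PartitionByCidSafe is an assumption) could mask the sign bit:
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
public class PartitionByCidSafe extends Partitioner<CustomerOrdersComboKey,NullWritable> {
    @Override
    public int getPartition(CustomerOrdersComboKey key, NullWritable value, int numPartitions) {
        // mask the sign bit so the result always falls in [0, numPartitions)
        return (key.getCid() & Integer.MAX_VALUE) % numPartitions;
    }
}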
The sort comparator for the composite key
It sorts by customer cid, type, and order oid by delegating to the composite key's compareTo method.
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Sort comparator for the composite key.
 * Passing true to super() makes WritableComparator create key instances and deserialize them before comparing.
 */
public class ComparatorOfComboKey extends WritableComparator {
protected ComparatorOfComboKey() {
super(CustomerOrdersComboKey.class,true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
CustomerOrdersComboKey key1 = (CustomerOrdersComboKey)a;
CustomerOrdersComboKey key2 = (CustomerOrdersComboKey)b;
return key1.compareTo(key2);
}
}
The grouping comparator on the reduce side
Composite keys with the same cid are handled by a single reduce function; unlike the sort comparator, it compares only the cid.
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Grouping comparator: groups composite keys by cid only.
 */
public class GroupComparator extends WritableComparator {
protected GroupComparator() {
super(CustomerOrdersComboKey.class,true);
}
// group by cid: records with the same customer id fall into the same group
@Override
public int compare(WritableComparable a, WritableComparable b) {
CustomerOrdersComboKey key1 = (CustomerOrdersComboKey)a;
CustomerOrdersComboKey key2 = (CustomerOrdersComboKey)b;
return key1.getCid() - key2.getCid();
}
}
The mapper
The map method reads one line at a time from either input file, builds a composite key object, and emits it to the reduce side. (Note that the class is itself named Mapper, which is why it extends Hadoop's Mapper by its fully qualified name.)
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
public class Mapper extends org.apache.hadoop.mapreduce.Mapper<LongWritable,Text,CustomerOrdersComboKey,NullWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
FileSplit fileSplit = (FileSplit)context.getInputSplit();
String filePath = fileSplit.getPath().toString();
CustomerOrdersComboKey keyOut = new CustomerOrdersComboKey();
// value is one line of the txt file
String line = value.toString();
if(filePath.contains("customer")){ // the line comes from the customer file
String cid = line.substring(0,line.indexOf(",")); // the customer cid is before the first comma
String customerInfo = line;
keyOut.setType(0);
keyOut.setCid(Integer.parseInt(cid));
keyOut.setCustomerInfo(customerInfo);
}else{
String cid = line.substring(line.lastIndexOf(",")+1); // the customer cid is after the last comma
String oid = line.substring(0,line.indexOf(",")); // the order oid is before the first comma
String orderInfo = line;
keyOut.setType(1);
keyOut.setCid(Integer.parseInt(cid));
keyOut.setOid(Integer.parseInt(oid));
keyOut.setOrderInfo(orderInfo);
}
context.write(keyOut,NullWritable.get());
}
}
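For example (a sketch of what the mapper emits, not output printed by the code): the customer line 1,tom1,12 becomes a key with cid=1, type=0, customerInfo="1,tom1,12"; the order line 2,12.48,1 becomes a key with cid=1, oid=2, type=1, orderInfo="2,12.48,1". The value written alongside every key is NullWritable.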
The reducer processes each group of keys and values sent to it
Each reduce method call handles one group; given the sort and grouping rules above, the first record is a customer's information and the records from the second to the last are that customer's orders. Note that Hadoop reuses the key object: as the values iterator advances, the framework deserializes the next record's composite key into that same key instance, which is why the loop below reads the order information from key rather than from the values.
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import java.io.IOException;
import java.util.Iterator;
public class Reducer extends org.apache.hadoop.mapreduce.Reducer<CustomerOrdersComboKey,NullWritable,Text,NullWritable> {
@Override
protected void reduce(CustomerOrdersComboKey key, Iterable<NullWritable> values, Context context)
throws IOException, InterruptedException {
Iterator<NullWritable> iterator = values.iterator();
iterator.next(); // consume the first value; key still holds the first record of the group
if(0 == key.getType()){ // the first record is the customer's information
String customerInfo = key.getCustomerInfo();
int cid = key.getCid();
while(iterator.hasNext()){ // the remaining records are this customer's orders
iterator.next(); // advancing the iterator also updates the fields of key
int cid_order = key.getCid();
if(cid_order == cid && 1 == key.getType()){
String orderInfo = key.getOrderInfo();
context.write(new Text(customerInfo +" ; "+ orderInfo),NullWritable.get());
}
}
}
}
}
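A sketch of one reduce() call on the sample data, for the cid=2 group: after the first iterator.next(), key holds cid=2, type=0, customerInfo="2,tom2,13"; the three following next() calls update key to the order records "3,12.23,2", "4,12.56,2" and "5,12.23,2", and each loop iteration writes one joined line such as 2,tom2,13 ; 3,12.23,2. The group for customer 4 contains only the customer record, so nothing is written for it.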
The driver class
Configuration for the whole job. It runs against the local file system (fs.defaultFS = file:///); the input directory /home/hadoop/join is expected to contain customer.txt and orders.txt, and the output directory /home/hadoop/join/out must not exist before the job starts.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Job driver / launch class
 */
public class App {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("fs.defaultFS","file:///");
Job job = Job.getInstance(conf);
job.setJobName("join operation");
job.setJarByClass(App.class);
job.setMapperClass(Mapper.class);
job.setReducerClass(Reducer.class);
job.setMapOutputKeyClass(CustomerOrdersComboKey.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(2);
job.setSortComparatorClass(ComparatorOfComboKey.class);
job.setGroupingComparatorClass(GroupComparator.class);
job.setPartitionerClass(PartionByCid.class);
FileInputFormat.setInputPaths(job,new Path("/home/hadoop/join"));
FileOutputFormat.setOutputPath(job,new Path("/home/hadoop/join/out"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
The expected result
part-r-00000 contains the customer with cid=2 joined with their orders (2 % 2 = 0).
part-r-00001 contains the customers with cid=1 and cid=3 joined with their orders (1 % 2 = 3 % 2 = 1).
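Given the sample data, the output should look roughly like this (a sketch; each line is customerInfo ; orderInfo, and customer 4 does not appear because it has no orders):
part-r-00000:
2,tom2,13 ; 3,12.23,2
2,tom2,13 ; 4,12.56,2
2,tom2,13 ; 5,12.23,2
part-r-00001:
1,tom1,12 ; 1,12.23,1
1,tom1,12 ; 2,12.48,1
3,tom3,14 ; 6,15.55,3
3,tom3,14 ; 7,16.23,3
3,tom3,14 ; 8,18.78,3
3,tom3,14 ; 9,20.23,3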