MapReduce Algorithm - Semi-joins

In the relational world, a semi-join is a join between two tables that returns rows from the first table for which one or more matches exist in the second table. The difference between a semi-join and a conventional join is that each row in the first table is returned at most once: even if the second table contains two matches for a row in the first table, only one copy of that row is returned.
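
To make the "at most once" semantics concrete, here is a minimal in-memory sketch in plain Java (the class name and the table contents are made up purely for illustration):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class SemiJoinSketch {
    public static void main(String[] args) {
        // First table: (user, age) rows.
        List<String[]> users = Arrays.asList(
                new String[]{"anne", "22"},
                new String[]{"joe", "39"},
                new String[]{"alison", "35"});

        // Second table: join keys only; note "joe" matches twice.
        List<String> logs = Arrays.asList("joe", "joe", "anne");

        // Collapse the second table to its distinct join keys.
        Set<String> logKeys = new HashSet<String>(logs);

        // Semi-join: each first-table row is returned at most once,
        // regardless of how many matches the second table holds.
        List<String[]> result = new ArrayList<String[]>();
        for (String[] row : users) {
            if (logKeys.contains(row[0])) {
                result.add(row);
            }
        }

        for (String[] row : result) {
            System.out.println(Arrays.toString(row));
        }
        // Prints [anne, 22] and [joe, 39] once each; alison is dropped.
    }
}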


In MapReduce, if you are faced with the challenge of joining two large datasets together, the obvious choice is a repartition join, which leverages the full MapReduce framework to perform the join on the reduce side. In fact, this may be your only option if you cannot filter one of the datasets down to a size that can be cached on the map side. However, if you believe you can pare one dataset down to a manageable size, a semi-join will do the work: three MapReduce jobs that join the two datasets while avoiding the overhead of a reduce-side join. The technique is useful when you are working with two large datasets, but one of them can be reduced to a size that fits in a task's memory by filtering out the records that don't match the other dataset. The three jobs are: (1) produce the set of unique join keys in the large dataset; (2) use those keys, via a replicated join, to filter the small dataset down to only the records that will actually join; and (3) perform a replicated join of the large dataset against the filtered, now cacheable, small dataset.

The semi-join leverages the replicated (map-side) join introduced before (you can find the details in this blog's earlier post on replicated joins); a simplified sketch of the idea appears after the second job's code below. Without further ado, here is the code, which you can also find in Hadoop in Practice. The first job:

package com.manning.hip.ch4.joins.semijoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

/**
 * Job 1: scans the large dataset and emits the unique set of join keys.
 */
public class UniqueHashedKeyJob {
    public static void main(String... args) throws Exception {
        runJob(new Path(args[0]), new Path(args[1]));
    }

    public static void runJob(Path inputPath,
                              Path outputPath)
            throws Exception {

        Configuration conf = new Configuration();

        Job job = new Job(conf);

        job.setJarByClass(UniqueHashedKeyJob.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        outputPath.getFileSystem(conf).delete(outputPath, true);

        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        if (!job.waitForCompletion(true)) {
            throw new Exception("Job failed");
        }
    }

    /**
     * Collects the distinct keys seen by one map task and emits each
     * of them once in cleanup(). This assumes the distinct keys of a
     * single input split fit in the task's memory.
     */
    public static class Map extends Mapper<Text, Text, Text, NullWritable> {
        private Set<String> keys = new HashSet<String>();

        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            System.out.println("K[" + key + "]");
            keys.add(key.toString());
        }

        /** Emits each distinct key collected by this task exactly once. */
        @Override
        protected void cleanup(Context context)
                throws IOException, InterruptedException {
            Text outputKey = new Text();
            for (String key : keys) {
                System.out.println("OutK[" + key + "]");
                outputKey.set(key);
                context.write(outputKey, NullWritable.get());
            }
        }
    }

    /**
     * The shuffle groups identical keys together, so writing the key
     * once per group yields a globally unique key set.
     */
    public static class Reduce
            extends Reducer<Text, NullWritable, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values,
                              Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }
}
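
As a concrete illustration (hypothetical data, just to show the shape of the output): if the large user-logs file contained the tab-separated records joe/login, anne/login, joe/logout, the map tasks would collect {joe, anne} and the reducer would write the unique keys anne and joe, one per line.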

 The second one:

package com.manning.hip.ch4.joins.semijoin;

import com.manning.hip.ch4.joins.replicated.GenericReplicatedJoin;
import com.manning.hip.ch4.joins.replicated.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Job 2: a map-only replicated join that filters the small dataset,
 * keeping only records whose key appears in job 1's unique-key output
 * (distributed to every map task via the distributed cache).
 */
public class ReplicatedFilterJob extends GenericReplicatedJoin {
    public static void main(String... args) throws Exception {
        runJob(new Path(args[0]), new Path(args[1]), new Path(args[2]));
    }

    public static void runJob(Path usersPath,
                              Path uniqueUsersPath,
                              Path outputPath)
            throws Exception {

        Configuration conf = new Configuration();

        FileSystem fs = uniqueUsersPath.getFileSystem(conf);

        // Job 1's output may be a directory of part files; add each
        // part file (or the single file) to the distributed cache.
        FileStatus uniqueUserStatus = fs.getFileStatus(uniqueUsersPath);

        if (uniqueUserStatus.isDir()) {
            for (FileStatus f : fs.listStatus(uniqueUsersPath)) {
                if (f.getPath().getName().startsWith("part")) {
                    DistributedCache.addCacheFile(f.getPath().toUri(), conf);
                }
            }
        } else {
            DistributedCache.addCacheFile(uniqueUsersPath.toUri(), conf);
        }

        Job job = new Job(conf);

        job.setJarByClass(ReplicatedFilterJob.class);
        job.setMapperClass(ReplicatedFilterJob.class);

        job.setNumReduceTasks(0);

        job.setInputFormatClass(KeyValueTextInputFormat.class);

        outputPath.getFileSystem(conf).delete(outputPath, true);

        FileInputFormat.setInputPaths(job, usersPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        if (!job.waitForCompletion(true)) {
            throw new Exception("Job failed");
        }
    }

    /**
     * Returning the input-split record unchanged turns the replicated
     * join into a pure filter: a small-file record is emitted if and
     * only if its key has a match in the cached unique-key set.
     */
    @Override
    public Pair join(Pair inputSplitPair, Pair distCachePair) {
        return inputSplitPair;
    }
}
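
The GenericReplicatedJoin and Pair classes used above come from the book's replicated-join code and are not reproduced in this post. As promised earlier, here is a simplified sketch of the idea they implement, not the book's actual implementation: the small dataset is loaded from the distributed cache into a HashMap in setup(), and every input record is probed against it in map(). The class name and the tab-separated key/value layout are assumptions for illustration:

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;

public class ReplicatedJoinSketch
        extends Mapper<Text, Text, Text, Text> {

    // The small (cached) dataset, keyed by join key.
    private HashMap<String, String> cache = new HashMap<String, String>();

    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        // Load every distributed-cache file; each line is assumed
        // to be a tab-separated "key<TAB>value" record.
        Path[] files = DistributedCache.getLocalCacheFiles(
                context.getConfiguration());
        if (files == null) {
            return;
        }
        for (Path file : files) {
            BufferedReader reader =
                    new BufferedReader(new FileReader(file.toString()));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] parts = line.split("\t", 2);
                    cache.put(parts[0], parts.length > 1 ? parts[1] : "");
                }
            } finally {
                reader.close();
            }
        }
    }

    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        // Emit only records whose key has a match in the cached
        // dataset; this is the behavior that ReplicatedFilterJob
        // specializes by overriding join().
        String cachedValue = cache.get(key.toString());
        if (cachedValue != null) {
            context.write(key, new Text(value + "\t" + cachedValue));
        }
    }
}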

 And the final one:

package com.manning.hip.ch4.joins.semijoin;

import com.manning.hip.ch4.joins.replicated.GenericReplicatedJoin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Job 3: the final map-side join of the large dataset against the
 * filtered, now cacheable, small dataset produced by job 2.
 */
public class FinalJoinJob {
    public static void main(String... args) throws Exception {
        runJob(new Path(args[0]), new Path(args[1]), new Path(args[2]));
    }

    public static void runJob(Path userLogsPath,
                              Path usersPath,
                              Path outputPath)
            throws Exception {

        Configuration conf = new Configuration();

        FileSystem fs = usersPath.getFileSystem(conf);

        // Job 2's filtered output may be a directory of part files; add
        // each part file (or the single file) to the distributed cache.
        FileStatus usersStatus = fs.getFileStatus(usersPath);

        if (usersStatus.isDir()) {
            for (FileStatus f : fs.listStatus(usersPath)) {
                if (f.getPath().getName().startsWith("part")) {
                    DistributedCache.addCacheFile(f.getPath().toUri(), conf);
                }
            }
        } else {
            DistributedCache.addCacheFile(usersPath.toUri(), conf);
        }

        Job job = new Job(conf);

        job.setJarByClass(FinalJoinJob.class);
        // The stock replicated join emits the joined record for every
        // input-split record that matches a cached record.
        job.setMapperClass(GenericReplicatedJoin.class);

        job.setNumReduceTasks(0);

        job.setInputFormatClass(KeyValueTextInputFormat.class);

        outputPath.getFileSystem(conf).delete(outputPath, true);

        FileInputFormat.setInputPaths(job, userLogsPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        if (!job.waitForCompletion(true)) {
            throw new Exception("Job failed");
        }
    }
}

 The jobs driver class:

package com.manning.hip.ch4.joins.semijoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Driver that chains the three jobs of the semi-join together.
 */
public class Main {
    public static void main(String... args) throws Exception {
        runJob(new Path(args[0]), new Path(args[1]), new Path(args[2]));
    }

    public static void runJob(Path smallFilePath,
                              Path largeFilePath,
                              Path workPath)
            throws Exception {

        Configuration conf = new Configuration();

        // Start from a clean scratch directory for the three jobs' output.
        FileSystem fs = workPath.getFileSystem(conf);
        fs.delete(workPath, true);
        fs.mkdirs(workPath);


        //
        // JOB 1 - Produce unique keys from the large file
        //
        Path uniqueKeyOutputPath = new Path(workPath, "unique");
        UniqueHashedKeyJob.runJob(largeFilePath, uniqueKeyOutputPath);

        //
        // JOB 2 - Use the unique keys from the large file to
        //         retain the contents of the small file that
        //         match
        //
        Path filteredSmallOutputPath = new Path(workPath, "filtered");
        ReplicatedFilterJob.runJob(smallFilePath, uniqueKeyOutputPath,
                filteredSmallOutputPath);

        //
        // JOB 3 - The final join
        //
        Path resultOutputPath = new Path(workPath, "result");
        FinalJoinJob.runJob(largeFilePath, filteredSmallOutputPath, resultOutputPath);
    }

}

See the inline comments in the code above for the detailed usage.
