MapReduce练习之数据的连接
(所有文章均作为本人笔记使用,可以指点,但是不要喷我,谢谢。)
MapReduce的连接分为两种情况:
1.Map端连接:数据在进入Map端之前就已经做好了连接;对于Map端连接的多个数据集需要符合以下两个条件:
1.要连接的多个数据集不能再分片;需要被压缩处理;
2.要连接的多个数据集的文件个数要相同;需要对要连接的多个数据集进行预处理,在预处理的过程中对于每个数据集需要固定相同的Reduce个数;
3.要连接的多个数据集内部要按照连接的键进行局部排序;(默认会自动进行局部排序)
2.Reduce端连接:主要是用到了二次排序和多输入。
本次主要是对Reduce端连接的练习。
0.数据准备
由于要进行连接,所以需要准备一个以上的数据集,本次使用的数据如下。
artist.txt
u1 john
u2 tom
u3 kai
u4 marry
user_artist.txt
u1 50
u2 100
u3 90
u4 28
预期结果
u1 john 50
u2 tom 100
u3 kai 90
u4 marry 28
1.代码实现
复合键 UidFlag.java
package com.briup.bigdata.BD1805.hadoop.mr;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Objects;
/**
 * Composite key for the reduce-side join: primary sort on {@code uid},
 * secondary sort on {@code flag} (the flag marks which input file a record
 * came from, so records from different inputs for the same uid arrive at the
 * reducer in a deterministic order).
 */
public class UidFlag implements WritableComparable<UidFlag> {
    private Text uid = new Text();
    private IntWritable flag = new IntWritable();

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        UidFlag uidFlag = (UidFlag) o;
        return uid.equals(uidFlag.uid) && flag.equals(uidFlag.flag);
    }

    @Override
    public int hashCode() {
        return Objects.hash(uid, flag);
    }

    public Text getUid() {
        return uid;
    }

    /** Stores a defensive copy so later mutation of the argument cannot corrupt this key. */
    public void setUid(Text uid) {
        this.uid = new Text(uid.toString());
    }

    public IntWritable getFlag() {
        return flag;
    }

    /** Stores a defensive copy of the flag value. */
    public void setFlag(IntWritable flag) {
        this.flag = new IntWritable(flag.get());
    }

    /**
     * Compares by uid first; only when the uids are equal does the flag break
     * the tie. The flag comparison is short-circuited instead of being computed
     * unconditionally on every call.
     */
    @Override
    public int compareTo(UidFlag o) {
        int uidComp = this.uid.compareTo(o.uid);
        return uidComp != 0 ? uidComp : this.flag.compareTo(o.flag);
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        this.uid.write(dataOutput);
        this.flag.write(dataOutput);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.uid.readFields(dataInput);
        this.flag.readFields(dataInput);
    }

    /** Only the uid appears in output; the flag is an internal sort marker. */
    @Override
    public String toString() {
        return uid.toString();
    }
}
该Java类是二次排序需要的复合键:首先比较uid,当uid相同时再按照自定义的标志位进行比较。
自定义分区器 UidFlagPartitioner.java
package com.briup.bigdata.BD1805.hadoop.mr;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Partitions by uid only (ignoring the flag), so the records from both inputs
 * for the same uid are guaranteed to reach the same reducer.
 */
public class UidFlagPartitioner extends Partitioner<UidFlag, Text> {
    @Override
    public int getPartition(UidFlag uidFlag, Text text, int i) {
        // Standard Hadoop HashPartitioner idiom: masking the sign bit yields a
        // non-negative index for every possible hash code, without Math.abs.
        return (uidFlag.getUid().hashCode() & Integer.MAX_VALUE) % i;
    }
}
自定义分组器 UidFLagGroupComparator.java
package com.briup.bigdata.BD1805.hadoop.mr;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Grouping comparator: treats two composite keys as equal when their uids
 * match, ignoring the flag. This way the user name and the play count for one
 * uid are delivered to a single reduce() call.
 */
public class UidFLagGroupComparator extends WritableComparator {
    /** Registers UidFlag and lets the parent deserialize key instances. */
    public UidFLagGroupComparator() {
        super(UidFlag.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Text left = ((UidFlag) a).getUid();
        Text right = ((UidFlag) b).getUid();
        return left.compareTo(right);
    }
}
自定义排序比较器 UidFlagSortComparator.java
package com.briup.bigdata.BD1805.hadoop.mr;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Sort comparator for the shuffle phase. Registering UidFlag with
 * createInstances=true makes {@link WritableComparator} deserialize the keys
 * and delegate to {@code UidFlag.compareTo} (uid first, then flag).
 * The previous {@code compare(a, b)} override merely called
 * {@code super.compare(a, b)}, so it has been removed as redundant.
 */
public class UidFlagSortComparator extends WritableComparator {
    public UidFlagSortComparator() {
        super(UidFlag.class, true);
    }
}
MR核心程序 ReduceJoinForUserArtist.java
package com.briup.bigdata.BD1805.hadoop.mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
public class ReduceJoinForUserArtist extends Configured implements Tool {
static class ReduceJoinForUserMapper extends Mapper<LongWritable, Text, UidFlag, Text>{
private UidFlag key = new UidFlag();
private Text value = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] str = value.toString().split("[\t]");
this.key.setUid(new Text(str[0]));
this.key.setFlag(new IntWritable(1));
this.value.set(str[1]);
context.write(this.key,this.value);
}
}
static class ReduceJoinForArtistMapper extends Mapper<LongWritable, Text, UidFlag, Text>{
private UidFlag key = new UidFlag();
private Text value = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] str = value.toString().split("[\t]");
this.key.setUid(new Text(str[0]));
this.key.setFlag(new IntWritable(2));
this.value.set(str[1]);
context.write(this.key,this.value);
}
}
static class ReducerJoinForUserArtistReducer extends Reducer<UidFlag, Text, Text, Text>{
private Text key = new Text();
private Text value = new Text();
private StringBuilder sb = new StringBuilder();
@Override
protected void reduce(UidFlag key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
sb.setLength(0);
values.forEach(
val->sb.append(val.toString()).append("\t")
);
this.key.set(key.getUid());
this.value.set(sb.substring(0,sb.length()-1));
context.write(this.key,this.value);
}
}
@Override
public int run(String[] strings) throws Exception {
Configuration conf = this.getConf();
Path in1 = new Path(conf.get("in1"));
Path in2 = new Path(conf.get("in2"));
Path out = new Path(conf.get("out"));
Job job = Job.getInstance(conf, this.getClass().getSimpleName());
job.setJarByClass(this.getClass());
//多输入
//第一个参数表示要执行的作业,第二个参数表示输入得路径,第三个参数表示输如文件类型,第四个参数表示要调用的Mapper类。
MultipleInputs.addInputPath(job,in1,TextInputFormat.class,ReduceJoinForUserMapper.class);
MultipleInputs.addInputPath(job,in2,TextInputFormat.class,ReduceJoinForArtistMapper.class);
job.setMapOutputKeyClass(UidFlag.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(ReducerJoinForUserArtistReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileOutputFormat.setOutputPath(job,out);
. //由于使用了二次排序,所以千万不要忘了配置二次排序
job.setPartitionerClass(UidFlagPartitioner.class);
job.setGroupingComparatorClass(UidFLagGroupComparator.class);
job.setSortComparatorClass(UidFlagSortComparator.class);
return job.waitForCompletion(true)?0:1;
}
public static void main(String[] args) throws Exception {
System.exit(ToolRunner.run(new ReduceJoinForUserArtist(),args));
}
}
2.运行结果
3.遇到的问题
无。