MapReduce练习之数据的连接

MapReduce练习之数据的连接

(所有文章均作为本人笔记使用,可以指点,但是不要喷我,谢谢。)
MapReduce的连接分为两种情况:
1.Map端连接:数据在进入Map端之前就已经做好了连接;对于Map端连接的多个数据集需要符合以下三个条件:
  1.要连接的多个数据集不能再分片;需要被压缩处理;
  2.要连接的多个数据集的文件个数要相同;需要对要连接的多个数据集进行预处理,在预处理的过程中对于每个数据集需要固定相同的Reduce个数;
  3.要连接的多个数据集内部要按照连接的键进行局部排序;(默认会自动进行局部排序)
2.Reduce端连接:主要是用到了二次排序和多输入。
本次主要是对Reduce端连接的练习。

0.数据准备

由于要进行连接,所以需要准备一个以上的数据,本次使用的数据。


artist.txt

u1	john
u2	tom
u3	kai
u4	marry

user_artist.txt

u1	50
u2	100
u3	90
u4	28

预期结果
u1 john 50
u2 tom 100
u3 kai 90
u4 marry 28

1.代码实现

复合键 UidFlag.java

package com.briup.bigdata.BD1805.hadoop.mr;


import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Objects;

/**
 * Composite key for the reduce-side join: the natural key {@code uid} plus an
 * integer {@code flag} that distinguishes which input file a record came from.
 * Sorting compares uid first and falls back to the flag, so within one uid
 * group the records from the lower-flag input always arrive first.
 */
public class UidFlag implements WritableComparable<UidFlag> {
    private Text uid = new Text();
    private IntWritable flag = new IntWritable();

    public Text getUid() {
        return uid;
    }

    /** Stores a defensive copy so later mutation of the argument cannot corrupt the key. */
    public void setUid(Text uid) {
        this.uid = new Text(uid.toString());
    }

    public IntWritable getFlag() {
        return flag;
    }

    /** Stores a defensive copy of the flag value. */
    public void setFlag(IntWritable flag) {
        this.flag = new IntWritable(flag.get());
    }

    /** Orders by uid first; ties are broken by the input flag (secondary sort). */
    @Override
    public int compareTo(UidFlag other) {
        int byUid = this.uid.compareTo(other.getUid());
        if (byUid != 0) {
            return byUid;
        }
        return this.flag.compareTo(other.getFlag());
    }

    /** Serializes uid then flag; must mirror the field order in {@link #readFields}. */
    @Override
    public void write(DataOutput out) throws IOException {
        uid.write(out);
        flag.write(out);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        uid.readFields(in);
        flag.readFields(in);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        UidFlag that = (UidFlag) o;
        return Objects.equals(uid, that.uid) && Objects.equals(flag, that.flag);
    }

    @Override
    public int hashCode() {
        return Objects.hash(uid, flag);
    }

    /** Only the uid is emitted; the flag is an internal sort aid, not output. */
    @Override
    public String toString() {
        return uid.toString();
    }
}

  该java类是二次排序需要的复合键,首次比较uid,当uid相同时按照自定义的标志位进行比较。


自定义分区器 UidFlagPartitioner.java

package com.briup.bigdata.BD1805.hadoop.mr;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Routes every record with the same uid to the same reducer, ignoring the
 * flag component of the composite key, so both join sides meet in one reduce call.
 */
public class UidFlagPartitioner extends Partitioner<UidFlag, Text>{

    @Override
    public int getPartition(UidFlag uidFlag, Text text, int numPartitions) {
        // Mask the sign bit instead of Math.abs: Math.abs(Integer.MIN_VALUE)
        // is still negative and would yield an illegal partition number.
        return (uidFlag.getUid().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

自定义分组器 UidFLagGroupComparator.java

package com.briup.bigdata.BD1805.hadoop.mr;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Grouping comparator: treats two composite keys as equal whenever their uids
 * match, so all records for one uid (from both inputs) share a single reduce call.
 */
public class UidFLagGroupComparator extends WritableComparator {
    public UidFLagGroupComparator() {
        // true -> let WritableComparator create UidFlag instances for deserialization.
        super(UidFlag.class,true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Only the natural key takes part in grouping; the flag is ignored here.
        Text left = ((UidFlag) a).getUid();
        Text right = ((UidFlag) b).getUid();
        return left.compareTo(right);
    }
}

自定义排序比较器 UidFlagSortComparator.java

package com.briup.bigdata.BD1805.hadoop.mr;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Sort comparator for the secondary sort. The inherited
 * {@code WritableComparator.compare} already delegates to
 * {@code UidFlag.compareTo} (uid first, then flag), so overriding it just to
 * call {@code super.compare} was dead code and has been removed.
 */
public class UidFlagSortComparator extends WritableComparator {
    public UidFlagSortComparator() {
        // true -> allocate UidFlag instances so keys can be deserialized for comparison.
        super(UidFlag.class,true);
    }
}

MR核心程序 ReduceJoinForUserArtist.java

package com.briup.bigdata.BD1805.hadoop.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

public class ReduceJoinForUserArtist extends Configured implements Tool {

    static class ReduceJoinForUserMapper extends Mapper<LongWritable, Text, UidFlag, Text>{
        private UidFlag key = new UidFlag();
        private Text value = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] str = value.toString().split("[\t]");
            this.key.setUid(new Text(str[0]));
            this.key.setFlag(new IntWritable(1));

            this.value.set(str[1]);

            context.write(this.key,this.value);
        }

    }

    static class ReduceJoinForArtistMapper extends Mapper<LongWritable, Text, UidFlag, Text>{
        private UidFlag key = new UidFlag();
        private Text value = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] str = value.toString().split("[\t]");
            this.key.setUid(new Text(str[0]));
            this.key.setFlag(new IntWritable(2));

            this.value.set(str[1]);

            context.write(this.key,this.value);
        }

    }

    static class ReducerJoinForUserArtistReducer extends Reducer<UidFlag, Text, Text, Text>{
        private Text key = new Text();
        private Text value = new Text();
        private StringBuilder sb = new StringBuilder();

        @Override
        protected void reduce(UidFlag key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            sb.setLength(0);
            values.forEach(
                    val->sb.append(val.toString()).append("\t")
            );
            this.key.set(key.getUid());
            this.value.set(sb.substring(0,sb.length()-1));

            context.write(this.key,this.value);
        }
    }


    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = this.getConf();
        Path in1 = new Path(conf.get("in1"));
        Path in2 = new Path(conf.get("in2"));
        Path out = new Path(conf.get("out"));

        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        job.setJarByClass(this.getClass());


        //多输入
        //第一个参数表示要执行的作业,第二个参数表示输入得路径,第三个参数表示输如文件类型,第四个参数表示要调用的Mapper类。
        MultipleInputs.addInputPath(job,in1,TextInputFormat.class,ReduceJoinForUserMapper.class);
        MultipleInputs.addInputPath(job,in2,TextInputFormat.class,ReduceJoinForArtistMapper.class);

        job.setMapOutputKeyClass(UidFlag.class);
        job.setMapOutputValueClass(Text.class);


        job.setReducerClass(ReducerJoinForUserArtistReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job,out);
	.   //由于使用了二次排序,所以千万不要忘了配置二次排序
        job.setPartitionerClass(UidFlagPartitioner.class);
        job.setGroupingComparatorClass(UidFLagGroupComparator.class);
        job.setSortComparatorClass(UidFlagSortComparator.class);

        return job.waitForCompletion(true)?0:1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new ReduceJoinForUserArtist(),args));
    }
}

2.运行结果

本次练习的结果

3.遇到的问题

无。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值