MapReduce练习之数据的连接
(所有文章均作为本人笔记使用,可以指点,但是不要喷我,谢谢。)
MapReduce的连接分为两种情况:
1.Map端连接:数据在进入Map端之前就已经做好了连接;对于Map端连接的多个数据集需要符合以下两个条件:
1.要连接的多个数据集不能再分片;需要被压缩处理;
2.要连接的多个数据集的文件个数要相同;需要对要连接的多个数据集进行预处理,在预处理的过程中对于每个数据集需要固定相同的Reduce个数;
3.要连接的多个数据集内部要按照连接的键进行局部排序;(默认会自动进行局部排序)
2.Reduce端连接:主要是用到了二次排序和多输入。
本次主要是对Reduce端连接的练习。
0.数据准备
由于要进行连接,所以需要准备一个以上的数据集,本次使用的数据如下。
artist.txt
u1 john
u2 tom
u3 kai
u4 marry
user_artist.txt
u1 50
u2 100
u3 90
u4 28
预期结果
u1 john 50
u2 tom 100
u3 kai 90
u4 marry 28
1.代码实现
复合键 UidFlag.java
package com.briup.bigdata.BD1805.hadoop.mr;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Objects;
/**
 * Composite key for the reduce-side join: primary sort on {@code uid},
 * secondary sort on {@code flag} (the flag marks which input file a record
 * came from, so records from different inputs for the same uid arrive at the
 * reducer in a deterministic order).
 */
public class UidFlag implements WritableComparable<UidFlag> {
    private Text uid = new Text();
    private IntWritable flag = new IntWritable();

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        UidFlag uidFlag = (UidFlag) o;
        return uid.equals(uidFlag.uid) && flag.equals(uidFlag.flag);
    }

    @Override
    public int hashCode() {
        return Objects.hash(uid, flag);
    }

    public Text getUid() {
        return uid;
    }

    /** Stores a defensive copy so later mutation of the argument cannot corrupt this key. */
    public void setUid(Text uid) {
        this.uid = new Text(uid.toString());
    }

    public IntWritable getFlag() {
        return flag;
    }

    /** Stores a defensive copy of the flag value. */
    public void setFlag(IntWritable flag) {
        this.flag = new IntWritable(flag.get());
    }

    /**
     * Compares by uid first; only when the uids are equal does the flag break
     * the tie. The flag comparison is short-circuited instead of being computed
     * unconditionally on every call.
     */
    @Override
    public int compareTo(UidFlag o) {
        int uidComp = this.uid.compareTo(o.uid);
        return uidComp != 0 ? uidComp : this.flag.compareTo(o.flag);
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        this.uid.write(dataOutput);
        this.flag.write(dataOutput);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.uid.readFields(dataInput);
        this.flag.readFields(dataInput);
    }

    /** Only the uid appears in output; the flag is an internal sort marker. */
    @Override
    public String toString() {
        return uid.toString();
    }
}
该Java类是二次排序需要的复合键:首先比较uid,当uid相同时再按照自定义的标志位进行比较。
自定义分区器 UidFlagPartitioner.java
package com.briup.bigdata.BD1805.hadoop.mr;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Partitions by uid only (ignoring the flag), so the records from both inputs
 * for the same uid are guaranteed to reach the same reducer.
 */
public class UidFlagPartitioner extends Partitioner<UidFlag, Text> {
    @Override
    public int getPartition(UidFlag uidFlag, Text text, int i) {
        // Standard Hadoop HashPartitioner idiom: masking the sign bit yields a
        // non-negative index for every possible hash code, without Math.abs.
        return (uidFlag.getUid().hashCode() & Integer.MAX_VALUE) % i;
    }
}
自定义分组器 UidFLagGroupComparator.java
package com.briup.bigdata.BD1805.hadoop.mr;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Grouping comparator: treats two composite keys as equal when their uids
 * match, ignoring the flag. This way the user name and the play count for one
 * uid are delivered to a single reduce() call.
 */
public class UidFLagGroupComparator extends WritableComparator {
    /** Registers UidFlag and lets the parent deserialize key instances. */
    public UidFLagGroupComparator() {
        super(UidFlag.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Text left = ((UidFlag) a).getUid();
        Text right = ((UidFlag) b).getUid();
        return left.compareTo(right);
    }
}
自定义排序比较器 UidFlagSortComparator.java
package com.briup.bigdata.BD1805.hadoop.mr;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Sort comparator for the shuffle phase. Registering UidFlag with
 * createInstances=true makes {@link WritableComparator} deserialize the keys
 * and delegate to {@code UidFlag.compareTo} (uid first, then flag).
 * The previous {@code compare(a, b)} override merely called
 * {@code super.compare(a, b)}, so it has been removed as redundant.
 */
public class UidFlagSortComparator extends WritableComparator {
    public UidFlagSortComparator() {
        super(UidFlag.class, true);
    }
}
MR核心程序 ReduceJoinForUserArtist.java
package com.briup.bigdata.BD1805.hadoop.mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
public class ReduceJoinForUserArtist extends Configured implements Tool {
static class ReduceJoinForUserMapper extends Mapper<LongWritable, Text, UidFlag, Text>{
private UidFlag key = new UidFlag();
private Text value = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] str = value.toString().split("[\t]");
this.key.setUid(new Text(str[0]));
this.key.setFlag(new IntWritable(1));
this.value.set(str[1]);
context.write(this.key,this.value);
}
}
static class ReduceJoinForArtistMapper extends Mapper<LongWritable, Text, UidFlag, Text>{
private UidFlag key = new UidFlag();
private Text value = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] str = value.toString().split("[\t]");
this.key.setUid(new Text(str[0]));
this.key.setFlag(new IntWritable(2));
this.value.set(str[1]);
context.write(this.key,this.value);
}
}
static class ReducerJoinForUserArtistReducer extends Reducer<UidFlag, Text, Text, Text>{
private Text key = new Text();
private Text value = new Text();
private StringBuilder sb = new StringBuilder();
@Override
protected void reduce(UidFlag key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
sb.setLength(0);
values.forEach(
val->sb.append(val.toString()).append("\t")
);
this.key.set(key.getUid());
this.value.set(sb.substring(0,sb.length()-1));
context.write(this.key,this.value);
}
}
@Override
public int run(String[] strings) throws Exception {
Configuration conf = this.getConf();
Path in1 = new Path(conf.get("in1"));
Path in2 = new Path(conf.get("in2"));
Path out = new Path(conf.get("out"));
Job job = Job.getInstance(conf, this.getClass().getSimpleName());
job.setJarByClass(this.getClass());
//多输入
//第一个参数表示要执行的作业,第二个参数表示输入得路径,第三个参数表示输如文件类型,第四个参数表示要调用的Mapper类。
MultipleInputs.addInputPath(job,in1,TextInputFormat.class,ReduceJoinForUserMapper.class);
MultipleInputs.addInputPath(job,in2,TextInputFormat.class,ReduceJoinForArtistMapper.class);
job.setMapOutputKeyClass(UidFlag.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(ReducerJoinForUserArtistReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileOutputFormat.setOutputPath(job,out);
. //由于使用了二次排序,所以千万不要忘了配置二次排序
job.setPartitionerClass(UidFlagPartitioner.class);
job.setGroupingComparatorClass(UidFLagGroupComparator.class);
job.setSortComparatorClass(UidFlagSortComparator.class);
return job.waitForCompletion(true)?0:1;
}
public static void main(String[] args) throws Exception {
System.exit(ToolRunner.run(new ReduceJoinForUserArtist(),args));
}
}
2.运行结果
3.遇到的问题
无。