join操作
左外连接(map)-JoinMapSideMR
问题描述:
将两个文件中每行的内容拼接到一个文件中
思路分析:
准备两个Mapper:FirstStepMapper和JoinMapper。FirstStepMapper负责读取文件内容(并过滤空行),JoinMapper负责拼接连接后的内容。先用两个Job分别运行FirstStepMapper处理两个输入文件,得到两份中间结果;再用第三个Job运行JoinMapper,把这两份中间结果按key连接并拼接成最终输出。
注:map端的连接操作不常用,推荐使用reduce端的连接操作。
/**
 * Map-side join driver that chains three MapReduce jobs.
 *
 * <p>Two "first step" jobs each read one input path, drop empty lines and pass the
 * remaining lines through an identity reducer. A third, map-only job joins the two
 * intermediate outputs with {@link CompositeInputFormat} and writes each key together
 * with its joined values separated by {@code |}.
 *
 * <p>Required configuration properties: {@code inpath1}, {@code inpath2} (inputs),
 * {@code mr1}, {@code mr2} (intermediate outputs) and {@code outpath} (final output).
 */
public class JoinMapSideMR extends Configured implements Tool {

    /**
     * Mapper of the two preparation jobs: emits every non-empty input line as the
     * output key, paired with a {@link NullWritable} value.
     */
    public static class FirstStepMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // A byte length of 0 means the line is empty; checking it avoids decoding
            // the whole line into a String just to compare it with "".
            if (value.getLength() > 0) {
                context.write(value, NullWritable.get());
            }
        }
    }

    /**
     * Mapper that reads the already-joined records: concatenates the values of the
     * tuple with {@code |} and emits them under the join key.
     */
    public static class JoinMapper extends Mapper<Text, TupleWritable, Text, Text> {
        @Override
        protected void map(Text key, TupleWritable value, Context context)
                throws IOException, InterruptedException {
            // toString() instead of a cast to Text, so an element of another Writable
            // type cannot trigger a ClassCastException.
            String joined = StreamSupport.stream(value.spliterator(), false)
                    .map(Object::toString)
                    .collect(Collectors.joining("|"));
            context.write(key, new Text(joined));
        }
    }

    /**
     * Builds the three jobs and runs them one after another, stopping at the first
     * failure.
     *
     * @param strings remaining command-line arguments (unused; paths come from the
     *                configuration)
     * @return 0 if all three jobs succeeded, 1 if any job failed
     * @throws IllegalArgumentException if a required path property is missing
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Path inpath1 = requiredPath(conf, "inpath1");
        Path inpath2 = requiredPath(conf, "inpath2");
        Path mr1 = requiredPath(conf, "mr1");
        Path mr2 = requiredPath(conf, "mr2");
        Path outpath = requiredPath(conf, "outpath");

        Job[] pipeline = {
            createFirstStepJob(conf, "first_step1_xj", inpath1, mr1),
            createFirstStepJob(conf, "first_step2_xj", inpath2, mr2),
            createJoinJob(conf, mr1, mr2, outpath)
        };

        for (Job job : pipeline) {
            if (!job.waitForCompletion(true)) {
                System.err.println(job.getJobName() + ":" + job.getJobState().getValue());
                // Later jobs depend on this one's output, so abort and report failure.
                return 1;
            }
        }
        return 0;
    }

    /** Reads a required path property, failing with a message that names the missing key. */
    private static Path requiredPath(Configuration conf, String key) {
        String value = conf.get(key);
        if (value == null || value.isEmpty()) {
            throw new IllegalArgumentException("Missing required configuration property: " + key);
        }
        return new Path(value);
    }

    /** Configures one preparation job that filters empty lines from input into gzip output. */
    private Job createFirstStepJob(Configuration conf, String name, Path input, Path output)
            throws IOException {
        Job job = Job.getInstance(conf, name);
        job.setJarByClass(this.getClass());
        job.setMapperClass(FirstStepMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // The base Reducer is an identity reducer; the job still goes through the shuffle.
        job.setReducerClass(Reducer.class);
        // CompositeInputFormat needs both sources partitioned identically. Lines are
        // partitioned as a whole here, so a single reducer keeps each source in one part.
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        TextInputFormat.addInputPath(job, input);
        TextOutputFormat.setOutputPath(job, output);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        // NOTE(review): the shuffle orders whole lines, while the join job keys on the
        // text before the first ','. These orders can differ if keys contain characters
        // that sort before ',' - confirm against the real input data.
        return job;
    }

    /** Configures the map-only job that joins both intermediate outputs on their key. */
    private Job createJoinJob(Configuration conf, Path left, Path right, Path output)
            throws IOException {
        Job job = Job.getInstance(conf, "map_join_xj");
        job.setJarByClass(this.getClass());
        job.setMapperClass(JoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // No reducers: the mapper output is the final output, so declare its types too.
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Each intermediate line is split into key and value at the first ','.
        job.getConfiguration().set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ",");
        // "inner": only keys present in both sources produce a joined record.
        String expr = CompositeInputFormat.compose("inner", KeyValueTextInputFormat.class, left, right);
        job.getConfiguration().set("mapreduce.join.expr", expr);
        job.setInputFormatClass(CompositeInputFormat.class);
        TextOutputFormat.setOutputPath(job, output);
        return job;
    }

    /**
     * Command-line entry point; the process exit code is the result of {@link #run}.
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new JoinMapSideMR(), args));
    }
}
左外连接(reduce)-JoinReduceSideMR
问题描述:
将两个文件中每行的内容拼接到一个文件中
思路分析:
准备好两个map,firstMapper和SecondMapper,两个map输出的key都是复合类型,包含id和tag。另外准备两个类,分别自定义分组规则和分区规则,都只根据id来分组和分区。因此,这两个map中id相同的输出结果会进入到同一个reduce中,最后在reduce中完成拼接操作。
复合类型-ArtistIDTag
public class ArtistIDTag implements WritableComparable<ArtistIDTag> {
private Text ArtistID = new Text(); // id
private IntWritable Tag = new IntWritable(); // 标记
public ArtistIDTag() {
}
public ArtistIDTag(Text artistID, IntWritable tag) {
this.ArtistID = new Text(artistID.toString());
this.Tag = new IntWritable(tag.get());
}
public Text getArtistID() {
return ArtistID;
}
public void setArtistID(Text artistID) {
this.ArtistID = new Text(artistID.toString());
}
public IntWritable getTag() {
return Tag;
}
public void setTag(IntWritable tag) {
this.Tag = new IntWritable(tag.get());
}
@Override
public int compareTo(ArtistIDTag o) {
return this.ArtistID.compareTo(o.ArtistID)==0 ? this.Tag.compareTo(o.Tag) : this.ArtistID.compareTo(o.ArtistI