数据集
user(id name)用户表
1 user1
2 user2
3 user3
4 user4
5 user5
6 user6
post(userid postid postname)帖子表
1 1 post1
1 2 post2
2 3 post3
4 4 post4
5 5 post5
8 6 post6
8 7 post7
8 8 post8
package com.test;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
/*
* mapreduce实现左连接、右连接、全连接、反连接
*/
public class UserAndPostJoinJob {
/**
 * Tagged join value emitted by both mappers: {@code type} marks the source
 * table ("U" = user record, "P" = post record) and {@code data} carries the
 * raw tab-separated input line.
 */
static class UserAndPostWritable implements Writable {

    private String type;
    private String data;

    /** No-arg constructor required by Hadoop for deserialization. */
    public UserAndPostWritable() {
    }

    public UserAndPostWritable(String type, String data) {
        this.type = type;
        this.data = data;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getData() {
        return data;
    }

    public void setData(String data) {
        this.data = data;
    }

    /** Serializes the tag first, then the payload line. */
    @Override
    public void write(DataOutput output) throws IOException {
        output.writeUTF(type);
        output.writeUTF(data);
    }

    /** Reads the two fields back in the same order {@link #write} emits them. */
    @Override
    public void readFields(DataInput input) throws IOException {
        this.type = input.readUTF();
        this.data = input.readUTF();
    }
}
static class UserMapper extends Mapper<LongWritable, Text, Text, UserAndPostWritable> {
protected void map(LongWritable key, Text value, Context context ) throws IOException, InterruptedException {
String[] arr = value.toString().split("\t");
Text userId = new Text(arr[0]);
context.write(userId, new UserAndPostWritable("U", value.toString()));
}
}
static class PostMapper extends Mapper<LongWritable, Text, Text, UserAndPostWritable> {
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{
String[] arr = value.toString().split("\t");
Text userId = new Text(arr[0]);
context.write(userId, new UserAndPostWritable("P",value.toString()));
}
}
static class PostReducer extends Reducer<Text, UserAndPostWritable, Text, Text> {
private List<Text> users = new ArrayList<Text>();
private List<Text> posts = new ArrayList<Text>();
private String joinType;
protected void setup(Context context) throws IOException,InterruptedException {
super.setup(context);
joinType = context.getConfiguration().get("joinType");
}
protected void reduce(Text key, Iterable<UserAndPostWritable> iterable, Context context) throws IOException, InterruptedException{
users.clear();
posts.clear();
for(UserAndPostWritable data : iterable) {
if (data.getType().equals("U")) {
users.add(new Text(data.getData()));
} else {
posts.add(new Text(data.getData()));
}
}
if("innerJoin".equals(joinType)) {
if(users.size() > 0 && posts.size() > 0){
for(Text user : users) {
for(Text post : posts) {
context.write(new Text(user),new Text(post));
}
}
}
}else if("leftOuterJoin".equals(joinType)) {
for(Text user : users) {
if(posts.size() > 0) {
for(Text post : posts) {
context.write(new Text(user), new Text(post));
}
} else {
context.write(new Text(user), new Text(" \t \t "));
}
}
}else if("rightOuterJoin".equals(joinType)) {
for(Text post : posts) {
if(users.size() > 0) {
for(Text user : users) {
context.write(new Text(user), new Text(post));
}
} else {
context.write(new Text(" \t "), new Text(post));
}
}
}else if("fullOuterJoin".equals(joinType)) {
if(users.size() > 0) {
for(Text user : users) {
if(posts.size() > 0) {
for(Text post : posts) {
context.write(new Text(user), new Text(post));
}
} else {
context.write(new Text(user), new Text(" \t \t "));
}
}
} else {
for(Text post : posts) {
if(users.size() > 0) {
for(Text user : users) {
context.write(new Text(user), new Text(post));
}
} else {
context.write(new Text(" \t "), new Text(post));
}
}
}
}else if("anti".equals(joinType)) {
if(users.size() == 0 ^ posts.size() == 0) {
for(Text user : users) {
context.write(new Text(user), new Text(" \t \t "));
}
for(Text post : posts) {
context.write(new Text(" \t "), new Text(post));
}
}
}
}
}
/**
 * Job driver.
 *
 * <p>Usage: {@code UserAndPostJoinJob <joinType> <userInput> <postInput> <output> [queue]}
 * where joinType is one of innerJoin, leftOuterJoin, rightOuterJoin,
 * fullOuterJoin, anti. The optional queue must match one of the known
 * queue names, otherwise it falls back to "hql".
 *
 * <p>Exits 0 on job success, 1 on job failure, 2 on bad arguments.
 */
public static void main(String[] args) throws Exception {
    if (args.length < 4) {
        System.err.println(
            "Usage: UserAndPostJoinJob <joinType> <userInput> <postInput> <output> [queue]");
        System.exit(2);
    }
    Configuration conf = new Configuration();
    String queue = "hql";
    if (args.length > 4) {
        // BUG FIX: the original evaluated the ternary to args[1] (the user
        // input path), so a valid queue name passed in args[4] was never used.
        queue = args[4].matches("hql|dstream|mapred|udw|user|common") ? args[4] : "hql";
    }
    String joinType = args[0];
    String userInputPath = args[1];
    String postInputPath = args[2];
    String outputPath = args[3];
    conf.set("mapreduce.job.queuename", queue);
    Job job = Job.getInstance(conf, "JoinTest");
    job.getConfiguration().set("joinType", joinType);
    // NOTE(review): "mapred.textoutputformat.separator" is the deprecated key;
    // Hadoop 2+ maps it to "mapreduce.output.textoutputformat.separator".
    job.getConfiguration().set("mapred.textoutputformat.separator", "\t");
    job.setJarByClass(UserAndPostJoinJob.class);
    // Each table gets its own mapper so records can be tagged by origin.
    MultipleInputs.addInputPath(job, new Path(userInputPath), TextInputFormat.class, UserMapper.class);
    MultipleInputs.addInputPath(job, new Path(postInputPath), TextInputFormat.class, PostMapper.class);
    job.setReducerClass(PostReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(UserAndPostWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    // Propagate job success/failure to the shell instead of always exiting 0.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}