MapReduce编程小案例.9th—join算法
数据:
有订单数据:
order001,u001
order002,u001
order003,u005
order004,u002
order005,u003
order006,u004
有用户数据:
u001,senge,18,angelababy
u002,laozhao,48,ruhua
u003,xiaoxu,16,chunge
u004,laoyang,28,zengge
u005,nana,14,huangbo
需求:要求把订单数据和用户数据中uid(用户id)相同的记录整合(join)起来
思路:
map端:
不管worker读到的是哪个文件,我们都可以在map端通过context拿到当前输入切片(FileSplit)所属的文件名,从而区分读到的是order数据还是user数据
对于order数据,map中切字段,封装为一个joinbean,打标记:t_order
对于user数据,map中切字段,封装为一个joinbean,打标记:t_user
然后,以uid作为key,以joinbean作为value返回
reduce端:
用迭代器迭代出一组相同uid的所有数据joinbean,然后判断
如果是标记字段为t_order的,则加入一个arraylist<JoinBean>中
如果标记字段为t_user的,则放入一个Joinbean对象中
然后,遍历arraylist,对里面的每一个JoinBean填充userBean中的user数据,然后输出这个joinBean即可
实现代码:
JoinBean实现类
package cn.edu360.mr.join;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class JoinBean implements Writable{
private String orderId;
private String userId;
private String userName;
private int userAge;
private String userFriend;
private String tableName;
public void set(String orderId, String userId, String userName, int userAge, String userFriend,String tableName) {
this.orderId = orderId;
this.userId = userId;
this.userName = userName;
this.userAge = userAge;
this.userFriend = userFriend;
this.tableName = tableName;
}
public String getTableName() {
return tableName;
}
public void setTableName(String tableName) {
this.tableName = tableName;
}
public String getOrderId() {
return orderId;
}
public void setOrderId(String orderId) {
this.orderId = orderId;
}
public String getUserId() {
return userId;
}
public void setUserId(String userId) {
this.userId = userId;
}
public String getUserName() {
return userName;
}
public void setUserName(String userName) {
this.userName = userName;
}
public int getUserAge() {
return userAge;
}
public void setUserAge(int userAge) {
this.userAge = userAge;
}
public String getUserFriend() {
return userFriend;
}
public void setUserFriend(String userFriend) {
this.userFriend = userFriend;
}
@Override
public String toString() {
return this.orderId +","+ this.userId + ","+this.userAge + ","+this.userName + ","+this.userFriend;
}
public void readFields(DataInput in) throws IOException {
this.orderId = in.readUTF();
this.userId = in.readUTF();
this.userName = in.readUTF();
this.userAge = in.readInt();
this.userFriend = in.readUTF();
this.tableName = in.readUTF();
}
public void write(DataOutput out) throws IOException {
out.writeUTF(this.orderId);
out.writeUTF(this.userId);
out.writeUTF(this.userName);
out.writeInt(this.userAge);
out.writeUTF(this.userFriend);
out.writeUTF(this.tableName);
}
}
ReduceSideJoin实现类
package cn.edu360.mr.join;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/*
* 本例是使用最low的方式实现
*
* 还可以利用Partitioner + CompareTo + GroupingComparator 高效实现
*/
public class ReduceSideJoin {
public static class ReduceSideJoinMapper extends Mapper<LongWritable, Text, Text, JoinBean>{
String fileName = null;
JoinBean bean = new JoinBean();
Text k = new Text();
/*
* maptask在做数据处理的时候,会先执行一次setup()
* 执行完之后才对每一行反复调用map()
*/
@Override
protected void setup(Mapper<LongWritable, Text, Text, JoinBean>.Context context)
throws IOException, InterruptedException {
FileSplit inputSplit =(FileSplit)context.getInputSplit();
fileName = inputSplit.getPath().getName();
}
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, JoinBean>.Context context)
throws IOException, InterruptedException {
String[] fields = value.toString().split(",");
if(fileName.startsWith("order")) {
bean.set(fields[0],fields[1],"NULL",-1,"NULL","order");
}else {
bean.set("NULL", fields[0], fields[1], Integer.parseInt(fields[2]), fields[3], "user");
}
k.set(bean.getUserId());
context.write(k, bean);
}
}
public static class ReduceSideJoinReducer extends Reducer<Text, JoinBean, JoinBean, NullWritable>{
@Override
protected void reduce(Text key, Iterable<JoinBean> beans,
Reducer<Text, JoinBean, JoinBean, NullWritable>.Context context) throws IOException, InterruptedException {
ArrayList<JoinBean> orderList = new ArrayList<JoinBean>();
JoinBean userBean = null;
try {
for (JoinBean bean : beans) {
if("order".equals(bean.getTableName())) {
JoinBean newBean = new JoinBean();
BeanUtils.copyProperties(newBean, bean);
orderList.add(newBean);
} else {
userBean =new JoinBean();
BeanUtils.copyProperties(userBean, bean);
}
}
//拼接数据
for (JoinBean bean : orderList) {
bean.setUserName(userBean.getUserName());
bean.setUserAge(userBean.getUserAge());
bean.setUserFriend(userBean.getUserFriend());
context.write(bean, NullWritable.get());
}
}catch (Exception e) {
e.printStackTrace();
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.setInt("order.top.n", 2);
Job job = Job.getInstance(conf);
job.setJarByClass(ReduceSideJoin.class);
job.setMapperClass(ReduceSideJoinMapper.class);
job.setReducerClass(ReduceSideJoinReducer.class);
job.setNumReduceTasks(2);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(JoinBean.class);
job.setOutputKeyClass(JoinBean.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\join\\input"));
FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\join\\out1"));
job.waitForCompletion(true);
}
}
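PS:以上代码是最简单但也最低效(最low)的实现:reduce端必须先把同一个uid下的所有订单JoinBean都缓存在内存的ArrayList中,才能与用户数据拼接,订单量大时内存开销很大。更高效的做法是上文注释中提到的 Partitioner + compareTo + GroupingComparator。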