MapReduce编程小案例.9th—join算法

最新推荐文章于 2024-07-20 01:01:27 发布

RobertDowneyLm

最新推荐文章于 2024-07-20 01:01:27 发布

阅读量2.5k

点赞数

分类专栏：干货教程、学习笔记　文章标签：MapReduce、大数据

本文链接：https://blog.csdn.net/RobertDowneyLm/article/details/80331177

版权

学习笔记同时被 2 个专栏收录

61 篇文章 0 订阅

订阅专栏

干货教程

57 篇文章 0 订阅

订阅专栏

本文通过一个MapReduce编程案例，介绍如何使用Reduce端Join算法将具有相同uid（用户ID）的订单数据和用户数据整合。在Map阶段，根据数据来源文件打上不同的表标记，并以uid为key输出。在Reduce阶段，遍历相同uid的一组数据，用其中的用户数据填充每一条订单数据，完成Join后输出。

摘要由CSDN通过智能技术生成

MapReduce编程小案例.9th—join算法

数据：

有订单数据：

order001,u001

order002,u001

order003,u005

order004,u002

order005,u003

order006,u004

有用户数据：

u001,senge,18,angelababy

u002,laozhao,48,ruhua

u003,xiaoxu,16,chunge

u004,laoyang,28,zengge

u005,nana,14,huangbo

需求：要求把订单数据和用户数据按相同的uid（用户ID）整合（join）起来

思路：

map端：

不管worker读到的是什么文件，我们的map方法中是可以通过context来区分的

对于order数据，map中切字段，封装为一个JoinBean，打标记：order（即代码中的tableName字段）

对于user数据，map中切字段，封装为一个JoinBean，打标记：user

然后，以uid作为key，以joinbean作为value返回

reduce端：

用迭代器迭代出一组相同uid的所有数据joinbean，然后判断

如果标记字段为order，则加入一个ArrayList<JoinBean>中

如果标记字段为user，则放入一个单独的JoinBean对象（userBean）中

然后，遍历arraylist，对里面的每一个JoinBean填充userBean中的user数据，然后输出这个joinBean即可

实现代码：

JoinBean实现类

package cn.edu360.mr.join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class JoinBean implements Writable{
	
	private String orderId;
	private String userId;
	private String userName;
	private int userAge;
	private String userFriend;
	private String tableName;
	
	
	
	
	public void set(String orderId, String userId, String userName, int userAge, String userFriend,String tableName) {

		this.orderId = orderId;
		this.userId = userId;
		this.userName = userName;
		this.userAge = userAge;
		this.userFriend = userFriend;
		this.tableName = tableName;
	}
	
	
	
	public String getTableName() {
		return tableName;
	}

	public void setTableName(String tableName) {
		this.tableName = tableName;
	}



	public String getOrderId() {
		return orderId;
	}
	public void setOrderId(String orderId) {
		this.orderId = orderId;
	}
	public String getUserId() {
		return userId;
	}
	public void setUserId(String userId) {
		this.userId = userId;
	}
	public String getUserName() {
		return userName;
	}
	public void setUserName(String userName) {
		this.userName = userName;
	}
	public int getUserAge() {
		return userAge;
	}
	public void setUserAge(int userAge) {
		this.userAge = userAge;
	}
	public String getUserFriend() {
		return userFriend;
	}
	public void setUserFriend(String userFriend) {
		this.userFriend = userFriend;
	}
	
	@Override
	public String toString() {
		return this.orderId +","+ this.userId + ","+this.userAge + ","+this.userName + ","+this.userFriend;
	}



	public void readFields(DataInput in) throws IOException {
		this.orderId = in.readUTF();
		this.userId = in.readUTF();
		this.userName = in.readUTF();
		this.userAge = in.readInt();
		this.userFriend = in.readUTF();
		this.tableName = in.readUTF();
		
		
	}



	public void write(DataOutput out) throws IOException {
		out.writeUTF(this.orderId);
		out.writeUTF(this.userId);
		out.writeUTF(this.userName);
		out.writeInt(this.userAge);
		out.writeUTF(this.userFriend);
		out.writeUTF(this.tableName);
		
	}

}

ReduceSideJoin实现类

package cn.edu360.mr.join;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/*
 * 本例是使用最low的方式实现
 * 
 * 还可以利用Partitioner + CompareTo + GroupingComparator 高效实现
 */

/**
 * Reduce-side inner join of order records with user records on the user id.
 *
 * <p>The mapper tags every input line with the table it came from and emits it
 * keyed by user id; the reducer then sees all records for one user id together
 * and fills each order with that user's fields.
 */
public class ReduceSideJoin {

	/** Table tag put on records read from an order file. */
	private static final String ORDER_TABLE = "order";
	/** Table tag put on records read from any other (user) file. */
	private static final String USER_TABLE = "user";
	/** Placeholder for String fields that the source table does not provide. */
	private static final String NO_VALUE = "NULL";
	/** Input directory used when no command-line argument is given. */
	private static final String DEFAULT_INPUT = "F:\\mrdata\\join\\input";
	/** Output directory used when no second command-line argument is given. */
	private static final String DEFAULT_OUTPUT = "F:\\mrdata\\join\\out1";

	/**
	 * Parses one comma-separated line into a {@link JoinBean}, tags it with
	 * its source table and emits it keyed by user id.
	 */
	public static class ReduceSideJoinMapper extends Mapper<LongWritable, Text, Text, JoinBean> {

		String fileName = null;
		// Reused for every record; context.write serializes the current state.
		JoinBean bean = new JoinBean();
		Text k = new Text();

		/**
		 * Runs once per map task before any call to map(): remembers the name
		 * of the file this task's split belongs to, which decides how each
		 * line is parsed.
		 */
		@Override
		protected void setup(Mapper<LongWritable, Text, Text, JoinBean>.Context context)
				throws IOException, InterruptedException {
			FileSplit inputSplit = (FileSplit) context.getInputSplit();
			fileName = inputSplit.getPath().getName();
		}

		/**
		 * Emits {@code (userId, bean)} for one input line.
		 *
		 * <p>Order lines are {@code orderId,userId}; user lines are
		 * {@code userId,name,age,friend}. Blank lines and lines with too few
		 * fields are skipped instead of failing the task.
		 *
		 * @throws NumberFormatException if the age column of a user line is
		 *         not an integer
		 */
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, JoinBean>.Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			if (line.trim().isEmpty()) {
				return;
			}

			String[] fields = line.split(",");
			if (fileName.startsWith("order")) {
				if (fields.length < 2) {
					return;
				}
				bean.set(fields[0], fields[1], NO_VALUE, -1, NO_VALUE, ORDER_TABLE);
			} else {
				if (fields.length < 4) {
					return;
				}
				bean.set(NO_VALUE, fields[0], fields[1], Integer.parseInt(fields[2]), fields[3], USER_TABLE);
			}

			k.set(bean.getUserId());
			context.write(k, bean);
		}

	}

	/**
	 * Joins all order records of one user id with that user's record and
	 * writes one output line per order.
	 */
	public static class ReduceSideJoinReducer extends Reducer<Text, JoinBean, JoinBean, NullWritable> {

		/**
		 * Buffers the orders of this user id, picks up the user record, then
		 * emits every order filled with the user's name, age and friend.
		 *
		 * <p>If the group contains no user record nothing is emitted (inner
		 * join) and the dropped orders are counted in the
		 * {@code ReduceSideJoin / ORDERS_WITHOUT_USER} counter. I/O and
		 * interruption errors from {@code context.write} propagate so the
		 * task fails visibly instead of losing output.
		 */
		@Override
		protected void reduce(Text key, Iterable<JoinBean> beans,
				Reducer<Text, JoinBean, JoinBean, NullWritable>.Context context) throws IOException, InterruptedException {
			ArrayList<JoinBean> orderList = new ArrayList<JoinBean>();
			JoinBean userBean = null;

			for (JoinBean bean : beans) {
				// Keep a private copy: the framework may hand back the same
				// JoinBean instance on every iteration.
				JoinBean copy = copyOf(bean);
				if (ORDER_TABLE.equals(bean.getTableName())) {
					orderList.add(copy);
				} else {
					userBean = copy;
				}
			}

			if (userBean == null) {
				// No user record for this id: inner join yields no rows.
				context.getCounter("ReduceSideJoin", "ORDERS_WITHOUT_USER").increment(orderList.size());
				return;
			}

			// Fill each order with the user's fields and emit it.
			for (JoinBean order : orderList) {
				order.setUserName(userBean.getUserName());
				order.setUserAge(userBean.getUserAge());
				order.setUserFriend(userBean.getUserFriend());

				context.write(order, NullWritable.get());
			}
		}

		/** Returns a new bean holding the same field values as {@code src}. */
		private static JoinBean copyOf(JoinBean src) {
			JoinBean dst = new JoinBean();
			dst.set(src.getOrderId(), src.getUserId(), src.getUserName(), src.getUserAge(),
					src.getUserFriend(), src.getTableName());
			return dst;
		}
	}

	/**
	 * Configures and runs the join job.
	 *
	 * @param args optional: {@code args[0]} input directory, {@code args[1]}
	 *             output directory; the built-in default paths are used for
	 *             whichever is missing
	 * @throws Exception if the job cannot be set up or submitted
	 */
	public static void main(String[] args) throws Exception {
		String inputDir = args.length > 0 ? args[0] : DEFAULT_INPUT;
		String outputDir = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(ReduceSideJoin.class);

		job.setMapperClass(ReduceSideJoinMapper.class);
		job.setReducerClass(ReduceSideJoinReducer.class);

		job.setNumReduceTasks(2);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(JoinBean.class);

		job.setOutputKeyClass(JoinBean.class);
		job.setOutputValueClass(NullWritable.class);

		FileInputFormat.setInputPaths(job, new Path(inputDir));
		FileOutputFormat.setOutputPath(job, new Path(outputDir));

		// Surface a failed job through the process exit status.
		if (!job.waitForCompletion(true)) {
			System.exit(1);
		}
	}

}

PS：以上代码是最简单、也最低效的实现方式：Reduce端需要把同一uid的所有订单数据都缓存在内存的ArrayList中，数据量大时内存开销很大。