【Hadoop】42-MapReduce连接实例

最新推荐文章于 2022-06-01 18:39:20 发布

一直不懂

最新推荐文章于 2022-06-01 18:39:20 发布

阅读量215

点赞数

分类专栏： Hadoop权威指南

本文链接：https://blog.csdn.net/shenchaohao12321/article/details/82392501

版权

Hadoop权威指南专栏收录该内容

51 篇文章 6 订阅

订阅专栏

1、map端join算法实现

1.1、原理阐述

适用于关联表中有小表的情形；

可以将小表分发到所有的map节点，这样，map节点就可以在本地对自己所读到的大表数据进行join并输出最终结果，可以大大提高join操作的并发度，加快处理速度。

1.2、实现示例

先在mapper类中预先定义好小表，进行join。

引入实际场景中的解决方案：一次性从数据库加载小表，或者使用DistributedCache分发小表文件。

/**
 * Map-side join example built on the distributed cache.
 *
 * <p>A small lookup file ({@code b.txt}, lines of {@code itemId,name}) is shipped to every
 * map task via {@code job.addCacheFile}. Each mapper loads it into memory once and joins
 * every record of the large, tab-separated input against it, so no reducer is needed.
 */
public class TestDistributedCache {
	/**
	 * Mapper that joins each input record with the in-memory small table.
	 *
	 * <p>Input value: {@code itemId \t amount [...]}. Output: key = itemId,
	 * value = {@code amount \t :localpath \t name}.
	 */
	static class TestDistributedCacheMapper extends Mapper<LongWritable, Text, Text, Text>{
		// Small table loaded from the cache file: item id -> item name.
		HashMap<String,String> b_tab = new HashMap<String, String>();
		// Local path of the first cache file; only echoed into the output for verification.
		String localpath =null;

		/**
		 * Called once when the map task is initialised; loads the cached small table.
		 *
		 * @throws IOException if the cache file cannot be opened or read
		 */
		@Override
		protected void setup(Context context) throws IOException, InterruptedException {
			// Resolve the local absolute path of the cache file (for test/verification output only).
			// NOTE(review): getLocalCacheFiles() is marked deprecated in the Hadoop 2.x API docs — confirm
			// against the Hadoop version in use before relying on it.
			Path[] files = context.getLocalCacheFiles();
			if (files != null && files.length > 0) {
				localpath = files[0].toString();
			}

			// The cache file is read with plain local I/O through the name "b.txt"
			// in the task's working directory.
			BufferedReader reader = null;
			try {
				// Decode explicitly as UTF-8 instead of the platform default charset,
				// so non-ASCII names are read the same way on every node.
				reader = new BufferedReader(
						new java.io.InputStreamReader(new java.io.FileInputStream("b.txt"), "UTF-8"));
				String line;
				while ((line = reader.readLine()) != null) {
					String[] fields = line.split(",");
					if (fields.length < 2) {
						// Blank or malformed lookup line: nothing to join on.
						continue;
					}
					b_tab.put(fields[0], fields[1]);
				}
			} finally {
				// Always release the file handle, even if reading or parsing failed.
				IOUtils.closeStream(reader);
			}
		}

		/**
		 * Joins one record of the large input with the small table.
		 *
		 * <p>Records with fewer than two tab-separated fields are skipped and counted under
		 * counter group {@code MapJoin}, name {@code MALFORMED_RECORDS}. When the item id has
		 * no entry in the small table, the name part of the output is the text {@code null}.
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			// value is one line of the input split this map task is responsible for.
			String[] fields = value.toString().split("\t");
			if (fields.length < 2) {
				context.getCounter("MapJoin", "MALFORMED_RECORDS").increment(1);
				return;
			}
			String a_itemid = fields[0];
			String a_amount = fields[1];
			String b_name = b_tab.get(a_itemid);
			// Output layout: itemId <TAB> amount <TAB> :localpath <TAB> name
			context.write(new Text(a_itemid), new Text(a_amount + "\t" + ":" + localpath + "\t" + b_name));
		}
	}

	/**
	 * Configures and runs the map-only join job.
	 *
	 * @param args {@code args[0]} = input path of the large table, {@code args[1]} = output path
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 2) {
			System.err.println("Usage: TestDistributedCache <input path> <output path>");
			System.exit(2);
		}
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		job.setJarByClass(TestDistributedCache.class);
		job.setMapperClass(TestDistributedCacheMapper.class);
		// Declare the types the mapper actually emits (Text key, Text value).
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		// Location of the regular (large) input data and of the job output.
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		// The join is completed in the mapper, so no reducer is needed.
		job.setNumReduceTasks(0);
		// Distribute one file to the working directory of every task process.
		job.addCacheFile(new URI("hdfs://hadoop-server01:9000/cachefile/b.txt"));
		// Related APIs: job.addArchiveToClassPath(archive) distributes an archive,
		// job.addFileToClassPath(jarfile) puts a jar on the task classpath.

		// Report success or failure through the process exit code.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

2、reduce端join算法实现

2.1、需求

订单数据表t_order：

id	date	pid	amount
1001	20150710	P0001	2
1002	20150710	P0001	3
1002	20150710	P0002	3

商品信息表t_product

id	pname	category_id	price
P0001	小米5	1000	2
P0002	锤子T1	1000	3

假如数据量巨大，两表的数据是以文件的形式存储在HDFS中，需要用MapReduce程序来实现以下SQL查询运算：

select a.id, a.date, b.pname, b.category_id, b.price from t_order a join t_product b on a.pid = b.id

2.2、实现机制

将关联条件（pid）作为map输出的key，使两表中满足join条件的数据携带各自来源文件的信息，被发往同一个reduce task，然后在reduce中进行数据的串联。

public class RJoin {
	static class RJoinMapper extends Mapper<LongWritable, Text, Text, InfoBean> {
		InfoBean bean = new InfoBean();
		Text k = new Text();
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String line = value.toString();
			FileSplit inputSplit = (FileSplit) context.getInputSplit();
			String name = inputSplit.getPath().getName();
			// 通过文件名判断是哪种数据
			String pid = "";
			if (name.startsWith("order")) {
				String[] fields = line.split(",");
				// id date pid amount
				pid = fields[2];
				bean.set(Integer.parseInt(fields[0]), fields[1], pid, Integer.parseInt(fields[3]), "", 0, 0, "0");
			} else {
				String[] fields = line.split(",");
				// id pname category_id price
				pid = fields[0];
				bean.set(0, "", pid, 0, fields[1], Integer.parseInt(fields[2]), Float.parseFloat(fields[3]), "1");

			}
			k.set(pid);
			context.write(k, bean);
		}
	}

	static class RJoinReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable> {
		@Override
		protected void reduce(Text pid, Iterable<InfoBean> beans, Context context) throws IOException, InterruptedException {
			InfoBean pdBean = new InfoBean();
			ArrayList<InfoBean> orderBeans = new ArrayList<InfoBean>();
			for (InfoBean bean : beans) {
				if ("1".equals(bean.getFlag())) {
					try {
						BeanUtils.copyProperties(pdBean, bean);
					} catch (Exception e) {
						e.printStackTrace();
					}
				} else {
					InfoBean odbean = new InfoBean();
					try {
						BeanUtils.copyProperties(odbean, bean);
						orderBeans.add(odbean);
					} catch (Exception e) {
						e.printStackTrace();
					}
				}
			}
			// 拼接两类数据形成最终结果
			for (InfoBean bean : orderBeans) {
				bean.setPname(pdBean.getPname());
				bean.setCategory_id(pdBean.getCategory_id());
				bean.setPrice(pdBean.getPrice());
				context.write(bean, NullWritable.get());
			}

		}

	}

	public static void main(String[] args) throws Exception {
		args = new String[]{"D:/srcdata/wgj/", "D:/temp/out3"};
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		// 指定本程序的jar包所在的本地路径
		// job.setJarByClass(RJoin.class);
//		job.setJar("c:/join.jar");
		// 指定本业务job要使用的mapper/Reducer业务类
		job.setMapperClass(RJoinMapper.class);
		job.setReducerClass(RJoinReducer.class);
		// 指定mapper输出数据的kv类型
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(InfoBean.class);
		// 指定最终输出的数据的kv类型
		job.setOutputKeyClass(InfoBean.class);
		job.setOutputValueClass(NullWritable.class);
		// 指定job的输入原始文件所在目录
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		// 指定job的输出结果所在目录
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		// 将job中配置的相关参数，以及job所用的java类所在的jar包，提交给yarn去运行
		/* job.submit(); */
		boolean res = job.waitForCompletion(true);
		System.exit(res ? 0 : 1);
	}
}

/**
 * Writable record that carries either one order row or one product row through the
 * reduce-side join; {@link #getFlag()} tells which of the two it holds.
 *
 * <p>Getters and setters follow the {@code getXxx}/{@code setXxx} pattern on the raw field
 * names (for example {@code getCategory_id}); they are the accessors the join job calls
 * and that property-based bean copying relies on.
 */
public class InfoBean implements Writable {

	private int order_id;
	private String dateString;
	private String p_id;
	private int amount;
	private String pname;
	private int category_id;
	private float price;

	// flag=0: this object wraps an order-table record
	// flag=1: this object wraps a product-information record
	private String flag;

	/** No-argument constructor; all fields keep their Java default values. */
	public InfoBean() {
	}

	/** Assigns every field in one call. */
	public void set(int order_id, String dateString, String p_id, int amount, String pname, int category_id, float price, String flag) {
		this.order_id = order_id;
		this.dateString = dateString;
		this.p_id = p_id;
		this.amount = amount;
		this.pname = pname;
		this.category_id = category_id;
		this.price = price;
		this.flag = flag;
	}

	public int getOrder_id() {
		return order_id;
	}

	public void setOrder_id(int order_id) {
		this.order_id = order_id;
	}

	public String getDateString() {
		return dateString;
	}

	public void setDateString(String dateString) {
		this.dateString = dateString;
	}

	public String getP_id() {
		return p_id;
	}

	public void setP_id(String p_id) {
		this.p_id = p_id;
	}

	public int getAmount() {
		return amount;
	}

	public void setAmount(int amount) {
		this.amount = amount;
	}

	public String getPname() {
		return pname;
	}

	public void setPname(String pname) {
		this.pname = pname;
	}

	public int getCategory_id() {
		return category_id;
	}

	public void setCategory_id(int category_id) {
		this.category_id = category_id;
	}

	public float getPrice() {
		return price;
	}

	public void setPrice(float price) {
		this.price = price;
	}

	public String getFlag() {
		return flag;
	}

	public void setFlag(String flag) {
		this.flag = flag;
	}

	/**
	 * Serialises all fields in declaration order.
	 *
	 * <p>A {@code null} String field is written as the empty string, because
	 * {@code writeUTF} does not accept {@code null}; it is therefore read back as
	 * {@code ""} by {@link #readFields(DataInput)}.
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeInt(order_id);
		out.writeUTF(nullToEmpty(dateString));
		out.writeUTF(nullToEmpty(p_id));
		out.writeInt(amount);
		out.writeUTF(nullToEmpty(pname));
		out.writeInt(category_id);
		out.writeFloat(price);
		out.writeUTF(nullToEmpty(flag));
	}

	/** Restores all fields; the read order must match the order used in {@link #write(DataOutput)}. */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.order_id = in.readInt();
		this.dateString = in.readUTF();
		this.p_id = in.readUTF();
		this.amount = in.readInt();
		this.pname = in.readUTF();
		this.category_id = in.readInt();
		this.price = in.readFloat();
		this.flag = in.readUTF();
	}

	/** Renders all fields as {@code name=value} pairs separated by {@code ", "}. */
	@Override
	public String toString() {
		return "order_id=" + order_id + ", dateString=" + dateString + ", p_id=" + p_id + ", amount=" + amount + ", pname=" + pname + ", category_id=" + category_id + ", price=" + price + ", flag=" + flag;
	}

	/** Maps {@code null} to the empty string so the value can be passed to {@code writeUTF}. */
	private static String nullToEmpty(String value) {
		return value == null ? "" : value;
	}

}

2.3、缺点

这种方式中，join的操作是在reduce阶段完成，reduce端的处理压力太大，map节点的运算负载则很低，资源利用率不高，且在reduce阶段极易产生数据倾斜。

解决方案： map端join实现方式。

一直不懂

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
【Hadoop】42-MapReduce连接实例

1、map端join算法实现1.1、原理阐述适用于关联表中有小表的情形；可以将小表分发到所有的map节点，这样，map节点就可以在本地对自己所读到的大表数据进行join并输出最终结果，可以大大提高join操作的并发度，加快处理速度。1.2、实现示例先在mapper类中预先定义好小表，进行join。引入实际场景中的解决方案：一次加载数据库或者用distributedcache...
复制链接

扫一扫

专栏目录