hadoop之join

最新推荐文章于 2023-08-29 14:45:29 发布

优雅程序员

最新推荐文章于 2023-08-29 14:45:29 发布

阅读量138

点赞数

文章标签： hadoop

本文链接：https://blog.csdn.net/weixin_45433525/article/details/105175589

版权

1.reduce端join

mapreduce的join过程：
reduce端的join 在reduce端完成join
订单数据表t_order： flag=0
id date pid amount
1001 20150710 P0001 2
1002 20150710 P0001 3
1003 20150710 P0002 3
Id:数据记录id
Date 日期
Pid 商品id
Amount 库存数量
商品信息表t_product flag=1
pid name category_id price
P0001 小米5 C01 2000
P0002 锤子T1 C01 3500
最终结果：
1001 20150710 P0001 2 小米5 C01 2000
1002 20150710 P0001 3 小米5 C01 2000
1003 20150710 P0002 3 锤子T1 C01 3500
思路：select * from a join b on a.pid=b.pid
核心：关联条件
想要在reduce端完成join，reduce端必须可以同时接收到两个表中的数据，并且数据要有区分标志
怎么保证？
保证在map端进行文件读取的时候一次性读取两个表的数据，并且需要对两个表的数据进行区分
将两个表放在同一个目录下
map端做的事情：发送数据的时候需要打标记
读取两个表中的数据进行切分发送
key：公共字段关联字段 pid
value：剩下的需要有标记标记数据的来源
reduce端：
接收过来，判断是来自于哪个表的数据进行拼接
补充：
/**
 * Drives one map task: calls setup once, then map for every record the
 * context yields, and finally cleanup, which runs even if map throws.
 *
 * @param context the task context that supplies the input records
 * @throws IOException          propagated from setup, map or cleanup
 * @throws InterruptedException propagated from setup, map or cleanup
 */
public void run(Context context) throws IOException, InterruptedException {
// Called once before the map task processes any record; one call per map task.
// setup usually initialises variables or resources for map, so that they are
// created once per task instead of once per record.
setup(context);
try {
// context.nextKeyValue() reports whether the input still has another line.
while (context.nextKeyValue()) {// invoked once per line
map(context.getCurrentKey(), context.getCurrentValue(), context);
}
} finally {
// Called once after the map task has finished; one call per map task.
// Handles the wrap-up work for map, e.g. closing resources.
cleanup(context);
}
}

缺陷：
1）reducetask的并行度问题 0.95*datanode节点的个数并行度不高性能不高
2）容器性能不提倡 reduce端接收的数据可能会很大
3）reducetask容易产生数据倾斜
假设我们设置多个reducetask 根据分区规则默认hash
key：关联条件 reducetask数据倾斜每个reducetask分工不均非常影响性能的
没有合理利用集群资源
在真实的生产中一定要尽量避免数据倾斜
最好的做法：将你的分区设计的足够完美难度比较大

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.examples.SecondarySort.Reduce;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


/**
 * Reduce-side join of the order table with the product table on the product id (pid).
 *
 * Both tables are read from the same input directory. The mapper keys every row by pid
 * and prefixes the remaining fields with a short tag naming the source table; the reducer
 * receives all rows of one pid together and appends the product fields to each order row.
 */
public class ReduceJoin {
	/** Tag prepended to values that come from the order file. */
	private static final String ORDER_TAG = "OR";
	/** Tag prepended to values that come from the product file. */
	private static final String PRODUCT_TAG = "PR";
	/** Both tables have four tab-separated columns. */
	private static final int FIELD_COUNT = 4;

	/**
	 * Emits (pid, tagged remaining fields) for every input row.
	 */
	static class MyMapper extends Mapper<LongWritable, Text, Text, Text>{
		// Name of the file behind the current split; decides which table a row belongs to.
		String filename="";

		/**
		 * Resolves the file name of the split once per map task so that map() does not
		 * have to look it up for every record.
		 */
		@Override
		protected void setup(Context context)
				throws IOException, InterruptedException {
			// One split corresponds to one map task.
			InputSplit inputSplit = context.getInputSplit();
			FileSplit fs=(FileSplit)inputSplit;
			filename= fs.getPath().getName();
		}

		// Reused output objects, so no new Text is allocated per record.
		Text k=new Text();
		Text v=new Text();

		/**
		 * Splits one tab-separated row and writes it keyed by pid.
		 *
		 * Rows from the file named "order" (id, date, pid, amount) are tagged "OR";
		 * rows from any other file are treated as product rows (pid, name, category_id,
		 * price) and tagged "PR". Lines with fewer than four fields (for example a
		 * trailing blank line) are skipped instead of failing the task.
		 *
		 * @param key     offset of the line in the file (unused)
		 * @param value   one line of the input file
		 * @param context sink for the tagged (pid, value) pair
		 */
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			String[] infos = value.toString().split("\t");
			if(infos.length<FIELD_COUNT){
				// Blank or truncated line: nothing to join, and indexing would throw.
				return;
			}
			if(filename.equals("order")){
				// 1001	20150710	P0001	2
				k.set(infos[2]);
				// Keep the tag short; it only marks the source table.
				v.set(ORDER_TAG+infos[0]+"\t"+infos[1]+"\t"+infos[3]);
			}else{
				// P0001	小米5	C01	2000
				k.set(infos[0]);
				v.set(PRODUCT_TAG+infos[1]+"\t"+infos[2]+"\t"+infos[3]);
			}
			context.write(k, v);
		}
	}

	/**
	 * Joins the rows of one pid. Output line: pid, order id, date, amount, name,
	 * category_id, price (tab-separated).
	 */
	static class MyReducer extends Reducer<Text, Text, Text, NullWritable>{
		// Reused output key.
		Text k=new Text();

		/**
		 * The relation is one product to many orders: the order rows are collected and
		 * the product fields are appended to each of them. Only the first product row
		 * seen for the pid is used. Nothing is written when either side is missing
		 * (inner join).
		 *
		 * @param key     the pid shared by all values
		 * @param values  tagged rows from both tables
		 * @param context sink for the joined rows
		 */
		@Override
		protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, NullWritable>.Context context)
				throws IOException, InterruptedException {
			List<String> orderList=new ArrayList<String>();
			// Only one product row is needed, so it is not buffered in a list.
			String product=null;
			for(Text v:values){
				String vv=v.toString();
				if(vv.startsWith(ORDER_TAG)){
					orderList.add(vv.substring(ORDER_TAG.length()));
				}else if(product==null){
					product=vv.substring(PRODUCT_TAG.length());
				}
			}
			if(product==null){
				// No product for this pid: nothing to join against.
				return;
			}
			String pid=key.toString();
			for(String ol:orderList){
				k.set(pid+"\t"+ol+"\t"+product);
				context.write(k, NullWritable.get());
			}
		}
	}

	/**
	 * Configures and runs the job: reads every file under /info and writes the joined
	 * rows to /reduce_join_1. Terminates the JVM with status 1 when the job fails so
	 * that calling scripts can detect the failure.
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException{
		Configuration conf=new Configuration();
		System.setProperty("HADOOP_USER_NAME", "hadoop");
		conf.set("fs.defaultFS", "hdfs://hadoop01:9000");

		Job job=Job.getInstance(conf);
		job.setJarByClass(ReduceJoin.class);

		job.setMapperClass(MyMapper.class);
		job.setReducerClass(MyReducer.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);

		FileInputFormat.addInputPath(job, new Path("/info"));
		FileOutputFormat.setOutputPath(job, new Path("/reduce_join_1"));

		if(!job.waitForCompletion(true)){
			System.exit(1);
		}
	}

}

2.map端join

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.collections.map.HashedMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


/**
 * Map-side join of the order table with the product table on the product id (pid).
 *
 * The product table is registered as a cache file and loaded into an in-memory map in
 * setup(); the mapper then reads only the order table and looks each pid up in that map,
 * so the job needs no reduce phase.
 */
public class MapJoin {
	/**
	 * Appends the cached product fields to every order row whose pid is known.
	 */
	static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable>{
		// key: pid    value: the remaining product fields (name, category_id, price)
		Map<String,String> map=new HashMap<String,String>();

		/**
		 * Loads the cached product file into {@link #map} before any record is mapped.
		 * The pid is pulled out as the key because it is the join column.
		 *
		 * @throws IOException if no cache file is available or it cannot be read
		 */
		@Override
		protected void setup(Context context)
				throws IOException, InterruptedException {
			// context.getLocalCacheFiles() returns the local paths of the cache files.
			Path[] cacheFiles = context.getLocalCacheFiles();
			if(cacheFiles==null || cacheFiles.length==0){
				throw new IOException("product table not found in the distributed cache");
			}
			String p=cacheFiles[0].toString();
			// Decode explicitly as UTF-8: the table contains Chinese product names and
			// FileReader would fall back to the platform default charset.
			BufferedReader br=new BufferedReader(new InputStreamReader(new FileInputStream(p), "UTF-8"));
			try{
				String line=null;
				while((line=br.readLine())!=null){
					// P0002	锤子T1	C01	3500
					String[] infos = line.split("\t");
					if(infos.length<4){
						// Blank or truncated line: skip instead of failing the task.
						continue;
					}
					map.put(infos[0], infos[1]+"\t"+infos[2]+"\t"+infos[3]);
				}
			}finally{
				// Always release the file handle, even when reading fails.
				br.close();
			}
		}

		// Reused output key.
		Text k=new Text();

		/**
		 * Joins one order row (id, date, pid, amount) with its product. Rows whose pid
		 * is not in the product table, and rows with fewer than three fields, produce
		 * no output.
		 *
		 * @param key     offset of the line in the file (unused)
		 * @param value   one line of the order file
		 * @param context sink for the joined row
		 */
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context)
				throws IOException, InterruptedException {
			String row=value.toString();
			String[] infos = row.split("\t");
			if(infos.length<3){
				return;
			}
			// 1002	20150710	P0001	3
			String product=map.get(infos[2]);
			if(product!=null){
				// The tab keeps the amount and the product name in separate columns.
				k.set(row+"\t"+product);
				context.write(k, NullWritable.get());
			}
		}

	}


	/**
	 * Configures and runs the map-only job: /info/product is shipped as a cache file,
	 * /info/order is the input and /map_join_1 the output. Terminates the JVM with
	 * status 1 when the job fails so that calling scripts can detect the failure.
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException{
		Configuration conf=new Configuration();
		System.setProperty("HADOOP_USER_NAME", "hadoop");
		conf.set("fs.defaultFS", "hdfs://hadoop01:9000");

		Job job=Job.getInstance(conf);
		job.setJarByClass(MapJoin.class);

		job.setMapperClass(MyMapper.class);

		// These are the types of the final output.
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		// Without this call the job would run one reduce task by default.
		job.setNumReduceTasks(0);

		// Register /info/product as a cache file for the tasks.
		job.addCacheFile(new URI("/info/product"));

		FileInputFormat.addInputPath(job, new Path("/info/order"));
		FileOutputFormat.setOutputPath(job, new Path("/map_join_1"));

		if(!job.waitForCompletion(true)){
			System.exit(1);
		}
	}

}

如果能够在map端就完成join的过程是不是就没有上面的问题了，map端的join过程
为了提升map端join性能，我们的做法是将小表的数据加载到每个运行maptask的内存中
如果小表被加载到了内存中，我们每次在map端只需要读取大表，当读取到大表中的每一行数据
可以直接和内存中的小表进行关联，那么这个时候我们仅仅需要map就可以完成join操作了
怎么把小表加载到内存中
job.addCacheFile(uri);将指定uri的文件加载到缓存中
map端怎么读取到缓存中的数据？
想要在java中使用缓存中的数据缓存中的数据必须封装到容器中
为了保证map函数中可以匹配到缓存中的数据，这个封装的过程应该是在map函数之前
setup中做
缓存加载的时候本地
加载缓存的时候拉取到本地的文件：
/home/hadoop/data/hadoopdata/nm-local-dir/filecache/10
只能打jar包运行
mapjoin的方式：大/小表
因为有一个表需要加载到内存中注定加载到内存中的表不能过大 256M
大表大表：
1）reducejoin 解决数据倾斜的问题合理设计分区很难做到
2）将其中一个大表进行切分切分成小表在执行大表*小表
优点：并行度高不存在数据倾斜问题运行效率高
优先选择mapjoin

优雅程序员

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
hadoop之join

1.reduce端joinmapreduce的join过程：reduce端的join 在reduce端完成join订单数据表t_order： flag=0id date pid amount1001 20150710 P0001 21002 20150710 P0001 31003 20150710 P0002 3Id:数据记录idDate 日期Pid 商品...
复制链接

扫一扫