MapReduce Table Joins
- Map-side join: the join is performed before records reach the map function. It is far more efficient than a reduce-side join, because a reduce-side join pushes all of the data through the shuffle, which is expensive.
- Reduce-side join: more widely applicable than a map-side join, because the input needs no particular structure; the drawback is lower efficiency, since every record must go through the shuffle.
Reduce-side join
user.txt -> <userId, userName, cityId>
1 A 2
2 B 1
3 C 3
4 D 1
5 E 3
6 F 1
7 G 2
8 H 3
city.txt -> <cityId, cityName>
1 beijing
2 shanghai
3 wuhan
Approach
- In the map phase, read both files and tag every output record with a flag marking which file it came from.
- In the reduce function, separate the records by that flag.
- Join the records that share the same key and write out the result.
Steps
- Build a new bean -> UserCity.java
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// The bean needs the WritableComparable serialization interface; without it
// Hadoop cannot deserialize the value and throws a NullPointerException.
public class UserCity implements WritableComparable<UserCity> {
    private String userID = "";
    private String userName = "";
    private String cityID = "";
    private String cityName = "";
    // flag marks the source file of the record: 0 = city.txt, 1 = user.txt
    private int flag = 0;

    public UserCity() {
    }

    public UserCity(String userID, String userName, String cityID, String cityName, int flag) {
        this.userID = userID;
        this.userName = userName;
        this.cityID = cityID;
        this.cityName = cityName;
        this.flag = flag;
    }

    // Copy constructor. Hadoop reuses the value object while iterating in the
    // reducer, so any record kept beyond one iteration must be copied.
    public UserCity(UserCity uc) {
        this.userID = uc.getUserID();
        this.userName = uc.getUserName();
        this.cityID = uc.getCityID();
        this.cityName = uc.getCityName();
        this.flag = uc.getFlag();
    }

    public String getUserID() {
        return userID;
    }
    public void setUserID(String userID) {
        this.userID = userID;
    }
    public String getUserName() {
        return userName;
    }
    public void setUserName(String userName) {
        this.userName = userName;
    }
    public String getCityID() {
        return cityID;
    }
    public void setCityID(String cityID) {
        this.cityID = cityID;
    }
    public String getCityName() {
        return cityName;
    }
    public void setCityName(String cityName) {
        this.cityName = cityName;
    }
    public int getFlag() {
        return flag;
    }
    public void setFlag(int flag) {
        this.flag = flag;
    }

    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.userID);
        out.writeUTF(this.userName);
        out.writeUTF(this.cityID);
        out.writeUTF(this.cityName);
        out.writeInt(this.flag);
    }

    public void readFields(DataInput input) throws IOException {
        this.userID = input.readUTF();
        this.userName = input.readUTF();
        this.cityID = input.readUTF();
        this.cityName = input.readUTF();
        this.flag = input.readInt();
    }

    // UserCity is only used as a map output value, so its ordering never
    // matters; a constant satisfies the interface.
    public int compareTo(UserCity other) {
        return 0;
    }

    @Override
    public String toString() {
        return "userID=" + userID + ",userName=" + userName + ",cityName=" + cityName;
    }
}
- In the map phase, tell the two tables apart by the number of fields in each line, set the flag accordingly, and emit the attribute shared by both tables (the city id) as the key.
- In the reduce phase, group the records by flag and join them.
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class ReduceSideJoin extends Configured implements Tool {
    // map output: key = city id, value = UserCity
    public static class RSJMapper extends Mapper<LongWritable, Text, IntWritable, UserCity> {
        private UserCity user = null;
        private IntWritable outkey = new IntWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] splits = line.split(" ");
            // decide which file the line came from: 2 fields = city, 3 fields = user
            if (splits.length == 2) {
                // city record
                user = new UserCity();
                user.setCityID(splits[0]);
                user.setCityName(splits[1]);
                user.setFlag(0);
                // the attribute shared by both tables (city id) becomes the key
                outkey.set(Integer.parseInt(splits[0]));
                context.write(outkey, user);
            } else if (splits.length == 3) {
                // user record
                user = new UserCity();
                user.setUserID(splits[0]);
                user.setUserName(splits[1]);
                user.setCityID(splits[2]);
                user.setFlag(1);
                // the attribute shared by both tables (city id) becomes the key
                outkey.set(Integer.parseInt(splits[2]));
                context.write(outkey, user);
            }
        }
    }

    /**
     * Merges the two data sets. The map phase emits <cityId, <userId, userName>>
     * and <cityId, cityName>; after the shuffle this becomes
     * <cityId, list(<userId, userName>, <cityName>)>, which is effectively a
     * join of the two data sets. The map output is the reducer's input.
     */
    public static class RSJReducer extends Reducer<IntWritable, UserCity, IntWritable, Text> {
        private List<UserCity> userCities = new ArrayList<UserCity>();
        private UserCity user = null;
        private Text outValue = new Text();

        @Override
        protected void reduce(IntWritable key, Iterable<UserCity> values,
                Context context) throws IOException, InterruptedException {
            // separate the records by flag; reset per-group state first
            userCities.clear();
            user = null;
            // values holds one city record plus any number of user records
            for (UserCity uc : values) {
                if (uc.getFlag() == 0) {
                    // the single city record
                    user = new UserCity(uc);
                } else if (uc.getFlag() == 1) {
                    // a user record; copy it, since Hadoop reuses the object
                    userCities.add(new UserCity(uc));
                }
            }
            if (user == null) {
                return; // no matching city record for this key
            }
            // walk the user records and fill in the city name
            for (UserCity u : userCities) {
                u.setCityName(user.getCityName());
                outValue.set(u.toString());
                context.write(key, outValue);
            }
        }
    }

    // driver: job configuration
    public int run(String[] args) throws Exception {
        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        job.setJarByClass(ReduceSideJoin.class);
        // input
        Path inpath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inpath);
        // output: delete the path first if it already exists
        Path outpath = new Path(args[1]);
        FileSystem fs = FileSystem.get(new URI(outpath.toString()), conf);
        if (fs.exists(outpath)) {
            fs.delete(outpath, true);
        }
        FileOutputFormat.setOutputPath(job, outpath);
        // map settings
        job.setMapperClass(RSJMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(UserCity.class);
        // reduce settings
        job.setReducerClass(RSJReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        // submit the job to YARN and wait for it to finish
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hard-coded input/output paths for the demo cluster
        args = new String[] {
                "hdfs://lee01.cniao5.com:8020/user/root/mapreduce/input",
                "hdfs://lee01.cniao5.com:8020/user/root/mapreduce/output"
        };
        int status = ToolRunner.run(conf, new ReduceSideJoin(), args);
        System.exit(status);
    }
}
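For the sample user.txt and city.txt above, the job should produce output along these lines (tab-separated; the ordering of users within one city depends on the order in which values reach the reducer, so it is not guaranteed):
1	userID=2,userName=B,cityName=beijing
1	userID=4,userName=D,cityName=beijing
1	userID=6,userName=F,cityName=beijing
2	userID=1,userName=A,cityName=shanghai
2	userID=7,userName=G,cityName=shanghai
3	userID=3,userName=C,cityName=wuhan
3	userID=5,userName=E,cityName=wuhan
3	userID=8,userName=H,cityName=wuhan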
Map-side join
file1.txt -> <custId, name, city, phone>
1 A beijing 110
2 B shanghai 119
3 C wuhan 120
4 D shanghai 119
5 E wuhan 110
6 F shanghai 119
7 G beijing 120
8 H wuhan 119
file2.txt -> <orderId, custId, expense>
1 2 200
2 3 500
3 1 300
5 6 400
7 3 200
9 2 500
Approach
- The smaller table is the one loaded into memory, because memory is a scarce resource (if both tables are very large, a map-side join is not appropriate). Of the two files to be joined, one stays in HDFS as the job input; the other is distributed to every map task's local cache, classically via DistributedCache.addCacheFile() (the Hadoop 2 equivalent is job.addCacheFile(), which the driver below uses).
- Read the cached file in the map function and perform the join there.
- Emit the joined result; the reducer only writes it out.
- The cache file must be registered before the job is submitted.
DistributedCache is a file-distribution utility designed to ease application development. It automatically ships read-only external files to every node and caches them locally, so that tasks can load them at run time.
DistributedCache usage steps (a sketch of the last step follows this list)
- Upload the file to HDFS (a text file, a compressed file, a jar, and so on).
- Register the file information through the relevant API.
- Before the task body runs, read the file with the ordinary file IO APIs.
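A minimal sketch of that last step, assuming the file was registered with job.addCacheFile() as in the driver further below. CacheReadingMapper is a hypothetical name, the parsing is left as a stub, and the symlink-by-base-name behavior is the Hadoop 2 localization default:
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper showing only the cache-file lookup; the actual join
// logic lives in MSJMapper below.
public abstract class CacheReadingMapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // URIs registered via job.addCacheFile() before submission
        URI[] cacheFiles = context.getCacheFiles();
        if (cacheFiles == null || cacheFiles.length == 0) {
            return; // nothing was cached; real code should treat this as an error
        }
        // In Hadoop 2 each cache file is symlinked into the task's working
        // directory under its base name, so plain local-file IO works here.
        String localName = new Path(cacheFiles[0].getPath()).getName();
        try (BufferedReader reader = new BufferedReader(new FileReader(localName))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // parse each customer record and keep it in an in-memory map
            }
        }
    }
}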
Steps
1. Build a new bean -> CustomerBean.java
// In-memory image of one file1.txt record: <custId, name, city (address), phone>
public class CustomerBean {
    private int custId;
    private String name;
    private String address;
    private String phone;

    public CustomerBean() {}

    public CustomerBean(int custId, String name, String address, String phone) {
        this.custId = custId;
        this.name = name;
        this.address = address;
        this.phone = phone;
    }
    public int getCustId() {
        return custId;
    }
    public String getName() {
        return name;
    }
    public String getAddress() {
        return address;
    }
    public String getPhone() {
        return phone;
    }
}
- Define a custom map output key -> CustOrderMapOutKey.java
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// Composite map output key: <custId, orderId>
public class CustOrderMapOutKey implements WritableComparable<CustOrderMapOutKey> {
    private int custId;
    private int orderId;

    public void set(int custId, int orderId) {
        this.custId = custId;
        this.orderId = orderId;
    }

    public int getCustId() {
        return custId;
    }

    public int getOrderId() {
        return orderId;
    }

    public void write(DataOutput out) throws IOException {
        out.writeInt(custId);
        out.writeInt(orderId);
    }

    public void readFields(DataInput in) throws IOException {
        custId = in.readInt();
        orderId = in.readInt();
    }

    // sort by custId first, then by orderId
    public int compareTo(CustOrderMapOutKey o) {
        int res = Integer.compare(custId, o.custId);
        return res == 0 ? Integer.compare(orderId, o.orderId) : res;
    }

    @Override
    public boolean equals(Object obj) {
        if (obj instanceof CustOrderMapOutKey) {
            CustOrderMapOutKey o = (CustOrderMapOutKey) obj;
            return custId == o.custId && orderId == o.orderId;
        } else {
            return false;
        }
    }

    // The default HashPartitioner routes keys by hashCode(), so equal keys
    // must produce equal hashes or they may land on different reducers.
    @Override
    public int hashCode() {
        return custId * 31 + orderId;
    }

    @Override
    public String toString() {
        return custId + "\t" + orderId;
    }
}
- Load the small table into memory before map() runs (in setup()).
- The reducer writes the joined records out unchanged.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MapSideJoin extends Configured implements Tool {
    private static final String CUSTOMER_LIST =
            "hdfs://lee01.cniao5.com:8020/user/root/mapreduce/user/file1";

    // map output: key = <custId, orderId>, value = joined record text
    public static class MSJMapper extends Mapper<LongWritable, Text, CustOrderMapOutKey, Text> {
        private static final Map<Integer, CustomerBean> CUSTOMER_MAP =
                new HashMap<Integer, CustomerBean>();
        private final Text outputValue = new Text();
        private final CustOrderMapOutKey outputKey = new CustOrderMapOutKey();

        // setup() runs once per task before map(), so the small table is
        // loaded into memory only once.
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Read the small table. Note: this opens CUSTOMER_LIST directly on
            // HDFS; reading the localized cache copy (see the sketch above)
            // would avoid the extra HDFS round-trip.
            FileSystem fs = FileSystem.get(URI.create(CUSTOMER_LIST), context.getConfiguration());
            FSDataInputStream fdis = fs.open(new Path(CUSTOMER_LIST));
            BufferedReader sr = new BufferedReader(new InputStreamReader(fdis));
            String line = null;
            String[] splits = null;
            while ((line = sr.readLine()) != null) {
                splits = line.split(" ");
                // validate the record: a complete line has exactly 4 fields
                if (splits.length != 4) {
                    continue;
                }
                // build the bean inside the loop, one per line
                CustomerBean cb = new CustomerBean(
                        Integer.parseInt(splits[0]), splits[1], splits[2], splits[3]);
                CUSTOMER_MAP.put(cb.getCustId(), cb);
            }
            sr.close();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] splits = line.split(" ");
            // look up the customer by the buyer id in the order record
            int custID = Integer.parseInt(splits[1]);
            CustomerBean customerBean = CUSTOMER_MAP.get(custID);
            if (customerBean == null) {
                return; // no matching customer: drop the order (inner join)
            }
            // value = expense + name + address + phone
            StringBuilder sb = new StringBuilder();
            sb.append(splits[2]).append(" ").append(customerBean.getName())
              .append(" ").append(customerBean.getAddress())
              .append(" ").append(customerBean.getPhone());
            outputValue.set(sb.toString());
            // key = <custId, orderId>
            outputKey.set(custID, Integer.parseInt(splits[0]));
            context.write(outputKey, outputValue);
        }
    }

    /**
     * file1 is first loaded into CustomerBean objects; the map phase then emits
     * <<custId, orderId>, <expense, name, address, phone>>. During shuffle all
     * values sharing one <custId, orderId> key are grouped into a list.
     */
    public static class MSJReducer extends Reducer<CustOrderMapOutKey, Text,
            CustOrderMapOutKey, Text> {
        @Override
        protected void reduce(CustOrderMapOutKey key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // pass the joined records through unchanged
            for (Text value : values) {
                context.write(key, value);
            }
        }
    }

    // driver: job configuration
    public int run(String[] args) throws Exception {
        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        job.setJarByClass(MapSideJoin.class);
        // input
        Path inpath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inpath);
        // output: delete the path first if it already exists
        Path outpath = new Path(args[1]);
        FileSystem fs = FileSystem.get(new URI(outpath.toString()), conf);
        if (fs.exists(outpath)) {
            fs.delete(outpath, true);
        }
        FileOutputFormat.setOutputPath(job, outpath);
        // register the small table before the job is submitted
        job.addCacheFile(URI.create(CUSTOMER_LIST));
        // map settings
        job.setMapperClass(MSJMapper.class);
        job.setMapOutputKeyClass(CustOrderMapOutKey.class);
        job.setMapOutputValueClass(Text.class);
        // reduce settings
        job.setReducerClass(MSJReducer.class);
        job.setOutputKeyClass(CustOrderMapOutKey.class);
        job.setOutputValueClass(Text.class);
        // submit the job to YARN and wait for it to finish
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hard-coded input/output paths for the demo cluster
        args = new String[] {
                "hdfs://lee01.cniao5.com:8020/user/root/mapreduce/city/file2",
                "hdfs://lee01.cniao5.com:8020/user/root/mapreduce/output" };
        int status = ToolRunner.run(conf, new MapSideJoin(), args);
        System.exit(status);
    }
}
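For the sample file1.txt and file2.txt above (and a single reducer, so the keys come out fully sorted by custId and then orderId via compareTo), the joined output should look roughly like this, tab-separated:
1	3	300 A beijing 110
2	1	200 B shanghai 119
2	9	500 B shanghai 119
3	2	500 C wuhan 120
3	7	200 C wuhan 120
6	5	400 F shanghai 119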