Map-side join implementation
How it works
This approach applies when one of the tables being joined is small:
the small table can be shipped to every map node, so each map task joins the slice of the large table it reads locally
and emits the final result directly. This greatly increases the parallelism of the join and speeds up processing.
Implementation example
--First define the small table inside the mapper class and perform the join there
--In a real scenario, the solution is to load it once from a database, or to use the DistributedCache
Summary
Applicable scenario
One large table joined with one small table.
Implementation steps (a small worked example follows this list):
a. Stage the small table in a directory on HDFS.
b. In the driver's main method, distribute it to each map task's working directory with job.addCacheFile(); also set the number of reduce tasks to 0, since this is a map-only job.
c. In the mapper's setup method, read the small-table file into memory with the local file API.
d. In the map method, look up each input record in the in-memory table and append the matched fields.
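For illustration, here is a hypothetical pair of inputs and the joined output. The field layouts match the code below (an order line is id,date,pid,amount; a product line is pid,name); all file names and values are made up:

product.txt (small table):
p0001,xiaomi
p0002,apple

order.txt (large table):
1001,20200301,p0001,2
1002,20200302,p0002,3

Map output (order line, a tab, then the product name):
1001,20200301,p0001,2	xiaomi
1002,20200302,p0002,3	apple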
Code implementation
Mapper that caches the small table
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CacheMap extends Mapper<LongWritable, Text, Text, NullWritable> {
	// HashMap holding the cached small-table data: product id -> product name
	Map<String, String> pMap = new HashMap<>();
	Text k = new Text();

	@Override
	protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context)
			throws IOException, InterruptedException {
		// 1. Open the cached file. Because the driver distributes it with
		// job.addCacheFile(), it is available in the task's working directory
		// under its own name. (For a purely local test you could point this at
		// an absolute path instead, e.g. C:\\Users\\55454_000\\Desktop\\product.txt.)
		BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream("product.txt"), "UTF-8"));
		String line;
		// readLine() returns null at end of file; checking for null (rather than
		// for an empty string) also tolerates blank lines in the middle of the file
		while ((line = reader.readLine()) != null) {
			if (line.isEmpty()) {
				continue;
			}
			// Split the line: fields[0] = product id, fields[1] = product name
			String[] fields = line.split(",");
			// Cache the pair in the map
			pMap.put(fields[0], fields[1]);
		}
		// Close the stream
		reader.close();
	}

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// Get one line of the large (order) table
		String line = value.toString();
		// Split into fields
		String[] fields = line.split(",");
		// Order id
		String id = fields[0];
		// Product id
		String pid = fields[2];
		// Look up the product name in the cached small table
		String pName = pMap.get(pid);
		// join: append the product name to the order line
		k.set(line + "\t" + pName);
		// Emit the joined line as the key; the value is empty
		context.write(k, NullWritable.get());
	}
}
Driver (main class)
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {
	public static void main(String[] args)
			throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
		Configuration conf = new Configuration();
		// Compress the intermediate map output. These flags must be set before
		// Job.getInstance(conf), because the Job takes a copy of the Configuration.
		conf.setBoolean("mapreduce.map.output.compress", true);
		conf.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
		Job job = Job.getInstance(conf);
		job.setJarByClass(Driver.class);
		job.setMapperClass(CacheMap.class);
		// Distribute the small table to every map task's working directory.
		// Adjust the URI to where product.txt actually lives; on a cluster this
		// would normally be an hdfs:// path.
		job.addCacheFile(new URI("file:///C:/Users/55454_000/Desktop/product.txt"));
		// Map-only job: the join happens entirely on the map side
		job.setNumReduceTasks(0);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(NullWritable.class);
		// Compress the final job output as well
		FileOutputFormat.setCompressOutput(job, true);
		FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		boolean res = job.waitForCompletion(true);
		System.exit(res ? 0 : 1);
	}
}
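As a variation, the mapper can discover the cached file through the MapReduce API instead of hard-coding the file name. Below is a minimal sketch of an alternative setup method for CacheMap, assuming exactly one file was registered with job.addCacheFile() (fully qualified names are used so no extra imports are needed):

@Override
protected void setup(Context context) throws IOException, InterruptedException {
	// URIs of every file registered with job.addCacheFile() in the driver
	java.net.URI[] cacheFiles = context.getCacheFiles();
	// The cached file is localized into the task working directory under its
	// last path component (unless a #fragment in the URI renamed it)
	String localName = new org.apache.hadoop.fs.Path(cacheFiles[0]).getName();
	try (java.io.BufferedReader reader = new java.io.BufferedReader(
			new java.io.InputStreamReader(new java.io.FileInputStream(localName), "UTF-8"))) {
		String line;
		while ((line = reader.readLine()) != null) {
			if (!line.isEmpty()) {
				String[] fields = line.split(",");
				pMap.put(fields[0], fields[1]);
			}
		}
	}
}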
Reduce-side join implementation
Implementation
Custom data type
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class InfobeanWritable implements Writable {
	private int order_id; // order id
	private String date; // date
	private String pid; // product id
	private int amount; // order quantity
	private String name; // product name
	private String category_id; // product category
	private double price; // product price
	private String flag; // record tag: "0" = order table, "1" = product table

	// No-arg constructor, required so MapReduce can instantiate the bean via reflection
	public InfobeanWritable() {
	}

	// Convenience constructor
	public InfobeanWritable(int order_id, String date, String pid, int amount, String name, String category_id,
			double price, String flag) {
		this.setInfobeanWritable(order_id, date, pid, amount, name, category_id, price, flag);
	}

	@Override
	public String toString() {
		return "InfobeanWritable [order_id=" + order_id + ", date=" + date + ", pid=" + pid + ", amount=" + amount
				+ ", name=" + name + ", category_id=" + category_id + ", price=" + price + ", flag=" + flag + "]";
	}

	public void setInfobeanWritable(int order_id, String date, String pid, int amount, String name, String category_id,
			double price, String flag) {
		this.order_id = order_id;
		this.date = date;
		this.pid = pid;
		this.amount = amount;
		this.name = name;
		this.category_id = category_id;
		this.price = price;
		this.flag = flag;
	}

	public int getOrder_id() {
		return order_id;
	}

	public void setOrder_id(int order_id) {
		this.order_id = order_id;
	}

	public String getDate() {
		return date;
	}

	public void setDate(String date) {
		this.date = date;
	}

	public String getPid() {
		return pid;
	}

	public void setPid(String pid) {
		this.pid = pid;
	}

	public int getAmount() {
		return amount;
	}

	public void setAmount(int amount) {
		this.amount = amount;
	}

	public String getName() {
		return name;
	}

	public void setName(String name) {
		this.name = name;
	}

	public String getCategory_id() {
		return category_id;
	}

	public void setCategory_id(String category_id) {
		this.category_id = category_id;
	}

	public double getPrice() {
		return price;
	}

	public void setPrice(double price) {
		this.price = price;
	}

	public String getFlag() {
		return flag;
	}

	public void setFlag(String flag) {
		this.flag = flag;
	}

	// Deserialization: fields must be read in exactly the order write() produces them
	@Override
	public void readFields(DataInput in) throws IOException {
		this.order_id = in.readInt();
		this.date = in.readUTF();
		this.pid = in.readUTF();
		this.amount = in.readInt();
		this.name = in.readUTF();
		this.category_id = in.readUTF();
		this.price = in.readDouble();
		this.flag = in.readUTF();
	}

	// Serialization
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeInt(this.order_id);
		out.writeUTF(this.date);
		out.writeUTF(this.pid);
		out.writeInt(this.amount);
		out.writeUTF(this.name);
		out.writeUTF(this.category_id);
		out.writeDouble(this.price);
		out.writeUTF(this.flag);
	}
}
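Because readFields must consume fields in exactly the order write emits them, a quick round-trip check catches ordering mistakes early. Here is a minimal sketch using Hadoop's DataOutputBuffer/DataInputBuffer (the class name and sample values are made up):

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class InfobeanRoundTrip {
	public static void main(String[] args) throws Exception {
		InfobeanWritable in = new InfobeanWritable(1001, "20200301", "p0001", 2, "xiaomi", "c01", 1999.0, "0");
		// Serialize the bean into an in-memory buffer
		DataOutputBuffer out = new DataOutputBuffer();
		in.write(out);
		// Deserialize a fresh bean from the same bytes
		DataInputBuffer dib = new DataInputBuffer();
		dib.reset(out.getData(), out.getLength());
		InfobeanWritable copy = new InfobeanWritable();
		copy.readFields(dib);
		// The two printed lines should be identical
		System.out.println(in);
		System.out.println(copy);
	}
}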
Map side
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class MapJoin extends Mapper<LongWritable, Text, Text, InfobeanWritable> {
	Text outputkey = new Text();
	InfobeanWritable infobean = new InfobeanWritable();

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// 1. Get one line of input
		String line = value.toString();
		// 2. Get the name of the file this split belongs to
		FileSplit fileSplit = (FileSplit) context.getInputSplit();
		String fileName = fileSplit.getPath().getName();
		// Split into fields
		String[] fields = line.split(",");
		String pid;
		// Decide by file name: files starting with "order" belong to the order table
		if (fileName.startsWith("order")) {
			int order_id = Integer.valueOf(fields[0]);
			String date = fields[1];
			pid = fields[2];
			int amount = Integer.valueOf(fields[3]);
			// flag "0" marks an order record; the product fields stay empty
			infobean.setInfobeanWritable(order_id, date, pid, amount, "", "", 0, "0");
		} else {
			pid = fields[0];
			String name = fields[1];
			String category_id = fields[2];
			double price = Double.valueOf(fields[3]);
			// flag "1" marks a product record; the order fields stay empty
			infobean.setInfobeanWritable(0, "", pid, 0, name, category_id, price, "1");
		}
		// The product id is the join key, so matching records meet in one reduce call
		outputkey.set(pid);
		context.write(outputkey, infobean);
	}
}
Reduce side
import java.io.IOException;
import java.util.ArrayList;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class JoinReduce extends Reducer<Text, InfobeanWritable, InfobeanWritable, NullWritable> {
	@Override
	protected void reduce(Text key, Iterable<InfobeanWritable> values, Context context)
			throws IOException, InterruptedException {
		// All records sharing one pid: many orders plus (this code assumes)
		// exactly one product record
		ArrayList<InfobeanWritable> orderlist = new ArrayList<>();
		InfobeanWritable pdBean = new InfobeanWritable();
		for (InfobeanWritable value : values) {
			if ("1".equals(value.getFlag())) {
				// Product record: copy it out, because the framework reuses
				// the value object between iterations
				try {
					BeanUtils.copyProperties(pdBean, value);
				} catch (Exception e) {
					e.printStackTrace();
				}
			} else {
				// Order record: copy into a fresh bean and collect it
				InfobeanWritable odBean = new InfobeanWritable();
				try {
					BeanUtils.copyProperties(odBean, value);
				} catch (Exception e) {
					e.printStackTrace();
				}
				orderlist.add(odBean);
			}
		}
		// Fill each order with the product's fields and emit the joined record
		for (InfobeanWritable bean : orderlist) {
			bean.setName(pdBean.getName());
			bean.setCategory_id(pdBean.getCategory_id());
			bean.setPrice(pdBean.getPrice());
			context.write(bean, NullWritable.get());
		}
	}
}
Driver (main class)
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		job.setJarByClass(Driver.class);
		job.setMapperClass(MapJoin.class);
		job.setReducerClass(JoinReduce.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(InfobeanWritable.class);
		// The reducer emits (InfobeanWritable, NullWritable), so the output key
		// class must be InfobeanWritable, not Text
		job.setOutputKeyClass(InfobeanWritable.class);
		job.setOutputValueClass(NullWritable.class);
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		boolean res = job.waitForCompletion(true);
		System.exit(res ? 0 : 1);
	}
}
Data compression
Purpose
Compression effectively reduces disk usage and the IO bandwidth consumed.
Common compression formats
Format | Splittable | Decompression |
---|---|---|
gzip | No | No special handling needed |
Bzip2 | Yes | No special handling needed |
Snappy | No | No special handling needed |
Snappy characteristics
Snappy is not bundled and must be installed separately (e.g., before it can be used from Hive)
Snappy is the fastest of the three
When to use compression
Compression pays off when a job is not compute-intensive but transfers large volumes of data, i.e. when it is IO-bound rather than CPU-bound.
Stages where compression can be applied
1. Input stage: Hadoop selects the codec automatically from the input file's extension, so no extra code is normally needed.
2. Map output stage
// Enable map-side output compression in the driver class
config.setBoolean("mapreduce.map.output.compress", true);
// Set the compression codec
config.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
3. Reduce output stage
// Enable compression of the job output
FileOutputFormat.setCompressOutput(job, true);
// Set the output compression codec
FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
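Putting both stages together in one driver, here is a minimal sketch (the class name is illustrative; mapper/reducer setup is elided). Note that the map-output flags go on the Configuration before Job.getInstance(conf), because the Job takes a copy of the Configuration, while the output-stage settings go through the Job afterwards:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CompressionDriver {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Map output: set on the Configuration BEFORE the Job copies it
		conf.setBoolean("mapreduce.map.output.compress", true);
		conf.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
		Job job = Job.getInstance(conf);
		job.setJarByClass(CompressionDriver.class);
		// ... set the mapper/reducer and key/value classes here ...
		// Reduce output: set through the Job after it is created
		FileOutputFormat.setCompressOutput(job, true);
		FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}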