Map-side join implementation
How it works
This approach applies when one of the tables being joined is small:
the small table can be shipped to every map node, so each map task joins the slice of the large table it reads locally
and emits the final result directly. This greatly increases the parallelism of the join and speeds up processing.
Implementation example
--First define the small table inside the mapper class and perform the join there
--In a real scenario, the solution is to load it once from a database, or to use the DistributedCache
Summary
Applicable scenario
One large table joined with one small table.
Implementation steps (a small worked example follows this list):
a. Stage the small table in a directory on HDFS.
b. In the driver's main method, distribute it to each map task's working directory with job.addCacheFile(); also set the number of reduce tasks to 0, since this is a map-only job.
c. In the mapper's setup method, read the small-table file into memory with the local file API.
d. In the map method, look up each input record in the in-memory table and append the matched fields.
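For illustration, here is a hypothetical pair of inputs and the joined output. The field layouts match the code below (an order line is id,date,pid,amount; a product line is pid,name); all file names and values are made up:

product.txt (small table):
p0001,xiaomi
p0002,apple

order.txt (large table):
1001,20200301,p0001,2
1002,20200302,p0002,3

Map output (order line, a tab, then the product name):
1001,20200301,p0001,2	xiaomi
1002,20200302,p0002,3	apple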
Code implementation
Mapper that caches the small table
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CacheMap extends Mapper<LongWritable, Text, Text, NullWritable> {
	// HashMap holding the cached small-table data: product id -> product name
	Map<String, String> pMap = new HashMap<>();
	Text k = new Text();

	@Override
	protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context)
			throws IOException, InterruptedException {
		// 1. Open the cached file. Because the driver distributes it with
		// job.addCacheFile(), it is available in the task's working directory
		// under its own name. (For a purely local test you could point this at
		// an absolute path instead, e.g. C:\\Users\\55454_000\\Desktop\\product.txt.)
		BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream("product.txt"), "UTF-8"));
		String line;
		// readLine() returns null at end of file; checking for null (rather than
		// for an empty string) also tolerates blank lines in the middle of the file
		while ((line = reader.readLine()) != null) {
			if (line.isEmpty()) {
				continue;
			}
			// Split the line: fields[0] = product id, fields[1] = product name
			String[] fields = line.split(",");
			// Cache the pair in the map
			pMap.put(fields[0], fields[1]);
		}
		// Close the stream
		reader.close();
	}

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// Get one line of the large (order) table
		String line = value.toString();
		// Split into fields
		String[] fields = line.split(",");
		// Order id
		String id = fields[0];
		// Product id
		String pid = fields[2];
		// Look up the product name in the cached small table
		String pName = pMap.get(pid);
		// join: append the product name to the order line
		k.set(line + "\t" + pName);
		// Emit the joined line as the key; the value is empty
		context.write(k, NullWritable.get());
	}
}
Driver (main class)
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {
	public static void main(String[] args)
			throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
		Configuration conf = new Configuration();
		// Compress the intermediate map output. These flags must be set before
		// Job.getInstance(conf), because the Job takes a copy of the Configuration.
		conf.setBoolean("mapreduce.map.output.compress", true);
		conf.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
		Job job = Job.getInstance(conf);
		job.setJarByClass(Driver.class);
		job.setMapperClass(CacheMap.class);
		// Distribute the small table to every map task's working directory.
		// Adjust the URI to where product.txt actually lives; on a cluster this
		// would normally be an hdfs:// path.
		job.addCacheFile(new URI("file:///C:/Users/55454_000/Desktop/product.txt"));
		// Map-only job: the join happens entirely on the map side
		job.setNumReduceTasks(0);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(NullWritable.class);
		// Compress the final job output as well
		FileOutputFormat.setCompressOutput(job, true);
		FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		boolean res = job.waitForCompletion(true);
		System.exit(res ? 0 : 1);
	}
}
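As a variation, the mapper can discover the cached file through the MapReduce API instead of hard-coding the file name. Below is a minimal sketch of an alternative setup method for CacheMap, assuming exactly one file was registered with job.addCacheFile() (fully qualified names are used so no extra imports are needed):

@Override
protected void setup(Context context) throws IOException, InterruptedException {
	// URIs of every file registered with job.addCacheFile() in the driver
	java.net.URI[] cacheFiles = context.getCacheFiles();
	// The cached file is localized into the task working directory under its
	// last path component (unless a #fragment in the URI renamed it)
	String localName = new org.apache.hadoop.fs.Path(cacheFiles[0]).getName();
	try (java.io.BufferedReader reader = new java.io.BufferedReader(
			new java.io.InputStreamReader(new java.io.FileInputStream(localName), "UTF-8"))) {
		String line;
		while ((line = reader.readLine()) != null) {
			if (!line.isEmpty()) {
				String[] fields = line.split(",");
				pMap.put(fields[0], fields[1]);
			}
		}
	}
}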
Reduce-side join implementation
Implementation
Custom data type
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class InfobeanWritable implements Writable {
	private int order_id; // order id
	private String date; // date
	private String pid; // product id
	private int amount; // order quantity
	private String name; // product name
	private String category_id; // product category
	private double price; // product price
	private String flag; // record tag: "0" = order table, "1" = product table

	// No-arg constructor, required so MapReduce can instantiate the bean via reflection
	public InfobeanWritable() {
	}

	// Convenience constructor
	public InfobeanWritable(int order_id, String date, String pid, int amount, String name, String category_id,
			double price, String flag) {
		this.setInfobeanWritable(order_id, date, pid, amount, name, category_id, price, flag);
	}

	@Override
	public String toString() {
		return "InfobeanWritable [order_id=" + order_id + ", date=" + date + ", pid=" + pid + ", amount=" + amount
				+ ", name=" + name + ", category_id=" + category_id + ", price=" + price + ", flag=" + flag + "]";
	}

	public void setInfobeanWritable(int order_id, String date, String pid, int amount, String name, String category_id,
			double price, String flag) {
		this.order_id = order_id;
		this.date = date;
		this.pid = pid;
		this.amount = amount;
		this.name = name;
		this.category_id = category_id;
		this.price = price;
		this.flag = flag;
	}

	public int getOrder_id() {
		return order_id;
	}

	public void setOrder_id(int order_id) {
		this.order_id = order_id;
	}

	public String getDate() {
		return date;
	}

	public void setDate(String date) {
		this.date = date;
	}

	public String getPid() {
		return pid;
	}

	public void setPid(String pid) {
		this.pid = pid;
	}

	public int getAmount() {
		return amount;
	}

	public void setAmount(int amount) {
		this.amount = amount;
	}

	public String getName() {
		return name;
	}

	public void setName(String name) {
		this.name = name;
	}

	public String getCategory_id() {
		return category_id;
	}

	public void setCategory_id(String category_id) {
		this.category_id = category_id;
	}

	public double getPrice() {
		return price;
	}

	public void setPrice(double price) {
		this.price = price;
	}

	public String getFlag() {
		return flag;
	}

	public void setFlag(String flag) {
		this.flag = flag;
	}

	// Deserialization: fields must be read in exactly the order write() produces them
	@Override
	public void readFields(DataInput in) throws IOException {
		this.order_id = in.readInt();
		this.date = in.readUTF();
		this.pid = in.readUTF();
		this.amount = in.readInt();
		this.name = in.readUTF();
		this.category_id = in.readUTF();
		this.price = in.readDouble();
		this.flag = in.readUTF();
	}

	// Serialization
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeInt(this.order_id);
		out.writeUTF(this.date);
		out.writeUTF(this.pid);
		out.writeInt(this.amount);
		out.writeUTF(this.name);
		out.writeUTF(this.category_id);
		out.writeDouble(this.price);
		out.writeUTF(this.flag);
	}
}
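Because readFields must consume fields in exactly the order write emits them, a quick round-trip check catches ordering mistakes early. Here is a minimal sketch using Hadoop's DataOutputBuffer/DataInputBuffer (the class name and sample values are made up):

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class InfobeanRoundTrip {
	public static void main(String[] args) throws Exception {
		InfobeanWritable in = new InfobeanWritable(1001, "20200301", "p0001", 2, "xiaomi", "c01", 1999.0, "0");
		// Serialize the bean into an in-memory buffer
		DataOutputBuffer out = new DataOutputBuffer();
		in.write(out);
		// Deserialize a fresh bean from the same bytes
		DataInputBuffer dib = new DataInputBuffer();
		dib.reset(out.getData(), out.getLength());
		InfobeanWritable copy = new InfobeanWritable();
		copy.readFields(dib);
		// The two printed lines should be identical
		System.out.println(in);
		System.out.println(copy);
	}
}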
Map side
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class MapJoin extends Mapper<LongWritable, Text, Text, InfobeanWritable> {
	Text outputkey = new Text();
	InfobeanWritable infobean = new InfobeanWritable();

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// 1. Get one line of input
		String line = value.toString();
		// 2. Get the name of the file this split belongs to
		FileSplit fileSplit = (FileSplit) context.getInputSplit();
		String fileName = fileSplit.getPath().getName();
		// Split into fields
		String[] fields = line.split(",");
		String pid;
		// Decide by file name: files starting with "order" belong to the order table
		if (fileName.startsWith("order")) {
			int order_id = Integer.valueOf(fields[0]);
			String date = fields[1];
			pid = fields[2];
			int amount = Integer.valueOf(fields[3]);
			// flag "0" marks an order record; the product fields stay empty
			infobean.setInfobeanWritable(order_id, date, pid, amount, "", "", 0, "0");
		} else {
			pid = fields[0];
			String name = fields[1];
			String category_id = fields[2];
			double price = Double.valueOf(fields[3]);
			// flag "1" marks a product record; the order fields stay empty
			infobean.setInfobeanWritable(0, "", pid, 0, name, category_id, price, "1");
		}
		// The product id is the join key, so matching records meet in one reduce call
		outputkey.set(pid);
		context.write(outputkey, infobean);
	}
}
Reduce side
import java.io.IOException;
import java.util.ArrayList;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class JoinReduce extends Reducer<Text, InfobeanWritable, InfobeanWritable, NullWritable> {
	@Override
	protected void reduce(Text key, Iterable<InfobeanWritable> values, Context context)
			throws IOException, InterruptedException {
		// All records sharing one pid: many orders plus (this code assumes)
		// exactly one product record
		ArrayList<InfobeanWritable> orderlist = new ArrayList<>();
		InfobeanWritable pdBean = new InfobeanWritable();
		for (InfobeanWritable value : values) {
			if ("1".equals(value.getFlag())) {
				// Product record: copy it out, because the framework reuses
				// the value object between iterations
				try {
					BeanUtils.copyProperties(pdBean, value);
				} catch (Exception e) {
					e.printStackTrace();
				}
			} else {
				// Order record: copy into a fresh bean and collect it
				InfobeanWritable odBean = new InfobeanWritable();
				try {
					BeanUtils.copyProperties(odBean, value);
				} catch (Exception e) {
					e.printStackTrace();
				}
				orderlist.add(odBean);
			}
		}
		// Fill each order with the product's fields and emit the joined record
		for (InfobeanWritable bean : orderlist) {
			bean.setName(pdBean.getName());
			bean.setCategory_id(pdBean.getCategory_id());
			bean.setPrice(pdBean.getPrice());
			context.write(bean, NullWritable.get());
		}
	}
}
Driver (main class)
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		job.setJarByClass(Driver.class);
		job.setMapperClass(MapJoin.class);
		job.setReducerClass(JoinReduce.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(InfobeanWritable.class);
		// The reducer emits (InfobeanWritable, NullWritable), so the output key
		// class must be InfobeanWritable, not Text
		job.setOutputKeyClass(InfobeanWritable.class);
		job.setOutputValueClass(NullWritable.class);
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		boolean res = job.waitForCompletion(true);
		System.exit(res ? 0 : 1);
	}
}
Data compression
Purpose
Compression effectively reduces disk usage and the IO bandwidth consumed.
Common compression formats
Format | Splittable | Decompression |
---|---|---|
gzip | No | No special handling needed |
Bzip2 | Yes | No special handling needed |
Snappy | No | No special handling needed |
Snappy characteristics
Snappy is not bundled and must be installed separately (e.g., before it can be used from Hive)
Snappy is the fastest of the three
When to use compression
Compression pays off when a job is not compute-intensive but transfers large volumes of data, i.e. when it is IO-bound rather than CPU-bound.
Stages where compression can be applied
1. Input stage: Hadoop selects the codec automatically from the input file's extension, so no extra code is normally needed.
2. Map output stage
// Enable map-side output compression in the driver class
config.setBoolean("mapreduce.map.output.compress", true);
// Set the compression codec
config.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
3. Reduce output stage
// Enable compression of the job output
FileOutputFormat.setCompressOutput(job, true);
// Set the output compression codec
FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
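Putting both stages together in one driver, here is a minimal sketch (the class name is illustrative; mapper/reducer setup is elided). Note that the map-output flags go on the Configuration before Job.getInstance(conf), because the Job takes a copy of the Configuration, while the output-stage settings go through the Job afterwards:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CompressionDriver {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Map output: set on the Configuration BEFORE the Job copies it
		conf.setBoolean("mapreduce.map.output.compress", true);
		conf.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
		Job job = Job.getInstance(conf);
		job.setJarByClass(CompressionDriver.class);
		// ... set the mapper/reducer and key/value classes here ...
		// Reduce output: set through the Job after it is created
		FileOutputFormat.setCompressOutput(job, true);
		FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}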