1. Table Join Operations (Join)
- Reduce Join
(1) Drawback: it easily causes data skew. Once skew occurs, the data has to be partitioned more carefully according to the specific business scenario (see the Partitioner sketch after this example).
(2) Example:
Scenario: order.txt already stores a number of order records and product.txt stores product information; the task is to join the records of the two files.

```
# Fields in each file are separated by tabs
# Contents of order.txt
orderId pid amount
1001    01  2
# Contents of product.txt
pid pname
01  小米
# Contents of the file produced by joining order.txt and product.txt
orderId pname amount
1001    小米  2
```
The code is implemented as follows:
Add the dependencies:

```xml
<dependencies>
    <!-- JUnit test dependency -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
    </dependency>
    <!-- Logging dependency -->
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-slf4j-impl</artifactId>
        <version>2.12.0</version>
    </dependency>
    <!-- Hadoop dependency -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>3.1.3</version>
    </dependency>
</dependencies>
```
Create a configuration file named log4j2.xml under resources:

```xml
<?xml version="1.0" encoding="UTF-8"?>
<Configuration status="error" strict="true" name="XMLConfig">
    <Appenders>
        <!-- Type is Console; the name attribute is required -->
        <Appender type="Console" name="STDOUT">
            <!-- PatternLayout, producing output such as
                 [INFO] [2018-01-22 17:34:01][org.test.Console]I'm here -->
            <Layout type="PatternLayout"
                    pattern="[%p] [%d{yyyy-MM-dd HH:mm:ss}][%c{10}]%m%n" />
        </Appender>
    </Appenders>
    <Loggers>
        <!-- additivity is false -->
        <Logger name="test" level="info" additivity="false">
            <AppenderRef ref="STDOUT" />
        </Logger>
        <!-- Root logger configuration -->
        <Root level="info">
            <AppenderRef ref="STDOUT" />
        </Root>
    </Loggers>
</Configuration>
```
Define the ComplexItem wrapper class:

```java
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Wraps the fields of order.txt and product.txt in a single Writable.
 */
public class ComplexItem implements Writable {

    // Which file the data in this object came from
    private String from = "";
    private Integer orderId = 0;
    private String pid = "";
    private String pname = "";
    private Integer amount = 0;

    // No-arg constructor, required by the framework
    public ComplexItem() {
    }

    public Integer getOrderId() {
        return orderId;
    }

    public String getPid() {
        return pid;
    }

    public String getPname() {
        return pname;
    }

    public Integer getAmount() {
        return amount;
    }

    /**
     * Populates the fields that come from order.txt and marks the object as order data.
     * @param orderId orderId
     * @param pid pid
     * @param amount amount
     */
    public void setOrderFields(Integer orderId, String pid, Integer amount) {
        this.from = "order";
        this.orderId = orderId;
        this.pid = pid;
        this.amount = amount;
    }

    /**
     * Populates the fields that come from product.txt and marks the object as product data.
     * @param pid pid
     * @param pname pname
     */
    public void setProductFields(String pid, String pname) {
        this.from = "product";
        this.pid = pid;
        this.pname = pname;
    }

    /**
     * Checks whether the data in this object came from order.txt.
     * @return true if it came from order.txt, false otherwise
     */
    public boolean isFromOrder() {
        return "order".equalsIgnoreCase(from);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.from);
        out.writeInt(this.orderId);
        out.writeUTF(this.pid);
        out.writeUTF(this.pname);
        out.writeInt(this.amount);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.from = in.readUTF();
        this.orderId = in.readInt();
        this.pid = in.readUTF();
        this.pname = in.readUTF();
        this.amount = in.readInt();
    }

    @Override
    public String toString() {
        return this.orderId + "\t" + this.pname + "\t" + this.amount;
    }
}
```
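The only contract a Writable has to honor is that readFields reads the fields back in exactly the order, and with exactly the types, that write emits. A minimal round-trip check (the test class and expected output below are illustrative, not part of the original example) could look like this:

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class ComplexItemRoundTripTest {
    public static void main(String[] args) throws Exception {
        ComplexItem original = new ComplexItem();
        original.setOrderFields(1001, "01", 2);

        // Serialize with write(), just as the framework does during the shuffle
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance with readFields()
        ComplexItem copy = new ComplexItem();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        // toString() prints orderId, pname and amount; pname is still empty for order data
        System.out.println(copy);                // 1001\t\t2
        System.out.println(copy.isFromOrder());  // true
    }
}
```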
Mapper
```java
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class ComplexItemMapper extends Mapper<Object, Text, Text, ComplexItem> {

    // Name of the file that the split processed by this Mapper belongs to
    private String from;
    private Text keyOut = new Text();
    private ComplexItem valueOut = new ComplexItem();

    /**
     * Runs once at the start of the map phase for each split.
     * @param context context object
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the current split
        InputSplit split = context.getInputSplit();
        // Get the name of the file the split belongs to
        from = ((FileSplit) split).getPath().getName();
    }

    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");
        if (from.contains("order")) {
            valueOut.setOrderFields(Integer.parseInt(fields[0]), fields[1], Integer.parseInt(fields[2]));
        } else {
            valueOut.setProductFields(fields[0], fields[1]);
        }
        // The join key is the pid, so records with the same pid meet in one reduce call
        keyOut.set(valueOut.getPid());
        context.write(keyOut, valueOut);
    }
}
```
Reducer
```java
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class ComplexItemReducer extends Reducer<Text, ComplexItem, ComplexItem, NullWritable> {

    // Buffers the objects that came from order.txt within the current key group
    private List<ComplexItem> orderList = new ArrayList<>();
    private NullWritable valueOut = NullWritable.get();

    @Override
    protected void reduce(Text key, Iterable<ComplexItem> values, Context context) throws IOException, InterruptedException {
        ComplexItem product = new ComplexItem();
        for (ComplexItem value : values) {
            if (value.isFromOrder()) {
                // The framework reuses the value object, so copy it before buffering
                ComplexItem order = new ComplexItem();
                order.setOrderFields(value.getOrderId(), value.getPid(), value.getAmount());
                orderList.add(order);
            } else {
                product.setProductFields(value.getPid(), value.getPname());
            }
        }
        // Stitch the product name onto every buffered order and emit
        for (ComplexItem order : orderList) {
            order.setProductFields(product.getPid(), product.getPname());
            context.write(order, valueOut);
        }
        orderList.clear();
    }
}
```
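The reducer assumes every pid group contains exactly one product record; if product.txt has no entry for a pid, the orders in that group are emitted with an empty pname. A hedged variant of the stitching loop inside reduce() (the counter group and name are made up for illustration) could count and skip such orders instead:

```java
// Hypothetical defensive variant of the stitching loop inside reduce()
for (ComplexItem order : orderList) {
    if (product.getPname().isEmpty()) {
        // No product record arrived for this pid; count it instead of emitting an empty join
        context.getCounter("reducejoin", "ordersWithoutProduct").increment(1);
        continue;
    }
    order.setProductFields(product.getPid(), product.getPname());
    context.write(order, valueOut);
}
orderList.clear();
```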
Driver
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Arrays;

public class ComplexItemDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(ComplexItemDriver.class);
        job.setMapperClass(ComplexItemMapper.class);
        job.setReducerClass(ComplexItemReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ComplexItem.class);
        job.setOutputKeyClass(ComplexItem.class);
        job.setOutputValueClass(NullWritable.class);

        // All arguments except the last one are input paths; the last one is the output path
        FileInputFormat.setInputPaths(job,
                Arrays.stream(args).limit(args.length - 1).map(Path::new).toArray(Path[]::new));
        FileOutputFormat.setOutputPath(job, new Path(args[args.length - 1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```
```sh
# Option 1: run directly from the IDE (IDEA)
# Option 2: run on the cluster
hadoop jar xxx.jar xxx.xxx.ComplexItemDriver input-path output-path
```
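As noted above, a Reduce Join sends every record with the same join key to one reducer, so a hot key skews the load. One common remedy is a custom Partitioner tailored to the business data. The sketch below is illustrative only: the hot pid value and the reducer layout are assumptions, not part of the original example.

```java
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Illustrative skew-aware partitioner: sends one known hot key to a dedicated
 * reducer and hashes all other keys over the remaining reducers.
 */
public class SkewAwarePartitioner extends Partitioner<Text, ComplexItem> {

    // Hypothetical hot key; in practice this would come from profiling your own data
    private static final String HOT_PID = "01";

    @Override
    public int getPartition(Text key, ComplexItem value, int numPartitions) {
        if (numPartitions <= 1) {
            return 0;
        }
        if (HOT_PID.equals(key.toString())) {
            // Reserve the last reducer for the hot key
            return numPartitions - 1;
        }
        // Spread all other keys over the remaining reducers
        return (key.toString().hashCode() & Integer.MAX_VALUE) % (numPartitions - 1);
    }
}
```

It would be wired into the driver with something like `job.setPartitionerClass(SkewAwarePartitioner.class)` together with `job.setNumReduceTasks(3)`.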
- Map Join
(1) Use case: suited to joining one very small table with one very large table.
(2) Example:
Scenario: same as the Reduce Join example.
Add the same dependencies and the same log4j2.xml file under resources as in the Reduce Join example above.
Define the OrderDetail wrapper class:

```java
/**
 * Holds the joined fields of order.txt and product.txt.
 */
public class OrderDetail {

    private Integer orderId = 0;
    private String pname = "";
    private Integer amount = 0;

    public OrderDetail() {
    }

    public OrderDetail(Integer orderId, String pname, Integer amount) {
        this.orderId = orderId;
        this.pname = pname;
        this.amount = amount;
    }

    public Integer getOrderId() {
        return orderId;
    }

    public void setOrderId(Integer orderId) {
        this.orderId = orderId;
    }

    public String getPname() {
        return pname;
    }

    public void setPname(String pname) {
        this.pname = pname;
    }

    public Integer getAmount() {
        return amount;
    }

    public void setAmount(Integer amount) {
        this.amount = amount;
    }

    public void setAll(Integer orderId, String pname, Integer amount) {
        setOrderId(orderId);
        setPname(pname);
        setAmount(amount);
    }

    @Override
    public String toString() {
        return orderId + "\t" + pname + "\t" + amount;
    }
}
```
Mapper
```java
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

public class OrderDetailMapper extends Mapper<Object, Text, OrderDetail, NullWritable> {

    // Maps pid to pname
    private Map<String, String> productMapper = new HashMap<>();
    private OrderDetail keyOut = new OrderDetail();
    private NullWritable valueOut = NullWritable.get();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the files that were distributed with the job
        URI[] cacheFiles = context.getCacheFiles();
        if (cacheFiles != null && cacheFiles.length > 0) {
            URI productURI = cacheFiles[0];
            FileSystem fs = FileSystem.get(context.getConfiguration());
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(fs.open(new Path(productURI)), StandardCharsets.UTF_8));
            // Read product.txt and load the pid -> pname mapping
            String line = null;
            while ((line = reader.readLine()) != null) {
                String[] fields = line.split("\t");
                productMapper.put(fields[0], fields[1]);
            }
            // Release resources
            IOUtils.closeStreams(reader, fs);
        }
    }

    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // Every input record comes from order.txt; the product name is looked up in memory
        String[] fields = value.toString().split("\t");
        keyOut.setAll(Integer.parseInt(fields[0]), productMapper.get(fields[1]), Integer.parseInt(fields[2]));
        context.write(keyOut, valueOut);
    }
}
```
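One thing to note: if an order references a pid that does not appear in product.txt, productMapper.get returns null and the literal string "null" ends up in the output. A hedged guard for the lookup (the placeholder text is an assumption for illustration) would be:

```java
// Hypothetical variant of the lookup inside map(): fall back to a placeholder
// instead of writing the literal string "null" for unknown pids.
String pname = productMapper.getOrDefault(fields[1], "UNKNOWN");
keyOut.setAll(Integer.parseInt(fields[0]), pname, Integer.parseInt(fields[2]));
```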
Driver
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.util.Arrays;

public class OrderDetailDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(OrderDetailDriver.class);

        // Distribute the product file (pd.txt) so each map task can load it into memory;
        // file:/// is the local file system, use hdfs:// for a file on the cluster
        job.addCacheFile(URI.create("file:///D:/moudle/hadoop_results/mapred/in/reducejoin/pd.txt"));

        job.setMapperClass(OrderDetailMapper.class);
        // No Reducer
        job.setNumReduceTasks(0);

        job.setMapOutputKeyClass(OrderDetail.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(OrderDetail.class);
        job.setOutputValueClass(NullWritable.class);

        // All arguments except the last one are input paths; the last one is the output path
        FileInputFormat.setInputPaths(job,
                Arrays.stream(args).limit(args.length - 1).map(Path::new).toArray(Path[]::new));
        FileOutputFormat.setOutputPath(job, new Path(args[args.length - 1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```
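When the job runs on the cluster instead of locally, the cached file has to live on HDFS; the host, port, and path below are placeholders, not values from the original example.

```java
// Hypothetical cluster variant: cache the product file from HDFS instead of the local disk
job.addCacheFile(URI.create("hdfs://namenode:8020/input/mapjoin/pd.txt"));
```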
2. Counters
Example: count the total number of times the map method is called.
Solution: add the following code inside the map method:
```java
// context.getCounter(groupName, counterName).increment(delta)
// The counter values show up in the job's output log
context.getCounter("mapjoin", "mapCount").increment(1);
```
3. ETL (Data Cleansing)
No Reducer stage is needed.
- Example: given a log file in which every line consists of many fields separated by whitespace (tabs or spaces), filter the file so that only the records with more than 11 fields are kept.
- The code is as follows:
Add the same dependencies and the same log4j2.xml file under resources as in the Reduce Join example above.
Mapper
```java
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class ETLMapper extends Mapper<Object, Text, Text, NullWritable> {

    private Text keyOut = new Text();
    private NullWritable valueOut = NullWritable.get();

    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // Split on runs of whitespace (tabs or spaces) so that consecutive
        // separators do not produce empty fields
        String[] fields = value.toString().split("\\s+");
        if (fields.length > 11) {
            keyOut.set(value);
            context.write(keyOut, valueOut);
        }
    }
}
```
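For ETL jobs it is often useful to know how many records were kept and how many were dropped, and the counters from section 2 slot in naturally here. A hedged sketch of the map method with counters added (the group and counter names are made up for illustration):

```java
// Hypothetical variant of map() that tracks kept vs. dropped records
@Override
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
    String[] fields = value.toString().split("\\s+");
    if (fields.length > 11) {
        keyOut.set(value);
        context.write(keyOut, valueOut);
        context.getCounter("etl", "kept").increment(1);
    } else {
        context.getCounter("etl", "dropped").increment(1);
    }
}
```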
Driver
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Arrays;

public class ETLDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(ETLDriver.class);
        job.setMapperClass(ETLMapper.class);
        // Map-only job: no Reducer
        job.setNumReduceTasks(0);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // All arguments except the last one are input paths; the last one is the output path
        FileInputFormat.setInputPaths(job,
                Arrays.stream(args).limit(args.length - 1).map(Path::new).toArray(Path[]::new));
        FileOutputFormat.setOutputPath(job, new Path(args[args.length - 1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```