Hadoop Learning (6)
1. MapReduce Workflow
The diagram above shows the complete MapReduce workflow; the Shuffle phase covers only steps 7 through 16. In detail, Shuffle proceeds as follows:
(1) MapTask collects the kv pairs emitted by our map() method and places them in an in-memory buffer.
(2) The buffer repeatedly spills to files on local disk; several spill files may be produced.
(3) Multiple spill files are merged into one larger spill file.
(4) During both spilling and merging, the Partitioner is called to partition the data, and records are sorted by key.
(5) Each ReduceTask, according to its own partition number, fetches the matching partition of result data from every MapTask machine.
(6) A ReduceTask thus pulls result files for the same partition from different MapTasks, and merges these files again (merge sort).
(7) Once merged into one large file, the Shuffle phase is finished. The ReduceTask's own logic then runs: key-value groups are read from the file one at a time and the user-defined reduce() method is called for each group.
Note:
(1) The buffer size used in Shuffle affects the efficiency of a MapReduce job. In principle, the larger the buffer, the fewer disk I/O operations and the faster the job runs. (Useful for tuning.)
(2) The buffer size can be adjusted through the parameter mapreduce.task.io.sort.mb; the default is 100 MB.
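For example, a minimal sketch of raising the spill buffer in the driver (the 200 MB figure is only an illustration; size it against the MapTask heap):
// Raise the map-side sort buffer from the default 100 MB to 200 MB
// before the Job is created; 200 is an arbitrary example value.
Configuration conf = new Configuration();
conf.setInt("mapreduce.task.io.sort.mb", 200);
Job job = Job.getInstance(conf);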
2. Shuffle Mechanism
1. Shuffle mechanism
The data processing that happens after the map() method and before the reduce() method is called Shuffle.
2. Partition
1. Motivation
Sometimes the statistics must be written to different files (partitions) according to some condition. For example: output phone-number statistics to different files according to the province the number belongs to.
2. The default Partitioner
public class HashPartitioner<K, V> extends Partitioner<K, V> {
public int getPartition(K key, V value, int numReduceTasks)
{ return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks; }
}
The default partition is derived from the key's hashCode modulo the number of ReduceTasks. The user has no control over which key is stored in which partition.
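As a quick standalone illustration (not part of any job; the sample words are arbitrary), this is where the default formula would send a few keys with three reducers:
// Standalone sketch: partition assignment of the default HashPartitioner
// for numReduceTasks = 3; the sample keys are arbitrary.
public class HashPartitionDemo {
    public static void main(String[] args) {
        String[] keys = {"hadoop", "spark", "flink"};
        int numReduceTasks = 3;
        for (String key : keys) {
            int partition = (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
            System.out.println(key + " -> partition " + partition);
        }
    }
}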
3. Steps for a custom Partitioner
(1) Define a class that extends Partitioner and override getPartition():
public class CustomPartitioner extends Partitioner<Text, FlowBean> {
@Override
public int getPartition(Text key, FlowBean value, int numPartitions) {
// partitioning logic goes here
… …
return partition;
}
}
(2) Set the custom Partitioner in the Job driver:
job.setPartitionerClass(CustomPartitioner.class);
(3) After customizing the Partitioner, set a number of ReduceTasks that matches its logic:
job.setNumReduceTasks(5);
4. Partitioning rules
(1) If the number of ReduceTasks > the number of partitions getPartition() produces, a few extra empty output files part-r-000xx are generated;
(2) If 1 < the number of ReduceTasks < the number of partitions getPartition() produces, some partitioned data has no ReduceTask to receive it and the job throws an Exception;
(3) If the number of ReduceTasks = 1, then no matter how many partition files the MapTask side produces, everything is handed to that single ReduceTask, and only one result file part-r-00000 is produced;
(4) Partition numbers must start at zero and increase one by one.
The sketch below makes these rules concrete.
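A hedged sketch, assuming a Partitioner that returns partition numbers 0 through 4 (five partitions, like the ProvincePartitioner in the next section); each line is an alternative setting, not a sequence:
// Alternative ReduceTask counts against a 5-way partitioner:
job.setNumReduceTasks(5); // exact match: five files part-r-00000 .. part-r-00004
job.setNumReduceTasks(6); // rule (1): one extra, empty part-r-00005
job.setNumReduceTasks(3); // rule (2): partitions 3 and 4 have no ReduceTask -> the job fails with an Exception
job.setNumReduceTasks(1); // rule (3): a single part-r-00000 holding everything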
5. Partition example in practice
Requirement: output the phone-number statistics to different files (partitions) according to the province the number belongs to.
Expected output:
Phone numbers starting with 136, 137, 138 and 139 each go into their own file (4 files), and numbers with any other prefix go into a fifth file.
The code is as follows:
Partitioner class:
package com.simon.mapreduce.partitioner;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class ProvincePartitioner extends Partitioner<Text,FlowBean> {
@Override
public int getPartition(Text text, FlowBean flowBean, int numPartitions) {
String phone = text.toString();
String prefix = phone.substring(0, 3);
int firstThreeNum = Integer.parseInt(prefix);
// 136%136==0, 137%136==1, 138%136==2, 139%136==3; any other valid
// prefix leaves a remainder outside 0-3 and falls through to partition 4
if (firstThreeNum%136 == 0){
return 0;
}else if (firstThreeNum%136 == 1){
return 1;
}else if (firstThreeNum%136 == 2){
return 2;
}else if (firstThreeNum%136 == 3){
return 3;
}
return 4;
}
}
Driver class:
package com.simon.mapreduce.partitioner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class FlowDriver {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
//1 Get the Job instance
Configuration cf = new Configuration();
Job job = Job.getInstance(cf);
//2 Associate this Driver class's jar
job.setJarByClass(FlowDriver.class);
//3 Attach the Mapper and Reducer
job.setMapperClass(FlowMapper.class);
job.setReducerClass(FlowReducer.class);
//4 Set the Map output KV types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
//5 Set the final output KV types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
//6 Set the custom Partitioner and a matching number of ReduceTasks
job.setPartitionerClass(ProvincePartitioner.class);
job.setNumReduceTasks(5);
//7 Set the input and output paths
FileInputFormat.setInputPaths(job,new Path("D:\\input\\inputflow"));
FileOutputFormat.setOutputPath(job,new Path("D:\\output\\partitioner1"));
//8 Submit the Job
boolean result = job.waitForCompletion(true);
System.exit(result?0:1);
}
}
Bean class:
package com.simon.mapreduce.partitioner;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Serialization:
 * define a class that implements the Writable interface,
 * override the serialization and deserialization methods,
 * keep a no-arg constructor,
 * and override toString for printable output.
 * */
public class FlowBean implements Writable {
private long upFlow;//upstream traffic
private long downFlow;//downstream traffic
private long sumFlow;//total traffic
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
public long getSumFlow() {
return sumFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
public void setSumFlow() {
this.sumFlow = this.downFlow+this.upFlow;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeLong(upFlow);
dataOutput.writeLong(downFlow);
dataOutput.writeLong(sumFlow);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.upFlow = dataInput.readLong();
this.downFlow = dataInput.readLong();
this.sumFlow = dataInput.readLong();
}
//no-arg constructor
public FlowBean() {
}
//toString override
@Override
public String toString() {
return upFlow + "\t" + downFlow + "\t" + sumFlow ;
}
}
Mapper class:
package com.simon.mapreduce.partitioner;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FlowMapper extends Mapper<LongWritable,Text, Text, FlowBean> {
private FlowBean fb = new FlowBean();
private Text phone = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String res = value.toString();                   // one line of input
String[] ss = res.split("\t");                   // split on tabs
phone.set(ss[1]);                                // phone number column
fb.setUpFlow(Long.parseLong(ss[ss.length-3]));   // upstream traffic column
fb.setDownFlow(Long.parseLong(ss[ss.length-2])); // downstream traffic column
context.write(phone,fb);
}
}
Reducer class:
package com.simon.mapreduce.partitioner;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowReducer extends Reducer<Text, FlowBean,Text, FlowBean> {
private Text phone;
private FlowBean ans = new FlowBean();
private long upFlow = 0;
private long downFlow = 0;
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
upFlow = 0;   // reset the accumulators for each key
downFlow = 0;
phone = key;
for (FlowBean value : values) { // sum all records for this phone number
upFlow +=value.getUpFlow();
downFlow+=value.getDownFlow();
}
ans.setUpFlow(upFlow);
ans.setDownFlow(downFlow);
ans.setSumFlow(); // total = up + down
context.write(phone,ans);
}
}
3. WritableComparable Sorting
Sorting overview
Sorting categories
How custom sorting with WritableComparable works
When a bean object is transferred as the key, it must implement the WritableComparable interface and override the compareTo method; that is all it takes to make it sortable.
@Override
public int compareTo(FlowBean bean) {
int result;
// order by total traffic, descending
if (this.sumFlow > bean.getSumFlow()) {
result = -1;
}else if (this.sumFlow < bean.getSumFlow()) {
result = 1;
}else {
result = 0;
}
return result;
}
① WritableComparable sorting in practice (total order sort)
Input: phone_data.txt
Expected output: records ordered by total traffic, descending
Implementation:
Bean class:
package com.simon.mapreduce.writablecomparable;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Serialization:
 * define a class that implements the WritableComparable interface,
 * override the serialization and deserialization methods,
 * keep a no-arg constructor,
 * and override toString for printable output.
 * */
public class FlowBean implements WritableComparable<FlowBean> {
private long upFlow;//upstream traffic
private long downFlow;//downstream traffic
private long sumFlow;//total traffic
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
public long getSumFlow() {
return sumFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
public void setSumFlow() {
this.sumFlow = this.downFlow+this.upFlow;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeLong(upFlow);
dataOutput.writeLong(downFlow);
dataOutput.writeLong(sumFlow);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.upFlow = dataInput.readLong();
this.downFlow = dataInput.readLong();
this.sumFlow = dataInput.readLong();
}
//no-arg constructor
public FlowBean() {
}
//toString override
@Override
public String toString() {
return upFlow + "\t" + downFlow + "\t" + sumFlow ;
}
@Override
public int compareTo(FlowBean o) {
//descending by total traffic
if (this.sumFlow > o.sumFlow){
return -1;
}else if (this.sumFlow<o.sumFlow){
return 1;
}
return 0;
}
}
Driver class:
package com.simon.mapreduce.writablecomparable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class FlowDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//1 Get the Job instance
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
//2 Associate this Driver class
job.setJarByClass(FlowDriver.class);
//3 Attach the Mapper and Reducer
job.setMapperClass(FlowMapper.class);
job.setReducerClass(FlowReducer.class);
//4 Set the Map output KV types
job.setMapOutputKeyClass(FlowBean.class);
job.setMapOutputValueClass(Text.class);
//5 Set the final output KV types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
//6 Set the input and output paths (the input here is the output directory of the previous job)
FileInputFormat.setInputPaths(job, new Path("D:\\output\\output2"));
FileOutputFormat.setOutputPath(job, new Path("D:\\output\\comparout"));
//7 Submit the Job
boolean b = job.waitForCompletion(true);
System.exit(b ? 0 : 1);
}
}
Mapper class:
package com.simon.mapreduce.writablecomparable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FlowMapper extends Mapper<LongWritable, Text, FlowBean, Text> {
private FlowBean outK = new FlowBean();
private Text outV = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//1 Read one line of data
String line = value.toString();
//2 Split the line on "\t"
String[] split = line.split("\t");
//3 Populate outK and outV
outK.setUpFlow(Long.parseLong(split[1]));
outK.setDownFlow(Long.parseLong(split[2]));
outK.setSumFlow();
outV.set(split[0]);
//4 Emit outK, outV
context.write(outK,outV);
}
}
Reducer class:
package com.simon.mapreduce.writablecomparable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowReducer extends Reducer<FlowBean, Text, Text, FlowBean> {
@Override
protected void reduce(FlowBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
//iterate over the values and write each one out, so records with the same total traffic are all kept
for (Text value : values) {
//swap K and V back when writing out
context.write(value,key);
}
}
}
② WritableComparable sorting in practice (sorting within partitions)
Requirement, on top of the previous one: also partition by province prefix
(136, 137, 138, 139, and a fifth partition for everything else)
Add a custom partitioner class:
package com.simon.mapreduce.partitionerandwritablecomparable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class ProvincePartitioner extends Partitioner<FlowBean, Text> {
@Override
public int getPartition(FlowBean flowBean, Text text, int numPartitions) {
String phone = text.toString();
String prefix = phone.substring(0, 3);
// 136%136==0, 137%136==1, 138%136==2, 139%136==3; any other valid
// prefix leaves a remainder outside 0-3 and falls through to partition 4
int firstThreeNum = Integer.parseInt(prefix);
if (firstThreeNum%136 == 0){
return 0;
}else if (firstThreeNum%136 == 1){
return 1;
}else if (firstThreeNum%136 == 2){
return 2;
}else if (firstThreeNum%136 == 3){
return 3;
}
return 4;
}
}
Add to the Driver:
// set the custom partitioner
job.setPartitionerClass(ProvincePartitioner.class);
// set the matching number of ReduceTasks
job.setNumReduceTasks(5);
③ WritableComparable sorting in practice (secondary sort / custom sort)
Additional requirement: when the total traffic is equal, order by upstream traffic from largest to smallest
Only the Bean class needs to change:
package com.simon.mapreduce.writablecomparable;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Serialization:
 * define a class that implements the WritableComparable interface,
 * override the serialization and deserialization methods,
 * keep a no-arg constructor,
 * and override toString for printable output.
 * */
public class FlowBean implements WritableComparable<FlowBean> {
private long upFlow;//upstream traffic
private long downFlow;//downstream traffic
private long sumFlow;//total traffic
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
public long getSumFlow() {
return sumFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
public void setSumFlow() {
this.sumFlow = this.downFlow+this.upFlow;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeLong(upFlow);
dataOutput.writeLong(downFlow);
dataOutput.writeLong(sumFlow);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.upFlow = dataInput.readLong();
this.downFlow = dataInput.readLong();
this.sumFlow = dataInput.readLong();
}
//no-arg constructor
public FlowBean() {
}
//toString override
@Override
public String toString() {
return upFlow + "\t" + downFlow + "\t" + sumFlow ;
}
@Override
public int compareTo(FlowBean o) {
//primary: descending by total traffic
if (this.sumFlow > o.sumFlow){
return -1;
}else if (this.sumFlow<o.sumFlow){
return 1;
}else {
//secondary: when the totals tie, descending by upstream traffic
if (this.upFlow > o.upFlow){
return -1;
}else if (this.upFlow < o.upFlow){
return 1;
}
}
return 0;
}
}
4. Combiner
(1) The Combiner is a component of an MR program besides the Mapper and Reducer.
(2) The Combiner's parent class is Reducer.
(3) The Combiner and the Reducer differ in where they run: the Combiner runs on the node of each individual MapTask;
(4) The point of the Combiner is to locally aggregate each MapTask's output so as to reduce the volume of network traffic.
(5) A Combiner may only be applied if it does not affect the final business logic, and its output kv types must match the Reducer's input kv types.
(6) Steps to implement a custom Combiner:
public class WordCountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable outV = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable value : values) {
sum += value.get();
}
outV.set(sum);
context.write(key,outV);
}
}
(7) Register it in the job driver:
job.setCombinerClass(WordCountCombiner.class);
Note: looking closely, this custom Combiner is identical to our Reducer. In such cases you can skip writing a separate class and simply register the Reducer class as the combiner in the driver, as sketched below; the precondition is that this does not change the logic (sums and counts are fine, averaging is not).
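A one-line sketch, assuming the word-count Reducer is named WordCountReducer (a hypothetical name here):
// Reuse the Reducer as the combiner. Valid only for distributive operations
// such as sums and counts; averaging partial averages would change the result.
job.setCombinerClass(WordCountReducer.class);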
Hands-on example
Requirement:
Locally aggregate each MapTask's output during the count to reduce the volume of network traffic, i.e. apply the Combiner feature.
Expected output:
The Combiner receives many input records and, after merging, emits noticeably fewer output records.
Add a WordCountCombiner class extending Reducer:
package com.simon.mapreduce.combiner;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WordCountCombiner extends Reducer<Text, IntWritable,Text, IntWritable> {
private IntWritable intWritable = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum =0;
for (IntWritable value : values) {
sum+=value.get();
}
intWritable.set(sum);
context.write(key,intWritable);
}
}
Specify the Combiner in the WordcountDriver class:
// specify that a combiner is used, and which class implements its logic
job.setCombinerClass(WordCountCombiner.class);
5. OutputFormat Data Output
1. OutputFormat interface implementations
OutputFormat is the base class for MapReduce output; everything that implements MapReduce output implements the OutputFormat interface. Some common implementations:
① NullOutputFormat
② MapFileOutputFormat
③ SequenceFileOutputFormat
④ TextOutputFormat
⑤ LazyOutputFormat
⑥ DBOutputFormat
The default is TextOutputFormat.
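Switching a job away from the default is one line in the driver; a sketch using the built-in SequenceFileOutputFormat (org.apache.hadoop.mapreduce.lib.output):
// Write binary key-value sequence files instead of plain text output.
job.setOutputFormatClass(SequenceFileOutputFormat.class);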
Custom OutputFormat
Use cases:
For example, writing output into storage systems such as MySQL, HBase or Elasticsearch; a hedged sketch of the MySQL case follows.
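For the JDBC case, Hadoop ships a ready-made DBOutputFormat (org.apache.hadoop.mapreduce.lib.db); the table and column names below are invented for illustration, and the job's output key class must implement DBWritable:
// Hypothetical driver fragment: write results into a MySQL table "flow_stats".
DBConfiguration.configureDB(job.getConfiguration(),
        "com.mysql.cj.jdbc.Driver",
        "jdbc:mysql://localhost:3306/hadoop_demo",
        "root", "password");
job.setOutputFormatClass(DBOutputFormat.class);
DBOutputFormat.setOutput(job, "flow_stats", "phone", "up_flow", "down_flow");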
Steps:
Define a class that extends FileOutputFormat.
Write a RecordWriter and override write(), the method that actually emits the output data.
Custom OutputFormat in practice
Filter the input log: lines containing atguigu go to e:/atguigu.log, lines that do not contain atguigu go to e:/other.log.
Expected output:
Two log files, one holding the atguigu lines and one holding the rest.
The code is as follows:
Driver class:
package com.simon.mapreduce.outputformat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class LogDriver {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration);
job.setJarByClass(LogDriver.class);
job.setMapperClass(LogMapper.class);
job.setReducerClass(LogReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setOutputFormatClass(LogOutputFormat.class);
FileInputFormat.setInputPaths(job,new Path("D:\\input\\inputoutputformat"));
FileOutputFormat.setOutputPath(job,new Path("D:\\output\\outputformat"));
boolean result = job.waitForCompletion(true);
System.exit(result?0:1);
}
}
Reducer class:
package com.simon.mapreduce.outputformat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class LogReducer extends Reducer<Text, NullWritable,Text,NullWritable> {
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
// iterate so that duplicate lines are each written out
for (NullWritable value : values) {
context.write(key,value);
}
}
}
Mapper class:
package com.simon.mapreduce.outputformat;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class LogMapper extends Mapper<LongWritable, Text,Text, NullWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// pass each line through unchanged; the custom OutputFormat decides the target file
context.write(value,NullWritable.get());
}
}
Custom OutputFormat class:
package com.simon.mapreduce.outputformat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class LogOutputFormat extends FileOutputFormat<Text, NullWritable> {
@Override
public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
LogRecordWriter lg = new LogRecordWriter(job);
return lg;
}
}
RecordWriter class:
package com.simon.mapreduce.outputformat;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
public class LogRecordWriter extends RecordWriter<Text, NullWritable> {
private FSDataOutputStream atguigu;
private FSDataOutputStream other;
public LogRecordWriter(TaskAttemptContext job) {
try {
// open one output stream per target file on the job's file system
FileSystem fs = FileSystem.get(job.getConfiguration());
atguigu = fs.create(new Path("D:\\output\\atguigu.log"));
other = fs.create(new Path("D:\\output\\other.log"));
} catch (IOException e) {
e.printStackTrace();
}
}
@Override
public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException {
String s = text.toString();
// route each line by whether it contains "atguigu"
if (s.contains("atguigu")){
atguigu.writeBytes(s+"\n");
}else{
other.writeBytes(s+"\n");
}
}
@Override
public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
IOUtils.closeStream(atguigu);
IOUtils.closeStream(other);
}
}