//需求分析:对上一个MR程序生成的文件再处理:按照总流量进行倒序输出。
//MapReduce(shuffle阶段)只能按照key进行排序,
//那么Mapper输出的时候应该以FlowBean对象(实现了WritableComparable --> 变成了可持久化与可比较)为key,然后让shuffle去排序。最后在Reduce阶段,只需将key / value调换即可。
FlowCountDriver类:
MR程序的入口:
/**
 * Entry point of the MapReduce job that re-sorts the output of a previous
 * flow-count job.
 *
 * <p>The map output key is {@code FlowBean}, so the shuffle orders records by
 * {@code FlowBean.compareTo}; records are additionally split across reduce
 * tasks by {@code FlowCountPartitioner}, which yields a sorted file per
 * partition.
 */
public class FlowCountDriver {
    /** Input file used when no command-line argument is supplied. */
    private static final String DEFAULT_INPUT_PATH = "I:/input/inputflow2/part-r-00000";

    /** Output directory used when fewer than two command-line arguments are supplied. */
    private static final String DEFAULT_OUTPUT_PATH = "I:/output/outputflow2/";

    /**
     * Configures and runs the job, then prints {@code success} or {@code failed}.
     *
     * @param args optional overrides: {@code args[0]} is the input path and
     *             {@code args[1]} is the output directory; the built-in
     *             defaults are used for whichever is absent
     * @throws IOException            if file system access or job submission fails
     * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
     * @throws InterruptedException   propagated from {@code Job.waitForCompletion}
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(FlowCountDriver.class);
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReduce.class);

        // Key/value types emitted by the map phase.
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(Text.class);

        // Key/value types of the final output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // Partitioning: FlowCountPartitioner returns indices 0..4, so five
        // reduce tasks are configured to match.
        job.setPartitionerClass(FlowCountPartitioner.class);
        job.setNumReduceTasks(5);

        // Input file and output directory of the job.
        int argCount = (args == null) ? 0 : args.length;
        Path inputPath = new Path(argCount > 0 ? args[0] : DEFAULT_INPUT_PATH);
        Path outputPath = new Path(argCount > 1 ? args[1] : DEFAULT_OUTPUT_PATH);

        // Remove a pre-existing output directory. The file system is resolved
        // from the output path itself so that a path carrying its own scheme
        // is deleted on the file system it actually lives on, not on the
        // configured default one.
        FileSystem fileSystem = outputPath.getFileSystem(configuration);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // Submit the job and wait for it to finish. Pipeline:
        // InputFormat -> Mapper -> Shuffle (partitioning, sorting, merge sort)
        // -> Reducer -> OutputFormat.
        boolean b = job.waitForCompletion(true);
        System.out.println(b ? "success" : "failed");
    }
}
FlowCountMapper 类:
/**
 * Mapper that turns every input line into a ({@code FlowBean}, phone number)
 * pair, so that the flow statistics become the map output key.
 *
 * <p>Notes carried over from the original author: before this class runs,
 * TextInputFormat reads each input split; the map task obtains a RecordReader
 * from the InputFormat, parses key/value pairs out of the InputSplit, and
 * calls {@code map()} once per pair.
 */
public class FlowCountMapper extends Mapper<LongWritable, Text, FlowBean, Text> {
    // Output objects are allocated once and overwritten on every map() call.
    private Text outValue = new Text();
    private FlowBean outKey = new FlowBean();

    /**
     * Parses one tab-separated line and emits its flow figures keyed for sorting.
     *
     * @param key     input key supplied by the framework (unused here)
     * @param value   one line of input; column 0 is the phone number, column 1
     *                the upstream flow and column 2 the downstream flow
     * @param context sink for the emitted (FlowBean, phone number) pair
     * @throws IOException          propagated from {@code context.write}
     * @throws InterruptedException propagated from {@code context.write}
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Example record: 13509468723 7335 110349 117684
        final String line = value.toString();
        final String[] columns = line.split("\t");

        final long upstream = Long.parseLong(columns[1]);
        outKey.setUpFlow(upstream);
        final long downstream = Long.parseLong(columns[2]);
        outKey.setDownFlow(downstream);
        // Derive the total from the two values just stored.
        outKey.setSumFlow();

        outValue.set(columns[0]);
        context.write(outKey, outValue);
    }
}
FlowCountPartitioner类:
/**
 * Assigns each map output record to a reduce partition according to the first
 * three characters of the phone number carried in the record's value.
 *
 * <p>Notes carried over from the original author: during the shuffle, map
 * output key/value pairs are written to an in-memory circular buffer, where
 * the map task partitions and sorts them. When the buffer reaches 80% of its
 * capacity its content is spilled to a file on local disk; several spill
 * files may be produced and are merge-sorted into one larger file, at which
 * point the map task is done.
 *
 * <p>The returned index ranges from 0 to 4 and {@code numPartitions} is not
 * consulted, so the job must be configured with at least five partitions.
 */
public class FlowCountPartitioner extends Partitioner<FlowBean, Text> {
    /** Number of leading characters of the phone number that select the partition. */
    private static final int PREFIX_LENGTH = 3;

    /**
     * Picks the partition for one record.
     *
     * @param flowBean      map output key (not used for the decision)
     * @param text          map output value holding the phone number
     * @param numPartitions number of partitions of the job (not used)
     * @return 0 for prefix 136, 1 for 137, 2 for 138, 3 for 139, and 4 for
     *         everything else, including values shorter than three characters
     */
    @Override
    public int getPartition(FlowBean flowBean, Text text, int numPartitions) {
        String phone = text.toString();
        // A value shorter than the prefix cannot match any case below; keep it
        // as is so it lands in the default partition instead of making
        // substring() throw.
        String phonePre = phone.length() >= PREFIX_LENGTH ? phone.substring(0, PREFIX_LENGTH) : phone;
        switch (phonePre) {
            case "136":
                return 0;
            case "137":
                return 1;
            case "138":
                return 2;
            case "139":
                return 3;
            default:
                return 4;
        }
    }
}
FlowCountReduce类:
/**
 * Reducer that swaps key and value: it receives ({@code FlowBean}, phone
 * numbers) groups and writes (phone number, {@code FlowBean}) records.
 *
 * <p>Notes carried over from the original author: the shuffle has already
 * sorted the records, which is what makes grouping possible; a reducer is
 * where per-group aggregation of the map output would normally happen.
 */
public class FlowCountReduce extends Reducer<FlowBean, Text, Text, FlowBean> {
    /**
     * Emits one output record per value of the group.
     *
     * @param key     flow statistics shared by the group
     * @param values  phone numbers grouped under {@code key}
     * @param context sink for the (phone number, FlowBean) records
     * @throws IOException          propagated from {@code context.write}
     * @throws InterruptedException propagated from {@code context.write}
     */
    @Override
    protected void reduce(FlowBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // No aggregation is needed; every value is written out with the key.
        final java.util.Iterator<Text> phones = values.iterator();
        while (phones.hasNext()) {
            context.write(phones.next(), key);
        }
    }
}
FlowBean类:
/**
 * Upstream, downstream and total flow of one record.
 *
 * <p>Implements {@code WritableComparable}, so instances can be serialized
 * ({@link #write} / {@link #readFields}) and ordered ({@link #compareTo}).
 * The ordering is descending by total flow, then ascending by upstream flow.
 * {@link #equals} and {@link #hashCode} are defined on the same two values so
 * that they stay consistent with {@link #compareTo}.
 */
public class FlowBean implements WritableComparable<FlowBean> {
    private long upFlow;
    private long downFlow;
    private long sumFlow;

    /**
     * Creates a bean with all counters at zero.
     * NOTE(review): Writable types are typically instantiated reflectively by
     * the framework, so this public no-argument constructor should stay.
     */
    public FlowBean() {
    }

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    /** Sets the total flow to an explicit value. */
    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    /** Recomputes the total flow as upstream flow plus downstream flow. */
    public void setSumFlow() {
        this.sumFlow = this.upFlow + this.downFlow;
    }

    /** Writes the three counters in the order upFlow, downFlow, sumFlow. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(this.upFlow);
        out.writeLong(this.downFlow);
        out.writeLong(this.sumFlow);
    }

    /** Reads the three counters in the same order {@link #write} emits them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        this.sumFlow = in.readLong();
    }

    /** Returns the three counters separated by tabs: upFlow, downFlow, sumFlow. */
    @Override
    public String toString() {
        return this.upFlow + "\t" + this.downFlow + "\t" + this.sumFlow;
    }

    /**
     * Orders beans by total flow in descending order; beans with the same
     * total flow are ordered by upstream flow in ascending order.
     *
     * @param o the bean to compare with
     * @return -1, 0 or 1 as this bean sorts before, with, or after {@code o}
     */
    @Override
    public int compareTo(FlowBean o) {
        // Arguments are swapped to get the descending order on total flow.
        int byTotal = Long.compare(o.getSumFlow(), this.getSumFlow());
        if (byTotal != 0) {
            return byTotal;
        }
        return Long.compare(this.getUpFlow(), o.getUpFlow());
    }

    /**
     * Two beans are equal exactly when {@link #compareTo} returns 0 for them,
     * i.e. when their total flow and upstream flow match.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof FlowBean)) {
            return false;
        }
        FlowBean other = (FlowBean) obj;
        return this.getSumFlow() == other.getSumFlow() && this.getUpFlow() == other.getUpFlow();
    }

    /**
     * Hash over the same values {@link #equals} compares. It depends only on
     * the counter values, so it is identical across JVM instances.
     */
    @Override
    public int hashCode() {
        long total = getSumFlow();
        long up = getUpFlow();
        int result = (int) (total ^ (total >>> 32));
        result = 31 * result + (int) (up ^ (up >>> 32));
        return result;
    }
}