0) Requirement
Take the output produced by Requirement 1 and sort it again, this time by total flow.
1) Data preparation
13926251106 240 0 240
13826544101 264 0 264
13480253104 180 180 360
13926435656 132 1512 1644
15989002119 1938 180 2118
18211575961 1527 2106 3633
13602846565 1938 2910 4848
84138413 4116 1432 5548
15920133257 3156 2936 6092
13922314466 3008 3720 6728
15013685858 3659 3538 7197
13660577991 6960 690 7650
13560439658 2034 5892 7926
18320173382 9531 2412 11943
13726230503 2481 24681 27162
13560436666 3597 25635 29232
13925057413 11058 48243 59301
13502468823 7335 110349 117684
2) Analysis (the map output first sits in an in-memory buffer; once the 100 MB buffer fills, it spills to disk, where the records are partitioned first and then sorted. Hadoop sorts only the keys, never the values, so to sort by total flow the flow must be made the key.)
(1) Split the program into two jobs: the first computes the total flow as usual, the second sorts that result.
(2) In the second job's mapper, emit context.write(total flow, phone number)
(3) Have FlowBean implement the WritableComparable interface and override its compareTo method
3) Code implementation
(1) The FlowBean object adds comparison support on top of the Requirement 1 version; a sketch is shown below
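The original FlowBean source is not listed here, so the following is only a minimal sketch of what the comparable version could look like. The field names upflow/downflow/sumflow, the set(upflow, downflow) helper used by the mapper, and the descending order in compareTo are assumptions; adapt them to the actual Requirement 1 class.
package com.lzz.mapreduce.flowsort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// Sketch only: FlowBean from Requirement 1 with comparison support added
public class FlowBean implements WritableComparable<FlowBean> {

    private long upflow;   // up flow
    private long downflow; // down flow
    private long sumflow;  // total flow = up + down

    // Empty constructor required for Hadoop's reflection-based deserialization
    public FlowBean() {
    }

    public void set(long upflow, long downflow) {
        this.upflow = upflow;
        this.downflow = downflow;
        this.sumflow = upflow + downflow;
    }

    // Serialization: write the fields in a fixed order
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upflow);
        out.writeLong(downflow);
        out.writeLong(sumflow);
    }

    // Deserialization: read the fields in the same order they were written
    @Override
    public void readFields(DataInput in) throws IOException {
        upflow = in.readLong();
        downflow = in.readLong();
        sumflow = in.readLong();
    }

    // Compare on total flow; larger totals first gives a descending sort (assumed order)
    @Override
    public int compareTo(FlowBean o) {
        return this.sumflow > o.sumflow ? -1 : (this.sumflow < o.sumflow ? 1 : 0);
    }

    @Override
    public String toString() {
        return upflow + "\t" + downflow + "\t" + sumflow;
    }
}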
(2) Write the mapper
package com.lzz.mapreduce.flowsort;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FlowSortMapper extends Mapper<LongWritable, Text, FlowBean, Text> {

    // Input lines (output of Requirement 1): phone \t upflow \t downflow \t sumflow
    // 13502468823  7335    110349  117684
    // 13560436666  3597    25635   29232
    // 13560439658  2034    5892    7926
    FlowBean k = new FlowBean();
    Text v = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 1. Read one line
        String line = value.toString();

        // 2. Split it into fields
        String[] fields = line.split("\t");

        // 3. Populate the key (flow bean) and value (phone number);
        //    per the input layout above, fields[1] is the up flow and fields[2] the down flow
        long upflow = Long.parseLong(fields[1]);
        long downflow = Long.parseLong(fields[2]);
        k.set(upflow, downflow);
        v.set(fields[0]);

        // 4. Write out with the flow bean as the key so the framework sorts on it
        context.write(k, v);
    }
}
(3) Write the reducer
package com.lzz.mapreduce.flowsort;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FlowSortReducer extends Reducer<FlowBean, Text, Text, FlowBean> {

    @Override
    protected void reduce(FlowBean key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Several phone numbers may share the same total flow and therefore the same key,
        // so write every phone number in the group, not just the first one
        for (Text value : values) {
            context.write(value, key);
        }
    }
}
(4) Write the driver
package com.lzz.mapreduce.flowsort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FlowSortDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Get the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2. Set the jar, mapper and reducer classes
        job.setJarByClass(FlowSortDriver.class);
        job.setMapperClass(FlowSortMapper.class);
        job.setReducerClass(FlowSortReducer.class);

        // 3. Map output types: FlowBean is the key so sorting happens on the total flow
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(Text.class);

        // 4. Final output types: phone number as key, flow bean as value
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // 5. Input and output paths (the input is the output directory of Requirement 1)
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6. Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
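Once packaged, the job can be launched in the usual hadoop jar fashion; the jar name and HDFS paths below are placeholders for illustration, with the input pointing at the output directory of Requirement 1:
hadoop jar flowsort.jar com.lzz.mapreduce.flowsort.FlowSortDriver /user/lzz/flowcount/output /user/lzz/flowsort/output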
Result