流量汇总需求:
- 统计每一个用户(手机号)所耗费的总上行流量、下行流量,总流量
- 1.在mapper和reducer之间传递多个value值
- 序列化:自己实现了Writable接口(FlowSumBean),在mapper和reducer之间传递一个Bean对象。
- 也可以拼接字符串的方式来实现写出多个值。
- 2.将统计结果按照手机归属地不同省份输出到不同文件中
- 分区:自己实现了Partitioner分区器(FlowSumPartitioner)。reducer任务数量应该等于分区器分区的数量。
- 3.将统计结果按照总流量倒序排序
- 实现WritableComparable接口,重写compareTo方法来排序。
1.在mapper和reducer之间传递多个value值
- 序列化:自己实现了Writable接口(FlowSumBean),在mapper和reducer之间传递一个Bean对象。
- 实现WritableComparable接口,重写compareTo方法来排序。
package com.hadoop.mapreduce.flowsum;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class FlowSumBean implements WritableComparable<FlowSumBean>{

    private long upFlow;  // upstream traffic
    private long dwFlow;  // downstream traffic
    private long sumFlow; // total traffic (upFlow + dwFlow)

    /**
     * Deserialization instantiates this bean reflectively, which requires a
     * public no-arg constructor, so one is declared explicitly.
     */
    public FlowSumBean() {
    }

    /**
     * Sets the up/down flow and recomputes the total.
     *
     * @param upFlow upstream traffic
     * @param dwFlow downstream traffic
     */
    public void setData(long upFlow, long dwFlow) {
        this.upFlow = upFlow;
        this.dwFlow = dwFlow;
        this.sumFlow = this.upFlow + this.dwFlow;
    }

    /**
     * Serialization: field order here must stay in sync with readFields.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(this.upFlow);
        out.writeLong(this.dwFlow);
        out.writeLong(this.sumFlow);
    }

    /**
     * Deserialization: reads fields in exactly the order they were written.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.dwFlow = in.readLong();
        this.sumFlow = in.readLong();
    }

    @Override
    public String toString() {
        return this.upFlow + "\t" + this.dwFlow + "\t" + this.sumFlow;
    }

    /**
     * Orders beans by total flow, descending.
     *
     * FIX: the original returned only -1 or 1 (never 0), so two beans with
     * equal totals compared as "greater" in BOTH directions, violating the
     * Comparable contract (sgn(x.compareTo(y)) == -sgn(y.compareTo(x))).
     * Contract violations can make sort implementations throw or produce
     * unstable results. Long.compare covers all three cases and avoids
     * overflow-prone subtraction.
     */
    @Override
    public int compareTo(FlowSumBean bean) {
        // Arguments swapped relative to natural order => descending by total.
        return Long.compare(bean.getSumFlow(), this.sumFlow);
    }

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getDwFlow() {
        return dwFlow;
    }

    public void setDwFlow(long dwFlow) {
        this.dwFlow = dwFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }
}
2.将统计结果按照手机归属地不同省份输出到不同文件中
- 分区:自己实现了Partitioner分区器(FlowSumPartitioner)。reducer任务数量应该等于分区器分区的数量。
package com.hadoop.mapreduce.flowsum;
import java.util.HashMap;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
* KEY,VALUE对应的是map输出kv的类型
*/
/**
 * Routes each record to a partition based on the first 3 digits of the phone
 * number (the map output key). KEY/VALUE type params match the map output
 * types. The job must configure numReduceTasks equal to the number of
 * partitions produced here (4 mapped prefixes + 1 default = 5).
 */
public class FlowSumPartitioner extends Partitioner<Text, FlowSumBean>{

    /** Partition for phone prefixes not present in the dictionary. */
    private static final int DEFAULT_PARTITION = 4;

    /** Maps phone-number prefix (first 3 digits) to a partition id. */
    public static HashMap<String, Integer> proviceDict = new HashMap<String, Integer>();
    static {
        proviceDict.put("136", 0);
        proviceDict.put("137", 1);
        proviceDict.put("138", 2);
        proviceDict.put("139", 3);
    }

    @Override
    public int getPartition(Text key, FlowSumBean value, int numPartitions) {
        String phone = key.toString();
        // FIX: the original called substring(0, 3) unconditionally, which
        // throws StringIndexOutOfBoundsException for malformed keys shorter
        // than 3 chars; route those to the default partition instead.
        if (phone.length() < 3) {
            return DEFAULT_PARTITION;
        }
        Integer provinceId = proviceDict.get(phone.substring(0, 3));
        return provinceId == null ? DEFAULT_PARTITION : provinceId;
    }
}
Mapper阶段
package com.hadoop.mapreduce.flowsum;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class FlowSumMapper extends Mapper<LongWritable, Text, Text, FlowSumBean>{

    // Output key/value objects are reused across map() calls to avoid
    // per-record allocation.
    private final Text phoneKey = new Text();
    private final FlowSumBean flowValue = new FlowSumBean();

    /**
     * Parses one tab-separated log line and emits (phone number, flow bean).
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the raw line into its tab-separated columns.
        String[] columns = value.toString().split("\t");

        // The phone number is the second column.
        String phone = columns[1];

        // Up/down flow are counted from the end of the record, because the
        // number of middle columns varies between lines.
        int n = columns.length;
        long up = Long.parseLong(columns[n - 3]);
        long down = Long.parseLong(columns[n - 2]);

        phoneKey.set(phone);
        flowValue.setData(up, down);
        context.write(phoneKey, flowValue);
    }
}
Reducer阶段
package com.hadoop.mapreduce.flowsum;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class FlowSumReducer extends Reducer<Text, FlowSumBean, Text, FlowSumBean>{

    // Reused output value object.
    private final FlowSumBean result = new FlowSumBean();

    /**
     * Sums all up/down flow records for one phone number and emits the
     * accumulated totals (setData also derives the grand total).
     */
    @Override
    protected void reduce(Text key, Iterable<FlowSumBean> values, Context context) throws IOException, InterruptedException {
        long totalUp = 0;
        long totalDown = 0;
        for (FlowSumBean record : values) {
            totalUp += record.getUpFlow();
            totalDown += record.getDwFlow();
        }
        result.setData(totalUp, totalDown);
        context.write(key, result);
    }
}
Driver阶段
package com.hadoop.mapreduce.flowsum;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* 流量汇总需求:统计每一个用户(手机号)所耗费的总上行流量、下行流量,总流量
* 1.在mapper和reducer之间传递多个value值
* 序列化:自己实现了Writable接口(FlowSumBean),在mapper和reducer之间传递一个Bean对象。
* 也可以拼接字符串的方式来实现写出多个值。
* 2.将统计结果按照手机归属地不同省份输出到不同文件中
* 分区:自己实现了Partitioner分区器(FlowSumPartitioner)。reducer任务数量应该等于分区器分区的数量。
* 3.将统计结果按照总流量倒序排序
 * 实现WritableComparable接口,重写compareTo方法来排序。
*/
public class FlowSumDriver extends Configured implements Tool{

    /**
     * Configures and submits the flow-summary job.
     *
     * @param args args[0] = input directory, args[1] = output directory
     * @return 0 on success, 1 on failure
     */
    @Override
    public int run(String[] args) throws Exception {
        // Build the job from the configuration injected by ToolRunner.
        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf);

        // Jar containing this driver (and the mapper/reducer classes).
        job.setJarByClass(FlowSumDriver.class);

        // Mapper / Reducer for this job.
        job.setMapperClass(FlowSumMapper.class);
        job.setReducerClass(FlowSumReducer.class);

        // Custom partitioner plus a matching reduce-task count
        // (4 mapped province prefixes + 1 default bucket = 5 partitions).
        job.setPartitionerClass(FlowSumPartitioner.class);
        job.setNumReduceTasks(5);

        // Map output key-value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowSumBean.class);

        // Final output key-value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowSumBean.class);

        // Input and output directories.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit and block until completion.
        boolean result = job.waitForCompletion(true);
        return result ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        if (args == null || args.length != 2) {
            System.err.println("Usage: hadoop jar <jarname> <classname> <input path> <output path>");
            System.exit(-1);
        }
        System.exit(ToolRunner.run(new FlowSumDriver(), args));
    }
}
- 关于运行程序前一篇已经说得非常清楚了,将这里的输入文件当作下面排序的输入文件
3.将统计结果按照总流量倒序排序
- Map Task和Reduce Task均会对数据(按照key)进行排序。该操作属于Hadoop的默认行为。任何应用程序中的数据均会被排序,而不管逻辑上是否需要。
- Mapreduce框架在记录到达reducer之前按键对记录排序,但键所对应的值并没有被排序。
- 理解清楚这上面知识之后,mapper把bean作为key,手机号作为value来写出;
- context.write(bean, value); bean类须实现WritableComparable接口,并重写compareTo方法;
- 就可以利用框架来自动排序总流量了。
FlowSumSortMapper
package com.hadoop.mapreduce.flowsumsort;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import com.hadoop.mapreduce.flowsum.FlowSumBean;
public class FlowSumSortMapper extends Mapper<LongWritable, Text, FlowSumBean, Text>{

    // Reused output objects. The bean is emitted as the KEY so the framework
    // sorts records by total flow via FlowSumBean.compareTo.
    private final Text phone = new Text();
    private final FlowSumBean flowKey = new FlowSumBean();

    /**
     * Reads one line of the first job's output
     * ("phone TAB upFlow TAB dwFlow TAB sumFlow") and emits (bean, phone).
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Columns of the summary output are fixed: 0=phone, 1=up, 2=down.
        String[] cols = value.toString().split("\t");

        phone.set(cols[0]);
        // setData also recomputes the total used as the sort key.
        flowKey.setData(Long.parseLong(cols[1]), Long.parseLong(cols[2]));

        context.write(flowKey, phone);
    }
}
FlowSumSortReducer
package com.hadoop.mapreduce.flowsumsort;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import com.hadoop.mapreduce.flowsum.FlowSumBean;
public class FlowSumSortReducer extends Reducer<FlowSumBean, Text, Text, FlowSumBean>{

    /**
     * Inverts each (bean, phone) pair back to (phone, bean) for output.
     *
     * FIX: the original wrote only values.iterator().next(). If the grouping
     * comparator ever places two phones with the same total flow into one
     * reduce call, every phone after the first would be silently dropped.
     * Emitting all values is identical when each group holds a single value
     * and correct when it holds several.
     */
    @Override
    protected void reduce(FlowSumBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        for (Text phone : values) {
            context.write(phone, key);
        }
    }
}
FlowSumSortDriver
package com.hadoop.mapreduce.flowsumsort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.hadoop.mapreduce.flowsum.FlowSumBean;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* 要实现排序,把程序分两步走,第一步正常统计总流量,第二步再把结果进行排序。
* 把前面flowsum包下程序产生的输出文件,当作这里的输入文件二次加工处理排序
*/
public class FlowSumSortDriver extends Configured implements Tool{

    /**
     * Configures and submits the sort job. Input must be the output of the
     * flowsum job; records are re-emitted with the bean as the map key so the
     * framework sorts them by total flow.
     *
     * @param args args[0] = input directory, args[1] = output directory
     * @return 0 on success, 1 on failure
     */
    @Override
    public int run(String[] args) throws Exception {
        // Build the job from the configuration injected by ToolRunner.
        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf);

        // Jar containing this driver (and the mapper/reducer classes).
        job.setJarByClass(FlowSumSortDriver.class);

        // Mapper / Reducer for this job.
        job.setMapperClass(FlowSumSortMapper.class);
        job.setReducerClass(FlowSumSortReducer.class);

        // Map output key-value types (bean is the key to drive sorting).
        job.setMapOutputKeyClass(FlowSumBean.class);
        job.setMapOutputValueClass(Text.class);

        // Final output key-value types (phone back in front).
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowSumBean.class);

        // Input and output directories.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit and block until completion.
        boolean result = job.waitForCompletion(true);
        return result ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        if (args == null || args.length != 2) {
            System.err.println("Usage: hadoop jar <jarname> <classname> <input path> <output path>");
            System.exit(-1);
        }
        System.exit(ToolRunner.run(new FlowSumSortDriver(), args));
    }
}
- 要实现排序,把程序分两步走,第一步正常统计总流量,第二步再把结果进行排序。
- 把前面flowsum包下程序产生的输出文件,当作这里的输入文件二次加工处理排序。
- 输出结果:
13502468823 7335 110349 117684
13925057413 11058 48243 59301
13726238888 2481 24681 27162
13726230503 2481 24681 27162
18320173382 9531 2412 11943
13560439658 2034 5892 7926
13660577991 6960 690 7650
15013685858 3659 3538 7197
13922314466 3008 3720 6728
15920133257 3156 2936 6092
84138413 4116 1432 5548
13602846565 1938 2910 4848
18211575961 1527 2106 3633
15989002119 1938 180 2118
13560436666 1116 954 2070
13926435656 132 1512 1644
13480253104 180 180 360
13826544101 264 0 264
13926251106 240 0 240
13760778710 120 120 240
13719199419 240 0 240
测试数据
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200
1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200
1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200
1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200
1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
1363157982040 13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102 7335 110349 200
1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
1363157990043 13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63 11058 48243 200
1363157988072 13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200
1363157985066 13726238888 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157993055 13560436666 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200