需求
统计每一个手机号耗费的总上行流量、总下行流量和总流量。
数据如下。
手机号 | IP | 上行流量 | 下行流量 | 状态 |
---|---|---|---|---|
13012345678 | 153.31.146.171 | 995 | 800 | 200 |
13112345678 | 193.179.2.134 | 1024 | 2501 | 200 |
13512345678 | 227.39.131.173 | 1232 | 890 | 200 |
13312345678 | 236.15.230.17 | 1231 | 257 | 200 |
13112345678 | 84.227.134.147 | 444 | 894 | 200 |
13412345678 | 193.179.2.134 | 567 | 123 | 200 |
13212345678 | 193.179.2.134 | 7655 | 534 | 200 |
13512345678 | 84.227.134.147 | 123 | 123 | 200 |
☁ input pwd
/Users/ylj/demo/input
☁ input cat mobile.txt
13012345678 153.31.146.171 995 800 200
13112345678 193.179.2.134 1024 2501 200
13512345678 227.39.131.173 1232 890 200
13312345678 236.15.230.17 1231 257 200
13112345678 84.227.134.147 444 894 200
13412345678 193.179.2.134 567 123 200
13212345678 193.179.2.134 7655 534 200
13512345678 84.227.134.147 123 123 200
分析
输入格式(mobile.txt 中每行的各字段以制表符 \t 分隔,mapper 按 "\t" 切分):
手机号 | IP | 上行流量 | 下行流量 | 状态 |
---|---|---|---|---|
13012345678 | 153.31.146.171 | 995 | 800 | 200 |
输出格式:
手机号 | 上行流量 | 下行流量 | 总流量 |
---|---|---|---|
13012345678 | 995 | 800 | 1795 |
代码实现
编写流量统计的 bean 对象
package com.yljphp.mapreduce.flowsum;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Hadoop {@link Writable} value holding the upstream, downstream and total
 * traffic recorded for one phone number.
 *
 * <p>The bean is serialized as three consecutive {@code long} values in the
 * order upFlow, downFlow, sumFlow; {@link #readFields(DataInput)} reads them
 * back in the same order.
 *
 * <p>{@code sumFlow} is always kept equal to {@code upFlow + downFlow} by the
 * two-argument constructor, by {@link #set(long, long)} and by the individual
 * setters.
 */
public class FlowBean implements Writable {

    /** Upstream traffic. */
    private long upFlow;

    /** Downstream traffic. */
    private long downFlow;

    /** Total traffic (upstream + downstream). */
    private long sumFlow;

    /** No-arg constructor, kept so the bean can be instantiated reflectively. */
    public FlowBean() {
    }

    /**
     * Creates a bean and derives the total from the two parts.
     *
     * @param upFlow   upstream traffic
     * @param downFlow downstream traffic
     */
    public FlowBean(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = upFlow + downFlow;
    }

    /**
     * Serializes the three fields in the order upFlow, downFlow, sumFlow.
     *
     * @param out sink to write to
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
        out.writeLong(sumFlow);
    }

    /**
     * Deserializes the three fields; the read order must match {@link #write(DataOutput)}.
     *
     * @param in source to read from
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        this.sumFlow = in.readLong();
    }

    /**
     * Renders the bean as {@code upFlow<TAB>downFlow<TAB>sumFlow}, so it can be
     * written straight into a text output.
     */
    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + sumFlow;
    }

    /** @return upstream traffic */
    public long getUpFlow() {
        return upFlow;
    }

    /**
     * Sets the upstream traffic and refreshes the total so it never goes stale.
     *
     * @param upFlow upstream traffic
     */
    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
        this.sumFlow = this.upFlow + this.downFlow;
    }

    /** @return downstream traffic */
    public long getDownFlow() {
        return downFlow;
    }

    /**
     * Sets the downstream traffic and refreshes the total so it never goes stale.
     *
     * @param downFlow downstream traffic
     */
    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
        this.sumFlow = this.upFlow + this.downFlow;
    }

    /** @return total traffic (upstream + downstream) */
    public long getSumFlow() {
        return sumFlow;
    }

    /**
     * Sets both parts at once and derives the total.
     *
     * @param upFlow   upstream traffic
     * @param downFlow downstream traffic
     */
    public void set(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = upFlow + downFlow;
    }
}
编写 mapper
package com.yljphp.mapreduce.flowsum;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper that turns one tab-separated input line into a
 * (phone number, {@link FlowBean}) pair.
 *
 * <p>Field 0 of the line is used as the output key; fields 2 and 3 are parsed
 * as the upstream and downstream traffic. Lines with fewer than four fields
 * or with non-numeric traffic values are skipped and counted in the
 * {@code FlowCount / MALFORMED_LINES} counter instead of failing the task.
 */
public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

    /** Counter group used to report records that could not be parsed. */
    private static final String COUNTER_GROUP = "FlowCount";

    /** Counter name used to report records that could not be parsed. */
    private static final String MALFORMED_COUNTER = "MALFORMED_LINES";

    // Output value and key objects are created once and refilled on every map() call.
    FlowBean v = new FlowBean();
    Text k = new Text();

    /**
     * Parses one input line and emits the phone number with its traffic figures.
     *
     * @param key     input key supplied by the framework (not used here)
     * @param value   one line of input text
     * @param context used to emit the output pair and to update the counter
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Read the line and split it on tabs.
        String line = value.toString();
        String[] fields = line.split("\t");

        // 2. A blank or truncated line has no traffic columns; skip it rather than
        //    letting fields[2] / fields[3] throw ArrayIndexOutOfBoundsException.
        if (fields.length < 4) {
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        // 3. Parse upstream and downstream traffic; skip the record if either is not a number.
        long upFlow;
        long downFlow;
        try {
            upFlow = Long.parseLong(fields[2]);
            downFlow = Long.parseLong(fields[3]);
        } catch (NumberFormatException ignored) {
            // Deliberately skipped: the bad record is reported through the counter.
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        // 4. Fill the reusable key/value objects and emit them.
        v.set(upFlow, downFlow);
        k.set(fields[0]);
        context.write(k, v);
    }
}
编写 reducer
package com.yljphp.mapreduce.flowsum;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer that adds up the upstream and downstream traffic of every
 * {@link FlowBean} received for one key and emits a single bean holding the
 * totals.
 */
public class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

    // Output value object, created once and refilled on every reduce() call.
    FlowBean v = new FlowBean();

    /**
     * Sums the traffic of all beans grouped under {@code key} and writes the result.
     *
     * @param key     the grouping key (the phone number emitted by the mapper)
     * @param values  all beans emitted for that key
     * @param context used to emit the aggregated pair
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
        long totalUp = 0L;
        long totalDown = 0L;

        // Accumulate upstream and downstream traffic separately over all beans.
        for (FlowBean bean : values) {
            totalUp = totalUp + bean.getUpFlow();
            totalDown = totalDown + bean.getDownFlow();
        }

        // set() also derives the overall total from the two sums.
        v.set(totalUp, totalDown);

        // Emit one aggregated record for this key.
        context.write(key, v);
    }
}
编写驱动
package com.yljphp.mapreduce.flowsum;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Entry point that configures and submits the flow-count MapReduce job.
 *
 * <p>Usage: {@code FlowCountDriver <input path> <output path>}
 */
public class FlowCountDriver {

    /**
     * Configures the job, runs it, and exits with 0 on success or 1 on failure.
     * Exits with 2 and prints a usage message when the two paths are not supplied.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IOException            if the job cannot be set up or submitted
     * @throws ClassNotFoundException if a job class cannot be resolved
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Fail fast with a readable message instead of an ArrayIndexOutOfBoundsException.
        if (args == null || args.length < 2) {
            System.err.println("Usage: FlowCountDriver <input path> <output path>");
            System.exit(2);
            return;
        }

        // 1. Obtain the configuration and a job instance.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2. Locate the jar that contains this program via the driver class.
        job.setJarByClass(FlowCountDriver.class);

        // 3. Register the mapper and reducer classes of this job.
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReducer.class);

        // 4. Declare the key/value types produced by the mapper.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);

        // 5. Declare the key/value types of the final output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // 6. Set the input and output paths of the job.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7. Submit the job and block until it finishes.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
输出结果
☁ output pwd
/Users/ylj/demo/output
☁ output cat part-r-00000
13012345678 995 800 1795
13112345678 1468 3395 4863
13212345678 7655 534 8189
13312345678 1231 257 1488
13412345678 567 123 690
13512345678 1355 1013 2368