需求:统计一下文件中,每一个用户所耗费的总上行流量,总下行流量,总流量
1363157985066 1372623050300-FD-07-A4-72-B8:CMCC120.196.100.82i02.c.aliimg.com2427248124681200
1363157995052 138265441015C-0E-8B-C7-F1-E0:CMCC120.197.40.4402640200
1363157991076 1392643565620-10-7A-28-CC-0A:CMCC120.196.100.99241321512200
1363154400022 139262511065C-0E-8B-8B-B1-50:CMCC120.197.40.4402400200
1363157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY120.196.100.99iface.qiyi.com视频网站151215272106200
1363157995074 841384135C-0E-8B-8C-E8-20:7DaysInn120.197.40.4122.72.52.12201641161432200
1363157993055 13560439658C4-17-FE-BA-DE-D9:CMCC120.196.100.9918151116954200
1363157995033 159201332575C-0E-8B-C7-BA-20:CMCC120.197.40.4sug.so.360.cn信息安全202031562936200
1363157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY120.196.100.82402400200
1363157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY120.197.40.4s19.cnzz.com站点统计2496960690200
1363157973098 150136858585C-0E-8B-C7-F7-90:CMCC120.197.40.4rank.ie.sogou.com搜索引擎282736593538200
1363157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY120.196.100.99www.umeng.com站点统计331938180200
1363157992093 13560439658C4-17-FE-BA-DE-D9:CMCC120.196.100.991599184938200
1363157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY120.197.40.433180180200
1363157984040 136028465655C-0E-8B-8B-B6-00:CMCC120.197.40.42052.flash2-http.qq.com综合门户151219382910200
1363157995093 1392231446600-FD-07-A2-EC-BA:CMCC120.196.100.82img.qfc.cn121230083720200
1363157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY120.196.100.99y0.ifengimg.com综合门户571027335110349200
1363157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY120.196.100.99input.shouji.sogou.com搜索引擎211895312412200
1363157990043 1392505741300-1F-64-E1-E6-9A:CMCC120.196.100.55t3.baidu.com搜索引擎69631105848243200
1363157988072 1376077871000-FD-07-A4-7B-08:CMCC120.196.100.8222120120200
1363157985066 1372623888800-FD-07-A4-72-B8:CMCC120.196.100.82i02.c.aliimg.com2427248124681200
1363157993055 13560436666C4-17-FE-BA-DE-D9:CMCC120.196.100.9918151116954200
思路:map阶段:将每一行按tab切分成各字段,提取其中的手机号作为输出key,流量信息封装到FlowBean对象中,作为输出的value
要点:自定义类型如何实现Hadoop的序列化接口
FlowBean:这种自定义数据类型必须实现Hadoop的序列化接口:Writable
实现其中的两个方法:
1.readFields(in)——反序列化方法
2.write(out)——序列化方法
reduce阶段:遍历一组数据的所有value(flowbean),进行累加,然后以手机号作为key输出,以总流量信息bean作为value输出。
代码实现
1.FlowBean
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* 本案例功能:演示自定义数据类型如何实现Hadoop的序列化接口
* 1,该类一定要保留空参构造器
* 2.write方法中输出字段二进制数据的顺序要与readFiles方法读取数据的顺序一致
*/
public class FlowBean implements Writable {
private int upFlow;
private int dFlow;
private String phone;
private int amountFlow;
public int getUpFlow() {
return upFlow;
}
public void setUpFlow(int upFlow) {
this.upFlow = upFlow;
}
public int getdFlow() {
return dFlow;
}
public void setdFlow(int dFlow) {
this.dFlow = dFlow;
}
public int getAmountFlow() {
return amountFlow;
}
public void setAmountFlow(int amountFlow) {
this.amountFlow = amountFlow;
}
public FlowBean() {
}
public FlowBean(int upFlow, int dFlow,String phone) {
this.upFlow = upFlow;
this.dFlow = dFlow;
this.phone=phone;
this.amountFlow=upFlow+dFlow;
}
/**
* hadoop 系统在序列化该类的对象时要调用得方法
* @param dataOutput
* @throws IOException
*/
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeInt(upFlow);
dataOutput.writeUTF(phone);
dataOutput.writeInt(dFlow);
dataOutput.writeInt(amountFlow);
}
/**
* hadoop系统在反序列化时要调用的方法
* @param dataInput
* @throws IOException
*/
public void readFields(DataInput dataInput) throws IOException {
this.upFlow=dataInput.readInt();
this.phone=dataInput.readUTF();
this.dFlow=dataInput.readInt();
this.amountFlow=dataInput.readInt();
}
@Override
public String toString() {
return this.upFlow+","+this.dFlow+","+this.amountFlow;
}
}
2.FlowCountMapper
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Map stage: splits each tab-separated input line, extracts the phone number
 * (field index 1) as the output key, and wraps the up/down flow values in a
 * FlowBean as the output value.
 *
 * NOTE: the original extended the raw type {@code Mapper}; with a raw
 * supertype, map(LongWritable, Text, Context) is an overload rather than an
 * override, so it would never be invoked by the framework. The generic
 * parameters below fix that.
 */
public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split("\t");
        String phone = fields[1];
        // Records have a variable number of middle fields (URL/category may be
        // absent), so the flow columns are addressed from the end of the line.
        int upFlow = Integer.parseInt(fields[fields.length - 3]);
        int dFlow = Integer.parseInt(fields[fields.length - 2]);
        context.write(new Text(phone), new FlowBean(upFlow, dFlow, phone));
    }
}
3.FlowCountReduce
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reduce stage: sums all up/down flow values for one phone number and emits
 * the phone number with a FlowBean holding the totals.
 *
 * NOTE: the original extended the raw type {@code Reducer} and declared a raw
 * {@code Iterable values}; iterating that as {@code FlowBean} does not compile
 * (elements are {@code Object}). The generic parameters below fix that.
 */
public class FlowCountReduce extends Reducer<Text, FlowBean, Text, FlowBean> {

    /**
     * @param key     the phone number
     * @param values  flow data from every record produced by this phone number
     * @param context Hadoop output collector
     * @throws IOException          on output failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context)
            throws IOException, InterruptedException {
        int upSum = 0;
        int dSum = 0;
        for (FlowBean value : values) {
            upSum += value.getUpFlow();
            dSum += value.getdFlow();
        }
        // The FlowBean constructor computes the total (up + down) itself.
        context.write(key, new FlowBean(upSum, dSum, key.toString()));
    }
}
4.JobSubmitter
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Configures and submits the flow-count MapReduce job.
 *
 * Usage: JobSubmitter [inputPath] [outputPath]
 * When no arguments are given, the original hard-coded local Windows paths
 * are used, so existing invocations keep working unchanged.
 */
public class JobSubmitter {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(JobSubmitter.class);
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReduce.class);

        // Map output and final output use the same key/value types here,
        // but both are set explicitly for clarity.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // Allow overriding the hard-coded paths from the command line.
        String input = args.length > 0 ? args[0] : "F:\\mrdata\\flow\\input";
        String output = args.length > 1 ? args[1] : "F:\\mrdata\\flow\\output";
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : -1);
    }
}
5.JobSubmitter程序运行统计结果【手机号 上行流量 下行流量 总流量】
13480253104180,180,360
135024688237335,110349,117684
135604366661116,954,2070
135604396582034,5892,7926
136028465651938,2910,4848
136605779916960,690,7650
13719199419240,0,240
137262305032481,24681,27162
137262388882481,24681,27162
13760778710120,120,240
13826544101264,0,264
139223144663008,3720,6728
1392505741311058,48243,59301
13926251106240,0,240
13926435656132,1512,1644
150136858583659,3538,7197
159201332573156,2936,6092
159890021191938,180,2118
182115759611527,2106,3633
183201733829531,2412,11943
841384134116,1432,5548