Here we do a simple mobile traffic analysis based on an excerpt of a log file. In each record the second column is the phone number, and the third- and second-to-last columns are the upstream and downstream traffic in bytes:
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200
1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200
1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200
1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200
1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
The output after running the job (over the full log, so it contains more phone numbers than the excerpt above) is as follows; the columns are phone number, total upstream flow, total downstream flow, and overall total:
13480253104 180 180 360
13502468823 7335 110349 117684
13560436666 3597 25635 29232
13560439658 2034 5892 7926
13602846565 1938 2910 4848
13660577991 6960 690 7650
13719199419 240 0 240
13726230503 2481 24681 27162
13760778710 120 120 240
13826544101 264 0 264
13922314466 3008 3720 6728
13925057413 11058 48243 59301
13926251106 240 0 240
13926435656 132 1512 1644
15013685858 3659 3538 7197
15920133257 3156 2936 6092
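For example, 13560439658 appears twice in the excerpt (upstream 1116 + 918 = 2034, downstream 954 + 4938 = 5892), which gives the 2034 5892 7926 row above.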
1: Wrote the FlowBean class, with get/set, toString, and the Writable serialization methods
package com.zhang_bigdata02;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * How do we wrap a custom data type for Hadoop?
 * Hadoop's built-in data types implement the serialization interface (Writable),
 * so a custom type must implement the same interface.
 */
public class FlowBean implements Writable {
//Fields: upstream flow, downstream flow, and their sum
private long upFlow;
private long dfFlow;
private long flowsum;
public FlowBean() {}
public FlowBean(long upFlow,long dfFlow) {
this.upFlow = upFlow;
this.dfFlow = dfFlow;
this.flowsum = upFlow + dfFlow;
}
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDfFlow() {
return dfFlow;
}
public void setDfFlow(long dfFlow) {
this.dfFlow = dfFlow;
}
public long getFlowsum() {
return flowsum;
}
public void setFlowsum(long flowsum) {
this.flowsum = flowsum;
}
//Serialization: write the fields to the output stream
@Override
public void write(DataOutput out) throws IOException {
out.writeLong(upFlow);
out.writeLong(dfFlow);
out.writeLong(flowsum);
}
//Deserialization: read the fields back in the same order they were written
@Override
public void readFields(DataInput in) throws IOException {
upFlow = in.readLong();
dfFlow = in.readLong();
flowsum = in.readLong();
}
@Override
public String toString() {
return upFlow + "\t" + dfFlow + "\t" + flowsum;
}
}
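As a quick sanity check that write and readFields are symmetric, the bean can be round-tripped through plain Java streams. The small test class below is my own sketch (the class name FlowBeanRoundTripTest is not part of the original project):
package com.zhang_bigdata02;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class FlowBeanRoundTripTest {
    public static void main(String[] args) throws IOException {
        //Serialize a bean into an in-memory buffer
        FlowBean original = new FlowBean(1116, 954);
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));
        //Deserialize from the same bytes; the fields come back in write order
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(original);
        System.out.println(copy);
        //Both lines print 1116, 954, 2070 separated by tabs
    }
}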
2: Wrote the FlowCountMapper class
package com.zhang_bigdata02;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * KeyIn: LongWritable (byte offset of the line in the input file)
 * ValueIn: Text (one log line)
 *
 * Approach: pick the output KV types from the result we want:
 * phone number -> total flow (upstream + downstream), carried by a custom bean
 * KeyOut: Text (phone number)
 * ValueOut: FlowBean
 *
 */
public class FlowCountMapper extends Mapper<LongWritable, Text,Text, FlowBean> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//1: Read the input line
String line = value.toString();
//2: Split on \t
String[] fileds = line.split("\t");
//3: Extract the key fields: phone number, upstream flow, downstream flow
String phoneNr = fileds[1];
long upFlow = Long.parseLong(fileds[fileds.length - 3]);
long dfFlow = Long.parseLong(fileds[fileds.length - 2]);
//4: Emit to the Reducer
context.write(new Text(phoneNr),new FlowBean(upFlow,dfFlow));
}
}
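The fields[fileds.length - 3] / fields[fileds.length - 2] indexing is used because records have a variable number of columns (some carry a domain and a category, others do not), while the last three columns are always upstream bytes, downstream bytes, and a status code. Below is a standalone sketch of that parsing logic on one tab-separated line from the sample log; the class FlowLineParseDemo is my own illustration, not project code:
package com.zhang_bigdata02;
public class FlowLineParseDemo {
    public static void main(String[] args) {
        //One tab-separated record from the sample log (no domain/category columns)
        String line = "1363157993055\t13560439658\tC4-17-FE-BA-DE-D9:CMCC\t120.196.100.99"
                + "\t18\t15\t1116\t954\t200";
        String[] fileds = line.split("\t");
        //Same indexing as FlowCountMapper: phone number is the 2nd column,
        //upstream/downstream bytes are always the 3rd- and 2nd-to-last columns
        String phoneNr = fileds[1];
        long upFlow = Long.parseLong(fileds[fileds.length - 3]);
        long dfFlow = Long.parseLong(fileds[fileds.length - 2]);
        System.out.println(phoneNr + " up=" + upFlow + " down=" + dfFlow);
        //Expected: 13560439658 up=1116 down=954
    }
}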
3: Wrote the FlowCountReducer class
package com.zhang_bigdata02;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowCountReducer extends Reducer<Text,FlowBean,Text,FlowBean> {
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
long upFlow_sum = 0;
long dfFlow_sum = 0;
for (FlowBean v:values) {
upFlow_sum += v.getUpFlow();
dfFlow_sum += v.getDfFlow();
}
FlowBean rsSum = new FlowBean(upFlow_sum, dfFlow_sum);
//Write out the aggregated result
context.write(key,rsSum);
}
}
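Since the per-phone aggregation is a plain sum (associative and commutative) and the Reducer's input and output types are both (Text, FlowBean), the same class could optionally be registered as a map-side Combiner to shrink shuffle traffic. This is my own optional suggestion, not part of the original driver; the extra line in FlowCountDriver would be:
//Optional: reuse the Reducer as a Combiner so partial sums are computed on the map side
job.setCombinerClass(FlowCountReducer.class);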
4: Added the PhonenumPartitioner class to partition output by phone number prefix
package com.zhang_bigdata02;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Custom partitioning by phone number: based on the first 3 digits.
 * Default partitioning: hash of the key.
 *
 * Summary:
 * 1: Write a class extending Partitioner<key, value>
 * 2: Override getPartition()
 * 3: Decide the partition with your business logic
 * 4: Register it in the driver with setPartitionerClass
 * 5: Note: setNumReduceTasks must match the total number of partitions
 *    (here the 4 listed prefixes plus 1 catch-all partition = 5)
 */
public class PhonenumPartitioner extends Partitioner<Text,FlowBean> {
@Override
public int getPartition(Text key, FlowBean value, int numPartitions) {
//1: Take the first three digits of the phone number
String phoneNum = key.toString().substring(0, 3);
//2: Map the prefix to a partition; any other prefix falls into the catch-all partition 4
int partitioner = 4;
if("135".equals(phoneNum)) {
return 0;
}else if("136".equals(phoneNum)) {
return 1;
}else if("137".equals(phoneNum)) {
return 2;
}else if("138".equals(phoneNum)) {
return 3;
}
return partitioner;
}
}
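An equivalent, slightly more compact way to express the same prefix-to-partition mapping is a lookup table. The class below (PhonenumPartitioner2) is only an alternative sketch of my own; the if/else version above behaves identically:
package com.zhang_bigdata02;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import java.util.HashMap;
import java.util.Map;
public class PhonenumPartitioner2 extends Partitioner<Text, FlowBean> {
    //Prefix -> partition lookup; anything not listed falls into partition 4
    private static final Map<String, Integer> PREFIX_TO_PARTITION = new HashMap<>();
    static {
        PREFIX_TO_PARTITION.put("135", 0);
        PREFIX_TO_PARTITION.put("136", 1);
        PREFIX_TO_PARTITION.put("137", 2);
        PREFIX_TO_PARTITION.put("138", 3);
    }
    @Override
    public int getPartition(Text key, FlowBean value, int numPartitions) {
        String prefix = key.toString().substring(0, 3);
        Integer p = PREFIX_TO_PARTITION.get(prefix);
        return p == null ? 4 : p;
    }
}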
5: Wrote the FlowCountDriver class
package com.zhang_bigdata02;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class FlowCountDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
args = new String[]{"E:\\bigdata_code\\wcword.txt", "E:\\bigdata_code\\out"};
//1: Create the job
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
//2: Set the jar location
job.setJarByClass(FlowCountDriver.class);
//3: Set the Mapper class
job.setMapperClass(FlowCountMapper.class);
//4: Set the Reducer class
job.setReducerClass(FlowCountReducer.class);
//5: Set the Mapper output key/value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
//6: Set the final (Reducer) output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
//Use CombineTextInputFormat for input splits, to optimize jobs with many small files
job.setInputFormatClass(CombineTextInputFormat.class);
//Split size bounds: at most 8 MB, at least 4 MB
CombineTextInputFormat.setMaxInputSplitSize(job,8*1024*1024);
CombineTextInputFormat.setMinInputSplitSize(job,4*1024*1024);
//Register the custom phone-number partitioner. Phone numbers outside the four listed prefixes also need a partition, so 5 reduce tasks are required in total. This and the CombineTextInputFormat settings above are independent options.
job.setPartitionerClass(PhonenumPartitioner.class);
job.setNumReduceTasks(5);
//7: Set the input path
FileInputFormat.setInputPaths(job, new Path(args[0]));
//8: Set the output path
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//9: Submit the job and wait for completion
boolean rs = job.waitForCompletion(true);
System.exit(rs ? 0 : 1);
}
}
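One practical tweak when re-running the job: FileOutputFormat fails if the output directory already exists, so it can be deleted before submission. This addition is my own, not part of the original driver; placed before FileOutputFormat.setOutputPath in main it would look roughly like this (it also needs import org.apache.hadoop.fs.FileSystem;):
//Delete the output directory if it already exists, so the job can be re-run
FileSystem fs = FileSystem.get(conf);
Path outPath = new Path(args[1]);
if (fs.exists(outPath)) {
    fs.delete(outPath, true); //true = delete recursively
}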