FlowCountMapper.java
package os.os.flowcount;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Map stage of the flow-count job: turns each log line into a
 * {@code <phone, FlowBean>} pair.
 *
 * <ul>
 *   <li>KEYIN: starting offset of one line within the log file</li>
 *   <li>VALUEIN: the content of that log line</li>
 *   <li>KEYOUT: the key emitted by the map stage (the phone number)</li>
 *   <li>VALUEOUT: the traffic information, as a {@link FlowBean}</li>
 * </ul>
 *
 * <p>Lines that cannot be parsed are skipped and counted in the
 * {@code FlowCount / MalformedLines} job counter instead of failing the task.
 */
public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

    /**
     * Minimum number of tab-separated fields a usable record must have.
     * The phone number is read from index 1 and the flows from {@code length - 3}
     * and {@code length - 2}; those positions are only distinct and in range
     * when there are at least five fields.
     */
    private static final int MIN_FIELDS = 5;

    /** Counter group used to report skipped records. */
    private static final String COUNTER_GROUP = "FlowCount";

    /** Counter name for lines that were skipped because they could not be parsed. */
    private static final String MALFORMED_COUNTER = "MalformedLines";

    /**
     * Parses one log line and emits the phone number with its upstream/downstream flow.
     *
     * @param key     offset of the line in the input (unused)
     * @param value   one line of the log
     * @param context task context used to emit output and update counters
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Convert the log line to a String and split it into fields on tabs.
        String line = value.toString();
        String[] fields = StringUtils.split(line, "\t");

        // Blank or truncated lines would otherwise index out of bounds below.
        if (fields == null || fields.length < MIN_FIELDS) {
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        // Extract the fields we need; flows are addressed from the end of the record.
        String phone = fields[1];
        long upFlow;
        long downFlow;
        try {
            upFlow = Long.parseLong(fields[fields.length - 3]);
            downFlow = Long.parseLong(fields[fields.length - 2]);
        } catch (NumberFormatException e) {
            // A non-numeric flow column means the record is corrupt: skip it, but keep it visible.
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        // Emit the <phone, FlowBean> pair.
        context.write(new Text(phone), new FlowBean(upFlow, downFlow));
    }
}
FlowCountReduce.java
package os.os.flowcount;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class FlowCountReduce extends Reducer<Text, FlowBean, Text, FlowBean>{
/*
* key是一个手机号
* values是这个手机号对应的所有kv
* @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
*/
protected void reduce(Text key, java.lang.Iterable<FlowBean> values, Context context) throws java.io.IOException ,InterruptedException {
long upflowSum = 0;//上行流量的和
long downflowSum = 0; //下行流量和
for(FlowBean value: values) {
upflowSum += value.getUpflow();
downflowSum += value.getDownflow();
}
FlowBean bean = new FlowBean(upflowSum,downflowSum);
context.write(key, bean);
}
}
FlowBean.java
package os.os.flowcount;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
 * Custom value type carrying upstream, downstream and total flow.
 *
 * <p>To be passed around inside a Hadoop cluster the type has to take part in
 * Hadoop's serialization framework, which is done by implementing {@link Writable}.
 *
 * <p>Only {@code upflow} and {@code downflow} are written to the wire;
 * {@code sumflow} is recomputed from them when the bean is read back.
 * The individual setters do not recompute {@code sumflow}.
 */
public class FlowBean implements Writable {

    private long upflow;   // upstream flow
    private long downflow; // downstream flow
    private long sumflow;  // total flow

    /** No-argument constructor; required because instances are created via reflection. */
    public FlowBean() {
    }

    /**
     * Creates a bean from the two flow values and derives the total.
     *
     * @param upflow   upstream flow
     * @param downflow downstream flow
     */
    public FlowBean(long upflow, long downflow) {
        this.upflow = upflow;
        this.downflow = downflow;
        this.sumflow = upflow + downflow;
    }

    /** @return the upstream flow */
    public long getUpflow() {
        return upflow;
    }

    /** @param upflow the upstream flow to set */
    public void setUpflow(long upflow) {
        this.upflow = upflow;
    }

    /** @return the downstream flow */
    public long getDownflow() {
        return downflow;
    }

    /** @param downflow the downstream flow to set */
    public void setDownflow(long downflow) {
        this.downflow = downflow;
    }

    /** @return the total flow */
    public long getSumflow() {
        return sumflow;
    }

    /** @param sumflow the total flow to set */
    public void setSumflow(long sumflow) {
        this.sumflow = sumflow;
    }

    /**
     * Serialization: writes the fields to be transferred into the byte stream.
     *
     * @param out sink to write to
     * @throws IOException if writing fails
     * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upflow);
        out.writeLong(downflow);
    }

    /**
     * Deserialization: restores the fields from the byte stream, in the same
     * order in which {@link #write(DataOutput)} wrote them.
     *
     * @param in source to read from
     * @throws IOException if reading fails
     * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        upflow = in.readLong();
        downflow = in.readLong();
        // sumflow is not part of the wire format; rebuild it here so a
        // deserialized (possibly reused) instance never reports a zero or stale total.
        sumflow = upflow + downflow;
    }

    @Override
    public String toString() {
        return "FlowBean [upflow=" + upflow + ", downflow=" + downflow + ", sumflow=" + sumflow + "]";
    }
}
FlowCountJob.java
package os.os.flowcount;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver that configures and submits the flow-count MapReduce job.
 */
public class FlowCountJob {

    /**
     * Configures the job and runs it to completion.
     *
     * <p>Exits with status 0 when the job succeeds, 1 when it fails, and 2
     * when the command-line arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a configured job class cannot be loaded
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: FlowCountJob <input path> <output path>");
            System.exit(2);
        }

        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(FlowCountJob.class);

        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReduce.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate the job outcome so calling scripts can detect a failed run.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
日志格式
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200