Hadoop mapreduce 输出结果实现自定义总流量倒序排序
第一遍数据清洗java代码:
JavaBean
package com.cjp.sumflow;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
 * Value bean carrying one phone number's traffic counters
 * (upstream, downstream, and their sum) through the MapReduce job.
 *
 * Hadoop serializes this bean via {@link #write(DataOutput)} and
 * rebuilds it via {@link #readFields(DataInput)}; the field order of
 * the two methods must stay identical (phone, up, down, sum).
 */
public class FlowBean implements Writable {
    // Internal state; exposed through the original getter/setter names.
    private String phoneNum;   // subscriber phone number
    private long up_flow;      // upstream bytes
    private long d_flow;       // downstream bytes
    private long s_flow;       // total bytes (up + down)

    /** No-arg constructor required by Hadoop's reflection-based deserialization. */
    public FlowBean() {
        super();
    }

    /**
     * Builds a bean from the two raw counters; the total is derived,
     * so callers never pass it in.
     */
    public FlowBean(String phoneNum, long up_flow, long d_flow) {
        super();
        this.phoneNum = phoneNum;
        this.up_flow = up_flow;
        this.d_flow = d_flow;
        this.s_flow = up_flow + d_flow;
    }

    public String getPhoneNum() {
        return phoneNum;
    }

    public void setPhoneNum(String phoneNum) {
        this.phoneNum = phoneNum;
    }

    public long getUp_flow() {
        return up_flow;
    }

    public void setUp_flow(long up_flow) {
        this.up_flow = up_flow;
    }

    public long getD_flow() {
        return d_flow;
    }

    public void setD_flow(long d_flow) {
        this.d_flow = d_flow;
    }

    public long getS_flow() {
        return s_flow;
    }

    public void setS_flow(long s_flow) {
        this.s_flow = s_flow;
    }

    /** Tab-separated line: phone, up, down, total — the job's output record format. */
    @Override
    public String toString() {
        StringBuilder line = new StringBuilder(phoneNum);
        line.append('\t').append(up_flow);
        line.append('\t').append(d_flow);
        line.append('\t').append(s_flow);
        return line.toString();
    }

    /** Deserializes fields in the exact order {@link #write(DataOutput)} emitted them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        phoneNum = in.readUTF();
        up_flow = in.readLong();
        d_flow = in.readLong();
        s_flow = in.readLong();
    }

    /** Serializes fields; order must match {@link #readFields(DataInput)}. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(phoneNum);
        out.writeLong(up_flow);
        out.writeLong(d_flow);
        out.writeLong(s_flow);
    }
}
map
package com.cjp.sumflow;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.commons.lang.StringUtils;
/**
 * First-pass mapper: parses one raw, tab-separated log line and emits
 * (phone number, FlowBean) so the reducer can sum traffic per phone.
 */
public class Maps extends Mapper<LongWritable, Text, Text, FlowBean> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = StringUtils.split(value.toString(), "\t");
        // Column 1 is the phone number; columns 5 and 6 are the
        // upstream / downstream counters -- assumes this fixed raw-log
        // layout, TODO confirm against the input schema.
        String phone = fields[1];
        long upBytes = Long.parseLong(fields[5]);
        long downBytes = Long.parseLong(fields[6]);
        context.write(new Text(phone), new FlowBean(phone, upBytes, downBytes));
    }
}
reduce
package com.cjp.sumflow;
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * First-pass reducer: sums all upstream/downstream counters for one
 * phone number and writes a single aggregated FlowBean per phone
 * (key position, NullWritable value), so FlowBean.toString() becomes
 * the output line.
 */
public class Reduces extends Reducer<Text, FlowBean, FlowBean, NullWritable> {
    @Override
    protected void reduce(Text phone, Iterable<FlowBean> beans, Context context)
            throws IOException, InterruptedException {
        long upTotal = 0L;
        long downTotal = 0L;
        // Accumulate every record emitted by the mapper for this phone.
        for (FlowBean bean : beans) {
            upTotal += bean.getUp_flow();
            downTotal += bean.getD_flow();
        }
        // The FlowBean constructor derives the total (up + down) itself.
        context.write(new FlowBean(phone.toString(), upTotal, downTotal), NullWritable.get());
    }
}
经Hadoop jar 运算结果进行初步清洗计算后得到干净数据,如下
[root@hadoop ~]# hadoop fs -cat /flow/output/part-r-00000
13341414499 2736 810 3546 //手机号 上传流量 下载流量 总流量
13357024777 30 936 966
13357183311 174 152 326
13944131313 260 662685 662945
15310952123 936 1404 2340
15321168157 9368 924 10292
15360724444 225 1155 1380
15366243999 280 1070 1350
15371842777 2592 192 2784
15377892222 492 2060 2552
15388889557 131 1213 1344
15388889881 12 123 135
15399997474 20556 21126 41682
17312434888 2076 93696 95772
17314210999 112 430 542
17703771999 2134 123 2257
17719830222 25 23141 23166
17719860222 197394 1030 198424
17733299995 249350 134 249484
17739116888 54 7856 7910
17739127999 0 0 0
18013497666 234 4235 4469
18015629333 914 712 1626
18016533288 360 1224 1584
18026666666 0 4800 4800
18051549666 12 86 98
18083815777 13515 156 13671
18099913682 13 1234 1247
18100800709 1410 1926 3336
18120085511 115 105 220
18135679777 1234 54 1288
18144442222 1758 30 1788
18150000004 523 234 757
18161236777 8 246 254
18168526111 1249812 492 1250304
18187879292 156 484 640
18383838384 828 13544 14372
18650718998 2576 4452 7028
18867299999 1434 21318 22752
18913626262 5 637 642
18915648666 1068 74 1142
18948121234 11725 2655 14380
18991370888 16 13564 13580
以清洗好的数据为数据源,进行数据排序:
默认情况下mapreduce是根据map输出的结果在缓存中根据K-V中的KEY值进行排序;
所以,我们改变map输出结果的类型,让javaBean实现WritableComparable接口,重写compareTo方法,以javaBean的形式输出;
在缓存中排序时按照javaBean中的排序方式进行排序:
修改后的javaBean
package com.cjp.sumflow;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
/**
 * Key/value bean carrying one phone number's traffic counters.
 *
 * Implements {@link WritableComparable} so it can be used as a map
 * output KEY: the shuffle sort then orders records by total traffic,
 * descending (see {@link #compareTo(FlowBean)}).
 *
 * Serialization field order in {@link #write(DataOutput)} and
 * {@link #readFields(DataInput)} must stay identical.
 */
public class FlowBean implements WritableComparable<FlowBean> {
    private String phoneNum;   // subscriber phone number
    private long up_flow;      // upstream bytes
    private long d_flow;       // downstream bytes
    private long s_flow;       // total bytes (up + down)

    /** No-arg constructor required by Hadoop's reflection-based deserialization. */
    public FlowBean() {
        super();
    }

    /** The total is derived from the two counters; callers never pass it. */
    public FlowBean(String phoneNum, long up_flow, long d_flow) {
        super();
        this.phoneNum = phoneNum;
        this.up_flow = up_flow;
        this.d_flow = d_flow;
        this.s_flow = up_flow + d_flow;
    }

    public String getPhoneNum() {
        return phoneNum;
    }

    public void setPhoneNum(String phoneNum) {
        this.phoneNum = phoneNum;
    }

    public long getUp_flow() {
        return up_flow;
    }

    public void setUp_flow(long up_flow) {
        this.up_flow = up_flow;
    }

    public long getD_flow() {
        return d_flow;
    }

    public void setD_flow(long d_flow) {
        this.d_flow = d_flow;
    }

    public long getS_flow() {
        return s_flow;
    }

    public void setS_flow(long s_flow) {
        this.s_flow = s_flow;
    }

    /** Tab-separated output line: phone, up, down, total. */
    @Override
    public String toString() {
        return phoneNum + "\t" + up_flow + "\t" + d_flow + "\t" + s_flow;
    }

    @Override
    public void readFields(DataInput d) throws IOException {
        phoneNum = d.readUTF();
        up_flow = d.readLong();
        d_flow = d.readLong();
        s_flow = d.readLong();
    }

    @Override
    public void write(DataOutput d) throws IOException {
        d.writeUTF(phoneNum);
        d.writeLong(up_flow);
        d.writeLong(d_flow);
        d.writeLong(s_flow);
    }

    /**
     * Orders beans by total traffic, DESCENDING, then by phone number.
     *
     * The original version ({@code s_flow > o.s_flow ? -1 : 1}) never
     * returned 0 and returned 1 for equal totals, violating the
     * compareTo contract (asymmetric for ties), which makes the shuffle
     * sort's behavior undefined. The phoneNum tie-break keeps distinct
     * phones with equal totals as distinct keys, so the sorting
     * reducer's grouping does not collapse (and drop) them.
     */
    @Override
    public int compareTo(FlowBean o) {
        int byTotal = Long.compare(o.s_flow, this.s_flow); // descending
        return byTotal != 0 ? byTotal : this.phoneNum.compareTo(o.phoneNum);
    }

    /**
     * Equality consistent with {@link #compareTo(FlowBean)}: same total
     * and same phone number. Needed (with {@link #hashCode()}) so the
     * default HashPartitioner partitions deterministically when the job
     * runs with more than one reducer.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof FlowBean)) {
            return false;
        }
        FlowBean other = (FlowBean) obj;
        return s_flow == other.s_flow
                && (phoneNum == null ? other.phoneNum == null : phoneNum.equals(other.phoneNum));
    }

    @Override
    public int hashCode() {
        int result = phoneNum == null ? 0 : phoneNum.hashCode();
        return 31 * result + Long.hashCode(s_flow);
    }
}
排序的mapreduce
package com.cjp.sumflow2;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.cjp.sumflow.FlowBean;
import com.cjp.sumflow.Maps;
import com.cjp.sumflow.Reduces;
import com.cjp.sumflow.SumRunner;
/**
 * Second-pass job: reads the cleaned per-phone totals and re-sorts them
 * by total traffic, descending, by emitting FlowBean as the map output
 * KEY (FlowBean.compareTo drives the shuffle sort).
 *
 * Usage: SortFlow &lt;input path&gt; &lt;output path&gt;
 */
public class SortFlow {
    /**
     * Parses one cleaned line (phone, up, down, total — tab-separated)
     * and emits the bean as the key so the framework sorts by it.
     */
    public static class Maps extends Mapper<LongWritable, Text, FlowBean, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] split = StringUtils.split(line, "\t");
            String phoneNum = split[0];
            long u_flow = Long.parseLong(split[1]);
            long d_flow = Long.parseLong(split[2]);
            // The bean re-derives the total; column 3 of the input is ignored.
            context.write(new FlowBean(phoneNum, u_flow, d_flow), NullWritable.get());
        }
    }

    /** Identity reducer: writes each already-sorted key straight through. */
    public static class Reduces extends Reducer<FlowBean, NullWritable, FlowBean, NullWritable> {
        // BUG FIX: the value type must be Iterable<NullWritable> (the map
        // output VALUE type), and @Override must be present. The original
        // declared Iterable<FlowBean>, which made this an overload the
        // framework never called — the default identity reduce ran instead.
        @Override
        protected void reduce(FlowBean key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SortFlow.class);
        job.setMapperClass(Maps.class);
        job.setReducerClass(Reduces.class);
        // Map and final output types are identical: FlowBean key, no value.
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(FlowBean.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
运行结果:
[root@hadoop ~]# hadoop fs -cat /flow/output2/part-r-00000
18168526111 1249812 492 1250304
13944131313 260 662685 662945
17733299995 249350 134 249484
17719860222 197394 1030 198424
17312434888 2076 93696 95772
15399997474 20556 21126 41682
17719830222 25 23141 23166
18867299999 1434 21318 22752
18948121234 11725 2655 14380
18383838384 828 13544 14372
18083815777 13515 156 13671
18991370888 16 13564 13580
15321168157 9368 924 10292
17739116888 54 7856 7910
18650718998 2576 4452 7028
18026666666 0 4800 4800
18013497666 234 4235 4469
13341414499 2736 810 3546
18100800709 1410 1926 3336
15371842777 2592 192 2784
15377892222 492 2060 2552
15310952123 936 1404 2340
17703771999 2134 123 2257
18144442222 1758 30 1788
18015629333 914 712 1626
18016533288 360 1224 1584
15360724444 225 1155 1380
15366243999 280 1070 1350
15388889557 131 1213 1344
18135679777 1234 54 1288
18099913682 13 1234 1247
18915648666 1068 74 1142
13357024777 30 936 966
18150000004 523 234 757
18913626262 5 637 642
18187879292 156 484 640
17314210999 112 430 542
13357183311 174 152 326
18161236777 8 246 254
18120085511 115 105 220
15388889881 12 123 135
18051549666 12 86 98
17739127999 0 0 0