Hadoop MapReduce: implementing custom sorting

Sorting Hadoop MapReduce output by total flow in descending order with a custom key

Java code for the first pass (data cleaning):

JavaBean

package com.cjp.sumflow;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

// Custom Writable holding one phone number's upload, download, and total flow.
public class FlowBean implements Writable {

	private String phoneNum;
	private long up_flow;
	private long d_flow;
	private long s_flow;

	public FlowBean() {
		super();
	}

	public FlowBean(String phoneNum, long up_flow, long d_flow) {
		super();
		this.phoneNum = phoneNum;
		this.up_flow = up_flow;
		this.d_flow = d_flow;
		this.s_flow = up_flow + d_flow;
	}

	public String getPhoneNum() {
		return phoneNum;
	}

	public void setPhoneNum(String phoneNum) {
		this.phoneNum = phoneNum;
	}

	public long getUp_flow() {
		return up_flow;
	}

	public void setUp_flow(long up_flow) {
		this.up_flow = up_flow;
	}

	public long getD_flow() {
		return d_flow;
	}

	public void setD_flow(long d_flow) {
		this.d_flow = d_flow;
	}

	public long getS_flow() {
		return s_flow;
	}

	public void setS_flow(long s_flow) {
		this.s_flow = s_flow;
	}
	
	

	@Override
	public String toString() {
		return  phoneNum + "\t" + up_flow + "\t" + d_flow + "\t" + s_flow;
	}

	@Override
	public void readFields(DataInput d) throws IOException {

		phoneNum = d.readUTF();
		up_flow = d.readLong();
		d_flow = d.readLong();
		s_flow = d.readLong();

	}

	@Override
	public void write(DataOutput d) throws IOException {

		d.writeUTF(phoneNum);
		d.writeLong(up_flow);
		d.writeLong(d_flow);
		d.writeLong(s_flow);

	}
}

map

package com.cjp.sumflow;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.commons.lang.StringUtils;

public class Maps extends Mapper<LongWritable, Text, Text, FlowBean> {

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

		String line = value.toString();

		String[] split = StringUtils.split(line, "\t");

		// Field 1 of the tab-separated log line is the phone number; fields 5 and 6
		// are the upload and download flow.
		String phoneNum = split[1];
		long up_flow = Long.parseLong(split[5]);
		long d_flow = Long.parseLong(split[6]);

		context.write(new Text(phoneNum), new FlowBean(phoneNum, up_flow, d_flow));

	}

}
reduce

package com.cjp.sumflow;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sums the upload and download flow for each phone number and emits one combined FlowBean per phone.
public class Reduces extends Reducer<Text, FlowBean, FlowBean, NullWritable> {

	@Override
	protected void reduce(Text arg0, Iterable<FlowBean> value, Context context)
			throws IOException, InterruptedException {

		long up_flow_sum = 0;
		long d_flow_sum = 0;

		for (FlowBean fb : value) {

			up_flow_sum += fb.getUp_flow();
			d_flow_sum += fb.getD_flow();

		}
		context.write(new FlowBean(arg0.toString(), up_flow_sum, d_flow_sum), NullWritable.get());

	}

}
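
driver

The first job also needs a driver class. It is not included in the post, but the sort job further below imports com.cjp.sumflow.SumRunner, which is presumably that driver. The following is only a minimal sketch of what it likely looks like, modeled on the sort job's main method; treat the details as an assumption rather than the original code.

package com.cjp.sumflow;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Sketch of the driver for the cleaning/summing job (the SumRunner class
// referenced elsewhere in the original code); not the author's original source.
public class SumRunner {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(SumRunner.class);

		job.setMapperClass(Maps.class);
		job.setReducerClass(Reduces.class);

		// Map output: phone number (Text) -> FlowBean
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(FlowBean.class);

		// Job output: FlowBean (rendered via toString) with a NullWritable value
		job.setOutputKeyClass(FlowBean.class);
		job.setOutputValueClass(NullWritable.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}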

Running this job with hadoop jar performs the initial cleaning and aggregation and produces the cleaned data shown below:

[root@hadoop ~]# hadoop fs -cat /flow/output/part-r-00000
13341414499     2736    810     3546     // phone number   upload flow   download flow   total flow
13357024777     30      936     966
13357183311     174     152     326
13944131313     260     662685  662945
15310952123     936     1404    2340
15321168157     9368    924     10292
15360724444     225     1155    1380
15366243999     280     1070    1350
15371842777     2592    192     2784
15377892222     492     2060    2552
15388889557     131     1213    1344
15388889881     12      123     135
15399997474     20556   21126   41682
17312434888     2076    93696   95772
17314210999     112     430     542
17703771999     2134    123     2257
17719830222     25      23141   23166
17719860222     197394  1030    198424
17733299995     249350  134     249484
17739116888     54      7856    7910
17739127999     0       0       0
18013497666     234     4235    4469
18015629333     914     712     1626
18016533288     360     1224    1584
18026666666     0       4800    4800
18051549666     12      86      98
18083815777     13515   156     13671
18099913682     13      1234    1247
18100800709     1410    1926    3336
18120085511     115     105     220
18135679777     1234    54      1288
18144442222     1758    30      1788
18150000004     523     234     757
18161236777     8       246     254
18168526111     1249812 492     1250304
18187879292     156     484     640
18383838384     828     13544   14372
18650718998     2576    4452    7028
18867299999     1434    21318   22752
18913626262     5       637     642
18915648666     1068    74      1142
18948121234     11725   2655    14380
18991370888     16      13564   13580

Now, using the cleaned data as the input, sort the records by total flow:

By default, MapReduce sorts the map output in its in-memory buffer by the KEY of each K-V pair.
So we change the type of the map output: the JavaBean implements the WritableComparable interface and overrides compareTo, and the bean itself is emitted as the output key.
The in-memory sort then orders the records according to the bean's compareTo method (a short standalone illustration follows the modified bean below):

The modified JavaBean

package com.cjp.sumflow;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// The bean now implements WritableComparable so it can serve as a map output key and be sorted by the shuffle.
public class FlowBean implements WritableComparable<FlowBean> {

	private String phoneNum;
	private long up_flow;
	private long d_flow;
	private long s_flow;

	public FlowBean() {
		super();
	}

	public FlowBean(String phoneNum, long up_flow, long d_flow) {
		super();
		this.phoneNum = phoneNum;
		this.up_flow = up_flow;
		this.d_flow = d_flow;
		this.s_flow = up_flow + d_flow;
	}

	public String getPhoneNum() {
		return phoneNum;
	}

	public void setPhoneNum(String phoneNum) {
		this.phoneNum = phoneNum;
	}

	public long getUp_flow() {
		return up_flow;
	}

	public void setUp_flow(long up_flow) {
		this.up_flow = up_flow;
	}

	public long getD_flow() {
		return d_flow;
	}

	public void setD_flow(long d_flow) {
		this.d_flow = d_flow;
	}

	public long getS_flow() {
		return s_flow;
	}

	public void setS_flow(long s_flow) {
		this.s_flow = s_flow;
	}
	
	

	@Override
	public String toString() {
		return  phoneNum + "\t" + up_flow + "\t" + d_flow + "\t" + s_flow;
	}

	@Override
	public void readFields(DataInput d) throws IOException {

		phoneNum = d.readUTF();
		up_flow = d.readLong();
		d_flow = d.readLong();
		s_flow = d.readLong();

	}

	@Override
	public void write(DataOutput d) throws IOException {

		d.writeUTF(phoneNum);
		d.writeLong(up_flow);
		d.writeLong(d_flow);
		d.writeLong(s_flow);

	}

	@Override
	public int compareTo(FlowBean o) {
		// Sort by total flow in descending order; break ties on the phone number so that
		// two different phones with the same total never compare as equal keys.
		int cmp = Long.compare(o.s_flow, this.s_flow);
		return cmp != 0 ? cmp : this.phoneNum.compareTo(o.phoneNum);
	}

}
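
Conceptually, the MapReduce framework orders the map output keys with the key class's compareTo method while sorting the spill buffer and merging. The snippet below is only a simplified, standalone illustration of that ordering (a hypothetical helper class, not part of the jobs and not Hadoop's internals); the phone numbers and flow values are made up.

package com.cjp.sumflow;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Standalone illustration: ordering FlowBean keys the same way the shuffle
// would, i.e. via FlowBean.compareTo (largest total flow first).
public class SortIllustration {

	public static void main(String[] args) {
		List<FlowBean> keys = new ArrayList<FlowBean>();
		keys.add(new FlowBean("13800000001", 10, 20));    // total 30
		keys.add(new FlowBean("13800000002", 300, 400));  // total 700
		keys.add(new FlowBean("13800000003", 100, 100));  // total 200

		Collections.sort(keys); // uses compareTo: descending by total flow

		for (FlowBean fb : keys) {
			System.out.println(fb); // prints the totals 700, 200, 30 in that order
		}
	}
}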

The sorting MapReduce job

package com.cjp.sumflow2;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.cjp.sumflow.FlowBean;
import com.cjp.sumflow.SumRunner;

public class SortFlow {

	// Emits the FlowBean itself as the map output key so that the shuffle sorts records by FlowBean.compareTo.
	public static class Maps extends Mapper<LongWritable, Text, FlowBean, NullWritable> {

		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

			String line = value.toString();

			String[] split = StringUtils.split(line, "\t");

			String phoneNum = split[0];
			long u_flow = Long.parseLong(split[1]);
			long d_flow = Long.parseLong(split[2]);

			context.write(new FlowBean(phoneNum, u_flow, d_flow), NullWritable.get());

		}

	}

	public static class Reduces extends Reducer<FlowBean, NullWritable, FlowBean, NullWritable> {

		@Override
		protected void reduce(FlowBean key, Iterable<NullWritable> values, Context context)
				throws IOException, InterruptedException {

			// The keys arrive already sorted by FlowBean.compareTo; just write each one out.
			context.write(key, NullWritable.get());

		}

	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(SortFlow.class);

		job.setMapperClass(Maps.class);
		job.setReducerClass(Reduces.class);

		job.setMapOutputKeyClass(FlowBean.class);
		job.setMapOutputValueClass(NullWritable.class);

		job.setOutputKeyClass(FlowBean.class);
		job.setOutputValueClass(NullWritable.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		System.exit(job.waitForCompletion(true) ? 0 : 1);

	}

}
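
One practical note: the result below is globally sorted only because the job runs with a single reduce task (the MapReduce default). To make that explicit, the reducer count can be pinned in the driver; a one-line addition to the main method above:

		// Keep a single reducer so that all FlowBean keys end up in one globally
		// sorted output file (part-r-00000); with several reducers, each output
		// file would only be sorted within itself.
		job.setNumReduceTasks(1);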

Run result:

[root@hadoop ~]# hadoop fs -cat /flow/output2/part-r-00000
18168526111     1249812 492     1250304
13944131313     260     662685  662945
17733299995     249350  134     249484
17719860222     197394  1030    198424
17312434888     2076    93696   95772
15399997474     20556   21126   41682
17719830222     25      23141   23166
18867299999     1434    21318   22752
18948121234     11725   2655    14380
18383838384     828     13544   14372
18083815777     13515   156     13671
18991370888     16      13564   13580
15321168157     9368    924     10292
17739116888     54      7856    7910
18650718998     2576    4452    7028
18026666666     0       4800    4800
18013497666     234     4235    4469
13341414499     2736    810     3546
18100800709     1410    1926    3336
15371842777     2592    192     2784
15377892222     492     2060    2552
15310952123     936     1404    2340
17703771999     2134    123     2257
18144442222     1758    30      1788
18015629333     914     712     1626
18016533288     360     1224    1584
15360724444     225     1155    1380
15366243999     280     1070    1350
15388889557     131     1213    1344
18135679777     1234    54      1288
18099913682     13      1234    1247
18915648666     1068    74      1142
13357024777     30      936     966
18150000004     523     234     757
18913626262     5       637     642
18187879292     156     484     640
17314210999     112     430     542
13357183311     174     152     326
18161236777     8       246     254
18120085511     115     105     220
15388889881     12      123     135
18051549666     12      86      98
17739127999     0       0       0