MapReduce in Practice

Start Hadoop before running the tests:

[hadoop@mini-yum ~]$ start-dfs.sh

[hadoop@mini-yum ~]$ start-yarn.sh
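(Optional check: jps should now list the Hadoop daemons. Assuming the layout used in this cluster, that means NameNode and ResourceManager on mini-yum, and DataNode/NodeManager on the worker nodes.)

[hadoop@mini-yum ~]$ jps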

 

1. Count the total number of occurrences of each word in a given set of text files

Code

package cn.feizhou.wcdemo;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * This class acts as a client of the YARN cluster: it packages the relevant
 * runtime parameters of our MR program, specifies the jar,
 * and finally submits the job to YARN.
 * @author
 *
 */
public class WordcountDriver {
	
	public static void main(String[] args) throws Exception {
		
		if (args == null || args.length == 0) {
			args = new String[2];
			args[0] = "hdfs://mini-yum:9000/wordcount/input/wordcount.txt";
			args[1] = "hdfs://mini-yum:9000/wordcount/output8";
		}
		
		Configuration conf = new Configuration();
		
		// Note: setting these through conf.set() had no effect here. HADOOP_USER_NAME is read from
		// the environment (or a JVM system property), and dfs.permissions.enabled is a NameNode-side
		// setting, so neither is picked up from the client-side Configuration.
//		conf.set("HADOOP_USER_NAME", "hadoop");
//		conf.set("dfs.permissions.enabled", "false");
		
		
		/*conf.set("mapreduce.framework.name", "yarn");
		conf.set("yarn.resoucemanager.hostname", "mini1");*/
		Job job = Job.getInstance(conf);
		
		/*job.setJar("/home/hadoop/wc.jar");*/
		//Let Hadoop locate this program's jar from the driver class
		job.setJarByClass(WordcountDriver.class);
		
		//Specify the Mapper/Reducer classes this job will use
		job.setMapperClass(WordcountMapper.class);
		job.setReducerClass(WordcountReducer.class);
		
		//Specify the key/value types of the mapper output
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		
		//Specify the key/value types of the final reduce output
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		//Specify the directory containing the job's input files
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		//Specify the directory for the job's output
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		//Submit the job's configuration, together with the jar containing the job's classes, to YARN for execution
		/*job.submit();*/
		boolean res = job.waitForCompletion(true);
		System.exit(res?0:1);
		
	}
	

}
--------------------------------------------------
package cn.feizhou.wcdemo;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * KEYIN: by default, the starting byte offset of the line of text read by the MR framework (a Long);
 * Hadoop has its own, more compact serialization, so LongWritable is used instead of Long.
 * 
 * VALUEIN: by default, the content of the line read by the framework (a String); likewise, Text is used.
 * 
 * KEYOUT: the key emitted by the user-defined logic; here it is the word, so Text.
 * VALUEOUT: the value emitted by the user-defined logic; here it is the word count, so IntWritable.
 * 
 * @author
 *
 */

public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

	/**
	 * The business logic of the map phase lives in our overridden map() method;
	 * the map task calls map() once for every line of input.
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

		// First convert the text handed to us by the map task into a String
		String line = value.toString();
		// Split the line into words on spaces
		String[] words = line.split(" ");

		// Emit each word as <word, 1>
		for (String word : words) {
			// Use the word as the key and 1 as the value, so the shuffle can distribute records by word and identical words end up at the same reduce task
			context.write(new Text(word), new IntWritable(1));
		}
	}
}

--------------------------------------------------

package cn.feizhou.wcdemo;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * KEYIN, VALUEIN correspond to the mapper's KEYOUT, VALUEOUT types.
 * 
 * KEYOUT, VALUEOUT are the output types of the user-defined reduce logic:
 * KEYOUT is the word
 * VALUEOUT is the total count
 * @author
 *
 */
public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{

	/**
	 * <angelababy,1><angelababy,1><angelababy,1><angelababy,1><angelababy,1>
	 * <hello,1><hello,1><hello,1><hello,1><hello,1><hello,1>
	 * <banana,1><banana,1><banana,1><banana,1><banana,1><banana,1>
	 * The key parameter is the key shared by one group of identical-word kv pairs.
	 */
	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

		int count=0;
		/*Iterator<IntWritable> iterator = values.iterator();
		while(iterator.hasNext()){
			count += iterator.next().get();
		}*/
		
		for(IntWritable value:values){
		
			count += value.get();
		}
		context.write(key, new IntWritable(count));
	}
	
}
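A small side note on the mapper above (not required for correctness): it allocates a new Text and a new IntWritable for every word. Because context.write() serializes the objects' contents immediately, a common pattern is to reuse the Writable instances, as the FlowCountSort mapper later in this post does. A minimal sketch of map() written that way:

	private final Text wordText = new Text();
	private final IntWritable one = new IntWritable(1);

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		String[] words = value.toString().split(" ");
		for (String word : words) {
			// reuse the same Text instance; its current contents are serialized by write()
			wordText.set(word);
			context.write(wordText, one);
		}
	}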

2. Package the project as wordcountDriver.jar
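How the jar is built is up to you; assuming a Maven project (an assumption here, an IDE's export-to-JAR dialog works just as well):

mvn clean package

then copy the jar produced under target/ and name it wordcountDriver.jar.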

3. Upload the jar to the cluster
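For example, from the machine where the jar was built, copy it to the home directory of the hadoop user on mini4 (the node used to submit the job below):

scp wordcountDriver.jar hadoop@mini4:~/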

4. Upload the files to be parsed to HDFS

[hadoop@mini4 hadoop-2.6.4]$ hadoop fs -mkdir -p /wordcount/input
[hadoop@mini4 hadoop-2.6.4]$ hadoop fs -put LICENSE.txt NOTICE.txt README.txt /wordcount/input
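Optionally confirm that the three files are in place:

[hadoop@mini4 hadoop-2.6.4]$ hadoop fs -ls /wordcount/input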

5. Run the test

[hadoop@mini4 ~]$ hadoop jar wordcountDriver.jar cn.feizhou.wcdemo.WordcountDriver /wordcount/input   /wordcount/out

Test report

[hadoop@mini4 ~]$ hadoop jar wordcountDriver.jar cn.feizhou.wcdemo.WordcountDriver /wordcount/input   /wordcount/out
19/01/27 16:01:56 INFO client.RMProxy: Connecting to ResourceManager at mini-yum/192.168.232.128:8032
19/01/27 16:01:56 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
19/01/27 16:01:58 INFO input.FileInputFormat: Total input paths to process : 3
19/01/27 16:01:58 INFO mapreduce.JobSubmitter: number of splits:3
19/01/27 16:01:58 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1548576099360_0001
19/01/27 16:01:59 INFO impl.YarnClientImpl: Submitted application application_1548576099360_0001
19/01/27 16:01:59 INFO mapreduce.Job: The url to track the job: http://mini-yum:8088/proxy/application_1548576099360_0001/
19/01/27 16:01:59 INFO mapreduce.Job: Running job: job_1548576099360_0001
19/01/27 16:02:11 INFO mapreduce.Job: Job job_1548576099360_0001 running in uber mode : false
19/01/27 16:02:11 INFO mapreduce.Job:  map 0% reduce 0%
19/01/27 16:02:25 INFO mapreduce.Job:  map 33% reduce 0%
19/01/27 16:02:30 INFO mapreduce.Job:  map 67% reduce 0%
19/01/27 16:02:31 INFO mapreduce.Job:  map 100% reduce 0%
19/01/27 16:02:43 INFO mapreduce.Job:  map 100% reduce 100%
19/01/27 16:02:43 INFO mapreduce.Job: Job job_1548576099360_0001 completed successfully
19/01/27 16:02:43 INFO mapreduce.Job: Counters: 49
	File System Counters
		FILE: Number of bytes read=38854
		FILE: Number of bytes written=504645
		FILE: Number of read operations=0
		FILE: Number of large read operations=0
		FILE: Number of write operations=0
		HDFS: Number of bytes read=17233
		HDFS: Number of bytes written=8989
		HDFS: Number of read operations=12
		HDFS: Number of large read operations=0
		HDFS: Number of write operations=2
	Job Counters 
		Launched map tasks=3
		Launched reduce tasks=1
		Data-local map tasks=3
		Total time spent by all maps in occupied slots (ms)=47704
		Total time spent by all reduces in occupied slots (ms)=13574
		Total time spent by all map tasks (ms)=47704
		Total time spent by all reduce tasks (ms)=13574
		Total vcore-milliseconds taken by all map tasks=47704
		Total vcore-milliseconds taken by all reduce tasks=13574
		Total megabyte-milliseconds taken by all map tasks=48848896
		Total megabyte-milliseconds taken by all reduce tasks=13899776
	Map-Reduce Framework
		Map input records=322
		Map output records=3664
		Map output bytes=31520
		Map output materialized bytes=38866
		Input split bytes=337
		Combine input records=0
		Combine output records=0
		Reduce input groups=841
		Reduce shuffle bytes=38866
		Reduce input records=3664
		Reduce output records=841
		Spilled Records=7328
		Shuffled Maps =3
		Failed Shuffles=0
		Merged Map outputs=3
		GC time elapsed (ms)=976
		CPU time spent (ms)=9200
		Physical memory (bytes) snapshot=735395840
		Virtual memory (bytes) snapshot=3372982272
		Total committed heap usage (bytes)=441135104
	Shuffle Errors
		BAD_ID=0
		CONNECTION=0
		IO_ERROR=0
		WRONG_LENGTH=0
		WRONG_MAP=0
		WRONG_REDUCE=0
	File Input Format Counters 
		Bytes Read=16896
	File Output Format Counters 
		Bytes Written=8989

Test result: there is a lot of output, so only a small excerpt was captured here (the result screenshots are omitted).
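The complete result can also be read straight from HDFS; since the job ran with a single reduce task there is just one output file, for example:

[hadoop@mini4 ~]$ hadoop fs -cat /wordcount/out/part-r-00000 | head -n 20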

 

 

2. Aggregate traffic statistics per mobile phone number

  • 01 Compute each user's total upstream traffic, total downstream traffic, and overall total traffic
  • 02 Write the results to different output files according to the phone number's home region
  • 03 Sort the statistics by total traffic in descending order

Source data

1363157985066 	13726230503	00-FD-07-A4-72-B8:CMCC	120.196.100.82	i02.c.aliimg.com		24	27	2481	24681	200
1363157995052 	13826544101	5C-0E-8B-C7-F1-E0:CMCC	120.197.40.4			4	0	264	0	200
1363157991076 	13926435656	20-10-7A-28-CC-0A:CMCC	120.196.100.99			2	4	132	1512	200
1363154400022 	13926251106	5C-0E-8B-8B-B1-50:CMCC	120.197.40.4			4	0	240	0	200
1363157993044 	18211575961	94-71-AC-CD-E6-18:CMCC-EASY	120.196.100.99	iface.qiyi.com	视频网站	15	12	1527	2106	200
1363157995074 	84138413	5C-0E-8B-8C-E8-20:7DaysInn	120.197.40.4	122.72.52.12		20	16	4116	1432	200
1363157993055 	13560439658	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99			18	15	1116	954	200
1363157995033 	15920133257	5C-0E-8B-C7-BA-20:CMCC	120.197.40.4	sug.so.360.cn	信息安全	20	20	3156	2936	200
1363157983019 	13719199419	68-A1-B7-03-07-B1:CMCC-EASY	120.196.100.82			4	0	240	0	200
1363157984041 	13660577991	5C-0E-8B-92-5C-20:CMCC-EASY	120.197.40.4	s19.cnzz.com	站点统计	24	9	6960	690	200
1363157973098 	15013685858	5C-0E-8B-C7-F7-90:CMCC	120.197.40.4	rank.ie.sogou.com	搜索引擎	28	27	3659	3538	200
1363157986029 	15989002119	E8-99-C4-4E-93-E0:CMCC-EASY	120.196.100.99	www.umeng.com	站点统计	3	3	1938	180	200
1363157992093 	13560439658	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99			15	9	918	4938	200
1363157986041 	13480253104	5C-0E-8B-C7-FC-80:CMCC-EASY	120.197.40.4			3	3	180	180	200
1363157984040 	13602846565	5C-0E-8B-8B-B6-00:CMCC	120.197.40.4	2052.flash2-http.qq.com	综合门户	15	12	1938	2910	200
1363157995093 	13922314466	00-FD-07-A2-EC-BA:CMCC	120.196.100.82	img.qfc.cn		12	12	3008	3720	200
1363157982040 	13502468823	5C-0A-5B-6A-0B-D4:CMCC-EASY	120.196.100.99	y0.ifengimg.com	综合门户	57	102	7335	110349	200
1363157986072 	18320173382	84-25-DB-4F-10-1A:CMCC-EASY	120.196.100.99	input.shouji.sogou.com	搜索引擎	21	18	9531	2412	200
1363157990043 	13925057413	00-1F-64-E1-E6-9A:CMCC	120.196.100.55	t3.baidu.com	搜索引擎	69	63	11058	48243	200
1363157988072 	13760778710	00-FD-07-A4-7B-08:CMCC	120.196.100.82			2	2	120	120	200
1363157985066 	13726238888	00-FD-07-A4-72-B8:CMCC	120.196.100.82	i02.c.aliimg.com		24	27	2481	24681	200
1363157993055 	13560436666	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99			18	15	1116	954	200

First, handle requirements 01 and 02.

Code

package cn.feizhou.provinceflow;


import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class FlowBean implements Writable{
	
	private long upFlow;
	private long dFlow;
	private long sumFlow;
	
	//Deserialization instantiates the bean via reflection through the no-arg constructor, so one must be defined explicitly
	public FlowBean(){}
	
	public FlowBean(long upFlow, long dFlow) {
		this.upFlow = upFlow;
		this.dFlow = dFlow;
		this.sumFlow = upFlow + dFlow;
	}
	public long getUpFlow() {
		return upFlow;
	}
	public void setUpFlow(long upFlow) {
		this.upFlow = upFlow;
	}
	public long getdFlow() {
		return dFlow;
	}
	public void setdFlow(long dFlow) {
		this.dFlow = dFlow;
	}


	public long getSumFlow() {
		return sumFlow;
	}
	public void setSumFlow(long sumFlow) {
		this.sumFlow = sumFlow;
	}
	/**
	 * Serialization method
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeLong(upFlow);
		out.writeLong(dFlow);
		out.writeLong(sumFlow);
		
	}
	/**
	 * Deserialization method
	 * Note: fields must be read back in exactly the same order they were written
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		 upFlow = in.readLong();
		 dFlow = in.readLong();
		 sumFlow = in.readLong();
	}
	
	@Override
	public String toString() {
		 
		return upFlow + "\t" + dFlow + "\t" + sumFlow;
	}

}
--------------------------------------------------
package cn.feizhou.provinceflow;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FlowCount {
	
	static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean>{
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			 
			String line = value.toString();	// convert the line to a String
			String[] fields = line.split("\t");	// split into fields
			String phoneNbr = fields[1];	// extract the phone number
			
			long upFlow = Long.parseLong(fields[fields.length-3]);	// extract the upstream and downstream traffic
			long dFlow = Long.parseLong(fields[fields.length-2]);
			
			context.write(new Text(phoneNbr), new FlowBean(upFlow, dFlow));
		}
	}
	
	
	static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean>{
		//<183323,bean1><183323,bean2><183323,bean3><183323,bean4>.......
		@Override
		protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {

			long sum_upFlow = 0;
			long sum_dFlow = 0;
			
			//Iterate over all beans and accumulate the upstream and downstream traffic separately
			for(FlowBean bean: values){
				sum_upFlow += bean.getUpFlow();
				sum_dFlow += bean.getdFlow();
			}
			
			FlowBean resultBean = new FlowBean(sum_upFlow, sum_dFlow);
			context.write(key, resultBean);
		}
	}
	
	
	
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		/*conf.set("mapreduce.framework.name", "yarn");
		conf.set("yarn.resoucemanager.hostname", "mini1");*/
		Job job = Job.getInstance(conf);
		
		/*job.setJar("/home/hadoop/wc.jar");*/
		//Let Hadoop locate this program's jar from the driver class
		job.setJarByClass(FlowCount.class);
		
		//Specify the Mapper/Reducer classes this job will use
		job.setMapperClass(FlowCountMapper.class);
		job.setReducerClass(FlowCountReducer.class);
		
		//Register our custom partitioner
		job.setPartitionerClass(ProvincePartitioner.class);
		//and set the number of reduce tasks to match the number of partitions (with 2-4 reduce tasks the
		//job would fail with an "Illegal partition" error; with 1 the custom partitioner is bypassed)
		job.setNumReduceTasks(5);
		
		//Specify the key/value types of the mapper output
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(FlowBean.class);
		
		//Specify the key/value types of the final output
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);
		
		//Specify the directory containing the job's input files
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		//Specify the directory for the job's output
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		//Submit the job's configuration, together with the jar containing the job's classes, to YARN for execution
		/*job.submit();*/
		boolean res = job.waitForCompletion(true);
		System.exit(res?0:1);
	}
	

}
-------------------------------------------------------
package cn.feizhou.provinceflow;

import java.util.HashMap;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * K2, V2 correspond to the key/value types of the map output
 * @author
 *
 */
public class ProvincePartitioner extends Partitioner<Text, FlowBean>{

	public static HashMap<String, Integer> proviceDict = new HashMap<String, Integer>();
	static{
		proviceDict.put("136", 0);
		proviceDict.put("137", 1);
		proviceDict.put("138", 2);
		proviceDict.put("139", 3);
	}
	@Override
	public int getPartition(Text key, FlowBean value, int numPartitions) {
		String prefix = key.toString().substring(0, 3);
		Integer provinceId = proviceDict.get(prefix);
		//Return the partition number; prefixes other than 136/137/138/139 go to partition 4
		return provinceId==null?4:provinceId;
	}
}

 

Upload the jar and the test data

 

Upload the file to be parsed to HDFS

 

[hadoop@mini4 ~]$ hadoop fs -mkdir -p /FlowCount/input
[hadoop@mini4 ~]$ hadoop fs -put flow.log  /FlowCount/input

Run the test

[hadoop@mini4 ~]$ hadoop jar FlowCount.jar cn.feizhou.provinceflow.FlowCount /FlowCount/input   /FlowCount/out

Test report

19/01/27 19:17:08 INFO client.RMProxy: Connecting to ResourceManager at mini-yum/192.168.232.128:8032
19/01/27 19:17:10 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
19/01/27 19:17:22 INFO input.FileInputFormat: Total input paths to process : 1
19/01/27 19:17:22 INFO mapreduce.JobSubmitter: number of splits:1
19/01/27 19:17:23 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1548576099360_0002
19/01/27 19:17:23 INFO impl.YarnClientImpl: Submitted application application_1548576099360_0002
19/01/27 19:17:23 INFO mapreduce.Job: The url to track the job: http://mini-yum:8088/proxy/application_1548576099360_0002/
19/01/27 19:17:23 INFO mapreduce.Job: Running job: job_1548576099360_0002
19/01/27 19:17:45 INFO mapreduce.Job: Job job_1548576099360_0002 running in uber mode : false
19/01/27 19:17:45 INFO mapreduce.Job:  map 0% reduce 0%
19/01/27 19:18:04 INFO mapreduce.Job:  map 100% reduce 0%
19/01/27 19:18:21 INFO mapreduce.Job:  map 100% reduce 20%
19/01/27 19:18:22 INFO mapreduce.Job:  map 100% reduce 40%
19/01/27 19:18:23 INFO mapreduce.Job:  map 100% reduce 60%
19/01/27 19:18:42 INFO mapreduce.Job:  map 100% reduce 100%
19/01/27 19:18:45 INFO mapreduce.Job: Job job_1548576099360_0002 completed successfully
19/01/27 19:18:45 INFO mapreduce.Job: Counters: 50
	File System Counters
		FILE: Number of bytes read=863
		FILE: Number of bytes written=643259
		FILE: Number of read operations=0
		FILE: Number of large read operations=0
		FILE: Number of write operations=0
		HDFS: Number of bytes read=2300
		HDFS: Number of bytes written=551
		HDFS: Number of read operations=18
		HDFS: Number of large read operations=0
		HDFS: Number of write operations=10
	Job Counters 
		Killed reduce tasks=2
		Launched map tasks=1
		Launched reduce tasks=6
		Data-local map tasks=1
		Total time spent by all maps in occupied slots (ms)=16543
		Total time spent by all reduces in occupied slots (ms)=111014
		Total time spent by all map tasks (ms)=16543
		Total time spent by all reduce tasks (ms)=111014
		Total vcore-milliseconds taken by all map tasks=16543
		Total vcore-milliseconds taken by all reduce tasks=111014
		Total megabyte-milliseconds taken by all map tasks=16940032
		Total megabyte-milliseconds taken by all reduce tasks=113678336
	Map-Reduce Framework
		Map input records=22
		Map output records=22
		Map output bytes=789
		Map output materialized bytes=863
		Input split bytes=110
		Combine input records=0
		Combine output records=0
		Reduce input groups=21
		Reduce shuffle bytes=863
		Reduce input records=22
		Reduce output records=21
		Spilled Records=44
		Shuffled Maps =5
		Failed Shuffles=0
		Merged Map outputs=5
		GC time elapsed (ms)=1466
		CPU time spent (ms)=9710
		Physical memory (bytes) snapshot=755515392
		Virtual memory (bytes) snapshot=5083746304
		Total committed heap usage (bytes)=294457344
	Shuffle Errors
		BAD_ID=0
		CONNECTION=0
		IO_ERROR=0
		WRONG_LENGTH=0
		WRONG_MAP=0
		WRONG_REDUCE=0
	File Input Format Counters 
		Bytes Read=2190
	File Output Format Counters 
		Bytes Written=551

 

Results:

Read the contents of partition file part-r-00004:

[hadoop@mini4 ~]$ hadoop fs -cat /FlowCount/out/part-r-00004
13480253104    180    180    360
13502468823    7335    110349    117684
13560436666    1116    954    2070
13560439658    2034    5892    7926
15013685858    3659    3538    7197
15920133257    3156    2936    6092
15989002119    1938    180    2118
18211575961    1527    2106    3633
18320173382    9531    2412    11943
84138413    4116    1432    5548

Conclusion:

  • 01 Total upstream, downstream, and overall traffic computed per user: OK
  • 02 Results written to different files by the phone number's home region: OK

 

Remaining requirement: 03, sort the statistics by total traffic in descending order.

My approach is to take each of the output files above as an input file and parse it again, so that each file gets sorted on its own.

Because map output is sorted by key, the FlowBean has to be used as the key here.

Code

package cn.feizhou.flowsum;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * 
 * Implements WritableComparable, i.e. both serialization and ordering; if only
 * serialization were needed, implementing Writable would be enough.
 *
 */
public class FlowBean implements WritableComparable<FlowBean>{
	
	private long upFlow;
	private long dFlow;
	private long sumFlow;
	
	//Deserialization instantiates the bean via reflection through the no-arg constructor, so one must be defined explicitly
	public FlowBean(){}
	
	public FlowBean(long upFlow, long dFlow) {
		this.upFlow = upFlow;
		this.dFlow = dFlow;
		this.sumFlow = upFlow + dFlow;
	}
	public void set(long upFlow, long dFlow) {
		this.upFlow = upFlow;
		this.dFlow = dFlow;
		this.sumFlow = upFlow + dFlow;
	}
	public long getUpFlow() {
		return upFlow;
	}
	public void setUpFlow(long upFlow) {
		this.upFlow = upFlow;
	}
	public long getdFlow() {
		return dFlow;
	}
	public void setdFlow(long dFlow) {
		this.dFlow = dFlow;
	}


	public long getSumFlow() {
		return sumFlow;
	}


	public void setSumFlow(long sumFlow) {
		this.sumFlow = sumFlow;
	}


	/**
	 * Serialization method
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeLong(upFlow);
		out.writeLong(dFlow);
		out.writeLong(sumFlow);
		
	}


	/**
	 * Deserialization method
	 * Note: fields must be read back in exactly the same order they were written
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		 upFlow = in.readLong();
		 dFlow = in.readLong();
		 sumFlow = in.readLong();
	}
	
	@Override
	public String toString() {
		 
		return upFlow + "\t" + dFlow + "\t" + sumFlow;
	}

	/**
	 * Comparison method used for sorting
	 */
	@Override
	public int compareTo(FlowBean o) {
		// Descending order: if this bean's total is larger, return -1 so it sorts first.
		// Never returning 0 matters here: if two beans compared as equal, their records would be
		// grouped into a single reduce() call and only one phone number would be written out.
		return this.sumFlow>o.getSumFlow()?-1:1;
	}

}
--------------------------------------------

package cn.feizhou.flowsum;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * 13480253104 180 180 360 13502468823 7335 110349 117684 13560436666 1116 954
 * 2070
 * 
 * @author
 * 
 */
public class FlowCountSort {

	static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, Text> {

		FlowBean bean = new FlowBean();
		Text v = new Text();

		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			// convert the line to a String
			String line = value.toString();
			// split into fields
			String[] fields = line.split("\t");
			// extract the phone number
			String phoneNbr = fields[0];
			// extract the upstream and downstream traffic
			long upFlow = Long.parseLong(fields[1]);
			long dFlow = Long.parseLong(fields[2]);

			bean.set(upFlow, dFlow);
			v.set(phoneNbr);
			// What gets written out is the serialized data, so reusing these objects is safe:
			// we don't need to worry about their contents changing after the write.
			// key:   FlowBean
			// value: phone number
			context.write(bean, v);

		}

	}

	/**
	 * reduce() is called once per key; the keys passed in here are FlowBean objects and every
	 * one of them compares as distinct, so reduce() is invoked once for each input record.
	  * @author: 张政
	  * @date: 2016-04-11 19:08:18
	  * @package_name: day07.sample
	 */
	static class FlowCountSortReducer extends Reducer<FlowBean, Text, Text, FlowBean> {

		// <bean(),phonenbr>
		@Override
		protected void reduce(FlowBean bean, Iterable<Text> values, Context context) throws IOException, InterruptedException {
			
			// values contains only a single element: the phone number
			context.write(values.iterator().next(), bean);

		}

	}
	
	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();
		/*conf.set("mapreduce.framework.name", "yarn");
		conf.set("yarn.resoucemanager.hostname", "mini1");*/
		Job job = Job.getInstance(conf);
		
		/*job.setJar("/home/hadoop/wc.jar");*/
		//Let Hadoop locate this program's jar from the driver class
		job.setJarByClass(FlowCountSort.class);
		
		//Specify the Mapper/Reducer classes this job will use
		job.setMapperClass(FlowCountSortMapper.class);
		job.setReducerClass(FlowCountSortReducer.class);
		
		
		//Specify the key/value types of the mapper output
		job.setMapOutputKeyClass(FlowBean.class);
		job.setMapOutputValueClass(Text.class);
		
		//Specify the key/value types of the final output
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);
		
		//Specify the directory containing the job's input files
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		//Specify the directory for the job's output
		
		Path outPath = new Path(args[1]);
		/*FileSystem fs = FileSystem.get(conf);
		if(fs.exists(outPath)){
			fs.delete(outPath, true);
		}*/
		FileOutputFormat.setOutputPath(job, outPath);
		
		//Submit the job's configuration, together with the jar containing the job's classes, to YARN for execution
		/*job.submit();*/
		boolean res = job.waitForCompletion(true);
		System.exit(res?0:1);
	}
}

 

Upload the jar

Run the test

[hadoop@mini4 ~]$ hadoop jar FlowCountSort.jar cn.feizhou.flowsum.FlowCountSort   /FlowCount/out/part-r-00004  /FlowCountSort/out/
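The command above sorts only partition part-r-00004. Since every partition file is sorted independently, the remaining partitions can be processed the same way, for example with a small shell loop (the per-partition output directories /FlowCountSort/out-$i are just made-up names for this sketch):

[hadoop@mini4 ~]$ for i in 0 1 2 3 4; do hadoop jar FlowCountSort.jar cn.feizhou.flowsum.FlowCountSort /FlowCount/out/part-r-0000$i /FlowCountSort/out-$i; done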

Test report

19/01/27 19:34:59 INFO client.RMProxy: Connecting to ResourceManager at mini-yum/192.168.232.128:8032
19/01/27 19:35:00 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
19/01/27 19:35:03 INFO input.FileInputFormat: Total input paths to process : 1
19/01/27 19:35:03 INFO mapreduce.JobSubmitter: number of splits:1
19/01/27 19:35:03 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1548576099360_0003
19/01/27 19:35:03 INFO impl.YarnClientImpl: Submitted application application_1548576099360_0003
19/01/27 19:35:04 INFO mapreduce.Job: The url to track the job: http://mini-yum:8088/proxy/application_1548576099360_0003/
19/01/27 19:35:04 INFO mapreduce.Job: Running job: job_1548576099360_0003
19/01/27 19:35:15 INFO mapreduce.Job: Job job_1548576099360_0003 running in uber mode : false
19/01/27 19:35:15 INFO mapreduce.Job:  map 0% reduce 0%
19/01/27 19:35:47 INFO mapreduce.Job:  map 100% reduce 0%
19/01/27 19:35:59 INFO mapreduce.Job:  map 100% reduce 100%
19/01/27 19:36:01 INFO mapreduce.Job: Job job_1548576099360_0003 completed successfully
19/01/27 19:36:04 INFO mapreduce.Job: Counters: 49
	File System Counters
		FILE: Number of bytes read=383
		FILE: Number of bytes written=214283
		FILE: Number of read operations=0
		FILE: Number of large read operations=0
		FILE: Number of write operations=0
		HDFS: Number of bytes read=379
		HDFS: Number of bytes written=267
		HDFS: Number of read operations=6
		HDFS: Number of large read operations=0
		HDFS: Number of write operations=2
	Job Counters 
		Launched map tasks=1
		Launched reduce tasks=1
		Data-local map tasks=1
		Total time spent by all maps in occupied slots (ms)=9820
		Total time spent by all reduces in occupied slots (ms)=9427
		Total time spent by all map tasks (ms)=9820
		Total time spent by all reduce tasks (ms)=9427
		Total vcore-milliseconds taken by all map tasks=9820
		Total vcore-milliseconds taken by all reduce tasks=9427
		Total megabyte-milliseconds taken by all map tasks=10055680
		Total megabyte-milliseconds taken by all reduce tasks=9653248
	Map-Reduce Framework
		Map input records=10
		Map output records=10
		Map output bytes=357
		Map output materialized bytes=383
		Input split bytes=112
		Combine input records=0
		Combine output records=0
		Reduce input groups=10
		Reduce shuffle bytes=383
		Reduce input records=10
		Reduce output records=10
		Spilled Records=20
		Shuffled Maps =1
		Failed Shuffles=0
		Merged Map outputs=1
		GC time elapsed (ms)=239
		CPU time spent (ms)=2790
		Physical memory (bytes) snapshot=314318848
		Virtual memory (bytes) snapshot=1691697152
		Total committed heap usage (bytes)=168103936
	Shuffle Errors
		BAD_ID=0
		CONNECTION=0
		IO_ERROR=0
		WRONG_LENGTH=0
		WRONG_MAP=0
		WRONG_REDUCE=0
	File Input Format Counters 
		Bytes Read=267
	File Output Format Counters 
		Bytes Written=267

Results:

[hadoop@mini4 ~]$ hadoop fs -cat /FlowCountSort/out/part-r-00000
13502468823    7335    110349    117684
18320173382    9531    2412    11943
13560439658    2034    5892    7926
15013685858    3659    3538    7197
15920133257    3156    2936    6092
 84138413    4116    1432    5548
18211575961    1527    2106    3633
15989002119    1938    180    2118
13560436666    1116    954    2070
13480253104    180    180    360

Conclusion:

The file is now sorted by total traffic in descending order.

Note: the row for 84138413 is bad data (it is not a valid 11-digit mobile number) and should have been filtered out when the Map phase parsed the input; it only shows up here because no filtering was done. A filtering sketch follows below.
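A minimal sketch of such a filter (not part of the original code), applied in FlowCount's map() method and assuming a valid mobile number is exactly 11 digits:

		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String[] fields = value.toString().split("\t");
			String phoneNbr = fields[1];
			// drop malformed records such as 84138413 before they ever reach a reducer
			if (!phoneNbr.matches("\\d{11}")) {
				return;
			}
			long upFlow = Long.parseLong(fields[fields.length - 3]);
			long dFlow = Long.parseLong(fields[fields.length - 2]);
			context.write(new Text(phoneNbr), new FlowBean(upFlow, dFlow));
		}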

 

Full source code

https://download.csdn.net/download/zhou920786312/10940769