MapReduce in Practice

Start Hadoop before running the tests:

[hadoop@mini-yum ~]$ start-dfs.sh

[hadoop@mini-yum ~]$ start-yarn.sh
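(Optional check: jps should now list the Hadoop daemons. Assuming the layout used in this cluster, that means NameNode and ResourceManager on mini-yum, and DataNode/NodeManager on the worker nodes.)

[hadoop@mini-yum ~]$ jps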

 

1. Count the total number of occurrences of each word in a given set of text files

Code

package cn.feizhou.wcdemo;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * This class acts as a client of the YARN cluster: it packages the relevant
 * runtime parameters of our MR program, specifies the jar,
 * and finally submits the job to YARN.
 * @author
 *
 */
public class WordcountDriver {
	
	public static void main(String[] args) throws Exception {
		
		if (args == null || args.length == 0) {
			args = new String[2];
			args[0] = "hdfs://mini-yum:9000/wordcount/input/wordcount.txt";
			args[1] = "hdfs://mini-yum:9000/wordcount/output8";
		}
		
		Configuration conf = new Configuration();
		
		// Note: setting these through conf.set() had no effect here. HADOOP_USER_NAME is read from
		// the environment (or a JVM system property), and dfs.permissions.enabled is a NameNode-side
		// setting, so neither is picked up from the client-side Configuration.
//		conf.set("HADOOP_USER_NAME", "hadoop");
//		conf.set("dfs.permissions.enabled", "false");
		
		
		/*conf.set("mapreduce.framework.name", "yarn");
		conf.set("yarn.resoucemanager.hostname", "mini1");*/
		Job job = Job.getInstance(conf);
		
		/*job.setJar("/home/hadoop/wc.jar");*/
		//Let Hadoop locate this program's jar from the driver class
		job.setJarByClass(WordcountDriver.class);
		
		//Specify the Mapper/Reducer classes this job will use
		job.setMapperClass(WordcountMapper.class);
		job.setReducerClass(WordcountReducer.class);
		
		//Specify the key/value types of the mapper output
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		
		//Specify the key/value types of the final reduce output
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		//Specify the directory containing the job's input files
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		//Specify the directory for the job's output
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		//Submit the job's configuration, together with the jar containing the job's classes, to YARN for execution
		/*job.submit();*/
		boolean res = job.waitForCompletion(true);
		System.exit(res?0:1);
		
	}
	

}
--------------------------------------------------
package cn.feizhou.wcdemo;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * KEYIN: by default, the starting byte offset of the line of text read by the MR framework (a Long);
 * Hadoop has its own, more compact serialization, so LongWritable is used instead of Long.
 * 
 * VALUEIN: by default, the content of the line read by the framework (a String); likewise, Text is used.
 * 
 * KEYOUT: the key emitted by the user-defined logic; here it is the word, so Text.
 * VALUEOUT: the value emitted by the user-defined logic; here it is the word count, so IntWritable.
 * 
 * @author
 *
 */

public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

	/**
	 * The business logic of the map phase lives in our overridden map() method;
	 * the map task calls map() once for every line of input.
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

		// First convert the text handed to us by the map task into a String
		String line = value.toString();
		// Split the line into words on spaces
		String[] words = line.split(" ");

		// Emit each word as <word, 1>
		for (String word : words) {
			// Use the word as the key and 1 as the value, so the shuffle can distribute records by word and identical words end up at the same reduce task
			context.write(new Text(word), new IntWritable(1));
		}
	}
}

--------------------------------------------------

package cn.feizhou.wcdemo;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * KEYIN, VALUEIN correspond to the mapper's KEYOUT, VALUEOUT types.
 * 
 * KEYOUT, VALUEOUT are the output types of the user-defined reduce logic:
 * KEYOUT is the word
 * VALUEOUT is the total count
 * @author
 *
 */
public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{

	/**
	 * <angelababy,1><angelababy,1><angelababy,1><angelababy,1><angelababy,1>
	 * <hello,1><hello,1><hello,1><hello,1><hello,1><hello,1>
	 * <banana,1><banana,1><banana,1><banana,1><banana,1><banana,1>
	 * The key parameter is the key shared by one group of identical-word kv pairs.
	 */
	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

		int count=0;
		/*Iterator<IntWritable> iterator = values.iterator();
		while(iterator.hasNext()){
			count += iterator.next().get();
		}*/
		
		for(IntWritable value:values){
		
			count += value.get();
		}
		context.write(key, new IntWritable(count));
	}
	
}
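A small side note on the mapper above (not required for correctness): it allocates a new Text and a new IntWritable for every word. Because context.write() serializes the objects' contents immediately, a common pattern is to reuse the Writable instances, as the FlowCountSort mapper later in this post does. A minimal sketch of map() written that way:

	private final Text wordText = new Text();
	private final IntWritable one = new IntWritable(1);

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		String[] words = value.toString().split(" ");
		for (String word : words) {
			// reuse the same Text instance; its current contents are serialized by write()
			wordText.set(word);
			context.write(wordText, one);
		}
	}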

2. Package the project as wordcountDriver.jar
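How the jar is built is up to you; assuming a Maven project (an assumption here, an IDE's export-to-JAR dialog works just as well):

mvn clean package

then copy the jar produced under target/ and name it wordcountDriver.jar.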

3. Upload the jar to the cluster
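For example, from the machine where the jar was built, copy it to the home directory of the hadoop user on mini4 (the node used to submit the job below):

scp wordcountDriver.jar hadoop@mini4:~/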

4. Upload the files to be parsed to HDFS

[hadoop@mini4 hadoop-2.6.4]$ hadoop fs -mkdir -p /wordcount/input
[hadoop@mini4 hadoop-2.6.4]$ hadoop fs -put LICENSE.txt NOTICE.txt README.txt /wordcount/input
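Optionally confirm that the three files are in place:

[hadoop@mini4 hadoop-2.6.4]$ hadoop fs -ls /wordcount/input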

5. Run the test

[hadoop@mini4 ~]$ hadoop jar wordcountDriver.jar cn.feizhou.wcdemo.WordcountDriver /wordcount/input   /wordcount/out

Test report

[hadoop@mini4 ~]$ hadoop jar wordcountDriver.jar cn.feizhou.wcdemo.WordcountDriver /wordcount/input   /wordcount/out
19/01/27 16:01:56 INFO client.RMProxy: Connecting to ResourceManager at mini-yum/192.168.232.128:8032
19/01/27 16:01:56 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
19/01/27 16:01:58 INFO input.FileInputFormat: Total input paths to process : 3
19/01/27 16:01:58 INFO mapreduce.JobSubmitter: number of splits:3
19/01/27 16:01:58 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1548576099360_0001
19/01/27 16:01:59 INFO impl.YarnClientImpl: Submitted application application_1548576099360_0001
19/01/27 16:01:59 INFO mapreduce.Job: The url to track the job: http://mini-yum:8088/proxy/application_1548576099360_0001/
19/01/27 16:01:59 INFO mapreduce.Job: Running job: job_1548576099360_0001
19/01/27 16:02:11 INFO mapreduce.Job: Job job_1548576099360_0001 running in uber mode : false
19/01/27 16:02:11 INFO mapreduce.Job:  map 0% reduce 0%
19/01/27 16:02:25 INFO mapreduce.Job:  map 33% reduce 0%
19/01/27 16:02:30 INFO mapreduce.Job:  map 67% reduce 0%
19/01/27 16:02:31 INFO mapreduce.Job:  map 100% reduce 0%
19/01/27 16:02:43 INFO mapreduce.Job:  map 100% reduce 100%
19/01/27 16:02:43 INFO mapreduce.Job: Job job_1548576099360_0001 completed successfully
19/01/27 16:02:43 INFO mapreduce.Job: Counters: 49
	File System Counters
		FILE: Number of bytes read=38854
		FILE: Number of bytes written=504645
		FILE: Number of read operations=0
		FILE: Number of large read operations=0
		FILE: Number of write operations=0
		HDFS: Number of bytes read=17233
		HDFS: Number of bytes written=8989
		HDFS: Number of read operations=12
		HDFS: Number of large read operations=0
		HDFS: Number of write operations=2
	Job Counters 
		Launched map tasks=3
		Launched reduce tasks=1
		Data-local map tasks=3
		Total time spent by all maps in occupied slots (ms)=47704
		Total time spent by all reduces in occupied slots (ms)=13574
		Total time spent by all map tasks (ms)=47704
		Total time spent by all reduce tasks (ms)=13574
		Total vcore-milliseconds taken by all map tasks=47704
		Total vcore-milliseconds taken by all reduce tasks=13574
		Total megabyte-milliseconds taken by all map tasks=48848896
		Total megabyte-milliseconds taken by all reduce tasks=13899776
	Map-Reduce Framework
		Map input records=322
		Map output records=3664
		Map output bytes=31520
		Map output materialized bytes=38866
		Input split bytes=337
		Combine input records=0
		Combine output records=0
		Reduce input groups=841
		Reduce shuffle bytes=38866
		Reduce input records=3664
		Reduce output records=841
		Spilled Records=7328
		Shuffled Maps =3
		Failed Shuffles=0
		Merged Map outputs=3
		GC time elapsed (ms)=976
		CPU time spent (ms)=9200
		Physical memory (bytes) snapshot=735395840
		Virtual memory (bytes) snapshot=3372982272
		Total committed heap usage (bytes)=441135104
	Shuffle Errors
		BAD_ID=0
		CONNECTION=0
		IO_ERROR=0
		WRONG_LENGTH=0
		WRONG_MAP=0
		WRONG_REDUCE=0
	File Input Format Counters 
		Bytes Read=16896
	File Output Format Counters 
		Bytes Written=8989

Test result: there is a lot of output, so only a small excerpt was captured here (the result screenshots are omitted).
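The complete result can also be read straight from HDFS; since the job ran with a single reduce task there is just one output file, for example:

[hadoop@mini4 ~]$ hadoop fs -cat /wordcount/out/part-r-00000 | head -n 20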

 

 

2. Aggregate traffic statistics per mobile phone number

  • 01 Compute each user's total upstream traffic, total downstream traffic, and overall total traffic
  • 02 Write the results to different output files according to the phone number's home region
  • 03 Sort the statistics by total traffic in descending order

Source data

1363157985066 	13726230503	00-FD-07-A4-72-B8:CMCC	120.196.100.82	i02.c.aliimg.com		24	27	2481	24681	200
1363157995052 	13826544101	5C-0E-8B-C7-F1-E0:CMCC	120.197.40.4			4	0	264	0	200
1363157991076 	13926435656	20-10-7A-28-CC-0A:CMCC	120.196.100.99			2	4	132	1512	200
1363154400022 	13926251106	5C-0E-8B-8B-B1-50:CMCC	120.197.40.4			4	0	240	0	200
1363157993044 	18211575961	94-71-AC-CD-E6-18:CMCC-EASY	120.196.100.99	iface.qiyi.com	视频网站	15	12	1527	2106	200
1363157995074 	84138413	5C-0E-8B-8C-E8-20:7DaysInn	120.197.40.4	122.72.52.12		20	16	4116	1432	200
1363157993055 	13560439658	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99			18	15	1116	954	200
1363157995033 	15920133257	5C-0E-8B-C7-BA-20:CMCC	120.197.40.4	sug.so.360.cn	信息安全	20	20	3156	2936	200
1363157983019 	13719199419	68-A1-B7-03-07-B1:CMCC-EASY	120.196.100.82			4	0	240	0	200
1363157984041 	13660577991	5C-0E-8B-92-5C-20:CMCC-EASY	120.197.40.4	s19.cnzz.com	站点统计	24	9	6960	690	200
1363157973098 	15013685858	5C-0E-8B-C7-F7-90:CMCC	120.197.40.4	rank.ie.sogou.com	搜索引擎	28	27	3659	3538	200
1363157986029 	15989002119	E8-99-C4-4E-93-E0:CMCC-EASY	120.196.100.99	www.umeng.com	站点统计	3	3	1938	180	200
1363157992093 	13560439658	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99			15	9	918	4938	200
1363157986041 	13480253104	5C-0E-8B-C7-FC-80:CMCC-EASY	120.197.40.4			3	3	180	180	200
1363157984040 	13602846565	5C-0E-8B-8B-B6-00:CMCC	120.197.40.4	2052.flash2-http.qq.com	综合门户	15	12	1938	2910	200
1363157995093 	13922314466	00-FD-07-A2-EC-BA:CMCC	120.196.100.82	img.qfc.cn		12	12	3008	3720	200
1363157982040 	13502468823	5C-0A-5B-6A-0B-D4:CMCC-EASY	120.196.100.99	y0.ifengimg.com	综合门户	57	102	7335	110349	200
1363157986072 	18320173382	84-25-DB-4F-10-1A:CMCC-EASY	120.196.100.99	input.shouji.sogou.com	搜索引擎	21	18	9531	2412	200
1363157990043 	13925057413	00-1F-64-E1-E6-9A:CMCC	120.196.100.55	t3.baidu.com	搜索引擎	69	63	11058	48243	200
1363157988072 	13760778710	00-FD-07-A4-7B-08:CMCC	120.196.100.82			2	2	120	120	200
1363157985066 	13726238888	00-FD-07-A4-72-B8:CMCC	120.196.100.82	i02.c.aliimg.com		24	27	2481	24681	200
1363157993055 	13560436666	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99			18	15	1116	954	200

First, handle requirements 01 and 02.

Code

package cn.feizhou.provinceflow;


import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class FlowBean implements Writable{
	
	private long upFlow;
	private long dFlow;
	private long sumFlow;
	
	//Deserialization instantiates the bean via reflection through the no-arg constructor, so one must be defined explicitly
	public FlowBean(){}
	
	public FlowBean(long upFlow, long dFlow) {
		this.upFlow = upFlow;
		this.dFlow = dFlow;
		this.sumFlow = upFlow + dFlow;
	}
	public long getUpFlow() {
		return upFlow;
	}
	public void setUpFlow(long upFlow) {
		this.upFlow = upFlow;
	}
	public long getdFlow() {
		return dFlow;
	}
	public void setdFlow(long dFlow) {
		this.dFlow = dFlow;
	}


	public long getSumFlow() {
		return sumFlow;
	}
	public void setSumFlow(long sumFlow) {
		this.sumFlow = sumFlow;
	}
	/**
	 * Serialization method
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeLong(upFlow);
		out.writeLong(dFlow);
		out.writeLong(sumFlow);
		
	}
	/**
	 * Deserialization method
	 * Note: fields must be read back in exactly the same order they were written
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		 upFlow = in.readLong();
		 dFlow = in.readLong();
		 sumFlow = in.readLong();
	}
	
	@Override
	public String toString() {
		 
		return upFlow + "\t" + dFlow + "\t" + sumFlow;
	}

}
--------------------------------------------------
package cn.feizhou.provinceflow;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FlowCount {
	
	static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean>{
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			 
			String line = value.toString();	// convert the line to a String
			String[] fields = line.split("\t");	// split into fields
			String phoneNbr = fields[1];	// extract the phone number
			
			long upFlow = Long.parseLong(fields[fields.length-3]);	// extract the upstream and downstream traffic
			long dFlow = Long.parseLong(fields[fields.length-2]);
			
			context.write(new Text(phoneNbr), new FlowBean(upFlow, dFlow));
		}
	}
	
	
	static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean>{
		//<183323,bean1><183323,bean2><183323,bean3><183323,bean4>.......
		@Override
		protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {

			long sum_upFlow = 0;
			long sum_dFlow = 0;
			
			//Iterate over all beans and accumulate the upstream and downstream traffic separately
			for(FlowBean bean: values){
				sum_upFlow += bean.getUpFlow();
				sum_dFlow += bean.getdFlow();
			}
			
			FlowBean resultBean = new FlowBean(sum_upFlow, sum_dFlow);
			context.write(key, resultBean);
		}
	}
	
	
	
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		/*conf.set("mapreduce.framework.name", "yarn");
		conf.set("yarn.resoucemanager.hostname", "mini1");*/
		Job job = Job.getInstance(conf);
		
		/*job.setJar("/home/hadoop/wc.jar");*/
		//Let Hadoop locate this program's jar from the driver class
		job.setJarByClass(FlowCount.class);
		
		//Specify the Mapper/Reducer classes this job will use
		job.setMapperClass(FlowCountMapper.class);
		job.setReducerClass(FlowCountReducer.class);
		
		//Register our custom partitioner
		job.setPartitionerClass(ProvincePartitioner.class);
		//and set the number of reduce tasks to match the number of partitions (with 2-4 reduce tasks the
		//job would fail with an "Illegal partition" error; with 1 the custom partitioner is bypassed)
		job.setNumReduceTasks(5);
		
		//Specify the key/value types of the mapper output
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(FlowBean.class);
		
		//Specify the key/value types of the final output
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);
		
		//Specify the directory containing the job's input files
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		//Specify the directory for the job's output
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		//Submit the job's configuration, together with the jar containing the job's classes, to YARN for execution
		/*job.submit();*/
		boolean res = job.waitForCompletion(true);
		System.exit(res?0:1);
	}
	

}
-------------------------------------------------------
package cn.feizhou.provinceflow;

import java.util.HashMap;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * K2, V2 correspond to the key/value types of the map output
 * @author
 *
 */
public class ProvincePartitioner extends Partitioner<Text, FlowBean>{

	public static HashMap<String, Integer> proviceDict = new HashMap<String, Integer>();
	static{
		proviceDict.put("136", 0);
		proviceDict.put("137", 1);
		proviceDict.put("138", 2);
		proviceDict.put("139", 3);
	}
	@Override
	public int getPartition(Text key, FlowBean value, int numPartitions) {
		String prefix = key.toString().substring(0, 3);
		Integer provinceId = proviceDict.get(prefix);
		//Return the partition number; prefixes other than 136/137/138/139 go to partition 4
		return provinceId==null?4:provinceId;
	}
}

 

Upload the jar and the test data

 

Upload the file to be parsed to HDFS

 

[hadoop@mini4 ~]$ hadoop fs -mkdir -p /FlowCount/input
[hadoop@mini4 ~]$ hadoop fs -put flow.log  /FlowCount/input

Run the test

[hadoop@mini4 ~]$ hadoop jar FlowCount.jar cn.feizhou.provinceflow.FlowCount /FlowCount/input   /FlowCount/out

Test report

19/01/27 19:17:08 INFO client.RMProxy: Connecting to ResourceManager at mini-yum/192.168.232.128:8032
19/01/27 19:17:10 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
19/01/27 19:17:22 INFO input.FileInputFormat: Total input paths to process : 1
19/01/27 19:17:22 INFO mapreduce.JobSubmitter: number of splits:1
19/01/27 19:17:23 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1548576099360_0002
19/01/27 19:17:23 INFO impl.YarnClientImpl: Submitted application application_1548576099360_0002
19/01/27 19:17:23 INFO mapreduce.Job: The url to track the job: http://mini-yum:8088/proxy/application_1548576099360_0002/
19/01/27 19:17:23 INFO mapreduce.Job: Running job: job_1548576099360_0002
19/01/27 19:17:45 INFO mapreduce.Job: Job job_1548576099360_0002 running in uber mode : false
19/01/27 19:17:45 INFO mapreduce.Job:  map 0% reduce 0%
19/01/27 19:18:04 INFO mapreduce.Job:  map 100% reduce 0%
19/01/27 19:18:21 INFO mapreduce.Job:  map 100% reduce 20%
19/01/27 19:18:22 INFO mapreduce.Job:  map 100% reduce 40%
19/01/27 19:18:23 INFO mapreduce.Job:  map 100% reduce 60%
19/01/27 19:18:42 INFO mapreduce.Job:  map 100% reduce 100%
19/01/27 19:18:45 INFO mapreduce.Job: Job job_1548576099360_0002 completed successfully
19/01/27 19:18:45 INFO mapreduce.Job: Counters: 50
	File System Counters
		FILE: Number of bytes read=863
		FILE: Number of bytes written=643259
		FILE: Number of read operations=0
		FILE: Number of large read operations=0
		FILE: Number of write operations=0
		HDFS: Number of bytes read=2300
		HDFS: Number of bytes written=551
		HDFS: Number of read operations=18
		HDFS: Number of large read operations=0
		HDFS: Number of write operations=10
	Job Counters 
		Killed reduce tasks=2
		Launched map tasks=1
		Launched reduce tasks=6
		Data-local map tasks=1
		Total time spent by all maps in occupied slots (ms)=16543
		Total time spent by all reduces in occupied slots (ms)=111014
		Total time spent by all map tasks (ms)=16543
		Total time spent by all reduce tasks (ms)=111014
		Total vcore-milliseconds taken by all map tasks=16543
		Total vcore-milliseconds taken by all reduce tasks=111014
		Total megabyte-milliseconds taken by all map tasks=16940032
		Total megabyte-milliseconds taken by all reduce tasks=113678336
	Map-Reduce Framework
		Map input records=22
		Map output records=22
		Map output bytes=789
		Map output materialized bytes=863
		Input split bytes=110
		Combine input records=0
		Combine output records=0
		Reduce input groups=21
		Reduce shuffle bytes=863
		Reduce input records=22
		Reduce output records=21
		Spilled Records=44
		Shuffled Maps =5
		Failed Shuffles=0
		Merged Map outputs=5
		GC time elapsed (ms)=1466
		CPU time spent (ms)=9710
		Physical memory (bytes) snapshot=755515392
		Virtual memory (bytes) snapshot=5083746304
		Total committed heap usage (bytes)=294457344
	Shuffle Errors
		BAD_ID=0
		CONNECTION=0
		IO_ERROR=0
		WRONG_LENGTH=0
		WRONG_MAP=0
		WRONG_REDUCE=0
	File Input Format Counters 
		Bytes Read=2190
	File Output Format Counters 
		Bytes Written=551

 

Results:

Read the contents of partition file part-r-00004:

[hadoop@mini4 ~]$ hadoop fs -cat /FlowCount/out/part-r-00004
13480253104    180    180    360
13502468823    7335    110349    117684
13560436666    1116    954    2070
13560439658    2034    5892    7926
15013685858    3659    3538    7197
15920133257    3156    2936    6092
15989002119    1938    180    2118
18211575961    1527    2106    3633
18320173382    9531    2412    11943
84138413    4116    1432    5548

Conclusion:

  • 01 Total upstream, downstream, and overall traffic computed per user: OK
  • 02 Results written to different files by the phone number's home region: OK

 

Remaining requirement: 03, sort the statistics by total traffic in descending order.

My approach is to take each of the output files above as an input file and parse it again, so that each file gets sorted on its own.

Because map output is sorted by key, the FlowBean has to be used as the key here.

Code

package cn.feizhou.flowsum;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * 
 * Implements WritableComparable, i.e. both serialization and ordering; if only
 * serialization were needed, implementing Writable would be enough.
 *
 */
public class FlowBean implements WritableComparable<FlowBean>{
	
	private long upFlow;
	private long dFlow;
	private long sumFlow;
	
	//Deserialization instantiates the bean via reflection through the no-arg constructor, so one must be defined explicitly
	public FlowBean(){}
	
	public FlowBean(long upFlow, long dFlow) {
		this.upFlow = upFlow;
		this.dFlow = dFlow;
		this.sumFlow = upFlow + dFlow;
	}
	public void set(long upFlow, long dFlow) {
		this.upFlow = upFlow;
		this.dFlow = dFlow;
		this.sumFlow = upFlow + dFlow;
	}
	public long getUpFlow() {
		return upFlow;
	}
	public void setUpFlow(long upFlow) {
		this.upFlow = upFlow;
	}
	public long getdFlow() {
		return dFlow;
	}
	public void setdFlow(long dFlow) {
		this.dFlow = dFlow;
	}


	public long getSumFlow() {
		return sumFlow;
	}


	public void setSumFlow(long sumFlow) {
		this.sumFlow = sumFlow;
	}


	/**
	 * Serialization method
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeLong(upFlow);
		out.writeLong(dFlow);
		out.writeLong(sumFlow);
		
	}


	/**
	 * Deserialization method
	 * Note: fields must be read back in exactly the same order they were written
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		 upFlow = in.readLong();
		 dFlow = in.readLong();
		 sumFlow = in.readLong();
	}
	
	@Override
	public String toString() {
		 
		return upFlow + "\t" + dFlow + "\t" + sumFlow;
	}

	/**
	 * Comparison method used for sorting
	 */
	@Override
	public int compareTo(FlowBean o) {
		// Descending order: if this bean's total is larger, return -1 so it sorts first.
		// Never returning 0 matters here: if two beans compared as equal, their records would be
		// grouped into a single reduce() call and only one phone number would be written out.
		return this.sumFlow>o.getSumFlow()?-1:1;
	}

}
--------------------------------------------

package cn.feizhou.flowsum;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * 13480253104 180 180 360 13502468823 7335 110349 117684 13560436666 1116 954
 * 2070
 * 
 * @author
 * 
 */
public class FlowCountSort {

	static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, Text> {

		FlowBean bean = new FlowBean();
		Text v = new Text();

		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			// convert the line to a String
			String line = value.toString();
			// split into fields
			String[] fields = line.split("\t");
			// extract the phone number
			String phoneNbr = fields[0];
			// extract the upstream and downstream traffic
			long upFlow = Long.parseLong(fields[1]);
			long dFlow = Long.parseLong(fields[2]);

			bean.set(upFlow, dFlow);
			v.set(phoneNbr);
			// What gets written out is the serialized data, so reusing these objects is safe:
			// we don't need to worry about their contents changing after the write.
			// key:   FlowBean
			// value: phone number
			context.write(bean, v);

		}

	}

	/**
	 * reduce() is called once per key; the keys passed in here are FlowBean objects and every
	 * one of them compares as distinct, so reduce() is invoked once for each input record.
	  * @author: 张政
	  * @date: 2016-04-11 19:08:18
	  * @package_name: day07.sample
	 */
	static class FlowCountSortReducer extends Reducer<FlowBean, Text, Text, FlowBean> {

		// <bean(),phonenbr>
		@Override
		protected void reduce(FlowBean bean, Iterable<Text> values, Context context) throws IOException, InterruptedException {
			
			// values contains only a single element: the phone number
			context.write(values.iterator().next(), bean);

		}

	}
	
	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();
		/*conf.set("mapreduce.framework.name", "yarn");
		conf.set("yarn.resoucemanager.hostname", "mini1");*/
		Job job = Job.getInstance(conf);
		
		/*job.setJar("/home/hadoop/wc.jar");*/
		//Let Hadoop locate this program's jar from the driver class
		job.setJarByClass(FlowCountSort.class);
		
		//Specify the Mapper/Reducer classes this job will use
		job.setMapperClass(FlowCountSortMapper.class);
		job.setReducerClass(FlowCountSortReducer.class);
		
		
		//Specify the key/value types of the mapper output
		job.setMapOutputKeyClass(FlowBean.class);
		job.setMapOutputValueClass(Text.class);
		
		//Specify the key/value types of the final output
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);
		
		//Specify the directory containing the job's input files
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		//Specify the directory for the job's output
		
		Path outPath = new Path(args[1]);
		/*FileSystem fs = FileSystem.get(conf);
		if(fs.exists(outPath)){
			fs.delete(outPath, true);
		}*/
		FileOutputFormat.setOutputPath(job, outPath);
		
		//Submit the job's configuration, together with the jar containing the job's classes, to YARN for execution
		/*job.submit();*/
		boolean res = job.waitForCompletion(true);
		System.exit(res?0:1);
	}
}

 

Upload the jar

Run the test

[hadoop@mini4 ~]$ hadoop jar FlowCountSort.jar cn.feizhou.flowsum.FlowCountSort   /FlowCount/out/part-r-00004  /FlowCountSort/out/
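The command above sorts only partition part-r-00004. Since every partition file is sorted independently, the remaining partitions can be processed the same way, for example with a small shell loop (the per-partition output directories /FlowCountSort/out-$i are just made-up names for this sketch):

[hadoop@mini4 ~]$ for i in 0 1 2 3 4; do hadoop jar FlowCountSort.jar cn.feizhou.flowsum.FlowCountSort /FlowCount/out/part-r-0000$i /FlowCountSort/out-$i; done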

Test report

19/01/27 19:34:59 INFO client.RMProxy: Connecting to ResourceManager at mini-yum/192.168.232.128:8032
19/01/27 19:35:00 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
19/01/27 19:35:03 INFO input.FileInputFormat: Total input paths to process : 1
19/01/27 19:35:03 INFO mapreduce.JobSubmitter: number of splits:1
19/01/27 19:35:03 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1548576099360_0003
19/01/27 19:35:03 INFO impl.YarnClientImpl: Submitted application application_1548576099360_0003
19/01/27 19:35:04 INFO mapreduce.Job: The url to track the job: http://mini-yum:8088/proxy/application_1548576099360_0003/
19/01/27 19:35:04 INFO mapreduce.Job: Running job: job_1548576099360_0003
19/01/27 19:35:15 INFO mapreduce.Job: Job job_1548576099360_0003 running in uber mode : false
19/01/27 19:35:15 INFO mapreduce.Job:  map 0% reduce 0%
19/01/27 19:35:47 INFO mapreduce.Job:  map 100% reduce 0%
19/01/27 19:35:59 INFO mapreduce.Job:  map 100% reduce 100%
19/01/27 19:36:01 INFO mapreduce.Job: Job job_1548576099360_0003 completed successfully
19/01/27 19:36:04 INFO mapreduce.Job: Counters: 49
	File System Counters
		FILE: Number of bytes read=383
		FILE: Number of bytes written=214283
		FILE: Number of read operations=0
		FILE: Number of large read operations=0
		FILE: Number of write operations=0
		HDFS: Number of bytes read=379
		HDFS: Number of bytes written=267
		HDFS: Number of read operations=6
		HDFS: Number of large read operations=0
		HDFS: Number of write operations=2
	Job Counters 
		Launched map tasks=1
		Launched reduce tasks=1
		Data-local map tasks=1
		Total time spent by all maps in occupied slots (ms)=9820
		Total time spent by all reduces in occupied slots (ms)=9427
		Total time spent by all map tasks (ms)=9820
		Total time spent by all reduce tasks (ms)=9427
		Total vcore-milliseconds taken by all map tasks=9820
		Total vcore-milliseconds taken by all reduce tasks=9427
		Total megabyte-milliseconds taken by all map tasks=10055680
		Total megabyte-milliseconds taken by all reduce tasks=9653248
	Map-Reduce Framework
		Map input records=10
		Map output records=10
		Map output bytes=357
		Map output materialized bytes=383
		Input split bytes=112
		Combine input records=0
		Combine output records=0
		Reduce input groups=10
		Reduce shuffle bytes=383
		Reduce input records=10
		Reduce output records=10
		Spilled Records=20
		Shuffled Maps =1
		Failed Shuffles=0
		Merged Map outputs=1
		GC time elapsed (ms)=239
		CPU time spent (ms)=2790
		Physical memory (bytes) snapshot=314318848
		Virtual memory (bytes) snapshot=1691697152
		Total committed heap usage (bytes)=168103936
	Shuffle Errors
		BAD_ID=0
		CONNECTION=0
		IO_ERROR=0
		WRONG_LENGTH=0
		WRONG_MAP=0
		WRONG_REDUCE=0
	File Input Format Counters 
		Bytes Read=267
	File Output Format Counters 
		Bytes Written=267

Results:

[hadoop@mini4 ~]$ hadoop fs -cat /FlowCountSort/out/part-r-00000
13502468823    7335    110349    117684
18320173382    9531    2412    11943
13560439658    2034    5892    7926
15013685858    3659    3538    7197
15920133257    3156    2936    6092
 84138413    4116    1432    5548
18211575961    1527    2106    3633
15989002119    1938    180    2118
13560436666    1116    954    2070
13480253104    180    180    360

Conclusion:

The file is now sorted by total traffic in descending order.

Note: the row for 84138413 is bad data (it is not a valid 11-digit mobile number) and should have been filtered out when the Map phase parsed the input; it only shows up here because no filtering was done. A filtering sketch follows below.
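A minimal sketch of such a filter (not part of the original code), applied in FlowCount's map() method and assuming a valid mobile number is exactly 11 digits:

		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String[] fields = value.toString().split("\t");
			String phoneNbr = fields[1];
			// drop malformed records such as 84138413 before they ever reach a reducer
			if (!phoneNbr.matches("\\d{11}")) {
				return;
			}
			long upFlow = Long.parseLong(fields[fields.length - 3]);
			long dFlow = Long.parseLong(fields[fields.length - 2]);
			context.write(new Text(phoneNbr), new FlowBean(upFlow, dFlow));
		}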

 

Full source code

https://download.csdn.net/download/zhou920786312/10940769