Use MapReduce to process the text below, analyzing each user's upstream traffic, downstream traffic, and total traffic, and output the phone number, upstream traffic, downstream traffic, and total traffic.
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200
1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200
1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200
1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200
1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
1363157982040 13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102 7335 110349 200
1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
1363157990043 13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63 11058 48243 200
1363157988072 13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200
1363157985066 13726238888 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157993055 13560436666 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
1. Approach: the map phase reads the text one line at a time; the framework then groups the map output by key and hands each group to the reduce phase for centralized aggregation. When the map side finishes processing, its output is written to disk and shuffled to the reducers.
From the requirements we need four pieces of information from each record: phone number, upstream traffic, downstream traffic, and total traffic.
The phone number serves as the key, while the upstream, downstream, and total traffic travel together as the value, so we define a bean class to carry them. For example, the first record for 13726230503 contributes up=2481 and down=24681, giving a total of 27162.
The complete code follows:
package com.oracle.j2ee.mapreduce;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MyFlow {
    // A Writable bean that carries the traffic fields for one user
    public static class FlowBean implements Writable {

        public long up;
        public long down;
        public long count;

        // The no-arg constructor is required: Hadoop instantiates the bean
        // via reflection during deserialization and fails without it
        public FlowBean() {
        }

        public FlowBean(long up, long down) {
            this.up = up;
            this.down = down;
            this.count = up + down;
        }

        public long getUp() {
            return up;
        }

        public void setUp(long up) {
            this.up = up;
        }

        public long getDown() {
            return down;
        }

        public void setDown(long down) {
            this.down = down;
        }

        public long getCount() {
            return count;
        }

        public void setCount(long count) {
            this.count = count;
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeLong(up);
            out.writeLong(down);
            out.writeLong(count);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            up = in.readLong();
            down = in.readLong();
            count = in.readLong();
        }

        // Override toString() to suit your needs; it only affects
        // how the bean is rendered in the output file
        @Override
        public String toString() {
            return up + " " + down + " " + count;
        }
    }

    // Mapper: parse each line of input
    public static class MyMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Convert the current line to a string
            String str = value.toString();
            // Split on tabs
            String[] news = str.split("\t");
            // From the file layout, the phone number is the second field
            String phone = news[1];
            // Upstream and downstream traffic are the third- and second-from-last
            // fields; we index from the end because some records omit the
            // hostname/category columns
            long up = Long.parseLong(news[news.length - 3]);
            long down = Long.parseLong(news[news.length - 2]);
            // Wrap the upstream and downstream values in a bean
            FlowBean flow = new FlowBean(up, down);
            // Emit the phone number as the key and the bean as the value;
            // grouping by phone number makes the per-user totals easy to sum
            context.write(new Text(phone), flow);
        }
    }

    // Reducer: aggregate the records the map phase emitted for each phone number
    public static class MyReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
        @Override
        protected void reduce(Text key, Iterable<FlowBean> iter, Context context)
                throws IOException, InterruptedException {
            long upSum = 0;
            long downSum = 0;
            // Walk the group and sum the upstream and downstream traffic
            for (FlowBean arr : iter) {
                upSum += arr.getUp();
                downSum += arr.getDown();
            }
            FlowBean flow = new FlowBean(upSum, downSum);
            context.write(key, flow);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Load the configuration
        Configuration conf = new Configuration();
        // Get a Job instance
        Job job = Job.getInstance(conf);
        // Point the job at this jar so it can run on the cluster
        job.setJarByClass(MyFlow.class);
        // Register MyMapper and MyReducer
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // Declare the map output key and value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        // Declare the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        // Configure the input and output paths
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}
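An optional tweak, not in the original code: since the aggregation is a plain per-key sum and MyReducer's input and output types are both Text/FlowBean, the reducer can double as a map-side combiner to pre-aggregate records and shrink the shuffle. A one-line sketch to add in main() before submitting:

// Optional: run MyReducer map-side as a combiner so records with the
// same phone number are partially summed before the shuffle
job.setCombinerClass(MyReducer.class);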
Once the code is done, package the project as a jar, upload it to the cluster with a file-transfer tool, and put the text file to be processed into HDFS.
Upload command: hadoop fs -put flow.log /input/
Run the jar: hadoop jar flow.jar com.oracle.j2ee.mapreduce.MyFlow /input/flow.log /output
(The sample run below used /flow.log and /myoutput4 as the input and output paths.)
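One practical caveat not mentioned in the original post: FileOutputFormat refuses to start the job if the output directory already exists, so remove it before re-running:

hadoop fs -rm -r /output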
Output like the following generally indicates a successful run:
[root@hadoop-1 apps]# hadoop jar flow.jar com.oracle.j2ee.mapreduce.MyFlow /flow.log /myoutput4
18/09/28 03:42:59 INFO client.RMProxy: Connecting to ResourceManager at hadoop-1/192.168.1.222:8032
18/09/28 03:42:59 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
18/09/28 03:43:00 INFO input.FileInputFormat: Total input paths to process : 1
18/09/28 03:43:00 INFO mapreduce.JobSubmitter: number of splits:1
18/09/28 03:43:00 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1538049918445_0009
18/09/28 03:43:01 INFO impl.YarnClientImpl: Submitted application application_1538049918445_0009
18/09/28 03:43:01 INFO mapreduce.Job: The url to track the job: http://hadoop-1:8088/proxy/application_1538049918445_0009/
18/09/28 03:43:01 INFO mapreduce.Job: Running job: job_1538049918445_0009
18/09/28 03:43:10 INFO mapreduce.Job: Job job_1538049918445_0009 running in uber mode : false
18/09/28 03:43:10 INFO mapreduce.Job: map 0% reduce 0%
18/09/28 03:43:19 INFO mapreduce.Job: map 100% reduce 0%
18/09/28 03:43:27 INFO mapreduce.Job: map 100% reduce 100%
18/09/28 03:43:28 INFO mapreduce.Job: Job job_1538049918445_0009 completed successfully
18/09/28 03:43:28 INFO mapreduce.Job: Counters: 49
	File System Counters
		FILE: Number of bytes read=839
		FILE: Number of bytes written=214929
		FILE: Number of read operations=0
		FILE: Number of large read operations=0
		FILE: Number of write operations=0
		HDFS: Number of bytes read=2284
		HDFS: Number of bytes written=551
		HDFS: Number of read operations=6
		HDFS: Number of large read operations=0
		HDFS: Number of write operations=2
	Job Counters
		Launched map tasks=1
		Launched reduce tasks=1
		Data-local map tasks=1
		Total time spent by all maps in occupied slots (ms)=6508
		Total time spent by all reduces in occupied slots (ms)=6420
		Total time spent by all map tasks (ms)=6508
		Total time spent by all reduce tasks (ms)=6420
		Total vcore-milliseconds taken by all map tasks=6508
		Total vcore-milliseconds taken by all reduce tasks=6420
		Total megabyte-milliseconds taken by all map tasks=6664192
		Total megabyte-milliseconds taken by all reduce tasks=6574080
	Map-Reduce Framework
		Map input records=22
		Map output records=22
		Map output bytes=789
		Map output materialized bytes=839
		Input split bytes=94
		Combine input records=0
		Combine output records=0
		Reduce input groups=21
		Reduce shuffle bytes=839
		Reduce input records=22
		Reduce output records=21
		Spilled Records=44
		Shuffled Maps =1
		Failed Shuffles=0
		Merged Map outputs=1
		GC time elapsed (ms)=167
		CPU time spent (ms)=1570
		Physical memory (bytes) snapshot=330604544
		Virtual memory (bytes) snapshot=1684975616
		Total committed heap usage (bytes)=136450048
	Shuffle Errors
		BAD_ID=0
		CONNECTION=0
		IO_ERROR=0
		WRONG_LENGTH=0
		WRONG_MAP=0
		WRONG_REDUCE=0
	File Input Format Counters
		Bytes Read=2190
	File Output Format Counters
		Bytes Written=551
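One WARN line near the top of the log reads "Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this." It is harmless here, but the fix it suggests looks roughly like the sketch below (an assumed refactoring of the driver, not part of the original post):

// Additional imports needed for the refactored driver:
// import org.apache.hadoop.conf.Configured;
// import org.apache.hadoop.util.Tool;
// import org.apache.hadoop.util.ToolRunner;
public class MyFlow extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // getConf() returns the configuration ToolRunner parsed from the command line
        Job job = Job.getInstance(getConf());
        // ...same mapper/reducer/type/path setup as in main() above...
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new MyFlow(), args));
    }
}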
Then inspect the output file in the corresponding directory. Each line shows the phone number, upstream traffic, downstream traffic, and total traffic. Note that 13560439658 appears twice in the input (1116 + 918 up, 954 + 4938 down), and the reducer aggregates it into a single line with 2034 and 5892 below:
[root@hadoop-1 apps]# hadoop fs -cat /myoutput4/part-r-00000
13480253104 180 180 360
13502468823 7335 110349 117684
13560436666 1116 954 2070
13560439658 2034 5892 7926
13602846565 1938 2910 4848
13660577991 6960 690 7650
13719199419 240 0 240
13726230503 2481 24681 27162
13726238888 2481 24681 27162
13760778710 120 120 240
13826544101 264 0 264
13922314466 3008 3720 6728
13925057413 11058 48243 59301
13926251106 240 0 240
13926435656 132 1512 1644
15013685858 3659 3538 7197
15920133257 3156 2936 6092
15989002119 1938 180 2118
18211575961 1527 2106 3633
18320173382 9531 2412 11943
84138413 4116 1432 5548
Notes:
1. The bean class must be serializable: implement the Writable interface and override its write and readFields methods. The field order in the two methods must be identical, or deserialization will fail (see the sketch after this list).
2. Running on a cluster requires job.setJarByClass(****.class);
3. If the Mapper and Reducer are written as inner classes, they must be declared static.
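To make note 1 concrete, here is a minimal round-trip check (not in the original post; it assumes MyFlow and the Hadoop jars are on the classpath) that serializes a FlowBean with write and reads it back with readFields. If the field order in the two methods ever diverges, the printed values come back scrambled:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class FlowBeanRoundTrip {
    public static void main(String[] args) throws Exception {
        MyFlow.FlowBean in = new MyFlow.FlowBean(2481, 24681);

        // Serialize with write()
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));

        // Deserialize with readFields(); the field order must match write()
        MyFlow.FlowBean out = new MyFlow.FlowBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        // Expect: 2481 24681 27162
        System.out.println(out);
    }
}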