package com.bjsxt.mr.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the word-count MapReduce job.
 *
 * <p>Usage: {@code yarn jar wordcount.jar com.bjsxt.mr.wordcount.MainClass <input> <output>}
 * where {@code <input>} is an existing HDFS path and {@code <output>} is an
 * HDFS path that must NOT yet exist (the OutputFormat refuses to overwrite).
 */
public class MainClass {
    public static void main(String[] args) throws Exception {
        // Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: MainClass <input path> <output path>");
            System.exit(2);
        }
        // 1. Load the cluster configuration (true = also read *-site.xml resources).
        Configuration conf = new Configuration(true);
        // 2. Create the Job from that configuration.
        Job job = Job.getInstance(conf);
        // 3. Tell Hadoop which jar to ship by locating the jar containing this class.
        job.setJarByClass(MainClass.class);
        // 4. Human-readable job name shown in the ResourceManager UI.
        job.setJobName("mywordcount");
        // 5. Mapper implementation.
        job.setMapperClass(WordCountMapper.class);
        // 6. Reducer implementation.
        job.setReducerClass(WordCountReduce.class);
        // 7. HDFS input path to read.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // 8. HDFS output path; must not already exist.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 9./10. Job output key/value types. These also serve as the map output
        // types because no setMapOutputKeyClass/setMapOutputValueClass override
        // is given and mapper and reducer emit the same (Text, LongWritable) pair.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 11. Number of reduce tasks (=> number of part-r-NNNNN output files).
        job.setNumReduceTasks(2);
        // Submit and block until completion; propagate success/failure via exit code
        // so callers (scripts, schedulers) can detect a failed job.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
package com.bjsxt.mr.wordcount;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper that tokenizes each input line on whitespace and emits
 * {@code (word, 1L)} for every non-empty token.
 *
 * <p>Input key is the byte offset of the line (unused); input value is the
 * line text. Output key/value types must match the job's configured
 * (Text, LongWritable) output types.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    // Writables are reused across map() calls to avoid one allocation per
    // record (standard Hadoop idiom); the count is always 1, set once here.
    private final LongWritable outValue = new LongWritable(1L);
    private final Text outKey = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split on runs of whitespace rather than a single space: split(" ")
        // produces empty-string tokens for leading/consecutive spaces, which
        // would otherwise be counted as a bogus "" word.
        String[] words = value.toString().split("\\s+");
        for (String word : words) {
            if (word.isEmpty()) {
                continue; // a leading delimiter still yields one empty first token
            }
            outKey.set(word);
            // Write the (word, 1) pair into the map-side shuffle buffer.
            context.write(outKey, outValue);
        }
    }
}
package com.bjsxt.mr.wordcount;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reducer that totals the per-word counts emitted by the mapper.
 *
 * <p>For each distinct word, the framework hands us that word plus an
 * iterable over all of its 1L counts; we sum them and emit a single
 * {@code (word, total)} pair to the job output.
 */
public class WordCountReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    // Reused output Writable — one instance for the lifetime of the task.
    private LongWritable outValue = new LongWritable();

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long total = 0L;
        for (LongWritable count : values) {
            total += count.get();
        }
        // Emit the aggregated count for this word to the HDFS output file.
        outValue.set(total);
        context.write(key, outValue);
    }
}
[root@nodeok ~]# hdfs dfs -D dfs.replication=1 -D dfs.blocksize=1048576 -put hello.txt /
19/04/11 16:40:49 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[root@nodeok ~]# clear
[root@nodeok ~]# ls
anaconda-ks.cfg hadoop-2.6.5.tar.gz hello.txt install.log install.log.syslog jdk-7u80-linux-x64.rpm wordcount.jar
[root@nodeok ~]# yarn jar wordcount.jar com.bjsxt.mr.wordcount.MainClass /hello.txt /mr
19/04/11 16:41:19 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
19/04/11 16:41:21 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
19/04/11 16:41:22 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
19/04/11 16:41:23 INFO input.FileInputFormat: Total input paths to process : 1
19/04/11 16:41:23 INFO mapreduce.JobSubmitter: number of splits:2
19/04/11 16:41:24 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1554971932274_0001
19/04/11 16:41:25 INFO impl.YarnClientImpl: Submitted application application_1554971932274_0001
19/04/11 16:41:25 INFO mapreduce.Job: The url to track the job: http://nodeok:8088/proxy/application_1554971932274_0001/
19/04/11 16:41:25 INFO mapreduce.Job: Running job: job_1554971932274_0001
19/04/11 16:41:45 INFO mapreduce.Job: Job job_1554971932274_0001 running in uber mode : false
19/04/11 16:41:45 INFO mapreduce.Job: map 0% reduce 0%
19/04/11 16:42:16 INFO mapreduce.Job: map 83% reduce 0%
19/04/11 16:42:17 INFO mapreduce.Job: map 100% reduce 0%
19/04/11 16:42:47 INFO mapreduce.Job: map 100% reduce 71%
19/04/11 16:42:50 INFO mapreduce.Job: map 100% reduce 98%
19/04/11 16:42:51 INFO mapreduce.Job: map 100% reduce 100%
19/04/11 16:42:52 INFO mapreduce.Job: Job job_1554971932274_0001 completed successfully
19/04/11 16:42:52 INFO mapreduce.Job: Counters: 49
File System Counters
FILE: Number of bytes read=4788907
FILE: Number of bytes written=10005548
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=1793177
HDFS: Number of bytes written=788921
HDFS: Number of read operations=12
HDFS: Number of large read operations=0
HDFS: Number of write operations=4
Job Counters
Launched map tasks=2
Launched reduce tasks=2
Data-local map tasks=2
Total time spent by all maps in occupied slots (ms)=57804
Total time spent by all reduces in occupied slots (ms)=61580
Total time spent by all map tasks (ms)=57804
Total time spent by all reduce tasks (ms)=61580
Total vcore-milliseconds taken by all map tasks=57804
Total vcore-milliseconds taken by all reduce tasks=61580
Total megabyte-milliseconds taken by all map tasks=59191296
Total megabyte-milliseconds taken by all reduce tasks=63057920
Map-Reduce Framework
Map input records=100000
Map output records=300000
Map output bytes=4188895
Map output materialized bytes=4788919
Input split bytes=186
Combine input records=0
Combine output records=0
Reduce input groups=100002
Reduce shuffle bytes=4788919
Reduce input records=300000
Reduce output records=100002
Spilled Records=600000
Shuffled Maps =4
Failed Shuffles=0
Merged Map outputs=4
GC time elapsed (ms)=2504
CPU time spent (ms)=16690
Physical memory (bytes) snapshot=586608640
Virtual memory (bytes) snapshot=3370262528
Total committed heap usage (bytes)=273997824
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=1792991
File Output Format Counters
Bytes Written=788921
[root@nodeok ~]# hdfs dfs -get /mr
19/04/11 16:44:42 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[root@nodeok ~]# ls
anaconda-ks.cfg hadoop-2.6.5.tar.gz hello.txt install.log install.log.syslog jdk-7u80-linux-x64.rpm mr wordcount.jar
[root@nodeok ~]# cd mr/
[root@nodeok mr]# ls
part-r-00000 part-r-00001 _SUCCESS
[root@nodeok mr]# ll -h
total 776K
-rw-r--r-- 1 root root 386K Apr 11 16:44 part-r-00000
-rw-r--r-- 1 root root 386K Apr 11 16:44 part-r-00001
-rw-r--r-- 1 root root 0 Apr 11 16:44 _SUCCESS
[root@nodeok mr]# vi part-r-00000
[root@nodeok mr]# vi part-r-00001