Case description: count the number of occurrences of each word in a file.
Code:
package com.jeff.mr.wordCount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Case 1:
 * A large file contains lines of words separated by spaces; count how many times each word occurs.
 * @author jeffSheng
 * 2018-09-18
 */
public class RunJob {

    public static void main(String[] args) {
        // Initializing Configuration automatically loads the configuration files under src or on the classpath
        Configuration config = new Configuration();
        try {
            FileSystem fs = FileSystem.get(config);
            // Create the job to run (static factory method), passing in config
            Job job = Job.getInstance(config);
            // Set the entry class, i.e. the current class
            job.setJarByClass(RunJob.class);
            // Set the job name
            job.setJobName("wordCount");
            // The Map task class used when the job runs
            job.setMapperClass(WordCountMapper.class);
            // The Reduce task class used when the job runs
            job.setReducerClass(WordCountReducer.class);
            // The type of the map task's output key, i.e. the word
            job.setMapOutputKeyClass(Text.class);
            // The type of the map task's output value, i.e. the word count
            job.setMapOutputValueClass(IntWritable.class);
            // Specify the input directory for the map tasks: /usr/input/
            FileInputFormat.addInputPath(job, new Path("/usr/input/"));
            // Specify the output directory: /usr/output/wc
            Path outpath = new Path("/usr/output/wc");
            // If the output directory already exists, delete it recursively
            if (fs.exists(outpath)) {
                fs.delete(outpath, true);
            }
            // Set the output directory
            FileOutputFormat.setOutputPath(job, outpath);
            // Wait for the job to complete
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job completed successfully");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
package com.jeff.mr.wordCount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;

/**
 * Map task definition.
 * Counts word occurrences in the file; the output then goes through the default first (map-side) shuffle phase.
 * @author jeffSheng
 * 2018-09-18
 *
 * Extends the Mapper class with generic parameters
 * <map-task input key, map-task input value, map-task output key, map-task output value>:
 * Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
 *
 * KEYIN, VALUEIN:
 * The map task's input comes from the file splits and is passed to the map task line by line.
 * By default the key is the byte offset of the line (LongWritable) and the value is the line's content (Text).
 *
 * KEYOUT, VALUEOUT:
 * The map task's output uses the word as the key (a string, so Text) and the word count as the value (IntWritable).
 * Hence: Mapper<LongWritable, Text, Text, IntWritable>
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * The map method is called repeatedly, once for every line read from the file split,
     * with the line's offset as the key and the line's content as the value.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // value is one line of the split; in this example it is a space-separated string
        String[] words = StringUtils.split(value.toString(), ' ');
        for (String word : words) {
            // Emit a key-value pair with the word as the key and 1 as the value.
            // The map task only emits data; the actual counting happens in the reduce task.
            // The output goes through the first (map-side) shuffle phase: partition, sort, combine, spill,
            // all of which have default behavior on the map-task side.
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
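As a side note, the map method above allocates a new Text and a new IntWritable for every word. A common optional variant, not part of the original code, reuses the output objects across calls, which reduces object churn on large inputs; a minimal sketch (the class name ReusingWordCountMapper is hypothetical):

package com.jeff.mr.wordCount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;

// Hypothetical variant of WordCountMapper that reuses the output Writable instances.
// This is safe because context.write() serializes the current contents immediately,
// so the objects can be mutated again on the next iteration.
public class ReusingWordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Text outKey = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String word : StringUtils.split(value.toString(), ' ')) {
            outKey.set(word);
            context.write(outKey, ONE);
        }
    }
}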
package com.jeff.mr.wordCount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reduce task definition.
 * After the map task's first shuffle phase completes, its output is passed to the reduce task,
 * which performs the second shuffle phase (partition, sort, group) before it becomes the reduce input;
 * the data types stay the same.
 * Tip: after grouping, each group of data is passed to the reduce task, i.e. reduce is invoked once per group;
 * within a group the keys are equal and there may be multiple values.
 * @author jeffSheng
 * 2018-09-18
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * This method is called once per group; within a group the key is the same and there may be multiple values.
     * Text arg0: the key of the group, i.e. a word.
     * Iterable<IntWritable> arg1: an iterator over all the values in the group.
     */
    @Override
    protected void reduce(Text arg0, Iterable<IntWritable> arg1, Context arg2)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable i : arg1) {
            sum = sum + i.get();
        }
        // Emit a key-value pair with the word as the key and the total count as the value
        arg2.write(arg0, new IntWritable(sum));
    }
}
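Because the reduce logic is a plain sum, which is associative and commutative, the same reducer class could also be registered as a map-side combiner, making the "combine" part of the map-side shuffle explicit. This is optional and not in the original RunJob driver; the sketch below assumes it is added right after job.setReducerClass(...):

// Optional (not in the original driver): run WordCountReducer as a combiner on the map side,
// so identical words are pre-summed before the shuffle. With this enabled, the
// "Combine input records" / "Combine output records" counters in the job log would no longer be 0.
job.setCombinerClass(WordCountReducer.class);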
The currently active node is node1, on port 8020.
Set node1 and port 8020 in Map/Reduce Locations by creating a new location.
Fill node1 and 8020 into the red boxes; the Hadoop user is root.
Create the input directory /usr/input.
At first, creating the directory failed; the fix is to add the following to hdfs-site.xml:
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
Restart HDFS and the problem is solved!
# Stop all the nodes
stop-dfs.sh
Then start HDFS again on node1:
# start-dfs.sh
Upload the input data file wc.txt to /usr/input on HDFS (for example with hdfs dfs -put wc.txt /usr/input):
hadooo hello world hello hadoop hbase zookeeper
In a real production environment, the compute program is submitted to the ResourceManager first, so we start by packaging the program as a jar:
Then click Next, and then Finish.
Next we upload wc.jar to node4. Note that node4 is not the ResourceManager master node, but node4's configuration files tell it where the ResourceManager master node, node1, is located.
The entry class of our program:
com.jeff.mr.wordCount.RunJob
# hadoop jar wc.jar com.jeff.mr.wordCount.RunJob
The progress of the computation can be seen in the monitoring UI:
Execution log:
[root@node1 ~]# hadoop jar wc.jar com.jeff.mr.wordCount.RunJob
18/09/21 00:28:10 INFO client.ConfiguredRMFailoverProxyProvider: Failing over to rm2
18/09/21 00:28:10 WARN mapreduce.JobSubmitter: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
18/09/21 00:28:11 INFO input.FileInputFormat: Total input paths to process : 1
18/09/21 00:28:11 INFO mapreduce.JobSubmitter: number of splits:1
18/09/21 00:28:11 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1537198202075_0002
18/09/21 00:28:12 INFO impl.YarnClientImpl: Submitted application application_1537198202075_0002
18/09/21 00:28:12 INFO mapreduce.Job: The url to track the job: http://node4:18088/proxy/application_1537198202075_0002/
18/09/21 00:28:12 INFO mapreduce.Job: Running job: job_1537198202075_0002
18/09/21 00:28:38 INFO mapreduce.Job: Job job_1537198202075_0002 running in uber mode : false
18/09/21 00:28:38 INFO mapreduce.Job: map 0% reduce 0%
18/09/21 00:28:51 INFO mapreduce.Job: map 100% reduce 0%
18/09/21 00:29:04 INFO mapreduce.Job: map 100% reduce 100%
18/09/21 00:29:05 INFO mapreduce.Job: Job job_1537198202075_0002 completed successfully
18/09/21 00:29:05 INFO mapreduce.Job: Counters: 49
    File System Counters
        FILE: Number of bytes read=96
        FILE: Number of bytes written=198601
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=146
        HDFS: Number of bytes written=54
        HDFS: Number of read operations=6
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=2
    Job Counters
        Launched map tasks=1
        Launched reduce tasks=1
        Data-local map tasks=1
        Total time spent by all maps in occupied slots (ms)=11040
        Total time spent by all reduces in occupied slots (ms)=9092
        Total time spent by all map tasks (ms)=11040
        Total time spent by all reduce tasks (ms)=9092
        Total vcore-seconds taken by all map tasks=11040
        Total vcore-seconds taken by all reduce tasks=9092
        Total megabyte-seconds taken by all map tasks=11304960
        Total megabyte-seconds taken by all reduce tasks=9310208
    Map-Reduce Framework
        Map input records=3
        Map output records=7
        Map output bytes=76
        Map output materialized bytes=96
        Input split bytes=97
        Combine input records=0
        Combine output records=0
        Reduce input groups=6
        Reduce shuffle bytes=96
        Reduce input records=7
        Reduce output records=6
        Spilled Records=14
        Shuffled Maps =1
        Failed Shuffles=0
        Merged Map outputs=1
        GC time elapsed (ms)=214
        CPU time spent (ms)=3550
        Physical memory (bytes) snapshot=322617344
        Virtual memory (bytes) snapshot=1724956672
        Total committed heap usage (bytes)=136253440
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters
        Bytes Read=49
    File Output Format Counters
        Bytes Written=54
Job completed successfully
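The WARN line near the top of the log ("Implement the Tool interface and execute your application with ToolRunner to remedy this") is about generic command-line option parsing (-D, -files, and so on) and does not affect this run. For reference, a minimal sketch of a ToolRunner-based driver, assuming the same paths and classes as the original RunJob (the class name RunJobTool is hypothetical):

package com.jeff.mr.wordCount;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical ToolRunner-based variant of RunJob; job setup mirrors the original driver.
public class RunJobTool extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(getConf());
        Job job = Job.getInstance(getConf());
        job.setJarByClass(RunJobTool.class);
        job.setJobName("wordCount");
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("/usr/input/"));
        Path outpath = new Path("/usr/output/wc");
        if (fs.exists(outpath)) {
            fs.delete(outpath, true);
        }
        FileOutputFormat.setOutputPath(job, outpath);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner parses generic options (e.g. -D key=value) before calling run()
        System.exit(ToolRunner.run(new RunJobTool(), args));
    }
}

It would be run the same way, for example: hadoop jar wc.jar com.jeff.mr.wordCount.RunJobTool, optionally with generic options such as -D mapreduce.job.name=wordCount.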
The output directory /usr/output/wc contains two files: the first, _SUCCESS, is the success marker; the second is the output result file:
Input file:
Computed result:
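(The result screenshot is not reproduced here.) Based on the single input line shown above and the job counters (Reduce output records=6, Bytes Written=54), the result file would be expected to contain one tab-separated line per distinct word, roughly:
hadooo	1
hadoop	1
hbase	1
hello	2
world	1
zookeeper	1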