MapReduce Run Modes
1. Local mode vs. cluster mode
// Set to "local" to run in local mode
config.set("mapreduce.framework.name", "local");
// Set to "yarn" to run in cluster mode
config.set("mapreduce.framework.name", "yarn");
2. Input and output paths for data files
// Read and write HDFS paths
config.set("fs.defaultFS", "hdfs://node01:9000");
// Read and write local paths
config.set("fs.defaultFS", "file:///");
Note: in cluster mode, the file paths must be HDFS paths.
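A path without a scheme is resolved against fs.defaultFS, while a path with an explicit scheme always overrides it; a quick illustration (the concrete paths are placeholders):

// With fs.defaultFS = hdfs://node01:9000:
Path p1 = new Path("/input");                    // resolves to hdfs://node01:9000/input
Path p2 = new Path("file:///F:/input");          // local file system, regardless of fs.defaultFS
Path p3 = new Path("hdfs://node01:9000/output"); // HDFS, regardless of fs.defaultFS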
3. When the output path is on HDFS, the job may fail with AccessControlException: Permission denied; the error message looks like this:
Caused by: org.apache.hadoop.ipc.RemoteException: org.apache.hadoop.security.AccessControlException: Permission denied: user=node01, access=WRITE, inode="":suh:supergroup:rwxr-xr-x
Solutions:
1. Add HADOOP_USER_NAME to the system environment variables or as a JVM property, with its value set to the Linux user that Hadoop runs as on the cluster. After the change, restart Eclipse (the recommended approach).
2. Add the following setting in the MapReduce driver program:
// The user named by HADOOP_USER_NAME must have read/write permission on HDFS;
// note that this must be set as a JVM system property, not a Configuration entry
System.setProperty("HADOOP_USER_NAME", "root");
Complete WordCount code
package com.theone.pureone.mymapreducer;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCountMR {
/**
 * LongWritable: the byte offset of the current line within the input file (not a line number)
 * Text: the text of the current line
 * Text: the type of the key output by the Mapper
 * IntWritable: the type of the value output by the Mapper
 * @author Pureone
 */
public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    // Split the line on runs of whitespace and emit (word, 1) for each token
    String[] split = value.toString().split("\\s+");
    for (String str : split) {
        // Reuse the input Text as the output key; write() serializes it immediately
        value.set(str);
        context.write(value, new IntWritable(1));
    }
}
}
/**
 * Text: the type of the key received from the Mapper
 * IntWritable: the type of the value received from the Mapper
 * Text: the type of the key output by the Reducer
 * IntWritable: the type of the value output by the Reducer
 * @author Pureone
 */
public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
    // Sum the per-word counts emitted by the Mappers
    int count = 0;
    for (IntWritable next : values) {
        count += next.get();
    }
    context.write(key, new IntWritable(count));
}
}
/**
 * Driver program
 *
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration config = new Configuration();
// Use HDFS as the default file system
config.set("fs.defaultFS", "hdfs://node01:9000");
// Run as a user that has write permission on HDFS (see section 3 above)
System.setProperty("HADOOP_USER_NAME", "root");
// Run in local mode; set to "yarn" for cluster mode
config.set("mapreduce.framework.name", "local");
// config.set("yarn.resourcemanager.hostname", "node01"); // required in cluster mode
// Pass the Configuration to the job; otherwise the settings above are silently ignored
Job job = Job.getInstance(config);
// Set the driver class so the framework can locate the job jar
job.setJarByClass(WordCountMR.class);
// Set the key/value types of the Mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// Set the key/value types of the Reducer output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// Set the Mapper class
job.setMapperClass(MyMapper.class);
// Set the Reducer class
job.setReducerClass(MyReducer.class);
// Input and output paths for this MapReduce job
// The explicit scheme overrides fs.defaultFS: input is read from the local
// disk (only valid in local mode), output is written to HDFS
Path inputPath = new Path("file:///F:/input");
Path outputPath = new Path("hdfs://node01:9000/first_path/output");
FileSystem fileSystem = FileSystem.get(config);
// Delete the output path if it already exists; otherwise the job fails on startup
if (fileSystem.exists(outputPath)) {
fileSystem.delete(outputPath, true);
}
// Bind the input and output paths to the job
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
// Submit the job and wait for completion; exit 0 on success
boolean completion = job.waitForCompletion(true);
System.exit(completion ? 0 : 1);
}
}
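For reference, given an input file under F:\input containing:

hello world
hello hadoop

the job writes a part-r-00000 file under /first_path/output with tab-separated counts, sorted by key:

hadoop	1
hello	2
world	1

To run the same class in cluster mode, set mapreduce.framework.name to yarn, switch the input to an HDFS path, package the program as a jar, and submit it with the hadoop jar command (the jar name here is a placeholder):

hadoop jar wordcount.jar com.theone.pureone.mymapreducer.WordCountMR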