MapReduce Programming
Prerequisites: on Windows, place winutils.exe and hadoop.dll in hadoop3.3.1/bin,
and configure the HADOOP_HOME and PATH environment variables.
If the following exception is thrown at run time:
Exception in thread "main" java.lang.UnsatisfiedLinkError: org.apache.hadoop
copy hadoop.dll to C:/Windows/System32.
On Windows 10, if you use IDEA, run IDEA as administrator.
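Before submitting a real job, a quick check can surface this UnsatisfiedLinkError early. A minimal sketch, assuming hadoop.dll is reachable via java.library.path (CheckNativeSetup is a hypothetical class name introduced here, not part of the original steps):

public class CheckNativeSetup {
    public static void main(String[] args) {
        // The values the native loader depends on; both should point at the Hadoop install.
        System.out.println("HADOOP_HOME = " + System.getenv("HADOOP_HOME"));
        System.out.println("java.library.path = " + System.getProperty("java.library.path"));
        // Fails here with UnsatisfiedLinkError (instead of deep inside a job)
        // if hadoop.dll cannot be found or loaded.
        System.loadLibrary("hadoop");
        System.out.println("hadoop.dll loaded successfully");
    }
}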
1 Create a Java project.
2 Create a lib folder under the project and copy the Hadoop jars into it.
3 Add all jars in the lib folder to the Java Build Path.
4 Write the map, reduce, and job code.
If you use Maven, add this dependency instead:
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.3.1</version>
</dependency>
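A note on this choice: hadoop-client is an aggregator artifact that pulls in the HDFS, YARN, and MapReduce client libraries, so this single dependency covers all the imports used in the code below.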
Map class:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reused across calls to avoid allocating a new object per record
    private final IntWritable one = new IntWritable(1);
    private final Text word = new Text();

    @Override
    public void map(LongWritable ikey, Text ivalue, Context context)
            throws IOException, InterruptedException {
        // ikey is the byte offset of the line; ivalue is the line itself.
        // Split on runs of whitespace so repeated spaces do not produce empty tokens.
        String[] tokens = ivalue.toString().split("\\s+");
        for (String s : tokens) {
            if (s.isEmpty()) continue;
            word.set(s);
            context.write(word, one); // emit (word, 1)
        }
    }
}
Reduce class:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WordCountReduce extends Reducer<Text, IntWritable, Text, LongWritable> {
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // values holds every 1 the map phase emitted for this word; sum them for the total.
        long sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new LongWritable(sum));
    }
}
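For an input line "hello world hello", the map phase emits (hello,1), (world,1), (hello,1); the framework groups by key, so reduce receives (hello,[1,1]) and (world,[1]) and writes (hello,2) and (world,1). Note that this reducer cannot double as a combiner: a combiner's output types must equal the map output types (Text, IntWritable), while this reducer emits LongWritable. A minimal combiner sketch under that constraint (WordCountCombiner is a name introduced here, not part of the original code):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
// Pre-aggregates map output on each map task; input and output types both
// match the map output (Text, IntWritable), as a combiner requires.
public class WordCountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

Enable it in the driver with job.setCombinerClass(WordCountCombiner.class); the final output is unchanged, but less data crosses the network during the shuffle.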
Job (driver) class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // HDFS paths, for running on the cluster:
        // String[] otherArgs = { "/hello", "/wordcount/output1" };
        String[] otherArgs = { "F:/wordcount/input", "F:/wordcount/output1" };
        Job job = Job.getInstance(conf, "Word Count");
        // Entry class of the job
        job.setJarByClass(WordCountDriver.class);
        // Mapper class
        job.setMapperClass(WordCountMap.class);
        // Reducer class
        job.setReducerClass(WordCountReduce.class);
        // Output key/value types of the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Output key/value types of the reduce phase, i.e. the final output of the whole MR job
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Input path of the job
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        // Output path of the job (must not exist yet)
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
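One pitfall when rerunning: FileOutputFormat requires that the output directory not exist, otherwise the job fails with FileAlreadyExistsException. A small sketch that deletes it before submission (OutputCleaner and deleteIfExists are hypothetical names introduced here):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class OutputCleaner {
    // Deletes the output directory if it exists, so a rerun does not fail
    // with FileAlreadyExistsException.
    public static void deleteIfExists(Configuration conf, String dir) throws Exception {
        FileSystem fs = FileSystem.get(conf);
        Path out = new Path(dir);
        if (fs.exists(out)) {
            fs.delete(out, true); // true = recursive
        }
    }
}

Call OutputCleaner.deleteIfExists(conf, otherArgs[1]); in the driver before job.waitForCompletion(true).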
5 Run the job.
6 View the results in F:/wordcount/output1.
7 Modify the code to run on Linux (read the input/output paths from the command line; see the sketch after this list).
8 Export a jar with Eclipse's export function.
9 Start HDFS and YARN on the Linux cluster.
10 Upload the jar to Linux.
11 Run the program with hadoop jar XXXX.jar.
12 View the results in HDFS.
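For step 7, a common change is to read the paths from the command line instead of hard-coding F: paths, so the same driver runs against HDFS. A sketch of the relevant lines only, using Hadoop's org.apache.hadoop.util.GenericOptionsParser (which also strips generic options such as -D settings):

// In WordCountDriver.main(), replace the hard-coded otherArgs with:
// (requires: import org.apache.hadoop.util.GenericOptionsParser;)
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
    System.err.println("Usage: WordCountDriver <input path> <output path>");
    System.exit(2);
}
// Then pass HDFS paths as the two arguments when running hadoop jar.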
If you build with IDEA and Maven, add the following plugin to the pom so that the dependencies are packed into the jar:
<build>
    <plugins>
        <!-- (start) package the jar with its dependencies -->
        <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
                <archive>
                    <manifest>
                        <!-- set this to your driver class (fully qualified if it is in a package) -->
                        <mainClass>WordCountDriver</mainClass>
                    </manifest>
                </archive>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id> <!-- this is used for inheritance merges -->
                    <phase>package</phase> <!-- bind to the packaging phase -->
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
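With this plugin in place, mvn package writes an extra <artifactId>-<version>-jar-with-dependencies.jar under target/; that is the jar to upload to Linux and run with hadoop jar.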