1. Create a new Maven project in IDEA and configure the project JDK
File -> Project Structure -> Modules
2. Configure pom.xml and add the dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hjt.yxh.hw</groupId>
    <artifactId>MapReduceLearn</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- Compile with JDK 15 -->
    <profiles>
        <profile>
            <id>jdk-15</id>
            <activation>
                <activeByDefault>true</activeByDefault>
                <jdk>15</jdk>
            </activation>
            <properties>
                <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
                <maven.compiler.source>15</maven.compiler.source>
                <maven.compiler.target>15</maven.compiler.target>
                <maven.compiler.compilerVersion>15</maven.compiler.compilerVersion>
            </properties>
        </profile>
    </profiles>

    <!-- Hadoop dependencies. Note: hadoop-client already pulls in hadoop-common
         and the HDFS client artifacts transitively, so the extra entries below
         are likely redundant, though harmless. -->
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.3.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.3.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs-client</artifactId>
            <version>3.3.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>3.3.1</version>
        </dependency>
    </dependencies>
</project>
3. Write the code; the complete program is as follows:
package com.hjt.yxh.hw.mapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private LongWritable one = new LongWritable(1L);
    private Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line into words and emit (word, 1) for each one
        String[] wordList = value.toString().split(" ");
        for (String item : wordList) {
            word.set(item);
            context.write(word, one);
        }
    }
}

class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    private LongWritable result = new LongWritable(0);

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // Sum all the counts emitted for this word
        long ret = 0;
        for (LongWritable item : values) {
            ret += item.get();
        }
        result.set(ret);
        context.write(key, result);
    }
}

public class WordCount {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Create a job
        Configuration configuration = new Configuration();
        /* These settings can be omitted when the jar is uploaded to the Linux Hadoop
           cluster and run there; they are only needed when submitting from the IDE:
        configuration.addResource("core-site.xml");
        configuration.addResource("hdfs-site.xml");
        configuration.addResource("mapred-site.xml");
        configuration.addResource("yarn-site.xml");
        System.setProperty("HADOOP_USER_NAME", "root");
        configuration.set("mapreduce.job.jar", "D:\\javaworkspace\\BigData\\Hadoop\\MapReduceLearn\\target\\MapReduceLearn-1.0-SNAPSHOT.jar");
        */
        Job job = Job.getInstance(configuration);

        // Set the job name
        job.setJobName("wordCount");
        job.setJarByClass(WordCount.class);

        // Set the input file
        FileInputFormat.addInputPath(job, new Path("/root/data/student2.txt"));

        // Set the output path, deleting it first if it already exists
        Path out = new Path("/root/out/");
        if (out.getFileSystem(job.getConfiguration()).exists(out)) {
            out.getFileSystem(job.getConfiguration()).delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);

        // Set the Mapper class and its output key/value types
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Set the Reducer class and the final output key/value types
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Run the job and exit with its status
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
4. Package as a jar, upload it to the Linux server, and run the command:
Syntax: hadoop jar <jar file> <main entry class> [arguments]
[root@k8s-node3 mapreduce_task_jar]# hadoop jar MapReduceLearn-1.0-SNAPSHOT.jar com.hjt.yxh.hw.mapReduce.WordCount
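If the job finishes successfully, the word counts are written to part files in the output directory; with the default single reducer, the result can be inspected with something like:
hadoop fs -cat /root/out/part-r-00000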
5. Summary
How to write the driver (entry) class
Writing a MapReduce driver class is largely boilerplate; the steps are listed below, followed by a minimal skeleton:
- 1. Create a configuration object: Configuration conf = new Configuration();
- 2. Create a Job instance: Job job = Job.getInstance(conf);
- 3. Set job parameters, such as the job name (setJobName) and the jar (setJarByClass).
- 4. Set the job's input path(s).
- 5. Set the job's output path.
- 6. Set the Mapper class plus the classes of its output key and output value (you can also set things like setSortComparatorClass or setCombinerClass, but those are optional optimizations, not required).
- 7. Set the Reducer class plus the classes of its output key and output value.
- 8. Call the job's execution method: job.waitForCompletion(true).
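Putting the eight steps together, a minimal driver skeleton looks like the following. This is a sketch, not the program from section 3: the class name WordCountDriver is illustrative, the input and output paths come from the command line instead of being hardcoded, and it assumes the WordCountMapper and WordCountReducer classes from above are in the same package.

package com.hjt.yxh.hw.mapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();               // 1. configuration
        Job job = Job.getInstance(conf);                        // 2. job instance
        job.setJobName("wordCount");                            // 3. job parameters
        job.setJarByClass(WordCountDriver.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // 4. input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // 5. output path
        job.setMapperClass(WordCountMapper.class);              // 6. mapper + its output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(WordCountReducer.class);            // 7. reducer + final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);       // 8. run and wait
    }
}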
Implementing the Mapper class
A Mapper implementation extends Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> and overrides the map method; the four type parameters are as follows (a variant is sketched after this list):
- KEYIN is essentially always LongWritable: the byte offset within the input file, supplied by Hadoop.
- VALUEIN is likewise essentially fixed: Text, one line of the input file.
- KEYOUT: the type of the map output key. It must implement Writable (in practice WritableComparable, since keys get sorted), and it is the input key type of the Reducer.
- VALUEOUT: the type of the map output value. It must implement Writable, and it is the input value type of the Reducer.
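One subtle point in the WordCount code above: split(" ") treats every single space as a delimiter, so consecutive spaces produce empty tokens that get counted as "words". A hedged variant (the class name RobustWordCountMapper is purely illustrative) splits on the whitespace regex instead:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

class RobustWordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private final LongWritable one = new LongWritable(1L);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // \s+ collapses runs of spaces and tabs into a single delimiter
        for (String item : value.toString().split("\\s+")) {
            if (item.isEmpty()) {
                continue; // a leading delimiter still yields one empty token
            }
            word.set(item);
            context.write(word, one);
        }
    }
}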
Implementing the Reducer class
A Reducer implementation extends Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT> and overrides the reduce method.
- KEYIN: determined by the Mapper, since the map output is the reduce input; it is the map output key type.
- VALUEIN: likewise determined by the Mapper; it is the map output value type.
- KEYOUT: the type of the reduce output key. It must implement Writable, and it is the job's final output key type.
- VALUEOUT: the type of the reduce output value. It must implement Writable, and it is the job's final output value type.
The reduce method:
protected void reduce(Text key, Iterable<LongWritable> values, Context context)
- The first parameter is the key, whose type is the output key type of the Mapper.
- The second parameter is an Iterable over the values the Mapper emitted for that key. The important rule to remember: in MapReduce, all pairs with the same key form one group and are processed by a single reduce call, which is why the second parameter is a collection of values rather than a single value.
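For example, if the map phase emits (hello, 1), (world, 1), (hello, 1), the framework sorts and groups the pairs by key, and reduce is called twice: once as reduce(hello, [1, 1]) and once as reduce(world, [1]).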
6. Possible errors and pitfalls to avoid
Beware of importing the wrong package
When writing MapReduce programs, some classes are easy to import from the wrong package. In particular, when importing FileOutputFormat and FileInputFormat, do not import org.apache.hadoop.mapred.FileOutputFormat; the correct import is org.apache.hadoop.mapreduce.lib.output.FileOutputFormat (and likewise org.apache.hadoop.mapreduce.lib.input.FileInputFormat).
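For reference, the wrong and right imports side by side:

// Wrong: the old mapred API; not compatible with the mapreduce.Job-based driver above
// import org.apache.hadoop.mapred.FileInputFormat;
// import org.apache.hadoop.mapred.FileOutputFormat;

// Correct: the new mapreduce API
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;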