一、创建 maven 工程并导入依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>cn.kgc</groupId>
    <artifactId>mapreduce</artifactId>
    <version>1.0</version>
    <name>mapreduce</name>
    <!-- FIXME change it to the project's website -->
    <url>http://www.example.com</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <hadoop.version>3.1.3</hadoop.version>
        <log4j.version>1.2.17</log4j.version>
        <!-- FIX: hadoop-client 3.1.3 ships slf4j-api 1.7.25; the SLF4J 2.x
             (2.0.0-alpha1) binding uses a different provider mechanism and
             would not bind against it. Pin the matching 1.7.x binding. -->
        <slf4j.version>1.7.25</slf4j.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>${log4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <!-- FIX: 2.3.2 predates Java 8 support; 3.8.1 handles source/target 1.8 -->
                <version>3.8.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- Builds the fat ("uber") jar with all dependencies bundled -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.3</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <!-- Entry point recorded in the jar's MANIFEST.MF.
                             FIX: the element is <mainClass>; <main-class> is not a
                             valid archiver element, so the manifest had no Main-Class. -->
                        <manifest>
                            <mainClass>cn.kgc.mapreduce.App</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <!-- FIX: typo "make-assemply" -->
                        <id>make-assembly</id>
                        <!-- Run the assembly automatically during `mvn package` -->
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
二、编写Mapper代码
package cn.kgc.mapreduce;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

/**
 * Map stage of WordCount: strips English punctuation from each input line,
 * splits it on whitespace, and emits one (word, 1) pair per token.
 *
 * Input:  (byte offset of the line, line text)
 * Output: (word, 1)
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // FIX: compile the regexes once instead of re-parsing them on every map() call
    // (String.replaceAll recompiles the pattern each time).
    // Punctuation to strip, for English text: ! ; , . ? '
    private static final Pattern PUNCTUATION = Pattern.compile("!|;|,|\\.|\\?|'");
    // Runs of two or more spaces, collapsed to a single space.
    private static final Pattern MULTI_SPACE = Pattern.compile(" {2,}");

    // Reused across map() calls to avoid one allocation per record.
    private final Text keyOut = new Text();
    private final IntWritable valueOut = new IntWritable(1);

    // Context aggregates output from the framework's map tasks.
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Remove punctuation, then squeeze repeated spaces down to one.
        String cleaned = MULTI_SPACE.matcher(
                PUNCTUATION.matcher(value.toString()).replaceAll("")).replaceAll(" ");
        // StringTokenizer's default delimiter set is whitespace.
        StringTokenizer tokens = new StringTokenizer(cleaned);
        while (tokens.hasMoreTokens()) { // FIX: pair hasMoreTokens() with nextToken()
            keyOut.set(tokens.nextToken());
            context.write(keyOut, valueOut);
        }
    }
}
三、编写 Reducer 代码
package cn.kgc.mapreduce;

import org.apache.hadoop.io.IntWritable;
// FIX: was `import javax.xml.soap.Text;` — the SOAP Text class, not Hadoop's.
// With the wrong Text the reducer's key type does not match the mapper's output
// and the job fails at runtime with a ClassCastException.
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reduce stage of WordCount: sums the 1-counts emitted for each word.
 * Also usable as a Combiner, since summation is associative and its
 * input and output types are identical.
 *
 * Input:  (word, [1, 1, ...])
 * Output: (word, total count)
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused across reduce() calls to avoid one allocation per key.
    private final IntWritable valueOut = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        // Idiomatic for-each instead of a manual Iterator loop.
        for (IntWritable count : values) {
            sum += count.get();
        }
        valueOut.set(sum);
        context.write(key, valueOut);
    }
}
四、编写 Job 执行代码
package cn.kgc.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

/**
 * WordCount job driver: validates the HDFS input/output paths, then
 * configures and submits the MapReduce job, blocking until it finishes.
 *
 * Usage: App [inputPath] [outputPath]
 * (defaults to /test/data/story.txt and /test/out/ when omitted)
 */
public class App {

    public static void main(String[] args) {
        // Load Hadoop's default configuration entries.
        Configuration conf = new Configuration(true);
        // Allow submitting from a non-Linux client (e.g. a Windows IDE).
        conf.set("mapreduce.app-submission.cross-platform", "true");

        // Generalized: paths may be supplied on the command line; the original
        // hard-coded values remain the defaults, so existing usage is unchanged.
        final Path pathIn = new Path(args.length > 0 ? args[0] : "/test/data/story.txt");
        final Path pathOut = new Path(args.length > 1 ? args[1] : "/test/out/");

        try {
            // FIX: try-with-resources — the original leaked the FileSystem on both
            // early-return paths (it was only closed on the success path). The FS
            // is used solely for the existence checks, so it is released here.
            try (FileSystem fs = FileSystem.get(
                    URI.create("hdfs://192.168.131.200:9820"), conf, "root")) {
                // The input file must exist...
                if (!fs.exists(pathIn)) {
                    System.err.println(pathIn.getName() + " doesn't exist");
                    return;
                }
                // ...and the output directory must NOT (MapReduce refuses to overwrite).
                if (fs.exists(pathOut)) {
                    System.err.println(pathOut.getName() + " has existed");
                    return;
                }
            }

            Job wcJob = Job.getInstance(conf, "wc");
            // Jar to ship to the cluster (requires `mvn package` first).
            wcJob.setJar("target/mapreduce-1.0-jar-with-dependencies.jar");
            wcJob.setJarByClass(App.class);

            // Mapper and its output key/value types.
            wcJob.setMapperClass(WordCountMapper.class);
            wcJob.setMapOutputKeyClass(Text.class);
            wcJob.setMapOutputValueClass(IntWritable.class);

            // The reducer doubles as the combiner: summation is associative.
            wcJob.setCombinerClass(WordCountReducer.class);
            wcJob.setNumReduceTasks(1);

            // Reducer and the job's final output key/value types.
            wcJob.setReducerClass(WordCountReducer.class);
            wcJob.setOutputKeyClass(Text.class);
            wcJob.setOutputValueClass(IntWritable.class);

            FileInputFormat.addInputPath(wcJob, pathIn);
            FileOutputFormat.setOutputPath(wcJob, pathOut);

            // true => stream progress to the console while waiting.
            // FIX: propagate success/failure via the exit code instead of
            // discarding waitForCompletion's boolean result.
            System.exit(wcJob.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}