Writing the Mapper class
package com.haohaodata.bigdata.hadoop.mapreduce.wc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * KEYIN:    data type of the input key
 * VALUEIN:  data type of the input value
 *
 * KEYOUT:   data type of the output key
 * VALUEOUT: data type of the output value
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reused output value: every word is emitted with a count of 1
    private final IntWritable intWritable = new IntWritable(1);

    /**
     * @param key   the byte offset of the current line
     * @param value the content of the current line
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line into words on commas and emit a (word, 1) pair for each
        String[] words = value.toString().split(",");
        for (String word : words) {
            context.write(new Text(word), intWritable);
        }
    }
}
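To make the data flow concrete, suppose data/haohaodata.txt holds comma-separated words; the two lines below are only an illustration, not the actual file from the project:

hadoop,spark,hadoop
flink,spark

Each call to map receives one line (with its byte offset as the key) and emits one (word, 1) pair per word, e.g. (hadoop, 1), (spark, 1), (hadoop, 1) for the first line.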
Writing the Reducer class
package com.haohaodata.bigdata.hadoop.mapreduce.wc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * KEYIN:    data type of the input key
 * VALUEIN:  data type of the input value
 * KEYOUT:   data type of the output key
 * VALUEOUT: data type of the output value
 */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * @param key    a word emitted by the map phase
     * @param values the counts for that word, e.g. (1, 1, 1, 1)
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum the 1s for this word to get its total count
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}
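Before reduce runs, the framework sorts and groups the map output by key, so each distinct word arrives exactly once together with all of its 1s. Continuing the illustrative input above, reduce would be called with (flink, [1]), (hadoop, [1, 1]) and (spark, [1, 1]), and would write flink 1, hadoop 2, spark 2 to the output.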
Writing the Driver class
package com.haohaodata.bigdata.hadoop.mapreduce.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WCDriver {

    public static void main(String[] args) throws Exception {
        // Input/output paths; switch to args[0]/args[1] when submitting to a cluster
        // String input = args[0];
        // String output = args[1];
        String input = "data/haohaodata.txt";
        String output = "output";

        // 1) Get the Job object
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // Delete the output directory if it already exists
        FileUtils.deleteOutput(configuration, output);

        // 2) Set the main class this job will run
        job.setJarByClass(WCDriver.class);

        // 3) Set the Mapper and Reducer
        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReducer.class);

        // 4) Set the output data types of the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 5) Set the output data types of the reduce phase
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 6) Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // 7) Submit the job and exit with its status
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);

        // Submitting the jar on a cluster:
        // hadoop jar hadoop0001-1.0-SNAPSHOT.jar com.haohaodata.bigdata.hadoop.mapreduce.wc.WCDriver /wcinput/haohaodata.txt /wcoutput
    }
}
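Because WCReducer does nothing but sum values, it can also serve as a combiner that pre-aggregates on the map side and shrinks the shuffle. This is optional and not part of the driver above; if used, the extra line belongs with step 3:

// Optional (not in the original driver): pre-aggregate map output with the same summing logic;
// safe here because addition is associative and commutative
job.setCombinerClass(WCReducer.class);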
Writing the FileUtils utility class
package com.haohaodata.bigdata.hadoop.mapreduce.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileUtils {

    public static void deleteOutput(Configuration configuration, String outPut) throws Exception {
        FileSystem fileSystem = FileSystem.get(configuration);
        // Check whether the output path exists
        if (fileSystem.exists(new Path(outPut))) {
            // Delete it recursively
            fileSystem.delete(new Path(outPut), true);
        }
    }
}
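FileSystem.get(configuration) resolves whatever filesystem fs.defaultFS points to, so the same helper deletes a local output directory when the job runs from the IDE and an HDFS directory when it runs on a cluster. WCDriver calls it before submitting the job; without this cleanup, a second run fails because FileOutputFormat refuses to write into an output path that already exists.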
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.haohaodata.bigdata</groupId>
    <artifactId>hadoop0001</artifactId>
    <version>1.0-SNAPSHOT</version>

    <name>hadoop0001</name>
    <url>http://www.example.com</url>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
        <hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
        <!-- Add the Hadoop dependency -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>

    <build>
        <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
            <plugins>
                <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
                <plugin>
                    <artifactId>maven-clean-plugin</artifactId>
                    <version>3.1.0</version>
                </plugin>
                <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
                <plugin>
                    <artifactId>maven-resources-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.8.0</version>
                </plugin>
                <plugin>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <version>2.22.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-jar-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-install-plugin</artifactId>
                    <version>2.5.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-deploy-plugin</artifactId>
                    <version>2.8.2</version>
                </plugin>
                <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
                <plugin>
                    <artifactId>maven-site-plugin</artifactId>
                    <version>3.7.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-project-info-reports-plugin</artifactId>
                    <version>3.0.0</version>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>
</project>
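With this pom, mvn clean package produces target/hadoop0001-1.0-SNAPSHOT.jar, the jar name used in the hadoop jar command noted in WCDriver. The default jar packaging does not bundle the hadoop-client dependency; when the job is submitted with hadoop jar, the cluster's own Hadoop libraries provide those classes at runtime.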