Create a new Maven project
Add the required dependencies to pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.lzc.hadoop</groupId>
    <artifactId>hadoop-api</artifactId>
    <version>1.0-SNAPSHOT</version>

    <name>hadoop-api</name>
    <!-- FIXME change it to the project's website -->
    <url>http://www.example.com</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
        <hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
    </properties>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.10</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
Implementation code
package com.lzc.hadoop.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCount2App {

    /**
     * Map: read the input file
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        LongWritable one = new LongWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Each line of input received by the mapper
            String line = value.toString();
            // Split the line on the chosen separator
            String[] words = line.split(" ");
            for (String word : words) {
                // Emit (word, 1) through the context
                context.write(new Text(word), one);
            }
        }
    }

    /**
     * Reduce: merge the intermediate results
     */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable value : values) {
                // Sum up the occurrences of this key
                sum += value.get();
            }
            // Emit the final count for this word
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();

        // Clean up an existing output directory (re-running the same job would otherwise fail)
        Path outputPath = new Path(args[1]);
        FileSystem fileSystem = FileSystem.get(configuration);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
            System.out.println("output path already exists, deleted it first");
        }

        // Create the job
        Job job = Job.getInstance(configuration, "wordcount");
        // Set the job's main class
        job.setJarByClass(WordCount2App.class);
        // Set the job's input path
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // Set the map parameters
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Set the reduce parameters
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Set the job's output path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Package the program into a jar
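The jar can be built with Maven on the development machine and then copied to the Hadoop node. A minimal sketch; the -DskipTests flag is optional, and the target host and the /home/data directory are assumptions taken from the jar path used in the command below.
mvn clean package -DskipTests
scp target/hadoop-api-1.0-SNAPSHOT.jar root@192.168.126.129:/home/data/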
Run the program
1. First, check the word file on HDFS
[root@localhost data]# hadoop fs -text /hello.txt
18/08/25 09:05:25 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
hello lzc
hello hadoop
hello hdfs
2. Execute the jar
hadoop jar /home/data/hadoop-api-1.0-SNAPSHOT.jar com.lzc.hadoop.mapreduce.WordCount2App hdfs://192.168.126.129:8020/hello.txt hdfs://192.168.126.129:8020/output/wc
Since the command is quite long, it can be saved in a shell script.
Create a new shell script and write the command above into it, for example:
[root@localhost data]# vim wc_mr.sh
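The script simply wraps the hadoop jar command shown above; a minimal sketch of wc_mr.sh:
#!/bin/bash
# Run the WordCount2App job against hello.txt, writing results to /output/wc
hadoop jar /home/data/hadoop-api-1.0-SNAPSHOT.jar com.lzc.hadoop.mapreduce.WordCount2App hdfs://192.168.126.129:8020/hello.txt hdfs://192.168.126.129:8020/output/wc
Make the script executable with chmod +x wc_mr.sh before running it.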
Execute the shell script
[root@localhost data]# ./wc_mr.sh
Watch the job's progress in the console output
Execution finished
Check the results
1. List the output files
[root@localhost data]# hadoop fs -ls /output/wc
18/08/25 09:04:11 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 2 items
-rw-r--r-- 1 root supergroup 0 2018-08-25 09:02 /output/wc/_SUCCESS
-rw-r--r-- 1 root supergroup 30 2018-08-25 09:02 /output/wc/part-r-00000
/output/wc/part-r-00000 is the output result file
2. View the result
[root@localhost data]# hadoop fs -text /output/wc/part-r-00000
18/08/25 09:04:57 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
hadoop 1
hdfs 1
hello 3
lzc 1