一、环境说明
- 系统:Win10
- Hadoop版本:2.10.1
- JDK:1.8
二、环境准备
1、下载hadoop
下载链接:hadoop 2.10.1。下载后解压到本地。
2、下载winutils
下载链接:winutils。下载完成后解压到本地,然后将其中与你的hadoop版本对应(或最接近)的文件夹里的hadoop.dll与winutils.exe两个文件复制到hadoop的bin目录中。
3、配置环境变量
新建环境变量HADOOP_HOME,值为hadoop文件夹的位置。
然后将%HADOOP_HOME%\bin添加到PATH变量中。
4、建议重启电脑,使环境变量配置及复制的运行文件生效
三、MapReduce程序编写
1、创建一个空的Maven项目
2、因为要使用到hadoop的一些api,所以需要引入依赖包。这里直接放上完整的pom文件,其中hadoop.version变量的值需要与你的hadoop版本保持一致。
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.javayuli</groupId>
    <artifactId>MapReduceTest</artifactId>
    <version>1.0</version>

    <properties>
        <!-- Shared by every org.apache.hadoop dependency below; keep it equal to the installed Hadoop version. -->
        <hadoop.version>2.10.1</hadoop.version>
    </properties>

    <repositories>
        <repository>
            <id>nexus-aliyun</id>
            <name>nexus-aliyun</name>
            <!-- HTTPS is required: Maven 3.8.1 and later block plain-http repositories by default. -->
            <url>https://maven.aliyun.com/repository/public</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </repository>
    </repositories>

    <dependencies>
        <!-- Hadoop client libraries, all pinned to ${hadoop.version}. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-auth</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <!-- Lucene analysis libraries. -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>7.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>7.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-icu</artifactId>
            <version>7.3.0</version>
        </dependency>

        <!-- Charting library. -->
        <dependency>
            <groupId>jfree</groupId>
            <artifactId>jfreechart</artifactId>
            <version>1.0.13</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <!-- Configures dependency copying: transitive dependencies included, version suffix stripped, target ./lib. -->
                <artifactId>maven-dependency-plugin</artifactId>
                <configuration>
                    <excludeTransitive>false</excludeTransitive>
                    <stripVersion>true</stripVersion>
                    <outputDirectory>./lib</outputDirectory>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
3、编写一个Map程序
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper that turns every input line into a series of {@code (token, 1)} pairs.
 *
 * <p>The line is split with the empty regular expression, so the emitted tokens are the
 * individual characters of the line rather than whitespace-separated words.
 *
 * @author 14516
 */
public class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * Emits one {@code (token, 1)} record for each token of the given line.
     *
     * @param key     input key supplied by the framework; not used here
     * @param value   the text of one input record
     * @param context sink that receives the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        final String[] tokens = value.toString().split("");
        for (int i = 0; i < tokens.length; i++) {
            final Text outKey = new Text(tokens[i]);
            final IntWritable one = new IntWritable(1);
            context.write(outKey, one);
        }
    }
}
4、编写一个Reduce函数
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer that adds up all counts received for a key and emits {@code (key, total)}.
 *
 * <p>The values are summed rather than merely counted, so the result stays correct when
 * the incoming values are partial sums (for example when this class is also registered
 * as a combiner) instead of always being {@code 1}.
 *
 * @author 14516
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Sums the values of one key and writes the total.
     *
     * @param key     the token being aggregated
     * @param values  the counts emitted for {@code key}
     * @param context sink that receives the {@code (key, total)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable val : values) {
            // Add the carried count; incrementing by one would ignore pre-aggregated values.
            total += val.get();
        }
        context.write(key, new IntWritable(total));
    }
}
5、编写一个入口函数
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.BasicConfigurator;
import java.io.IOException;
/**
 * Entry point that configures and submits the word-count job.
 *
 * <p>Usage: {@code WordCount [generic options] <input path> <output path>}.
 *
 * @author 14516
 */
public class WordCount {

    /**
     * Parses the command line, wires {@link WordCountMap} and {@link WordCountReduce} into a
     * job, runs it and exits with status {@code 0} on success or {@code 1} on failure.
     * Exits with status {@code 2} when fewer than two non-generic arguments are supplied.
     *
     * @param args generic Hadoop options followed by the input path and the output path
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a class configured for the job cannot be found
     * @throws InterruptedException   if waiting for job completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Set up a default Log4j environment quickly.
        BasicConfigurator.configure();

        Configuration configuration = new Configuration();
        // Generic options are applied to `configuration`; what is left are the program's own arguments.
        String[] otherArgs = new GenericOptionsParser(configuration, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("必须输入读取文件路径和输出路径");
            System.exit(2);
        }

        // Build the job from the parsed configuration so the generic options actually take effect.
        Job job = Job.getInstance(configuration);
        job.setJarByClass(WordCount.class);
        job.setJobName("Word Count");

        // NOTE(review): this cast assumes job.getConfiguration() returns a JobConf at runtime;
        // confirm for the Hadoop version in use. The FileInputFormat/FileOutputFormat classes
        // under org.apache.hadoop.mapreduce.lib accept the Job directly and would avoid the cast.
        JobConf jobConfiguration = (JobConf) job.getConfiguration();
        // Input path: first remaining argument (not raw args, which may start with generic options).
        FileInputFormat.addInputPath(jobConfiguration, new Path(otherArgs[0]));
        // Output path: second remaining argument; the job results are written to files there.
        FileOutputFormat.setOutputPath(jobConfiguration, new Path(otherArgs[1]));

        // Class implementing the map function.
        job.setMapperClass(WordCountMap.class);
        // Class implementing the reduce function.
        job.setReducerClass(WordCountReduce.class);
        // Key type of the job output.
        job.setOutputKeyClass(Text.class);
        // Value type of the job output.
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
6、配置打包
idea中File->Project Structure
选择WordCount
点击Apply,立即应用
7、新建一个运行配置
8、创建input文件夹,并在input文件夹中创建测试文件A.txt
9、运行程序
程序运行后,会自动创建output文件夹,此时part-r-00000中就是执行结果,即每个字符出现的频次。
10、打成jar包
经过上述步骤6之后,可以在Build->Build Artifacts中进行打包
打包后就可以将jar包上传到服务器进行运行。