- Configure the local project environment
- Install Hadoop on Linux
Installation reference: https://blog.csdn.net/xiaobo5264063/article/details/119630117
This setup uses three nodes; the corresponding hosts are:
192.168.2.112 node112.cn
192.168.2.113 node113.cn
192.168.2.114 node114.cn
- Configure the Windows hosts file
Add the following entries to C:\Windows\System32\drivers\etc\hosts:
192.168.2.112 node112.cn
192.168.2.113 node113.cn
192.168.2.114 node114.cn
- Install Hadoop on the local Windows machine
Download hadoop-3.1.2.tar.gz and winutils-master.zip.
Extract hadoop-3.1.2.tar.gz and winutils-master.zip.
Copy all files from winutils-master\hadoop-3.1.2\bin into hadoop-3.1.2\bin, overwriting the existing files.
Copy winutils.exe and hadoop.dll into C:\Windows\System32.
- Configure environment variables
This PC -> Properties -> Advanced system settings -> Environment Variables
Add the following system variables (a verification sketch follows this list):
HADOOP_HOME       D:\study\software\hadoop\hadoop-3.1.2
HADOOP_USER_NAME  root
Path              %HADOOP_HOME%\bin;%HADOOP_HOME%\sbin;
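To confirm that a freshly started JVM actually sees these variables and that winutils.exe landed in the right place, a small check can help. This is only a sketch; the HadoopEnvCheck class name is my own:

```java
import java.io.File;

// Hypothetical helper: prints the Hadoop-related environment variables and
// checks that winutils.exe is under %HADOOP_HOME%\bin (required on Windows).
public class HadoopEnvCheck {
    public static void main(String[] args) {
        String home = System.getenv("HADOOP_HOME");
        String user = System.getenv("HADOOP_USER_NAME");
        System.out.println("HADOOP_HOME      = " + home);
        System.out.println("HADOOP_USER_NAME = " + user);
        if (home != null) {
            File winutils = new File(home, "bin" + File.separator + "winutils.exe");
            System.out.println("winutils.exe found: " + winutils.exists());
        }
    }
}
```

If either variable prints null, restart the IDE or terminal so it picks up the new environment.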
- Project integration
- Create a Spring Boot project; the pom.xml dependencies are:
```xml
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <!-- Hadoop version -->
    <hadoop.version>3.1.2</hadoop.version>
    <!-- commons-io version -->
    <commons-io.version>2.4</commons-io.version>
</properties>

<dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-common -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-jobclient -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>${commons-io.version}</version>
    </dependency>
    <dependency>
        <groupId>com.janeluo</groupId>
        <artifactId>ikanalyzer</artifactId>
        <version>2012_u6</version>
    </dependency>
</dependencies>
```
- Copy the Hadoop configuration files
Copy the configuration files from hadoop/etc on the server into the project's resources directory:
core-site.xml
hdfs-site.xml
mapred-site.xml
yarn-site.xml
- Create the test code
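Before writing the full job, a quick connectivity check can confirm that the copied *-site.xml files are actually picked up from the classpath. This is only a sketch; the HdfsCheck class name is an assumption:

```java
package com.basic;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper: verifies the *-site.xml files under resources are
// being loaded by listing the HDFS root directory.
public class HdfsCheck {
    public static void main(String[] args) throws Exception {
        // "true" loads core-site.xml, hdfs-site.xml, etc. from the classpath
        Configuration conf = new Configuration(true);
        FileSystem fs = FileSystem.get(conf);
        // fs.defaultFS should point at the cluster, not the local file system
        System.out.println("fs.defaultFS = " + conf.get("fs.defaultFS"));
        for (FileStatus status : fs.listStatus(new Path("/"))) {
            System.out.println(status.getPath() + "  (" + status.getLen() + " bytes)");
        }
    }
}
```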
AppWord.java -- package: com.basic
```java
package com.basic;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * WordCount driver: configures and submits the MapReduce job.
 */
public class AppWord {
    public static void main(String[] args) throws Exception {
        // Load the configuration files from the classpath
        Configuration configuration = new Configuration(true);
        // Run in local mode for local development
        configuration.set("mapreduce.framework.name", "local");
        // Create the job
        Job job = Job.getInstance(configuration);
        // Set the job's main class
        job.setJarByClass(AppWord.class);
        // Set the job name (user-defined)
        job.setJobName("aaa-wordcount-" + System.currentTimeMillis());
        // Set the number of reduce tasks
        job.setNumReduceTasks(2);
        // Set the HDFS input path; the data was uploaded with: hdfs dfs -put test.txt /aaa
        FileInputFormat.setInputPaths(job, new Path("/aaa/test.txt"));
        // Set the HDFS output path for the results
        FileOutputFormat.setOutputPath(job, new Path("/aaa/result/wordcount_" + System.currentTimeMillis()));
        // Set the mapper's output key and value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Set the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Submit the job and wait for completion
        job.waitForCompletion(true);
    }
}
```
WordCountMapper.java
```java
package com.basic;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper implementation
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reusable value object for the count of 1
    private IntWritable one = new IntWritable(1);

    /**
     * @param key     byte offset of the line (map reads the input line by line)
     * @param value   content of the current line
     * @param context context object for emitting output
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        System.out.println("map--> received line at offset: " + key + " content: " + value + "----------------");
        // Get the content of the current line
        String valueString = value.toString();
        // Normalize the line: strip special characters, then split on spaces
        String[] values = valueString.replaceAll("[^a-zA-Z0-9'\\s]", "").split(" ");
        // Emit each word to the reducer via the context
        for (String val : values) {
            System.out.println("map--> emitting offset: " + key + " word: " + val);
            context.write(new Text(val), one);
        }
    }
}
```
WordCountReducer.java
```java
package com.basic;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

/**
 * Reducer implementation
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Get an iterator over the values for this key
        Iterator<IntWritable> iterator = values.iterator();
        // Counter for the current word
        int count = 0;
        while (iterator.hasNext()) {
            IntWritable val = iterator.next();
            count += val.get();
            System.out.println("reduce--> key: " + key + " val: " + val + " running count: " + count);
        }
        // Write the result to HDFS
        context.write(key, new IntWritable(count));
    }
}
```
- Upload the test data
test.txt can be any custom list of English words. On the Hadoop server, upload it with the following command (an API-based alternative is sketched after this step):
hdfs dfs -put test.txt /aaa
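If you prefer to upload from the Windows side instead of on the server, the HDFS Java API can do the same thing. A sketch, assuming test.txt sits in the project's working directory (the UploadTestData class is hypothetical):

```java
package com.basic;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper: equivalent of "hdfs dfs -put test.txt /aaa" run from the IDE.
public class UploadTestData {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration(true));
        // Create /aaa if it does not exist yet, then copy the local file into it
        fs.mkdirs(new Path("/aaa"));
        fs.copyFromLocalFile(new Path("test.txt"), new Path("/aaa/test.txt"));
        System.out.println("uploaded: " + fs.exists(new Path("/aaa/test.txt")));
    }
}
```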
- Run the code
Run AppWord locally from the IDE. If the job is instead deployed to the Hadoop server, the command is: hadoop jar <project jar> <fully qualified main class>
Example: hadoop jar mapreduce.jar com.basic.AppWord
- Check the output in HDFS
Because the job sets two reduce tasks, the result directory contains two partition files, and together they hold the occurrence count of each word in test.txt.
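Instead of browsing HDFS, the partition files can also be printed from Java. A minimal sketch (the PrintResult class is my own; pass the timestamped result directory as the argument):

```java
package com.basic;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

// Hypothetical helper: prints every reducer output file of a finished job.
// Usage: java com.basic.PrintResult /aaa/result/wordcount_<timestamp>
public class PrintResult {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration(true));
        for (FileStatus status : fs.listStatus(new Path(args[0]))) {
            // Two reduce tasks were configured, so expect part-r-00000 and part-r-00001
            if (status.getPath().getName().startsWith("part-r-")) {
                System.out.println("==== " + status.getPath().getName() + " ====");
                try (FSDataInputStream in = fs.open(status.getPath())) {
                    // Stream the file contents to stdout without closing System.out
                    IOUtils.copyBytes(in, System.out, 4096, false);
                }
            }
        }
    }
}
```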