IDEA调试运行基于hadoop的WordCount项目

最新推荐文章于 2021-11-15 21:14:22 发布

巴基海贼王

最新推荐文章于 2021-11-15 21:14:22 发布

阅读量578

点赞数 2

分类专栏： hadoop java 文章标签： hadoop mapreduce wordcount

本文链接：https://blog.csdn.net/weixin_43343486/article/details/91951802

版权

java 同时被 2 个专栏收录

3 篇文章 0 订阅

订阅专栏

hadoop

2 篇文章 0 订阅

订阅专栏

IDEA调试运行hadoop

创建一个WordCount项目

创建一个com.lyf.wordcount包

1.修改maven pom.xml

//配置打包方式

<packaging>jar</packaging>

//配置依赖jar包

<dependency>

<groupId>org.apache.hadoop</groupId>

<artifactId>hadoop-common</artifactId>

</dependency>

<dependency>

<groupId>org.apache.hadoop</groupId>

<artifactId>hadoop-hdfs</artifactId>

</dependency>

//配置maven打包插件

<build>

<defaultGoal>install</defaultGoal>

<groupId>org.apache.maven.plugins</groupId>

<artifactId>maven-compiler-plugin</artifactId>

</configuration>

</plugin>

<groupId>org.apache.maven.plugins</groupId>

<artifactId>maven-resources-plugin</artifactId>

</configuration>

</plugin>

<groupId>org.codehaus.mojo</groupId>

<artifactId>exec-maven-plugin</artifactId>

<mainClass>com.lyf.wordcount.WordMain</mainClass>

<includePluginDependencies>false</includePluginDependencies>

</configuration>

</plugin>

</plugins>

</build>

2.创建一个WordMapper类

package com.lyf.wordcount;

import java.io.IOException;

import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Mapper;

// 创建一个 WordMapper类继承于 Mapper抽象类

public class WordMapper extends Mapper<Object, Text, Text, IntWritable>

{

private final static IntWritable one = new IntWritable(1);

private Text word = new Text();

// Mapper抽象类的核心方法，三个参数

public void map(Object key, // 首字符偏移量

Text value, // 文件的一行内容

Context context) // Mapper端的上下文，与OutputCollector和Reporter的功能类似

throws IOException, InterruptedException

{

StringTokenizer itr = new StringTokenizer(value.toString());

while (itr.hasMoreTokens())

{

word.set(itr.nextToken());

context.write(word, one);

}

3.创建一个WordReducer类

package com.lyf.wordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Reducer;

// 创建一个 WordReducer类继承于 Reducer抽象类

public class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable>

{

private IntWritable result = new IntWritable(); // 用于记录 key 的最终的词频数

// Reducer抽象类的核心方法，三个参数

public void reduce(Text key, // Map端输出的 key 值

Iterable<IntWritable> values, // Map端输出的Value 集合（相同key的集合）

Context context) // Reduce 端的上下文，与OutputCollector和Reporter的功能类似

throws IOException, InterruptedException

{

int sum = 0;

for (IntWritable val : values) // 遍历 values集合，并把值相加

{

sum += val.get();

}

result.set(sum); // 得到最终词频数

context.write(key, result); // 写入结果

}

4.创建一个WordMain类

package com.lyf.wordcount;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;

public class WordMain

{

public static void main(String[] args) throws Exception

{

// Configuration类：读取Hadoop的配置文件，如 site-core.xml...；

// 也可用set方法重新设置（会覆盖）：conf.set("fs.default.name", "hdfs://xxxx:9000")

Configuration conf = new Configuration();

// 将命令行中参数自动设置到变量conf中

String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

/**

* 这里必须有输入输出

if (otherArgs.length != 2)

{

System.err.println("Usage: wordcount <in> <out>");

System.exit(2);

}

Job job = new ~~Job~~(conf, "word count"); // 新建一个 job，传入配置信息

job.setJarByClass(WordMain.class); // 设置 job 的主类

job.setMapperClass(WordMapper.class); // 设置 job 的 Mapper 类

job.setCombinerClass(WordReducer.class); // 设置 job 的作业合成类

job.setReducerClass(WordReducer.class); // 设置 job 的 Reducer 类

job.setOutputKeyClass(Text.class); // 设置 job 输出数据的关键类

job.setOutputValueClass(IntWritable.class); // 设置 job 输出值类

FileInputFormat.addInputPath(job, new Path(otherArgs[0])); // 文件输入

FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); // 文件输出

System.exit(job.waitForCompletion(true) ? 0 : 1); // 等待完成退出

}

5.执行MapReduce操作

1．将写好的代码打包

2．maven package后的jar包位于target路径下

3．测试文件Phone_Data.dat上传hdfs集群

hdfs dfs -put Phone_Data.dat /

4．执行测试命令

hadoop jar WordCount-1.0-SNAPSHOT.jar \

com.lyf.wordcount.WordMain /Phone_Data.dat /output

//注解

WordCount-1.0-SNAPSHOT.jar是代码jar包
com.lyf.wordcount.WordMain是main方法所在类的全限定名（WordMain.java位于com/lyf/wordcount路径下）
/Phone_Data.dat是hdfs集群中输入文件的路径
/output是hdfs集群中输出文件的路径

5．执行成功，part-r-00000中即为执行结果

巴基海贼王

关注

2
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
IDEA调试运行基于hadoop的WordCount项目

IDEA调试运行hadoop创建一个WordCount项目创建一个com.lyf.wordcount包1.修改maven pom.xml//配置打包方式<packaging>jar</packaging>//配置依赖jar包<dependency> <groupId>org.apache.hadoop<...
复制链接

扫一扫

专栏目录