Debugging and Running Hadoop in IDEA
Create a WordCount project.
Create a com.lyf.wordcount package.
1. Edit the Maven pom.xml
<!-- packaging type -->
<packaging>jar</packaging>
<!-- Hadoop dependencies; versions aligned at 2.7.3, and hadoop-mapreduce-client-core added because the Mapper/Reducer classes used below live in that artifact -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.7.3</version>
</dependency>
<!-- Maven build plugins -->
<build>
<defaultGoal>install</defaultGoal>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>3.1.0</version>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<!-- Allows the example to be run via 'mvn compile exec:java' -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.6.0</version>
<configuration>
<mainClass>com.lyf.wordcount.WordMain</mainClass>
<includePluginDependencies>false</includePluginDependencies>
</configuration>
</plugin>
</plugins>
</build>
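Once Maven resolves these dependencies, a quick sanity check is to print the Hadoop library version from a scratch class. This VersionCheck class is my own addition, not part of the original project:
package com.lyf.wordcount;
import org.apache.hadoop.util.VersionInfo;
// Hypothetical helper class, not part of the tutorial's job:
// prints the Hadoop version pulled in by Maven to confirm the dependencies resolved.
public class VersionCheck
{
public static void main(String[] args)
{
System.out.println("Hadoop version: " + VersionInfo.getVersion());
}
}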
2. Create a WordMapper class
package com.lyf.wordcount;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// WordMapper extends Hadoop's Mapper base class
public class WordMapper extends Mapper<Object, Text, Text, IntWritable>
{
private final static IntWritable one = new IntWritable(1); // reusable count of 1, emitted once per token
private Text word = new Text(); // reusable key holding the current token
// The core method of the Mapper; it takes three parameters
@Override
public void map(Object key, // byte offset of the first character of the line
Text value, // one line of the input file
Context context) // map-side context, similar in role to OutputCollector and Reporter
throws IOException, InterruptedException
{
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens())
{
word.set(itr.nextToken());
context.write(word, one);
}
}
}
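To see exactly which pairs this map emits, the tokenization can be reproduced in plain Java. A minimal sketch; the MapDemo class and its sample line are made up for illustration:
package com.lyf.wordcount;
import java.util.StringTokenizer;
// Hypothetical demo class, not part of the job itself:
// shows the (word, 1) pairs map would emit for one input line.
public class MapDemo
{
public static void main(String[] args)
{
String line = "hello hadoop hello idea"; // stand-in for one Text value
StringTokenizer itr = new StringTokenizer(line);
while (itr.hasMoreTokens())
{
System.out.println("(" + itr.nextToken() + ", 1)");
}
}
}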
3. Create a WordReducer class
package com.lyf.wordcount;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// WordReducer extends Hadoop's Reducer base class
public class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable>
{
private IntWritable result = new IntWritable(); // holds the final count for the current key
// The core method of the Reducer; it takes three parameters
@Override
public void reduce(Text key, // key emitted by the map side
Iterable<IntWritable> values, // all values the map side emitted for this key
Context context) // reduce-side context, similar in role to OutputCollector and Reporter
throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values) // iterate over the values and add them up
{
sum += val.get();
}
result.set(sum); // the final word count for this key
context.write(key, result); // emit (word, count)
}
}
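Between map and reduce, the framework groups equal keys together, and the sum in reduce then collapses each group. A minimal plain-Java sketch of that aggregation; the ReduceDemo class and its sample data are my own:
package com.lyf.wordcount;
import java.util.HashMap;
import java.util.Map;
// Hypothetical demo class: simulates shuffle-grouping plus the reduce-side sum
// for pairs the mapper would emit, entirely outside Hadoop.
public class ReduceDemo
{
public static void main(String[] args)
{
String[] mapOutputKeys = { "hello", "hadoop", "hello", "idea" }; // each emitted with value 1
Map<String, Integer> counts = new HashMap<>();
for (String word : mapOutputKeys)
{
Integer current = counts.get(word);
counts.put(word, current == null ? 1 : current + 1);
}
for (Map.Entry<String, Integer> e : counts.entrySet())
{
System.out.println(e.getKey() + "\t" + e.getValue()); // same layout as part-r-00000
}
}
}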
4. Create a WordMain class
package com.lyf.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordMain
{
public static void main(String[] args) throws Exception
{
// The Configuration class reads the Hadoop config files, such as core-site.xml;
// values can also be overridden with set(): conf.set("fs.default.name", "hdfs://xxxx:9000")
Configuration conf = new Configuration();
// GenericOptionsParser copies generic command-line options into conf and returns the remaining arguments
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
/**
* Exactly one input path and one output path are required
*/
if (otherArgs.length != 2)
{
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "word count"); // create a job with this configuration
job.setJarByClass(WordMain.class); // locate the job jar via this class
job.setMapperClass(WordMapper.class); // the job's Mapper class
job.setCombinerClass(WordReducer.class); // combiner; safe here because summing counts is associative
job.setReducerClass(WordReducer.class); // the job's Reducer class
job.setOutputKeyClass(Text.class); // output key type
job.setOutputValueClass(IntWritable.class); // output value type
FileInputFormat.addInputPath(job, new Path(otherArgs[0])); // input path
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); // output path (must not already exist)
System.exit(job.waitForCompletion(true) ? 0 : 1); // wait for the job, then exit
}
}
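For actual breakpoint debugging inside IDEA, the job can also run in a single JVM against the local filesystem via Hadoop's local job runner. A sketch under that assumption; WordMainLocal and the input/output paths are my own choices, not part of the original tutorial:
package com.lyf.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
// Hypothetical variant of WordMain for stepping through the job in IDEA:
// runs map and reduce in-process against the local filesystem instead of a cluster.
public class WordMainLocal
{
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "file:///"); // use the local filesystem
conf.set("mapreduce.framework.name", "local"); // run the job in this JVM, so breakpoints fire
Job job = Job.getInstance(conf, "word count local");
job.setJarByClass(WordMainLocal.class);
job.setMapperClass(WordMapper.class);
job.setCombinerClass(WordReducer.class);
job.setReducerClass(WordReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path("input")); // local directory, my own choice
FileOutputFormat.setOutputPath(job, new Path("output")); // must not exist yet
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}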
5. Run the MapReduce job
1. Package the code.
2. After mvn package, the jar is under the target directory.
3. Upload the test file Phone_Data.dat to the root of HDFS:
hdfs dfs -put Phone_Data.dat /
4. Run the job:
hadoop jar WordCount-1.0-SNAPSHOT.jar \
com.lyf.wordcount.WordMain /Phone_Data.dat /output
Notes:
- WordCount-1.0-SNAPSHOT.jar is the packaged job jar
- com.lyf.wordcount.WordMain is the fully qualified name of the main class, i.e. WordMain.java in the com.lyf.wordcount package
- /Phone_Data.dat is the input file's path in HDFS
- /output is the output directory's path in HDFS
5. On success, the file part-r-00000 under /output contains the result.
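To inspect the result programmatically instead of with hdfs dfs -cat, the HDFS Java API can stream the part file. A minimal sketch; the ReadOutput class is hypothetical and assumes the default filesystem in Configuration points at the cluster:
package com.lyf.wordcount;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
// Hypothetical helper: prints the job's result file from HDFS line by line.
public class ReadOutput
{
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path result = new Path("/output/part-r-00000");
BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(result)));
String line;
while ((line = reader.readLine()) != null)
{
System.out.println(line); // word \t count
}
reader.close();
fs.close();
}
}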