IDEA运行WordCount程序（详细步骤）-CSDN博客

本文链接：https://blog.csdn.net/a18307096730/article/details/121343243

IDEA版本：2020.3

Hadoop版本：2.7.7

准备工作：

首先你得搭建好你的hadoop集群，然后IDEA插件Big Data Tools能成功连接上hdfs分布式文件系统

先搞懂WordCount程序的原理

然后，新建一个maven程序（包名自己定义就好了）

配置pom.xml文件（导入你hadoop版本相应的依赖，我的hadoop是2.7.7的）

    <dependencies>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.7</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.7</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.7</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.7</version>
        </dependency>
    </dependencies>

相应的版本可以在maven仓库中查看：https://mvnrepository.com/

准备好输入文件和输出目录

输入文件：（hdfs://192.168.183.101:9000/test/input/file01.txt）

你可以在本地写一个txt文件，然后传到hdfs上去；或者在虚拟机中写好文件，再把文件传到hdfs上去。不会的可以参考：HDFS-java编程

输出目录：“hdfs://192.168.183.101:9000/test/output” 这里的output文件夹是还没有创建的，因为程序运行的时候会自己创建，不需要我们自己创建，运行前输出目录存在的话会报错

配置文件输入输出路径（这里的输入输出路径都是在hdfs上的）

第一个是输入文件的路径：“hdfs://192.168.183.101:9000/test/input/file01.txt”

第二个数输出文件的路径：“hdfs://192.168.183.101:9000/test/output”

注意：这里的输出路径必须是hdfs上不存在的，不然会报错

给出完整代码

package cn.neu.connection.test;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {
    public static class Map extends Mapper<Object,Text,Text,IntWritable>{
        private static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        public void map(Object key,Text value,Context context) throws IOException,InterruptedException{
            StringTokenizer st = new StringTokenizer(value.toString());
            while(st.hasMoreTokens()){
                word.set(st.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class Reduce extends Reducer<Text,IntWritable,Text,IntWritable>{
        private static IntWritable result = new IntWritable();
        public void reduce(Text key,Iterable<IntWritable> values,Context context) throws IOException,InterruptedException{
            int sum = 0;
            for(IntWritable val:values){
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception{
        System.setProperty("HADOOP_USER_NAME", "lingyi");
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
        if(otherArgs.length != 2){
            System.err.println("Usage WordCount <int> <out>");
            System.exit(2);
        }
        Job job = new Job(conf,"word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        otherArgs[0]="hdfs://192.168.183.101:9000/test/input/file01.txt";//输入路径
        otherArgs[1]="hdfs://192.168.183.101:9000/test/output";//输出路径，运行前hdfs不嗯呢该存在此路径，不然报错
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

两个注意点

这里的用户名改为你自己的

输入输出路径自己定义，必须是hdfs上的路径（可以从Big Table Tools插件里面快速获取路径）

到这里，程序就可以运行了

这里要是1就是有错误

同时你也可以在xshell或者虚拟机上查看part-r-00000输出文件

hadoop fs -cat /test/output/part-r-00000

还没完.........

有个问题，每次运行程序之前你都得把输出路径删除或者改为别的路径

这里给出解决方法：

每次运行前就判断输出文件是否存在，存在就删掉

修改后的代码

        System.setProperty("HADOOP_USER_NAME", "lingyi");
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
        if(otherArgs.length != 2){
            System.err.println("Usage WordCount <int> <out>");
            System.exit(2);
        }
        //每次运行前检查输出路径是否存在，存在就删除
        Path outPath = new Path(otherArgs[1]);
        if(fs.exists(outPath)) {
            fs.delete(outPath, true);
        }