Hadoop——MapReduce下的WordCount案例

最新推荐文章于 2024-07-02 11:21:22 发布

Night-Zzz

最新推荐文章于 2024-07-02 11:21:22 发布

阅读量67

点赞数 1

文章标签： hadoop mapreduce 数据库

本文链接：https://blog.csdn.net/NoNight_Zzz/article/details/132588954

版权

WordCount项目是用来做什么的？它用来统计txt文档中每个单词出现的次数。

例如：txt文档内容如下

最后运行完WordCount程序之后出来的结果就是

hello 4

aaa 2

bbb 2

ccc 1

WordCount项目需要建三个类：Mapper类、Reducer类、Driver类

WordCountReduce（Reducer类）

package nj.zg.kb23.demo1;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

/**
 * Reduce stage of the WordCount job.
 *
 * <p>Receives one word together with all the counts the map stage emitted for it,
 * adds those counts up and writes a single {@code <word, total>} pair.
 */
public class WordCountReduce extends Reducer<Text, IntWritable,Text, LongWritable> {

    /**
     * Sums every count received for {@code key} and emits the total.
     *
     * @param key     the word being counted
     * @param values  the counts emitted for this word by the map stage
     * @param context job context used to write the {@code <word, total>} result
     * @throws IOException          if writing the result fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        System.out.println("Reduce stage Key:"+key+"Values:"+values.toString());

        // Accumulate in a long: the output type is LongWritable, and an int
        // accumulator would silently wrap around for very frequent words.
        long count = 0L;
        for (IntWritable occurrence : values) {
            count += occurrence.get();
        }

        LongWritable longWritable = new LongWritable(count);
        System.out.println("ReduceResult key:"+key+"resultValue:"+longWritable.get());
        context.write(key, longWritable);
    }
}

WordCountMapper（Mapper类）

package nj.zg.kb23.demo1;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
//<0,hello java,hello,1>
//<0,hello java,java,1>
/**
 * Map stage of the WordCount job.
 *
 * <p>Splits each input line into whitespace-separated words and emits a
 * {@code <word, 1>} pair for every word found.
 */
public class WordCountMapper extends Mapper<LongWritable, Text,Text, IntWritable> {

    // Output key object, reused for every emitted word.
    Text text = new Text();
    // Output value object: each occurrence counts as exactly 1, so it is set once here
    // instead of on every loop iteration.
    IntWritable intWritable = new IntWritable(1);

    /**
     * Tokenizes one input line and emits {@code <word, 1>} for each word.
     *
     * @param key     the input key supplied by the framework for this line
     * @param value   the text of the line
     * @param context job context used to write the {@code <word, 1>} pairs
     * @throws IOException          if writing a pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        System.out.println("WordCountMap stage Key:"+key+"  Value:"+value);

        // Split on any run of whitespace (spaces, tabs, ...) rather than a single space,
        // so that repeated separators do not produce empty "words".
        // "hello   java" --> [hello, java]
        String[] words = value.toString().trim().split("\\s+");
        for (String word : words) {
            // A blank line still yields one empty token after split; do not count it.
            if (word.isEmpty()) {
                continue;
            }
            text.set(word);                   // String --> Text
            context.write(text, intWritable); // <hello,1> <java,1>
        }
    }
}

WordCountDriver

package nj.zg.kb23.demo1;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver for the WordCount job: configures the map and reduce stages, sets the
 * input and output paths, submits the job and reports its outcome as the exit code.
 */
public class WordCountDriver {

    /** Input file used when no path is given on the command line. */
    private static final String DEFAULT_INPUT = "D:\\IDEA\\Maven_0826_1\\in\\wordcount.txt";

    /** Output directory used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT = "D:\\IDEA\\Maven_0826_1\\on";

    /**
     * Configures and runs the job, then exits with status 0 on success or 1 on failure.
     *
     * @param args optional: {@code args[0]} is the input path and {@code args[1]} the
     *             output directory; the built-in defaults are used for any that are missing
     * @throws IOException            if the file system or job setup fails
     * @throws InterruptedException   if the wait for job completion is interrupted
     * @throws ClassNotFoundException if a job class cannot be found during submission
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        String input = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String output = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        // Load the configuration and create the job object.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Driver class of the job.
        job.setJarByClass(WordCountDriver.class);

        // Map stage and the key/value types it outputs.
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Reduce stage and the key/value types it outputs.
        job.setReducerClass(WordCountReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Input file of the job.
        FileInputFormat.setInputPaths(job, new Path(input));

        // Output directory of the job: remove a directory left over from a previous
        // run before handing the path to the job.
        Path outputPath = new Path(output);
        FileSystem fileSystem = FileSystem.get(outputPath.toUri(), conf);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        // Submit the job, wait for it, and surface a failure through the exit code
        // instead of always terminating with status 0.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}

需要注意的是：Driver的输入路径需要指向你要统计的txt文档；输出路径的最后一级目录（上面例子中的on目录）在作业运行前不能存在，程序会自动创建它，如果运行之前就已存在，MapReduce就会报错。上面的Driver在提交作业之前会先检查输出目录，如果已存在就将其删除，因此可以重复运行。