Index Case: Chaining Two MapReduce Jobs
There are three files, a.html, b.html, and c.html, each containing words, as listed below. We need to compute how many times each word appears in each file, producing output in this format: hello a.html-4 b.html-8 c.html-10
a.html
hello tom
hello jim
hello kitty
hello rose
b.html
hello jerry
hello jim
hello kitty
hello jack
c.html
hello jerry
hello java
hello c++
hello c++
hello hello
Requirement analysis:
The target output, e.g. hello a.html-4 b.html-8 c.html-10, makes it clear that the word is the final key and the value is a string, concatenated from each file name and that file's aggregated count.
What we can compute easily is the total number of occurrences of each word across all files, e.g. hello 10, java 20, ...
Clearly a single MapReduce job cannot build this output directly, because two separate aggregations are needed: first counting per word-per-file, then regrouping those counts per word. So we chain two MapReduce jobs.
First MapReduce job
1) map: emit word-filename as the key and a count of 1 as the value, sent to the reduce side
2) reduce: aggregate the counts per word-filename key, e.g.:
Hello-a.html 10
Hello-b.html 20
Hello-c.html 30
Second MapReduce job
1) map: read the output files of the first job and split each line; for Hello-a.html 10, emit Hello as the key and the concatenation a.html-10 as the value
2) reduce: receives data shaped like Hello -> (a.html-10, b.html-20, c.html-30); loop over the iterator and concatenate the values into the final result. A worked trace of both jobs follows.
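To make the data flow concrete, here is how the word hello moves through both jobs. The counts are derived from the sample files above: hello appears 4 times in a.html, 4 times in b.html, and 6 times in c.html.

Job 1 map output:    (hello-a.html, 1) x4   (hello-b.html, 1) x4   (hello-c.html, 1) x6
Job 1 reduce output: hello-a.html 4   hello-b.html 4   hello-c.html 6
Job 2 map output:    (hello, a.html-4)   (hello, b.html-4)   (hello, c.html-6)
Job 2 reduce output: hello a.html-4 b.html-4 c.html-6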
First MapReduce program
The key to obtaining the file name: override the setup method of the parent class
String fileName = null;

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    // The input split assigned to this map task knows which file it reads
    FileSplit fs = (FileSplit) context.getInputSplit();
    fileName = fs.getPath().getName();
}
setup runs once per map task, before the loop that feeds records to map, so within a single map task fileName always refers to the same file.
The full MapReduce program:
package cn.doit19.hadoop.review.index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @author:tom
 * @Date:Created in 19:56 2020/11/17
 */
@SuppressWarnings("all")
public class Index1 {

    // First job of the chained index case
    static class Index1Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        Text k = new Text();
        String fileName = null;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Runs once per map task, before any call to map()
            FileSplit fs = (FileSplit) context.getInputSplit();
            fileName = fs.getPath().getName();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                // Called once per input line, e.g. "hello tom"
                String line = value.toString();
                String[] words = line.split("\\s+"); // {hello, tom}
                for (String word : words) {
                    // Key is word-filename, e.g. "hello-a.html"
                    k.set(word + "-" + fileName);
                    // Emit a count of 1 for each occurrence
                    context.write(k, new IntWritable(1));
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    static class Index1Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            try {
                // Receives e.g. hello-a.html -> (1, 1, 1, 1)
                // Every value is a 1 from the map side, so counting the
                // elements of the iterator gives the total occurrences
                int count = 0;
                for (IntWritable value : values) {
                    count++;
                }
                context.write(key, new IntWritable(count));
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public static void main(String[] args) throws Exception {
        // Initialize the configuration object
        Configuration conf = new Configuration();
        // Create the job object
        Job job = Job.getInstance(conf);
        // Set the map task class
        job.setMapperClass(Index1Map.class);
        // Set the reduce task class
        job.setReducerClass(Index1Reducer.class);
        // Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Set the final (reduce) output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Set the number of reduce tasks
        // job.setNumReduceTasks(2);
        // Set the input path
        FileInputFormat.setInputPaths(job, new Path("E:\\MR\\index"));
        // Set the output path
        FileOutputFormat.setOutputPath(job, new Path("E:\\MR\\In\\index1"));
        // Submit the job and wait for completion
        boolean s = job.waitForCompletion(true);
    }
}
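One detail worth noting: Index1Reducer counts the elements of the iterator rather than summing the values, which works here because every map-side value is a 1, but it means the class cannot be reused as a combiner (a combiner's outputs would be partial sums, not 1s). Below is a minimal sketch of a sum-based variant, an addition not in the original program, that would be safe to register with job.setCombinerClass:

static class Index1SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            // Summing (rather than counting) stays correct whether the values
            // are raw 1s from the map side or partial sums from a combiner
            sum += value.get();
        }
        v.set(sum);
        context.write(key, v);
    }
}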
Output of the first MapReduce program:
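The original screenshot is not reproduced here; derived from the sample inputs above, part-r-00000 of the first job should contain:

c++-c.html	2
hello-a.html	4
hello-b.html	4
hello-c.html	6
jack-b.html	1
java-c.html	1
jerry-b.html	1
jerry-c.html	1
jim-a.html	1
jim-b.html	1
kitty-a.html	1
kitty-b.html	1
rose-a.html	1
tom-a.html	1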
Second MapReduce program
package cn.doit19.hadoop.review.index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @author:tom
 * @Date:Created in 21:03 2020/11/17
 */
@SuppressWarnings("all")
public class Index2 {

    static class Index2Map extends Mapper<LongWritable, Text, Text, Text> {
        Text k = new Text();
        Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                // Called once per line of the first job's output, e.g. "hello-a.html	4"
                String line = value.toString();
                String[] words = line.split("\\s+"); // {hello-a.html, 4}
                String[] split = words[0].split("-"); // {hello, a.html}
                // Key is the bare word
                k.set(split[0]);
                // Value is filename-count, e.g. "a.html-4"
                v.set(split[1] + "-" + words[1]);
                context.write(k, v);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    static class Index2Reducer extends Reducer<Text, Text, Text, Text> {
        Text v = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Receives e.g. hello -> (a.html-4, b.html-4, c.html-6);
            // concatenate all filename-count pairs into one string
            StringBuilder sb = new StringBuilder();
            for (Text value : values) {
                sb.append(value).append(" ");
            }
            v.set(sb.toString().trim());
            context.write(key, v);
        }
    }

    public static void main(String[] args) throws Exception {
        // Initialize the configuration object
        Configuration conf = new Configuration();
        // Create the job object
        Job job = Job.getInstance(conf);
        // Set the map task class
        job.setMapperClass(Index2.Index2Map.class);
        // Set the reduce task class
        job.setReducerClass(Index2.Index2Reducer.class);
        // Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Set the final (reduce) output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Set the number of reduce tasks
        // job.setNumReduceTasks(2);
        // Input is the output directory of the first job
        FileInputFormat.setInputPaths(job, new Path("E:\\MR\\In\\index1"));
        // Set the output path
        FileOutputFormat.setOutputPath(job, new Path("E:\\MR\\out\\index2"));
        // Submit the job and wait for completion
        boolean s = job.waitForCompletion(true);
    }
}
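The two programs above are run one at a time by hand. A minimal sketch of an alternative, assuming the Index1 and Index2 classes above and the same package and imports: a single driver that submits job 1, waits for it to finish, and only submits job 2 if job 1 succeeded, so the chain fails fast.

// Same package and imports as the classes above; IndexDriver is hypothetical
public class IndexDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Path in = new Path("E:\\MR\\index");
        Path mid = new Path("E:\\MR\\In\\index1");
        Path out = new Path("E:\\MR\\out\\index2");

        // Job 1: count word-filename occurrences
        Job job1 = Job.getInstance(conf, "index-step1");
        job1.setJarByClass(Index1.class);
        job1.setMapperClass(Index1.Index1Map.class);
        job1.setReducerClass(Index1.Index1Reducer.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(IntWritable.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job1, in);
        FileOutputFormat.setOutputPath(job1, mid);

        // Abort the chain if the first job fails
        if (!job1.waitForCompletion(true)) {
            System.exit(1);
        }

        // Job 2: regroup by word and concatenate filename-count pairs
        Job job2 = Job.getInstance(conf, "index-step2");
        job2.setJarByClass(Index2.class);
        job2.setMapperClass(Index2.Index2Map.class);
        job2.setReducerClass(Index2.Index2Reducer.class);
        // Map output types default to the job output types (Text, Text)
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job2, mid);
        FileOutputFormat.setOutputPath(job2, out);

        System.exit(job2.waitForCompletion(true) ? 0 : 1);
    }
}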
Final output:
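Again derived from the sample inputs rather than reproduced from the original screenshot, the final result should look like this (the order of the filename-count pairs within a line is not guaranteed by MapReduce):

c++	c.html-2
hello	a.html-4 b.html-4 c.html-6
jack	b.html-1
java	c.html-1
jerry	b.html-1 c.html-1
jim	a.html-1 b.html-1
kitty	a.html-1 b.html-1
rose	a.html-1
tom	a.html-1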