MapReduce:Index索引案例

该文章展示了一个使用HadoopMapReduce框架进行文件内容处理的示例,具体任务是统计每个单词在不同文件中出现的次数。通过两个MapReduce阶段,首先完成单词-文件名的映射和计数,然后对结果进行排序和汇总。
摘要由CSDN通过智能技术生成

案例需求

a.html
hello world
hello lucy
hello jack
hello liuyan

b.html
hello aaa
aaa bbb
bbb ccc
hello liuyan 
liuyan  tangyan

c.html
world hello 
liuyan tangyan
tangyan aaa
bbb    ccc


计算每个单词在每个文件中出现的次数 
aaa    b.html-2 c.html-1 
bbb    b.html-2 c.html-1 
ccc    b.html-1 c.html-1 
hello    a.html-4 b.html-2 c.html-1 
jack    a.html-1 
liuyan    b.html-2 a.html-1 c.html-1 
lucy    a.html-1 
tangyan    c.html-2 b.html-1 
world    a.html-1 c.html-1 

 

 

需求分析

如图所示,本案例需要串联两次MapReduce:第一次统计每个"单词-文件名"组合的出现次数,第二次以第一次的输出为输入,按单词汇总各文件的计数并按次数降序排列。

 

 代码

 

package day36.com.doit.demo3;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Test {

    /**
     * Stage 1 of the inverted-index job: for every token in every input file,
     * emit the key "word-filename" with a count of 1, then sum the counts per
     * key, producing output lines shaped like {@code hello-a.html<TAB>4}.
     */
    private static class IndexMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        /** Name of the file this split belongs to; resolved once in setup(). */
        private String fileName;

        // Reused output holders — avoids one allocation per record.
        private final Text k2 = new Text();
        // Every emitted count is 1, so the value never changes after construction.
        private final IntWritable v2 = new IntWritable(1);

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // A FileSplit covers exactly one file, so the name is constant per task.
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            fileName = fileSplit.getPath().getName();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split on runs of whitespace. A line with leading blanks yields an
            // empty first token; skip those so we never emit a bare "-filename" key.
            for (String word : value.toString().split("\\s+")) {
                if (word.isEmpty()) {
                    continue;
                }
                k2.set(word + "-" + fileName);
                context.write(k2, v2);
            }
        }
    }

    /** Sums the per-record 1s emitted by the mapper for each "word-filename" key. */
    private static class IndexReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private final IntWritable v3 = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            v3.set(sum);
            context.write(key, v3);
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();

        // Create the job.
        Job job = Job.getInstance(conf, "index");
        // Mapper and reducer classes.
        job.setMapperClass(IndexMapper.class);
        job.setReducerClass(IndexReducer.class);
        // Map output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Reduce (final) output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input and output locations.
        FileInputFormat.setInputPaths(job, new Path("d:\\ideawork\\bbb\\input"));
        FileOutputFormat.setOutputPath(job, new Path("d:\\ideawork\\bbb\\output2"));

        // Submit, wait, and surface the job result as the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

 

 将第一次输出结果作为第二次的输入

package day36.com.doit.demo3;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

public class Test2 {

    /**
     * Stage 2 of the inverted-index job: consumes stage-1 lines shaped like
     * {@code word-filename<TAB>count}, regroups them by word, and emits
     * {@code word<TAB>filename-count filename-count ...} with the file entries
     * ordered by count, descending.
     */
    private static class IndexMapper extends Mapper<LongWritable, Text, Text, Text> {

        // Reused output holders — avoids one allocation per record.
        private final Text k2 = new Text();
        private final Text v2 = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            // The word never contains '-', so the FIRST dash separates the word
            // from "filename<TAB>count". indexOf (rather than split) keeps any
            // later dash inside the filename intact and lets us skip bad lines
            // instead of throwing ArrayIndexOutOfBoundsException.
            int sep = line.indexOf('-');
            if (sep < 0) {
                return; // malformed line; ignore
            }
            k2.set(line.substring(0, sep));
            // Collapse "filename<TAB>count" into "filename-count".
            v2.set(line.substring(sep + 1).replaceAll("\\s+", "-"));
            context.write(k2, v2);
        }
    }

    private static class IndexReducer extends Reducer<Text, Text, Text, Text> {

        private final Text v3 = new Text();

        /** Extracts the numeric count from a "filename-count" entry. */
        private static int countOf(String entry) {
            return Integer.parseInt(entry.substring(entry.lastIndexOf('-') + 1));
        }

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            List<String> entries = new ArrayList<>();
            for (Text value : values) {
                // Iterable values are backed by a single reused Text; copy the String.
                entries.add(value.toString());
            }

            // Sort by count, descending. Parsing to int fixes the original
            // lexicographic comparison, under which "10" sorted before "2".
            Collections.sort(entries, new Comparator<String>() {
                @Override
                public int compare(String o1, String o2) {
                    return Integer.compare(countOf(o2), countOf(o1));
                }
            });

            // Join with single spaces (no stray leading space).
            StringBuilder sb = new StringBuilder();
            for (String entry : entries) {
                if (sb.length() > 0) {
                    sb.append(' ');
                }
                sb.append(entry);
            }
            v3.set(sb.toString());
            context.write(key, v3);
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();

        // Create the job.
        Job job = Job.getInstance(conf, "index");
        // Mapper and reducer classes.
        job.setMapperClass(IndexMapper.class);
        job.setReducerClass(IndexReducer.class);
        // Map output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Reduce (final) output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input is the output of stage 1 (Test); output is the final index.
        FileInputFormat.setInputPaths(job, new Path("d:\\ideawork\\bbb\\output2"));
        FileOutputFormat.setOutputPath(job, new Path("d:\\ideawork\\bbb\\output2_6"));

        // Submit, wait, and surface the job result as the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值