倒排索引案例（多job串联）

最新推荐文章于 2024-09-29 19:02:21 发布

佳讯好

最新推荐文章于 2024-09-29 19:02:21 发布

阅读量228

点赞数 6

分类专栏：大数据离线开发文章标签： mapreduce 分布式大数据 hadoop

本文链接：https://blog.csdn.net/qq_15304885/article/details/137194578

版权

大数据同时被 2 个专栏收录

14 篇文章 0 订阅

订阅专栏

离线开发

14 篇文章 0 订阅

订阅专栏

文章讲述了如何使用ApacheHadoop的MapReduce框架对大量文本（文档和网页）建立搜索索引，包括两次处理过程（一次输出文件名和单词计数，二次做倒排索引），以及如何通过JobControl串联两个Job任务以提高效率。

摘要由CSDN通过智能技术生成

一、需求：有大量的文本（文档、网页），需要建立搜索索引

（1）第一次预期输出结果

aaa--a.txt	3
aaa--b.txt	4
aaa--c.txt	4
dddddd--a.txt	2
dddddd--b.txt	3
dddddd--c.txt	5
ssssssssssssss--a.txt	1
ssssssssssssss--b.txt	2
ssssssssssssss--c.txt	6

（2）第二次预期输出结果：

aaa	c.txt-->4	b.txt-->4	a.txt-->3	
dddddd	c.txt-->5	b.txt-->3	a.txt-->2	
ssssssssssssss	c.txt-->6	b.txt-->2	a.txt-->1

（3）第三次预期输出结果：
使用多job串联的方法，达到在一次处理中得到两个文件

二、数据导入

新建三个txt文件即可：

a.txt

aaa
aaa
aaa
ssssssssssssss
dddddd
dddddd

b.txt

aaa
aaa
aaa
aaa
ssssssssssssss
ssssssssssssss
dddddd
dddddd
dddddd

c.txt

aaa
aaa
aaa
aaa
ssssssssssssss
ssssssssssssss
ssssssssssssss
ssssssssssssss
ssssssssssssss
ssssssssssssss
dddddd
dddddd
dddddd
dddddd
dddddd

三、代码实现

1、第一次处理

package bigdata.b16;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class IndexDriverOne {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        args = new String[]{"D:\\test\\b16\\index","D:\\test\\b16\\index1"};
        Configuration conf = new Configuration();
        Job job1 = Job.getInstance(conf);
        job1.setJarByClass(IndexDriverOne.class);
        job1.setMapperClass(IndexMapperOne.class);
        job1.setReducerClass(IndexReduceOne.class);

        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(IntWritable.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(IntWritable.class);

        //判断输出路径是否存在
        Path path = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);
        if(fs.exists(path)) {
            fs.delete(path, true);
        }
        FileInputFormat.setInputPaths(job1, new Path(args[0]));
        FileOutputFormat.setOutputPath(job1, path);
        System.exit(job1.waitForCompletion(true) ? 0 : 1);
    }
}

/**
 * mapper
 * 实现：1拿到每个文件的名字
 *       2.拿到每个单词
 *       3.统计每个文件里面单词的个数
 * */
class IndexMapperOne extends Mapper<LongWritable, Text,Text, IntWritable>{
    String name;
    Text k = new Text();
    IntWritable v = new IntWritable();
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        //获取文件对象
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        //获取文件的名字
        name = fileSplit.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        //获取数据
        String line = value.toString();
        //获取关键词
        String[] split = line.split(" ");
        for (String s :split){
            //将关键词和文件名字拼接，作为输出的key   MapReduce--a.txt
            k.set(s+"--"+name);
            v.set(1);
            //输出
            context.write(k,v);
        }
    }
}

/**
 * reduce
 * 将每个文件中的单词进行统计
 * */
class IndexReduceOne extends Reducer<Text, IntWritable,Text, IntWritable>{
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        //定义一个计数器
        int count = 0;
        for (IntWritable i : values){
            count+=Integer.parseInt("1");
        }
        context.write(key,new IntWritable(count));
    }
}

2、第二次处理

package bigdata.b16;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class IndexDriverTwo {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        args = new String[]{"D:\\test\\b16\\index1","D:\\test\\b16\\index2"};
        Configuration conf = new Configuration();
        Job job2 = Job.getInstance(conf);
        job2.setJarByClass(IndexDriverTwo.class);
        job2.setMapperClass(IndexMapperTow.class);
        job2.setReducerClass(IndexReduceTow.class);

        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);

        //判断输出路径是否存在
        FileSystem fs = FileSystem.get(conf);
        Path path1 = new Path(args[1]);
        if(fs.exists(path1)) {
            fs.delete(path1, true);
        }
        FileInputFormat.setInputPaths(job2, new Path(args[0]));
        FileOutputFormat.setOutputPath(job2, path1);
        job2.waitForCompletion(true);
    }
}

/**
 * mapper
 * 将关键词和文件名字分开，输出到reduce，利用MapReduce的分组，做最终的倒排索引
 * */
class IndexMapperTow extends Mapper<LongWritable, Text,Text,Text>{
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //获取数据
        String line = value.toString();
        //切分，按照“--”
        String[] split = line.split("--");

        //输出
        context.write(new Text(split[0]),new Text(split[1]));

    }
}
/**
 *
 * reduce
 *
 * */
class IndexReduceTow extends Reducer<Text,Text,Text,Text>{
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        //通过循环迭代器获取每个文件名字，然后拼接成一行
        StringBuilder stringBuilder =new StringBuilder();
        for (Text text : values){
            //a.txt    2，将制表符换成 -->
            String[] split = text.toString().split("\t");
            String s = split[0]+"-->"+split[1];
            stringBuilder.append(s).append("\t");
        }
        //输出
        context.write(key,new Text(stringBuilder.toString()));
    }
}

3、两个job串联

package bigdata.b16;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class IndexDriver {
    public static void main(String[] args) throws IOException, InterruptedException {
        args = new String[]{"D:\\test\\b16\\index","D:\\test\\b16\\index1","D:\\test\\b16\\index2"};

        //实例化配置文件
        Configuration conf = new Configuration();
        //定义job任务
        Job job1 = Job.getInstance(conf);
        //配置mapper和reducer
        job1.setMapperClass(IndexMapperOne.class);
        job1.setReducerClass(IndexReduceOne.class);

        //配置mapper输出数据类型，总的输出类型
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(IntWritable.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(IntWritable.class);

        //判断输出路径是否存在，如果存在就删除输出路径，如果不存运行job任务
        Path path = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);
        if(fs.exists(path)) {
            fs.delete(path, true);
        }
        FileInputFormat.setInputPaths(job1, new Path(args[0]));
        FileOutputFormat.setOutputPath(job1, path);


        Job job2 = Job.getInstance(conf);

        job2.setMapperClass(IndexMapperTow.class);
        job2.setReducerClass(IndexReduceTow.class);

        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);

        //判断输出路径是否存在
        Path path1 = new Path(args[2]);
        if(fs.exists(path1)) {
            fs.delete(path1, true);
        }
        FileInputFormat.setInputPaths(job2, new Path(args[1]));
        FileOutputFormat.setOutputPath(job2, path1);

        ControlledJob aJob = new ControlledJob(job1.getConfiguration());
        ControlledJob bJob = new ControlledJob(job2.getConfiguration());



        //创建管理job任务的control组
        JobControl index = new JobControl("index");
        //添加job任务
        index.addJob(aJob);
        index.addJob(bJob);
        //任务的依赖关系
        bJob.addDependingJob(aJob);
        //提交
        Thread thread = new Thread(index);
        thread.start();

        //控制任务停止
        while(!index.allFinished()){
            Thread.sleep(1000);
        }
        System.exit(0);
    }
}