Experiment Requirements
- Implement an inverted index: count how many times each word appears in each file; see the lab handout for the principle behind inverted indexes;
- Input: create a few files of your own, e.g. a.txt, b.txt, c.txt. Each file contains several lines of words separated by spaces. Upload these files to the /in directory on HDFS. For example, a.txt contains:
hadoop google scau
map hadoop reduce
hive hello hbase
- Write a program that produces the inverted index;
- Partitioning requirement: words starting with A-M (upper or lower case) go to partition 0; words starting with N-Z go to partition 1; words starting with any other character go to partition 2;
- Output format per word: hadoop a.txt->2,b.txt->1, where hadoop is the word (and the output key) and "a.txt->2,b.txt->1" is the output value, meaning the word hadoop appears 2 times in a.txt and 1 time in b.txt;
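Before either job runs, the input files have to be placed under /in on HDFS. A minimal sketch of the upload, assuming the hdfs command is on the PATH and a.txt, b.txt, c.txt sit in the current directory:
hdfs dfs -mkdir -p /in
hdfs dfs -put a.txt b.txt c.txt /in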
Code
First MapReduce job
Mapper class
K1 is Object, V1 is Text, K2 is Text (a string), V2 is IntWritable.
Output form: word->filename 1
package mr.index.first;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class FirstIndexMapper extends Mapper<Object, Text, Text, IntWritable> {
    String filename;
    Text k = new Text();
    IntWritable v = new IntWritable();

    @Override
    protected void setup(Mapper<Object, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        // Get the name of the file this split comes from
        FileSplit split = (FileSplit) context.getInputSplit();
        filename = split.getPath().getName();
    }

    @Override
    protected void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] words = line.split(" ");
        // Emit ("word->filename", 1) for every word on the line
        for (String word : words) {
            k.set(word + "->" + filename);
            v.set(1);
            context.write(k, v);
        }
    }
}
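For the first line of the sample a.txt, hadoop google scau, this mapper emits the pairs:
(hadoop->a.txt, 1)
(google->a.txt, 1)
(scau->a.txt, 1)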
Reducer class
K3, like K2, is Text; V3 is IntWritable.
Output form: word->filename count
package mr.index.first;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FirstIndexReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,
            Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // Sum the 1s emitted for the same "word->filename" key
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        v.set(count);
        context.write(key, v);
    }
}
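On the sample a.txt alone, where hadoop appears twice and every other word once, the first job's output contains lines such as the following (key and count separated by a tab):
google->a.txt 1
hadoop->a.txt 2
scau->a.txt 1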
main
package mr.index.first;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FirstIndexMain {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(FirstIndexMain.class);
        job.setMapperClass(FirstIndexMapper.class);
        job.setReducerClass(FirstIndexReducer.class);
        job.setNumReduceTasks(1);
        job.setMapOutputKeyClass(Text.class);          // key type of the map output
        job.setMapOutputValueClass(IntWritable.class); // value type of the map output
        job.setOutputKeyClass(Text.class);             // key type of the reduce output
        job.setOutputValueClass(IntWritable.class);    // value type of the reduce output
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
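A sketch of launching the first job, assuming the classes are packaged into a jar (index.jar is a hypothetical name) and the output directory /out1 does not exist yet:
hadoop jar index.jar mr.index.first.FirstIndexMain /in /out1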
Second MapReduce job
- Uses the first job's output directory as its input directory; the word becomes the key, and the value collects the filename->count entries.
Mapper class
K1 is Object, V1 is Text, K2 is Text, V2 is Text.
Output form: the word as the key, "filename<TAB>count" as the value (the tab becomes -> in the reducer)
package mr.index.second;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SecondIndexMapper extends Mapper<Object, Text, Text, Text> {
    Text k = new Text();
    Text v = new Text();

    @Override
    protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        // Each input line looks like "word->filename<TAB>count"
        String line = value.toString();
        String[] parts = line.split("->");
        k.set(parts[0]); // the word
        v.set(parts[1]); // "filename<TAB>count"
        context.write(k, v);
    }
}
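For example, the first-job output line hadoop->a.txt<TAB>2 splits on "->" into the key hadoop and the value a.txt<TAB>2; the tab inside the value is turned into -> by the reducer below.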
Reducer class
K3, like K2, is Text; V3 is Text.
Output form: word filename1->count1,filename2->count2
package mr.index.second;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class SecondIndexReducer extends Reducer<Text, Text, Text, Text> {
    Text v = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        // Rewrite each "filename<TAB>count" value as "filename->count" and
        // join the entries with commas, matching the required output format
        StringBuilder s = new StringBuilder();
        for (Text value : values) {
            if (s.length() > 0) {
                s.append(",");
            }
            s.append(value.toString().replace("\t", "->"));
        }
        v.set(s.toString());
        context.write(key, v);
    }
}
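With the counts from the example in the requirements (hadoop twice in a.txt, once in b.txt), this reducer emits a line like the following, key and value separated by a tab (the order of the entries in the value is not guaranteed):
hadoop a.txt->2,b.txt->1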
Partitioner class
Runs after the map phase; the key/value types it receives are the same as K2 and V2.
package mr.index.second;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class SecondIndexPartitioner extends Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        String word = key.toString().trim();
        if (word.length() == 0) {
            return 0;
        }
        // Upper-case the first character so a-m / n-z follow the same rule
        char firstChar = Character.toUpperCase(word.charAt(0));
        if (firstChar >= 'A' && firstChar <= 'M') {
            return 0; // A-M -> partition 0
        } else if (firstChar >= 'N' && firstChar <= 'Z') {
            return 1; // N-Z -> partition 1
        }
        return 2; // any other first character -> partition 2
    }
}
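A minimal local check of the partition rule (PartitionCheck is a hypothetical helper written for illustration, not part of the submitted job):
package mr.index.second;

import org.apache.hadoop.io.Text;

public class PartitionCheck {
    public static void main(String[] args) {
        SecondIndexPartitioner p = new SecondIndexPartitioner();
        Text empty = new Text("");
        System.out.println(p.getPartition(new Text("hadoop"), empty, 3)); // 0: 'h' is in A-M
        System.out.println(p.getPartition(new Text("scau"), empty, 3));   // 1: 's' is in N-Z
        System.out.println(p.getPartition(new Text("123abc"), empty, 3)); // 2: starts with a digit
    }
}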
main
package mr.index.second;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SecondIndexMain {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SecondIndexMain.class);
        job.setMapperClass(SecondIndexMapper.class);
        job.setReducerClass(SecondIndexReducer.class);
        job.setPartitionerClass(SecondIndexPartitioner.class);
        job.setNumReduceTasks(3);              // one reduce task per partition
        job.setMapOutputKeyClass(Text.class);  // key type of the map output
        job.setMapOutputValueClass(Text.class);// value type of the map output
        job.setOutputKeyClass(Text.class);     // key type of the reduce output
        job.setOutputValueClass(Text.class);   // value type of the reduce output
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
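The second job takes the first job's output directory as its input, so the two jobs run back to back; a sketch with the same hypothetical index.jar:
hadoop jar index.jar mr.index.second.SecondIndexMain /out1 /out2
Because three reduce tasks are configured, /out2 ends up with part-r-00000 (A-M words), part-r-00001 (N-Z) and part-r-00002 (everything else).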
Experiment Data
Run Results
First MapReduce results
Second MapReduce results
Summary
- The inverted index is built with two MapReduce jobs. The first job's logic and structure closely follow the earlier WordCount program; it is essentially WordCount with the filename folded into K2. In the second job, the reducer accumulates not word counts but the value strings, concatenating them appropriately.
- Regarding V1: whether the line holds words (first job) or a count that was written as an IntWritable (second job), anything read back from a file arrives as Text, and that is what allows V1 to be split as a string.