Data 1:
huangbo love xuzheng
huangxiaoming love baby huangxiaoming love mimi
liangchaowei love liujialing
Data 2:
hello huangbo
hello xuzheng
hello huangxiaoming
Problem 1: Write a MapReduce program that produces results in the following format: for each keyword, report on which lines of each document it appears and how many times it appears on each of those lines.
For example, the result for the keyword huangxiaoming looks like:
huangxiaoming mapreduce-4-1.txt:2,2;mapreduce-4-1.txt:4,1;mapreduce-4-2.txt:3,1
Approach: the first job splits each file into words and tags every word with its file name and line number, emitting word:fileName:lineNo as the key with a count of 1 as the value, and its reducer sums the counts. A second MapReduce job then regroups those intermediate records by keyword into the required format.
The first MapReduce job:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Question3_1_1 {
    public static class MRMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        Text k = new Text();
        IntWritable v = new IntWritable(1);
        // Line number within the current split; each of these small files is a single
        // split handled by one mapper, so this is the line number within the file
        int num = 0;

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            // Advance the line number
            num++;
            String[] words = line.split(" ");
            // Target format: huangxiaoming mapreduce-4-1.txt:2,2;mapreduce-4-1.txt:4,1;mapreduce-4-2.txt:3,1
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            // Get the file name from the input split
            String fileName = inputSplit.getPath().getName();
            for (String word : words) {
                // Emit word + file name + line number as the key
                k.set(word + ":" + fileName + ":" + num);
                System.out.println(word + "--" + fileName + "--" + num);
                context.write(k, v);
            }
        }
    }
    public static class MRReducer extends Reducer<Text, IntWritable, Text, NullWritable> {

        Text t = new Text();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // The key is word:fileName:lineNo; summing the values counts how many
            // times that word appears on that line of that file
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            // Emit "word:fileName:lineNo,count" as a single text record
            t.set(key.toString() + "," + count);
            context.write(t, NullWritable.get());
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Job job = Job.getInstance(conf);
        job.setJarByClass(Question3_1_1.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("G:/test/q3/input"));
        // Delete the output directory if it already exists
        if (fs.exists(new Path("G:/test/q3/output_3_1"))) {
            fs.delete(new Path("G:/test/q3/output_3_1"), true);
        }
        FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_1"));
        job.setMapperClass(MRMapper.class);
        job.setReducerClass(MRReducer.class);
        // Exit with 0 on success, 1 on failure
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
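With the sample files above (assuming they are stored as mapreduce-4-1.txt and mapreduce-4-2.txt under G:/test/q3/input), the output of this first job is one record per word/file/line combination in the form word:fileName:lineNo,count, for example:

baby:mapreduce-4-1.txt:2,1
hello:mapreduce-4-2.txt:1,1
huangxiaoming:mapreduce-4-1.txt:2,2

The second job below splits these records on ":" to recover the keyword and regroups them into the required format.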
The second MapReduce job:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Question3_1_2 {
    public static class MRMapper extends Mapper<LongWritable, Text, Text, Text> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Input records come from the first job, e.g. baby:mapreduce-4-1.txt:2,1
            String line = value.toString();
            String[] files = line.split(":");
            // Key: the word; value: fileName:lineNo,count
            String str = files[1] + ":" + files[2];
            context.write(new Text(files[0]), new Text(str));
        }
    }
    public static class MRReducer extends Reducer<Text, Text, Text, Text> {

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Concatenate every fileName:lineNo,count entry for this keyword
            StringBuffer sb = new StringBuffer();
            for (Text text : values) {
                sb.append(text.toString()).append(";");
            }
            context.write(key, new Text(sb.toString()));
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Job job = Job.getInstance(conf);
        job.setJarByClass(Question3_1_2.class);
        job.setMapperClass(MRMapper.class);
        job.setReducerClass(MRReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // The input is the output directory of the first job
        FileInputFormat.setInputPaths(job, new Path("G:/test/q3/output_3_1"));
        // Delete the output directory if it already exists
        if (fs.exists(new Path("G:/test/q3/output_3_2"))) {
            fs.delete(new Path("G:/test/q3/output_3_2"), true);
        }
        FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_2"));
        // Exit with 0 on success, 1 on failure
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Problem 2: Write a MapReduce program that counts how many times each keyword appears in each document, sorted by occurrence count in descending order.
For example:
huangxiaoming mapreduce-4-1.txt,3;mapreduce-4-2.txt,1
Meaning of the answer above:
The keyword huangxiaoming appears 3 times in the first document mapreduce-4-1.txt and 1 time in the second document mapreduce-4-2.txt.
Approach: first count how many times each keyword appears in each file, then sort the results.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Question3_2_1 {
    public static class MRMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        Text k = new Text();
        IntWritable v = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] words = line.split(" ");
            // Target format: huangxiaoming mapreduce-4-1.txt,3;mapreduce-4-2.txt,1
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            // Get the file name from the input split
            String fileName = inputSplit.getPath().getName();
            for (String word : words) {
                // Emit word + file name as the key
                k.set(word + ":" + fileName);
                context.write(k, v);
            }
        }
    }
    public static class MRReducer extends Reducer<Text, IntWritable, Text, NullWritable> {

        Text t = new Text();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum up the occurrences of this word in this file
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            t.set(key.toString() + "," + count);
            context.write(t, NullWritable.get());
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Job job = Job.getInstance(conf);
        job.setJarByClass(Question3_2_1.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("G:/test/q3/input"));
        // Delete the output directory if it already exists
        if (fs.exists(new Path("G:/test/q3/output_3_3"))) {
            fs.delete(new Path("G:/test/q3/output_3_3"), true);
        }
        FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_3"));
        job.setMapperClass(MRMapper.class);
        job.setReducerClass(MRReducer.class);
        // Exit with 0 on success, 1 on failure
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
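The output of this job is one record per keyword/file pair in the form word:fileName,count. With the sample data above (again assuming the file names mapreduce-4-1.txt and mapreduce-4-2.txt), it would include, for instance:

hello:mapreduce-4-2.txt,3
huangbo:mapreduce-4-1.txt,1
huangbo:mapreduce-4-2.txt,1

The next job parses these records by splitting on ":" and then on ",".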
The second job wraps each of these records in a custom object (TestBean), groups the records by keyword, and sorts them by occurrence count in descending order:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Question3_2_2 {
    // Target output format: huangxiaoming  mapreduce-4-1.txt,3;mapreduce-4-2.txt,1
    // Input record format:  yangmi:mapreduce-4-1.txt,1
    public static class MRMapper extends Mapper<LongWritable, Text, TestBean, NullWritable> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Parse word:fileName,count into a TestBean(word, fileName, count)
            String[] line = value.toString().split(":");
            TestBean tb = new TestBean(line[0], line[1].split(",")[0], Integer.parseInt(line[1].split(",")[1]));
            context.write(tb, NullWritable.get());
        }
    }
    public static class MRReducer extends Reducer<TestBean, NullWritable, Text, Text> {

        Text k = new Text();
        Text v = new Text();

        @Override
        protected void reduce(TestBean key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // With the grouping comparator, all records for one keyword arrive in a single
            // reduce() call; iterating the values advances the key through those records
            // in sorted order (count descending), so they are appended highest count first
            StringBuffer sb = new StringBuffer();
            for (NullWritable nv : values) {
                sb.append(key.getFileName()).append(",").append(key.getNum()).append(";");
            }
            k.set(key.getName());
            v.set(sb.toString());
            context.write(k, v);
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Job job = Job.getInstance(conf);
        job.setJarByClass(Question3_2_2.class);
        job.setMapperClass(MRMapper.class);
        job.setReducerClass(MRReducer.class);
        job.setMapOutputKeyClass(TestBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Group records that share the same keyword into one reduce() call
        job.setGroupingComparatorClass(UserGC.class);
        // The input is the output directory of the previous job
        FileInputFormat.setInputPaths(job, new Path("G:/test/q3/output_3_3"));
        // Delete the output directory if it already exists
        if (fs.exists(new Path("G:/test/q3/output_3_4"))) {
            fs.delete(new Path("G:/test/q3/output_3_4"), true);
        }
        FileOutputFormat.setOutputPath(job, new Path("G:/test/q3/output_3_4"));
        // Exit with 0 on success, 1 on failure
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Custom data type: TestBean
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class TestBean implements WritableComparable<TestBean>{
    private String name;
    private String fileName;
    private int num;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getFileName() {
        return fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public int getNum() {
        return num;
    }

    public void setNum(int num) {
        this.num = num;
    }

    public TestBean() {
        super();
    }

    public TestBean(String name, String fileName, int num) {
        super();
        this.name = name;
        this.fileName = fileName;
        this.num = num;
    }
    @Override
    public void write(DataOutput out) throws IOException {
        // Serialize the fields in a fixed order...
        out.writeUTF(name);
        out.writeUTF(fileName);
        out.writeInt(num);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // ...and deserialize them in exactly the same order
        name = in.readUTF();
        fileName = in.readUTF();
        num = in.readInt();
    }
    @Override
    public int compareTo(TestBean o) {
        // Keys with the same keyword are ordered by count, descending; keys with
        // different keywords are ordered by keyword, which keeps equal keywords
        // adjacent (the grouping comparator relies on this)
        if (o.getName().compareTo(this.getName()) == 0) {
            int flag = o.getNum() - this.getNum();
            if (flag == 0) {
                return 0;
            } else if (flag > 0) {
                return 1;
            } else {
                return -1;
            }
        } else {
            return o.getName().compareTo(this.getName());
        }
    }
}
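As a quick sanity check of this ordering (a standalone snippet for illustration only, not part of the job; the class name is made up):

public class TestBeanOrderCheck {
    public static void main(String[] args) {
        TestBean a = new TestBean("huangxiaoming", "mapreduce-4-1.txt", 3);
        TestBean b = new TestBean("huangxiaoming", "mapreduce-4-2.txt", 1);
        // Same keyword, so ordering falls back to the count in descending order:
        // a has the larger count and therefore sorts before b
        System.out.println(a.compareTo(b)); // prints -1
    }
}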
Custom grouping comparator: UserGC
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class UserGC extends WritableComparator{
    public UserGC() {
        // Register TestBean as the key type; "true" asks the parent to create
        // key instances so that compare() receives deserialized TestBean objects
        super(TestBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Compare on the keyword only, so every record with the same name is
        // grouped into a single reduce() call regardless of file name or count
        TestBean pa = (TestBean) a;
        TestBean pb = (TestBean) b;
        return pa.getName().compareTo(pb.getName());
    }
}
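One caveat worth noting: the job above relies on Hadoop's default single reduce task, so every keyword ends up at the same reducer. If the number of reducers were increased, a partitioner that partitions on the keyword alone would also be needed, otherwise records with the same name could be sent to different reducers. A minimal sketch of such a partitioner (NamePartitioner is a hypothetical name, not part of the original code; it would be registered with job.setPartitionerClass(NamePartitioner.class)):

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class NamePartitioner extends Partitioner<TestBean, NullWritable> {
    @Override
    public int getPartition(TestBean key, NullWritable value, int numPartitions) {
        // Partition on the keyword only, so records that share a name always
        // land on the same reducer, whatever their fileName or num
        return (key.getName().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}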