Building an Inverted Index (MapReduce)

Requirement: suppose we have the following text files:

a.txt:
hello tom
hello jerry
hello jim
hello kitty

b.txt:
hello jack
hello jim
hello kitty
hello rose

c.txt:
hello jerry
hello java
hello c++
hello c++

We want the following result (for each word, the number of times it appears in each file):

hello  a.txt-->4  b.txt-->4  c.txt-->4

java   c.txt-->1

jerry  a.txt-->1  c.txt-->1

....

Approach:

(1) First, write a MapReduce job that counts the total number of occurrences of each word in each file, producing records such as:

hello-a.txt 4

hello-b.txt 4

jerry-b.txt 1

.......

(2) Then write a second MapReduce job that reads the output above and reshapes it into the final format:

map: split each line on "-", use the word as the key and the rest of the line as the value

reduce: concatenate every value for the same word, then output the word as the key and the concatenated string as the value (a single-record walkthrough of this transform follows below)
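To make step (2) concrete, here is a single-record walkthrough as a small standalone sketch (purely illustrative and not part of the original jobs; the class name Step2TransformSketch is made up), assuming step 1 emits lines of the form hello-a.txt<TAB>4:

package ldp.index;

// Illustrative sketch only: shows the string handling that IndexStep2 applies to one step-1 record.
public class Step2TransformSketch {

	public static void main(String[] args) {
		String line = "hello-a.txt\t4";                          // one step-1 output record
		String[] parts = line.split("-");                        // -> ["hello", "a.txt\t4"]
		String word = parts[0];                                  // map key: the word
		String fileAndCount = parts[1].replaceAll("\t", "-->");  // map value: "a.txt-->4"
		System.out.println(word + "\t" + fileAndCount);          // prints: hello	a.txt-->4
		// The step-2 reducer then concatenates all such values for the same word,
		// e.g. "hello	a.txt-->4	b.txt-->4	c.txt-->4".
	}
}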

Code for step 1:

package ldp.index;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class IndexStep1 {

		public static class IndexStep1Mapper extends Mapper<LongWritable, Text, Text, IntWritable>{
			
			
			@Override
			protected void map(LongWritable key, Text value,
					Mapper<LongWritable, Text, Text, IntWritable>.Context context)
					throws IOException, InterruptedException {
			
				// The input split tells us which source file (a.txt / b.txt / c.txt) this line came from
				FileSplit inputSplit = (FileSplit) context.getInputSplit();
				String name = inputSplit.getPath().getName();

				// Emit one ("word-filename", 1) pair for every word on the line
				String[] words = value.toString().split(" ");
				for (String word : words) {
					context.write(new Text(word + "-" + name), new IntWritable(1));
				}
			}
		}			
		public static class IndexStep1Reducer extends Reducer<Text, IntWritable, Text, IntWritable>{
			
			@Override
			protected void reduce(Text key, Iterable<IntWritable> values,
					Reducer<Text, IntWritable, Text, IntWritable>.Context context)
					throws IOException, InterruptedException {
				
				// Sum the 1s emitted for this "word-filename" key to get the word's total count in that file
				int count = 0;
				for (IntWritable value : values) {
					count += value.get();
				}
				context.write(key, new IntWritable(count));
			}
						
		}
			
			
		public static void main(String[] args) throws Exception {

				Configuration conf = new Configuration();				
				Job job = Job.getInstance(conf);
				job.setJarByClass(IndexStep1.class);
				job.setMapperClass(IndexStep1Mapper.class);
				job.setReducerClass(IndexStep1Reducer.class);
				job.setMapOutputKeyClass(Text.class);
				job.setMapOutputValueClass(IntWritable.class);				
				job.setOutputKeyClass(Text.class);
				job.setOutputValueClass(IntWritable.class);
				FileInputFormat.setInputPaths(job, new Path("E:\\hadoopdatas\\index_data\\input"));
				FileOutputFormat.setOutputPath(job, new Path("E:\\hadoopdatas\\index_data\\output1"));
				job.setNumReduceTasks(3);				
				job.waitForCompletion(true);				
			}
}
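For the sample files above, step 1 should produce the following records (spread across three part-r-xxxxx files under output1 because setNumReduceTasks(3) is set; exactly which record lands in which part file depends on the default hash partitioner):

c++-c.txt	2
hello-a.txt	4
hello-b.txt	4
hello-c.txt	4
jack-b.txt	1
java-c.txt	1
jerry-a.txt	1
jerry-c.txt	1
jim-a.txt	1
jim-b.txt	1
kitty-a.txt	1
kitty-b.txt	1
rose-b.txt	1
tom-a.txt	1

This is exactly the format that step 2 consumes.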

Code for step 2:

package ldp.index;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IndexStep2 {

		public static class IndexStep2Mapper extends Mapper<LongWritable, Text, Text, Text>{
						
			@Override
			protected void map(LongWritable key, Text value,
					Mapper<LongWritable, Text, Text, Text>.Context context)
					throws IOException, InterruptedException {
				
				// Each step-1 record looks like "word-filename<TAB>count":
				// the word becomes the key, and "filename<TAB>count" becomes "filename-->count" as the value
				String[] words = value.toString().split("-");
				context.write(new Text(words[0]), new Text(words[1].replaceAll("\t", "-->")));
			}
		}			
		public static class IndexStep2Reducer extends Reducer<Text, Text, Text, Text>{
			
			@Override
			protected void reduce(Text key, Iterable<Text> values,
					Reducer<Text, Text, Text, Text>.Context context)
					throws IOException, InterruptedException {
				
				// Concatenate every "filename-->count" segment for this word, tab-separated
				StringBuilder sb = new StringBuilder();
				for (Text value : values) {
					sb.append(value.toString()).append("\t");
				}
				context.write(key, new Text(sb.toString()));
			}					
		}
			
			
		public static void main(String[] args) throws Exception {

				Configuration conf = new Configuration();				
				Job job = Job.getInstance(conf);
				job.setJarByClass(IndexStep2.class);
				job.setMapperClass(IndexStep2Mapper.class);
				job.setReducerClass(IndexStep2Reducer.class);
				job.setMapOutputKeyClass(Text.class);
				job.setMapOutputValueClass(Text.class);				
				job.setOutputKeyClass(Text.class);
				job.setOutputValueClass(Text.class);
				FileInputFormat.setInputPaths(job, new Path("E:\\hadoopdatas\\index_data\\output1"));
				FileOutputFormat.setOutputPath(job, new Path("E:\\hadoopdatas\\index_data\\output2"));
				job.setNumReduceTasks(1);				
				job.waitForCompletion(true);
				
			}
}
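The two classes above are run one after the other, each from its own main method. If you prefer, the jobs can also be chained from a single driver. The following is a minimal sketch under the same assumptions as the code above (the same local Windows paths; the class name IndexDriver is illustrative and not from the original code):

package ldp.index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IndexDriver {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();

		// Step 1: count each word per source file, keyed as "word-filename"
		Job step1 = Job.getInstance(conf, "index step 1");
		step1.setJarByClass(IndexStep1.class);
		step1.setMapperClass(IndexStep1.IndexStep1Mapper.class);
		step1.setReducerClass(IndexStep1.IndexStep1Reducer.class);
		step1.setMapOutputKeyClass(Text.class);
		step1.setMapOutputValueClass(IntWritable.class);
		step1.setOutputKeyClass(Text.class);
		step1.setOutputValueClass(IntWritable.class);
		FileInputFormat.setInputPaths(step1, new Path("E:\\hadoopdatas\\index_data\\input"));
		FileOutputFormat.setOutputPath(step1, new Path("E:\\hadoopdatas\\index_data\\output1"));
		step1.setNumReduceTasks(3);

		// Only start step 2 once step 1 has finished successfully,
		// because step 2 reads step 1's output directory
		if (!step1.waitForCompletion(true)) {
			System.exit(1);
		}

		// Step 2: regroup by word and concatenate the per-file counts
		Job step2 = Job.getInstance(conf, "index step 2");
		step2.setJarByClass(IndexStep2.class);
		step2.setMapperClass(IndexStep2.IndexStep2Mapper.class);
		step2.setReducerClass(IndexStep2.IndexStep2Reducer.class);
		step2.setMapOutputKeyClass(Text.class);
		step2.setMapOutputValueClass(Text.class);
		step2.setOutputKeyClass(Text.class);
		step2.setOutputValueClass(Text.class);
		FileInputFormat.setInputPaths(step2, new Path("E:\\hadoopdatas\\index_data\\output1"));
		FileOutputFormat.setOutputPath(step2, new Path("E:\\hadoopdatas\\index_data\\output2"));
		step2.setNumReduceTasks(1);

		System.exit(step2.waitForCompletion(true) ? 0 : 1);
	}
}

Chaining this way keeps the dependency explicit: step 2 never starts unless step 1 has written its output.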

 
