Concept
Inverted index: the inverted index is one of the most commonly used data structures in document retrieval systems and is widely applied in search engines; it is a way of locating documents by their content. Because this reverses the usual document-to-content lookup, it is called an inverted index.
Put simply: given a word, it tells you which files the word appears in and how often.
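For example (the file names and counts below are made up purely for illustration), an inverted-index entry for the word "hadoop" might look like:

hadoop -> (doc1.txt, 3), (doc2.txt, 1)

meaning the word appears three times in doc1.txt and once in doc2.txt.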
Design
Map phase
When the documents to be indexed are uploaded to HDFS, the input files are split into lines, and each line becomes an <offset, line contents> key/value pair that is fed to the map function. The map function extracts the information the index needs: the word, the document URI, and the term frequency.
key: the word and the URI
value: the number of occurrences of the word (the mapper emits a 1 for each occurrence); see the worked example below.
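Assuming each input line has the form url$$title$$content (this is the format the mapper in the sample code expects; the concrete values here are invented for illustration), the line

http://example.com/a$$hadoop notes$$hadoop mapreduce hadoop

would make the mapper emit:

<(hadoop, http://example.com/a), 1>      (from the title)
<(notes, http://example.com/a), 1>       (from the title)
<(hadoop, http://example.com/a), 1>      (from the content)
<(mapreduce, http://example.com/a), 1>   (from the content)
<(hadoop, http://example.com/a), 1>      (from the content)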
Combiner phase
After the map function has run, the combine step adds up the values that share the same key, which yields the frequency of a word within a single document. To make sure records with the same key are sent to the corresponding reducer, a custom key type is needed.
Note
Because the map output types must match the reduce input types, the combiner's input and output types must also be identical to them.
Reduce phase
The reducer receives the [key, value] pairs produced by the combiner. The reduce step only needs to merge the values of identical keys into the inverted-index file format; everything else is handled by the MapReduce framework.
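Continuing the hypothetical example above, the combiner (and later the reducer) collapses the three <(hadoop, http://example.com/a), 1> records into <(hadoop, http://example.com/a), 3>. With the URLWritable type shown below, whose toString() joins the word and the URL with a comma, the final output file then contains lines such as:

hadoop,http://example.com/a<TAB>3

where <TAB> stands for the tab character, the default key/value separator of TextOutputFormat.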
Sample code
Combiner
package com.hao.bigdata.hadoop.mapreduce.InvertedIndex;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class IndexCombiner extends
        Reducer<URLWritable, IntWritable, URLWritable, IntWritable> {

    private IntWritable combinerOutputValue = new IntWritable();

    @Override
    public void reduce(URLWritable key, Iterable<IntWritable> values,
            Context context) throws IOException, InterruptedException {
        // Sum the counts for one (word, URL) key on the map side,
        // so that fewer records go through the shuffle.
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        combinerOutputValue.set(sum);
        context.write(key, combinerOutputValue);
    }
}
Custom key type
package com.hao.bigdata.hadoop.mapreduce.InvertedIndex;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class URLWritable implements WritableComparable<URLWritable> {

    private String key; // the word
    private String url; // the document URL

    public URLWritable() {
    }

    public URLWritable(String key, String url) {
        this.set(key, url);
    }

    public String getKey() {
        return key;
    }

    public void setKey(String key) {
        this.key = key;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public void set(String key, String url) {
        this.key = key;
        this.url = url;
    }

    // Serialize both fields for the shuffle.
    public void write(DataOutput out) throws IOException {
        out.writeUTF(key);
        out.writeUTF(url);
    }

    // Deserialize the fields in the same order they were written.
    public void readFields(DataInput in) throws IOException {
        this.key = in.readUTF();
        this.url = in.readUTF();
    }

    // Sort first by word, then by URL (both are plain strings).
    public int compareTo(URLWritable o) {
        int comp = this.key.compareTo(o.getKey());
        if (0 != comp) {
            return comp;
        }
        return this.url.compareTo(o.getUrl());
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((key == null) ? 0 : key.hashCode());
        result = prime * result + ((url == null) ? 0 : url.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        URLWritable other = (URLWritable) obj;
        if (key == null) {
            if (other.key != null)
                return false;
        } else if (!key.equals(other.key))
            return false;
        if (url == null) {
            if (other.url != null)
                return false;
        } else if (!url.equals(other.url))
            return false;
        return true;
    }

    @Override
    public String toString() {
        return key + "," + url;
    }
}
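A quick sanity check for the custom type (a minimal sketch meant to be run locally, outside of any MapReduce job; the word and URL values are made up):

URLWritable a = new URLWritable("hadoop", "http://example.com/a");
URLWritable b = new URLWritable("hadoop", "http://example.com/a");
// identical (word, URL) pairs must compare as equal so that the framework
// groups them together on the reduce side
System.out.println(a.compareTo(b) == 0);                          // expected: true
System.out.println(a.equals(b) && a.hashCode() == b.hashCode());  // expected: true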
MapReduce job (mapper, reducer, and driver)
package com.hao.bigdata.hadoop.mapreduce.InvertedIndex;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class InvertedMapReduce extends Configured implements Tool {

    // Mapper: Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
    public static class IndexMapper extends
            Mapper<LongWritable, Text, URLWritable, IntWritable> {

        // map output value: a 1 for every occurrence of a word
        private IntWritable mapOutputValue = new IntWritable(1);
        private URLWritable mapOutputKey = new URLWritable();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // each input line has the form: url$$title$$content
            String lineValue = value.toString();
            String[] values = lineValue.split("\\$\\$");
            String url = values[0];
            String title = values[1];
            String content = values[2];

            // split the title into words
            String[] titleWords = title.split(" ");
            for (String word : titleWords) {
                mapOutputKey.set(word, url);
                context.write(mapOutputKey, mapOutputValue);
            }

            // split the content into words
            String[] contentWords = content.split(" ");
            for (String word : contentWords) {
                mapOutputKey.set(word, url);
                context.write(mapOutputKey, mapOutputValue);
            }
        }
    }

    // Reducer: Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
    public static class IndexReducer extends
            Reducer<URLWritable, IntWritable, URLWritable, IntWritable> {

        private IntWritable reduceOutputValue = new IntWritable();

        @Override
        public void reduce(URLWritable key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            // add up the counts for one (word, URL) key to get the term frequency
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            reduceOutputValue.set(sum);
            context.write(key, reduceOutputValue);
        }
    }

    // driver
    public int run(String args[]) throws Exception {
        // step 1: get the Configuration
        Configuration configuration = super.getConf();

        // step 2: create the Job, chaining input -> map -> reduce -> output
        Job job = Job.getInstance(configuration, this.getClass()
                .getSimpleName());
        job.setJarByClass(this.getClass()); // the jar that contains this job

        /**
         * step 3: job input -> map -> reduce -> output
         */
        // step 3.1: input
        Path inpath = new Path(args[0]); // wrap the input path
        FileInputFormat.addInputPath(job, inpath);

        // step 3.2: mapper
        job.setMapperClass(IndexMapper.class);
        job.setMapOutputKeyClass(URLWritable.class); // map output <key, value> types
        job.setMapOutputValueClass(IntWritable.class);

        // =============shuffle========================
        // 1. partitioner
        // job.setPartitionerClass(cls);
        // 2. sort
        // job.setSortComparatorClass(cls);
        // 3. combiner
        job.setCombinerClass(IndexCombiner.class);
        // 4. compression: set via the configuration
        // 5. grouping
        // job.setGroupingComparatorClass(cls);
        // ==============shuffle=======================

        // step 3.3: reducer
        job.setReducerClass(IndexReducer.class); // reduce output <key, value> types
        job.setOutputKeyClass(URLWritable.class);
        job.setOutputValueClass(IntWritable.class);
        // set the number of reduce tasks if needed
        // job.setNumReduceTasks(0);

        // step 3.4: output
        Path outpath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outpath);

        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    // main
    public static void main(String[] args) throws Exception {
        /*
         * args = new String[] {
         * "hdfs://bigdata00.hadoop-hao.com:8020/data/inputFiles/input02",
         * "hdfs://bigdata00.hadoop-hao.com:8020/data/outputFiles/output04" };
         */
        // create the configuration
        Configuration configuration = new Configuration();
        // run the job
        int status = ToolRunner.run(configuration, new InvertedMapReduce(),
                args);
        // exit with the job status
        System.exit(status);
    }
}
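Assuming the three classes above are packaged into a jar named invertedindex.jar (the jar name and paths below are only examples), the job could be submitted like this:

hadoop jar invertedindex.jar com.hao.bigdata.hadoop.mapreduce.InvertedIndex.InvertedMapReduce \
    /data/inputFiles/input02 /data/outputFiles/output04

Because the driver runs through ToolRunner, generic Hadoop options (for example -D properties) can also be passed in front of the two path arguments.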