This test code accompanies the earlier article "Hadoop MapReduce 上利用Lucene实现分布式索引" (building a distributed index with Lucene on Hadoop MapReduce).
While working on an earlier task, I needed to look up several hundred thousand questionIDs and pull out the content associated with each one. Simple sequential search or binary search was not practical at that scale, so I wrote QuestionIndexMR. Its main purpose is to retrieve the value associated with a given questionID quickly. Note that the design here is like using a file name to pull out the file's content, whereas a traditional index search works the other way around ^_^, so keep that distinction in mind.
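To make the intended usage concrete, here is a minimal sketch of the lookup side, assuming a Lucene 3.x-style API, a local copy of the built index, and that the index stores the ID in a field named "questionID" and the value in a field named "content". These field names are placeholders of mine; the actual field layout is whatever HDSDocumentOutput writes (see the earlier article), so adjust accordingly.

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class QuestionLookup {
    // Look up the value stored for one questionID in a local copy of the index.
    public static String lookup(String indexDir, String questionID) throws Exception {
        IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexDir)));
        IndexSearcher searcher = new IndexSearcher(reader);
        try {
            // Exact-match query on the (assumed) questionID field.
            TopDocs hits = searcher.search(
                    new TermQuery(new Term("questionID", questionID)), 1);
            if (hits.totalHits == 0) {
                return null;
            }
            Document doc = searcher.doc(hits.scoreDocs[0].doc);
            return doc.get("content");   // assumed field holding the value
        } finally {
            searcher.close();
            reader.close();
        }
    }
}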
The source of QuestionIndexMR is as follows:
package question.index;
import hdfs.document.HDFSDocument;
import hdfs.document.HDSDocumentOutput;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
public class QuestionIndexMR extends Configured
        implements Mapper<LongWritable, Text, Text, Text>,
                   Reducer<Text, Text, Text, HDFSDocument> {

    String charset = null;

    @Override
    public void configure(JobConf job) {
        setConf(job);
    }

    @Override
    public void close() throws IOException {
    }

    @Override
    public void map(LongWritable key, Text value,
            OutputCollector<Text, Text> collector, Reporter reporter)
            throws IOException {
        charset = getConf().get("charset");
        // Each input line has the form "questionID\tvalue1".
        String tempValue = new String(value.getBytes(), 0, value.getLength(), charset);
        String[] splitResu = tempValue.split("\t");
        Text questionID = new Text(splitResu[0]);
        collector.collect(questionID, new Text(splitResu[1]));
    }

    @Override
    public void reduce(Text key, Iterator<Text> values,
            OutputCollector<Text, HDFSDocument> collector, Reporter reporter)
            throws IOException {
        while (values.hasNext()) {
            // Wrap each (questionID, value) pair in an HDFSDocument so that
            // HDSDocumentOutput can turn it into a Lucene document.
            HashMap<String, String> fields = new HashMap<String, String>();
            fields.put(key.toString(), values.next().toString());
            HDFSDocument doc = new HDFSDocument();
            doc.setFields(fields);
            collector.collect(key, doc);
        }
    }

    public void run() throws Exception {
        String questionInput = "/user/zhl/question_category_keywords";
        String questionOutput = "/user/zhl/question_luceneIndex";

        Configuration conf = new Configuration();
        conf.set("charset", "utf-8");

        JobConf job = new JobConf(conf, QuestionIndexMR.class);
        job.setJarByClass(QuestionIndexMR.class);
        job.setJobName("ProblemIndexer");

        FileInputFormat.addInputPath(job, new Path(questionInput));

        // Delete a stale output directory, otherwise the job refuses to start.
        Path outpath = new Path(questionOutput);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outpath))
            fs.delete(outpath, true);
        FileOutputFormat.setOutputPath(job, outpath);

        job.setMapperClass(QuestionIndexMR.class);
        job.setReducerClass(QuestionIndexMR.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(HDFSDocument.class);
        job.setOutputFormat(HDSDocumentOutput.class);
        job.setNumMapTasks(45);
        job.setNumReduceTasks(1);

        JobClient.runJob(job);
    }
}
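HDFSDocument and HDSDocumentOutput come from the earlier article linked above. For readers without it at hand, here is a rough sketch of the shape of HDFSDocument that this job assumes: a Writable carrying the field map that the reducer fills in. This is only my reconstruction; the real class in the earlier article may differ.

package hdfs.document;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.io.Writable;

// Sketch of the assumed HDFSDocument: a serializable bag of (field name, field value) pairs.
public class HDFSDocument implements Writable {
    private HashMap<String, String> fields = new HashMap<String, String>();

    public void setFields(HashMap<String, String> fields) { this.fields = fields; }
    public HashMap<String, String> getFields() { return fields; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(fields.size());
        for (Map.Entry<String, String> e : fields.entrySet()) {
            out.writeUTF(e.getKey());
            out.writeUTF(e.getValue());
        }
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        fields = new HashMap<String, String>();
        int n = in.readInt();
        for (int i = 0; i < n; i++) {
            fields.put(in.readUTF(), in.readUTF());
        }
    }
}

The only part this job really depends on is setFields(HashMap), which the reducer above calls before emitting the document to HDSDocumentOutput.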
This was the initial solution. Later I found that as the amount of indexed content grew, retrieval slowed down dramatically.
The final solution was to follow MapReduce's streaming model instead: whenever the content of a questionID is needed, feed the corresponding content in as job input and do the matching in the map/reduce phases.
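The final code isn't shown here, but the idea can be sketched as a reduce-side filter/join: feed both the question data ("questionID\tcontent" records) and a plain list of the questionIDs to look up into the same job, tag each record in map, and let reduce emit content only for the requested IDs. The "__WANTED__" marker, record formats, and class names below are just placeholders.

package question.index;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class QuestionLookupMR {

    // Map: lines containing a tab are "questionID\tcontent" records from the data set;
    // lines without a tab are questionIDs we want to look up.
    public static class LookupMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {
        public void map(LongWritable key, Text value,
                OutputCollector<Text, Text> collector, Reporter reporter)
                throws IOException {
            String line = value.toString();
            int tab = line.indexOf('\t');
            if (tab < 0) {
                collector.collect(new Text(line.trim()), new Text("__WANTED__"));
            } else {
                collector.collect(new Text(line.substring(0, tab)),
                        new Text(line.substring(tab + 1)));
            }
        }
    }

    // Reduce: emit the content only when this questionID was requested.
    public static class LookupReducer extends MapReduceBase
            implements Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<Text, Text> collector, Reporter reporter)
                throws IOException {
            boolean wanted = false;
            String content = null;
            while (values.hasNext()) {
                String v = values.next().toString();
                if ("__WANTED__".equals(v)) {
                    wanted = true;
                } else {
                    content = v;
                }
            }
            if (wanted && content != null) {
                collector.collect(key, new Text(content));
            }
        }
    }
}

Because the matching is done by the shuffle on questionID, the cost no longer grows with the size of a separate index; each batch of lookups is just one scan over the data, which fits the streaming nature of MapReduce much better.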