I previously wrote a MapReduce indexing program based on SolrJ, and its performance was disappointing. Since Solr is built on top of Lucene, I rewrote the job directly against the matching Lucene version and, with the same analyzer, it ran 30 to 40 times faster; testing confirmed that the index it produces is recognized by SolrCloud. The job only uses the map phase; the files it generates go through one final local merge to form the finished index.
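Because the job is map-only, the driver just has to set the reduce-task count to zero and pass the parameters the mapper later reads from the configuration. The original driver is not included in this post; the following is only a minimal sketch of what it could look like (the IndexDriver class name and the argument order are my own assumptions, only the ZK_HOST / APP_ID / HDFS_FLOADER / INDEX_PATH keys come from the code below):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

public class IndexDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // parameters read by TestMapper in setup()/cleanup()
        conf.set("ZK_HOST", args[0]);      // SolrCloud ZooKeeper address
        conf.set("APP_ID", args[1]);       // collection / application id
        conf.set("HDFS_FLOADER", args[2]); // fs.defaultFS, e.g. hdfs://namenode:8020
        conf.set("INDEX_PATH", args[3]);   // HDFS directory the per-shard indexes go to

        Job job = Job.getInstance(conf, "lucene-mr-index");
        job.setJarByClass(IndexDriver.class);
        job.setMapperClass(TestMapper.class);
        job.setNumReduceTasks(0);                         // map-only job
        job.setOutputFormatClass(NullOutputFormat.class); // the mapper writes the index itself
        FileInputFormat.addInputPath(job, new Path(args[4]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}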
The main parts of the code are shown below:
public static class TestMapper extends Mapper<LongWritable, Text, Text, Text> {

    private static final Logger LOG = Logger.getLogger(TestMapper.class);

    RunInfo runInfo = null;

    /**
     * Preparation:
     * 1. fetch the SolrCloud DocCollection so we know which shard each document should go to;
     * 2. initialise the per-shard lists the documents will be collected into.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        CreateIndexWriter createIndexWriter = new CreateIndexWriter();
        try {
            runInfo = createIndexWriter.create(
                    context.getConfiguration().get("ZK_HOST"),
                    context.getConfiguration().get("APP_ID"), true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Walk through the input records, compute which shard each document belongs to,
     * and put it into the corresponding list.
     */
    @Override
    public void map(LongWritable key, Text columns, Context context)
            throws IOException, InterruptedException {
        InputData inputData = null;
        try {
            inputData = DocumentUtil.parseLog(columns);
        } catch (Exception e) {
            e.printStackTrace();
            return;
        }
        if (inputData != null) {
            String id = inputData.getId().toString();
            int sliceHash = DocumentUtil.sliceHash(id);
            Slice slice = DocumentUtil.hashToSlice(sliceHash, runInfo.getDocCollection());
            String shardName = slice.getName(); // shard1, shard2, ...
            Document doc = null;
            try {
                doc = DocumentUtil.getDocument(inputData);
            } catch (Exception e) {
                e.printStackTrace();
            }
            if (doc != null) { // skip documents that failed to convert
                runInfo.getShardDocs().get(shardName).add(doc);
            }
        }
    }

    /**
     * Flush each shard's list of documents, writing them into a directory on HDFS.
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        try {
            AddDoc addDoc = new AddDoc(runInfo,
                    context.getConfiguration().get("HDFS_FLOADER"),
                    context.getConfiguration().get("INDEX_PATH"));
            addDoc.write(runInfo.getShardDocs().size());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
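DocumentUtil.sliceHash and DocumentUtil.hashToSlice are not shown in this post. SolrCloud's default compositeId router hashes the document id with murmur3 and picks the slice whose hash range covers that value, so under that assumption the two helpers would have to do roughly the following (a sketch, not the original implementation):
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.util.Hash;

public class DocumentUtil {
    // murmur3 hash of the document id, as SolrCloud's compositeId router computes it
    public static int sliceHash(String id) {
        return Hash.murmurhash3_x86_32(id, 0, id.length(), 0);
    }

    // find the slice (shard) whose hash range contains the computed hash
    public static Slice hashToSlice(int hash, DocCollection collection) {
        for (Slice slice : collection.getActiveSlices()) {
            if (slice.getRange() != null && slice.getRange().includes(hash)) {
                return slice;
            }
        }
        throw new IllegalStateException("No slice covers hash " + hash);
    }
}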
To push the indexing performance further, addDoc.write uses one thread per shard so that all shards are written in parallel:
public void write(int num) throws Exception {
    ExecutorService pool = Executors.newFixedThreadPool(num);
    ArrayList<Future<Object>> get = new ArrayList<>();
    String random = getRandom();
    for (int i = 0; i < num; i++) {
        Log.info("indexMrLog shard" + (i + 1) + ": start write");
        Log.info("indexMrLog shard" + (i + 1) + " num is: "
                + runInfo.getShardDocs().get("shard" + (i + 1)).size());
        addCallable callable = new addCallable(
                runInfo.getShardDocs().get("shard" + (i + 1)), (i + 1), random);
        get.add(pool.submit(callable));
    }
    for (Future<Object> future : get) {
        try {
            future.get();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ExecutionException e) {
            e.printStackTrace();
        }
    }
    pool.shutdown();
}

class addCallable implements Callable<Object> {

    ArrayList<Document> documents;
    int num;
    String code;

    addCallable(ArrayList<Document> documents, int num, String code) {
        this.num = num;
        this.documents = documents;
        this.code = code;
    }

    @Override
    public Object call() throws Exception {
        try {
            IndexWriter indexWriter = getIndexWriter(HDFS_FOLADER, INDEX_PATH, num, code);
            runInfo.getIndexWriters().add(indexWriter);
            indexWriter.addDocuments(documents);
            indexWriter.commit();
            LOG.info("index writer: " + code + " has committed " + documents.size() + " docs");
            // indexWriter.close();
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
    }
}
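The getIndexWriter helper is not shown either. Judging from the merge step below, which walks the sub-directories under each shard folder on HDFS, each map task most likely opens one IndexWriter over an HdfsDirectory per shard, in a sub-directory named after the random code. A sketch under those assumptions (the path layout and the analyzer are placeholders; the analyzer has to match the collection's schema):
private IndexWriter getIndexWriter(String hdfsFolder, String indexPath,
        int shardNum, String code) throws IOException {
    Configuration hdfsconf = new Configuration();
    hdfsconf.set("fs.defaultFS", hdfsFolder); // HDFS root
    // assumed layout: one sub-directory per shard and per map task,
    // e.g. <INDEX_PATH>/shard1/<random code>
    Path path = new Path(indexPath + "/shard" + shardNum + "/" + code);
    HdfsDirectory dir = new HdfsDirectory(path, hdfsconf);
    // placeholder analyzer; in practice it must be the same analyzer as the Solr schema
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setUseCompoundFile(false);
    return new IndexWriter(dir, iwc);
}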
After the MapReduce job finishes, one more program merges all the folders under each shard into the final index files:
public void startCombine(String HDFS_FLOADER, String INDEX_PATH, int num) throws Exception {
    this.HDFS_FLOADER = HDFS_FLOADER;
    this.INDEX_PATH = INDEX_PATH;
    Configuration hdfsconf = new Configuration();
    hdfsconf.set("fs.hdfs.impl.disable.cache", "false");
    hdfsconf.set("fs.defaultFS", HDFS_FLOADER); // HDFS root
    FileSystem fs = FileSystem.get(hdfsconf);
    Path path = new Path(INDEX_PATH); // index root directory
    FileStatus[] files = fs.listStatus(path);
    BlockingQueue<Runnable> workQueue = new ArrayBlockingQueue<>(100);
    ThreadPoolExecutor pool = new ThreadPoolExecutor(40, 43, 1, TimeUnit.MINUTES, workQueue);
    ArrayList<Future<Object>> futures = new ArrayList<Future<Object>>();
    for (FileStatus file : files) {
        if (!file.getPath().getName().startsWith("shard")) {
            continue;
        }
        String dest = file.getPath().getParent().toString() + "/combine/"
                + file.getPath().getName();
        System.out.println("indexMrLog dest:" + dest);
        Callable<Object> callable = new combineCallable(file.getPath().toString(), dest, num);
        futures.add(pool.submit(callable));
    }
    for (Future<Object> future : futures) {
        try {
            LOG.info("indexMrLog: " + future.get());
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ExecutionException e) {
            e.printStackTrace();
        }
    }
    pool.shutdown();
}

private class combineCallable implements Callable<Object> {

    String source;
    String dest;
    int num;

    public combineCallable(String source, String dest, int num) {
        this.source = source;
        this.dest = dest;
        this.num = num;
    }

    @Override
    public Object call() {
        try {
            Configuration hdfsconf = new Configuration();
            hdfsconf.set("fs.hdfs.impl.disable.cache", "true");
            hdfsconf.set("fs.defaultFS", HDFS_FLOADER); // HDFS root
            FileSystem destFS = FileSystem.get(hdfsconf);
            Path path = new Path(dest);
            destFS.mkdirs(path);
            HdfsDirectory d = new HdfsDirectory(path, hdfsconf);
            IndexWriterConfig conf = new IndexWriterConfig(new KeywordAnalyzer());
            conf.setUseCompoundFile(false);
            conf.setRAMBufferSizeMB(25000);
            conf.setMaxBufferedDocs(5000000);
            conf.setCommitOnClose(false);
            LogMergePolicy logMergePolicy = new LogDocMergePolicy();
            logMergePolicy.setMergeFactor(num);
            logMergePolicy.setMaxMergeDocs(5000000);
            conf.setMergePolicy(logMergePolicy);
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            IndexWriter indexWriter = new IndexWriter(d, conf);
            indexWriters.add(indexWriter);
            System.out.println("source:" + source);
            FileStatus[] files = destFS.listStatus(new Path(source));
            ArrayList<HdfsDirectory> hdfsDirectories = new ArrayList<>();
            for (FileStatus file : files) {
                try {
                    if (!file.isDirectory()) {
                        continue;
                    }
                    String pathString = file.getPath().toString();
                    System.out.println("add index file:" + pathString);
                    Configuration tempHdfsconf = new Configuration(hdfsconf);
                    tempHdfsconf.set("fs.hdfs.impl.disable.cache", "true");
                    hdfsDirectories.add(new HdfsDirectory(new Path(pathString), tempHdfsconf));
                    // remove the stale lock left behind by the map task before merging
                    FileSystem fSystem = FileSystem.get(tempHdfsconf);
                    System.out.println(pathString + "/write.lock");
                    fSystem.delete(new Path(pathString + "/write.lock"), true);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            // only pass the directories that were actually collected
            indexWriter.addIndexes(hdfsDirectories.toArray(new HdfsDirectory[0]));
            indexWriter.forceMerge(num);
            indexWriter.commit();
            indexWriter.close();
            return source + " success";
        } catch (Exception e) {
            e.printStackTrace();
            return source + " failed";
        }
    }
}
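After startCombine completes, each shard's merged index sits under INDEX_PATH/combine/shardN and can be used as the data/index directory of the matching SolrCloud core. Before handing it over, it is worth opening the merged directory with a plain DirectoryReader and comparing the document count against what was fed in; a small check like the following (the shard1 path is just an example):
Configuration hdfsconf = new Configuration();
hdfsconf.set("fs.defaultFS", HDFS_FLOADER);
HdfsDirectory dir = new HdfsDirectory(new Path(INDEX_PATH + "/combine/shard1"), hdfsconf);
try (DirectoryReader reader = DirectoryReader.open(dir)) {
    // number of segments after forceMerge, and the total document count
    System.out.println("shard1 segments: " + reader.leaves().size()
            + ", docs: " + reader.numDocs());
}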