I previously wrote a MapReduce indexing program based on SolrJ, and its performance was disappointing. Since Solr is built on top of Lucene, I rewrote the job directly against the matching Lucene version and, with the same analyzer, it ran 30 to 40 times faster; testing confirmed that the index it produces is recognized by SolrCloud. The job only uses the map phase; the files it generates go through one final local merge to form the finished index.
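Because the job is map-only, the driver just has to set the reduce-task count to zero and pass the parameters the mapper later reads from the configuration. The original driver is not included in this post; the following is only a minimal sketch of what it could look like (the IndexDriver class name and the argument order are my own assumptions, only the ZK_HOST / APP_ID / HDFS_FLOADER / INDEX_PATH keys come from the code below):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

public class IndexDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // parameters read by TestMapper in setup()/cleanup()
        conf.set("ZK_HOST", args[0]);      // SolrCloud ZooKeeper address
        conf.set("APP_ID", args[1]);       // collection / application id
        conf.set("HDFS_FLOADER", args[2]); // fs.defaultFS, e.g. hdfs://namenode:8020
        conf.set("INDEX_PATH", args[3]);   // HDFS directory the per-shard indexes go to

        Job job = Job.getInstance(conf, "lucene-mr-index");
        job.setJarByClass(IndexDriver.class);
        job.setMapperClass(TestMapper.class);
        job.setNumReduceTasks(0);                         // map-only job
        job.setOutputFormatClass(NullOutputFormat.class); // the mapper writes the index itself
        FileInputFormat.addInputPath(job, new Path(args[4]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}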
The main parts of the code are shown below:
public static class TestMapper extends Mapper<LongWritable, Text, Text, Text> {

    private static final Logger LOG = Logger.getLogger(TestMapper.class);

    RunInfo runInfo = null;

    /**
     * Preparation:
     * 1. fetch the SolrCloud DocCollection so we know which shard each document should go to;
     * 2. initialise the per-shard lists the documents will be collected into.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        CreateIndexWriter createIndexWriter = new CreateIndexWriter();
        try {
            runInfo = createIndexWriter.create(
                    context.getConfiguration().get("ZK_HOST"),
                    context.getConfiguration().get("APP_ID"), true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Walk through the input records, compute which shard each document belongs to,
     * and put it into the corresponding list.
     */
    @Override
    public void map(LongWritable key, Text columns, Context context)
            throws IOException, InterruptedException {
        InputData inputData = null;
        try {
            inputData = DocumentUtil.parseLog(columns);
        } catch (Exception e) {
            e.printStackTrace();
            return;
        }
        if (inputData != null) {
            String id = inputData.getId().toString();
            int sliceHash = DocumentUtil.sliceHash(id);
            Slice slice = DocumentUtil.hashToSlice(sliceHash, runInfo.getDocCollection());
            String shardName = slice.getName(); // shard1, shard2, ...
            Document doc = null;
            try {
                doc = DocumentUtil.getDocument(inputData);
            } catch (Exception e) {
                e.printStackTrace();
            }
            if (doc != null) { // skip documents that failed to convert
                runInfo.getShardDocs().get(shardName).add(doc);
            }
        }
    }

    /**
     * Flush each shard's list of documents, writing them into a directory on HDFS.
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        try {
            AddDoc addDoc = new AddDoc(runInfo,
                    context.getConfiguration().get("HDFS_FLOADER"),
                    context.getConfiguration().get("INDEX_PATH"));
            addDoc.write(runInfo.getShardDocs().size());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
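DocumentUtil.sliceHash and DocumentUtil.hashToSlice are not shown in this post. SolrCloud's default compositeId router hashes the document id with murmur3 and picks the slice whose hash range covers that value, so under that assumption the two helpers would have to do roughly the following (a sketch, not the original implementation):
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.util.Hash;

public class DocumentUtil {
    // murmur3 hash of the document id, as SolrCloud's compositeId router computes it
    public static int sliceHash(String id) {
        return Hash.murmurhash3_x86_32(id, 0, id.length(), 0);
    }

    // find the slice (shard) whose hash range contains the computed hash
    public static Slice hashToSlice(int hash, DocCollection collection) {
        for (Slice slice : collection.getActiveSlices()) {
            if (slice.getRange() != null && slice.getRange().includes(hash)) {
                return slice;
            }
        }
        throw new IllegalStateException("No slice covers hash " + hash);
    }
}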
To push the indexing performance further, addDoc.write uses one thread per shard so that all shards are written in parallel:
public void write(int num) throws Exception {
    ExecutorService pool = Executors.newFixedThreadPool(num);
    ArrayList<Future<Object>> get = new ArrayList<>();
    String random = getRandom();
    for (int i = 0; i < num; i++) {
        Log.info("indexMrLog shard" + (i + 1) + ": start write");
        Log.info("indexMrLog shard" + (i + 1) + " num is: "
                + runInfo.getShardDocs().get("shard" + (i + 1)).size());
        addCallable callable = new addCallable(
                runInfo.getShardDocs().get("shard" + (i + 1)), (i + 1), random);
        get.add(pool.submit(callable));
    }
    for (Future<Object> future : get) {
        try {
            future.get();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ExecutionException e) {
            e.printStackTrace();
        }
    }
    pool.shutdown();
}

class addCallable implements Callable<Object> {

    ArrayList<Document> documents;
    int num;
    String code;

    addCallable(ArrayList<Document> documents, int num, String code) {
        this.num = num;
        this.documents = documents;
        this.code = code;
    }

    @Override
    public Object call() throws Exception {
        try {
            IndexWriter indexWriter = getIndexWriter(HDFS_FOLADER, INDEX_PATH, num, code);
            runInfo.getIndexWriters().add(indexWriter);
            indexWriter.addDocuments(documents);
            indexWriter.commit();
            LOG.info("index writer: " + code + " has committed " + documents.size() + " docs");
            // indexWriter.close();
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
    }
}
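The getIndexWriter helper is not shown either. Judging from the merge step below, which walks the sub-directories under each shard folder on HDFS, each map task most likely opens one IndexWriter over an HdfsDirectory per shard, in a sub-directory named after the random code. A sketch under those assumptions (the path layout and the analyzer are placeholders; the analyzer has to match the collection's schema):
private IndexWriter getIndexWriter(String hdfsFolder, String indexPath,
        int shardNum, String code) throws IOException {
    Configuration hdfsconf = new Configuration();
    hdfsconf.set("fs.defaultFS", hdfsFolder); // HDFS root
    // assumed layout: one sub-directory per shard and per map task,
    // e.g. <INDEX_PATH>/shard1/<random code>
    Path path = new Path(indexPath + "/shard" + shardNum + "/" + code);
    HdfsDirectory dir = new HdfsDirectory(path, hdfsconf);
    // placeholder analyzer; in practice it must be the same analyzer as the Solr schema
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setUseCompoundFile(false);
    return new IndexWriter(dir, iwc);
}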
After the MapReduce job finishes, one more program merges all the folders under each shard into the final index files:
public void startCombine(String HDFS_FLOADER, String INDEX_PATH, int num) throws Exception {
    this.HDFS_FLOADER = HDFS_FLOADER;
    this.INDEX_PATH = INDEX_PATH;
    Configuration hdfsconf = new Configuration();
    hdfsconf.set("fs.hdfs.impl.disable.cache", "false");
    hdfsconf.set("fs.defaultFS", HDFS_FLOADER); // HDFS root
    FileSystem fs = FileSystem.get(hdfsconf);
    Path path = new Path(INDEX_PATH); // index root directory
    FileStatus[] files = fs.listStatus(path);
    BlockingQueue<Runnable> workQueue = new ArrayBlockingQueue<>(100);
    ThreadPoolExecutor pool = new ThreadPoolExecutor(40, 43, 1, TimeUnit.MINUTES, workQueue);
    ArrayList<Future<Object>> futures = new ArrayList<Future<Object>>();
    for (FileStatus file : files) {
        if (!file.getPath().getName().startsWith("shard")) {
            continue;
        }
        String dest = file.getPath().getParent().toString() + "/combine/"
                + file.getPath().getName();
        System.out.println("indexMrLog dest:" + dest);
        Callable<Object> callable = new combineCallable(file.getPath().toString(), dest, num);
        futures.add(pool.submit(callable));
    }
    for (Future<Object> future : futures) {
        try {
            LOG.info("indexMrLog: " + future.get());
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ExecutionException e) {
            e.printStackTrace();
        }
    }
    pool.shutdown();
}

private class combineCallable implements Callable<Object> {

    String source;
    String dest;
    int num;

    public combineCallable(String source, String dest, int num) {
        this.source = source;
        this.dest = dest;
        this.num = num;
    }

    @Override
    public Object call() {
        try {
            Configuration hdfsconf = new Configuration();
            hdfsconf.set("fs.hdfs.impl.disable.cache", "true");
            hdfsconf.set("fs.defaultFS", HDFS_FLOADER); // HDFS root
            FileSystem destFS = FileSystem.get(hdfsconf);
            Path path = new Path(dest);
            destFS.mkdirs(path);
            HdfsDirectory d = new HdfsDirectory(path, hdfsconf);
            IndexWriterConfig conf = new IndexWriterConfig(new KeywordAnalyzer());
            conf.setUseCompoundFile(false);
            conf.setRAMBufferSizeMB(25000);
            conf.setMaxBufferedDocs(5000000);
            conf.setCommitOnClose(false);
            LogMergePolicy logMergePolicy = new LogDocMergePolicy();
            logMergePolicy.setMergeFactor(num);
            logMergePolicy.setMaxMergeDocs(5000000);
            conf.setMergePolicy(logMergePolicy);
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            IndexWriter indexWriter = new IndexWriter(d, conf);
            indexWriters.add(indexWriter);
            System.out.println("source:" + source);
            FileStatus[] files = destFS.listStatus(new Path(source));
            ArrayList<HdfsDirectory> hdfsDirectories = new ArrayList<>();
            for (FileStatus file : files) {
                try {
                    if (!file.isDirectory()) {
                        continue;
                    }
                    String pathString = file.getPath().toString();
                    System.out.println("add index file:" + pathString);
                    Configuration tempHdfsconf = new Configuration(hdfsconf);
                    tempHdfsconf.set("fs.hdfs.impl.disable.cache", "true");
                    hdfsDirectories.add(new HdfsDirectory(new Path(pathString), tempHdfsconf));
                    // remove the stale lock left behind by the map task before merging
                    FileSystem fSystem = FileSystem.get(tempHdfsconf);
                    System.out.println(pathString + "/write.lock");
                    fSystem.delete(new Path(pathString + "/write.lock"), true);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            // only pass the directories that were actually collected
            indexWriter.addIndexes(hdfsDirectories.toArray(new HdfsDirectory[0]));
            indexWriter.forceMerge(num);
            indexWriter.commit();
            indexWriter.close();
            return source + " success";
        } catch (Exception e) {
            e.printStackTrace();
            return source + " failed";
        }
    }
}
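After startCombine completes, each shard's merged index sits under INDEX_PATH/combine/shardN and can be used as the data/index directory of the matching SolrCloud core. Before handing it over, it is worth opening the merged directory with a plain DirectoryReader and comparing the document count against what was fed in; a small check like the following (the shard1 path is just an example):
Configuration hdfsconf = new Configuration();
hdfsconf.set("fs.defaultFS", HDFS_FLOADER);
HdfsDirectory dir = new HdfsDirectory(new Path(INDEX_PATH + "/combine/shard1"), hdfsconf);
try (DirectoryReader reader = DirectoryReader.open(dir)) {
    // number of segments after forceMerge, and the total document count
    System.out.println("shard1 segments: " + reader.leaves().size()
            + ", docs: " + reader.numDocs());
}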