Lucene踩坑总结

最新推荐文章于 2024-03-07 13:51:48 发布

Forward_duyu

最新推荐文章于 2024-03-07 13:51:48 发布

阅读量4.5k

点赞数 2

文章标签： lucene 索引问题 IndexWriter 索引合并

本文链接：https://blog.csdn.net/zoroduyu/article/details/82830579

版权

前言

感觉好久都没写博客了，前段时间工作忙，实在没时间写。这段时间稍微空闲下来，抽点空，记录下这段时间工作上遇到的问题。

文章概要

对自己这段时间在工作中使用Lucene遇到的一些问题进行总结和归纳，并将解决方法给出，以供大家学习和参考。

详述

问题1:

报错：Lock held by this virtual machine

问题描述：

不知道各位小伙伴在写Lucene的过程中有没有遇到Lock held by this virtual machine这个问题，具体报错大致信息如下：

Exception in thread "main" org.apache.lucene.store.LockObtainFailedException: Lock held by this virtual machine: D:\work\address\write.lock
	at org.apache.lucene.store.NativeFSLockFactory.obtainFSLock(NativeFSLockFactory.java:139)
	at org.apache.lucene.store.FSLockFactory.obtainLock(FSLockFactory.java:41)
	at org.apache.lucene.store.BaseDirectory.obtainLock(BaseDirectory.java:45)
	at org.apache.lucene.index.IndexWriter.<init>(IndexWriter.java:718)
	at com.tydic.oms.rsc.test.DoubleIndexWriterTest.getIndexWriter(DoubleIndexWriterTest.java:47)
	at com.tydic.oms.rsc.test.DoubleIndexWriterTest.main(DoubleIndexWriterTest.java:28)

原因及解决办法：

这个报错的原因是：在Lucene中，打开一个IndexWriter之后，会自动在索引目录中生成一个write.lock文件，并将这个文件夹锁住；若在该IndexWriter尚未关闭时，对同一个文件夹再打开一个IndexWriter，则会抛出上面的异常。具体报错场景可以参考下面的代码：

package com.tydic.oms.rsc.test;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DoubleIndexWriterTest {

	/**
	 * Opens two IndexWriters on the same index folder without closing the first one.
	 * This listing exists to reproduce the "Lock held by this virtual machine" error,
	 * so the first writer is intentionally left open.
	 *
	 * @param args unused
	 * @throws IOException if a directory or writer cannot be opened
	 */
	public static void main(String[] args) throws IOException {
		final String indexPath = "D:\\work\\address";

		Directory firstDirectory = getDirectory(indexPath);
		IndexWriter firstWriter = getIndexWriter(firstDirectory);
		// do something with the first writer (it is NOT closed)

		Directory secondDirectory = getDirectory(indexPath);
		IndexWriter secondWriter = getIndexWriter(secondDirectory);
		// do something with the second writer
	}

	/**
	 * Creates an IndexWriter for the given directory.
	 *
	 * @param directory the index directory to write to
	 * @return a newly opened IndexWriter configured with a SmartChineseAnalyzer
	 * @throws IOException if the writer cannot be created
	 */
	public static IndexWriter getIndexWriter(Directory directory) throws IOException {
		// Chinese analyzer -> writer configuration -> writer instance
		return new IndexWriter(directory, new IndexWriterConfig(new SmartChineseAnalyzer()));
	}

	/**
	 * Opens a Directory for a folder path.
	 *
	 * @param path path of the index folder
	 * @return a Directory opened on that folder
	 * @throws IOException if the folder cannot be opened
	 */
	public static Directory getDirectory(String path) throws IOException {
		return FSDirectory.open(Paths.get(path));
	}

}

在上面这段代码的main方法中，只要writer1未被关闭，运行上面这段代码就会抛出这个异常。若要解决这个问题，必须将writer1关掉，代码修改如下：

package com.tydic.oms.rsc.test;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DoubleIndexWriterTest {

	/**
	 * Opens two IndexWriters on the same index folder, one after the other.
	 * The first writer is closed before the second one is created, so only one
	 * writer is open on the folder at any time.
	 *
	 * @param args unused
	 * @throws IOException if a directory or writer cannot be opened or closed
	 */
	public static void main(String[] args) throws IOException {
		final String indexPath = "D:\\work\\address";

		// try-with-resources closes writer1 (and then directory1) when the block ends,
		// even if the work inside throws.
		try (Directory directory1 = getDirectory(indexPath);
				IndexWriter writer1 = getIndexWriter(directory1)) {
			System.out.println("writer1    dosomething");
		}

		// writer1 is closed at this point, so a second writer can be opened on the same folder.
		try (Directory directory2 = getDirectory(indexPath);
				IndexWriter writer2 = getIndexWriter(directory2)) {
			System.out.println("writer2    dosomething");
		}
	}

	/**
	 * Creates an IndexWriter for the given directory.
	 *
	 * @param directory the index directory to write to
	 * @return a newly opened IndexWriter; the caller is responsible for closing it
	 * @throws IOException if the writer cannot be created
	 */
	public static IndexWriter getIndexWriter(Directory directory) throws IOException {
		// Chinese analyzer
		SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
		// writer configuration
		IndexWriterConfig config = new IndexWriterConfig(analyzer);
		// a new writer is created on every call
		return new IndexWriter(directory, config);
	}

	/**
	 * Opens a Directory for a folder path.
	 *
	 * @param path path of the index folder
	 * @return a Directory opened on that folder; the caller is responsible for closing it
	 * @throws IOException if the folder cannot be opened
	 */
	public static Directory getDirectory(String path) throws IOException {
		return FSDirectory.open(Paths.get(path));
	}

}

补充：

上面这段代码用到了中文分词器smartcn，相关maven配置如下：

<dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-smartcn</artifactId>
            <version>7.4.0</version>
</dependency>

从上面的例子可以看出，在实际使用过程中，需要保证同一时间只能有一个IndexWriter对同一个文件夹进行操作，所以我们需要一个管理工具类来对每个文件夹的IndexWriter进行统一的管理。这里我把自己写的一个管理工具类分享出来供大家参考：

import java.io.IOException;
import java.nio.file.Paths;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.springframework.stereotype.Component;

/**
 * Central registry for Directory, IndexWriter and IndexReader instances.
 * One index folder needs only one Directory and one IndexWriter, so instances are
 * cached in maps and reused instead of being created again on every call.
 *
 * <p>The lookup-then-create sequences are guarded by {@code synchronized}, so two
 * threads asking for the same folder at the same time cannot each create their
 * own IndexWriter for it.
 *
 * @author duyu
 *
 */
@Component
public class WriterAndDirManager {

	/**
	 * Cached IndexWriters, keyed by the Directory they write to.
	 */
	private final Map<Directory, IndexWriter> writerMap = new ConcurrentHashMap<>();

	/**
	 * Cached Directories, keyed by the normalized absolute path of the folder.
	 */
	private final Map<String, Directory> dirMap = new ConcurrentHashMap<>();

	/**
	 * Cached IndexReaders, keyed by the Directory they read from.
	 */
	private final Map<Directory, IndexReader> readerMap = new ConcurrentHashMap<>();

	/**
	 * Returns an IndexReader for the given directory.
	 * A cached reader is reused as long as its reference count is still positive;
	 * otherwise a new reader is opened and cached. A cached reader is returned as-is:
	 * this method does not reopen it.
	 *
	 * @param directory the index directory
	 * @return a usable IndexReader
	 * @throws IOException if a new reader cannot be opened
	 */
	public synchronized IndexReader getIndexReader(Directory directory) throws IOException {
		IndexReader cached = readerMap.get(directory);
		// a reader is considered open only while its reference count is greater than 0
		if (cached != null && cached.getRefCount() > 0) {
			return cached;
		}
		IndexReader opened = DirectoryReader.open(directory);
		readerMap.put(directory, opened);
		return opened;
	}

	/**
	 * Returns the Directory for a folder path, creating and caching it on first use.
	 * The path is normalized before it is used as the cache key, so different
	 * spellings of the same folder (relative vs. absolute, redundant "." or "..")
	 * map to the same Directory instance.
	 *
	 * @param path path of the index folder
	 * @return the Directory for that folder
	 * @throws IOException if the folder cannot be opened
	 */
	public synchronized Directory getDirectory(String path) throws IOException {
		String key = Paths.get(path).toAbsolutePath().normalize().toString();
		Directory cached = dirMap.get(key);
		if (cached != null) {
			return cached;
		}
		// not cached yet: open the folder and remember it
		Directory opened = FSDirectory.open(Paths.get(key));
		dirMap.put(key, opened);
		return opened;
	}

	/**
	 * Returns an open IndexWriter for the given directory.
	 * A cached writer is reused while it is still open; otherwise a new writer
	 * is created with a SmartChineseAnalyzer and cached.
	 *
	 * @param directory the index directory to write to
	 * @return an IndexWriter that has not been closed
	 * @throws IOException if a new writer cannot be created
	 */
	public synchronized IndexWriter getIndexWriter(Directory directory) throws IOException {
		IndexWriter cached = writerMap.get(directory);
		if (cached != null && cached.isOpen()) {
			return cached;
		}
		// Chinese analyzer
		SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
		// writer configuration
		IndexWriterConfig config = new IndexWriterConfig(analyzer);
		// no usable writer cached: create one and remember it
		IndexWriter created = new IndexWriter(directory, config);
		writerMap.put(directory, created);
		return created;
	}
}

问题2:合并索引部分失效问题

问题描述：

合并索引：将多个小索引文件合并成大的索引文件，lucene优化索引方法的一种，可以加快索引检索速度
不知道各位小伙伴有没有尝试过对文件夹内的索引进行合并，没有也没关系。我将在这里详细讲解一下，先来看下面这段代码：

package com.tydic.oms.rsc.test;

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DoubleIndexWriterTest {

	/**
	 * Adds three documents (id 1..3, name hehe1..hehe3) to the index in
	 * {@code D:\work\temp}, committing after each one.
	 * The writer and the directory are closed when the work is done.
	 *
	 * @param args unused
	 * @throws IOException if the index cannot be opened, written or closed
	 */
	public static void main(String[] args) throws IOException {
		try (Directory directory1 = getDirectory("D:\\work\\temp");
				IndexWriter writer1 = getIndexWriter(directory1)) {
			for (int i = 1; i <= 3; i++) {
				Document document = new Document();
				document.add(new StringField("id", String.valueOf(i), Field.Store.YES));
				document.add(new StringField("name", "hehe" + i, Field.Store.YES));

				writer1.addDocument(document);
				// commit separately for every document instead of once at the end
				writer1.commit();
			}
		}
	}

	/**
	 * Creates an IndexWriter for the given directory.
	 *
	 * @param directory the index directory to write to
	 * @return a newly opened IndexWriter; the caller is responsible for closing it
	 * @throws IOException if the writer cannot be created
	 */
	public static IndexWriter getIndexWriter(Directory directory) throws IOException {
		// Chinese analyzer
		SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
		// writer configuration
		IndexWriterConfig config = new IndexWriterConfig(analyzer);
		// a new writer is created on every call
		return new IndexWriter(directory, config);
	}

	/**
	 * Opens a Directory for a folder path.
	 *
	 * @param path path of the index folder
	 * @return a Directory opened on that folder; the caller is responsible for closing it
	 * @throws IOException if the folder cannot be opened
	 */
	public static Directory getDirectory(String path) throws IOException {
		return FSDirectory.open(Paths.get(path));
	}

}

上面的代码添加了三个文档，不过我故意分三次提交。程序运行的结果生成了三组索引文件（即三个段），如下图：

索引文件

可以看到，这里生成了三份索引文件_0,_1,_2。现在我们来按照下面这段代码进行索引合并：

package com.tydic.oms.rsc.test;

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DoubleIndexWriterTest2 {

	/**
	 * Runs {@code forceMerge(1)} on the index in {@code D:\work\temp} while an
	 * IndexReader on the same directory is open. The reader is never closed here.
	 *
	 * @param args unused
	 * @throws IOException if the index cannot be opened, merged or committed
	 */
	public static void main(String[] args) throws IOException {
		Directory indexDir = getDirectory("D:\\work\\temp");
		IndexWriter mergeWriter = getIndexWriter(indexDir);

		// open a reader and keep it open during the merge
		IndexReader openReader = getIndexReader(indexDir);

		// merge down to a single segment, then commit and close the writer
		mergeWriter.forceMerge(1);
		mergeWriter.commit();
		mergeWriter.close();
	}

	/**
	 * Creates an IndexWriter for the given directory.
	 *
	 * @param directory the index directory to write to
	 * @return a newly opened IndexWriter configured with a SmartChineseAnalyzer
	 * @throws IOException if the writer cannot be created
	 */
	public static IndexWriter getIndexWriter(Directory directory) throws IOException {
		// Chinese analyzer -> writer configuration -> writer instance
		return new IndexWriter(directory, new IndexWriterConfig(new SmartChineseAnalyzer()));
	}

	/**
	 * Opens a Directory for a folder path.
	 *
	 * @param path path of the index folder
	 * @return a Directory opened on that folder
	 * @throws IOException if the folder cannot be opened
	 */
	public static Directory getDirectory(String path) throws IOException {
		return FSDirectory.open(Paths.get(path));
	}

	/**
	 * Opens an IndexReader on the given directory.
	 *
	 * @param directory the index directory to read from
	 * @return a newly opened IndexReader
	 * @throws IOException if the reader cannot be opened
	 */
	public static IndexReader getIndexReader(Directory directory) throws IOException {
		return DirectoryReader.open(directory);
	}

}

在这里，我先获取了一个读取流，然后调用forceMerge(1);这个方法进行索引合并，合并的结果如下：
在这里插入图片描述

可以看到_0,_1,_2文件还是存在，但是多出了一个_3文件，也就是说索引并没有完全的合并。为什么会出现这种现象呢？目前根本原因尚不清楚，不过我们可以从forceMerge(1)这个方法的注释上进行一波猜测，forceMerge(1)方法的注释中文翻译（google机翻）如下：

强制合并策略合并段，直到<= maxNumSegments。要执行的实际合并由MergePolicy确定。
这是一个非常昂贵的操作，特别是当你通过一个小的maxNumSegments;通常你应该只在索引是静态的时候调用它（不再更改）。
请注意，这需要与目录中索引大小成比例的可用空间：如果不使用复合文件格式，则为2倍；如果使用，则为3倍。例如，如果索引大小为10 MB，则需要额外20 MB的可用空间才能完成（如果使用复合文件格式，则为30 MB）。这也受到执行合并所用Codec的影响，甚至可能导致索引变得更大。此外，最好在之后调用commit()，以便IndexWriter释放磁盘空间。
如果部分（而非全部）reader在合并期间重新打开，将导致超过2倍的临时空间被占用，因为这些新打开的reader会持有当时的临时段。在合并运行期间，最好不要重新打开reader。
实际的临时使用量可能远低于这些数字（这取决于许多因素）。
一般来说，一旦完成，索引的总大小将小于起始索引的大小。它可能会小很多（如果有许多待处理的删除），也可能只是略小一些。
如果遇到异常，例如磁盘已满，索引不会被破坏，也不会丢失任何文档。但是，它可能只完成了部分合并（某些段已合并但未全部合并），并且即使使用复合文件格式，索引中的某个段也可能是非复合格式。当在将段转换为复合格式的过程中遇到异常时，就会发生这种情况。
当调用开始时，此调用将合并索引中存在的那些段。如果其他线程仍在添加文档和刷新段，则除非再次调用forceMerge，否则不会合并这些新创建的段。

让我们来看看文档注释的最后一句话：当调用开始时，此调用将合并索引中存在的那些段。如果其他线程仍在添加文档和刷新段，则除非再次调用forceMerge，否则不会合并这些新创建的段。 简而言之，就是当合并索引的操作正在进行时，若此时还有添加文档或者刷新的操作，则不会将这部分索引合并。那让我们大胆猜测一波，当执行合并索引的操作时，若有indexReader没有关闭，也会影响合并索引的操作。那来验证一下我的假设，修改代码将上面的indexReader关闭，如下:

/**
 * Runs {@code forceMerge(1)} on the index in {@code D:\work\temp}.
 * Unlike the previous listing, the IndexReader is closed before the merge starts.
 *
 * @param args unused
 * @throws IOException if the index cannot be opened, merged or committed
 */
public static void main(String[] args) throws IOException {
	Directory indexDir = getDirectory("D:\\work\\temp");
	IndexWriter mergeWriter = getIndexWriter(indexDir);

	// open a reader and close it again before merging
	IndexReader closedReader = getIndexReader(indexDir);
	closedReader.close();

	// merge down to a single segment, then commit and close the writer
	mergeWriter.forceMerge(1);
	mergeWriter.commit();
	mergeWriter.close();
}

执行后的结果如下：

在这里插入图片描述

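这里可以看到，合并之后只剩_3这一个索引文件了。所以请务必注意，当进行索引合并操作时，务必关闭读入流。（补充说明：其实第一次运行时合并本身已经完成，_3就是合并后的段；旧的_0、_1、_2文件之所以还在，是因为未关闭的IndexReader仍然打开着这些文件，而Windows上无法删除正在被打开的文件。）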

问题3:读入流indexReader无法获取最新数据

先来看如下代码：

package com.tydic.oms.rsc.test;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DoubleIndexWriterTest3 {

	/**
	 * Opens an IndexReader, then writes and commits one document, and finally
	 * searches for that document through the reader that was opened before the write.
	 * Prints the number of hits.
	 *
	 * @param args unused
	 * @throws IOException if the index cannot be opened, written or searched
	 */
	public static void main(String[] args) throws IOException {
		Directory indexDir = getDirectory("D:\\work\\temp");
		IndexWriter indexWriter = getIndexWriter(indexDir);

		// the reader is opened BEFORE the new document is written
		IndexReader earlyReader = getIndexReader(indexDir);

		Document doc = new Document();
		doc.add(new StringField("id", "1000", Field.Store.YES));
		doc.add(new StringField("aliasName", "dwda", Field.Store.YES));

		// write one document and commit it
		indexWriter.addDocument(doc);
		indexWriter.flush();
		indexWriter.commit();

		// search for the document by its id field, using the earlier reader
		IndexSearcher searcher = new IndexSearcher(earlyReader);
		Query idQuery = new TermQuery(new Term("id", "1000"));

		// fetch at most one matching record
		TopDocs hits = searcher.search(idQuery, 1);
		System.out.println("查询到"+hits.totalHits+"条记录");
	}

	/**
	 * Creates an IndexWriter for the given directory.
	 *
	 * @param directory the index directory to write to
	 * @return a newly opened IndexWriter configured with a SmartChineseAnalyzer
	 * @throws IOException if the writer cannot be created
	 */
	public static IndexWriter getIndexWriter(Directory directory) throws IOException {
		// Chinese analyzer -> writer configuration -> writer instance
		return new IndexWriter(directory, new IndexWriterConfig(new SmartChineseAnalyzer()));
	}

	/**
	 * Opens a Directory for a folder path.
	 *
	 * @param path path of the index folder
	 * @return a Directory opened on that folder
	 * @throws IOException if the folder cannot be opened
	 */
	public static Directory getDirectory(String path) throws IOException {
		return FSDirectory.open(Paths.get(path));
	}

	/**
	 * Opens an IndexReader on the given directory.
	 *
	 * @param directory the index directory to read from
	 * @return a newly opened IndexReader
	 * @throws IOException if the reader cannot be opened
	 */
	public static IndexReader getIndexReader(Directory directory) throws IOException {
		return DirectoryReader.open(directory);
	}

}

这里我先new了一个indexReader，然后往索引中写入一个文档，之后马上利用之前的indexReader来查询刚刚写入的文档，结果如下：

在这里插入图片描述