基于Lucene的多场景检索系统开发指南
官网
一、项目构建配置 (pom.xml)
<dependencies>
    <!-- Lucene core library (indexing, searching, StandardAnalyzer) -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-core</artifactId>
        <version>8.11.1</version>
    </dependency>
    <!-- Lucene classic query parser: provides the QueryParser class used by the search example.
         Keep its version identical to lucene-core. -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-queryparser</artifactId>
        <version>8.11.1</version>
    </dependency>
    <!-- Office document (docx/xlsx/pptx) text extraction -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>5.2.3</version>
    </dependency>
    <!-- MySQL JDBC connector -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.30</version>
    </dependency>
    <!-- HTTP fetching and HTML parsing -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.15.3</version>
    </dependency>
</dependencies>
二、基础索引构建类
/**
 * Base class for index builders. Opens a file-system index at the given path and
 * owns the {@link Directory}, {@link Analyzer} and {@link IndexWriter} that
 * subclasses use to add documents.
 *
 * <p>Implements {@link AutoCloseable} so an indexer can be used in a
 * try-with-resources statement.
 */
public abstract class BaseIndexer implements AutoCloseable {
    protected Directory directory;
    protected Analyzer analyzer;
    protected IndexWriter writer;

    /**
     * Opens (or creates) the index located at {@code indexPath}.
     *
     * @param indexPath file-system path of the index directory
     * @throws IOException if the directory or the index writer cannot be opened
     */
    public BaseIndexer(String indexPath) throws IOException {
        Directory dir = FSDirectory.open(Paths.get(indexPath));
        try {
            this.analyzer = new StandardAnalyzer();
            IndexWriterConfig config = new IndexWriterConfig(analyzer);
            this.writer = new IndexWriter(dir, config);
        } catch (IOException | RuntimeException e) {
            // The writer could not be created: release the directory we just opened
            // so a failed constructor does not leak it.
            try {
                dir.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        this.directory = dir;
    }

    /**
     * Reads the subclass-specific data source and adds its content to the index.
     *
     * @throws Exception if reading the source or writing the index fails
     */
    public abstract void buildIndex() throws Exception;

    /**
     * Closes the index writer and then the directory. The directory is closed even
     * when closing the writer fails; a secondary failure is attached to the primary
     * one as a suppressed exception.
     *
     * @throws IOException if closing the writer or the directory fails
     */
    @Override
    public void close() throws IOException {
        // Resources are closed in reverse declaration order: writer first, then directory.
        try (Directory dir = directory; IndexWriter indexWriter = writer) {
            // Nothing to do here; the statement exists only for its close semantics.
        }
    }
}
三、多场景实现方案
1. Office文档检索
public class DocumentIndexer extends BaseIndexer {
public DocumentIndexer(String indexPath) throws IOException {
super(indexPath);
}
@Override
public void buildIndex() throws Exception {
// 支持docx/xlsx/pptx格式
File folder = new File("docs/");
for (File file : folder.listFiles()) {
String content = parseDocument(file);
addDocument(file.getName(), content, file.getAbsolutePath());
}
}
private String parseDocument(File file) {
// 使用POI解析不同文档格式
if(file.getName().endsWith(".docx")) {
// Word解析逻辑
} else if(file.getName().endsWith(".xlsx")) {
// Excel解析逻辑
} else if(file.getName().endsWith(".pptx")) {
// PPT解析逻辑
}
return extractedText;
}
private void addDocument(String title, String content, String path) {
Document doc = new Document();
doc.add(new TextField("title", title, Field.Store.YES));
doc.add(new TextField("content", content, Field.Store.NO));
doc.add(new StringField("path", path, Field.Store.YES));
writer.addDocument(doc);
}
}
2. 数据库表检索
/**
 * Indexes the rows of the {@code knowledge_base} table. Each row becomes one index
 * document with the fields {@code id}, {@code title} and {@code content}.
 */
public class DatabaseIndexer extends BaseIndexer {
    /** Only the columns that are actually indexed are selected. */
    private static final String SELECT_SQL = "SELECT id, title, content FROM knowledge_base";

    private final Connection connection;

    /**
     * Opens the index and a JDBC connection to the source database.
     *
     * @param indexPath file-system path of the index directory
     * @param dbUrl     JDBC URL of the database
     * @param user      database user
     * @param password  database password
     * @throws Exception if the index or the database connection cannot be opened
     */
    public DatabaseIndexer(String indexPath, String dbUrl, String user, String password) throws Exception {
        super(indexPath);
        Connection conn;
        try {
            conn = DriverManager.getConnection(dbUrl, user, password);
        } catch (SQLException | RuntimeException e) {
            // The index was already opened by super(...); release it so a failed
            // connection attempt does not leave the index resources open.
            try {
                super.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        this.connection = conn;
    }

    /**
     * Reads every row of {@code knowledge_base} and adds it to the index.
     * SQL NULL values are indexed as empty strings because Lucene fields reject null values.
     *
     * @throws Exception if the query or the index write fails
     */
    @Override
    public void buildIndex() throws Exception {
        try (Statement stmt = connection.createStatement();
             ResultSet rs = stmt.executeQuery(SELECT_SQL)) {
            while (rs.next()) {
                Document doc = new Document();
                doc.add(new StringField("id", nullToEmpty(rs.getString("id")), Field.Store.YES));
                doc.add(new TextField("title", nullToEmpty(rs.getString("title")), Field.Store.YES));
                doc.add(new TextField("content", nullToEmpty(rs.getString("content")), Field.Store.NO));
                writer.addDocument(doc);
            }
        }
    }

    /**
     * Closes the index resources and then the database connection.
     *
     * @throws IOException if closing the index or the connection fails; a connection
     *                     failure is wrapped with the original SQLException as cause
     */
    @Override
    public void close() throws IOException {
        // The connection is closed after super.close() returns or throws.
        try (Connection conn = connection) {
            super.close();
        } catch (SQLException e) {
            throw new IOException("Failed to close database connection", e);
        }
    }

    /** Maps a SQL NULL (Java null) to the empty string. */
    private static String nullToEmpty(String value) {
        return value == null ? "" : value;
    }
}
3. Wiki知识库检索
public class WikiIndexer extends BaseIndexer {
public WikiIndexer(String indexPath) throws IOException {
super(indexPath);
}
@Override
public void buildIndex() throws Exception {
List<String> urls = fetchAllWikiUrls(); // 获取所有页面链接
for(String url : urls) {
String content = fetchWikiContent(url);
addDocument(url, content);
}
}
private String fetchWikiContent(String url) {
// 使用Jsoup解析HTML内容
Document doc = Jsoup.connect(url).get();
return doc.select(".wiki-content").text();
}
}
四、场景差异对比表
对比维度 | Office文档 | 数据库表 | Wiki网站 |
---|---|---|---|
数据来源 | 本地文件系统 | 关系型数据库 | Web服务器 |
解析方式 | Apache POI/Tika | JDBC直连查询 | HTTP请求+HTML解析 |
更新频率 | 文件变动监听 | 数据库触发器/定时任务 | 定时爬取 |
存储结构 | 非结构化文本 | 结构化字段映射 | 半结构化HTML内容 |
增量更新 | 文件修改时间戳判断 | 增量ID/时间戳查询 | 页面Last-Modified头验证 |
性能考量 | 大文件分块处理 | 批量提交优化 | 爬虫速率限制 |
五、典型搜索实现
/**
 * Runs full-text queries against the {@code content} field of an index and returns
 * the stored {@code title} and {@code path} of the best matches.
 */
public class Searcher {
    /** Maximum number of hits returned by a single search. */
    private static final int MAX_RESULTS = 10;

    /**
     * Searches the index at {@code indexPath}.
     *
     * @param indexPath file-system path of the index directory
     * @param queryStr  query in Lucene classic query-parser syntax
     * @return up to {@value #MAX_RESULTS} hits in the order returned by the searcher;
     *         {@code title} or {@code path} is null for documents that do not store that field
     * @throws Exception if the index cannot be read or the query cannot be parsed
     */
    public List<SearchResult> search(String indexPath, String queryStr) throws Exception {
        // try-with-resources releases the directory, reader and analyzer even when
        // parsing or searching throws.
        try (FSDirectory indexDir = FSDirectory.open(Paths.get(indexPath));
             DirectoryReader reader = DirectoryReader.open(indexDir);
             StandardAnalyzer analyzer = new StandardAnalyzer()) {
            IndexSearcher searcher = new IndexSearcher(reader);
            QueryParser parser = new QueryParser("content", analyzer);
            TopDocs results = searcher.search(parser.parse(queryStr), MAX_RESULTS);

            List<SearchResult> matches = new ArrayList<>(results.scoreDocs.length);
            for (ScoreDoc scoreDoc : results.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                matches.add(new SearchResult(
                        doc.get("title"),
                        doc.get("path"),
                        scoreDoc.score));
            }
            return matches;
        }
    }
}
六、实施注意事项
- 分词策略:根据中文特性建议使用IKAnalyzer替代StandardAnalyzer
- 权限控制:Wiki爬取需处理Cookie认证和反爬机制
- 增量索引:建议为数据库表增加last_modified字段
- 性能优化:文档超过10MB时,建议按段落或固定长度分块后分别建立索引(对应上表"大文件分块处理")
- 异常处理:添加RetryPolicy应对网络波动
- 日志追踪:在调用Document.add()添加字段时记录原始数据ID
完整项目包含以下模块:
src/
├── main/
│ ├── java/
│ │ ├── indexer/ # 各类型索引构建类
│ │ ├── searcher/ # 搜索服务类
│ │ ├── model/ # 数据模型定义
│ │ └── App.java # 启动类
│ └── resources/
│ └── log4j.properties # 日志配置
└── test/ # 单元测试