lucene全文检索过程
1.需求:按行读取文件内容,对每一行的内容进行分词,并支持按分词结果进行匹配检索。
2.引入lucene的maven依赖
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>7.1.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queryparser -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>7.1.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-highlighter -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>7.1.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers-common -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>7.1.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers -->
<!-- NOTE(review): this artifact is pinned to 2.4.1 while every other Lucene
     artifact in this list is 7.1.0. Confirm it is really required and that the
     version mismatch does not put conflicting classes on the classpath. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers</artifactId>
<version>2.4.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers-smartcn -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-smartcn</artifactId>
<version>7.1.0</version>
</dependency>
3.代码分为创建lucene索引与读取文件内容按行读取
1>创建lucene索引的步骤代码
// Open the index directory and build a writer that appends to an existing
// index, or creates one when none exists yet.
Directory directory = FSDirectory.open(Paths.get("D:\\temp\\0208\\index"));
IndexWriterConfig writerConfig = new IndexWriterConfig(new SmartChineseAnalyzer());
writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
IndexWriter indexWriter = new IndexWriter(directory, writerConfig);

// Line number of the record being indexed, rendered once and reused below.
String lineNo = String.valueOf(lineNumber.get());

// One document per line. Field constructor arguments are: field name,
// field value, and (where present) whether the value is stored.
Document document = new Document();
// Name of the source file.
document.add(new TextField("filename", fileName, Field.Store.YES));
// Line content, prefixed with its line number.
document.add(new TextField("keyWord", String.join("==>", "行号", lineNo, strLineRecord), Field.Store.YES));
// File path: not analyzed, not indexed, only stored.
document.add(new StoredField("path", f.getAbsolutePath()));
// Line number on its own.
document.add(new StringField("lineNumber", lineNo, Field.Store.YES));

// Add the document to the index, flush it, then release the writer.
indexWriter.addDocument(document);
indexWriter.commit();
indexWriter.close();
2>读取文件内容代码,我的文件不大,单个最多100MB,所以采用java8的文件读取方式
// Streams the file line by line (UTF-8) and prints every line together with
// its 1-based line number. The stream is closed by try-with-resources.
try (Stream<String> lines = Files.lines(filePath, Charset.forName("UTF-8"))) {
    // Counter captured by the lambda; incremented exactly once per line.
    AtomicInteger lineNumber = new AtomicInteger();
    // forEachOrdered is the terminal operation meant for side effects and keeps
    // file order, so the counter no longer has to be mutated inside a filter
    // predicate and no dummy count() is needed to drive the pipeline.
    lines.forEachOrdered(strLineRecord ->
            System.out.println("行号" + lineNumber.incrementAndGet() + "内容:" + strLineRecord));
}
3>二者结合出的代码如下
Directory directory = FSDirectory.open(Paths.get("D:\\temp\\0208\\index"));
Analyzer analyzer = new SmartChineseAnalyzer();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
IndexWriter indexWriter = new IndexWriter(directory, config);
try (Stream<String> lines = Files.lines(filePath, Charset.forName("UTF-8"))) {
AtomicInteger lineNumber = new AtomicInteger();
long count = lines.filter((line) -> {
//当前的行号
int curLineNumber = lineNumber.incrementAndGet();
lineNumber.set(curLineNumber);
return true;
})
.map((strLineRecord) -> {
//文件名
try {
File f = new File(filePath.toString());
String fileName = f.getName();
//文件内容
Field fileNameField = new TextField("filename", fileName, Field.Store.YES);
//创建文件名域
//第一个参数:域的名称
//第二个参数:域的内容
//第三个参数:是否存储
Field line = new StringField("lineNumber",String.valueOf(lineNumber.get()), Field.Store.YES);
Field fileContentField = new TextField("keyWord", String.join("==>", "行号", String.valueOf(lineNumber.get()), strLineRecord), Field.Store.YES);
//文件路径域(不分析、不索引、只存储)
Field filePathField = new StoredField("path", f.getAbsolutePath());
//文件大小域
//创建document对象
Document document = new Document();
document.add(fileNameField);
document.add(fileContentField);
document.add(filePathField);
document.add(line);
//创建索引,并写入索引库
indexWriter.addDocument(document);
} catch (Exception e) {
e.printStackTrace();
}
return null;
})
.filter(Objects::nonNull)
.count();
}
//执行创建
indexWriter.commit();
//关闭indexwriter
indexWriter.close();
4.多线程创建读取文件并创建索引
多线程读并不会产生问题,但是涉及lucene创建索引时便会进行写文件的操作,因此我们需要将写操作分离出来。采用ExecutorCompletionService来确保所有的线程均执行完毕
String filePaths = MapUtils.getString(map, "filePaths");
JSONArray paths = JSON.parseArray(filePaths);
Integer maxPoolSize = Integer.parseInt(ConfigSupport.getInstance().getProperty("pool.max.total"));
ExecutorService executor = Executors.newFixedThreadPool(paths.size()>maxPoolSize?maxPoolSize:paths.size());
ExecutorCompletionService<String> completionService = new ExecutorCompletionService<>(executor);
IndexWriter indexWriter = logAnalyze.getIndexWriter();
try {
for (int i = 0; i < paths.size(); i++) {
String path = String.valueOf(paths.get(i));
completionService.submit(new Callable<String>() {
@Override
public String call() throws Exception {
logAnalyze.addFileContentToWriter(indexWriter,Paths.get(path));
return path;
}
});
}
for (int i = 0; i < paths.size(); i++) {
Future<String> future = completionService.take();
if (future != null) {
String str = future.get();
}
}
}catch (Exception e) {
log.error("异常"+e);
} finally {
log.info("创建成功:索引文件源"+filePaths);
logAnalyze.commitCreateIndex(indexWriter);
executor.shutdown();
}
LogAnalyze.java中提供上边所需要的几个方法如下
/**
 * Opens an {@link IndexWriter} on {@code indexDirectory} using a
 * {@code SmartChineseAnalyzer} and {@code CREATE_OR_APPEND} open mode.
 *
 * @return the opened writer, or {@code null} when it could not be created
 *         (the failure is printed to standard error)
 */
public IndexWriter getIndexWriter(){
    Directory directory = null;
    try {
        directory = FSDirectory.open(Paths.get(indexDirectory));
        IndexWriterConfig config = new IndexWriterConfig(new SmartChineseAnalyzer());
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        return new IndexWriter(directory, config);
    } catch (Exception e) {
        e.printStackTrace();
        // The writer was never created, so nothing else will close the
        // directory; release it here instead of leaking it.
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
    return null;
}
/**
 * Reads {@code filePath} line by line as UTF-8 and adds one document per line
 * to {@code indexWriter}. Each document carries the file name, the line
 * content, the absolute file path (stored only) and the 1-based line number.
 *
 * <p>Failures are printed to standard error and never propagated: a line that
 * cannot be indexed is skipped, and a read error ends processing of the file.
 * This method neither commits nor closes the writer.
 *
 * @param indexWriter writer that receives the documents
 * @param filePath    file whose lines are indexed
 */
public void addFileContentToWriter(IndexWriter indexWriter,Path filePath){
    try (Stream<String> lines = Files.lines(filePath, Charset.forName("UTF-8"))) {
        // File name and path are identical for every line, so resolve them once.
        File f = new File(filePath.toString());
        String fileName = f.getName();
        String absolutePath = f.getAbsolutePath();
        AtomicInteger lineNumber = new AtomicInteger();
        // forEachOrdered keeps file order and is the terminal operation meant
        // for side effects; the counter is bumped exactly once per line.
        lines.forEachOrdered(strLineRecord -> {
            String currentLine = String.valueOf(lineNumber.incrementAndGet());
            try {
                // Field constructor arguments: name, value, store flag.
                Document document = new Document();
                document.add(new TextField(LogIndex.IndexField.FILE_NAME.getField(), fileName, Field.Store.YES));
                document.add(new TextField(LogIndex.IndexField.LINE_CONTENT.getField(), strLineRecord, Field.Store.YES));
                // File path: not analyzed, not indexed, only stored.
                document.add(new StoredField(LogIndex.IndexField.FILE_PATH.getField(), absolutePath));
                document.add(new StringField(LogIndex.IndexField.LINE_NUMBER.getField(), currentLine, Field.Store.YES));
                indexWriter.addDocument(document);
            } catch (Exception e) {
                // Best effort: report the line that failed and continue.
                e.printStackTrace();
            }
        });
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Commits pending changes on {@code indexWriter} and then closes it.
 * The writer is closed even when the commit fails, so the index write lock
 * is not left held. Failures are printed to standard error, not propagated.
 *
 * @param indexWriter writer to commit and close; {@code null} is ignored
 */
public void commitCreateIndex(IndexWriter indexWriter){
    if (indexWriter == null) {
        // Nothing was opened, so there is nothing to commit or close.
        return;
    }
    // try-with-resources guarantees close() runs whether or not commit() throws.
    try (IndexWriter writer = indexWriter) {
        writer.commit();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
5.lucene查找索引
直接附上高亮的代码,自己修改即可
/**
 * Searches the line-content field of the index under {@code indexDirectory}
 * and returns up to 100 hits with the matched terms wrapped in
 * {@code <font color=red>...</font>}.
 *
 * <p>The query text is read from {@code searchContent} under the key
 * {@code LogIndex.IndexField.LINE_CONTENT.getField()}. Any failure is printed
 * to standard error and the hits collected so far are returned.
 *
 * @param searchContent map holding the query text for the line-content field
 * @return the matching lines; empty when nothing matched or the search failed
 */
public List<LogIndex> mutilSearchHighLightIndex(Map<String, String> searchContent) {
    List<LogIndex> result = new ArrayList<>();
    // try-with-resources closes reader, directory and analyzer on every path.
    // It also removes the NullPointerException the old finally block raised
    // when opening the index had failed and the reader was still null.
    try (Directory directory = FSDirectory.open(Paths.get(indexDirectory));
         IndexReader indexReader = DirectoryReader.open(directory);
         SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer()) {
        String contentField = LogIndex.IndexField.LINE_CONTENT.getField();
        MultiFieldQueryParser queryParser = new MultiFieldQueryParser(new String[]{contentField}, analyzer);
        Query query = queryParser.parse(searchContent.get(contentField));

        // Highlighter that wraps matched terms in a red <font> tag.
        Formatter formatter = new SimpleHTMLFormatter("<font color=red>", "</font>");
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));

        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        // Second argument is the maximum number of hits to return.
        TopDocs topDocs = indexSearcher.search(query, 100);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            // scoreDoc.doc is the id used to load the stored document.
            Document document = indexSearcher.doc(scoreDoc.doc);
            String lineContent = document.get(contentField);

            // Size the fragment to the whole line so it is returned untruncated.
            highlighter.setTextFragmenter(new SimpleFragmenter(lineContent.length()));
            String highlightLineContent = highlighter.getBestFragment(analyzer, contentField, lineContent);
            if (highlightLineContent == null) {
                // No highlighted fragment was produced; keep the raw line
                // rather than handing callers a null content.
                highlightLineContent = lineContent;
            }

            LogIndex logIndex = new LogIndex();
            logIndex.setLineNumber(document.get(LogIndex.IndexField.LINE_NUMBER.getField()));
            logIndex.setFileName(document.get(LogIndex.IndexField.FILE_NAME.getField()));
            logIndex.setFilePath(document.get(LogIndex.IndexField.FILE_PATH.getField()));
            logIndex.setLineContent(highlightLineContent);
            System.out.println("logIndex = " + logIndex);
            result.add(logIndex);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result;
}