目录
一、lucene结构介绍
Document:被索引的文档
IndexWriter:索引的写出工具类,通过addDocument方法将文档添加到索引中,实现索引的创建
Query:用户发出请求时的查询语句
Index:Lucene的反向索引(倒排索引)
IndexSearcher:通过函数search搜索lucene index
TopDocsCollector:返回给用户的文档集合
// Path of the on-disk Chinese index directory. NOTE(review): relative path —
// resolution depends on the process working directory; confirm deployment layout.
private String indexdir="./chindex";
// Lucene directory abstraction over indexdir.
private Directory dir;
// Reader over the index; opened once here and shared by the search methods.
private IndexReader reader;
// Searcher used by search()/search_eng(); wraps the reader above.
private IndexSearcher searcher;
// Analyzer used to tokenize query/highlight text; IKAnalyzer handles Chinese.
private Analyzer analyzer;
/**
 * Opens the index at {@code indexdir} and wires up the reader, searcher and
 * analyzer used by the query methods.
 *
 * NOTE(review): no close() method is visible in this chunk — the Directory and
 * IndexReader appear to live for the lifetime of the DAO; confirm they are
 * released elsewhere.
 *
 * @throws IOException if the index directory cannot be opened or read
 */
public ChineseDao() throws IOException {
dir = FSDirectory.open(new File(indexdir)); // File overload => Lucene 4.x-era API
reader = DirectoryReader.open(dir);
searcher = new IndexSearcher(reader);
analyzer = new IKAnalyzer(true); // true => smart (coarse-grained) Chinese segmentation, per IK convention
}
本项目的本质是一个双语语料库,因此需要允许用户既可以根据中文查询,也可以根据英文查询,由于中英分词的方式不同,针对中文和英文分别建立了两个索引文件,因此查询索引文件的部分也根据中文和英文分别进行查询。
二、中文查询
/**
 * Searches the Chinese ("ch") field of the index and returns up to 1000
 * matching bilingual entries, with the matched Chinese terms wrapped in a
 * green bold HTML span for display.
 *
 * Fixes vs. original: the TokenStream is now closed (try-with-resources),
 * a null result from getBestFragment no longer clobbers the original text,
 * and a missing "ch" field no longer throws NullPointerException.
 *
 * @param query parsed Lucene query targeting the Chinese index
 * @return list of matching entries; empty (never null) on error or no hits
 */
public List<ContentInfo> search(Query query) {
List<ContentInfo> list = new ArrayList<>();
try {
QueryScorer scorer = new QueryScorer(query);
SimpleHTMLFormatter fors = new SimpleHTMLFormatter("<span style=\"color:green; font-weight:bold\">", "</span>");
Highlighter highlighter = new Highlighter(fors, scorer);
TopDocs hits = searcher.search(query, 1000); // cap at 1000 hits
for (ScoreDoc scoreDoc : hits.scoreDocs) {
Document doc = searcher.doc(scoreDoc.doc); // resolve stored fields for this hit
ContentInfo content = new ContentInfo();
content.setCh(doc.get("ch"));
content.setEng(doc.get("eng"));
content.setBook(doc.get("book"));
content.setContentId(doc.get("oid"));
String ch = content.getCh();
if (ch != null) { // guard: documents without a "ch" field would NPE on length()
// fragment sized to the whole field so the full sentence is returned
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, ch.length()));
// TokenStream must be closed, otherwise the analyzer cannot be reused safely
try (TokenStream tokenStream = analyzer.tokenStream("ch", new StringReader(ch))) {
String highlighted = highlighter.getBestFragment(tokenStream, ch);
if (highlighted != null) { // null when the query matched no terms in this field
content.setCh(highlighted);
}
}
}
list.add(content);
}
} catch (IOException | InvalidTokenOffsetsException e) {
// NOTE(review): best-effort — errors yield an empty/partial list; prefer a logger
e.printStackTrace();
}
return list;
}
三、英文查询
/*
根据english关键词查询段落
*/
/**
 * Searches the English ("eng") field of the index and returns up to 50
 * matching bilingual entries, with the matched English terms wrapped in a
 * green bold HTML span for display.
 *
 * Fixes vs. original: the formatter {@code fors} was an undefined name here
 * (it was a local of search()) — it is now created locally, so this method
 * compiles; the TokenStream is closed, a null best-fragment no longer
 * clobbers the original text, and a missing "eng" field no longer NPEs.
 *
 * @param query parsed Lucene query targeting the English index
 * @return list of matching entries; empty (never null) on error or no hits
 */
public List<ContentInfo> search_eng(Query query){
List<ContentInfo> list = new ArrayList<>();
try {
QueryScorer scorer = new QueryScorer(query);
// same highlight markup as search(), declared locally (was previously undefined here)
SimpleHTMLFormatter fors = new SimpleHTMLFormatter("<span style=\"color:green; font-weight:bold\">", "</span>");
Highlighter highlighter = new Highlighter(fors, scorer);
TopDocs hits = searcher.search(query, 50); // cap at 50 hits
for (ScoreDoc scoreDoc : hits.scoreDocs) {
org.apache.lucene.document.Document doc = searcher.doc(scoreDoc.doc); // resolve stored fields
ContentInfo content = new ContentInfo();
content.setCh(doc.get("ch"));
content.setEng(doc.get("eng"));
content.setBook(doc.get("book"));
content.setContentId(doc.get("oid"));
String eng = content.getEng();
if (eng != null) { // guard: documents without an "eng" field would NPE on length()
// fragment sized to the whole field so the full sentence is returned
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, eng.length()));
// TokenStream must be closed, otherwise the analyzer cannot be reused safely
try (TokenStream tokenStream = analyzer.tokenStream("eng", new StringReader(eng))) {
String highlighted = highlighter.getBestFragment(tokenStream, eng);
if (highlighted != null) { // null when the query matched no terms in this field
content.setEng(highlighted);
}
}
}
list.add(content);
}
} catch (IOException | InvalidTokenOffsetsException e) {
// NOTE(review): best-effort — errors yield an empty/partial list; prefer a logger
e.printStackTrace();
}
return list;
}