在本篇博客中,我们来构建一个简单而通用的搜索查询接口。这个接口需要实现基本的增、删、改、查功能,并且做到通用、易用、可扩展性强。在实际应用Lucene的过程中,有两类用途最为常见:一类是文档库的搜索查询(可以扩展为各种搜索引擎),另一类是知识问答库的搜索查询(可以扩展为类似小黄鸡的智能对话机器人)。接口的类图如下:
为了便于大家使用,在此处将全部源码公开。
DAO的基类LuceneDao提供常用的增、删、改、查方法,并将“根据资料生成Document”和“显示查询结果”这两个扩展点抽象出来,子类可以根据不同的资料类型进行扩展实现:
package com.hsdl.lucene;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Base DAO for a file-system Lucene index: add, delete, update and search documents.
 *
 * <p>Subclasses implement {@link #getDocument(Stuff)} to turn a {@link Stuff} into a Lucene
 * {@link Document}, and may override {@link #displaySearchResult(Document[])} to print hits.
 * Every write operation opens its own {@link IndexWriter} through {@link #getIndexWrite()} and
 * always releases it before returning: the writer is closed after a successful commit and
 * rolled back if the operation failed, so the index write lock is never left behind.
 */
public abstract class LuceneDao {
    /** Analyzer used for both indexing and query parsing. */
    private Analyzer analyzer = new IKAnalyzer(true);

    /** Directory on disk that holds the index files. */
    private String indexPath = "D:/work/lucene/tika/index";

    /**
     * Indexes a single item and commits it.
     *
     * @param stuff item to index
     * @throws Exception if the document cannot be built or the index cannot be written
     */
    public void add(Stuff stuff) throws Exception {
        createIndex(stuff);
    }

    /**
     * Indexes several items with one writer and one commit.
     *
     * @param stuffs items to index
     * @throws Exception if a document cannot be built or the index cannot be written
     */
    public void batchAdd(List<Stuff> stuffs) throws Exception {
        createIndexs(stuffs);
    }

    /**
     * Deletes every document whose field {@code fieldName} contains the exact term
     * {@code fieldVaule}.
     *
     * <p>Failures are not propagated: the stack trace is printed and the method returns
     * normally. The success message is printed only after the writer has been committed and
     * closed.
     *
     * @param fieldName  name of the field to match
     * @param fieldVaule exact term value to match
     */
    public void delete(String fieldName, String fieldVaule) {
        try {
            IndexWriter writer = getIndexWrite();
            boolean committed = false;
            try {
                Query q = new TermQuery(new Term(fieldName, fieldVaule));
                writer.deleteDocuments(q);
                writer.commit();
                committed = true;
            } finally {
                release(writer, committed);
            }
            System.out.println("删除" + fieldName + "为" + fieldVaule + "的记录成功");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Deletes the documents matching each (field name, term value) pair of the map, using one
     * writer and one commit for the whole batch.
     *
     * @param fieldMap field name to exact term value; each entry is an independent delete
     * @throws Exception if the index cannot be opened or written
     */
    public void batchDelete(Map<String, String> fieldMap) throws Exception {
        IndexWriter writer = getIndexWrite();
        boolean committed = false;
        try {
            for (Map.Entry<String, String> entry : fieldMap.entrySet()) {
                Query q = new TermQuery(new Term(entry.getKey(), entry.getValue()));
                writer.deleteDocuments(q);
                System.out.println("删除" + entry.getKey() + "为" + entry.getValue()
                        + "的记录成功");
            }
            writer.commit();
            committed = true;
        } finally {
            release(writer, committed);
        }
    }

    /**
     * Builds the Lucene document for one item. This is the main extension point.
     *
     * @param stuff item to convert
     * @return the document to add to the index
     * @throws Exception if the document cannot be built
     */
    protected abstract Document getDocument(Stuff stuff) throws Exception;

    /**
     * Replaces the documents whose field {@code fieldName} contains the exact term
     * {@code fieldVaule} with the document built from {@code stuff}.
     *
     * @param fieldName  name of the field identifying the documents to replace
     * @param fieldVaule exact term value identifying the documents to replace
     * @param stuff      item the replacement document is built from
     * @throws Exception if the document cannot be built or the index cannot be written
     */
    public void update(String fieldName, String fieldVaule, Stuff stuff)
            throws Exception {
        IndexWriter writer = getIndexWrite();
        boolean committed = false;
        try {
            Document doc = getDocument(stuff);
            writer.updateDocument(new Term(fieldName, fieldVaule), doc);
            writer.commit();
            committed = true;
        } finally {
            release(writer, committed);
        }
        System.out.println("更新" + fieldName + "为" + fieldVaule + "的记录成功");
    }

    /**
     * Replaces the analyzer used for indexing and query parsing.
     *
     * @param analyzer analyzer to use from now on
     */
    public void setAnalyzer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    /**
     * Sets the directory that holds the index files.
     *
     * @param indexPath path of the index directory
     */
    public void setIndexPath(String indexPath) {
        this.indexPath = indexPath;
    }

    /**
     * Opens a writer, indexes one item, commits and releases the writer.
     *
     * @param stuff item to index
     * @throws Exception if the document cannot be built or the index cannot be written
     */
    protected void createIndex(Stuff stuff) throws Exception {
        IndexWriter iwriter = getIndexWrite();
        boolean committed = false;
        try {
            indexDoc(iwriter, stuff);
            iwriter.commit();
            committed = true;
        } finally {
            release(iwriter, committed);
        }
    }

    /**
     * Adds the document for one item to an already open writer. Subclasses may override this
     * to index an item as more than one document.
     *
     * @param iwriter open writer to add to; not committed or closed here
     * @param stuff   item to index
     * @throws Exception if the document cannot be built or added
     */
    protected void indexDoc(IndexWriter iwriter, Stuff stuff) throws Exception {
        Document doc = getDocument(stuff);
        iwriter.addDocument(doc);
    }

    /**
     * Opens a writer, indexes every item of the list, commits once and releases the writer.
     * If any item fails, nothing from this batch is committed.
     *
     * @param stuffs items to index
     * @throws Exception if a document cannot be built or the index cannot be written
     */
    protected void createIndexs(List<Stuff> stuffs) throws Exception {
        IndexWriter iwriter = getIndexWrite();
        boolean committed = false;
        try {
            for (Stuff stuff : stuffs) {
                indexDoc(iwriter, stuff);
            }
            // Commit explicitly, as createIndex does, instead of relying on close().
            iwriter.commit();
            committed = true;
        } finally {
            release(iwriter, committed);
        }
    }

    /**
     * Opens an {@link IndexWriter} on the configured index path in
     * {@code CREATE_OR_APPEND} mode with the configured analyzer.
     *
     * @return a new writer; the caller is responsible for closing it
     * @throws IOException if the directory or the writer cannot be opened
     */
    protected IndexWriter getIndexWrite() throws IOException {
        Directory directory = FSDirectory.open(new File(indexPath));
        boolean opened = false;
        try {
            IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_45,
                    analyzer);
            iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
            IndexWriter iwriter = new IndexWriter(directory, iwConfig);
            opened = true;
            return iwriter;
        } finally {
            // Nobody else holds the directory if the writer could not be created.
            if (!opened) {
                directory.close();
            }
        }
    }

    /**
     * Parses {@code searchKeyStr} against {@code searchField} (terms are combined with AND by
     * default) and returns the best matching documents.
     *
     * @param searchField  default field the query is parsed against
     * @param searchKeyStr query text in QueryParser syntax
     * @param topCount     maximum number of hits to return
     * @return the stored documents of the top hits, best match first; empty if nothing matches
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           if the index cannot be read
     * @throws ParseException        if {@code searchKeyStr} is not a valid query
     */
    public Document[] search(String searchField, String searchKeyStr,
            int topCount) throws CorruptIndexException, IOException,
            ParseException {
        Directory directory = FSDirectory.open(new File(indexPath));
        try {
            IndexReader ireader = DirectoryReader.open(directory);
            try {
                IndexSearcher isearcher = new IndexSearcher(ireader);
                QueryParser qp = new QueryParser(Version.LUCENE_45, searchField,
                        analyzer);
                qp.setDefaultOperator(QueryParser.AND_OPERATOR);
                Query query = qp.parse(searchKeyStr);
                TopDocs topDocs = isearcher.search(query, topCount);
                // Load the stored fields now: the reader is closed before returning.
                Document[] docs = new Document[topDocs.scoreDocs.length];
                for (int i = 0; i < docs.length; i++) {
                    docs[i] = isearcher.doc(topDocs.scoreDocs[i].doc);
                }
                return docs;
            } finally {
                ireader.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Prints a header with the number of hits. Subclasses override this to print each hit.
     *
     * @param docs search hits to report
     */
    public void displaySearchResult(Document[] docs) {
        System.out.println("开始显示搜索查询结果....\n返回查询条数:" + docs.length);
    }

    /**
     * Adds each entry of {@code attachData} to the document as a stored {@link StringField}.
     * Typically these are database keys that let the caller look up the full record after a
     * search. Does nothing when {@code attachData} is {@code null}.
     *
     * @param doc        document to add the fields to
     * @param attachData field name to value; may be {@code null}
     */
    protected void addAttacheData(Document doc, Map<String, String> attachData) {
        if (attachData == null) {
            return;
        }
        for (Map.Entry<String, String> entry : attachData.entrySet()) {
            doc.add(new StringField(entry.getKey(), entry.getValue(),
                    Field.Store.YES));
        }
    }

    /**
     * Releases a writer at the end of an operation: closes it after a successful commit,
     * otherwise rolls it back so that no partial change is committed. Either way the writer
     * ends up closed.
     *
     * @param writer    writer to release
     * @param committed whether the operation reached its commit successfully
     * @throws IOException if closing the writer fails after a successful commit
     */
    private static void release(IndexWriter writer, boolean committed) throws IOException {
        if (committed) {
            writer.close();
            return;
        }
        try {
            writer.rollback();
        } catch (IOException ignored) {
            // The operation has already failed; let that original exception reach the caller.
        }
    }
}
文档库资料对象的基类Stuff,我们将资料内容之外的其他数据放入Map中,作为附加数据。
package com.hsdl.lucene;
import java.util.Map;
/**
 * Base class for anything that can be put into the document library.
 *
 * <p>It carries only the attached data: string key/value pairs that accompany the item's
 * actual content.
 *
 * @author alex
 */
public class Stuff {

    /** Extra key/value pairs for this item; {@code null} until set. */
    private Map<String, String> extras;

    /**
     * Stores the attached data map as given (no copy is made).
     *
     * @param attacheData map to attach; may be {@code null}
     */
    public void setAttacheData(Map<String, String> attacheData) {
        extras = attacheData;
    }

    /**
     * @return the map passed to {@link #setAttacheData(Map)}, or {@code null} if none was set
     */
    public Map<String, String> getAttacheData() {
        return extras;
    }
}
文件资料对象FileStuff,在这个类中有文件路径以及代表文件内容的域的名字,在构建索引和搜索时使用:
package com.hsdl.lucene;
/**
 * A library item backed by a file (or directory) on disk.
 *
 * @author alex
 */
public class FileStuff extends Stuff {

    /** Location of the file or directory, as set by the caller. */
    private String path;

    /** Name of the index field meant to hold the file content. */
    private String contentField;

    /**
     * @return the path set with {@link #setFilePath(String)}, or {@code null} if unset
     */
    public String getFilePath() {
        return path;
    }

    /**
     * @param filePath path of the file or directory this item represents
     */
    public void setFilePath(String filePath) {
        path = filePath;
    }

    /**
     * @return the content field name set with {@link #setContentFieldName(String)},
     *         or {@code null} if unset
     */
    public String getContentFieldName() {
        return contentField;
    }

    /**
     * @param contentFieldName name of the index field that represents the file content
     */
    public void setContentFieldName(String contentFieldName) {
        contentField = contentFieldName;
    }
}
知识问答资料 AskAnswerStuff:
package com.hsdl.lucene;
/**
 * A question-and-answer knowledge item.
 *
 * @author alex
 */
public class AskAnswerStuff extends Stuff {

    /** The question text. */
    private String question;

    /** The answer text. */
    private String reply;

    /** Content field name; NOTE(review): presumably the index field to search - confirm. */
    private String contentField;

    /**
     * @return the question set with {@link #setAsk(String)}, or {@code null} if unset
     */
    public String getAsk() {
        return question;
    }

    /**
     * @param ask question text
     */
    public void setAsk(String ask) {
        question = ask;
    }

    /**
     * @return the answer set with {@link #setAnswer(String)}, or {@code null} if unset
     */
    public String getAnswer() {
        return reply;
    }

    /**
     * @param answer answer text
     */
    public void setAnswer(String answer) {
        reply = answer;
    }

    /**
     * @return the content field name set with {@link #setContentFieldName(String)},
     *         or {@code null} if unset
     */
    public String getContentFieldName() {
        return contentField;
    }

    /**
     * @param contentFieldName content field name to remember
     */
    public void setContentFieldName(String contentFieldName) {
        contentField = contentFieldName;
    }
}
文档库访问之文件对象实现LuceneDaoFileImpl:
package com.hsdl.lucene;
import java.io.File;
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.tika.Tika;
/**
 * {@link LuceneDao} implementation that indexes files.
 *
 * <p>Each file becomes one document with three kinds of fields: the attached data of the
 * {@link Stuff}, a {@code content} text field fed from the Reader that Tika returns for the
 * file, and stored {@code fileName} and {@code path} string fields. When the
 * {@link FileStuff} path is a directory, every readable file below it is indexed
 * recursively, all sharing the same attached data.
 *
 * <p>NOTE(review): the content field is always named {@code "content"};
 * {@link FileStuff#getContentFieldName()} is not consulted when indexing.
 *
 * @author alex
 */
public class LuceneDaoFileImpl extends LuceneDao {
    /** Name of the field the file content is indexed under. */
    private static final String CONTENT_FIELD_NAME = "content";

    /** Shared Tika facade used to obtain a Reader over a file's text. */
    private static final Tika TIKA = new Tika();

    /**
     * Indexes the file of a {@link FileStuff}; a directory is walked recursively.
     *
     * @param iwriter open writer to add to
     * @param stuff   must be a {@link FileStuff}
     * @throws Exception if a file cannot be parsed or added
     */
    @Override
    protected void indexDoc(IndexWriter iwriter, Stuff stuff) throws Exception {
        FileStuff fileStuff = (FileStuff) stuff;
        File file = new File(fileStuff.getFilePath());
        if (file.isDirectory()) {
            indexDocByFileDir(iwriter, file, stuff.getAttacheData());
        } else {
            super.indexDoc(iwriter, stuff);
        }
    }

    /**
     * Adds one document per readable regular file found at or below {@code file}.
     * Unreadable files and directories are skipped silently.
     *
     * @param iwriter    open writer to add to
     * @param file       file or directory to index
     * @param attachData attached data added to every document; may be {@code null}
     * @throws IOException if a file cannot be parsed or added
     */
    private void indexDocByFileDir(IndexWriter iwriter, File file,
            Map<String, String> attachData) throws IOException {
        if (!file.canRead()) {
            return;
        }
        if (!file.isDirectory()) {
            iwriter.addDocument(getDocument(file, attachData));
            return;
        }
        String[] children = file.list();
        if (children == null) {
            return;
        }
        for (String child : children) {
            indexDocByFileDir(iwriter, new File(file, child), attachData);
        }
    }

    /**
     * Builds the document for one file.
     *
     * @param file       file to index
     * @param attachData attached data to add as stored fields; may be {@code null}
     * @return document with the attached data, content, {@code fileName} and {@code path}
     * @throws IOException if Tika cannot open the file
     */
    protected Document getDocument(File file, Map<String, String> attachData)
            throws IOException {
        Document doc = new Document();
        addAttacheData(doc, attachData);
        // The content is handed to Lucene as the Reader Tika produces for the file.
        doc.add(new TextField(CONTENT_FIELD_NAME, TIKA.parse(file)));
        doc.add(new StringField("fileName", file.getName(), Field.Store.YES));
        doc.add(new StringField("path", file.getAbsolutePath(), Field.Store.YES));
        return doc;
    }

    /**
     * Prints the hit count, then each hit's full document followed by its file name and path.
     *
     * @param docs search hits to print
     */
    @Override
    public void displaySearchResult(Document[] docs) {
        super.displaySearchResult(docs);
        for (int i = 0; i < docs.length; i++) {
            System.out.println("内容:" + docs[i].toString());
            System.out.println(docs[i].get("fileName") + "["
                    + docs[i].get("path") + "]");
        }
    }

    /**
     * Builds the document for the file a {@link FileStuff} points at, using the same field
     * layout as {@link #getDocument(File, Map)}.
     *
     * @param stuff must be a {@link FileStuff}
     * @return document for the file
     * @throws IOException if Tika cannot open the file
     */
    @Override
    protected Document getDocument(Stuff stuff) throws IOException {
        FileStuff fileStuff = (FileStuff) stuff;
        return getDocument(new File(fileStuff.getFilePath()), stuff.getAttacheData());
    }
}
文档库访问之知识问答实现LuceneDaoAskAnswerImpl:
package com.hsdl.lucene;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
/**
 * {@link LuceneDao} implementation for question-and-answer knowledge items.
 *
 * @author alex
 */
public class LuceneDaoAskAnswerImpl extends LuceneDao {

    /**
     * Builds the document for one {@link AskAnswerStuff}: its attached data, the question as
     * a stored text field named {@code ask}, and the answer as a stored string field named
     * {@code answer}.
     *
     * @param stuff must be an {@link AskAnswerStuff}
     * @return the document to index
     * @throws Exception if the document cannot be built
     */
    @Override
    protected Document getDocument(Stuff stuff) throws Exception {
        AskAnswerStuff qa = (AskAnswerStuff) stuff;
        Document result = new Document();
        addAttacheData(result, stuff.getAttacheData());
        // The question is a TextField and the answer a StringField; both are stored.
        result.add(new TextField("ask", qa.getAsk(), Field.Store.YES));
        result.add(new StringField("answer", qa.getAnswer(), Field.Store.YES));
        return result;
    }

    /**
     * Prints the hit count, then each hit's full document followed by its question and answer.
     *
     * @param docs search hits to print
     */
    @Override
    public void displaySearchResult(Document[] docs) {
        super.displaySearchResult(docs);
        for (Document hit : docs) {
            System.out.println("内容:" + hit.toString());
            String summary = hit.get("ask") + ":[" + hit.get("answer") + "]";
            System.out.println(summary);
        }
    }
}
下面我们来编写两个测试类,分别测试文件库的访问以及知识问答库:
LuceneDaoFileTest
package com.hsdl.lucene;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.document.Document;
/**
 * Manual smoke test for file indexing and search: adds a file, searches it, deletes it by
 * its ID, then updates it to point at another file, printing the hits after each step.
 *
 * @author alex
 */
public class LuceneDaoFileTest {

    /** Maximum number of hits requested by every search in this test. */
    private static final int TOP_COUNT = 10;

    public static void main(String[] args) {
        LuceneDao luceneDao = new LuceneDaoFileImpl();
        luceneDao.setIndexPath("D:/work/lucene/filetest/index");

        FileStuff fileStuff = new FileStuff();
        fileStuff.setFilePath("D:/work/lucene/filetest/doc/test.txt");
        Map<String, String> attacheData = new HashMap<String, String>();
        attacheData.put("ID", "001");
        fileStuff.setAttacheData(attacheData);
        fileStuff.setContentFieldName("content");
        String field = fileStuff.getContentFieldName();

        try {
            // Add
            System.err.println("------------开始添加测试------------");
            luceneDao.add(fileStuff);
            searchAndShow(luceneDao, field, "微信收费");
            searchAndShow(luceneDao, field, "网站收费");

            // Delete
            System.err.println("------------开始删除测试------------");
            luceneDao.delete("ID", "001");
            searchAndShow(luceneDao, field, "微信收费");

            // Update: the banner comes first, like in the other phases, so the update's own
            // output is reported under the right heading.
            System.err.println("------------开始更新测试------------");
            fileStuff.setFilePath("D:/work/lucene/filetest/doc/test.xls");
            luceneDao.update("ID", "001", fileStuff);
            searchAndShow(luceneDao, field, "微信收费");
            searchAndShow(luceneDao, field, "网站费用");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Runs one search on {@code field} and prints the hits through the DAO. */
    private static void searchAndShow(LuceneDao dao, String field, String keywords)
            throws Exception {
        Document[] docs = dao.search(field, keywords, TOP_COUNT);
        dao.displaySearchResult(docs);
    }
}
LuceneDaoAskAnswerTest
package com.hsdl.lucene;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.document.Document;
/**
 * Manual smoke test for question-and-answer indexing and search: adds an item, searches it,
 * deletes it by its ID, then updates it with a new question and answer, printing the hits
 * after each step.
 *
 * @author alex
 */
public class LuceneDaoAskAnswerTest {

    /** Field every search in this test runs against. */
    private static final String SEARCH_FIELD = "ask";

    /** Maximum number of hits requested by every search in this test. */
    private static final int TOP_COUNT = 10;

    public static void main(String[] args) {
        LuceneDao luceneDao = new LuceneDaoAskAnswerImpl();
        luceneDao.setIndexPath("D:/work/lucene/askanswer/index");

        AskAnswerStuff askAnswerStuff = new AskAnswerStuff();
        askAnswerStuff.setAsk("微信营销怎么收费?");
        askAnswerStuff.setAnswer("3000元每年,10年25000");
        Map<String, String> attacheData = new HashMap<String, String>();
        attacheData.put("ID", "001");
        askAnswerStuff.setAttacheData(attacheData);

        try {
            // Add
            System.err.println("------------开始添加测试------------");
            luceneDao.add(askAnswerStuff);
            searchAndShow(luceneDao, "微信收费");
            searchAndShow(luceneDao, "网站收费");

            // Delete
            System.err.println("------------开始删除测试------------");
            luceneDao.delete("ID", "001");
            searchAndShow(luceneDao, "微信收费");

            // Update: the banner comes first, like in the other phases, so the update's own
            // output is reported under the right heading.
            System.err.println("------------开始更新测试------------");
            askAnswerStuff.setAsk("网站建设怎么收费?");
            askAnswerStuff.setAnswer("普通企业网站6000,商城网站10000,其他网站价格面议!");
            luceneDao.update("ID", "001", askAnswerStuff);
            searchAndShow(luceneDao, "微信收费");
            searchAndShow(luceneDao, "网站收费");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Runs one search on the question field and prints the hits through the DAO. */
    private static void searchAndShow(LuceneDao dao, String keywords) throws Exception {
        Document[] docs = dao.search(SEARCH_FIELD, keywords, TOP_COUNT);
        dao.displaySearchResult(docs);
    }
}