最近利用晚上下班后和周末的时间,自己捣腾着封装了一个属于自己的搜索引擎(基于lucene和solr).现在把大概的思路写出来,分享一下:
1.首先是索引对象,也可以说是查询的VO对象.它封装了几个常用的公共字段(如:主键,所属者ID,所属者姓名,进入详情页面的link,创建时间等);其他各个模块特有的字段(如:标题,内容,邮箱等)则由子类通过抽象方法来指定.
SearchBean.java
字段的代码如下:
1
/** *//********以下 共有字段***********/2
/** *//**3 * 检索的内容4*/5protectedString keyword;6
/** *//**7 * 拥有者ID8*/9protectedString owerId;10
/** *//**11 * 拥有者name12*/13protectedString owerName;14
/** *//**15 * 检索对象的唯一标识位的值16*/17protectedString id;18
/** *//**19 * 检索出对象后进入详情页面的链接20*/21protectedString link;22
/** *//**23 * 创建时间24*/25protectedString createDate;26
/** *//**27 * index类型28*/29protectedString indexType;3031//setter,getter方法省略32
/** *//********以上 共有字段***********/3334
/** *//*************以下 其他字段************/35
/** *//**36 * 需要检索出来的字段及其值的对应map37*/38privateMapsearchValues;3940
/** *//**41 * 值对象42*/43privateObject object;4445
/** *//**46 * 获取检索出来的doIndexFields字段的值47 *48 *@return49*/50
publicMapgetSearchValues(){51returnsearchValues;52 }5354
/** *//**55 * 设置检索出来的doIndexFields字段的值56 *57 *@paramsearchValues58*/59
publicvoidsetSearchValues(MapsearchValues){60this.searchValues=searchValues;61 }62
/** *//********************以上 其他字段*******************/
抽象方法代码如下:
1
/** *//*****************以下 抽象方法******************/2
/** *//**3 * 返回需要进行检索的字段4 *5 *@return6*/7publicabstractString[] getDoSearchFields();89
/** *//**10 * 进行索引的字段11 *12 *@return13*/14publicabstractString[] getDoIndexFields();1516
/** *//**17 * 初始化searchBean中的公共字段(每个对象都必须创建的索引字段)18 *@throwsException19*/20publicabstractvoidinitPublicFields()throwsException;2122
/** *//**23 * 返回索引类型24 *25 *@return26*/27publicabstractString getIndexType();28
/** *//*****************以上 抽象方法********************/
共有的方法:
1
/** *//*******************以下 公共方法**********************/2
/** *//**3 * 获取需要创建索引字段的键值对map4 *5 *@return6*/7
publicMapgetIndexFieldValues(){8
if(this.object==null){9 logger.warn("given object is null!");10returnCollections.emptyMap();11 }1213 String[] doIndexFields=this.getDoIndexFields();14
if(doIndexFields==null||doIndexFields.length<1){15 logger.debug("given no doIndexFields!");16returnCollections.emptyMap();17 }1819 MapextInfo=newHashMap();20
for(String f : doIndexFields){21 String value=getValue(f, object);22 extInfo.put(f, value);23 }2425returnextInfo;26 }2728
/** *//**29 * 获取一个对象中的某个字段的值,结果转化成string类型30 *31 *@paramfield 字段名称32 *@paramobj 对象33 *@return34*/35
privateString getValue(String field, Object obj){36
if(StringUtils.isEmpty(field)){37 logger.warn("field is empty!");38returnStringUtils.EMPTY;39 }4041 String result=StringUtils.EMPTY;42
try{43 Object value=ObjectUtils.getFieldValue(object, field);44if(value==null)45 result=StringUtils.EMPTY;46elseif(valueinstanceofString)47 result=(String) value;48elseif(valueinstanceofCollections||valueinstanceofMap)49 result=ToStringBuilder.reflectionToString(object);50elseif(valueinstanceofDate)51 result=DateUtils.formatDate((Date) value);52else53 result=value.toString();5455
}catch(IllegalAccessException e){56 logger.error("can not find a value for field '{}' in object class '{}'!", field, object.getClass());57 }5859returnresult;60 }6162
/** *//**63 * you must use this method when you create the index, set what object you will to be created its index!64 *65 *@paramobject the object which you will want to be create index66*/67
publicvoidsetObject(Object object){68this.object=object;69 }7071
/** *//**72 * get what object you want to be created index!73 *74 *@return75*/76
publicObject getObject(){77returnthis.object;78 }79
/** *//***************以上 公共方法*************/
2.现在有很多开源或者闭源的索引引擎可以在项目中使用,所以我写了一个接口和一个抽取了部分公共方法的抽象类.只需要把所选搜索引擎创建索引,检索等功能的具体实现代码写在一个继承这个抽象类的子类中,就可以随意切换使用的目标引擎.下面贴上接口和抽象类:
SearchEngine.java
1
packagecom.message.base.search.engine;23importcom.message.base.pagination.PaginationSupport;4importcom.message.base.search.SearchBean;56importjava.util.List;78
/** *//**9 * 索引引擎实现构建索引.删除索引.更新索引.检索等操作.10 *11 *@authorsunhao(sunhao.java@gmail.com)12 *@versionV1.013 * @createTime 13-5-5 上午1:3814*/15
publicinterfaceSearchEngine{1617
/** *//**18 * 创建索引(考虑线程安全)19 *20 *@paramsearchBeans 对象21 *@throwsException22*/23publicvoiddoIndex(ListsearchBeans)throwsException;2425
/** *//**26 * 删除索引27 *28 *@parambean 对象29 *@throwsException30*/31publicvoiddeleteIndex(SearchBean bean)throwsException;3233
/** *//**34 * 删除索引(删除多个)35 *36 *@parambeans 对象37 *@throwsException38*/39publicvoiddeleteIndexs(Listbeans)throwsException;4041
/** *//**42 * 进行检索43 *44 *@parambean 检索对象(一般只需要放入值keyword,即用来检索的关键字)45 *@paramisHighlighter 是否高亮46 *@paramstart 开始值47 *@paramnum 偏移量48 *@return49 *@throwsException50*/51publicPaginationSupport doSearch(SearchBean bean,booleanisHighlighter,intstart,intnum)throwsException;5253
/** *//**54 * 进行多个检索对象的检索55 *56 *@parambeans 多个检索对象(一般只需要放入值keyword,即用来检索的关键字)57 *@paramisHighlighter 是否高亮58 *@paramstart 开始值59 *@paramnum 偏移量60 *@return61 *@throwsException62*/63publicPaginationSupport doSearch(Listbeans,booleanisHighlighter,intstart,intnum)throwsException;6465
/** *//**66 * 删除某个类型的所有索引(考虑线程安全)67 *68 *@paramclazz 索引类型69 *@throwsException70*/71publicvoiddeleteIndexsByIndexType(Class<?extendsSearchBean>clazz)throwsException;7273
/** *//**74 * 删除某个类型的所有索引(考虑线程安全)75 *76 *@paramindexType 索引类型77 *@throwsException78*/79publicvoiddeleteIndexsByIndexType(String indexType)throwsException;8081
/** *//**82 * 删除所有的索引83 *84 *@throwsException85*/86publicvoiddeleteAllIndexs()throwsException;8788
/** *//**89 * 更新索引90 *91 *@paramsearchBean 需要更新的bean92 *@throwsException93*/94publicvoidupdateIndex(SearchBean searchBean)throwsException;9596
/** *//**97 * 批量更新索引98 *99 *@paramsearchBeans 需要更新的beans100 *@throwsException101*/102publicvoidupdateIndexs(ListsearchBeans)throwsException;103}
AbstractSearchEngine.java
1
packagecom.message.base.search.engine;23importcom.message.base.pagination.PaginationSupport;4importcom.message.base.pagination.PaginationUtils;5importcom.message.base.search.SearchBean;6importcom.message.base.utils.StringUtils;7importorg.slf4j.Logger;8importorg.slf4j.LoggerFactory;910importjava.util.Collections;1112
/** *//**13 * 搜索引擎的公用方法.14 *15 *@authorsunhao(sunhao.java@gmail.com)16 *@versionV1.017 * @createTime 13-5-8 下午10:5318*/19
publicabstractclassAbstractSearchEngineimplementsSearchEngine{20privatestaticfinalLogger logger=LoggerFactory.getLogger(AbstractSearchEngine.class);2122
/** *//**23 * 进行高亮处理时,html片段的前缀24*/25privateString htmlPrefix="
";26
/** *//**27 * 进行高亮处理时,html片段的后缀28*/29privateString htmlSuffix="
";30 31publicString getHtmlPrefix(){32returnhtmlPrefix;33 }3435
publicvoidsetHtmlPrefix(String htmlPrefix){36this.htmlPrefix=htmlPrefix;37 }3839
publicString getHtmlSuffix(){40returnhtmlSuffix;41 }4243
publicvoidsetHtmlSuffix(String htmlSuffix){44this.htmlSuffix=htmlSuffix;45 }4647
publicPaginationSupport doSearch(SearchBean bean,booleanisHighlighter,intstart,intnum)throwsException{48
if(bean==null){49 logger.debug("given search bean is empty!");50returnPaginationUtils.getNullPagination();51 }5253returndoSearch(Collections.singletonList(bean), isHighlighter, start, num);54 }5556
/** *//**57 * 获取index类型58 *59 *@parambean60 *@return61*/62
publicString getIndexType(SearchBean bean){63returnStringUtils.isNotEmpty(bean.getIndexType())?bean.getIndexType() : bean.getClass().getSimpleName();64 }65}
3.开始谈谈lucene
贴上代码先:
LuceneSearchEngine.java
1
packagecom.message.base.search.engine;23importcom.message.base.pagination.PaginationSupport;4importcom.message.base.pagination.PaginationUtils;5importcom.message.base.search.SearchBean;6importcom.message.base.search.SearchInitException;7importcom.message.base.utils.StringUtils;8importorg.apache.lucene.analysis.Analyzer;9importorg.apache.lucene.analysis.SimpleAnalyzer;10importorg.apache.lucene.document.Document;11importorg.apache.lucene.document.Field;12importorg.apache.lucene.index.IndexReader;13importorg.apache.lucene.index.IndexWriter;14importorg.apache.lucene.index.Term;15importorg.apache.lucene.queryParser.MultiFieldQueryParser;16importorg.apache.lucene.search.BooleanClause;17importorg.apache.lucene.search.IndexSearcher;18importorg.apache.lucene.search.Query;19importorg.apache.lucene.search.ScoreDoc;20importorg.apache.lucene.search.highlight.Highlighter;21importorg.apache.lucene.search.highlight.QueryScorer;22importorg.apache.lucene.search.highlight.SimpleHTMLFormatter;23importorg.apache.lucene.store.Directory;24importorg.apache.lucene.store.FSDirectory;25importorg.apache.lucene.util.Version;26importorg.slf4j.Logger;27importorg.slf4j.LoggerFactory;28importorg.springframework.beans.BeanUtils;2930importjava.io.File;31importjava.io.IOException;32importjava.util.*;3334
/** *//**35 * 基于lucene实现的索引引擎.36 *37 *@authorsunhao(sunhao.java@gmail.com)38 *@versionV1.039 * @createTime 13-5-5 上午10:3840*/41
publicclassLuceneSearchEngineextendsAbstractSearchEngine{42privatestaticfinalLogger logger=LoggerFactory.getLogger(LuceneSearchEngine.class);43
/** *//**44 * 索引存放路径45*/46privateString indexPath;47
/** *//**48 * 分词器49*/50privateAnalyzer analyzer=newSimpleAnalyzer();5152
publicsynchronizedvoiddoIndex(ListsearchBeans)throwsException{53this.createOrUpdateIndex(searchBeans,true);54 }5556
publicsynchronizedvoiddeleteIndex(SearchBean bean)throwsException{57
if(bean==null){58 logger.warn("Get search bean is empty!");59return;60 }6162 String id=bean.getId();6364
if(StringUtils.isEmpty(id)){65 logger.warn("get id and id value from bean is empty!");66return;67 }68 String indexType=getIndexType(bean);69 Directory indexDir=this.getIndexDir(indexType);70 IndexWriter writer=this.getWriter(indexDir);7172 writer.deleteDocuments(newTerm("pkId", id));73 writer.commit();74this.destroy(writer);75 }7677
publicsynchronizedvoiddeleteIndexs(Listbeans)throwsException{78
if(beans==null){79 logger.warn("Get beans is empty!");80return;81 }8283
for(SearchBean bean : beans){84this.deleteIndex(bean);85 }86 }8788
publicPaginationSupport doSearch(Listbeans,booleanisHighlighter,intstart,intnum)throwsException{89
if(beans==null||beans.isEmpty()){90 logger.debug("given search beans is empty!");91returnPaginationUtils.getNullPagination();92 }9394 List queryResults=newArrayList();95intcount=0;96
for(SearchBean bean : beans){97 String indexType=getIndexType(bean);9899 IndexReader reader=IndexReader.open(this.getIndexDir(indexType));100101 ListfieldNames=newArrayList();//查询的字段名102ListqueryValue=newArrayList();//待查询字段的值103Listflags=newArrayList();104105//要进行检索的字段106String[] doSearchFields=bean.getDoSearchFields();107if(doSearchFields==null||doSearchFields.length==0)108returnPaginationUtils.getNullPagination();109110//默认字段111
if(StringUtils.isNotEmpty(bean.getKeyword())){112
for(String field : doSearchFields){113 fieldNames.add(field);114 queryValue.add(bean.getKeyword());115 flags.add(BooleanClause.Occur.SHOULD);116 }117 }118119
Query query=MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queryValue.toArray(newString[]{}), fieldNames.toArray(newString[]{}),120
flags.toArray(newBooleanClause.Occur[]{}), analyzer);121122 logger.debug("make query string is '{}'!", query.toString());123 IndexSearcher searcher=newIndexSearcher(reader);124 ScoreDoc[] scoreDocs=searcher.search(query,1000000).scoreDocs;125126//查询起始记录位置127intbegin=(start==-1&&num==-1)?0: start;128//查询终止记录位置129intend=(start==-1&&num==-1)?scoreDocs.length : Math.min(begin+num, scoreDocs.length);130131//高亮处理132Highlighter highlighter=null;133
if(isHighlighter){134 SimpleHTMLFormatter formatter=newSimpleHTMLFormatter(this.getHtmlPrefix(),this.getHtmlSuffix());135 highlighter=newHighlighter(formatter,newQueryScorer(query));136 }137138 Listresults=newArrayList();139
for(inti=begin; i{140 SearchBean result=BeanUtils.instantiate(bean.getClass());141142intdocID=scoreDocs[i].doc;143 Document hitDoc=searcher.doc(docID);144145 result.setId(hitDoc.get("pkId"));146 result.setLink(hitDoc.get("link"));147 result.setOwerId(hitDoc.get("owerId"));148 result.setOwerName(hitDoc.get("owerName"));149 result.setCreateDate(hitDoc.get("createDate"));150 result.setIndexType(indexType);151152 String keyword=StringUtils.EMPTY;153if(isHighlighter&&highlighter!=null)154 keyword=highlighter.getBestFragment(analyzer,"keyword", hitDoc.get("keyword"));155156if(StringUtils.isEmpty(keyword))157 keyword=hitDoc.get("keyword");158159 result.setKeyword(keyword);160161 MapextendValues=newHashMap();162
for(String field : doSearchFields){163 String value=hitDoc.get(field);164if(isHighlighter&&highlighter!=null)165 value=highlighter.getBestFragment(analyzer, field, hitDoc.get(field));166167if(StringUtils.isEmpty(value))168 value=hitDoc.get(field);169170 extendValues.put(field, value);171 }172173 result.setSearchValues(extendValues);174175 results.add(result);176 }177178 queryResults.addAll(results);179 count+=scoreDocs.length;180 searcher.close();181 reader.close();182 }183184 PaginationSupport paginationSupport=PaginationUtils.makePagination(queryResults, count, num, start);185returnpaginationSupport;186 }187188
publicsynchronizedvoiddeleteIndexsByIndexType(Class<?extendsSearchBean>clazz)throwsException{189 String indexType=getIndexType(BeanUtils.instantiate(clazz));190this.deleteIndexsByIndexType(indexType);191 }192193
publicsynchronizedvoiddeleteIndexsByIndexType(String indexType)throwsException{194//传入readOnly的参数,默认是只读的195IndexReader reader=IndexReader.open(this.getIndexDir(indexType),false);196intresult=reader.deleteDocuments(newTerm("indexType", indexType));197 reader.close();198 logger.debug("the rows of delete index is '{}'! index type is '{}'!", result, indexType);199 }200201
publicsynchronizedvoiddeleteAllIndexs()throwsException{202 File indexFolder=newFile(this.indexPath);203
if(indexFolder==null||!indexFolder.isDirectory()){204//不存在或者不是文件夹205logger.debug("indexPath is not a folder! indexPath: '{}'!", indexPath);206return;207 }208209 File[] children=indexFolder.listFiles();210
for(File child : children){211if(child==null||!child.isDirectory())continue;212213 String indexType=child.getName();214 logger.debug("Get indexType is '{}'!", indexType);215216this.deleteIndexsByIndexType(indexType);217 }218 }219220
publicvoidupdateIndex(SearchBean searchBean)throwsException{221this.updateIndexs(Collections.singletonList(searchBean));222 }223224
publicvoidupdateIndexs(ListsearchBeans)throwsException{225this.createOrUpdateIndex(searchBeans,false);226 }227228
/** *//**229 * 创建或者更新索引230 *231 *@paramsearchBeans 需要创建或者更新的对象232 *@paramisCreate 是否是创建索引;true创建索引,false更新索引233 *@throwsException234*/235
privatesynchronizedvoidcreateOrUpdateIndex(ListsearchBeans,booleanisCreate)throwsException{236
if(searchBeans==null||searchBeans.isEmpty()){237 logger.debug("do no index!");238return;239 }240241 Directory indexDir=null;242 IndexWriter writer=null;243
for(Iteratorit=searchBeans.iterator(); it.hasNext(); ){244 SearchBean sb=it.next();245 String indexType=getIndexType(sb);246
if(sb==null){247 logger.debug("give SearchBean is null!");248return;249 }250booleananotherSearchBean=indexDir!=null&&!indexType.equals(((FSDirectory) indexDir).getFile().getName());251
if(indexDir==null||anotherSearchBean){252 indexDir=this.getIndexDir(indexType);253 }254
if(writer==null||anotherSearchBean){255this.destroy(writer);256 writer=this.getWriter(indexDir);257 }258259 Document doc=newDocument();260261//初始化一些字段262sb.initPublicFields();263 String id=sb.getId();264265//主键的索引,不作为搜索字段,并且也不进行分词266Field idField=newField("pkId", id, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);267 doc.add(idField);268269
logger.debug("create id index for '{}', value is '{}'! index is '{}'!",newObject[]{"pkId", id, idField});270271 String owerId=sb.getOwerId();272
if(StringUtils.isEmpty(owerId)){273thrownewSearchInitException("you must give a owerId");274 }275 Field owerId_=newField("owerId", owerId, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);276 doc.add(owerId_);277278 String owerName=sb.getOwerName();279
if(StringUtils.isEmpty(owerName)){280thrownewSearchInitException("you must give a owerName");281 }282 Field owerName_=newField("owerName", owerName, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);283 doc.add(owerName_);284285 String link=sb.getLink();286
if(StringUtils.isEmpty(link)){287thrownewSearchInitException("you must give a link");288 }289 Field link_=newField("link", link, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);290 doc.add(link_);291292 String keyword=sb.getKeyword();293
if(StringUtils.isEmpty(keyword)){294thrownewSearchInitException("you must give a keyword");295 }296 Field keyword_=newField("keyword", keyword, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);297 doc.add(keyword_);298299 String createDate=sb.getCreateDate();300
if(StringUtils.isEmpty(createDate)){301thrownewSearchInitException("you must give a createDate");302 }303 Field createDate_=newField("createDate", createDate, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);304 doc.add(createDate_);305306//索引类型字段307Field indexType_=newField("indexType", indexType, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);308 doc.add(indexType_);309310//进行索引的字段311String[] doIndexFields=sb.getDoIndexFields();312 MapindexFieldValues=sb.getIndexFieldValues();313
if(doIndexFields!=null&&doIndexFields.length>0){314
for(String field : doIndexFields){315 Field extInfoField=newField(field, indexFieldValues.get(field), Field.Store.YES, Field.Index.ANALYZED,316 Field.TermVector.WITH_POSITIONS_OFFSETS);317318 doc.add(extInfoField);319 }320 }321322if(isCreate)323 writer.addDocument(doc);324else325 writer.updateDocument(newTerm("pkId", sb.getId()), doc);326327 writer.optimize();328 }329330this.destroy(writer);331 logger.debug("create or update index success!");332 }333334
publicDirectory getIndexDir(String suffix)throwsException{335returnFSDirectory.open(newFile(indexPath+File.separator+suffix));336 }337338
publicIndexWriter getWriter(Directory indexDir)throwsIOException{339returnnewIndexWriter(indexDir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);340 }341342
publicvoiddestroy(IndexWriter writer)throwsException{343if(writer!=null)344 writer.close();345 }346347
publicvoidsetIndexPath(String indexPath){348this.indexPath=indexPath;349 }350351
publicvoidsetAnalyzer(Analyzer analyzer){352this.analyzer=analyzer;353 }354355}
关于如何使用lucene这里我就不再重复了,网上有一大堆这方面的资料,有什么不懂的可以谷歌一下.下面谈谈我的一些想法,有不对的地方,尽管拍砖,来吧:
....
也没啥好说的,等想到再补充吧,就是觉得有一点比较操蛋,窝心:
1
FSDirectory.open(newFile("D:\index\xxx"/** *//**一个不存在的目录,或者是一个不是索引的目录**/));
使用上面这段代码获取索引Directory的时候,如果目录不存在就会报错.可能有人认为这没什么,本来就应该这样;但我封装的这些代码里,确实对这一点有要求.
上面的SearchBean.java中有一个字段叫indexType,当没有指定的时候,默认为类名,如MessageSearchBean.如果我没有对Message进行过创建索引的操作,在检索的时候就会报错.我得想想用什么方法把这个问题解决掉.