Spring+Lucene4.6构建全文检索
近来的一个web项目中,甲方说要支持站内主要信息的搜索,立马就想到用ES来满足他,但是由于种种原因,ES的方案流产。所以就变成Spring+Lucene的架构,由于是在老项目上的二次开发,Spring版本3.1.2,所以Lucene版本不敢使用最新的。行吧,我胆小……
直接上肉:Spring的核心配置(applicationContext.xml),现在Spring Boot上来了,就不用再整这个玩意了
<!-- Lucene-based full-text search configuration -->
<!-- Analyzer: IK Chinese word segmentation, smart mode -->
<bean id="ikAnalyzer" class="org.wltea.analyzer.lucene.IKAnalyzer">
    <constructor-arg name="useSmart" value="true" />
</bean>
<!-- On-disk Lucene index directory -->
<bean id="luceneDirectory" class="org.apache.lucene.store.SimpleFSDirectory" >
    <constructor-arg>
        <bean class="java.io.File">
            <constructor-arg value="D:\\luceneTestDir" />
        </bean>
    </constructor-arg>
</bean>
<!-- Lucene version constant. Fix: this referenced LUCENE_45 although the bean
     is named matchVersion46 and the query side parses with Version.LUCENE_46;
     index-time and query-time versions must agree. -->
<bean id="matchVersion46" class="org.springframework.beans.factory.config.FieldRetrievingFactoryBean">
    <property name="staticField" value="org.apache.lucene.util.Version.LUCENE_46" />
</bean>
<!-- IndexWriter configuration (version + analyzer) -->
<bean id="indexWriterConfig" class="org.apache.lucene.index.IndexWriterConfig">
    <constructor-arg name="matchVersion" ref="matchVersion46"/>
    <constructor-arg name="analyzer" ref="ikAnalyzer"/>
</bean>
<!-- Shared IndexWriter -->
<bean id="indexWriter" class="org.apache.lucene.index.IndexWriter">
    <constructor-arg ref="luceneDirectory" />
    <constructor-arg ref="indexWriterConfig" />
</bean>
接下来的事,就是开发服务啦……
1.InfoDocument.java (信息对象类)
/**
 * Flat, string-only view of one searchable business record. Each property
 * maps 1:1 to a Lucene index field of the same lower-case name.
 */
public class InfoDocument {

    private String id;          // unique key; links the index entry back to the business table
    private String entitytype;  // business entity type
    private String title;       // title (tokenized at index time)
    private String orgname;     // organisation name (tokenized at index time)
    private String username;    // person name (tokenized at index time)
    private String createdate;  // creation date
    private String createuser;  // creator's name
    private String url;         // link to the record
    private String infotypes;   // record type names (Chinese); tokenized so partial-type queries work
    private String infostate;   // record state; changed when the entry is updated

    public String getId() { return id; }

    public void setId(String id) { this.id = id; }

    public String getEntitytype() { return entitytype; }

    public void setEntitytype(String entitytype) { this.entitytype = entitytype; }

    public String getTitle() { return title; }

    public void setTitle(String title) { this.title = title; }

    public String getOrgname() { return orgname; }

    public void setOrgname(String orgname) { this.orgname = orgname; }

    public String getUsername() { return username; }

    public void setUsername(String username) { this.username = username; }

    public String getCreatedate() { return createdate; }

    public void setCreatedate(String createdate) { this.createdate = createdate; }

    public String getCreateuser() { return createuser; }

    public void setCreateuser(String createuser) { this.createuser = createuser; }

    public String getUrl() { return url; }

    public void setUrl(String url) { this.url = url; }

    public String getInfotypes() { return infotypes; }

    public void setInfotypes(String infotypes) { this.infotypes = infotypes; }

    public String getInfostate() { return infostate; }

    public void setInfostate(String infostate) { this.infostate = infostate; }
}
2.InfoSearchService.java (供调用的服务)
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.wltea.analyzer.lucene.IKAnalyzer;
import com.alibaba.fastjson.JSONObject;
import com.yuanwang.infoSearch.entity.InfoDocument;
@Service
public class InfoSearchService {

    @Autowired
    IKAnalyzer ikAnalyzer;

    @Autowired
    SimpleFSDirectory luceneDirectory;

    @Autowired
    IndexWriter indexWriter;

    /**
     * Adds one info record to the Lucene index. Every non-null, non-empty
     * property of {@code obj} becomes a stored, analyzed TextField whose
     * name is the property name.
     *
     * @param obj the record to index
     * @return true if the document was added, false if the write failed
     * @throws IOException if the final commit fails
     */
    public boolean addIndexforObject(InfoDocument obj) throws IOException {
        boolean ret = true;
        try {
            indexWriter.addDocument(buildIndexDocument(obj));
        } catch (IOException e) {
            e.printStackTrace();
            ret = false;
        } finally {
            // Commit even after a failed add so earlier pending changes survive.
            indexWriter.commit();
        }
        return ret;
    }

    /**
     * Replaces (or inserts) the indexed document whose "id" field matches the
     * id of {@code obj}. NOTE(review): the method name keeps its historical
     * typo ("upate") because external callers depend on the signature.
     *
     * @param obj the record whose index entry should be replaced
     * @return true if the update succeeded, false otherwise
     * @throws IOException if the final commit fails
     */
    public boolean upateIndexforObject(Object obj) throws IOException {
        boolean ret = true;
        JSONObject json = (JSONObject) JSONObject.toJSON(obj);
        Term term = new Term("id", json.getString("id"));
        try {
            indexWriter.updateDocument(term, buildIndexDocument(obj));
        } catch (IOException e) {
            e.printStackTrace();
            ret = false;
        } finally {
            indexWriter.commit();
        }
        return ret;
    }

    /**
     * Runs a paged full-text search and returns one page of hits as JSON.
     * Matched fragments in stored fields are wrapped in red &lt;font&gt; tags
     * by the highlighter.
     *
     * @param key       the user's keyword, matched against title, orgname and username
     * @param pageIndex 1-based page number
     * @param pageSize  hits per page
     * @param infoState optional state filter; ignored when null or empty
     * @param infoTypes optional type filter; ignored when null or empty
     * @return a JSONObject with "entityType", "data" (list of hit documents
     *         with their score) and "costtime"
     */
    public JSONObject getInfoByKeyPage(String key, int pageIndex, int pageSize,
            String infoState, String infoTypes) throws IOException, ParseException, InvalidTokenOffsetsException {
        JSONObject json = new JSONObject();
        json.put("entityType", "info");
        long begin = System.currentTimeMillis();
        // Fix: the reader must be closed after the search, otherwise every
        // call leaks a file handle on the index directory.
        DirectoryReader indexReader = DirectoryReader.open(luceneDirectory);
        try {
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
            Query query = this.initInfoQuery(key, infoState, infoTypes);
            if (query != null) {
                // Anchor for searchAfter: last hit of the previous page.
                ScoreDoc lastSd = getLastScoreDoc(pageIndex, pageSize, query, indexSearcher);
                Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
                Scorer fragmentScorer = new QueryScorer(query);
                Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
                Fragmenter fragmenter = new SimpleFragmenter(100);
                highlighter.setTextFragmenter(fragmenter);
                TopDocs tds = indexSearcher.searchAfter(lastSd, query, pageSize);
                List<JSONObject> docs = new ArrayList<JSONObject>();
                for (ScoreDoc sd : tds.scoreDocs) {
                    Document doc = indexSearcher.doc(sd.doc);
                    if (doc.get("id") != null && !"".equals(doc.get("id"))) {
                        JSONObject docjson = new JSONObject();
                        for (int i = 0; i < doc.getFields().size(); i++) {
                            String fieldName = doc.getFields().get(i).name();
                            String fieldValue = doc.getFields().get(i).stringValue();
                            // Prefer the highlighted fragment; fall back to the
                            // raw stored value when nothing in it matched.
                            String highlighted = highlighter.getBestFragment(ikAnalyzer, fieldName, fieldValue);
                            docjson.put(fieldName, highlighted == null ? fieldValue : highlighted);
                        }
                        docjson.put("score", sd.score);
                        docs.add(docjson);
                    }
                }
                json.put("data", docs);
            }
        } finally {
            indexReader.close();
        }
        json.put("costtime", (System.currentTimeMillis() - begin) + "ms");
        return json;
    }

    /**
     * Converts a bean to a Lucene Document: every non-null, non-empty JSON
     * property becomes a stored, analyzed TextField. Shared by add and update.
     */
    private Document buildIndexDocument(Object obj) {
        Document indexDoc = new Document();
        JSONObject json = (JSONObject) JSONObject.toJSON(obj);
        for (String fieldKey : json.keySet()) {
            String value = json.getString(fieldKey);
            if (value != null && !"".equals(value)) {
                indexDoc.add(new TextField(fieldKey, value, Field.Store.YES));
            }
        }
        return indexDoc;
    }

    /**
     * Returns the last ScoreDoc of the previous page (the searchAfter anchor),
     * or null for the first page or when the requested page lies beyond the
     * available hits.
     */
    private ScoreDoc getLastScoreDoc(int pageIndex, int pageSize, Query query,
            IndexSearcher searcher) throws IOException {
        if (pageIndex <= 1) {
            return null; // first page needs no anchor
        }
        int num = pageSize * (pageIndex - 1); // number of hits before this page
        TopDocs tds = searcher.search(query, num);
        // Fix: the original indexed scoreDocs[num - 1] unconditionally and
        // threw ArrayIndexOutOfBoundsException for pages past the result set.
        if (tds.scoreDocs.length < num) {
            return tds.scoreDocs.length == 0 ? null : tds.scoreDocs[tds.scoreDocs.length - 1];
        }
        return tds.scoreDocs[num - 1];
    }

    /**
     * Builds the multi-field query: the keyword is matched against title,
     * orgname and username; optional infotypes / infostate filters are
     * appended when present. All clauses are SHOULD, as in the original.
     *
     * Fix: the original swapped the filter values and field names — infoState
     * was parsed against the "infotypes" field and infoTypes against
     * "infostate", and the state-only branch also targeted "infotypes".
     */
    private Query initInfoQuery(String key, String infoState, String infoTypes)
            throws ParseException {
        boolean infoStateFlag = infoState != null && !"".equals(infoState);
        boolean infoTypesFlag = infoTypes != null && !"".equals(infoTypes);
        int querySize = 3 + (infoStateFlag ? 1 : 0) + (infoTypesFlag ? 1 : 0);
        String[] queryString = new String[querySize];
        String[] fields = new String[querySize];
        queryString[0] = key;
        fields[0] = "title";
        queryString[1] = key;
        fields[1] = "orgname";
        queryString[2] = key;
        fields[2] = "username";
        int next = 3;
        if (infoTypesFlag) {
            queryString[next] = infoTypes;
            fields[next] = "infotypes";
            next++;
        }
        if (infoStateFlag) {
            queryString[next] = infoState;
            fields[next] = "infostate";
            next++;
        }
        BooleanClause.Occur[] clauses = new BooleanClause.Occur[querySize];
        for (int i = 0; i < querySize; i++) {
            clauses[i] = BooleanClause.Occur.SHOULD;
        }
        return MultiFieldQueryParser.parse(Version.LUCENE_46, queryString, fields, clauses, ikAnalyzer);
    }
}
上面这些就是Lucene的全文检索的服务代码……,接下来,记录一下分词IKAnalyzer
首先说一下IKAnalyzer的源码里面的一个例子
package org.wltea.analyzer.sample;
import java.io.IOException;
import java.io.PrintStream;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
public class IKAnalzyerDemo
{
    /**
     * Tokenizes a short mixed Chinese/English sample with the smart-mode IK
     * analyzer and prints one line per token: "start - end : term | type".
     *
     * Fix: the original also closed the TokenStream inside the catch block,
     * duplicating the close that the finally block already performs; the
     * finally block alone is sufficient (and runs on both paths).
     */
    public static void main(String[] args)
    {
        Analyzer analyzer = new IKAnalyzer(true); // true = smart (coarse) segmentation
        TokenStream ts = null;
        try
        {
            ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too"));
            // Attribute views are updated in place on each incrementToken().
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            TypeAttribute type = ts.addAttribute(TypeAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
            }
            ts.end();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            if (ts != null) {
                try
                {
                    ts.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
        }
    }
}
例子也是比较简单,这里分词开发大牛1024个赞……
再来看源码的的一个默认配置类
package org.wltea.analyzer.cfg;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.InvalidPropertiesFormatException;
import java.util.List;
import java.util.Properties;
/**
 * Default IK analyzer configuration: bundled main/quantifier dictionaries
 * plus optional user extension and stop-word dictionaries read from
 * IKAnalyzer.cfg.xml on the classpath.
 */
public class DefaultConfig
        implements Configuration
{
    // Bundled main dictionary (classpath resource).
    private static final String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic";
    // Bundled quantifier dictionary (classpath resource).
    private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic";
    // Optional user configuration file looked up on the classpath.
    private static final String FILE_NAME = "IKAnalyzer.cfg.xml";
    // Property keys inside IKAnalyzer.cfg.xml.
    private static final String EXT_DICT = "ext_dict";
    private static final String EXT_STOP = "ext_stopwords";

    private Properties props;
    private boolean useSmart;

    /** Static factory; the constructor stays private. */
    public static Configuration getInstance()
    {
        return new DefaultConfig();
    }

    private DefaultConfig()
    {
        this.props = new Properties();
        // Fix: use the FILE_NAME constant instead of repeating the literal.
        InputStream input = getClass().getClassLoader().getResourceAsStream(FILE_NAME);
        if (input != null) {
            try
            {
                // loadFromXML closes the supplied stream when it returns.
                this.props.loadFromXML(input);
            }
            catch (IOException e)
            {
                // InvalidPropertiesFormatException is an IOException subclass,
                // so this single catch covers both original catch blocks.
                e.printStackTrace();
            }
        }
    }

    /** Whether smart (coarse-grained) segmentation is enabled. */
    public boolean useSmart()
    {
        return this.useSmart;
    }

    public void setUseSmart(boolean useSmart)
    {
        this.useSmart = useSmart;
    }

    public String getMainDictionary()
    {
        return PATH_DIC_MAIN; // fix: was a duplicated string literal
    }

    public String getQuantifierDicionary()
    {
        return PATH_DIC_QUANTIFIER; // fix: was a duplicated string literal
    }

    /** User extension dictionaries from ext_dict, possibly empty. */
    public List<String> getExtDictionarys()
    {
        return splitPaths(this.props.getProperty(EXT_DICT));
    }

    /** User stop-word dictionaries from ext_stopwords, possibly empty. */
    public List<String> getExtStopWordDictionarys()
    {
        return splitPaths(this.props.getProperty(EXT_STOP));
    }

    /** Splits a semicolon-separated path list, dropping blank entries. */
    private static List<String> splitPaths(String cfg)
    {
        List<String> files = new ArrayList<String>(2);
        if (cfg != null) {
            for (String path : cfg.split(";")) {
                if (path != null && !"".equals(path.trim())) {
                    files.add(path.trim());
                }
            }
        }
        return files;
    }
}
好,看完这些,就需要在src目录下添加几个关于ik分词的配置文件
IKAnalyzer.cfg.xml(源码里面说的,就叫这个名字)
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>IK Analyzer 扩展配置</comment>
<!-- user-defined extension dictionaries (semicolon-separated file names) -->
<entry key="ext_dict">ext.dic;</entry>
<!-- user-defined extension stop-word dictionaries -->
<entry key="ext_stopwords">stopword.dic;chinese_stopword.dic</entry>
</properties>
接下来就是在src下创建IKAnalyzer.cfg.xml说的*.dic(字典文件),这里补充一下,ik分词jar包里面自带两个分词字典:主词典 main2012.dic 和量词词典 quantifier.dic(见上面 DefaultConfig 中的路径常量)。
打完,收工。下一篇会记录一下原理……