Spring+Lucene4.6构建全文检索
近来的一个web项目中,甲方说要支持站内主要信息的搜索,立马就想到用ES来满足他,但是由于种种原因,ES的方案流产。所以就变成Spring+Lucene的架构,由于是在老项目上的二次开发,Spring版本3.1.2,所以Lucene版本不敢使用最新的。行吧,我胆小……
直接上肉:Spring的核心配置(applicationContext.xml),现在Spring Boot上来了,就不用再整这个玩意了
<!-- Lucene-based full-text search configuration -->
<!-- Analyzer: IK Chinese word segmentation, smart mode -->
<bean id="ikAnalyzer" class="org.wltea.analyzer.lucene.IKAnalyzer">
    <constructor-arg name="useSmart" value="true" />
</bean>
<!-- On-disk Lucene index directory -->
<bean id="luceneDirectory" class="org.apache.lucene.store.SimpleFSDirectory" >
    <constructor-arg>
        <bean class="java.io.File">
            <constructor-arg value="D:\\luceneTestDir" />
        </bean>
    </constructor-arg>
</bean>
<!-- Lucene version constant. Fix: this referenced LUCENE_45 although the bean
     is named matchVersion46 and the query side parses with Version.LUCENE_46;
     index-time and query-time versions must agree. -->
<bean id="matchVersion46" class="org.springframework.beans.factory.config.FieldRetrievingFactoryBean">
    <property name="staticField" value="org.apache.lucene.util.Version.LUCENE_46" />
</bean>
<!-- IndexWriter configuration (version + analyzer) -->
<bean id="indexWriterConfig" class="org.apache.lucene.index.IndexWriterConfig">
    <constructor-arg name="matchVersion" ref="matchVersion46"/>
    <constructor-arg name="analyzer" ref="ikAnalyzer"/>
</bean>
<!-- Shared IndexWriter -->
<bean id="indexWriter" class="org.apache.lucene.index.IndexWriter">
    <constructor-arg ref="luceneDirectory" />
    <constructor-arg ref="indexWriterConfig" />
</bean>
接下来的事,就是开发服务啦……
1.InfoDocument.java (信息对象类)
/**
 * Flat, string-only view of one searchable business record. Each property
 * maps 1:1 to a Lucene index field of the same lower-case name.
 */
public class InfoDocument {

    private String id;          // unique key; links the index entry back to the business table
    private String entitytype;  // business entity type
    private String title;       // title (tokenized at index time)
    private String orgname;     // organisation name (tokenized at index time)
    private String username;    // person name (tokenized at index time)
    private String createdate;  // creation date
    private String createuser;  // creator's name
    private String url;         // link to the record
    private String infotypes;   // record type names (Chinese); tokenized so partial-type queries work
    private String infostate;   // record state; changed when the entry is updated

    public String getId() { return id; }

    public void setId(String id) { this.id = id; }

    public String getEntitytype() { return entitytype; }

    public void setEntitytype(String entitytype) { this.entitytype = entitytype; }

    public String getTitle() { return title; }

    public void setTitle(String title) { this.title = title; }

    public String getOrgname() { return orgname; }

    public void setOrgname(String orgname) { this.orgname = orgname; }

    public String getUsername() { return username; }

    public void setUsername(String username) { this.username = username; }

    public String getCreatedate() { return createdate; }

    public void setCreatedate(String createdate) { this.createdate = createdate; }

    public String getCreateuser() { return createuser; }

    public void setCreateuser(String createuser) { this.createuser = createuser; }

    public String getUrl() { return url; }

    public void setUrl(String url) { this.url = url; }

    public String getInfotypes() { return infotypes; }

    public void setInfotypes(String infotypes) { this.infotypes = infotypes; }

    public String getInfostate() { return infostate; }

    public void setInfostate(String infostate) { this.infostate = infostate; }
}
2.InfoSearchService.java (供调用的服务)
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.wltea.analyzer.lucene.IKAnalyzer;
import com.alibaba.fastjson.JSONObject;
import com.yuanwang.infoSearch.entity.InfoDocument;
@Service
public class InfoSearchService {

    @Autowired
    IKAnalyzer ikAnalyzer;

    @Autowired
    SimpleFSDirectory luceneDirectory;

    @Autowired
    IndexWriter indexWriter;

    /**
     * Adds one info record to the Lucene index. Every non-null, non-empty
     * property of {@code obj} becomes a stored, analyzed TextField whose
     * name is the property name.
     *
     * @param obj the record to index
     * @return true if the document was added, false if the write failed
     * @throws IOException if the final commit fails
     */
    public boolean addIndexforObject(InfoDocument obj) throws IOException {
        boolean ret = true;
        try {
            indexWriter.addDocument(buildIndexDocument(obj));
        } catch (IOException e) {
            e.printStackTrace();
            ret = false;
        } finally {
            // Commit even after a failed add so earlier pending changes survive.
            indexWriter.commit();
        }
        return ret;
    }

    /**
     * Replaces (or inserts) the indexed document whose "id" field matches the
     * id of {@code obj}. NOTE(review): the method name keeps its historical
     * typo ("upate") because external callers depend on the signature.
     *
     * @param obj the record whose index entry should be replaced
     * @return true if the update succeeded, false otherwise
     * @throws IOException if the final commit fails
     */
    public boolean upateIndexforObject(Object obj) throws IOException {
        boolean ret = true;
        JSONObject json = (JSONObject) JSONObject.toJSON(obj);
        Term term = new Term("id", json.getString("id"));
        try {
            indexWriter.updateDocument(term, buildIndexDocument(obj));
        } catch (IOException e) {
            e.printStackTrace();
            ret = false;
        } finally {
            indexWriter.commit();
        }
        return ret;
    }

    /**
     * Runs a paged full-text search and returns one page of hits as JSON.
     * Matched fragments in stored fields are wrapped in red &lt;font&gt; tags
     * by the highlighter.
     *
     * @param key       the user's keyword, matched against title, orgname and username
     * @param pageIndex 1-based page number
     * @param pageSize  hits per page
     * @param infoState optional state filter; ignored when null or empty
     * @param infoTypes optional type filter; ignored when null or empty
     * @return a JSONObject with "entityType", "data" (list of hit documents
     *         with their score) and "costtime"
     */
    public JSONObject getInfoByKeyPage(String key, int pageIndex, int pageSize,
            String infoState, String infoTypes) throws IOException, ParseException, InvalidTokenOffsetsException {
        JSONObject json = new JSONObject();
        json.put("entityType", "info");
        long begin = System.currentTimeMillis();
        // Fix: the reader must be closed after the search, otherwise every
        // call leaks a file handle on the index directory.
        DirectoryReader indexReader = DirectoryReader.open(luceneDirectory);
        try {
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
            Query query = this.initInfoQuery(key, infoState, infoTypes);
            if (query != null) {
                // Anchor for searchAfter: last hit of the previous page.
                ScoreDoc lastSd = getLastScoreDoc(pageIndex, pageSize, query, indexSearcher);
                Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
                Scorer fragmentScorer = new QueryScorer(query);
                Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
                Fragmenter fragmenter = new SimpleFragmenter(100);
                highlighter.setTextFragmenter(fragmenter);
                TopDocs tds = indexSearcher.searchAfter(lastSd, query, pageSize);
                List<JSONObject> docs = new ArrayList<JSONObject>();
                for (ScoreDoc sd : tds.scoreDocs) {
                    Document doc = indexSearcher.doc(sd.doc);
                    if (doc.get("id") != null && !"".equals(doc.get("id"))) {
                        JSONObject docjson = new JSONObject();
                        for (int i = 0; i < doc.getFields().size(); i++) {
                            String fieldName = doc.getFields().get(i).name();
                            String fieldValue = doc.getFields().get(i).stringValue();
                            // Prefer the highlighted fragment; fall back to the
                            // raw stored value when nothing in it matched.
                            String highlighted = highlighter.getBestFragment(ikAnalyzer, fieldName, fieldValue);
                            docjson.put(fieldName, highlighted == null ? fieldValue : highlighted);
                        }
                        docjson.put("score", sd.score);
                        docs.add(docjson);
                    }
                }
                json.put("data", docs);
            }
        } finally {
            indexReader.close();
        }
        json.put("costtime", (System.currentTimeMillis() - begin) + "ms");
        return json;
    }

    /**
     * Converts a bean to a Lucene Document: every non-null, non-empty JSON
     * property becomes a stored, analyzed TextField. Shared by add and update.
     */
    private Document buildIndexDocument(Object obj) {
        Document indexDoc = new Document();
        JSONObject json = (JSONObject) JSONObject.toJSON(obj);
        for (String fieldKey : json.keySet()) {
            String value = json.getString(fieldKey);
            if (value != null && !"".equals(value)) {
                indexDoc.add(new TextField(fieldKey, value, Field.Store.YES));
            }
        }
        return indexDoc;
    }

    /**
     * Returns the last ScoreDoc of the previous page (the searchAfter anchor),
     * or null for the first page or when the requested page lies beyond the
     * available hits.
     */
    private ScoreDoc getLastScoreDoc(int pageIndex, int pageSize, Query query,
            IndexSearcher searcher) throws IOException {
        if (pageIndex <= 1) {
            return null; // first page needs no anchor
        }
        int num = pageSize * (pageIndex - 1); // number of hits before this page
        TopDocs tds = searcher.search(query, num);
        // Fix: the original indexed scoreDocs[num - 1] unconditionally and
        // threw ArrayIndexOutOfBoundsException for pages past the result set.
        if (tds.scoreDocs.length < num) {
            return tds.scoreDocs.length == 0 ? null : tds.scoreDocs[tds.scoreDocs.length - 1];
        }
        return tds.scoreDocs[num - 1];
    }

    /**
     * Builds the multi-field query: the keyword is matched against title,
     * orgname and username; optional infotypes / infostate filters are
     * appended when present. All clauses are SHOULD, as in the original.
     *
     * Fix: the original swapped the filter values and field names — infoState
     * was parsed against the "infotypes" field and infoTypes against
     * "infostate", and the state-only branch also targeted "infotypes".
     */
    private Query initInfoQuery(String key, String infoState, String infoTypes)
            throws ParseException {
        boolean infoStateFlag = infoState != null && !"".equals(infoState);
        boolean infoTypesFlag = infoTypes != null && !"".equals(infoTypes);
        int querySize = 3 + (infoStateFlag ? 1 : 0) + (infoTypesFlag ? 1 : 0);
        String[] queryString = new String[querySize];
        String[] fields = new String[querySize];
        queryString[0] = key;
        fields[0] = "title";
        queryString[1] = key;
        fields[1] = "orgname";
        queryString[2] = key;
        fields[2] = "username";
        int next = 3;
        if (infoTypesFlag) {
            queryString[next] = infoTypes;
            fields[next] = "infotypes";
            next++;
        }
        if (infoStateFlag) {
            queryString[next] = infoState;
            fields[next] = "infostate";
            next++;
        }
        BooleanClause.Occur[] clauses = new BooleanClause.Occur[querySize];
        for (int i = 0; i < querySize; i++) {
            clauses[i] = BooleanClause.Occur.SHOULD;
        }
        return MultiFieldQueryParser.parse(Version.LUCENE_46, queryString, fields, clauses, ikAnalyzer);
    }
}
上面这些就是Lucene的全文检索的服务代码……,接下来,记录一下分词IKAnalyzer
首先说一下IKAnalyzer的源码里面的一个例子
package org.wltea.analyzer.sample;
import java.io.IOException;
import java.io.PrintStream;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
public class IKAnalzyerDemo
{
    /**
     * Tokenizes a short mixed Chinese/English sample with the smart-mode IK
     * analyzer and prints one line per token: "start - end : term | type".
     *
     * Fix: the original also closed the TokenStream inside the catch block,
     * duplicating the close that the finally block already performs; the
     * finally block alone is sufficient (and runs on both paths).
     */
    public static void main(String[] args)
    {
        Analyzer analyzer = new IKAnalyzer(true); // true = smart (coarse) segmentation
        TokenStream ts = null;
        try
        {
            ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too"));
            // Attribute views are updated in place on each incrementToken().
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            TypeAttribute type = ts.addAttribute(TypeAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
            }
            ts.end();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            if (ts != null) {
                try
                {
                    ts.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
        }
    }
}
例子也是比较简单,这里分词开发大牛1024个赞……
再来看源码的的一个默认配置类
package org.wltea.analyzer.cfg;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.InvalidPropertiesFormatException;
import java.util.List;
import java.util.Properties;
/**
 * Default IK analyzer configuration: bundled main/quantifier dictionaries
 * plus optional user extension and stop-word dictionaries read from
 * IKAnalyzer.cfg.xml on the classpath.
 */
public class DefaultConfig
        implements Configuration
{
    // Bundled main dictionary (classpath resource).
    private static final String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic";
    // Bundled quantifier dictionary (classpath resource).
    private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic";
    // Optional user configuration file looked up on the classpath.
    private static final String FILE_NAME = "IKAnalyzer.cfg.xml";
    // Property keys inside IKAnalyzer.cfg.xml.
    private static final String EXT_DICT = "ext_dict";
    private static final String EXT_STOP = "ext_stopwords";

    private Properties props;
    private boolean useSmart;

    /** Static factory; the constructor stays private. */
    public static Configuration getInstance()
    {
        return new DefaultConfig();
    }

    private DefaultConfig()
    {
        this.props = new Properties();
        // Fix: use the FILE_NAME constant instead of repeating the literal.
        InputStream input = getClass().getClassLoader().getResourceAsStream(FILE_NAME);
        if (input != null) {
            try
            {
                // loadFromXML closes the supplied stream when it returns.
                this.props.loadFromXML(input);
            }
            catch (IOException e)
            {
                // InvalidPropertiesFormatException is an IOException subclass,
                // so this single catch covers both original catch blocks.
                e.printStackTrace();
            }
        }
    }

    /** Whether smart (coarse-grained) segmentation is enabled. */
    public boolean useSmart()
    {
        return this.useSmart;
    }

    public void setUseSmart(boolean useSmart)
    {
        this.useSmart = useSmart;
    }

    public String getMainDictionary()
    {
        return PATH_DIC_MAIN; // fix: was a duplicated string literal
    }

    public String getQuantifierDicionary()
    {
        return PATH_DIC_QUANTIFIER; // fix: was a duplicated string literal
    }

    /** User extension dictionaries from ext_dict, possibly empty. */
    public List<String> getExtDictionarys()
    {
        return splitPaths(this.props.getProperty(EXT_DICT));
    }

    /** User stop-word dictionaries from ext_stopwords, possibly empty. */
    public List<String> getExtStopWordDictionarys()
    {
        return splitPaths(this.props.getProperty(EXT_STOP));
    }

    /** Splits a semicolon-separated path list, dropping blank entries. */
    private static List<String> splitPaths(String cfg)
    {
        List<String> files = new ArrayList<String>(2);
        if (cfg != null) {
            for (String path : cfg.split(";")) {
                if (path != null && !"".equals(path.trim())) {
                    files.add(path.trim());
                }
            }
        }
        return files;
    }
}
好,看完这些,就需要在src目录下添加几个关于ik分词的配置文件
IKAnalyzer.cfg.xml(源码里面说的,就叫这个名字)
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>IK Analyzer 扩展配置</comment>
<!-- user-defined extension dictionaries (semicolon-separated file names) -->
<entry key="ext_dict">ext.dic;</entry>
<!-- user-defined extension stop-word dictionaries -->
<entry key="ext_stopwords">stopword.dic;chinese_stopword.dic</entry>
</properties>
接下来就是在src下创建IKAnalyzer.cfg.xml说的*.dic(字典文件),这里补充一下,ik分词jar包里面自带两个分词字典:主词典 main2012.dic 和量词词典 quantifier.dic(见上面 DefaultConfig 中的路径常量)。
打完,收工。下一篇会记录一下原理……