最近在研究Lucene的用法,经过这两天的努力,在网上搜索资料,还专门买了本书《开发自己的搜索引擎---Lucene+Heritrix》,打算系统地学习一下这东西,大的项目是肯定离不开搜索引擎的,学吧,没错~ 这两天有过无助、有过失落、也有过新发现时的欣喜若狂,总之最后还是做出了个小例子,怕以后再忘记,还是记录一下吧~也记录自己的成长,只有把学到的东西讲给别人,才算是真的会了,此例子也献给那些正在搜索Lucene资料的朋友们吧~愿对你们有所帮助~
好了先贴代码吧!
主类:TestIndex.java
- package com.lj.test;
- import java.io.StringReader;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.List;
- import net.paoding.analysis.analyzer.PaodingAnalyzer;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.TokenStream;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriter.MaxFieldLength;
- import org.apache.lucene.queryParser.MultiFieldQueryParser;
- import org.apache.lucene.search.BooleanClause;
- import org.apache.lucene.search.Filter;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.search.highlight.Highlighter;
- import org.apache.lucene.search.highlight.QueryScorer;
- import org.apache.lucene.search.highlight.SimpleFragmenter;
- import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
- import com.lj.entity.Product;
- import com.lj.util.Configuration;
- /**
- * Lucene初级小例子,简单测试。
- * @author LiangJian
- * 2011-6-17 11:56:14
- */
- public class TestIndex {
- /** 创建Lucene索引 */
- public void createIndex(String indexPath,List<Product> productList) throws Exception{
- //记录开始时间
- long startTime = new Date().getTime();
- /** 建立索引,使用庖丁中文分词器PaodingAnalyzer。*/
- IndexWriter indexWriter = new IndexWriter(indexPath, new PaodingAnalyzer(), true, MaxFieldLength.LIMITED);
- /**
- * 说明:
- * Field.TermVector.NO:不保存term vectors
- Field.TermVector.YES:保存term vectors
- Field.TermVector.WITH_POSITIONS:保存term vectors.(保存值和token位置信息)
- Field.TermVector.WITH_OFFSETS:保存term vectors.(保存值和Token的offset)
- Field.TermVector.WITH_POSITIONS_OFFSETS:保存term vectors.(保存值和token位置信息和Token的offset)
- */
- for(Product product:productList){
- Document doc = new Document();
- doc.add(new Field("p_id",product.getP_id()+"",Field.Store.YES,Field.Index.NO));
- doc.add(new Field("p_name",product.getP_name(),Field.Store.YES,Field.Index.ANALYZED));
- doc.add(new Field("p_price",product.getP_price(),Field.Store.YES,Field.Index.NO));
- doc.add(new Field("p_content",product.getP_content(),Field.Store.YES,Field.Index.ANALYZED));
- indexWriter.addDocument(doc);
- }
- // optimize()方法是对索引进行优化,进行了索引优化后,索引才算是真正的生效。
- indexWriter.optimize();
- indexWriter.close();
- // 测试一下索引的时间
- long endTime = new Date().getTime();
- System.out.println("这花费了 " + (endTime - startTime)+ "毫秒来把数据增加到索引里面去!");
- }
- /**
- * 按Content字段查询
- * @param indexPath 索引文件路径
- * @param keyword 关键字
- * @return
- * @throws Exception
- */
- public List<Product> searchByKeyWord(String indexPath,String keyword)throws Exception{
- List<Product> productList = new ArrayList<Product>();
- IndexSearcher search = new IndexSearcher(indexPath);
- long startTime = new Date().getTime();
- //下面的是进行p_content和p_name 范围内进行搜索.
- String[] keywords = new String[]{"p_content","p_name"};//要检索的字段
- /** 这里需要注意的就是BooleanClause.Occur[]数组,它表示多个条件之间的关系,
- * BooleanClause.Occur.MUST表示and,
- * BooleanClause.Occur.MUST_NOT表示not,
- * BooleanClause.Occur.SHOULD表示or.
- * */
- BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD,BooleanClause.Occur.SHOULD};//对应要检索的字段的逻辑(与、或)
- Analyzer analyzer = new PaodingAnalyzer();//使用庖丁分词,按分词进行检索
- //用MultiFieldQueryParser得到query对象
- Query query = MultiFieldQueryParser.parse(keyword, keywords, clauses, analyzer);//parser.parse(query);
- Filter filter = null;//过滤
- //开始匹配
- TopDocs topDocs = search.search(query, filter, 1000);
- System.out.println("共匹配到:"+topDocs.totalHits+"个.");
- for(ScoreDoc scorceDoc : topDocs.scoreDocs){
- Document doc = search.doc(scorceDoc.doc);
- // System.out.println(scorceDoc.doc+"---"+doc);//便于学习,可以打印出来看看。
- Product product = new Product();
- product.setP_id(Integer.parseInt(doc.get("p_id")));
- product.setP_name(doc.get("p_name"));
- product.setP_price(doc.get("p_price"));
- // product.setP_content(doc.get("p_content"));//不使用高亮
- product.setP_content(this.getHighLight(doc, analyzer, query, "p_content"));//使用高亮
- productList.add(product);
- }
- search.close();
- long endTime = new Date().getTime();
- System.out.println("检索耗时: " + (endTime - startTime)+ "毫秒!");
- return productList;
- }
- /**
- * 高亮设置
- * @param doc
- * @param analyzer 分词器
- * @param query
- * @param field 字段
- * @throws Exception
- * @reutrn 高亮后的值
- */
- public String getHighLight(Document doc,Analyzer analyzer,Query query,String field)throws Exception{
- //设置高亮显示格式
- // SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'><strong>", "</strong></font>");
- SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<b>", "</b>");
- /* 语法高亮显示设置 */
- Highlighter highlighter = new Highlighter(simpleHTMLFormatter,new QueryScorer(query));
- highlighter.setTextFragmenter(new SimpleFragmenter(100));
- // 取 field 字段值,准备进行高亮
- String fieldValue = doc.get(field);
- TokenStream tokenStream = analyzer.tokenStream(field,new StringReader(fieldValue));
- //转成高亮的值
- String highLightFieldValue = highlighter.getBestFragment(tokenStream, fieldValue);
- if(highLightFieldValue == null)
- highLightFieldValue = fieldValue;
- return highLightFieldValue;
- }
- /** 创建测试数据 */
- public List<Product> createProductList(){
- List<Product> productList = new ArrayList<Product>();
- for(int i=1;i<=20;i++){
- Product product = new Product();
- product.setP_id(i);
- product.setP_name("手表"+i);
- product.setP_price((i*i+Math.random())+"元");
- product.setP_content("手表的描述"+i+"块");
- productList.add(product);
- }
- return productList;
- }
- /**
- * 测试主方法
- * @param args
- * @throws Exception
- */
- public static void main(String[] args) throws Exception {
- TestIndex test = new TestIndex();
- String indexPath = Configuration.getInstance().read("config.properties", "indexPath");
- //创建Lucene索引
- test.createIndex(indexPath+"Product/index", test.createProductList());
- //从Lucene索引库中——搜索
- List<Product> productList = test.searchByKeyWord(indexPath+"Product/index", "手表4 描述3");
- //搜索结果
- for(Product product:productList){
- System.out.println("---------------");
- System.out.println("p_id:"+product.getP_id());
- System.out.println("p_name:"+product.getP_name());
- System.out.println("p_price:"+product.getP_price());
- System.out.println("p_content:"+product.getP_content());
- System.out.println("---------------");
- }
- }
- }
读取配置文件类:Configuration.java
- package com.lj.util;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.Properties;
/**
 * Singleton helper that reads values from {@code .properties} files located on
 * the classpath.
 */
public class Configuration {

    /** The single instance, created eagerly when the class is initialised. */
    private static final Configuration configuration = new Configuration();

    private Configuration() {}

    /**
     * Returns the shared instance.
     *
     * <p>No synchronization is needed: the instance is a {@code static final}
     * field assigned during class initialisation.
     *
     * @return the singleton {@code Configuration}
     */
    public static Configuration getInstance() {
        return configuration;
    }

    /**
     * Loads the named properties file from the classpath and returns one value.
     *
     * @param properties classpath-relative name of the properties file
     * @param key        property key to look up
     * @return the value for {@code key}, or {@code null} when the key is absent
     *         or the file could not be read
     * @throws IllegalArgumentException if the file is not found on the classpath
     */
    public String read(String properties, String key) {
        InputStream in = this.getClass().getClassLoader().getResourceAsStream(properties);
        if (in == null) {
            // getResourceAsStream returns null for a missing resource; fail with
            // a clear message instead of an NPE inside Properties.load.
            throw new IllegalArgumentException(
                    "Properties file not found on classpath: " + properties);
        }
        Properties p = new Properties();
        try {
            p.load(in);
        } catch (IOException e) {
            // Best effort: report the problem and fall through, returning null.
            e.printStackTrace();
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing the stream fails.
            }
        }
        return p.getProperty(key);
    }
}
配置文件:config.properties
- #配置索引路径
- indexPath=d:/LuceneIndex/LuceneTest02/
运行结果:
这花费了 1672毫秒来把数据增加到索引里面去!
共匹配到:2个.
检索耗时: 109毫秒!
---------------
p_id:4
p_name:手表4
p_price:16.29956430691176元
p_content:<b>手表</b>的<b>描述</b><b>4</b>块
---------------
---------------
p_id:3
p_name:手表3
p_price:9.465650388124237元
p_content:<b>手表</b>的<b>描述</b><b>3</b>块
---------------
你需要导入几个Jar文件:
lucene-core-2.4.1.jar(Lucene核心包)
lucene-highlighter-2.4.0.jar(用于高亮显示的)
commons-logging.jar
paoding-analysis.jar(到网上搜索paoding-analysis-2.0.4-beta,解压后把Jar拷贝过来就OK,把dic文件夹拷贝到工程的根目录下,或者配置PAODING_DIC_HOME环境变量到dic目录下)