lucene简单应用--多字段内容检索

最新推荐文章于 2021-03-13 14:06:50 发布

zxw394

最新推荐文章于 2021-03-13 14:06:50 发布

阅读量638

点赞数

文章标签： lucene

package lucene;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

public class HelloWord {
    /**
     * Location of the on-disk index. Indexing and searching must use exactly
     * the same path; a single constant avoids the two drifting apart (for
     * example in letter case, which matters on case-sensitive file systems).
     */
    private static final String INDEX_PATH = "c:" + File.separator + "index";

    /**
     * Builds the index: converts one hard-coded {@code Article} into a Lucene
     * {@code Document} and adds it to the index stored at {@link #INDEX_PATH}.
     *
     * <p>I/O failures are reported on standard error and otherwise ignored, so
     * this method never throws. The writer and the directory are always
     * closed, even when indexing fails.
     *
     * <p>Note: the document is appended with {@code addDocument}, so every
     * call adds another copy of the same article to an existing index.
     */
    public static void createIndexFile() {
        IndexWriter indexWriter = null;
        Directory directory = null;
        try {
            // Analyzer used to tokenize the field values.
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
            // Writer configuration bound to the Lucene version in use.
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(
                    Version.LUCENE_36, analyzer);
            // File-system directory that holds the index files.
            directory = new SimpleFSDirectory(new File(INDEX_PATH));
            indexWriter = new IndexWriter(directory, indexWriterConfig);
            // The entity to be indexed.
            Article article = new Article(121, "梅西", "XX2");
            // One Document per entity; each property becomes a stored, analyzed Field.
            Document doc = new Document();
            doc.add(new Field("id", article.getId().toString(), Store.YES, Index.ANALYZED));
            doc.add(new Field("title", article.getTitle().toString(), Store.YES, Index.ANALYZED));
            doc.add(new Field("content", article.getContent().toString(), Store.YES, Index.ANALYZED));
            // Add the document to the index.
            indexWriter.addDocument(doc);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the writer first so pending changes are committed and the
            // write lock is released, then release the directory itself.
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Searches the index at {@link #INDEX_PATH} for a hard-coded term across
     * the {@code title}, {@code id} and {@code content} fields and prints the
     * hit count, the raw {@code ScoreDoc} array, each document number and each
     * matching article.
     *
     * <p>The analyzer used for parsing the query must be the same kind as the
     * one used when the index was built. The searcher, reader and directory
     * are closed before the method returns, whether or not the search
     * succeeds.
     *
     * @throws Exception if the index cannot be opened or read, or if the
     *         query string cannot be parsed
     */
    public static void searchIndexFileResult() throws Exception {
        List<Article> articles = new ArrayList<Article>();
        // Open the directory that holds the index.
        Directory directory = new SimpleFSDirectory(new File(INDEX_PATH));
        try {
            // Open a reader on that directory.
            IndexReader indexReader = IndexReader.open(directory);
            try {
                // A searcher built from a reader does not close that reader,
                // so both are closed explicitly below.
                IndexSearcher searcher = new IndexSearcher(indexReader);
                try {
                    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
                    // Fields to search.
                    String[] strArray = new String[]{"title", "id", "content"};
                    // SHOULD means "or": the result is the union of the matches in the three fields.
                    Occur[] occArray = new Occur[]{
                        BooleanClause.Occur.SHOULD,
                        BooleanClause.Occur.SHOULD,
                        BooleanClause.Occur.SHOULD};
                    // Text to search for.
                    String searchStr = "121";
                    // Build one query spanning all fields.
                    Query query = MultiFieldQueryParser.parse(
                            Version.LUCENE_36, searchStr, strArray, occArray, analyzer);
                    // Second argument caps the number of hits; TopDocs wraps the search result.
                    TopDocs docs = searcher.search(query, 100);
                    System.out.println(docs.totalHits);
                    // The matching documents with their scores.
                    ScoreDoc[] scores = docs.scoreDocs;
                    System.out.println(scores.length);
                    // Shows the ScoreDoc layout, e.g. doc=1 score=1.8472979 shardIndex=-1
                    System.out.println(Arrays.toString(scores));
                    for (int i = 0; i < scores.length; i++) {
                        // Every hit carries an internal document number.
                        int docc = scores[i].doc;
                        System.out.println(docc);
                        // Load the stored fields of that document.
                        Document doc = searcher.doc(docc);
                        Article article = new Article();
                        article.setId(Integer.parseInt(doc.get("id")));
                        article.setTitle(doc.get("title"));
                        article.setContent(doc.get("content"));
                        articles.add(article);
                    }
                } finally {
                    searcher.close();
                }
            } finally {
                indexReader.close();
            }
        } finally {
            directory.close();
        }
        for (Article article : articles) {
            System.out.println(article.toString());
        }
    }

    /**
     * Demo entry point: builds the index, then queries it.
     *
     * @param args unused
     * @throws Exception if the search step fails
     */
    public static void main(String[] args) throws Exception {
        // Build the index for the sample data.
        createIndexFile();
        // Query the index; the results are printed and could be handed to the application.
        searchIndexFileResult();
    }
}

以上是我测试的源代码，在c盘上创建一个index文件夹后就可以用了，注释都写了！

其中Article类如下：

package lucene;

/**
 * Plain data holder for an article with an id, a title and a content text.
 */
public class Article {
	private Integer id;
	private String title;
	private String content;
	
	/** Creates an article with all fields unset. */
	public Article(){
		
	}
	/**
	 * Creates an article with the given values.
	 *
	 * @param id the article id
	 * @param title the article title
	 * @param content the article content
	 */
	public Article(Integer id,String title,String content){
		this.id=id;
		this.title=title;
		this.content = content;
	}
        // setters and getters ... omitted

补充：

说下Lucene检索的整个流程

1、建立索引的执行过程

在建立索引时，先要把文档存到索引库中，还要更新词汇表。

操作步骤如下：

（1）、把数据对象转换成相应的Document，其中的属性转为Field；

（2）、调用工具IndexWriter的addDocument(doc)，把Document添加到索引库中；

（3）、Lucene做的操作：

把文档存到索引库中，并自动指定一个内部编号，用来唯一标识这条数据；内部编号类似于这条数据的地址，在索引库内部的数据进行调整后，这个编号就可能会改变，同时词汇表中引用的编号也会做相应的改变，以保证正确。

更新词汇表。把文本中的词找出来放到词汇表中，建立与文档的对应关系。要把哪些词放到词汇表中呢？这就用到一个叫Analyzer（分词器）的工具。它的作用是按照规则把一段文本中所包含的所有词取出来。对应的是Analyzer类，这是一个抽象类，切分词的具体规则由其子类实现。

在把对象的属性转化为 Field时，相关代码为：

doc.add(new Field("title", article.getTitle(), Store.YES, Index.ANALYZED))

其中第三个参数的意思为

Store.NO 不存储属性的值；

Store.YES 存储属性的值

第四个参数

Index.NO 不建立索引

Index.ANALYZED 分词后建立索引

Index.NOT_ANALYZED 不分词，把整个内容作为一个词建立索引

Store影响的是搜索出的结果中是否包含指定属性的原始内容。

Index是影响是否可以从这个属性中查询，或者是查询时可以查其中的某些词，还是要把整个内容作为一个词进行查询。

2、从索引库中搜索的执行过程（QueryParser、TopDocs、ScoreDoc）

在进行搜索时，先在词汇表中查找，得到符合条件的文档编号列表。再根据文档编号真正的取数据（Document）

操作步骤如下：

（1）、把要查询的字符串转为Query对象。这就像在Hibernate中使用HQL查询时，也要先调用Session.createQuery(hql)转成Hibernate的Query对象一样。把查询字符串转换成Query是使用QueryParser（单字段），或者使用MultiFieldQueryParser（多字段）。查询字符串也要先经过Analyzer（分词器）。要求检索时使用的Analyzer要与建立索引时使用的Analyzer一致，否则可能搜索不出正确的结果。

（2）、调用IndexSearcher.search()，进行查询，得到结果。此方法返回值为TopDocs，是包含结果的多个信息的一个对象。其中有totalHits代表记录数，以及ScoreDoc的数组。ScoreDoc是代表一个结果的相关度得分与文档编号等信息的对象。

（3）、取出要用到的数据列表。调用IndexSearcher.doc(scoreDoc.doc)以取出指定编号对应的Document数据，在分页时要用到：一次只取一页的数据。

参考：http://yufenfei.iteye.com/blog/1751103