Lucene3.0 创建索引及多目录搜索

最新推荐文章于 2024-07-13 20:46:05 发布
tangtang
最新推荐文章于 2024-07-13 20:46:05 发布
阅读量340
点赞数
文章标签： lucene 文档 string 存储 token file
/**
 * 
 */
package com.test;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

/**
 * Lucene 3.0 sample: builds a file-system index of test documents and queries
 * it through a {@link MultiSearcher}, which can span several index directories.
 *
 * @author Administrator
 */
public class Test2 {

    /** Number of sample documents written by {@link #index()}. */
    private static final int DOC_COUNT = 100000;

    /** Maximum number of hits collected by {@link #search(String)}. */
    private static final int MAX_HITS = 100;

    /**
     * Creates a brand-new index (any existing index in the target directory is
     * overwritten) containing {@code DOC_COUNT} sample documents, each with a
     * stored {@code path}, an analyzed {@code content} and a non-analyzed
     * {@code id} field, then optimizes and closes the index and prints the
     * elapsed time.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws LockObtainFailedException if the index write lock cannot be obtained
     * @throws IOException on any other low-level I/O failure
     */
    public void index() throws CorruptIndexException,
            LockObtainFailedException, IOException {
        // Index directory.
        File indexDir = new File("D:/workspace/code/java/TestLucene3/index/txt/test/");
        // The analyzer used here must also be used at search time, otherwise
        // queries may not match what was indexed.
        Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);

        // Arguments: where the index lives (FSDirectory = on disk), the analyzer,
        // create=true (build a fresh index rather than append), and the
        // per-field token limit.
        IndexWriter indexWriter = new IndexWriter(FSDirectory.open(indexDir),
                luceneAnalyzer, true, IndexWriter.MaxFieldLength.LIMITED);

        long startTime;
        // try/finally so the writer (and its write lock) is released even when
        // adding documents or optimizing fails.
        try {
            // Merge factor: how many segments accumulate before they are merged.
            // Larger values merge less often during indexing.
            indexWriter.setMergeFactor(100);
            // Number of documents buffered in memory before a new segment is flushed.
            indexWriter.setMaxBufferedDocs(100);
            // Upper bound on the document count of a segment that may still be merged.
            indexWriter.setMaxMergeDocs(1000);
            // Pack each segment's files into a single compound (.cfs) file to
            // reduce the number of index files.
            indexWriter.setUseCompoundFile(true);

            startTime = System.currentTimeMillis();

            // Field options used below:
            //   Field.Store.YES          - keep the original value so it can be read back from a hit.
            //   Field.Index.NO           - value is not searchable.
            //   Field.Index.ANALYZED     - value is tokenized by the analyzer, then indexed.
            //   Field.Index.NOT_ANALYZED - value is indexed as one single term.
            //   Field.TermVector.WITH_POSITIONS_OFFSETS - also store term vectors
            //                              with token positions and offsets.
            // The three Field instances are created once and re-used for every
            // document; only their values change.
            Field fieldPath = new Field("path", "", Field.Store.YES, Field.Index.NO);
            Field fieldBody = new Field("content", "", Field.Store.YES,
                    Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
            Field fieldId = new Field("id", "", Field.Store.YES,
                    Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);

            // Test content; real code could read the text from files instead.
            String body = "王熙凤历幻返金陵　甄应嘉蒙恩还玉阙";

            // The counter is advanced only by the loop header, so every id in
            // [0, DOC_COUNT) is indexed.
            for (int i = 0; i < DOC_COUNT; i++) {
                fieldPath.setValue("D:\\workspace\\code\\java\\TestLucene3\\txt\\" + i + ".txt");
                fieldBody.setValue(body);
                fieldId.setValue(String.valueOf(i));

                Document document = new Document();
                document.add(fieldPath);
                document.add(fieldBody);
                document.add(fieldId);
                indexWriter.addDocument(document);
            }
            // Merge the index segments once all documents are added.
            indexWriter.optimize();
        } finally {
            indexWriter.close();
        }

        // Note: documents can later be removed with
        // IndexReader.deleteDocument(int) or IndexReader.deleteDocuments(Term).
        // Deleted documents are only marked as deleted until their segments
        // are merged (for example by IndexWriter.optimize()).

        long endTime = System.currentTimeMillis();
        System.out.println("\n这花费了" + (endTime - startTime) + " 毫秒增加到索引!");
    }

    /**
     * Searches the index in {@code indexDir} for documents whose {@code content}
     * matches "王熙凤" or whose {@code id} is "100", prints the stored fields of
     * up to {@code MAX_HITS} hits and returns them.
     *
     * @param indexDir path of the index directory to search
     * @return one map per hit (stored field name to its first stored value, in
     *         field order); empty when nothing matches, never {@code null}
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException on a low-level I/O failure
     * @throws ParseException if the query string cannot be parsed
     * @author <a href="mailto:gaoxuguo@feinno.com">Gao XuGuo</a> Nov 30, 2009
     *         2:56:42 PM
     */
    public List<Map<String, String>> search(String indexDir)
            throws CorruptIndexException, IOException, ParseException {
        String field = "content";

        QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field,
                new StandardAnalyzer(Version.LUCENE_CURRENT));
        Query contentQuery = parser.parse("content:王熙凤");
        Query idQuery = new TermQuery(new Term("id", "100"));

        // Either clause may match (logical OR).
        BooleanQuery bq = new BooleanQuery();
        bq.add(idQuery, Occur.SHOULD);
        bq.add(contentQuery, Occur.SHOULD);

        // Collect only the MAX_HITS best-scoring documents.
        TopScoreDocCollector collector = TopScoreDocCollector.create(MAX_HITS, true);
        List<Map<String, String>> results = new ArrayList<Map<String, String>>();

        long start = System.currentTimeMillis();
        long end;

        MultiSearcher ms = this.getMultiSearcher(new String[] { indexDir });
        try {
            ms.search(bq, collector);

            for (ScoreDoc sd : collector.topDocs().scoreDocs) {
                // Load the document through the same searcher that produced the
                // hit: MultiSearcher doc ids are only meaningful to it.
                Document doc = ms.doc(sd.doc);
                Map<String, String> row = new LinkedHashMap<String, String>();
                for (Fieldable fa : doc.getFields()) {
                    String value = doc.get(fa.name());
                    System.out.print(fa.name() + "=" + value + " ");
                    row.put(fa.name(), value);
                }
                System.out.println();
                results.add(row);
            }
            end = System.currentTimeMillis();
        } finally {
            // Closes the MultiSearcher together with its sub-searchers.
            ms.close();
        }

        // The elapsed time is measured in milliseconds.
        System.out.println("找到 " + collector.getTotalHits()
                + " 条数据，花费时间 " + (end - start)
                + " 毫秒");
        return results;
    }

    /**
     * Opens a read-only {@link IndexSearcher} for every directory and wraps
     * them in a {@link MultiSearcher}. The caller is responsible for closing
     * the returned searcher.
     *
     * @param dirs paths of the index directories to search
     * @return a searcher spanning all given directories
     * @throws IOException if any directory cannot be opened; searchers opened
     *         before the failure are closed again
     * @author <a href="mailto:gaoxuguo@feinno.com">Gao XuGuo</a> Jan 22, 2010
     *         3:44:16 PM
     */
    private MultiSearcher getMultiSearcher(String[] dirs) throws IOException {
        IndexSearcher[] searchers = new IndexSearcher[dirs.length];
        boolean success = false;
        try {
            for (int i = 0; i < dirs.length; i++) {
                searchers[i] = new IndexSearcher(
                        FSDirectory.open(new File(dirs[i])), true);
            }
            MultiSearcher multi = new MultiSearcher(searchers);
            success = true;
            return multi;
        } finally {
            if (!success) {
                // Do not leak the searchers that were opened before the failure.
                for (IndexSearcher opened : searchers) {
                    if (opened != null) {
                        try {
                            opened.close();
                        } catch (IOException ignored) {
                            // Best-effort cleanup; the original failure is the
                            // one that must reach the caller.
                        }
                    }
                }
            }
        }
    }
}
tangtang
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Lucene3.0 创建索引及多目录搜索

/** * */package com.test;import java.io.File;import java.io.IOException;import java.util.Date;import java.util.List;impor
复制链接

扫一扫