全文检索工具 Lucene 之索引创建方法

最新推荐文章于 2024-07-28 15:46:11 发布

weixin_34056162

最新推荐文章于 2024-07-28 15:46:11 发布

阅读量79

点赞数

文章标签： python java

原文链接：https://my.oschina.net/MrBamboo/blog/822467

版权

2019独角兽企业重金招聘Python工程师标准>>>

这里创建索引使用的文档内容是类似“吃饭 买菜 洗车”这样的词语,词语之间使用空格隔开。

下面是创建索引和获取 searcher 对象的类:

package com.xxxxx.util.lucene;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

/**
 * Helpers for building a Lucene index on disk and for handing out
 * {@link IndexSearcher} instances over that index.
 *
 * <p>The {@link Directory} and the {@link IndexReader} are kept in static fields
 * and shared by every caller, because creating and destroying them is expensive.
 *
 * <p>NOTE(review): access to the shared reader is not synchronized, and a refresh
 * in {@link #getSearcher()} closes the previous reader, so searchers handed out
 * earlier presumably stop working after a refresh -- confirm this class is only
 * used from a single thread.
 *
 * @author CCC
 */
public class IndexUtil {
	// Creating and destroying the directory and the reader is expensive,
	// so a single shared instance of each is kept.
	private static Directory directory = null;
	private static IndexReader reader = null;

	static {
		try {
			// Location on disk where the index files are stored.
			directory = FSDirectory.open(new File("C:/youcai/lucene/index03/"));
		} catch (IOException e) {
			// Previously swallowed silently; report it so a bad path is diagnosable.
			e.printStackTrace();
		}
		if (directory != null) {
			try {
				reader = IndexReader.open(directory);
			} catch (IOException e) {
				// Presumably the index has not been created yet. This is not fatal:
				// getSearcher() retries opening the reader on demand.
				System.err.println("Index reader not opened at start-up, will retry lazily: " + e);
			}
		}
	}

	/**
	 * Returns a searcher over the shared reader, opening the reader on first use
	 * and replacing it with a fresh one when the index has changed.
	 *
	 * @return a searcher over the current index, or {@code null} when the
	 *         directory or the reader could not be opened
	 */
	public static IndexSearcher getSearcher(){
		if (directory == null) {
			// The directory failed to open in the static initializer.
			return null;
		}
		try {
			if (reader == null) {
				reader = IndexReader.open(directory);
			} else {
				IndexReader refreshed = IndexReader.openIfChanged(reader);
				if (refreshed != null) {
					// Publish the new reader before closing the old one, so a failing
					// close() cannot leave the shared field pointing at a closed reader.
					IndexReader stale = reader;
					reader = refreshed;
					stale.close();
				}
			}
			return new IndexSearcher(reader);
		} catch (IOException e) {
			// CorruptIndexException is an IOException as well; both were handled identically.
			e.printStackTrace();
		}
		return null;
	}

	/**
	 * Prints the document counts of the shared reader to standard output:
	 * live documents, the maximum document number and deleted documents.
	 * Prints a notice instead when no reader has been opened yet.
	 */
	public void query(){
		IndexReader current = reader;
		if (current == null) {
			System.out.println("索引尚未打开,无法查询文档数量");
			return;
		}
		System.out.println("文档数量" + current.numDocs());
		System.out.println("文档最大数量" + current.maxDoc());
		System.out.println("删除的文档数量" + current.numDeletedDocs());
	}

	/**
	 * Indexes every regular file directly inside {@code path}; sub-directories
	 * are skipped.
	 *
	 * @param path directory containing the documents to index
	 * @param hasNew {@code true} to delete all existing index entries before
	 *               indexing (use when the documents were updated)
	 * @throws IllegalArgumentException if {@code path} is {@code null} or is not a
	 *         listable directory; the existing index is left untouched in that case
	 */
	public void createIndex(String path,boolean hasNew){
		// Validate the input before touching the index: otherwise a bad path combined
		// with hasNew == true would wipe the index and then fail.
		File[] files = (path == null) ? null : new File(path).listFiles();
		if (files == null) {
			throw new IllegalArgumentException("Not a readable directory: " + path);
		}

		IndexWriter writer = null;
		try {
			// The corpus is already segmented, so the whitespace analyzer is used directly.
			writer = new IndexWriter(directory,new IndexWriterConfig(Version.LUCENE_35, new WhitespaceAnalyzer(Version.LUCENE_35)));
			// When the documents were updated, drop the whole index and rebuild it.
			if (hasNew) {
				writer.deleteAll();
			}
			for (File f : files) {
				if (!f.isFile()) {
					// A directory cannot be opened with FileReader and would abort the run.
					continue;
				}
				indexFile(writer, f);
			}
		} catch (IOException e) {
			// Covers CorruptIndexException and LockObtainFailedException too.
			e.printStackTrace();
		} finally {
			if (writer != null) {
				try {
					writer.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/** Adds one document describing {@code f} (content, name, modification time, size) to the index. */
	private static void indexFile(IndexWriter writer, File f) throws IOException {
		FileReader content = new FileReader(f);
		try {
			Document doc = new Document();
			// File content, read through the reader.
			doc.add(new Field("content",content));
			// File name, stored and indexed without analysis.
			// Note from the original author: consider boosting the
			// "other expenses - other" category.
			doc.add(new Field("fileName",f.getName(),Field.Store.YES,Field.Index.NOT_ANALYZED));
			// Last-modified time; the third argument explicitly turns indexing on for the numeric field.
			doc.add(new NumericField("updateTime",Field.Store.YES,true).setLongValue(f.lastModified()));
			// File size in bytes.
			// NOTE(review): the int cast truncates sizes above Integer.MAX_VALUE; kept as an
			// int field so existing queries against "size" keep working.
			doc.add(new NumericField("size",Field.Store.YES,true).setIntValue((int)f.length()));

			writer.addDocument(doc);
		} finally {
			// Make sure the file handle is released even when adding the document fails.
			content.close();
		}
	}
}

转载于:https://my.oschina.net/MrBamboo/blog/822467