lucene学习(2)

import java.io.IOException;

import junit.framework.TestCase;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;


/**
 * adding documents to an index
 * @author liang
 *
 */
/**
 * Demonstrates adding, counting, deleting, and updating documents in a
 * Lucene index held in an in-memory {@link RAMDirectory}.
 *
 * @author liang
 */
public class IndexingTest extends TestCase {

	protected String[] ids = {"1","2"};
	protected String[] unindexed = {"Netherlands","Italy"};
	protected String[] unstored = {"Amsterdam has lots of bridges",
			"Venice has lots of canals"
	};
	protected String[] text = {"Amsterdam","Venice"};
	private Directory directory;

	/**
	 * Builds a fresh two-document index before each test. Each document
	 * exercises the main Field.Store/Field.Index combinations:
	 * id (stored, not analyzed), country (stored, NOT indexed),
	 * contents (indexed, not stored), city (stored and analyzed).
	 */
	protected void setUp() throws Exception {
		directory = new RAMDirectory();

		IndexWriter writer = getWriter();
		try {
			for (int i = 0; i < ids.length; i++) {
				Document doc = new Document();
				doc.add(new Field("id", ids[i],
						Field.Store.YES, Field.Index.NOT_ANALYZED));
				doc.add(new Field("country", unindexed[i],
						Field.Store.YES, Field.Index.NO));
				doc.add(new Field("contents", unstored[i],
						Field.Store.NO, Field.Index.ANALYZED));
				doc.add(new Field("city", text[i],
						Field.Store.YES, Field.Index.ANALYZED));
				writer.addDocument(doc);
			}
		} finally {
			writer.close(); // always release the write lock, even on failure
		}
	}

	/** Creates an IndexWriter over the test directory using a WhitespaceAnalyzer. */
	private IndexWriter getWriter() throws IOException {
		return new IndexWriter(directory, new WhitespaceAnalyzer(),
				IndexWriter.MaxFieldLength.UNLIMITED);
	}

	/**
	 * Returns the number of documents matching a single-term query on the
	 * given field.
	 *
	 * @param fieldName    field to query
	 * @param searchString exact term to look up (not analyzed)
	 * @return total number of matching documents
	 */
	protected int getHitCount(String fieldName, String searchString)
			throws IOException {
		IndexSearcher searcher = new IndexSearcher(directory);
		try {
			Query query = new TermQuery(new Term(fieldName, searchString));
			// totalHits counts every match even though only 1 doc is fetched
			return searcher.search(query, 1).totalHits;
		} finally {
			searcher.close();
		}
	}

	/** numDocs() on a writer reflects the documents added in setUp(). */
	public void testIndexWriter() throws IOException {
		IndexWriter writer = getWriter();
		assertEquals(ids.length, writer.numDocs());
		writer.close(); // was leaked in the original version
	}

	/** After setUp (no deletions yet), maxDoc() and numDocs() agree. */
	public void testIndexReader() throws IOException {
		IndexReader reader = IndexReader.open(directory);
		assertEquals(ids.length, reader.maxDoc());   // includes deleted docs
		assertEquals(ids.length, reader.numDocs());  // live docs only
		reader.close();
	}

	/**
	 * Before segments are merged, a deleted document is only marked deleted:
	 * maxDoc() still counts it while numDocs() does not.
	 */
	public void testDeleteBeforeIndexMerge() throws IOException {
		IndexWriter writer = getWriter();
		assertEquals(2, writer.numDocs());
		writer.deleteDocuments(new Term("id", "1"));
		writer.commit();
		assertTrue(writer.hasDeletions());  // index now contains a deletion
		assertEquals(2, writer.maxDoc());   // 1 live document + 1 deleted
		assertEquals(1, writer.numDocs());
		writer.close();
	}

	/**
	 * optimize() merges segments and physically purges deleted documents,
	 * so afterwards maxDoc() equals numDocs() again.
	 */
	public void testDeleteAfterIndexMerge() throws IOException {
		IndexWriter writer = getWriter();
		assertEquals(2, writer.numDocs());
		writer.deleteDocuments(new Term("id", "1"));
		writer.optimize();
		writer.commit();
		assertFalse(writer.hasDeletions());
		assertEquals(1, writer.maxDoc());   // 1 live document, 0 deleted
		assertEquals(1, writer.numDocs());
		writer.close();
	}

	/**
	 * Field.Index.NO fields are unsearchable; Field.Index.ANALYZED fields
	 * are searchable by their individual tokens.
	 * (Uses the local getHitCount helper; the original referenced an
	 * undefined TestUtil class.)
	 */
	public void testFieldIndex() throws IOException {
		// "country" was added with Field.Index.NO, so the term is not findable
		assertEquals(0, getHitCount("country", "Italy"));
		// "contents" was analyzed, so the single token "bridges" matches
		assertEquals(1, getHitCount("contents", "bridges"));
	}

	/**
	 * updateDocument() deletes every document matching the term and then adds
	 * the replacement — equivalent to deleteDocuments() + addDocument().
	 */
	public void testUpdate() throws IOException {
		assertEquals(1, getHitCount("city", "Amsterdam"));
		IndexWriter writer = getWriter();

		Document doc = new Document();
		doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED));
		doc.add(new Field("country", "Netherlands", Field.Store.YES,
				Field.Index.NO));
		doc.add(new Field("contents", "Amsterdam has lots of bridges",
				Field.Store.NO, Field.Index.ANALYZED));
		doc.add(new Field("city", "Haag", Field.Store.YES, Field.Index.ANALYZED));
		writer.updateDocument(new Term("id", "1"), doc);
		writer.close();

		// the old city value no longer matches; the replacement does
		assertEquals(0, getHitCount("city", "Amsterdam"));
		assertEquals(1, getHitCount("city", "Haag"));
	}
}

将文档从index中删除

 

  • deleteDocuments(Term) 删除所有包含term的document
  • deleteDocuments(Query) 删除匹配query的document
  • deleteDocuments(Query[]) 删除匹配任意其中一个query的document

如果要删除一个文档,必须保证每个文档都为某个field建立了索引,而且该field的值是唯一的。该field需要indexed而且

un-analyzed(建立索引但不分词)

 

maxDoc() 返回所有的文档总数(删除的还有没删除的)
numDocs() 只返回没有删除的文档数

hasDeletions() 用来判断索引中是否有删除的文档

调用optimize()后,将移去已经删除的文档。


更新index中的文档
lucene不能只更新文档中改变的部分,它会删除旧的文档,然后添加新的文档,相当于先调用deleteDocument再调用addDocument

 

  • updateDocument(Term,Document) 首先删除所有包含该term的文档,然后添加一个新的文档,利用indexWriter的默认analyzer
  • updateDocument(Term,Document,Analyzer) 同上,只不过是指明analyzer


Field options

  • Field.Index
    Field.Index.ANALYZED:
    field的值会被analyzer进行处理。将field的值进行分词,每个分词都能被搜索到
    常用于文本域(如body,title等)
    Field.Index.NOT_ANALYZED:
    field的值在index前不会被analyzer处理。即整个field的值当作一个分词,用来搜索
    常用于希望被搜索但是不希望被分词
    Field.Index.ANALYZED_NO_NORMS:
    Field.Index.NOT_ANALYZED_NO_NORMS:
    Field.Index.NO:没有建立索引,即不能根据该field查找内容

 

  • Field.Store
    Field.Store.YES 存储该值
    Field.Store.NO 不存储该值

 

  • Field options for term vectors
    一般用于高亮显示和查看相似文档
    term vectors 是indexed field和stored field的混合体
    term vectors 首先被document id指出,然后是term,就像对每个文档再来个倒排索引

TermVector.YES:记录遇到的唯一的terms,以及它们在每个文档中出现的数量,不记录位置和偏移信息
TermVector.WITH_POSITIONS:记录唯一的terms和数量,还记录每个term的位置,不记录偏移

TermVector.WITH_OFFSETS:记录唯一的terms和数量,还记录偏移,不记录位置
TermVector.WITH_POSITIONS_OFFSETS:位置和偏移都记录

TermVector.NO 不记录任何term vector information

  

  只有索引该field才能索引term vector

 

 

Field(String name,Reader value,TermVector vector)
默认的选项值:Store.NO, Index.ANALYZED

Field(String name,TokenStream tokenStream,TermVector termVector)
默认的选项值:Store.NO ,Index.ANALYZED

Field(String name,byte[] value,Store store)
默认的选项值:Index.NO ,TermVector.NO,Store参数必须为Store.YES

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值