import java.io.IOException;
import junit.framework.TestCase;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
/**
 * Demonstrates adding, deleting, and updating documents in a Lucene index,
 * and how Field.Store / Field.Index options affect searchability.
 * @author liang
 *
 */
public class IndexingTest extends TestCase {
    /** Unique document ids; indexed un-analyzed so they can serve as exact-match keys for delete/update. */
    protected String[] ids = {"1", "2"};
    /** Stored but NOT indexed ("country") — retrievable, never searchable. */
    protected String[] unindexed = {"Netherlands", "Italy"};
    /** Indexed (analyzed) but NOT stored ("contents") — searchable by token, not retrievable. */
    protected String[] unstored = {"Amsterdam has lots of bridges",
            "Venice has lots of canals"
    };
    /** Both indexed (analyzed) and stored ("city"). */
    protected String[] text = {"Amsterdam", "Venice"};
    private Directory directory;

    /**
     * Builds a fresh two-document in-memory index before each test.
     *
     * @throws Exception if index creation fails
     */
    protected void setUp() throws Exception {
        directory = new RAMDirectory();
        IndexWriter writer = getWriter();
        try {
            for (int i = 0; i < ids.length; i++) {
                Document doc = new Document();
                // "id" must be indexed but NOT analyzed so a TermQuery can match it exactly.
                doc.add(new Field("id", ids[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("country", unindexed[i], Field.Store.YES,
                        Field.Index.NO));
                doc.add(new Field("contents", unstored[i], Field.Store.NO,
                        Field.Index.ANALYZED));
                doc.add(new Field("city", text[i], Field.Store.YES,
                        Field.Index.ANALYZED));
                writer.addDocument(doc);
            }
        } finally {
            writer.close(); // always release the writer's lock, even if addDocument throws
        }
    }

    /**
     * Opens an IndexWriter on the test directory using a whitespace analyzer
     * and no per-field token limit.
     *
     * @return a new writer; the caller is responsible for closing it
     * @throws IOException if the directory cannot be opened
     */
    private IndexWriter getWriter() throws IOException {
        return new IndexWriter(directory, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
    }

    /**
     * Runs a single-term query against the test index and returns the total hit count.
     *
     * @param fieldName    field to search
     * @param searchString exact term to look up (not analyzed)
     * @return number of matching documents
     * @throws IOException if the search fails
     */
    protected int getHitCount(String fieldName, String searchString) throws IOException {
        IndexSearcher searcher = new IndexSearcher(directory);
        try {
            Query query = new TermQuery(new Term(fieldName, searchString));
            // Only the count matters, so asking for a single top doc is enough.
            return searcher.search(query, 1).totalHits;
        } finally {
            searcher.close(); // release the searcher even if search() throws
        }
    }

    /** Verifies the writer sees both documents added in setUp(). */
    public void testIndexWriter() throws IOException {
        IndexWriter writer = getWriter();
        try {
            assertEquals(ids.length, writer.numDocs());
        } finally {
            writer.close(); // original leaked this writer
        }
    }

    /** Verifies a reader sees both documents and none are deleted. */
    public void testIndexReader() throws IOException {
        IndexReader reader = IndexReader.open(directory);
        try {
            assertEquals(ids.length, reader.maxDoc());  // includes deleted docs
            assertEquals(ids.length, reader.numDocs()); // excludes deleted docs
        } finally {
            reader.close();
        }
    }

    /**
     * Before segments are merged, a deleted document is only marked deleted:
     * maxDoc() still counts it, numDocs() does not.
     */
    public void testDeleteBeforeIndexMerge() throws IOException {
        IndexWriter writer = getWriter();
        try {
            assertEquals(2, writer.numDocs());
            writer.deleteDocuments(new Term("id", "1"));
            writer.commit();
            assertTrue(writer.hasDeletions());   // index contains deletions
            assertEquals(2, writer.maxDoc());    // deleted doc still occupies a slot
            assertEquals(1, writer.numDocs());   // 1 live document, 1 deleted document
        } finally {
            writer.close();
        }
    }

    /**
     * After optimize() merges segments, deleted documents are physically
     * removed: hasDeletions() is false and maxDoc() drops to the live count.
     */
    public void testDeleteAfterIndexMerge() throws IOException {
        IndexWriter writer = getWriter();
        try {
            assertEquals(2, writer.numDocs());
            writer.deleteDocuments(new Term("id", "1"));
            writer.optimize(); // merge segments, purging deleted docs
            writer.commit();
            assertFalse(writer.hasDeletions());
            assertEquals(1, writer.maxDoc());  // 1 indexed document, 0 deleted documents
            assertEquals(1, writer.numDocs());
        } finally {
            writer.close();
        }
    }

    /**
     * Verifies Field.Index.NO fields are unsearchable while analyzed fields
     * are searchable by individual token.
     */
    public void testFieldIndex() throws IOException {
        // "country" was added with Field.Index.NO — a term query finds nothing.
        assertEquals(0, getHitCount("country", "Italy"));
        // "contents" was analyzed — each token (e.g. "bridges") is searchable.
        assertEquals(1, getHitCount("contents", "bridges"));
    }

    /**
     * updateDocument() deletes every document matching the term, then adds the
     * replacement — equivalent to deleteDocuments() followed by addDocument().
     */
    public void testUpdate() throws IOException {
        assertEquals(1, getHitCount("city", "Amsterdam"));
        IndexWriter writer = getWriter();
        try {
            Document doc = new Document();
            doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add(new Field("country", "Netherlands", Field.Store.YES,
                    Field.Index.NO));
            doc.add(new Field("contents", "Amsterdam has lots of bridges",
                    Field.Store.NO, Field.Index.ANALYZED));
            doc.add(new Field("city", "Haag", Field.Store.YES, Field.Index.ANALYZED));
            writer.updateDocument(new Term("id", "1"), doc);
        } finally {
            writer.close();
        }
        assertEquals(0, getHitCount("city", "Amsterdam")); // old doc replaced
        assertEquals(1, getHitCount("city", "Haag"));
    }
}
将文档从index中删除
- deleteDocuments(Term) 删除所有包含term的document
- deleteDocuments(Query) 删除匹配query的document
- deleteDocuments(Query[]) 删除匹配任意其中一个query的document
如果要删除一个文档,必须保证每个文档都建立了索引而且对应field的值是唯一的那个field需要indexed而且
un-analyzed(索引而且不分词)
maxDoc() 返回所有的文档总数(删除的还有没删除的)
numDocs() 只返回没有删除的文档数
hasDeletions() 用来判断索引中是否有删除的文档
调用optimize()后,将移去已经删除的文档,
更新index中的文档
lucene不能只更新文档中只改变的部分,它删除旧的文档然后添加新的文档相当于先调用deleteDocument然后调用addDocument
- updateDocument(Term,Document) 首先删除所有包含该term的文档,然后添加一个新的文档,利用indexWriter的默认analyzer
- updateDocument(Term,Document,Analyzer) 同上,只不过是指定使用的analyzer
Field options
- Field.Index
Field.Index.ANALYZED:
field的值会被analyzer进行处理。将field的值进行分词,每个分词都能被搜索到
常用于文本域(如body,title等)
Field.Index.NOT_ANALYZED:
field的值在index前不会被analyzer处理。即整个field的值当作一个分词,用来搜索
常用于希望被搜索但是不希望被分词
Field.Index.ANALYZED_NO_NORMS:
Field.Index.NOT_ANALYZED_NO_NORMS:
Field.Index.NO:没有建立索引,即不能根据该field查找内容
- Field.Store
Field.Store.YES 存储该值
Field.Store.NO 不存储该值
- Field options for term vectors
一般用于亮色显示和查看相似文档
term vectors 是indexed field和stored field的混合体
term vectors 首先被document id指出,然后是term,就像对每个文档再来个倒排索引
TermVector.YES:记录遇到的唯一的terms及其在每个文档中的数量,不记录位置和偏移信息
TermVector.WITH_POSITIONS:记录唯一的terms和数量,还记录每个term的位置,不记录偏移
TermVector.WITH_OFFSETS:记录唯一的terms和数量,还记录偏移,不记录位置
TermVector.WITH_POSITIONS_OFFSETS:位置和偏移都记录
TermVector.NO 不记录任何term vector information
只有索引该field才能索引term vector
Field(String name,Reader value,TermVector vector)
默认的选项值:Store.NO, Index.ANALYZED
Field(String name,TokenStream tokenStream,TermVector termVector)
默认的选项值:Store.NO ,Index.ANALYZED
Field(String name,byte[] value,Store store)
默认的选项值:Index.NO ,TermVector.NO,Store参数必须为Store.YES