Lucene学习笔记(二)

最新推荐文章于 2023-02-15 14:47:08 发布

phinecos

最新推荐文章于 2023-02-15 14:47:08 发布

阅读量673

点赞数

分类专栏： Java Search Engine 文章标签： lucene string query

本文链接：https://blog.csdn.net/phinecos/article/details/4612334

版权

Java 同时被 2 个专栏收录

122 篇文章 0 订阅

订阅专栏

Search Engine

15 篇文章 0 订阅

订阅专栏

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.SimpleAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import junit.framework.TestCase;

public class BaseIndexTestCase extends TestCase

{

protected String[] keywords = {"1", "2"};

protected String[] unindexed = {"Netherlands", "Italy"};

protected String[] unstored = {"Amsterdam has lots of bridges", "Venice has lots of canals"};

protected String[] text = {"Amsterdam", "Venice"};

protected Directory dir;

protected void setUp() throws IOException {

String indexDir =

System.getProperty("java.io.tmpdir", "tmp") +

System.getProperty("file.separator") + "index-dir";

dir = FSDirectory.getDirectory(indexDir, true);

addDocuments(dir);

}

protected void addDocuments(Directory dir)

throws IOException {

IndexWriter writer = new IndexWriter(dir, getAnalyzer(), true);

writer.setUseCompoundFile(isCompound());

for (int i = 0; i < keywords.length; i++)

{

Document doc = new Document();

doc.add(new Field("id",keywords[i],Field.Store.YES,Field.Index.UN_TOKENIZED));

doc.add(new Field("country",unindexed[i],Field.Store.YES,Field.Index.NO));

doc.add(new Field("contents",unstored[i],Field.Store.NO,Field.Index.TOKENIZED));

doc.add(new Field("city",text[i],Field.Store.YES,Field.Index.TOKENIZED));

writer.addDocument(doc);

}

writer.optimize();

writer.close();

}

protected Analyzer getAnalyzer()

{

return new SimpleAnalyzer();

}

protected boolean isCompound()

{

return true;

}

public void testIndexWriter() throws IOException

{

IndexWriter writer = new IndexWriter(dir,this.getAnalyzer(),false);

assertEquals(keywords.length,writer.docCount());

writer.close();

}

public void testIndexReader() throws IOException

{

IndexReader reader = IndexReader.open(dir);

assertEquals(keywords.length, reader.maxDoc());

assertEquals(keywords.length, reader.numDocs());

reader.close();

}

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.WhitespaceAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.Term;

import org.apache.lucene.search.Hits;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.TermQuery;

public class DocumentDeleteTest extends BaseIndexTestCase

{

public void testDeleteBeforeIndexMerge() throws IOException

{

assertEquals(1, getHitCount("city", "Amsterdam"));

IndexReader reader = IndexReader.open(dir);

assertEquals(2, reader.maxDoc());

assertEquals(2, reader.numDocs());

reader.deleteDocument(1);

assertTrue(reader.isDeleted(1));

assertTrue(reader.hasDeletions());

assertEquals(2, reader.maxDoc());

assertEquals(1, reader.numDocs());

reader.close();

reader = IndexReader.open(dir);

assertEquals(2, reader.maxDoc());

assertEquals(1, reader.numDocs());

reader.close();

}

public void testDeleteAfterIndexMerge() throws IOException

{

IndexReader reader = IndexReader.open(dir);

assertEquals(2, reader.maxDoc());

assertEquals(2, reader.numDocs());

reader.deleteDocument(1);

reader.close();

IndexWriter writer = new IndexWriter(dir, getAnalyzer(),false);

writer.optimize();

writer.close();

reader = IndexReader.open(dir);

assertFalse(reader.isDeleted(1));

assertFalse(reader.hasDeletions());

assertEquals(1, reader.maxDoc());

assertEquals(1, reader.numDocs());

reader.close();

}

private int getHitCount(String fieldName, String searchString)

throws IOException {

IndexSearcher searcher = new IndexSearcher(dir);

Term t = new Term(fieldName, searchString);

Query query = new TermQuery(t);

Hits hits = searcher.search(query);

int hitCount = hits.length();

searcher.close();

return hitCount;

}

protected Analyzer getAnalyzer() {

return new WhitespaceAnalyzer();

}

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.WhitespaceAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.Term;

import org.apache.lucene.search.Hits;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.TermQuery;

public class DocumentUpdateTest extends BaseIndexTestCase

{

public void testUpdate() throws IOException

{

assertEquals(1, getHitCount("city", "Amsterdam"));

IndexReader reader = IndexReader.open(dir);

reader.deleteDocuments(new Term("city", "Amsterdam"));

reader.close();

IndexWriter writer = new IndexWriter(dir, getAnalyzer(),

false);

Document doc = new Document();

doc.add(new Field("id","1",Field.Store.YES,Field.Index.UN_TOKENIZED));

doc.add(new Field("country","Russia",Field.Store.YES,Field.Index.NO));

doc.add(new Field("contents","St. Petersburg has lots of bridges",Field.Store.NO,Field.Index.TOKENIZED));

doc.add(new Field("city","St. Petersburg",Field.Store.YES,Field.Index.TOKENIZED));

writer.addDocument(doc);

writer.optimize();

writer.close();

assertEquals(0, getHitCount("city", "Amsterdam"));

assertEquals(1, getHitCount("city", "Petersburg"));

}

protected Analyzer getAnalyzer() {

return new WhitespaceAnalyzer();

}

private int getHitCount(String fieldName, String searchString)

throws IOException {

IndexSearcher searcher = new IndexSearcher(dir);

Term t = new Term(fieldName, searchString);

Query query = new TermQuery(t);

Hits hits = searcher.search(query);

int hitCount = hits.length();

searcher.close();

return hitCount;

}

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.SimpleAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

public class IndexTuningDemo

{

public static void main(String[] args) throws Exception {

int docsInIndex = Integer.parseInt(args[0]);

// create an index called 'index-dir' in a temp directory

Directory dir = FSDirectory.getDirectory(

System.getProperty("java.io.tmpdir", "tmp") +

System.getProperty("file.separator") + "index-dir", true);

Analyzer analyzer = new SimpleAnalyzer();

IndexWriter writer = new IndexWriter(dir, analyzer, true);

// set variables that affect speed of indexing

writer.setMergeFactor(Integer.parseInt(args[1]));

writer.setMaxMergeDocs(Integer.parseInt(args[2]));

writer.setInfoStream(System.out);

writer.setMaxBufferedDocs(Integer.parseInt(args[3]));

System.out.println("Merge factor: " + writer.getMergeFactor());

System.out.println("Max merge docs: " + writer.getMaxMergeDocs());

System.out.println("Min merge docs: " + writer.getMaxBufferedDocs());

long start = System.currentTimeMillis();

for (int i = 0; i < docsInIndex; i++) {

Document doc = new Document();

doc.add(new Field("fieldname", "Bibamus", Field.Store.YES,Field.Index.TOKENIZED));

writer.addDocument(doc);

}

writer.close();

long stop = System.currentTimeMillis();

System.out.println("Time: " + (stop - start) + " ms");

}

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.SimpleAnalyzer;

import junit.framework.TestCase;
import java.io.IOException;
import java.util.Collection;
import java.util.ArrayList;
import java.util.Iterator;

/**
 * Compares the time needed to index the same documents into a
 * {@code RAMDirectory} and into an {@code FSDirectory}.
 */
public class FSversusRAMDirectoryTest extends TestCase
{
   private Directory fsDir;
   private Directory ramDir;
   // Documents to index, generated once when the test instance is created.
   private Collection docs = loadDocuments(3000, 5);

   /**
    * Creates an in-memory directory and a file-system directory named
    * "fs-index" under the JVM temp directory (falling back to "tmp" when
    * java.io.tmpdir is unset).
    *
    * @throws Exception if the file-system directory cannot be opened
    */
   protected void setUp() throws Exception
  {
    // The property keys must match exactly; keys padded with spaces are
    // never found, which yields the default value or null.
    String fsIndexDir = System.getProperty("java.io.tmpdir", "tmp")
        + System.getProperty("file.separator") + "fs-index";
    ramDir = new RAMDirectory(); // in-memory directory
    fsDir = FSDirectory.getDirectory(fsIndexDir, true);
  }

   /**
    * Indexes the documents into both directories and asserts that the
    * file-system run took longer than the in-memory run.
    *
    * @throws IOException if indexing fails
    */
   public void testTiming() throws IOException
  {
     long ramTiming = timeIndexWriter(ramDir);
     long fsTiming = timeIndexWriter(fsDir);

    // Print before asserting so the numbers are visible when the check fails.
    System.out.println(" RAMDirectory Time: " + (ramTiming) + " ms ");
    System.out.println(" FSDirectory Time : " + (fsTiming) + " ms ");

    // NOTE(review): a wall-clock comparison may be sensitive to machine load
    // and disk caching - confirm it is stable in the target environment.
    assertTrue("expected FSDirectory (" + fsTiming + " ms) to be slower than RAMDirectory ("
        + ramTiming + " ms)", fsTiming > ramTiming);
  }

   /**
    * Measures how long {@link #addDocuments(Directory)} takes.
    *
    * @param dir directory to index into
    * @return elapsed wall-clock time in milliseconds
    * @throws IOException if indexing fails
    */
   private long timeIndexWriter(Directory dir) throws IOException
  {
     long start = System.currentTimeMillis();
    addDocuments(dir);
     long stop = System.currentTimeMillis();
     return (stop - start);
  }

   /**
    * Writes one four-field document per entry of {@code docs} into the given
    * directory, then optimizes the index.
    *
    * @param dir directory to index into
    * @throws IOException if writing the index fails
    */
   private void addDocuments(Directory dir) throws IOException
  {
    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
    try
    {
      // To experiment with FSDirectory indexing performance, adjust the
      // writer here, e.g. via setMergeFactor, setMaxMergeDocs or
      // setMaxBufferedDocs.

      for (Iterator iter = docs.iterator(); iter.hasNext();)
      {
        Document doc = new Document();
        String word = (String) iter.next();
        doc.add(new Field(" keyword ", word, Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.add(new Field(" unindexed ", word, Field.Store.YES, Field.Index.NO));
        doc.add(new Field(" unstored ", word, Field.Store.NO, Field.Index.TOKENIZED));
        doc.add(new Field(" text ", word, Field.Store.YES, Field.Index.TOKENIZED));
        writer.addDocument(doc);
      }
      writer.optimize();
    }
    finally
    {
      // Close the writer even when indexing fails.
      writer.close();
    }
  }

   /**
    * Generates the document texts to index.
    *
    * @param numDocs     number of texts to generate
    * @param wordsPerDoc number of times the word is repeated in each text
    * @return a collection of {@code numDocs} strings
    */
   private Collection loadDocuments(int numDocs, int wordsPerDoc)
  {
    String word = " Bibamus ";
    Collection docs = new ArrayList(numDocs);
    for (int i = 0; i < numDocs; i++)
    {
      // StringBuffer capacity is counted in characters, not words.
      StringBuffer doc = new StringBuffer(wordsPerDoc * word.length());
      for (int j = 0; j < wordsPerDoc; j++)
      {
        doc.append(word);
      }
      docs.add(doc.toString());
    }
    return docs;
  }
}

phinecos

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Lucene学习笔记(二)

import java.io.IOException;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.SimpleAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.
复制链接

扫一扫