lucene compass 学习系列笔记一，纯lucene 构建搜索

最新推荐文章于 2024-09-19 16:47:21 发布

jixiuffff

最新推荐文章于 2024-09-19 16:47:21 发布

阅读量2k

点赞数

文章标签： lucene query dictionary 数据库 c insert

本文链接：https://blog.csdn.net/jixiuffff/article/details/5526605

版权

一：下载lucene 以下版本， lucene-3.0.1 及lucene-2.9.2

3.0.1 版本与以前的版本有很大的差异，而 paoding-analysis-2.0.4-beta（庖丁中文分词器）目前还只能配合 lucene 2.* 版本使用，故两种版本都下载，进行对照学习

lucene-analyzers-2.9.2.jar

lucene-core-2.9.2.jar

lucene-highlighter-2.9.2.jar

将以上三个文件放到classpath 下就可以了

下面这个文件来自庖丁中文分词器，需要单独下载 paoding-analysis-2.0.4-beta

paoding-analysis.jar

（关于中文分词器）

如一句话 "人之所以痛苦，在于追求错误的东西"

庖丁会分成 : 之所以痛苦在于追求错误东西

lucene 自带的分词器会按单个汉字切分：人 / 之 / 所 / 以 / 痛 / 苦 / 在 / 于 / 追 / 求 / 错 / 误 / 的 / 东 / 西

一般中文都用庖丁

=====================================================================

将庖丁加入到项目中来

1 paoding-analysis.jar 加到路径

2 copy /paoding-analysis-2.0.4-beta/dic/ 词典到项目根路径，dic与src 平级

3 copy paoding-analysis-2.0.4-beta/src/paoding-dic-home.properties 到src 下并修改为

#values are "system-env" or "this"; 可以是 this 和 system-env 两个值
#if value is "this" , using the paoding.dic.home as dicHome if configed! 如果配置成this 就用下面配的paoding.dic.home 路径作为词典路径

paoding.dic.home.config-fisrt=this

#dictionary home (directory)
#"classpath:xxx" means dictionary home is in classpath.
#e.g "classpath:dic" means dictionaries are in "classes/dic" directory or any other classpath directory
paoding.dic.home=dic

#seconds for dic modification detection
paoding.dic.detector.interval=60

======================================================================

import java.io.File;
import java.io.IOException;
import java.util.List;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.jixiuf.compass.pojo.Book;
import org.junit.Test;

public class Test1 {

// Directory where Lucene stores the index (it may be a file-system directory or held in memory).
    Directory dir;

    /**
     * Opens the file-system index directory shared by every test.
     *
     * <p>The index lives in the relative directory {@code target}, which is created when it
     * does not exist yet.
     *
     * @throws IllegalStateException if the directory cannot be opened; failing here prevents
     *         {@code dir} from silently staying {@code null} and surfacing later as an
     *         unrelated NullPointerException inside each test
     */
    @org.junit.Before
    public void setUp() {
        File f = new File("target");
        if (!f.exists()) {
            f.mkdirs();
        }
        try {
            dir = FSDirectory.open(f);
        } catch (IOException e) {
            // Keep the cause so the real I/O problem is visible in the test report.
            throw new IllegalStateException(
                    "Cannot open index directory: " + f.getAbsolutePath(), e);
        }
    }

//将一本书的 id name author 添加到索引中

注意这里的几个类

IndexWriter ，向索引目录中写入数据，

Document (相当于数据库中的一条记录， )

Field （相当于数据库的一个字段） new Field (name , value);

此方法做的事相当于

insert into book (id, name, author) values ('00001', '围城', '钱钟书');

如果想把多个Book 存进去，多建几个Document

    /**
     * Indexes one {@link Book} as a single Lucene document with the fields
     * {@code id}, {@code name} and {@code author}.
     *
     * <p>{@code id} is stored verbatim (not analyzed); {@code name} and {@code author} are
     * stored and run through the Paoding analyzer. The writer is always closed so that its
     * write lock is released for the writers opened by the other tests.
     */
    @Test
    public void testSave() {

        Book b = new Book();
        b.setId("00001");
        b.setName("english ");
        b.setAuthor("jack");
        try {
            // create == true: start a fresh index in dir.
            IndexWriter out = new IndexWriter(dir, new PaodingAnalyzer(), true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                Field idF = new Field("id", b.getId(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED);
                Field nameF = new Field("name", b.getName(), Field.Store.YES,
                        Field.Index.ANALYZED);
                Field authorF = new Field("author", b.getAuthor(), Field.Store.YES,
                        Field.Index.ANALYZED);
                Document d = new Document();
                d.add(idF);
                d.add(nameF);
                d.add(authorF);

                out.addDocument(d);

                out.commit();
            } finally {
                // Release the writer even when indexing fails.
                out.close();
            }

        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (LockObtainFailedException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

===========================
    /**
     * Prints every stored field of every live document in the index, one
     * {@code name:===value} line per field.
     *
     * <p>Document numbers are walked up to {@code maxDoc()} and deleted slots are skipped.
     * Bounding the loop by {@code numDocs()} and reading every number, as before, hits deleted
     * documents and misses live ones once the index contains deletions.
     */
    @Test
    public void testRead() {
        try {
            IndexReader reader = IndexReader.open(dir);
            try {
                int maxDoc = reader.maxDoc();
                for (int i = 0; i < maxDoc; i++) {
                    if (reader.isDeleted(i)) {
                        // Deleted slots cannot be loaded; skip them.
                        continue;
                    }
                    // Each Document is comparable to one row of a database table.
                    Document d = reader.document(i);
                    // The stored fields of the document, comparable to the row's columns.
                    List<Fieldable> fields = d.getFields();
                    for (Fieldable f : fields) {
                        if (f instanceof Field) {
                            System.out.println(f.name() + ":===" + f.stringValue());
                        }
                    }
                }
            } finally {
                reader.close();
            }
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

//search

    /**
     * Runs a term query for {@code name == "english"} and prints the total hit count followed
     * by the {@code name} and {@code id} of each returned hit (at most 10).
     *
     * <p>Hits are resolved through {@code ScoreDoc.doc}. The loop index is not a document
     * number, and {@code totalHits} may exceed the number of hits actually returned.
     */
    @Test
    public void testSearch() {
        try {
            IndexSearcher sc = new IndexSearcher(dir);
            try {
                // Find documents whose "name" field contains the term "english".
                Query query = new TermQuery(new Term("name", "english"));
                TopDocs docs = sc.search(query, 10);

                System.out.println(docs.totalHits);
                for (ScoreDoc hit : docs.scoreDocs) {
                    Document d = sc.doc(hit.doc);
                    System.out.println(d.get("name"));
                    System.out.println(d.get("id"));
                }
            } finally {
                sc.close();
            }

        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();

        }
    }

    /**
     * Parses the query string {@code id:00002} with a {@link QueryParser} (default field
     * {@code id}) and prints the hit count plus every field value of the top 10 hits.
     *
     * <p>The searcher is closed when the test finishes, whether or not the search succeeds.
     */
    @Test
    public void testSearch2() {
        try {
            IndexSearcher sc = new IndexSearcher(dir);
            try {
                QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "id",
                        new StandardAnalyzer(Version.LUCENE_CURRENT));
                // Matches documents whose "id" field is 00002.
                Query q = parser.parse("id:00002");
                TopScoreDocCollector c = TopScoreDocCollector.create(10, true);
                sc.search(q, c);

                System.out.println("共有数据document的条数：" + c.getTotalHits());

                ScoreDoc[] docs = c.topDocs().scoreDocs;
                for (ScoreDoc doc : docs) {
                    System.out.println("第" + doc.doc + "条");

                    Document d = sc.doc(doc.doc);
                    List<Fieldable> fs = d.getFields();
                    for (Fieldable f : fs) {
                        System.out.println("============");
                        System.out.println(f.stringValue());
                    }

                }
            } finally {
                sc.close();
            }

        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();

        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Deletes every document whose {@code id} term equals {@code 00002} and commits the change.
     *
     * <p>The writer is closed in a {@code finally} block so a failing delete or commit does
     * not leave the index write lock held.
     */
    @Test
    public void testDel() {
        try {
            IndexWriter out = new IndexWriter(dir, new PaodingAnalyzer(),
                    MaxFieldLength.LIMITED);
            try {
                out.deleteDocuments(new Term("id", "00002"));

                out.commit();
            } finally {
                out.close();
            }
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (LockObtainFailedException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    /**
     * Term search: looks up the exact term {@code 围城} in the {@code name} field using a
     * read-only searcher, then prints the hit count and every field value of the top 10 hits.
     *
     * <p>The searcher is closed when the test finishes, whether or not the search succeeds.
     */
    @Test
    public void testTermSearch() {
        try {
            IndexSearcher sc = new IndexSearcher(dir, true);
            try {
                Term t = new Term("name", "围城");
                TopScoreDocCollector c = TopScoreDocCollector.create(10, true);
                sc.search(new TermQuery(t), c);
                int count = c.getTotalHits();
                System.out.println(count);
                ScoreDoc[] docs = c.topDocs().scoreDocs;
                for (ScoreDoc doc : docs) {
                    System.out.println("第" + doc.doc + "条");

                    Document d = sc.doc(doc.doc);
                    List<Fieldable> fs = d.getFields();
                    for (Fieldable f : fs) {
                        System.out.println("============");
                        System.out.println(f.stringValue());
                    }

                }
            } finally {
                sc.close();
            }

        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();

        }
    }

    /**
     * Boolean search: {@code name} must contain the term {@code 围城}, while a matching
     * {@code id} of {@code 00001} is optional. Prints the total hit count and then each of
     * the top 10 matching documents.
     *
     * <p>Documents are loaded through the collected {@code ScoreDoc.doc} ids. The loop index
     * is not a document number, and the total hit count may exceed the 10 collected hits.
     */
    @Test
    public void testBoolean() {
        try {
            IndexSearcher sc = new IndexSearcher(dir);
            try {
                TopScoreDocCollector c = TopScoreDocCollector.create(10, true);
                BooleanQuery b = new BooleanQuery();
                b.add(new TermQuery(new Term("name", "围城")), Occur.MUST);
                b.add(new TermQuery(new Term("id", "00001")), Occur.SHOULD);
                sc.search(b, c);
                int count = c.getTotalHits();
                System.out.println(count);
                for (ScoreDoc hit : c.topDocs().scoreDocs) {
                    Document d = sc.doc(hit.doc);
                    System.out.println(d);
                }
            } finally {
                sc.close();
            }

        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();

        }
    }

    /**
     * Term-range search: adds two documents whose {@code char} field is {@code b} and
     * {@code a}, then queries the inclusive range {@code [a, z]} on that field and prints the
     * query, the hit count and the top 10 matching documents.
     *
     * <p>The writer is closed before searching. Leaving it open, as before, keeps the index
     * write lock held and makes later writers fail to obtain it.
     */
    @Test
    public void testTermRange1() {

        try {
            // create == false: append to the existing index.
            IndexWriter out = new IndexWriter(dir, new PaodingAnalyzer(),
                    false, IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                Field b = new Field("char", "b", Store.YES, Index.NOT_ANALYZED);
                Document d = new Document();

                d.add(b);

                out.addDocument(d);
                d = new Document();

                Field a = new Field("char", "a", Store.YES, Index.NOT_ANALYZED);
                d.add(a);
                out.addDocument(d);
                out.commit();
            } finally {
                out.close();
            }

            IndexSearcher sc = new IndexSearcher(dir);
            try {
                TopScoreDocCollector c = TopScoreDocCollector.create(10, true);
                TermRangeQuery tq = new TermRangeQuery("char", "a", "z", true, true);
                System.out.println(tq.toString());
                sc.search(tq, c);
                int count = c.getTotalHits();
                System.out.println(count);
                for (ScoreDoc dc : c.topDocs().scoreDocs) {
                    System.out.println(sc.doc(dc.doc));
                }
            } finally {
                sc.close();
            }

        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();

        }
    }

    /**
     * Prefix search: adds three documents whose {@code prefix} field is {@code ab},
     * {@code abc} and {@code aabc}, then queries for terms starting with {@code ab} and prints
     * the query, the hit count and the {@code prefix} value of the top 10 hits.
     *
     * <p>Writer and searcher are both closed in {@code finally} blocks so a failure cannot
     * leave the write lock or index files held.
     */
    @Test
    public void testPrefix() {

        try {
            // create == false: append to the existing index.
            IndexWriter out = new IndexWriter(dir, new PaodingAnalyzer(),
                    false, IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                Document d = new Document();
                d.add(new Field("prefix", "ab", Store.YES, Index.NOT_ANALYZED));
                out.addDocument(d);

                d = new Document();
                d.add(new Field("prefix", "abc", Store.YES, Index.NOT_ANALYZED));
                out.addDocument(d);

                d = new Document();
                d.add(new Field("prefix", "aabc", Store.YES, Index.NOT_ANALYZED));
                out.addDocument(d);

                out.commit();
            } finally {
                out.close();
            }

            IndexSearcher sc = new IndexSearcher(dir, true);
            try {
                TopScoreDocCollector c = TopScoreDocCollector.create(10, true);
                PrefixQuery q = new PrefixQuery(new Term("prefix", "ab"));
                System.out.println(q.toString());
                sc.search(q, c);
                int count = c.getTotalHits();

                System.out.println(count);
                for (ScoreDoc doc : c.topDocs().scoreDocs) {
                    System.out.println(sc.doc(doc.doc).getField("prefix")
                            .stringValue());
                }
            } finally {
                sc.close();
            }

        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();

        }
    }

    /**
     * Phrase search: looks for the terms {@code 我们} and {@code 孩子} in the {@code name}
     * field with a slop of 2, then prints the query, the hit count, the {@code name} value of
     * the top 10 hits and the score explanation for each of them.
     *
     * <p>The searcher is closed when the test finishes, whether or not the search succeeds.
     */
    @Test
    public void testPhrase() {

        try {

            IndexSearcher sc = new IndexSearcher(dir, true);
            try {
                TopScoreDocCollector c = TopScoreDocCollector.create(10, true);
                PhraseQuery q = new PhraseQuery();

                q.add(new Term("name", "我们"));
                q.add(new Term("name", "孩子"));
                // Slop 2: the two terms may be separated by up to two unrelated terms.
                q.setSlop(2);

                System.out.println(q.toString());
                sc.search(q, c);
                int count = c.getTotalHits();
                System.out.println(count);
                for (ScoreDoc doc : c.topDocs().scoreDocs) {
                    System.out.println(sc.doc(doc.doc).getField("name")
                            .stringValue());
                    System.out.println(sc.explain(q, doc.doc).toString());
                }
            } finally {
                sc.close();
            }

        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();

        }
    }
}