lucene实现全文检索的示例代码

最新推荐文章于 2020-11-19 23:46:45 发布

flyingaway12

最新推荐文章于 2020-11-19 23:46:45 发布

阅读量470

点赞数

分类专栏： linux java

java 同时被 2 个专栏收录

6 篇文章 0 订阅

订阅专栏

linux

2 篇文章 0 订阅

订阅专栏

全文搜索在一般的 web 应用系统中都有这样的需求。lucene 是一个用 Java 写的全文索引引擎工具包，我们用的 Eclipse 就是用它实现全文搜索的。关于 lucene 的工作原理，网络上的资料一抓一大把，下面这个链接介绍得很全：http://www.chedong.com/tech/lucene.html
我最近在一个项目中用到 lucene 实现全文搜索，开始在网上也查了不少资料，但能完全跑起来的例子没有几个。我把自己的实现拿出来，就当是给自己做个笔记，也希望刚用 lucene 的朋友不要走太多的弯路。我是对新闻文章进行搜索，先看代码：

/**
 * Persistent bean describing one news article; it is the unit that gets
 * indexed and returned by the search helpers in this article.
 *
 * Only the fields are shown here. The setter/getter methods (for example
 * getUrl(), getTitle(), getContentText(), setUrl(...)) are omitted from this
 * listing to save space, but the other classes rely on them.
 */
public class Article{
    // Fields
     // Identifier of the article (presumably the database primary key -- confirm).
     private Integer articleId;
     // Column/category the article belongs to; ColumnSort is declared elsewhere in the project.
     private ColumnSort columnSort;
     // Article title; stored in the Lucene index under the field name "title".
     private String title;
     // Actual address of the article; stored in the index under "url" and used as the delete key.
     private String url;
     // Article body (presumably the original markup form -- confirm); not indexed by the code shown here.
     private String content;
     // Article body text that is stored in the index under "contentText" and searched by default.
     private String contentText;
     // Creator and creation time (semantics assumed from the names -- confirm).
     private String createMan;
     private Date createTime;
     // Publisher and publication time (semantics assumed from the names -- confirm).
     private String releaseMan;
     private Date releaseTime;
     // Status code, defaults to "0"; the meaning of the individual codes is not visible here.
     private String status="0";
     // Name of the template associated with the article (usage not visible here).
     private String templateName;
     // Free-form remark.
     private String memo;
}

上面列出了文章的相关属性，为了节省篇幅，省去了 setter/getter 方法。
下面的 ArticleIndexUtil 类负责创建索引、删除索引中的记录、恢复已删除的记录，以及判断索引是否存在。

import java.io.File;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import com.kemp.video.article.Article;

/**
 * Helpers that maintain the Lucene index of {@link Article}s: add an article,
 * delete an article by url, restore deleted entries and test whether an index
 * already exists.
 *
 * All methods catch {@link IOException}, log it and return normally; callers
 * are not notified of failures other than through the log.
 */
public class ArticleIndexUtil {
    private static final Log log = LogFactory.getLog(ArticleIndexUtil.class);

    /**
     * Adds one article to the index located in {@code indexDir}.
     *
     * The index is created when it does not exist yet, otherwise the document
     * is appended to it. The fields "url", "title" and "contentText" are stored.
     *
     * @param indexDir path of the index directory
     * @param article  article to index; its url, title and contentText are
     *                 passed straight to Lucene (presumably they must be
     *                 non-null -- confirm against the Lucene version in use)
     */
    public static void createArticleIndex(String indexDir,Article article){
        Document doc1 = new Document();
        // "url" is the unique key deleteArticleIndex() looks up with an exact Term,
        // so it must be indexed as ONE un-analyzed term. With TOKENIZED the
        // StandardAnalyzer splits the url into pieces and the exact Term never
        // matches, i.e. nothing would ever be deleted.
        // Indexes built with the old tokenized "url" field need to be rebuilt.
        doc1.add(new Field("url", article.getUrl(), Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc1.add(new Field("title", article.getTitle(), Field.Store.YES, Field.Index.TOKENIZED));
        doc1.add(new Field("contentText", article.getContentText(), Field.Store.YES, Field.Index.TOKENIZED));

        IndexWriter writer = null;
        try {
            // The second argument of getDirectory() must stay false in both cases:
            // true would wipe the existing index files on every call, so the index
            // would only ever contain the last document written.
            if (indexExist(indexDir)) {
                // Index already present: open it for appending.
                writer = new IndexWriter(FSDirectory.getDirectory(indexDir, false), new StandardAnalyzer(), false);
            } else {
                // No index yet: create a new one.
                writer = new IndexWriter(FSDirectory.getDirectory(indexDir, false), new StandardAnalyzer(), true);
            }
            writer.setMaxFieldLength(5000);
            writer.addDocument(doc1);
            writer.optimize();
        } catch (IOException e) {
            log.error(e.getMessage(), e);
        } finally {
            // Always close the writer, otherwise a failure above leaves the
            // write lock behind and every later writer fails to open the index.
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    log.error("Exception closing IndexWriter in createArticleIndex: " + e.getMessage(), e);
                }
            }
        }
    }

    /**
     * Removes from the index every document whose "url" field equals
     * {@code url}. The index files themselves are kept.
     *
     * @param indexDir path of the index directory
     * @param url      url of the article to remove; converted with
     *                 {@link String#valueOf(Object)}, so {@code null} is looked
     *                 up as the literal text "null"
     */
    public static void deleteArticleIndex(String indexDir,String url)
    {
        Directory directory = null;
        IndexReader reader = null;
        try {
            directory = FSDirectory.getDirectory(new File(indexDir), false);
            reader = IndexReader.open(directory);
            // NOTE(review): forcibly removes any existing lock on the index.
            // Kept from the original; this is only safe when no other writer
            // is using the index at the same time.
            IndexReader.unlock(directory);
            reader.deleteDocuments(new Term("url", String.valueOf(url)));
        } catch (IOException e) {
            log.error("Exception in deleteArticleIndex: " + e.getMessage(), e);
        } finally {
            // Reader first (this commits the deletion), then the directory.
            closeReader(reader, "deleteArticleIndex");
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException e) {
                    log.error("Exception closing Directory in deleteArticleIndex: " + e.getMessage(), e);
                }
            }
        }
    }

    /**
     * Restores all documents that were marked as deleted in the index.
     *
     * @param indexDir path of the index directory
     */
    public void unDeleteIndex(String indexDir) {
        IndexReader reader = null;
        try {
            reader = IndexReader.open(indexDir);
            reader.undeleteAll();
        } catch (IOException e) {
            log.error(" 捕捉 " + e.getClass() + "\n 错误信息: "+ e.getMessage(), e);
        } finally {
            closeReader(reader, "unDeleteIndex");
        }
    }

    /**
     * Tells whether a Lucene index exists in {@code indexDir}.
     *
     * @param indexDir path of the index directory
     * @return the result of {@code IndexReader.indexExists(indexDir)}
     */
    public static boolean indexExist(String indexDir)
    {
        return IndexReader.indexExists(indexDir);
    }

    /** Closes {@code reader} if it was opened, logging (not throwing) any failure. */
    private static void closeReader(IndexReader reader, String caller) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            log.error("Exception closing IndexReader in " + caller + ": " + e.getMessage(), e);
        }
    }
}

在创建索引的时候，我在 Document 中加入了文章的 url、title、contentText 字段。url 是这篇文章的实际地址，这样搜索出来就可以直接查看，不用再去查询数据库。
if(indexExist(indexDir)){
writer = new IndexWriter(FSDirectory.getDirectory(indexDir, false), new StandardAnalyzer(), false);
} else{
writer = new IndexWriter(FSDirectory.getDirectory(indexDir, false), new StandardAnalyzer(), true);
}在这里做这样的判断，很重要，当然我这里是采用的追加索引的方式，
FSDirectory.getDirectory(indexDir, false) 一定要注意第二个参数的意义：当它为 false 时，每次创建索引都不会删除已经存在的索引文件；否则已有的索引文件会被清掉，索引里永远只有最后一次写入的内容。当然这要看你采用的是追加的方式还是其他方式，我在这个问题上郁闷了好久。
删除索引在这里不是指删除整个索引文件。比如说我的一篇文章删除了，它在索引中对应的记录也应当被删除，跟删除数据库中的一条记录一样。我这里是用 url 作为唯一标识进行删除。
下面是搜索的代码：

import java.io.IOException;
import java.util.ArrayList;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

import com.kemp.video.article.Article;

/**
 * Search helpers that query the Lucene article index and turn the hits into
 * {@link Article} objects carrying url, title and a short content snippet.
 */
public class ArticleSearchUtil {

    private static final Log log = LogFactory.getLog(ArticleSearchUtil.class);

    /** Contents longer than this many characters are cut down to a snippet. */
    private static final int SNIPPET_LENGTH = 150;
    /** Characters kept before the keyword when the snippet is cut from the middle. */
    private static final int CONTEXT_BEFORE = 10;
    /** Characters kept from the keyword onwards when the snippet is cut from the middle. */
    private static final int CONTEXT_AFTER = 140;
    /** Marker appended to a shortened content. */
    private static final String ELLIPSIS = ".....";

    // NOTE(review): the keyword is meant to be displayed in red, but both
    // wrapper strings are empty in this listing (the markup was presumably lost
    // when the code was published). They are kept empty so the output does not
    // change; put the intended opening/closing markup here.
    private static final String HIGHLIGHT_PREFIX = "";
    private static final String HIGHLIGHT_SUFFIX = "";

    /**
     * Searches the "contentText" field of the index for {@code queryKey}.
     *
     * @param indexDir path of the index directory
     * @param queryKey query text in Lucene query syntax
     * @return list of {@link Article}; empty when nothing matched or the search failed
     */
    public static ArrayList searchArticle(String indexDir, String queryKey) {
        return ArticleSearchUtil.searchByKey(indexDir, "contentText", queryKey);
    }

    /**
     * Searches one field of the index.
     *
     * I/O and query-syntax errors are logged and an empty list is returned.
     * The searcher is always closed, also when parsing or searching fails;
     * an unclosed searcher gets in the way of later delete/create operations.
     *
     * @param indexDir path of the index directory
     * @param property name of the index field to search
     * @param queryKey query text in Lucene query syntax
     * @return list of {@link Article}; empty when nothing matched or the search failed
     */
    public static ArrayList searchByKey(String indexDir, String property,String queryKey) {
        ArrayList list = new ArrayList();
        IndexSearcher searcher = null;
        try {
            searcher = new IndexSearcher(indexDir);
            QueryParser qp = new QueryParser(property, new StandardAnalyzer());
            Query query = qp.parse(queryKey);
            Hits hits = searcher.search(query);
            // The hits must be read while the searcher is still open.
            list = getAticlesFormHits(hits, queryKey);
        } catch (IOException e) {
            log.error("printStackTrace in searchByKey " + e.getMessage(), e);
        } catch (ParseException e) {
            log.error("printStackTrace in ParseException " + e.getMessage(), e);
        } finally {
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (IOException e) {
                    log.error("Exception closing IndexSearcher in searchByKey: " + e.getMessage(), e);
                }
            }
        }
        return list;
    }

    /**
     * Converts search hits into {@link Article} objects.
     *
     * For every hit the stored "url", "title" and "contentText" fields are
     * read. The keyword is wrapped with the highlight markers and a content
     * longer than 150 characters is reduced to a 150-character snippet around
     * the first literal occurrence of the keyword. A hit that cannot be
     * loaded is logged and skipped.
     *
     * @param hits     hits returned by the searcher (which must still be open)
     * @param queryKey keyword to locate and highlight; matched literally
     * @return list of {@link Article}, one per readable hit
     */
    public static ArrayList getAticlesFormHits(Hits hits,String queryKey){
        ArrayList list = new ArrayList();
        for (int i = 0; i < hits.length(); i++) {
            try {
                Document doc = hits.doc(i);
                Article article = new Article();
                article.setUrl(doc.get("url"));
                article.setTitle(highlight(doc.get("title"), queryKey));
                article.setContentText(buildSnippet(doc.get("contentText"), queryKey));
                list.add(article);
            } catch (IOException e) {
                log.error("Exception reading hit " + i + " in getAticlesFormHits: " + e.getMessage(), e);
            }
        }
        return list;
    }

    /**
     * Cuts a long content down to a snippet that contains the keyword.
     * Short contents, and contents in which the keyword does not literally
     * occur, are returned in full.
     */
    private static String buildSnippet(String content, String queryKey) {
        if (content == null || queryKey == null) {
            return content;
        }
        int length = content.length();
        int index = content.indexOf(queryKey);
        if (length <= SNIPPET_LENGTH || index < 0) {
            return highlight(content, queryKey);
        }

        String snippet;
        if (index + queryKey.length() <= SNIPPET_LENGTH) {
            // The whole keyword lies inside the head of the text. Testing only
            // "index > 150" would cut the keyword off when it starts at or
            // straddles position 150.
            snippet = content.substring(0, SNIPPET_LENGTH);
        } else if (length - index < CONTEXT_AFTER) {
            // Keyword is close to the end: take the tail of the text.
            snippet = content.substring(length - SNIPPET_LENGTH, length);
        } else {
            // Keyword is in the middle: a little context before, the rest after.
            snippet = content.substring(Math.max(0, index - CONTEXT_BEFORE), index + CONTEXT_AFTER);
        }
        return highlight(snippet + ELLIPSIS, queryKey);
    }

    /**
     * Wraps every literal occurrence of the keyword with the highlight markers.
     * Uses String.replace, not replaceAll, so that regex metacharacters typed
     * by the user are not interpreted.
     */
    private static String highlight(String text, String queryKey) {
        if (text == null || queryKey == null || queryKey.length() == 0) {
            return text;
        }
        return text.replace(queryKey, HIGHLIGHT_PREFIX + queryKey + HIGHLIGHT_SUFFIX);
    }
}

主要看 public static ArrayList searchByKey(String indexDir, String property,String queryKey) 这个方法，其他两个可以不看。searcher.close(); 这句一定得有，不关闭的话，在你删除或者创建索引的时候会给你带来不必要的麻烦。
对于修改，做法是先删除、再创建，把这两个方法结合起来就可以了。