1、准备工作
下载lucene 3.6.1 : http://lucene.apache.org/
下载中文分词IK Analyzer: http://code.google.com/p/ik-analyzer/downloads/list (注意下载的是IK Analyzer 2012_u5_source.zip,其他版本有bug)
下载solr 3.6.1: http://lucene.apache.org/solr/(编译IK Analyzer时需引用包)
OK,将lucene 、solr 相关包(lucene-core-3.6.1.jar、lucene-highlighter-3.6.1.jar、lucene-analyzers-3.6.1.jar、apache-solr-core-3.6.1.jar、apache-solr-solrj-3.6.1.jar)拷贝到项目lib下,IK源码置于项目src下。
2、从Oracle数据库中取数据创建索引(使用IK分词)
003 | import org.apache.lucene.index.IndexWriter; |
004 | import org.apache.lucene.index.IndexWriterConfig; |
005 | import org.apache.lucene.index.CorruptIndexException; |
006 | import org.apache.lucene.store.FSDirectory; |
007 | import org.apache.lucene.store.Directory; |
008 | import org.apache.lucene.analysis.Analyzer; |
009 | import org.apache.lucene.analysis.standard.StandardAnalyzer; |
010 | import org.apache.lucene.util.Version; |
011 | import org.apache.lucene.document.Document; |
012 | import org.apache.lucene.document.Field; |
013 | import org.wltea.analyzer.lucene.IKAnalyzer; |
015 | import java.sql.Connection; |
017 | import java.io.IOException; |
018 | import java.util.ArrayList; |
019 | import java.util.Date; |
021 | import modules.gk.Gk_info; |
022 | import modules.gk.Gk_infoSub; |
023 | import web.sys.Globals; |
024 | import web.db.DBConnector; |
025 | import web.db.ObjectCtl; |
026 | import web.util.StringUtil; |
028 | public class LuceneIndex { |
029 | IndexWriter writer = null ; |
030 | FSDirectory dir = null ; |
031 | boolean create = true ; |
034 | long a1 = System.currentTimeMillis(); |
035 | System.out.println( "[Lucene 开始执行:" + new Date() + "]" ); |
036 | Connection con = DBConnector.getconecttion(); |
038 | final File docDir = new File(Globals.SYS_COM_CONFIG.get( "sys.index.path" ).toString()); |
039 | if (!docDir.exists()) { |
042 | String cr = Globals.SYS_COM_CONFIG.get( "sys.index.create" ).toString(); |
043 | if ( "false" .equals(cr.toLowerCase())) { |
046 | Directory dir = FSDirectory.open(docDir); |
048 | Analyzer analyzer = new IKAnalyzer( true ); |
049 | IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer); |
053 | iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); |
056 | iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); |
058 | IndexWriter writer = new IndexWriter(dir, iwc); |
059 | String sql = "SELECT indexno,title,describes,pdate,keywords FROM TABLEA WHERE STATE=1 AND SSTAG<>1 " ; |
060 | int rowCount = ObjectCtl.getRowCount(con, sql); |
061 | int pageSize = StringUtil.StringToInt(Globals.SYS_COM_CONFIG.get( "sys.index.size" ).toString()); |
062 | int pages = (rowCount - 1 ) / pageSize + 1 ; |
063 | ArrayList list = null ; |
064 | Gk_infoSub gk = null ; |
065 | for ( int i = 1 ; i < pages+ 1 ; i++) { |
066 | long a = System.currentTimeMillis(); |
067 | list = ObjectCtl.listPage(con, sql, i, pageSize, new Gk_infoSub()); |
068 | for ( int j = 0 ; j < list.size(); j++) { |
069 | gk = (Gk_infoSub) list.get(j); |
070 | Document doc = new Document(); |
071 | doc.add( new Field( "indexno" , StringUtil.null2String(gk.getIndexno()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); |
072 | doc.add( new Field( "title" , StringUtil.null2String(gk.getTitle()), Field.Store.YES, Field.Index.ANALYZED)); |
073 | doc.add( new Field( "describes" , StringUtil.null2String(gk.getDescribes()), Field.Store.YES, Field.Index.ANALYZED)); |
074 | doc.add( new Field( "pdate" , StringUtil.null2String(gk.getPdate()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); |
075 | doc.add( new Field( "keywords" , StringUtil.null2String(gk.getKeywords()), Field.Store.YES, Field.Index.ANALYZED)); |
076 | writer.addDocument(doc); |
077 | ObjectCtl.executeUpdateBySql(con, "UPDATE TABLEA SET SSTAG=1 WHERE indexno='" +gk.getIndexno()+ "'" ); |
080 | long b = System.currentTimeMillis(); |
082 | System.out.println( "[Lucene " + rowCount + "条," + pages + "页,第" + i + "页花费时间:" + c + "毫秒]" ); |
086 | } catch (Exception e) { |
089 | DBConnector.freecon(con); |
091 | if (writer != null ) { |
094 | } catch (CorruptIndexException e) { |
096 | } catch (IOException e) { |
100 | if (dir != null && IndexWriter.isLocked(dir)) { |
101 | IndexWriter.unlock(dir); |
103 | } catch (IOException e) { |
108 | long b1 = System.currentTimeMillis(); |
110 | System.out.println( "[Lucene 执行完毕,花费时间:" + c1 + "毫秒,完成时间:" + new Date() + "]" ); |
3、单字段查询以及多字段分页查询高亮显示
003 | import org.apache.lucene.store.FSDirectory; |
004 | import org.apache.lucene.store.Directory; |
005 | import org.apache.lucene.search.*; |
006 | import org.apache.lucene.search.highlight.SimpleHTMLFormatter; |
007 | import org.apache.lucene.search.highlight.Highlighter; |
008 | import org.apache.lucene.search.highlight.SimpleFragmenter; |
009 | import org.apache.lucene.search.highlight.QueryScorer; |
010 | import org.apache.lucene.queryParser.QueryParser; |
011 | import org.apache.lucene.queryParser.MultiFieldQueryParser; |
012 | import org.apache.lucene.analysis.TokenStream; |
013 | import org.apache.lucene.analysis.Analyzer; |
014 | import org.apache.lucene.analysis.KeywordAnalyzer; |
015 | import org.apache.lucene.document.Document; |
016 | import org.apache.lucene.index.IndexReader; |
017 | import org.apache.lucene.index.Term; |
018 | import org.apache.lucene.util.Version; |
019 | import modules.gk.Gk_infoSub; |
021 | import java.util.ArrayList; |
023 | import java.io.StringReader; |
024 | import java.lang.reflect.Constructor; |
026 | import web.util.StringUtil; |
027 | import web.sys.Globals; |
028 | import org.wltea.analyzer.lucene.IKAnalyzer; |
030 | public class LuceneQuery { |
031 | private static String indexPath; |
032 | private int rowCount; |
034 | private int currentPage; |
035 | private int pageSize; |
037 | public LuceneQuery() { |
038 | this .indexPath = Globals.SYS_COM_CONFIG.get( "sys.index.path" ).toString(); |
041 | public int getRowCount() { |
045 | public int getPages() { |
049 | public int getPageSize() { |
053 | public int getCurrentPage() { |
060 | public ArrayList queryIndexTitle(String keyWord, int curpage, int pageSize) { |
061 | ArrayList list = new ArrayList(); |
069 | this .pageSize = pageSize; |
070 | this .currentPage = curpage; |
071 | int start = (curpage - 1 ) * pageSize; |
072 | Directory dir = FSDirectory.open( new File(indexPath)); |
073 | IndexReader reader = IndexReader.open(dir); |
074 | IndexSearcher searcher = new IndexSearcher(reader); |
075 | Analyzer analyzer = new IKAnalyzer( true ); |
076 | QueryParser queryParser = new QueryParser(Version.LUCENE_36, "title" , analyzer); |
077 | queryParser.setDefaultOperator(QueryParser.AND_OPERATOR); |
078 | Query query = queryParser.parse(keyWord); |
079 | int hm = start + pageSize; |
080 | TopScoreDocCollector res = TopScoreDocCollector.create(hm, false ); |
081 | searcher.search(query, res); |
083 | SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter( "<span style='color:red'>" , "</span>" ); |
084 | Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query)); |
085 | this .rowCount = res.getTotalHits(); |
086 | this .pages = (rowCount - 1 ) / pageSize + 1 ; |
087 | TopDocs tds = res.topDocs(start, pageSize); |
088 | ScoreDoc[] sd = tds.scoreDocs; |
089 | for ( int i = 0 ; i < sd.length; i++) { |
090 | Document hitDoc = reader.document(sd[i].doc); |
091 | list.add(createObj(hitDoc, analyzer, highlighter)); |
094 | } catch (Exception e) { |
104 | public ArrayList queryIndexFields(String allkeyword, String onekeyword, String nokeyword, int curpage, int pageSize) { |
105 | ArrayList list = new ArrayList(); |
113 | this .pageSize = pageSize; |
114 | this .currentPage = curpage; |
115 | int start = (curpage - 1 ) * pageSize; |
116 | Directory dir = FSDirectory.open( new File(indexPath)); |
117 | IndexReader reader = IndexReader.open(dir); |
118 | IndexSearcher searcher = new IndexSearcher(reader); |
119 | BooleanQuery bQuery = new BooleanQuery(); |
120 | if (! "" .equals(allkeyword)) { |
121 | KeywordAnalyzer analyzer = new KeywordAnalyzer(); |
122 | BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD}; |
123 | Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, allkeyword, new String[]{ "title" , "describes" , "keywords" }, flags, analyzer); |
124 | bQuery.add(query, BooleanClause.Occur.MUST); |
126 | if (! "" .equals(onekeyword)) { |
127 | Analyzer analyzer = new IKAnalyzer( true ); |
128 | BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD}; |
129 | Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, onekeyword, new String[]{ "title" , "describes" , "keywords" }, flags, analyzer); |
130 | bQuery.add(query, BooleanClause.Occur.MUST); |
132 | if (! "" .equals(nokeyword)) { |
133 | Analyzer analyzer = new IKAnalyzer( true ); |
134 | BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD}; |
135 | Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, nokeyword, new String[]{ "title" , "describes" , "keywords" }, flags, analyzer); |
136 | bQuery.add(query, BooleanClause.Occur.MUST_NOT); |
139 | int hm = start + pageSize; |
140 | TopScoreDocCollector res = TopScoreDocCollector.create(hm, false ); |
141 | searcher.search(bQuery, res); |
142 | SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter( "<span style='color:red'>" , "</span>" ); |
143 | Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(bQuery)); |
144 | this .rowCount = res.getTotalHits(); |
145 | this .pages = (rowCount - 1 ) / pageSize + 1 ; |
146 | System.out.println( "rowCount:" + rowCount); |
147 | TopDocs tds = res.topDocs(start, pageSize); |
148 | ScoreDoc[] sd = tds.scoreDocs; |
149 | Analyzer analyzer = new IKAnalyzer(); |
150 | for ( int i = 0 ; i < sd.length; i++) { |
151 | Document hitDoc = reader.document(sd[i].doc); |
152 | list.add(createObj(hitDoc, analyzer, highlighter)); |
155 | } catch (Exception e) { |
167 | private synchronized static Object createObj(Document doc, Analyzer analyzer, Highlighter highlighter) { |
169 | Gk_infoSub gk = new Gk_infoSub(); |
173 | gk.setIndexno(StringUtil.null2String(doc.get( "indexno" ))); |
174 | gk.setPdate(StringUtil.null2String(doc.get( "pdate" ))); |
175 | String title = StringUtil.null2String(doc.get( "title" )); |
177 | if (! "" .equals(title)) { |
178 | highlighter.setTextFragmenter( new SimpleFragmenter(title.length())); |
179 | TokenStream tk = analyzer.tokenStream( "title" , new StringReader(title)); |
180 | String htext = StringUtil.null2String(highlighter.getBestFragment(tk, title)); |
181 | if (! "" .equals(htext)) { |
185 | String keywords = StringUtil.null2String(doc.get( "keywords" )); |
186 | gk.setKeywords(keywords); |
187 | if (! "" .equals(keywords)) { |
188 | highlighter.setTextFragmenter( new SimpleFragmenter(keywords.length())); |
189 | TokenStream tk = analyzer.tokenStream( "keywords" , new StringReader(keywords)); |
190 | String htext = StringUtil.null2String(highlighter.getBestFragment(tk, keywords)); |
191 | if (! "" .equals(htext)) { |
192 | gk.setKeywords(htext); |
195 | String describes = StringUtil.null2String(doc.get( "describes" )); |
196 | gk.setDescribes(describes); |
197 | if (! "" .equals(describes)) { |
198 | highlighter.setTextFragmenter( new SimpleFragmenter(describes.length())); |
199 | TokenStream tk = analyzer.tokenStream( "keywords" , new StringReader(describes)); |
200 | String htext = StringUtil.null2String(highlighter.getBestFragment(tk, describes)); |
201 | if (! "" .equals(htext)) { |
202 | gk.setDescribes(htext); |
209 | catch (Exception e) { |
220 | private synchronized static Object createObj(Document doc) { |
222 | Gk_infoSub gk = new Gk_infoSub(); |
226 | gk.setIndexno(StringUtil.null2String(doc.get( "indexno" ))); |
227 | gk.setPdate(StringUtil.null2String(doc.get( "pdate" ))); |
228 | gk.setTitle(StringUtil.null2String(doc.get( "title" ))); |
229 | gk.setKeywords(StringUtil.null2String(doc.get( "keywords" ))); |
230 | gk.setDescribes(StringUtil.null2String(doc.get( "describes" ))); |
234 | catch (Exception e) { |
单字段查询:
01 | long a = System.currentTimeMillis(); |
03 | int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get( "curpage" ))); |
04 | int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get( "pagesize" ))); |
05 | String title = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get( "title" ))); |
06 | LuceneQuery lu = new LuceneQuery(); |
07 | form.addResult( "list" , lu.queryIndexTitle(title, curpage, pagesize)); |
08 | form.addResult( "curPage" , lu.getCurrentPage()); |
09 | form.addResult( "pageSize" , lu.getPageSize()); |
10 | form.addResult( "rowCount" , lu.getRowCount()); |
11 | form.addResult( "pageCount" , lu.getPages()); |
12 | } catch (Exception e) { |
15 | long b = System.currentTimeMillis(); |
17 | System.out.println( "[搜索信息花费时间:" + c + "毫秒]" ); |
多字段查询:
01 | long a = System.currentTimeMillis(); |
03 | int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get( "curpage" ))); |
04 | int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get( "pagesize" ))); |
05 | String allkeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get( "allkeyword" ))); |
06 | String onekeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get( "onekeyword" ))); |
07 | String nokeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get( "nokeyword" ))); |
08 | LuceneQuery lu = new LuceneQuery(); |
09 | form.addResult( "list" , lu.queryIndexFields(allkeyword,onekeyword,nokeyword, curpage, pagesize)); |
10 | form.addResult( "curPage" , lu.getCurrentPage()); |
11 | form.addResult( "pageSize" , lu.getPageSize()); |
12 | form.addResult( "rowCount" , lu.getRowCount()); |
13 | form.addResult( "pageCount" , lu.getPages()); |
14 | } catch (Exception e) { |
17 | long b = System.currentTimeMillis(); |
19 | System.out.println( "[高级检索花费时间:" + c + "毫秒]" ); |
4、Lucene通配符查询
1 | BooleanQuery bQuery = new BooleanQuery(); |
2 | if (! "" .equals(title)) { |
3 | WildcardQuery w1 = new WildcardQuery( new Term( "title" , title+ "*" )); |
5 | bQuery.add(w1, BooleanClause.Occur.MUST); |
7 | int hm = start + pageSize; |
8 | TopScoreDocCollector res = TopScoreDocCollector.create(hm, false ); |
9 | searcher.search(bQuery, res); |
5、Lucene嵌套查询
实现SQL:(unitid like 'unitid%' and idml like 'id2%') or (tounitid like 'unitid%' and tomlid like 'id2%' and tostate=1)
01 | BooleanQuery bQuery = new BooleanQuery(); |
02 | BooleanQuery b1 = new BooleanQuery(); |
03 | WildcardQuery w1 = new WildcardQuery( new Term( "unitid" , unitid + "*" )); |
04 | WildcardQuery w2 = new WildcardQuery( new Term( "idml" , id2 + "*" )); |
05 | b1.add(w1, BooleanClause.Occur.MUST); |
06 | b1.add(w2, BooleanClause.Occur.MUST); |
07 | bQuery.add(b1, BooleanClause.Occur.SHOULD); |
08 | BooleanQuery b2 = new BooleanQuery(); |
09 | WildcardQuery w3 = new WildcardQuery( new Term( "tounitid" , unitid + "*" )); |
10 | WildcardQuery w4 = new WildcardQuery( new Term( "tomlid" , id2 + "*" )); |
11 | WildcardQuery w5 = new WildcardQuery( new Term( "tostate" , "1" )); |
12 | b2.add(w3, BooleanClause.Occur.MUST); |
13 | b2.add(w4, BooleanClause.Occur.MUST); |
14 | b2.add(w5, BooleanClause.Occur.MUST); |
15 | bQuery.add(b2, BooleanClause.Occur.SHOULD); |
6、Lucene先根据时间排序后分页
01 | int hm = start + pageSize; |
02 | Sort sort = new Sort( new SortField( "pdate" , SortField.STRING, true )); |
03 | TopScoreDocCollector res = TopScoreDocCollector.create(pageSize, false ); |
04 | searcher.search(bQuery, res); |
05 | this .rowCount = res.getTotalHits(); |
06 | this .pages = (rowCount - 1 ) / pageSize + 1 ; |
07 | TopDocs tds =searcher.search(bQuery,rowCount,sort); |
08 | ScoreDoc[] sd = tds.scoreDocs; |
09 | System.out.println( "rowCount:" + rowCount); |
11 | for (ScoreDoc scoreDoc : sd) { |
19 | Document doc = searcher.doc(scoreDoc.doc); |
20 | list.add(createObj(doc)); |
这个效率不高,正常的做法是在创建索引的时候就按时间排序写入,之后直接使用分页方法取结果,不要像这样进行两次查询。