1、准备工作
下载lucene 3.6.1 : http://lucene.apache.org/
下载中文分词IK Analyzer: http://code.google.com/p/ik-analyzer/downloads/list (注意下载的是IK Analyzer 2012_u5_source.zip,其他版本有bug)
下载solr 3.6.1: http://lucene.apache.org/solr/(编译IK Analyzer时需引用包)
OK,将lucene 、solr 相关包(lucene-core-3.6.1.jar、lucene-highlighter-3.6.1.jar、lucene-analyzers-3.6.1.jar、apache-solr-core-3.6.1.jar、apache-solr-solrj-3.6.1.jar)拷贝到项目lib下,IK源码置于项目src下。
2、从Oracle数据库中取数据创建索引(使用IK分词)
003 | import org.apache.lucene.index.IndexWriter; |
004 | import org.apache.lucene.index.IndexWriterConfig; |
005 | import org.apache.lucene.index.CorruptIndexException; |
006 | import org.apache.lucene.store.FSDirectory; |
007 | import org.apache.lucene.store.Directory; |
008 | import org.apache.lucene.analysis.Analyzer; |
009 | import org.apache.lucene.analysis.standard.StandardAnalyzer; |
010 | import org.apache.lucene.util.Version; |
011 | import org.apache.lucene.document.Document; |
012 | import org.apache.lucene.document.Field; |
013 | import org.wltea.analyzer.lucene.IKAnalyzer; |
015 | import java.sql.Connection; |
017 | import java.io.IOException; |
018 | import java.util.ArrayList; |
019 | import java.util.Date; |
021 | import modules.gk.Gk_info; |
022 | import modules.gk.Gk_infoSub; |
023 | import web.sys.Globals; |
024 | import web.db.DBConnector; |
025 | import web.db.ObjectCtl; |
026 | import web.util.StringUtil; |
028 | public class LuceneIndex { |
029 | IndexWriter writer = null ; |
030 | FSDirectory dir = null ; |
031 | boolean create = true ; |
034 | long a1 = System.currentTimeMillis(); |
035 | System.out.println( "[Lucene 开始执行:" + new Date() + "]" ); |
036 | Connection con = DBConnector.getconecttion(); |
038 | final File docDir = new File(Globals.SYS_COM_CONFIG.get( "sys.index.path" ).toString()); |
039 | if (!docDir.exists()) { |
042 | String cr = Globals.SYS_COM_CONFIG.get( "sys.index.create" ).toString(); |
043 | if ( "false" .equals(cr.toLowerCase())) { |
046 | Directory dir = FSDirectory.open(docDir); |
048 | Analyzer analyzer = new IKAnalyzer( true ); |
049 | IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer); |
053 | iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); |
056 | iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); |
058 | IndexWriter writer = new IndexWriter(dir, iwc); |
059 | String sql = "SELECT indexno,title,describes,pdate,keywords FROM TABLEA WHERE STATE=1 AND SSTAG<>1 " ; |
060 | int rowCount = ObjectCtl.getRowCount(con, sql); |
061 | int pageSize = StringUtil.StringToInt(Globals.SYS_COM_CONFIG.get( "sys.index.size" ).toString()); |
062 | int pages = (rowCount - 1 ) / pageSize + 1 ; |
063 | ArrayList list = null ; |
064 | Gk_infoSub gk = null ; |
065 | for ( int i = 1 ; i < pages+ 1 ; i++) { |
066 | long a = System.currentTimeMillis(); |
067 | list = ObjectCtl.listPage(con, sql, i, pageSize, new Gk_infoSub()); |
068 | for ( int j = 0 ; j < list.size(); j++) { |
069 | gk = (Gk_infoSub) list.get(j); |
070 | Document doc = new Document(); |
071 | doc.add( new Field( "indexno" , StringUtil.null2String(gk.getIndexno()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); |
072 | doc.add( new Field( "title" , StringUtil.null2String(gk.getTitle()), Field.Store.YES, Field.Index.ANALYZED)); |
073 | doc.add( new Field( "describes" , StringUtil.null2String(gk.getDescribes()), Field.Store.YES, Field.Index.ANALYZED)); |
074 | doc.add( new Field( "pdate" , StringUtil.null2String(gk.getPdate()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); |
075 | doc.add( new Field( "keywords" , StringUtil.null2String(gk.getKeywords()), Field.Store.YES, Field.Index.ANALYZED)); |
076 | writer.addDocument(doc); |
077 | ObjectCtl.executeUpdateBySql(con, "UPDATE TABLEA SET SSTAG=1 WHERE indexno='" +gk.getIndexno()+ "'" ); |
080 | long b = System.currentTimeMillis(); |
082 | System.out.println( "[Lucene " + rowCount + "条," + pages + "页,第" + i + "页花费时间:" + c + "毫秒]" ); |
086 | } catch (Exception e) { |
089 | DBConnector.freecon(con); |
091 | if (writer != null ) { |
094 | } catch (CorruptIndexException e) { |
096 | } catch (IOException e) { |
100 | if (dir != null && IndexWriter.isLocked(dir)) { |
101 | IndexWriter.unlock(dir); |
103 | } catch (IOException e) { |
108 | long b1 = System.currentTimeMillis(); |
110 | System.out.println( "[Lucene 执行完毕,花费时间:" + c1 + "毫秒,完成时间:" + new Date() + "]" ); |
3、单字段查询以及多字段分页查询高亮显示
003 | import org.apache.lucene.store.FSDirectory; |
004 | import org.apache.lucene.store.Directory; |
005 | import org.apache.lucene.search.*; |
006 | import org.apache.lucene.search.highlight.SimpleHTMLFormatter; |
007 | import org.apache.lucene.search.highlight.Highlighter; |
008 | import org.apache.lucene.search.highlight.SimpleFragmenter; |
009 | import org.apache.lucene.search.highlight.QueryScorer; |
010 | import org.apache.lucene.queryParser.QueryParser; |
011 | import org.apache.lucene.queryParser.MultiFieldQueryParser; |
012 | import org.apache.lucene.analysis.TokenStream; |
013 | import org.apache.lucene.analysis.Analyzer; |
014 | import org.apache.lucene.analysis.KeywordAnalyzer; |
015 | import org.apache.lucene.document.Document; |
016 | import org.apache.lucene.index.IndexReader; |
017 | import org.apache.lucene.index.Term; |
018 | import org.apache.lucene.util.Version; |
019 | import modules.gk.Gk_infoSub; |
021 | import java.util.ArrayList; |
023 | import java.io.StringReader; |
024 | import java.lang.reflect.Constructor; |
026 | import web.util.StringUtil; |
027 | import web.sys.Globals; |
028 | import org.wltea.analyzer.lucene.IKAnalyzer; |
030 | public class LuceneQuery { |
031 | private static String indexPath; |
032 | private int rowCount; |
034 | private int currentPage; |
035 | private int pageSize; |
037 | public LuceneQuery() { |
038 | this .indexPath = Globals.SYS_COM_CONFIG.get( "sys.index.path" ).toString(); |
041 | public int getRowCount() { |
045 | public int getPages() { |
049 | public int getPageSize() { |
053 | public int getCurrentPage() { |
060 | public ArrayList queryIndexTitle(String keyWord, int curpage, int pageSize) { |
061 | ArrayList list = new ArrayList(); |
069 | this .pageSize = pageSize; |
070 | this .currentPage = curpage; |
071 | int start = (curpage - 1 ) * pageSize; |
072 | Directory dir = FSDirectory.open( new File(indexPath)); |
073 | IndexReader reader = IndexReader.open(dir); |
074 | IndexSearcher searcher = new IndexSearcher(reader); |
075 | Analyzer analyzer = new IKAnalyzer( true ); |
076 | QueryParser queryParser = new QueryParser(Version.LUCENE_36, "title" , analyzer); |
077 | queryParser.setDefaultOperator(QueryParser.AND_OPERATOR); |
078 | Query query = queryParser.parse(keyWord); |
079 | int hm = start + pageSize; |
080 | TopScoreDocCollector res = TopScoreDocCollector.create(hm, false ); |
081 | searcher.search(query, res); |
083 | SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter( "<span style='color:red'>" , "</span>" ); |
084 | Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query)); |
085 | this .rowCount = res.getTotalHits(); |
086 | this .pages = (rowCount - 1 ) / pageSize + 1 ; |
087 | TopDocs tds = res.topDocs(start, pageSize); |
088 | ScoreDoc[] sd = tds.scoreDocs; |
089 | for ( int i = 0 ; i < sd.length; i++) { |
090 | Document hitDoc = reader.document(sd[i].doc); |
091 | list.add(createObj(hitDoc, analyzer, highlighter)); |
094 | } catch (Exception e) { |
104 | public ArrayList queryIndexFields(String allkeyword, String onekeyword, String nokeyword, int curpage, int pageSize) { |
105 | ArrayList list = new ArrayList(); |
113 | this .pageSize = pageSize; |
114 | this .currentPage = curpage; |
115 | int start = (curpage - 1 ) * pageSize; |
116 | Directory dir = FSDirectory.open( new File(indexPath)); |
117 | IndexReader reader = IndexReader.open(dir); |
118 | IndexSearcher searcher = new IndexSearcher(reader); |
119 | BooleanQuery bQuery = new BooleanQuery(); |
120 | if (! "" .equals(allkeyword)) { |
121 | KeywordAnalyzer analyzer = new KeywordAnalyzer(); |
122 | BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD}; |
123 | Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, allkeyword, new String[]{ "title" , "describes" , "keywords" }, flags, analyzer); |
124 | bQuery.add(query, BooleanClause.Occur.MUST); |
126 | if (! "" .equals(onekeyword)) { |
127 | Analyzer analyzer = new IKAnalyzer( true ); |
128 | BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD}; |
129 | Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, onekeyword, new String[]{ "title" , "describes" , "keywords" }, flags, analyzer); |
130 | bQuery.add(query, BooleanClause.Occur.MUST); |
132 | if (! "" .equals(nokeyword)) { |
133 | Analyzer analyzer = new IKAnalyzer( true ); |
134 | BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD}; |
135 | Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, nokeyword, new String[]{ "title" , "describes" , "keywords" }, flags, analyzer); |
136 | bQuery.add(query, BooleanClause.Occur.MUST_NOT); |
139 | int hm = start + pageSize; |
140 | TopScoreDocCollector res = TopScoreDocCollector.create(hm, false ); |
141 | searcher.search(bQuery, res); |
142 | SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter( "<span style='color:red'>" , "</span>" ); |
143 | Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(bQuery)); |
144 | this .rowCount = res.getTotalHits(); |
145 | this .pages = (rowCount - 1 ) / pageSize + 1 ; |
146 | System.out.println( "rowCount:" + rowCount); |
147 | TopDocs tds = res.topDocs(start, pageSize); |
148 | ScoreDoc[] sd = tds.scoreDocs; |
149 | Analyzer analyzer = new IKAnalyzer(); |
150 | for ( int i = 0 ; i < sd.length; i++) { |
151 | Document hitDoc = reader.document(sd[i].doc); |
152 | list.add(createObj(hitDoc, analyzer, highlighter)); |
155 | } catch (Exception e) { |
167 | private synchronized static Object createObj(Document doc, Analyzer analyzer, Highlighter highlighter) { |
169 | Gk_infoSub gk = new Gk_infoSub(); |
173 | gk.setIndexno(StringUtil.null2String(doc.get( "indexno" ))); |
174 | gk.setPdate(StringUtil.null2String(doc.get( "pdate" ))); |
175 | String title = StringUtil.null2String(doc.get( "title" )); |
177 | if (! "" .equals(title)) { |
178 | highlighter.setTextFragmenter( new SimpleFragmenter(title.length())); |
179 | TokenStream tk = analyzer.tokenStream( "title" , new StringReader(title)); |
180 | String htext = StringUtil.null2String(highlighter.getBestFragment(tk, title)); |
181 | if (! "" .equals(htext)) { |
185 | String keywords = StringUtil.null2String(doc.get( "keywords" )); |
186 | gk.setKeywords(keywords); |
187 | if (! "" .equals(keywords)) { |
188 | highlighter.setTextFragmenter( new SimpleFragmenter(keywords.length())); |
189 | TokenStream tk = analyzer.tokenStream( "keywords" , new StringReader(keywords)); |
190 | String htext = StringUtil.null2String(highlighter.getBestFragment(tk, keywords)); |
191 | if (! "" .equals(htext)) { |
192 | gk.setKeywords(htext); |
195 | String describes = StringUtil.null2String(doc.get( "describes" )); |
196 | gk.setDescribes(describes); |
197 | if (! "" .equals(describes)) { |
198 | highlighter.setTextFragmenter( new SimpleFragmenter(describes.length())); |
199 | TokenStream tk = analyzer.tokenStream( "keywords" , new StringReader(describes)); |
200 | String htext = StringUtil.null2String(highlighter.getBestFragment(tk, describes)); |
201 | if (! "" .equals(htext)) { |
202 | gk.setDescribes(htext); |
209 | catch (Exception e) { |
220 | private synchronized static Object createObj(Document doc) { |
222 | Gk_infoSub gk = new Gk_infoSub(); |
226 | gk.setIndexno(StringUtil.null2String(doc.get( "indexno" ))); |
227 | gk.setPdate(StringUtil.null2String(doc.get( "pdate" ))); |
228 | gk.setTitle(StringUtil.null2String(doc.get( "title" ))); |
229 | gk.setKeywords(StringUtil.null2String(doc.get( "keywords" ))); |
230 | gk.setDescribes(StringUtil.null2String(doc.get( "describes" ))); |
234 | catch (Exception e) { |
单字段查询:
01 | long a = System.currentTimeMillis(); |
03 | int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get( "curpage" ))); |
04 | int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get( "pagesize" ))); |
05 | String title = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get( "title" ))); |
06 | LuceneQuery lu = new LuceneQuery(); |
07 | form.addResult( "list" , lu.queryIndexTitle(title, curpage, pagesize)); |
08 | form.addResult( "curPage" , lu.getCurrentPage()); |
09 | form.addResult( "pageSize" , lu.getPageSize()); |
10 | form.addResult( "rowCount" , lu.getRowCount()); |
11 | form.addResult( "pageCount" , lu.getPages()); |
12 | } catch (Exception e) { |
15 | long b = System.currentTimeMillis(); |
17 | System.out.println( "[搜索信息花费时间:" + c + "毫秒]" ); |
多字段查询:
01 | long a = System.currentTimeMillis(); |
03 | int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get( "curpage" ))); |
04 | int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get( "pagesize" ))); |
05 | String allkeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get( "allkeyword" ))); |
06 | String onekeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get( "onekeyword" ))); |
07 | String nokeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get( "nokeyword" ))); |
08 | LuceneQuery lu = new LuceneQuery(); |
09 | form.addResult( "list" , lu.queryIndexFields(allkeyword,onekeyword,nokeyword, curpage, pagesize)); |
10 | form.addResult( "curPage" , lu.getCurrentPage()); |
11 | form.addResult( "pageSize" , lu.getPageSize()); |
12 | form.addResult( "rowCount" , lu.getRowCount()); |
13 | form.addResult( "pageCount" , lu.getPages()); |
14 | } catch (Exception e) { |
17 | long b = System.currentTimeMillis(); |
19 | System.out.println( "[高级检索花费时间:" + c + "毫秒]" ); |
4、Lucene通配符查询
1 | BooleanQuery bQuery = new BooleanQuery(); |
2 | if (! "" .equals(title)) { |
3 | WildcardQuery w1 = new WildcardQuery( new Term( "title" , title+ "*" )); |
5 | bQuery.add(w1, BooleanClause.Occur.MUST); |
7 | int hm = start + pageSize; |
8 | TopScoreDocCollector res = TopScoreDocCollector.create(hm, false ); |
9 | searcher.search(bQuery, res); |
5、Lucene嵌套查询
实现SQL:(unitid like 'unitid%' and idml like 'id2%') or (tounitid like 'unitid%' and tomlid like 'id2%' and tostate=1)
01 | BooleanQuery bQuery = new BooleanQuery(); |
02 | BooleanQuery b1 = new BooleanQuery(); |
03 | WildcardQuery w1 = new WildcardQuery( new Term( "unitid" , unitid + "*" )); |
04 | WildcardQuery w2 = new WildcardQuery( new Term( "idml" , id2 + "*" )); |
05 | b1.add(w1, BooleanClause.Occur.MUST); |
06 | b1.add(w2, BooleanClause.Occur.MUST); |
07 | bQuery.add(b1, BooleanClause.Occur.SHOULD); |
08 | BooleanQuery b2 = new BooleanQuery(); |
09 | WildcardQuery w3 = new WildcardQuery( new Term( "tounitid" , unitid + "*" )); |
10 | WildcardQuery w4 = new WildcardQuery( new Term( "tomlid" , id2 + "*" )); |
11 | WildcardQuery w5 = new WildcardQuery( new Term( "tostate" , "1" )); |
12 | b2.add(w3, BooleanClause.Occur.MUST); |
13 | b2.add(w4, BooleanClause.Occur.MUST); |
14 | b2.add(w5, BooleanClause.Occur.MUST); |
15 | bQuery.add(b2, BooleanClause.Occur.SHOULD); |
6、Lucene先根据时间排序后分页
01 | int hm = start + pageSize; |
02 | Sort sort = new Sort( new SortField( "pdate" , SortField.STRING, true )); |
03 | TopScoreDocCollector res = TopScoreDocCollector.create(pageSize, false ); |
04 | searcher.search(bQuery, res); |
05 | this .rowCount = res.getTotalHits(); |
06 | this .pages = (rowCount - 1 ) / pageSize + 1 ; |
07 | TopDocs tds =searcher.search(bQuery,rowCount,sort); |
08 | ScoreDoc[] sd = tds.scoreDocs; |
09 | System.out.println( "rowCount:" + rowCount); |
11 | for (ScoreDoc scoreDoc : sd) { |
19 | Document doc = searcher.doc(scoreDoc.doc); |
20 | list.add(createObj(doc)); |
这个效率不高,正常的做法是在创建索引的时候就按时间排序写入,之后直接使用分页方法取结果,不要像这样进行两次查询。