Below are the utility classes I wrote while using Lucene 5 in real-world development. You need to download Lucene 5 and the IK Analyzer Chinese segmenter V2012_FF, and then patch IK Analyzer for Lucene 5: the 2012_FF release was built against the Lucene 4 analysis API, in which Analyzer.createComponents still received a Reader, so the two classes in the org.wltea.analyzer.lucene package must be modified as follows:
IKAnalyzer.java
package org.wltea.analyzer.lucene;

import org.apache.lucene.analysis.Analyzer;

public class IKAnalyzer extends Analyzer {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // Since Lucene 5, createComponents receives only the field name; the
        // Tokenizer's input Reader is supplied later via Tokenizer#setReader,
        // so there is no Reader to wrap or close here.
        IKTokenizer tokenizer = new IKTokenizer(false); // false = fine-grained segmentation
        return new TokenStreamComponents(tokenizer);
    }
}
IKTokenizer.java
package org.wltea.analyzer.lucene;

import java.io.IOException;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKTokenizer extends Tokenizer {

    // the IK segmenter implementation
    private final IKSegmenter _IKImplement;

    // token text attribute
    private final CharTermAttribute termAtt;
    // token offset attribute
    private final OffsetAttribute offsetAtt;
    // token type attribute (see the type constants in org.wltea.analyzer.core.Lexeme)
    private final TypeAttribute typeAtt;
    // end position of the last token
    private int endPosition;

    public IKTokenizer() {
        this(false);
    }

    public IKTokenizer(boolean useSmart) {
        offsetAtt = addAttribute(OffsetAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        typeAtt = addAttribute(TypeAttribute.class);
        // "input" is the Reader field inherited from Tokenizer; Lucene 5
        // assigns it through setReader, and it becomes readable once reset()
        // has been called
        _IKImplement = new IKSegmenter(input, useSmart);
    }

    @Override
    public boolean incrementToken() throws IOException {
        // clear all token attributes
        clearAttributes();
        Lexeme nextLexeme = _IKImplement.next();
        if (nextLexeme != null) {
            // copy the Lexeme into the attributes: text, length, offsets, type
            termAtt.append(nextLexeme.getLexemeText());
            termAtt.setLength(nextLexeme.getLength());
            offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
            // remember where the last token ended, for end()
            endPosition = nextLexeme.getEndPosition();
            typeAtt.setType(nextLexeme.getLexemeTypeString());
            // true: a token was produced and more may follow
            return true;
        }
        // false: no tokens left
        return false;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        // point the segmenter at the freshly assigned input Reader
        _IKImplement.reset(input);
    }

    @Override
    public final void end() throws IOException {
        super.end();
        // set the final offset
        int finalOffset = correctOffset(this.endPosition);
        offsetAtt.setOffset(finalOffset, finalOffset);
    }
}
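With both patched classes in place, it is worth a quick smoke test before wiring IK into an index. A minimal sketch, assuming only Lucene 5 and the patched IK classes above are on the classpath; the field name "content" and the sample sentence are arbitrary:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IKSmokeTest {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new IKAnalyzer();
                TokenStream ts = analyzer.tokenStream("content", "工商管理交通事故")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term); // one segmented term per line
            }
            ts.end();                     // finalize the offsets
        }
    }
}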
Utility class: Lucene.java
package com.jjh.common;

import java.io.IOException;
import java.nio.file.Path;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NoLockFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Wraps one index directory behind a small add/delete/search API.
 * Instances are created by the Lucenes factory, which keeps exactly one
 * instance, and therefore one IndexWriter, per index directory.
 */
public final class Lucene {

    private final Analyzer analyzer;
    private final IndexWriter indexWriter;
    private final Directory fsDirectory;

    Lucene(Path path) throws IOException {
        // initialize the index
        analyzer = new IKAnalyzer();
        IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
        iwConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        // NoLockFactory is safe here only because the factory guarantees a
        // single IndexWriter per directory
        fsDirectory = FSDirectory.open(path, NoLockFactory.INSTANCE);
        indexWriter = new IndexWriter(fsDirectory, iwConfig);
    }

    public void close() throws IOException {
        if (indexWriter.isOpen())
            indexWriter.close();
        fsDirectory.close();
        analyzer.close();
    }

    public Lucene addDocument(String ID, String content) throws IOException {
        // index the ID and store its original value
        TextField postIdField = new TextField("id", ID, Store.YES);
        // index the content without storing the original value
        TextField postContentField = new TextField("content", content, Store.NO);
        // one Document is one index entry, comparable to a table row
        Document doc = new Document();
        doc.add(postIdField);
        doc.add(postContentField);
        synchronized (this) {
            indexWriter.addDocument(doc);
            indexWriter.commit();
        }
        return this;
    }

    public Lucene delete(String ID) throws IOException {
        // mark matching documents as deleted, then merge the deletes away
        synchronized (this) {
            indexWriter.deleteDocuments(new Term("id", ID));
            indexWriter.forceMergeDeletes();
            indexWriter.commit();
        }
        return this;
    }

    /**
     * Two-pass paged query; suitable for large result sets because it never
     * loads more than one page of hits into memory.
     *
     * @param content the raw query text, not pre-tokenized
     * @param pageIndex the page number, starting at 1
     * @param pageSize the number of records per page
     * @return JSON of the form {"total":100,"rows":[id1,id2,...]}
     */
    public JSONObject findByPagination(String content, int pageIndex, int pageSize)
            throws IOException, JSONException, ParseException {
        JSONObject json = new JSONObject();
        try (DirectoryReader ireader = DirectoryReader.open(fsDirectory)) {
            IndexSearcher isearcher = new IndexSearcher(ireader);
            QueryParser qp = new QueryParser("content", analyzer);
            qp.setDefaultOperator(QueryParserBase.AND_OPERATOR);
            Query query = qp.parse(content);
            // locate the last hit of the previous page
            ScoreDoc lastSd = Lucenes.getLastScoreDoc(json, pageIndex, pageSize, query, isearcher);
            TopDocs tds = isearcher.searchAfter(lastSd, query, pageSize);
            json.put("rows", new JSONArray());
            for (ScoreDoc sd : tds.scoreDocs) {
                // fetch the stored document behind each internal doc id
                Document doc = isearcher.doc(sd.doc);
                json.getJSONArray("rows").put(doc.get("id"));
            }
        }
        return json;
    }

    /**
     * @param content the raw query text, not pre-tokenized
     * @return the id of the best-scoring hit, or null if nothing matches
     */
    public String findOne(String content) throws IOException, ParseException {
        String id = null;
        try (DirectoryReader ireader = DirectoryReader.open(fsDirectory)) {
            IndexSearcher isearcher = new IndexSearcher(ireader);
            QueryParser qp = new QueryParser("content", analyzer);
            qp.setDefaultOperator(QueryParserBase.AND_OPERATOR);
            Query query = qp.parse(content);
            TopDocs tds = isearcher.search(query, 1);
            // take the top-scoring hit whenever there is at least one match
            if (tds.scoreDocs.length > 0) {
                id = isearcher.doc(tds.scoreDocs[0].doc).get("id");
            }
        }
        return id;
    }

    public boolean hit(String keywords) throws IOException, ParseException {
        boolean hit = false;
        try (DirectoryReader ireader = DirectoryReader.open(fsDirectory)) {
            IndexSearcher isearcher = new IndexSearcher(ireader);
            QueryParser qp = new QueryParser("content", analyzer);
            qp.setDefaultOperator(QueryParserBase.AND_OPERATOR);
            Query query = qp.parse(keywords);
            TopDocs topDocs = isearcher.search(query, 1);
            hit = topDocs.totalHits >= 1;
        }
        return hit;
    }
}
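A minimal usage sketch for this wrapper (the index path, document contents, and the expected outputs in the comments are illustrative; instances come from the Lucenes factory shown next, because the constructor is package-private):

import java.io.IOException;

import org.apache.lucene.queryparser.classic.ParseException;
import org.json.JSONException;

import com.jjh.common.Lucene;
import com.jjh.common.Lucenes;

public class LuceneDemo {
    public static void main(String[] args) throws IOException, ParseException, JSONException {
        Lucene lucene = Lucenes.getInstance("f:/jjh", "lucene");
        lucene.addDocument("1", "工商管理交通事故")
              .addDocument("2", "华信智原你好");
        System.out.println(lucene.hit("管理"));                     // expected: true
        System.out.println(lucene.findOne("管理"));                 // expected: 1
        System.out.println(lucene.findByPagination("管理", 1, 10)); // expected: {"total":1,"rows":["1"]}
        lucene.delete("1");
        lucene.close();
    }
}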
Utility class: Lucenes.java
package com.jjh.common;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NoLockFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Factory and one-shot helpers around the Lucene wrapper: one pooled
 * instance per index directory, plus static index/delete/search methods
 * that open and close all resources per call.
 */
public final class Lucenes {

    private static final Object lock = new Object();

    // pool of Lucene instances, keyed by index directory
    private static final Map<String, Lucene> lucenes = new HashMap<>();

    private Lucenes() {
    }

    // factory method: one Lucene instance per index directory; the whole
    // method is synchronized because a plain HashMap is not safe under
    // check-then-put races
    public static synchronized Lucene getInstance(String first, String... more) throws IOException {
        Path path = Paths.get(first, more);
        Lucene lucene = lucenes.get(path.toString());
        if (lucene == null) {
            lucene = new Lucene(path);
            lucenes.put(path.toString(), lucene);
        }
        return lucene;
    }

    public static synchronized Lucene getLucene(String first, String... more) {
        return lucenes.get(Paths.get(first, more).toString());
    }

    public static synchronized void remove(Lucene lucene) {
        // remove by value: the map is keyed by path strings, so removing by
        // the Lucene instance itself has to go through the values view
        lucenes.values().remove(lucene);
    }

    /**
     * @param indexDirectory the directory holding the index
     * @param ID the document's ID value
     * @param content the unstructured content
     */
    public static void index(Path indexDirectory, String ID, String content) throws IOException {
        // index the ID and store its original value
        TextField postIdField = new TextField("id", ID, Store.YES);
        // index the content without storing the original value
        TextField postContentField = new TextField("content", content, Store.NO);
        // one Document is one index entry, comparable to a table row
        Document doc = new Document();
        doc.add(postIdField);
        doc.add(postContentField);
        synchronized (lock) {
            try (Analyzer analyzer = new IKAnalyzer();
                    Directory fsDirectory = FSDirectory.open(indexDirectory, NoLockFactory.INSTANCE);
                    IndexWriter indexWriter = new IndexWriter(fsDirectory, new IndexWriterConfig(analyzer)
                            .setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND))) {
                // closing the writer commits the added document
                indexWriter.addDocument(doc);
            }
        }
    }

    /**
     * @param indexDirectory the directory holding the index
     * @param ID the document's ID value
     */
    public static void delete(Path indexDirectory, String ID) throws IOException {
        try (Analyzer analyzer = new IKAnalyzer();
                Directory fsDirectory = FSDirectory.open(indexDirectory);
                IndexWriter indexWriter = new IndexWriter(fsDirectory, new IndexWriterConfig(analyzer)
                        .setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND))) {
            // mark matching documents as deleted, then merge the deletes away
            indexWriter.deleteDocuments(new Term("id", ID));
            indexWriter.forceMergeDeletes();
            indexWriter.commit();
        }
    }

    /**
     * Two-pass paged query; suitable for large result sets because it never
     * loads more than one page of hits into memory.
     *
     * @param indexDirectory the directory holding the index
     * @param content the raw query text, not pre-tokenized
     * @param pageIndex the page number, starting at 1
     * @param pageSize the number of records per page
     * @return JSON of the form {"total":100,"rows":[id1,id2,...]}
     */
    public static JSONObject findByPagination(Path indexDirectory, String content, int pageIndex, int pageSize)
            throws IOException, JSONException, ParseException {
        JSONObject json = new JSONObject();
        try (Analyzer analyzer = new IKAnalyzer();
                Directory fsDirectory = FSDirectory.open(indexDirectory);
                DirectoryReader ireader = DirectoryReader.open(fsDirectory)) {
            IndexSearcher isearcher = new IndexSearcher(ireader);
            QueryParser qp = new QueryParser("content", analyzer);
            qp.setDefaultOperator(QueryParserBase.AND_OPERATOR);
            Query query = qp.parse(content);
            // locate the last hit of the previous page
            ScoreDoc lastSd = getLastScoreDoc(json, pageIndex, pageSize, query, isearcher);
            TopDocs tds = isearcher.searchAfter(lastSd, query, pageSize);
            json.put("rows", new JSONArray());
            for (ScoreDoc sd : tds.scoreDocs) {
                // fetch the stored document behind each internal doc id
                Document doc = isearcher.doc(sd.doc);
                json.getJSONArray("rows").put(doc.get("id"));
            }
        }
        return json;
    }

    public static boolean hasIndex(String first, String... paths) throws IOException {
        try (Directory dir = FSDirectory.open(Paths.get(first, paths))) {
            return DirectoryReader.indexExists(dir);
        }
    }

    public static ScoreDoc getLastScoreDoc(JSONObject json, int pageIndex, int pageSize, Query query,
            IndexSearcher isearcher) throws IOException, JSONException {
        // number of hits covered by the pages before the requested one
        int num = pageSize * (pageIndex - 1);
        TopDocs tds = isearcher.search(query, num == 0 ? 1 : num);
        // record the total number of hits
        json.put("total", tds.totalHits);
        // the first page has no predecessor; with no hits there is nothing to anchor on
        if (pageIndex == 1 || tds.scoreDocs.length == 0)
            return null;
        // if the requested page lies beyond the last hit, anchor on the final
        // hit so that searchAfter returns an empty page instead of overflowing
        int last = Math.min(num, tds.scoreDocs.length);
        return tds.scoreDocs[last - 1];
    }

    public static void main(String[] args) throws IOException, ParseException, JSONException, InterruptedException {
        Lucene l = getInstance("f:/jjh", "lucene");
        ExecutorService pool = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
        pool.execute(() -> {
            try {
                l.addDocument("1", "工商管理交通事故");
            } catch (IOException e) {
                e.printStackTrace();
            }
        });
        pool.execute(() -> {
            try {
                l.addDocument("2", "华信智原你好");
            } catch (IOException e) {
                e.printStackTrace();
            }
        });
        // stop accepting tasks, then wait for the queued ones to finish
        pool.shutdown();
        pool.awaitTermination(5, TimeUnit.SECONDS);
        l.close();
        // System.out.println(findByPagination(Paths.get("f:/jjh", "lucene"), "大数据", 1, 10));
        // System.out.println(hasIndex("f:/jjh", "lucene"));
    }
}
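For code that only touches an index occasionally, the static one-shot helpers open and close everything per call. A minimal sketch (directory and contents are illustrative). One caution: index() opens its own IndexWriter with NoLockFactory, so it must never run concurrently with a pooled Lucene instance on the same directory; two live writers on one index will corrupt it.

import java.nio.file.Path;
import java.nio.file.Paths;

import com.jjh.common.Lucenes;

public class LucenesDemo {
    public static void main(String[] args) throws Exception {
        Path dir = Paths.get("f:/jjh", "lucene-oneshot"); // illustrative directory
        Lucenes.index(dir, "3", "大数据与工商管理");
        if (Lucenes.hasIndex("f:/jjh", "lucene-oneshot")) {
            // prints {"total":...,"rows":[...]} with the ids of the matching documents
            System.out.println(Lucenes.findByPagination(dir, "大数据", 1, 10));
        }
        Lucenes.delete(dir, "3");
    }
}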
Utility class: IKAnalzyer.java
package com.jjh.common;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Word-frequency helpers built on IK: both methods segment the input and
 * return the distinct terms sorted by descending frequency.
 */
public final class IKAnalzyer {

    private IKAnalzyer() {
    }

    public static List<Map.Entry<String, Integer>> analyze(String content) throws IOException {
        Map<String, Integer> map = new HashMap<>();
        try (Analyzer analyzer = new IKAnalyzer();
                TokenStream ts = analyzer.tokenStream("myfield", new StringReader(content))) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // count each occurrence of the term
                map.merge(term.toString(), 1, Integer::sum);
            }
            ts.end();
        }
        return sortByFrequency(map);
    }

    public static List<Map.Entry<String, Integer>> analyze(Reader input) throws IOException {
        // drive the IK segmenter directly, in smart (coarse-grained) mode
        IKSegmenter ik = new IKSegmenter(input, true);
        Map<String, Integer> map = new HashMap<>();
        for (Lexeme l = ik.next(); l != null; l = ik.next()) {
            map.merge(l.getLexemeText(), 1, Integer::sum);
        }
        return sortByFrequency(map);
    }

    private static List<Map.Entry<String, Integer>> sortByFrequency(Map<String, Integer> map) {
        List<Map.Entry<String, Integer>> list = new ArrayList<>(map.entrySet());
        list.sort(Map.Entry.comparingByValue(Comparator.reverseOrder()));
        return list;
    }

    public static void main(String[] args) throws IOException {
        for (Map.Entry<String, Integer> e : analyze(new StringReader("工商管理交通事故工商管理")))
            System.out.println(e.getKey());
    }
}