Below are the utility classes I wrote while using Lucene 5 in real-world development. You need to download Lucene 5 and the IK Analyzer Chinese segmenter V2012_FF, and then patch IK Analyzer for Lucene 5: the 2012_FF release was built against the Lucene 4 analysis API, in which Analyzer.createComponents still received a Reader, so the two classes in the org.wltea.analyzer.lucene package must be modified as follows:
IKAnalyzer.java
package org.wltea.analyzer.lucene;

import org.apache.lucene.analysis.Analyzer;

public class IKAnalyzer extends Analyzer {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // Since Lucene 5, createComponents receives only the field name; the
        // Tokenizer's input Reader is supplied later via Tokenizer#setReader,
        // so there is no Reader to wrap or close here.
        IKTokenizer tokenizer = new IKTokenizer(false); // false = fine-grained segmentation
        return new TokenStreamComponents(tokenizer);
    }
}
IKTokenizer.java
package org.wltea.analyzer.lucene;

import java.io.IOException;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKTokenizer extends Tokenizer {

    // the IK segmenter implementation
    private final IKSegmenter _IKImplement;

    // token text attribute
    private final CharTermAttribute termAtt;
    // token offset attribute
    private final OffsetAttribute offsetAtt;
    // token type attribute (see the type constants in org.wltea.analyzer.core.Lexeme)
    private final TypeAttribute typeAtt;
    // end position of the last token
    private int endPosition;

    public IKTokenizer() {
        this(false);
    }

    public IKTokenizer(boolean useSmart) {
        offsetAtt = addAttribute(OffsetAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        typeAtt = addAttribute(TypeAttribute.class);
        // "input" is the Reader field inherited from Tokenizer; Lucene 5
        // assigns it through setReader, and it becomes readable once reset()
        // has been called
        _IKImplement = new IKSegmenter(input, useSmart);
    }

    @Override
    public boolean incrementToken() throws IOException {
        // clear all token attributes
        clearAttributes();
        Lexeme nextLexeme = _IKImplement.next();
        if (nextLexeme != null) {
            // copy the Lexeme into the attributes: text, length, offsets, type
            termAtt.append(nextLexeme.getLexemeText());
            termAtt.setLength(nextLexeme.getLength());
            offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
            // remember where the last token ended, for end()
            endPosition = nextLexeme.getEndPosition();
            typeAtt.setType(nextLexeme.getLexemeTypeString());
            // true: a token was produced and more may follow
            return true;
        }
        // false: no tokens left
        return false;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        // point the segmenter at the freshly assigned input Reader
        _IKImplement.reset(input);
    }

    @Override
    public final void end() throws IOException {
        super.end();
        // set the final offset
        int finalOffset = correctOffset(this.endPosition);
        offsetAtt.setOffset(finalOffset, finalOffset);
    }
}
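With both patched classes in place, it is worth a quick smoke test before wiring IK into an index. A minimal sketch, assuming only Lucene 5 and the patched IK classes above are on the classpath; the field name "content" and the sample sentence are arbitrary:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IKSmokeTest {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new IKAnalyzer();
                TokenStream ts = analyzer.tokenStream("content", "工商管理交通事故")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term); // one segmented term per line
            }
            ts.end();                     // finalize the offsets
        }
    }
}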
Utility class: Lucene.java
package com.jjh.common;

import java.io.IOException;
import java.nio.file.Path;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NoLockFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Wraps one index directory behind a small add/delete/search API.
 * Instances are created by the Lucenes factory, which keeps exactly one
 * instance, and therefore one IndexWriter, per index directory.
 */
public final class Lucene {

    private final Analyzer analyzer;
    private final IndexWriter indexWriter;
    private final Directory fsDirectory;

    Lucene(Path path) throws IOException {
        // initialize the index
        analyzer = new IKAnalyzer();
        IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
        iwConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        // NoLockFactory is safe here only because the factory guarantees a
        // single IndexWriter per directory
        fsDirectory = FSDirectory.open(path, NoLockFactory.INSTANCE);
        indexWriter = new IndexWriter(fsDirectory, iwConfig);
    }

    public void close() throws IOException {
        if (indexWriter.isOpen())
            indexWriter.close();
        fsDirectory.close();
        analyzer.close();
    }

    public Lucene addDocument(String ID, String content) throws IOException {
        // index the ID and store its original value
        TextField postIdField = new TextField("id", ID, Store.YES);
        // index the content without storing the original value
        TextField postContentField = new TextField("content", content, Store.NO);
        // one Document is one index entry, comparable to a table row
        Document doc = new Document();
        doc.add(postIdField);
        doc.add(postContentField);
        synchronized (this) {
            indexWriter.addDocument(doc);
            indexWriter.commit();
        }
        return this;
    }

    public Lucene delete(String ID) throws IOException {
        // mark matching documents as deleted, then merge the deletes away
        synchronized (this) {
            indexWriter.deleteDocuments(new Term("id", ID));
            indexWriter.forceMergeDeletes();
            indexWriter.commit();
        }
        return this;
    }

    /**
     * Two-pass paged query; suitable for large result sets because it never
     * loads more than one page of hits into memory.
     *
     * @param content the raw query text, not pre-tokenized
     * @param pageIndex the page number, starting at 1
     * @param pageSize the number of records per page
     * @return JSON of the form {"total":100,"rows":[id1,id2,...]}
     */
    public JSONObject findByPagination(String content, int pageIndex, int pageSize)
            throws IOException, JSONException, ParseException {
        JSONObject json = new JSONObject();
        try (DirectoryReader ireader = DirectoryReader.open(fsDirectory)) {
            IndexSearcher isearcher = new IndexSearcher(ireader);
            QueryParser qp = new QueryParser("content", analyzer);
            qp.setDefaultOperator(QueryParserBase.AND_OPERATOR);
            Query query = qp.parse(content);
            // locate the last hit of the previous page
            ScoreDoc lastSd = Lucenes.getLastScoreDoc(json, pageIndex, pageSize, query, isearcher);
            TopDocs tds = isearcher.searchAfter(lastSd, query, pageSize);
            json.put("rows", new JSONArray());
            for (ScoreDoc sd : tds.scoreDocs) {
                // fetch the stored document behind each internal doc id
                Document doc = isearcher.doc(sd.doc);
                json.getJSONArray("rows").put(doc.get("id"));
            }
        }
        return json;
    }

    /**
     * @param content the raw query text, not pre-tokenized
     * @return the id of the best-scoring hit, or null if nothing matches
     */
    public String findOne(String content) throws IOException, ParseException {
        String id = null;
        try (DirectoryReader ireader = DirectoryReader.open(fsDirectory)) {
            IndexSearcher isearcher = new IndexSearcher(ireader);
            QueryParser qp = new QueryParser("content", analyzer);
            qp.setDefaultOperator(QueryParserBase.AND_OPERATOR);
            Query query = qp.parse(content);
            TopDocs tds = isearcher.search(query, 1);
            // take the top-scoring hit whenever there is at least one match
            if (tds.scoreDocs.length > 0) {
                id = isearcher.doc(tds.scoreDocs[0].doc).get("id");
            }
        }
        return id;
    }

    public boolean hit(String keywords) throws IOException, ParseException {
        boolean hit = false;
        try (DirectoryReader ireader = DirectoryReader.open(fsDirectory)) {
            IndexSearcher isearcher = new IndexSearcher(ireader);
            QueryParser qp = new QueryParser("content", analyzer);
            qp.setDefaultOperator(QueryParserBase.AND_OPERATOR);
            Query query = qp.parse(keywords);
            TopDocs topDocs = isearcher.search(query, 1);
            hit = topDocs.totalHits >= 1;
        }
        return hit;
    }
}
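A minimal usage sketch for this wrapper (the index path, document contents, and the expected outputs in the comments are illustrative; instances come from the Lucenes factory shown next, because the constructor is package-private):

import java.io.IOException;

import org.apache.lucene.queryparser.classic.ParseException;
import org.json.JSONException;

import com.jjh.common.Lucene;
import com.jjh.common.Lucenes;

public class LuceneDemo {
    public static void main(String[] args) throws IOException, ParseException, JSONException {
        Lucene lucene = Lucenes.getInstance("f:/jjh", "lucene");
        lucene.addDocument("1", "工商管理交通事故")
              .addDocument("2", "华信智原你好");
        System.out.println(lucene.hit("管理"));                     // expected: true
        System.out.println(lucene.findOne("管理"));                 // expected: 1
        System.out.println(lucene.findByPagination("管理", 1, 10)); // expected: {"total":1,"rows":["1"]}
        lucene.delete("1");
        lucene.close();
    }
}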
Utility class: Lucenes.java
package com.jjh.common;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NoLockFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Factory and one-shot helpers around the Lucene wrapper: one pooled
 * instance per index directory, plus static index/delete/search methods
 * that open and close all resources per call.
 */
public final class Lucenes {

    private static final Object lock = new Object();

    // pool of Lucene instances, keyed by index directory
    private static final Map<String, Lucene> lucenes = new HashMap<>();

    private Lucenes() {
    }

    // factory method: one Lucene instance per index directory; the whole
    // method is synchronized because a plain HashMap is not safe under
    // check-then-put races
    public static synchronized Lucene getInstance(String first, String... more) throws IOException {
        Path path = Paths.get(first, more);
        Lucene lucene = lucenes.get(path.toString());
        if (lucene == null) {
            lucene = new Lucene(path);
            lucenes.put(path.toString(), lucene);
        }
        return lucene;
    }

    public static synchronized Lucene getLucene(String first, String... more) {
        return lucenes.get(Paths.get(first, more).toString());
    }

    public static synchronized void remove(Lucene lucene) {
        // remove by value: the map is keyed by path strings, so removing by
        // the Lucene instance itself has to go through the values view
        lucenes.values().remove(lucene);
    }

    /**
     * @param indexDirectory the directory holding the index
     * @param ID the document's ID value
     * @param content the unstructured content
     */
    public static void index(Path indexDirectory, String ID, String content) throws IOException {
        // index the ID and store its original value
        TextField postIdField = new TextField("id", ID, Store.YES);
        // index the content without storing the original value
        TextField postContentField = new TextField("content", content, Store.NO);
        // one Document is one index entry, comparable to a table row
        Document doc = new Document();
        doc.add(postIdField);
        doc.add(postContentField);
        synchronized (lock) {
            try (Analyzer analyzer = new IKAnalyzer();
                    Directory fsDirectory = FSDirectory.open(indexDirectory, NoLockFactory.INSTANCE);
                    IndexWriter indexWriter = new IndexWriter(fsDirectory, new IndexWriterConfig(analyzer)
                            .setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND))) {
                // closing the writer commits the added document
                indexWriter.addDocument(doc);
            }
        }
    }

    /**
     * @param indexDirectory the directory holding the index
     * @param ID the document's ID value
     */
    public static void delete(Path indexDirectory, String ID) throws IOException {
        try (Analyzer analyzer = new IKAnalyzer();
                Directory fsDirectory = FSDirectory.open(indexDirectory);
                IndexWriter indexWriter = new IndexWriter(fsDirectory, new IndexWriterConfig(analyzer)
                        .setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND))) {
            // mark matching documents as deleted, then merge the deletes away
            indexWriter.deleteDocuments(new Term("id", ID));
            indexWriter.forceMergeDeletes();
            indexWriter.commit();
        }
    }

    /**
     * Two-pass paged query; suitable for large result sets because it never
     * loads more than one page of hits into memory.
     *
     * @param indexDirectory the directory holding the index
     * @param content the raw query text, not pre-tokenized
     * @param pageIndex the page number, starting at 1
     * @param pageSize the number of records per page
     * @return JSON of the form {"total":100,"rows":[id1,id2,...]}
     */
    public static JSONObject findByPagination(Path indexDirectory, String content, int pageIndex, int pageSize)
            throws IOException, JSONException, ParseException {
        JSONObject json = new JSONObject();
        try (Analyzer analyzer = new IKAnalyzer();
                Directory fsDirectory = FSDirectory.open(indexDirectory);
                DirectoryReader ireader = DirectoryReader.open(fsDirectory)) {
            IndexSearcher isearcher = new IndexSearcher(ireader);
            QueryParser qp = new QueryParser("content", analyzer);
            qp.setDefaultOperator(QueryParserBase.AND_OPERATOR);
            Query query = qp.parse(content);
            // locate the last hit of the previous page
            ScoreDoc lastSd = getLastScoreDoc(json, pageIndex, pageSize, query, isearcher);
            TopDocs tds = isearcher.searchAfter(lastSd, query, pageSize);
            json.put("rows", new JSONArray());
            for (ScoreDoc sd : tds.scoreDocs) {
                // fetch the stored document behind each internal doc id
                Document doc = isearcher.doc(sd.doc);
                json.getJSONArray("rows").put(doc.get("id"));
            }
        }
        return json;
    }

    public static boolean hasIndex(String first, String... paths) throws IOException {
        try (Directory dir = FSDirectory.open(Paths.get(first, paths))) {
            return DirectoryReader.indexExists(dir);
        }
    }

    public static ScoreDoc getLastScoreDoc(JSONObject json, int pageIndex, int pageSize, Query query,
            IndexSearcher isearcher) throws IOException, JSONException {
        // number of hits covered by the pages before the requested one
        int num = pageSize * (pageIndex - 1);
        TopDocs tds = isearcher.search(query, num == 0 ? 1 : num);
        // record the total number of hits
        json.put("total", tds.totalHits);
        // the first page has no predecessor; with no hits there is nothing to anchor on
        if (pageIndex == 1 || tds.scoreDocs.length == 0)
            return null;
        // if the requested page lies beyond the last hit, anchor on the final
        // hit so that searchAfter returns an empty page instead of overflowing
        int last = Math.min(num, tds.scoreDocs.length);
        return tds.scoreDocs[last - 1];
    }

    public static void main(String[] args) throws IOException, ParseException, JSONException, InterruptedException {
        Lucene l = getInstance("f:/jjh", "lucene");
        ExecutorService pool = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
        pool.execute(() -> {
            try {
                l.addDocument("1", "工商管理交通事故");
            } catch (IOException e) {
                e.printStackTrace();
            }
        });
        pool.execute(() -> {
            try {
                l.addDocument("2", "华信智原你好");
            } catch (IOException e) {
                e.printStackTrace();
            }
        });
        // stop accepting tasks, then wait for the queued ones to finish
        pool.shutdown();
        pool.awaitTermination(5, TimeUnit.SECONDS);
        l.close();
        // System.out.println(findByPagination(Paths.get("f:/jjh", "lucene"), "大数据", 1, 10));
        // System.out.println(hasIndex("f:/jjh", "lucene"));
    }
}
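For code that only touches an index occasionally, the static one-shot helpers open and close everything per call. A minimal sketch (directory and contents are illustrative). One caution: index() opens its own IndexWriter with NoLockFactory, so it must never run concurrently with a pooled Lucene instance on the same directory; two live writers on one index will corrupt it.

import java.nio.file.Path;
import java.nio.file.Paths;

import com.jjh.common.Lucenes;

public class LucenesDemo {
    public static void main(String[] args) throws Exception {
        Path dir = Paths.get("f:/jjh", "lucene-oneshot"); // illustrative directory
        Lucenes.index(dir, "3", "大数据与工商管理");
        if (Lucenes.hasIndex("f:/jjh", "lucene-oneshot")) {
            // prints {"total":...,"rows":[...]} with the ids of the matching documents
            System.out.println(Lucenes.findByPagination(dir, "大数据", 1, 10));
        }
        Lucenes.delete(dir, "3");
    }
}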
Utility class: IKAnalzyer.java
package com.jjh.common;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Word-frequency helpers built on IK: both methods segment the input and
 * return the distinct terms sorted by descending frequency.
 */
public final class IKAnalzyer {

    private IKAnalzyer() {
    }

    public static List<Map.Entry<String, Integer>> analyze(String content) throws IOException {
        Map<String, Integer> map = new HashMap<>();
        try (Analyzer analyzer = new IKAnalyzer();
                TokenStream ts = analyzer.tokenStream("myfield", new StringReader(content))) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // count each occurrence of the term
                map.merge(term.toString(), 1, Integer::sum);
            }
            ts.end();
        }
        return sortByFrequency(map);
    }

    public static List<Map.Entry<String, Integer>> analyze(Reader input) throws IOException {
        // drive the IK segmenter directly, in smart (coarse-grained) mode
        IKSegmenter ik = new IKSegmenter(input, true);
        Map<String, Integer> map = new HashMap<>();
        for (Lexeme l = ik.next(); l != null; l = ik.next()) {
            map.merge(l.getLexemeText(), 1, Integer::sum);
        }
        return sortByFrequency(map);
    }

    private static List<Map.Entry<String, Integer>> sortByFrequency(Map<String, Integer> map) {
        List<Map.Entry<String, Integer>> list = new ArrayList<>(map.entrySet());
        list.sort(Map.Entry.comparingByValue(Comparator.reverseOrder()));
        return list;
    }

    public static void main(String[] args) throws IOException {
        for (Map.Entry<String, Integer> e : analyze(new StringReader("工商管理交通事故工商管理")))
            System.out.println(e.getKey());
    }
}