Lucene是Apache软件基金会Jakarta项目组的一个子项目,是一个开放源代码的全文检索引擎工具包,但它不是一个完整的全文检索引擎,而是一个全文检索引擎的架构,提供了完整的查询引擎和索引引擎,部分文本分析引擎(英文与德文两种西方语言)。Lucene的目的是为软件开发人员提供一个简单易用的工具包,以方便的在目标系统中实现全文检索的功能,或者是以此为基础建立起完整的全文检索引擎。Lucene是一套用于全文检索和搜寻的开源程式库,由Apache软件基金会支持和提供。Lucene提供了一个简单却强大的应用程式接口,能够做全文索引和搜寻。在Java开发环境里Lucene是一个成熟的免费开源工具。就其本身而言,Lucene是当前以及最近几年最受欢迎的免费Java信息检索程序库。人们经常提到信息检索程序库,虽然与搜索引擎有关,但不应该将信息检索程序库与搜索引擎相混淆。
HelloWorld测试
创建Maven工程,jar包
在pom.xml配置文件中导入lucene的对应jar包
<dependencies> <!-- 核心包lucene-core --> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-core</artifactId> <version>5.3.1</version> </dependency> <!-- 查询解析lucene-queryparser --> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-queryparser</artifactId> <version>5.3.1</version> </dependency> <!-- 解析器lucene-analyzers-common --> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-analyzers-common</artifactId> <version>5.3.1</version> </dependency> </dependencies> |
Indexer.java用于创建全文检索的索引
import java.io.File;
import java.io.FileReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/** * * @ClassName: Indexer * @Description: TODO (写索引) * @author A我去 * @date 2019年10月24日下午9:26:46 */ public class Indexer {
private IndexWriter writer; //写索引实例
/** * 构造方法,实例化IndexWriter * @param indexDir * @throws Exception */ public Indexer(String indexDir)throws Exception{ Directory directory = FSDirectory.open(Paths.get(indexDir)); Analyzer analyzer = new StandardAnalyzer(); //标准分词器,只对英文有效 IndexWriterConfig iwc = new IndexWriterConfig(analyzer); writer = new IndexWriter(directory, iwc); }
/** * * <b>Description</b><br> * (关闭写索引) * <br> * -------------------------------------------------<br> * <b>A我去 2019年10月24日 下午9:35:48</b> */ public void close()throws Exception{ writer.close(); }
/** * * <b>Description</b><br> * (索引指定目录下的所有文件) * d:\lucene\data\** * <br> * -------------------------------------------------<br> * <b>A我去 2019年10月24日 下午9:37:16</b> */ public int index(String dataDir)throws Exception{ File[] files = new File(dataDir).listFiles(); //遍历目录下的所有文件 for(File file : files) { indexFile(file); } //返回索引了多少个文件 return writer.numRamDocs();
}
/** * * <b>Description</b><br> * (索引指定文件) * <br> * -------------------------------------------------<br> * <b>A我去 2019年10月24日 下午9:41:02</b> */ private void indexFile(File file)throws Exception{ System.out.println("索引文件:"+file.getCanonicalPath()); Document document = getDocument(file); writer.addDocument(document); //添加索引 }
/** * * <b>Description</b><br> * (获取文档,文档里再设置每个字段) * <br> * -------------------------------------------------<br> * <b>A我去 2019年10月24日 下午9:43:32</b> */ private Document getDocument(File file)throws Exception{ Document document = new Document(); //内容,不保存内容到索引 document.add(new TextField("contents", new FileReader(file))); //保存文件名到索引 document.add(new TextField("fileName", file.getName(), Field.Store.YES)); //保存完整路径到索引 document.add(new TextField("fullPath", file.getCanonicalPath(), Field.Store.YES)); return document; }
public static void main(String[] args){ String indexDir = "D:\\lucene"; //索引的输出位置 String dataDir = "D:\\lucene\\data"; //数据源 Indexer indexer = null; int numIndexed = 0; long start = System.currentTimeMillis(); try { indexer = new Indexer(indexDir); numIndexed = indexer.index(dataDir); } catch (Exception e) { e.printStackTrace(); } finally { try { indexer.close(); } catch (Exception e) { e.printStackTrace(); } } long end = System.currentTimeMillis(); System.out.println("索引了"+numIndexed+" 个文件,使用了"+(end-start)+" 毫秒"); }
} |
在D:\lucene\data拷入指定的文件夹和文件
|
Searcher.java用于查询全文检索
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory;
public class Searcher {
public static void search(String indexDir, String q)throws Exception{ Directory directory = FSDirectory.open(Paths.get(indexDir)); IndexReader reader = DirectoryReader.open(directory); IndexSearcher is = new IndexSearcher(reader); Analyzer analyzer = new StandardAnalyzer(); //标准分词器 QueryParser parser = new QueryParser("contents", analyzer); Query query = parser.parse(q);
long start = System.currentTimeMillis(); TopDocs hits = is.search(query, 10); //开始查询,得到Top10 long end = System.currentTimeMillis(); System.out.println("匹配"+q+",耗时"+(end-start)+"毫秒"+"查询到:"+hits.totalHits+ "条记录"); for(ScoreDoc scoreDoc : hits.scoreDocs) { Document document = is.doc(scoreDoc.doc); System.out.println(document.get("fullPath")); } reader.close(); }
public static void main(String[] args) { String indexDir = "D:\\lucene"; String q = "Zygmunt Saloni"; try { search(indexDir, q); } catch (Exception e) { e.printStackTrace(); } }
} |
文档操作
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.junit.Before; import org.junit.Test;
public class IndexingTest01 {
private String ids[] = { "1", "2", "3" }; private String citys[] = { "qingdao", "nanjing", "shanghai" }; private String descs[] = { "Qingdao is a beautiful city.", "Nanjing is a city of culture.", "Shanghai is a bustling city." };
private Directory directory;
@Before public void setUp() throws Exception { directory = FSDirectory.open(Paths.get("D:\\lucene")); IndexWriter writer = getWriter(); for(int i=0;i<ids.length;i++) { Document document = new Document(); document.add(new StringField("id", ids[i], Field.Store.YES)); //存索引 document.add(new StringField("city", citys[i], Field.Store.YES)); document.add(new StringField("desc", descs[i], Field.Store.NO)); //不存索引 writer.addDocument(document); //添加文档操作 } writer.close();//关闭 }
/** * * <b>Description</b><br> * (获取IndexWriter实例) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月25日 下午10:35:36</b> */ private IndexWriter getWriter()throws Exception{ Analyzer analyzer = new StandardAnalyzer(); //标准分词器 IndexWriterConfig iwc = new IndexWriterConfig(analyzer); IndexWriter writer = new IndexWriter(directory, iwc); return writer; }
/** * * <b>Description</b><br> * (测试写文档的数量) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月25日 下午10:44:05</b> */ @Test public void testIndexWriter()throws Exception{ IndexWriter writer = getWriter(); System.out.println("写入了:"+writer.numDocs()+" 个文档"); writer.close(); }
/** * * <b>Description</b><br> * (测试读取文档) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月25日 下午10:50:09</b> */ @Test public void testIndexReader()throws Exception{ IndexReader reader = DirectoryReader.open(directory); System.out.println("最大文档数量:"+reader.maxDoc()); System.out.println("实际文档数量:"+reader.numDocs()); reader.close(); }
/** * * <b>Description</b><br> * (测试删除,在合并前) * 只做标记,并没有真正的删除,访问量大的网站采用这种方式,等到空闲的时候再删除 * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月25日 下午10:57:23</b> */ @Test public void testDelBeforeMerge()throws Exception{ IndexWriter writer = getWriter(); System.out.println("删除前的文档数量:"+writer.numDocs()); writer.deleteDocuments(new Term("id","1")); //删除id为1 writer.commit(); System.out.println("删除后最大文档数量:"+writer.maxDoc()); System.out.println("删除后实际文档数量:"+writer.numDocs()); }
/** * * <b>Description</b><br> * (测试删除,在合并后) * 真正的删除:大型系统中非常耗时,空闲时间在执行彻底删除 * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月25日 下午11:04:02</b> */ @Test public void testDelAfterMerge()throws Exception{ IndexWriter writer = getWriter(); System.out.println("删除前的文档数量:"+writer.numDocs()); writer.deleteDocuments(new Term("id","1")); //删除id为1 writer.forceMergeDeletes(); //强制合并 writer.commit(); System.out.println("删除后最大文档数量:"+writer.maxDoc()); System.out.println("删除后实际文档数量:"+writer.numDocs()); }
/** * * <b>Description</b><br> * (更新文档)比较耗时 * 1.找到id为1的文档,2.删除,3.重新创建 * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月25日 下午11:09:43</b> */ @Test public void testUpdate()throws Exception{ IndexWriter writer = getWriter(); Document document = new Document(); document.add(new StringField("id", "1", Field.Store.YES)); //存索引 document.add(new StringField("city", "qingdao", Field.Store.YES)); document.add(new StringField("desc", "xXXxXXXxXX", Field.Store.NO)); //不存索引 writer.updateDocument(new Term("id","1"), document); writer.close(); }
} |
文档域加权(可以使搜索排名提高)
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.junit.Test;
public class IndexingTest02 {
private String ids[]={"1","2","3","4"}; private String authors[]={"Jack","Marry","John","Json"}; private String positions[]={"accounting","technician","salesperson","boss"}; private String titles[]={"Java is a good language.","Java is a cross platform language","Java powerful","You should learn java"}; private String contents[]={ "If possible, use the same JRE major version at both index and search time.", "When upgrading to a different JRE major version, consider re-indexing. ", "Different JRE major versions may implement different versions of Unicode,", "For example: with Java 1.4, `LetterTokenizer` will split around the character U+02C6," };
private Directory directory;
/** * * <b>Description</b><br> * (获取IndexWriter实例) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月25日 下午11:22:40</b> */ private IndexWriter getWriter()throws Exception{ Analyzer analyzer = new StandardAnalyzer(); //标准分词器 IndexWriterConfig iwc = new IndexWriterConfig(analyzer); IndexWriter writer = new IndexWriter(directory, iwc); return writer; }
/** * * <b>Description</b><br> * (生成索引) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月25日 下午11:24:47</b> */ @Test public void index()throws Exception{ directory = FSDirectory.open(Paths.get("D:\\lucene")); IndexWriter writer = getWriter(); for(int i=0;i<ids.length;i++) { Document document = new Document(); //不需要分词器解析的使用StringField document.add(new StringField("id", ids[i], Field.Store.YES)); //YES存索引 document.add(new StringField("author", authors[i], Field.Store.YES)); document.add(new StringField("position", positions[i], Field.Store.YES)); //需要分词解析的使用TextField document.add(new TextField("title", titles[i], Field.Store.YES)); document.add(new TextField("content", contents[i], Field.Store.NO)); writer.addDocument(document); //添加文档操作 } writer.close();//关闭 }
/** * * <b>Description</b><br> * (生成索引,并给BOSS的标题加权,提高排名) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月25日 下午11:24:47</b> */ @Test public void indexBoost()throws Exception{ directory = FSDirectory.open(Paths.get("D:\\lucene")); IndexWriter writer = getWriter(); for(int i=0;i<ids.length;i++) { Document document = new Document(); //不需要分词器解析的使用StringField document.add(new StringField("id", ids[i], Field.Store.YES)); //YES存索引 document.add(new StringField("author", authors[i], Field.Store.YES)); document.add(new StringField("position", positions[i], Field.Store.YES)); //需要分词解析的使用TextField TextField field = new TextField("title", titles[i], Field.Store.YES); if("boss".equals(positions[i])) { //默认为1f,高于1f加权,低于1f减权 field.setBoost(1.5f); } document.add(field); document.add(new TextField("content", contents[i], Field.Store.NO)); writer.addDocument(document); //添加文档操作 } writer.close();//关闭 }
/** * * <b>Description</b><br> * (搜索) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月25日 下午11:32:57</b> */ @Test public void search()throws Exception{ directory = FSDirectory.open(Paths.get("D:\\lucene")); IndexReader reader = DirectoryReader.open(directory); IndexSearcher iSearcher = new IndexSearcher(reader);
String searchField = "title"; //查询的字段 String q = "java"; //查询的内容 Term term = new Term(searchField, q); Query query = new TermQuery(term); TopDocs hits = iSearcher.search(query, 10); //10为查询的条数 System.out.println("匹配"+q+",总共查询到"+hits.totalHits+"个文档"); for(ScoreDoc scoreDoc : hits.scoreDocs) { Document document = iSearcher.doc(scoreDoc.doc); System.out.println(document.get("author")); } reader.close(); }
} |
查询功能
创建Maven项目,jar包,导入对应的jar依赖
pom.xml文件
<dependencies> <!-- 核心包lucene-core --> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-core</artifactId> <version>5.3.1</version> </dependency> <!-- 查询解析lucene-queryparser --> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-queryparser</artifactId> <version>5.3.1</version> </dependency> <!-- 解析器lucene-analyzers-common --> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-analyzers-common</artifactId> <version>5.3.1</version> </dependency> <!-- junit --> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>4.12</version> <scope>test</scope> </dependency> </dependencies> |
创建包com.kingsoft.lucene
创建Indexer.java用于写索引
/** * * @ClassName: Indexer * @Description: TODO (写索引) * @author A我去 * @date 2019年10月24日下午9:26:46 */ public class Indexer {
private IndexWriter writer; //写索引实例
/** * 构造方法,实例化IndexWriter * @param indexDir * @throws Exception */ public Indexer(String indexDir)throws Exception{ Directory directory = FSDirectory.open(Paths.get(indexDir)); Analyzer analyzer = new StandardAnalyzer(); //标准分词器,只对英文有效 IndexWriterConfig iwc = new IndexWriterConfig(analyzer); writer = new IndexWriter(directory, iwc); }
/** * * <b>Description</b><br> * (关闭写索引) * <br> * -------------------------------------------------<br> * <b>A我去 2019年10月24日 下午9:35:48</b> */ public void close()throws Exception{ writer.close(); }
/** * * <b>Description</b><br> * (索引指定目录下的所有文件) * d:\lucene\data\** * <br> * -------------------------------------------------<br> * <b>A我去 2019年10月24日 下午9:37:16</b> */ public int index(String dataDir)throws Exception{ File[] files = new File(dataDir).listFiles(); //遍历目录下的所有文件 for(File file : files) { indexFile(file); }
//返回索引了多少个文件 return writer.numRamDocs(); }
/** * * <b>Description</b><br> * (索引指定文件) * <br> * -------------------------------------------------<br> * <b>A我去 2019年10月24日 下午9:41:02</b> */ private void indexFile(File file)throws Exception{ System.out.println("索引文件:"+file.getCanonicalPath()); Document document = getDocument(file); writer.addDocument(document); //添加索引 }
/** * * <b>Description</b><br> * (获取文档,文档里再设置每个字段) * <br> * -------------------------------------------------<br> * <b>A我去 2019年10月24日 下午9:43:32</b> */ private Document getDocument(File file)throws Exception{ Document document = new Document(); //内容,不保存内容到索引 document.add(new TextField("contents", new FileReader(file))); //保存文件名到索引 document.add(new TextField("fileName", file.getName(), Field.Store.YES)); //保存完整路径到索引 document.add(new TextField("fullPath", file.getCanonicalPath(), Field.Store.YES)); return document; }
public static void main(String[] args){ String indexDir = "D:\\lucene"; //索引的输出位置 String dataDir = "D:\\lucene\\data"; //数据源 Indexer indexer = null; int numIndexed = 0; long start = System.currentTimeMillis(); try { indexer = new Indexer(indexDir); numIndexed = indexer.index(dataDir); } catch (Exception e) { e.printStackTrace(); } finally { try { indexer.close(); } catch (Exception e) { e.printStackTrace(); } } long end = System.currentTimeMillis(); System.out.println("索引了"+numIndexed+" 个文件,使用了"+(end-start)+" 毫秒"); }
}
|
D盘创建lucene\data文件夹,并在data文件夹中创建多个文本文件用于测试使用
|
创建SearchTest.java用于测试多种搜索方法
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.junit.After; import org.junit.Before; import org.junit.Test;
public class SearchTest {
private Directory directory; private IndexReader reader; private IndexSearcher is;
@Before public void setUp()throws Exception{ directory = FSDirectory.open(Paths.get("D:\\lucene")); reader = DirectoryReader.open(directory); is = new IndexSearcher(reader); }
@After public void tearDown()throws Exception{ reader.close(); }
/** * * <b>Description</b><br> * (对特定项进行搜索,不常用) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月26日 下午9:48:08</b> */ @Test public void testTermQuery()throws Exception{ String searchField = "contents"; String q = "particular"; Term term = new Term(searchField, q); Query query = new TermQuery(term); TopDocs hits = is.search(query, 10); System.out.println("匹配"+q+", 共查询到"+hits.totalHits+"个文档"); for(ScoreDoc scoreDoc : hits.scoreDocs) { Document document = is.doc(scoreDoc.doc); System.out.println(document.get("fullPath")); } }
/** * * <b>Description</b><br> * (解析查询表达式,使用QueryParser比较常用) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月26日 下午9:58:59</b> */ @Test public void testQueryParser()throws Exception{ String searchField = "contents"; String q = "particular"; Analyzer analyzer = new StandardAnalyzer(); //标准分词器 QueryParser parser = new QueryParser(searchField, analyzer); Query query = parser.parse(q); TopDocs hits = is.search(query, 10); System.out.println("匹配"+q+", 共查询到"+hits.totalHits+"个文档"); for(ScoreDoc scoreDoc : hits.scoreDocs) { Document document = is.doc(scoreDoc.doc); System.out.println(document.get("fullPath")); } }
/** * * <b>Description</b><br> * (解析查询表达式--加入条件或) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月26日 下午10:05:18</b> */ @Test public void testQueryParserOR()throws Exception{ String searchField = "contents"; //或者使用particular Unicode也可以表示或的关系 String q = "particular or Unicode"; Analyzer analyzer = new StandardAnalyzer(); //标准分词器 QueryParser parser = new QueryParser(searchField, analyzer); Query query = parser.parse(q); TopDocs hits = is.search(query, 10); System.out.println("匹配"+q+", 共查询到"+hits.totalHits+"个文档"); for(ScoreDoc scoreDoc : hits.scoreDocs) { Document document = is.doc(scoreDoc.doc); System.out.println(document.get("fullPath")); } }
/** * * <b>Description</b><br> * (解析查询表达式--加入条件与) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月26日 下午10:05:18</b> */ @Test public void testQueryParserAND()throws Exception{ String searchField = "contents"; //要使用大写的AND关联两个字符 String q = "particular AND benchmarks"; Analyzer analyzer = new StandardAnalyzer(); //标准分词器 QueryParser parser = new QueryParser(searchField, analyzer); Query query = parser.parse(q); TopDocs hits = is.search(query, 10); System.out.println("匹配"+q+", 共查询到"+hits.totalHits+"个文档"); for(ScoreDoc scoreDoc : hits.scoreDocs) { Document document = is.doc(scoreDoc.doc); System.out.println(document.get("fullPath")); } }
/** * * <b>Description</b><br> * (解析查询表达式--模糊查询) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月26日 下午10:05:18</b> */ @Test public void testQueryParserBlurry()throws Exception{ String searchField = "contents"; //要使用~进行模糊查询 String q = "part~"; Analyzer analyzer = new StandardAnalyzer(); //标准分词器 QueryParser parser = new QueryParser(searchField, analyzer); Query query = parser.parse(q); TopDocs hits = is.search(query, 10); System.out.println("匹配"+q+", 共查询到"+hits.totalHits+"个文档"); for(ScoreDoc scoreDoc : hits.scoreDocs) { Document document = is.doc(scoreDoc.doc); System.out.println(document.get("fullPath")); } } }
|
Lucene的其他查询方式
在com.kingsoft.lucene包下创建Indexer.java
public class Indexer {
private Integer ids[] = { 1, 2, 3 }; private String citys[] = { "aingdao", "nanjing", "shanghai" }; private String descs[] = { "Qingdao is a beautiful city.", "Nanjing is b city of culture.", "Shanghai is c bustling city." };
private Directory directory;
/** * * <b>Description</b><br> * (获取IndexWriter实例) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月25日 下午10:35:36</b> */ private IndexWriter getWriter()throws Exception{ Analyzer analyzer = new StandardAnalyzer(); //标准分词器 IndexWriterConfig iwc = new IndexWriterConfig(analyzer); IndexWriter writer = new IndexWriter(directory, iwc); return writer; }
/** * * <b>Description</b><br> * (生成索引) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月27日 下午9:01:21</b> */ private void index(String indexDir)throws Exception{ directory = FSDirectory.open(Paths.get(indexDir)); IndexWriter writer = getWriter(); for(int i=0;i<ids.length;i++) { Document document = new Document(); document.add(new IntField("id", ids[i], Field.Store.YES)); document.add(new StringField("city", citys[i], Field.Store.YES)); document.add(new StringField("desc", descs[i], Field.Store.YES)); writer.addDocument(document); //添加文档 } writer.close(); }
public static void main(String[] args)throws Exception{ new Indexer().index("D:\\lucene"); }
} |
创建测试类SearchTest.java
import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.NumericRangeQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.junit.After; import org.junit.Before; import org.junit.Test;
public class SearchTest {
private Directory directory; private IndexReader reader; private IndexSearcher is;
@Before public void setUp() throws Exception { directory = FSDirectory.open(Paths.get("D:\\lucene")); reader = DirectoryReader.open(directory); is = new IndexSearcher(reader); }
@After public void tearDown() throws Exception { reader.close(); }
/** * * <b>Description</b><br> * (指定数字范围搜索) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月27日 下午9:47:10</b> */ @Test public void testNumRangeQuery()throws Exception{ //字段为id,从1开始到2,是否包含开始,是否包含结束 NumericRangeQuery<Integer> query = NumericRangeQuery .newIntRange("id", 1, 2, true, true); TopDocs hits = is.search(query, 10); for(ScoreDoc scoreDoc : hits.scoreDocs) { Document document = is.doc(scoreDoc.doc); System.out.println(document.get("id")); System.out.println(document.get("city")); System.out.println(document.get("desc")); } }
/** * * <b>Description</b><br> * (指定字符串开头搜素) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月27日 下午10:12:08</b> */ @Test public void testPreFixQuery()throws Exception{ //查询字段为city,开头为a PrefixQuery query = new PrefixQuery(new Term("city","a")); TopDocs hits = is.search(query, 10); for(ScoreDoc scoreDoc : hits.scoreDocs) { Document document = is.doc(scoreDoc.doc); System.out.println(document.get("id")); System.out.println(document.get("city")); System.out.println(document.get("desc")); } }
/** * * <b>Description</b><br> * (多条件组合搜素) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月27日 下午10:16:39</b> */ @Test public void testBooleanQuery()throws Exception{ NumericRangeQuery<Integer> query1 = NumericRangeQuery .newIntRange("id", 1, 2, true, true); PrefixQuery query2 = new PrefixQuery(new Term("city","a")); BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder(); /** * Occur.MUST == AND关系 * Occur.SHOULD == OR关系 * Occur.MUST_NOT == 不包含 */ booleanQuery.add(query1, BooleanClause.Occur.MUST); booleanQuery.add(query2, BooleanClause.Occur.MUST); TopDocs hits = is.search(booleanQuery.build(), 10); for(ScoreDoc scoreDoc : hits.scoreDocs) { Document document = is.doc(scoreDoc.doc); System.out.println(document.get("id")); System.out.println(document.get("city")); System.out.println(document.get("desc")); } }
}
|
中文分词,高亮显示
创建maven项目,jar文件
<dependencies> <!-- 核心包lucene-core --> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-core</artifactId> <version>5.3.1</version> </dependency> <!-- 查询解析lucene-queryparser --> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-queryparser</artifactId> <version>5.3.1</version> </dependency> <!-- 解析器lucene-analyzers-common --> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-analyzers-common</artifactId> <version>5.3.1</version> </dependency> <!-- 中文分词器lucene-analyzers-smartcn --> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-analyzers-smartcn</artifactId> <version>5.3.1</version> </dependency> <!-- 高亮显示lucene-highlighter --> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-highlighter</artifactId> <version>5.3.1</version> </dependency> <!-- 测试junit --> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>4.12</version> <scope>test</scope> </dependency> </dependencies> |
创建Indexer.java创建索引
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory;
public class Indexer {
private Integer ids[] = { 1, 2, 3 }; private String citys[] = {"青岛","南京","上海"}; private String descs[] = { "青岛是一个美丽的城市。", "南京是一个有文化的城市。", "上海是一个繁华的城市。" };
private Directory directory;
/** * * <b>Description</b><br> * (获取IndexWriter实例) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月25日 下午10:35:36</b> */ private IndexWriter getWriter()throws Exception{ SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); IndexWriter writer = new IndexWriter(directory, iwc); return writer; }
/** * * <b>Description</b><br> * (生成索引) * <br> * -------------------------------------------------<br> * <b>王欢 2019年10月27日 下午9:01:21</b> */ private void index(String indexDir)throws Exception{ directory = FSDirectory.open(Paths.get(indexDir)); IndexWriter writer = getWriter(); for(int i=0;i<ids.length;i++) { Document document = new Document(); document.add(new IntField("id", ids[i], Field.Store.YES)); document.add(new StringField("city", citys[i], Field.Store.YES)); document.add(new TextField("desc", descs[i], Field.Store.YES)); writer.addDocument(document); //添加文档 } writer.close(); }
public static void main(String[] args)throws Exception{ new Indexer().index("D:\\lucene"); } } |
创建Searcher.java
import java.io.StringReader; import java.nio.file.Paths;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.highlight.Fragmenter; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.search.highlight.SimpleSpanFragmenter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory;
public class Searcher {
public static void search(String indexDir, String q)throws Exception{ Directory directory = FSDirectory.open(Paths.get(indexDir)); IndexReader reader = DirectoryReader.open(directory); IndexSearcher is = new IndexSearcher(reader); SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(); QueryParser parser = new QueryParser("desc", analyzer); Query query = parser.parse(q); long start = System.currentTimeMillis(); TopDocs hits = is.search(query, 10); //开始查询,得到Top10 long end = System.currentTimeMillis(); System.out.println("匹配"+q+",耗时"+(end-start)+"毫秒"+"查询到:"+hits.totalHits+ "条记录"); /** * 得到关键字加粗,字体为红色 */ QueryScorer scorer = new QueryScorer(query); //计算得分,把得分高的片段显示出来 Fragmenter fragmenter = new SimpleSpanFragmenter(scorer); //片段 SimpleHTMLFormatter simple = new SimpleHTMLFormatter("<b><font color='red'>","</font></b>"); Highlighter highlighter = new Highlighter(simple, scorer); highlighter.setTextFragmenter(fragmenter);
for(ScoreDoc scoreDoc : hits.scoreDocs) { Document document = is.doc(scoreDoc.doc); System.out.println(document.get("city")); System.out.println(document.get("desc")); String desc = document.get("desc"); if(desc!=null) { //得到权重最高的摘要 TokenStream tokenStream = analyzer.tokenStream("desc", new StringReader(desc)); //高亮显示 String bestFragment = highlighter.getBestFragment(tokenStream, desc); System.out.println(bestFragment); } } reader.close(); }
public static void main(String[] args) { String indexDir = "D:\\lucene"; String q = "南京文化"; try { search(indexDir, q); } catch (Exception e) { e.printStackTrace(); } }
} |
查询的结果是以html格式
匹配南京文化,耗时13毫秒查询到:1条记录 南京 南京是一个有文化的城市。 <b><font color='red'>南京</font></b>是一个有<b><font color='red'>文化</font></b>的城市。 |
本文只做自己的学习记录,不喜勿喷