注:在MyEclipse中可以通过Ctrl+Shift+R和通配符查询相关的资源。
1、全文检索的概念
<1>从大量的信息中快速、准确地查找出要的信息。
<2>搜索的内容是文本信息(不是多媒体)。
<3>根据文本的关键词进行搜索,而不是根据语义进行搜索。
<4>全面、快速、准确是衡量全文检索系统的关键指标。
<5>搜索时英文不区分大小写。
<6>结果列表由相关度排序。
<7>全文搜索有站内搜索和垂直搜索
2、全文搜索与数据搜索的区别
数据库搜索的缺点:
<1>搜索效果比较差。
<2>在搜索的结果中,有大量的数据被搜索出来,有很多数据是没有用的。
<3>在数据量很大的情况下,查询速度很难做到快速。
3、互联网搜索结构图
由上图可以看出,互联网搜索就是利用爬虫搜索信息并且在索引库中建立索引,这样索引库中的信息就可能与网络上的网页信息不一致,从而导致有时候利用引擎搜索的网页会找不到相关的网站。
4、lucene的大致结构框图
由上图可以看出,利用lucene的api进行增删改操作其实就是对索引库的操作。查询操作就是根据索引库的索引查询出内容然后返回。在存入索引库的时候,其实是存入对象的某些字段及其值,并且转换为Document,然后存入索引库。
5、lucene建立索引图
6、使用lucene
lucene的核心jar包有:
(核心包)lucene-core-3.0.1.jar
(分词器)lucene-analyzers-3.1.0.jar
(高亮器)lucene-highlighter-3.1.0.jar
lucene-memory-3.1.0.jar
<1>导入核心jar包
<2>创建一个需要用lucene进行操作的类
<3>利用lucene的API操作此类的对象。
cn.itheima.lucene.bean.Article.java
public class Article {
private Long id ;
private String title ;
private String content ;
public Long getId() {
return id ;
}
public void setId(Long id) {
this.id = id;
}
public String getTitle() {
return title ;
}
public void setTitle(String title) {
this.title = title;
}
public String getContent() {
return content ;
}
public void setContent(String content) {
this.content = content;
}
}
cn.itheima.lucene.test.LuceneTest.java
/**
 * 1、创建article对象,并把该对象放入到索引库中
 * 2、根据关键词检索article对象
 */
public class LuceneTest {
/*
* 1、创建article对象
* 2、创建indexWriter对象
* 3、把article对象写入到索引库中
*/
@Test
public void testCreateIndex() throws Exception {
Article article = new Article();
article.setId(1L);
article.setTitle( “lucene可以用来做搜索引擎” );
article.setContent( “baidu、google搜索引擎公司” );Directory directory = FSDirectory. open(new File("./indexDir" )); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); //MaxFieldLength限制字段内容的大小 IndexWriter indexWriter = new IndexWriter(directory,analyzer,MaxFieldLength.LIMITED); Document document = new Document(); //name代表存放在索引库中的名称 Field field1 = new Field("id",article.getId().toString(),Store.YES,Index.NOT_ANALYZED); Field field2 = new Field("title" ,article.getTitle(),Store.YES,Index. ANALYZED); Field field3 = new Field("content" ,article.getContent(),Store.YES,Index. ANALYZED); document.add(field1); document.add(field2); document.add(field3); indexWriter.addDocument(document); indexWriter.close();
}
}
执行结果:
cn.itheima.lucene.test.LuceneTest.java
public class LuceneTest {

    /**
     * Searches the "title" field for the keyword "lucene",
     * converts each hit back into an Article, and prints it.
     */
    @Test
    public void testSearch() throws Exception {
        Directory directory = FSDirectory.open(new File("./indexDir"));
        IndexSearcher searcher = new IndexSearcher(directory);
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        QueryParser parser = new QueryParser(Version.LUCENE_30, "title", analyzer);
        // the search keyword
        Query query = parser.parse("lucene");
        // TopDocs holds references into the index for the best matches
        TopDocs topDocs = searcher.search(query, 1);
        // total number of documents matching the keyword
        int totalHits = topDocs.totalHits;
        List<Article> results = new ArrayList<Article>();
        for (ScoreDoc hit : topDocs.scoreDocs) {
            // hit.doc is the internal document id; hit.score the relevance score
            int docId = hit.doc;
            float score = hit.score;
            Document doc = searcher.doc(docId);
            Article article = new Article();
            article.setId(Long.parseLong(doc.get("id")));
            article.setTitle(doc.get("title"));
            article.setContent(doc.get("content"));
            results.add(article);
        }
        for (Article article : results) {
            System.out.println(article.getId() + "\n" + article.getTitle() + "\n" + article.getContent());
        }
    }
}
执行结果:
7、改写上面的程序
cn.itheima.lucene.utils.DocumentUtils.java
/**
 * Converts between the domain object {@code Article} and the Lucene
 * {@code Document} representation stored in the index.
 */
public class DocumentUtils {

    /** Maps an Article onto a Document: id is stored un-analyzed, title/content analyzed. */
    public static Document article2Document(Article article) {
        Document document = new Document();
        document.add(new Field("id", article.getId().toString(), Store.YES, Index.NOT_ANALYZED));
        document.add(new Field("title", article.getTitle(), Store.YES, Index.ANALYZED));
        document.add(new Field("content", article.getContent(), Store.YES, Index.ANALYZED));
        return document;
    }

    /** Rebuilds an Article from the stored fields of a Document. */
    public static Article document2Article(Document document) {
        Article article = new Article();
        article.setId(Long.parseLong(document.get("id")));
        article.setTitle(document.get("title"));
        article.setContent(document.get("content"));
        return article;
    }
}
cn.itheima.lucene.utils.LuceneUtils.java
/**
 * Shared Lucene resources: one {@code Directory} and one {@code Analyzer}
 * reused everywhere so the whole application works against the same index.
 */
public class LuceneUtils {

    public static Directory directory;
    public static Analyzer analyzer;

    static {
        try {
            directory = FSDirectory.open(new File("./indexDir"));
            analyzer = new StandardAnalyzer(Version.LUCENE_30);
        } catch (Exception e) {
            // Fail fast instead of swallowing the exception: the original
            // printStackTrace() left both fields null, so every later index
            // operation died with a confusing NullPointerException.
            throw new ExceptionInInitializerError(e);
        }
    }
}
cn.itheima.lucene.test.LuceneTest.java
public class LuceneTest {

    /** Indexes one Article using the shared LuceneUtils resources and DocumentUtils mapper. */
    @Test
    public void testCreateIndex() throws Exception {
        Article article = new Article();
        article.setId(1L);
        article.setTitle("lucene可以用来做搜索引擎");
        article.setContent("baidu、google搜索引擎公司");
        IndexWriter writer = new IndexWriter(LuceneUtils.directory, LuceneUtils.analyzer, MaxFieldLength.LIMITED);
        writer.addDocument(DocumentUtils.article2Document(article));
        writer.close();
    }

    /** Searches both title and content for "lucene" and prints every matching Article. */
    @Test
    public void testSearch() throws Exception {
        IndexSearcher searcher = new IndexSearcher(LuceneUtils.directory);
        QueryParser parser = new MultiFieldQueryParser(
                Version.LUCENE_30, new String[]{"title", "content"}, LuceneUtils.analyzer);
        Query query = parser.parse("lucene");
        TopDocs topDocs = searcher.search(query, 3);
        List<Article> results = new ArrayList<Article>();
        for (ScoreDoc hit : topDocs.scoreDocs) {
            Document doc = searcher.doc(hit.doc);
            results.add(DocumentUtils.document2Article(doc));
        }
        for (Article article : results) {
            System.out.println(article.getId() + "\n" + article.getTitle() + "\n" + article.getContent());
        }
    }
}
执行结果:
8、删除数据
Term就是对关键词的对象封装,包含两个属性:field、text。
LuceneTest.java
public class LuceneTest {

    /**
     * Deleting does not remove the original .cfs segment file; Lucene just
     * records the deletions in an additional .del file next to it.
     */
    @Test
    public void testDelete() throws Exception {
        IndexWriter writer = new IndexWriter(LuceneUtils.directory, LuceneUtils.analyzer, MaxFieldLength.LIMITED);
        // A Term wraps a keyword as (field, text); every document matching it is deleted
        writer.deleteDocuments(new Term("title", "lucene"));
        writer.close();
    }
}
9、更新操作
public class LuceneTest {

    /**
     * Update = delete-then-add: the Term selects the documents to remove,
     * the Document supplies the replacement content.
     */
    @Test
    public void testUpdate() throws Exception {
        IndexWriter writer = new IndexWriter(LuceneUtils.directory, LuceneUtils.analyzer, MaxFieldLength.LIMITED);
        Term selector = new Term("title", "lucene");
        Article replacement = new Article();
        replacement.setId(1L);
        replacement.setTitle("lucene可以用来做搜索引擎");
        replacement.setContent("baidu搜索引擎公司");
        writer.updateDocument(selector, DocumentUtils.article2Document(replacement));
        writer.close();
    }
}
执行结果:
总结:更新操作实际上是先删除后增加。
10、保持数据库与索引库的同步
在一个系统中,如果索引功能存在,那么数据库和索引库应该是同时存在的。这个时候需要保证索引库的数据和数据库中的数据保持一致性。可以在数据库进行增删改操作的时候对索引库也进行相应的操作。这样就可以保证数据库与索引库的一致性。
11、同一个索引库不能同时存在两个IndexWriter
cn.itheima.lucene.test.IndexWriterTest.java
public class IndexWriterTest {

    /**
     * Demonstrates that one index directory allows only a single open
     * IndexWriter: the second constructor call fails because the first
     * writer still holds the index write lock.
     *
     * Fix: the original never closed either writer, so the write.lock file
     * stayed behind and blocked every subsequent test run; the finally
     * block now always releases the lock.
     */
    @Test
    public void test() throws Exception {
        IndexWriter indexWriter1 = null;
        IndexWriter indexWriter2 = null;
        try {
            indexWriter1 = new IndexWriter(LuceneUtils.directory, LuceneUtils.analyzer, MaxFieldLength.LIMITED);
            // expected to fail: the write lock is already held by indexWriter1
            indexWriter2 = new IndexWriter(LuceneUtils.directory, LuceneUtils.analyzer, MaxFieldLength.LIMITED);
        } finally {
            if (indexWriter2 != null) {
                indexWriter2.close();
            }
            if (indexWriter1 != null) {
                indexWriter1.close();
            }
        }
    }
}
执行结果:
总结:<1>由上图可见,只要有一个IndexWriter操作索引库,那么它就会对索引库加锁,只要这个indexWriter没有关闭从而解锁,那么其他的IndexWriter和IndexSearch就无法访问索引库。如果使用了IndexSearch去查询索引库,也获取不了任何东西。
<2>当indexWriter关闭的时候,释放IO流的资源并且释放锁。
<3>索引库的最多的操作是检索,后台维护的操作还是比较少的,所以不需要很多的IndexWriter。
12、索引库的优化
如果执行了多次上面的testCreateIndex,就会出现如下图所示的情况。
一般情况下,只要达到一定数目的cfs文件,那么lucene就会自动合并cfs文件从而达到优化的目的。不过有时候,需要手动进行优化。
cn.itheima.lucene.test.IndexWriterTest.java
public class LuceneTest {

    /**
     * Manually merges the accumulated .cfs segment files into one.
     * Lucene merges automatically once enough segments exist; this forces it.
     */
    @Test
    public void testOptimize() throws Exception {
        IndexWriter writer = new IndexWriter(LuceneUtils.directory, LuceneUtils.analyzer, MaxFieldLength.LIMITED);
        writer.optimize();
        writer.close();
    }
}
执行结果:
13、内存索引库的特点:
1、查询效率比较快
2、数据不是持久化数据
文件索引库的特点:
1、查询效率比较慢
2、数据是持久化的
所以可以将内存索引库和文件索引库结合起来。
cn.itheima.lucene.test.DirectoryTest.java
/**
 * 将数据存放入内存索引库,然后查询显示出来
 */
public class DirectoryTest {

    /**
     * Writes one Article into an in-memory index (RAMDirectory),
     * then searches it and prints the hits.
     */
    @Test
    public void testRamDirectory() throws Exception {
        // create the in-memory index
        Directory ramDirectory = new RAMDirectory();
        Article article = new Article();
        article.setId(1L);
        article.setTitle("lucene可以用来做搜索引擎");
        article.setContent("baidu、google搜索引擎公司");
        IndexWriter writer = new IndexWriter(ramDirectory, LuceneUtils.analyzer, MaxFieldLength.LIMITED);
        writer.addDocument(DocumentUtils.article2Document(article));
        writer.close();
        this.showData(ramDirectory);
    }

    /**
     * Runs a multi-field "lucene" query against the given directory
     * and prints every matching Article.
     */
    private void showData(Directory directory) throws Exception {
        IndexSearcher searcher = new IndexSearcher(directory);
        QueryParser parser = new MultiFieldQueryParser(
                Version.LUCENE_30, new String[]{"title", "content"}, LuceneUtils.analyzer);
        Query query = parser.parse("lucene");
        TopDocs topDocs = searcher.search(query, 3);
        List<Article> results = new ArrayList<Article>();
        for (ScoreDoc hit : topDocs.scoreDocs) {
            Document doc = searcher.doc(hit.doc);
            results.add(DocumentUtils.document2Article(doc));
        }
        for (Article article : results) {
            System.out.println(article.getId() + "\n" + article.getTitle() + "\n" + article.getContent());
        }
    }
}
执行结果:
cn.itheima.lucene.test.DirectoryTest.java
public class DirectoryTest {

    /**
     * Combines a file index with an in-memory index:
     * 1. open one IndexWriter per directory,
     * 2. copy the file index into the RAMDirectory,
     * 3. let the application talk to the fast in-memory index,
     * 4. sync the in-memory index back to the file index.
     */
    @Test
    public void testFileAndRam() throws Exception {
        Directory fileDirectory = FSDirectory.open(new File("./indexDir"));
        // RAMDirectory(fileDirectory) loads the on-disk index into memory
        Directory ramDirectory = new RAMDirectory(fileDirectory);
        IndexWriter ramIndexWriter = new IndexWriter(ramDirectory, LuceneUtils.analyzer, MaxFieldLength.LIMITED);
        // the 'true' flag wipes the file index before rewriting;
        // without it the merged documents would be appended
        IndexWriter fileIndexWriter = new IndexWriter(fileDirectory, LuceneUtils.analyzer, true, MaxFieldLength.LIMITED);

        Article article = new Article();
        article.setId(2L);
        article.setTitle("lucene可以用来做搜索引擎2");
        article.setContent("baidu、google搜索引擎公司2");
        ramIndexWriter.addDocument(DocumentUtils.article2Document(article));
        // the RAM writer MUST be closed before the merge below, otherwise the
        // newly added document is not yet flushed and never reaches the file index
        ramIndexWriter.close();

        fileIndexWriter.addIndexesNoOptimize(ramDirectory);
        fileIndexWriter.close();
        this.showData(fileDirectory);
    }
}
执行结果:
注:lucene提供了一些方法可以做很多个索引库出来(在一个项目中),可以针对某一个索引库进行检索,还可以针对合并的索引库进行检索。fileIndexWriter.addIndexesNoOptimize(ramDirectory)。
14、分词器
针对不同的语言有不同的分词器。
cn.itheima.lucene.test.AnalyzerTest.java
public class AnalyzerTest {

    /** Runs English text through the StandardAnalyzer and prints each token. */
    @Test
    public void testAnalyzer_En() throws Exception {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        String text = "Creates a searcher";
        this.testAnalyzer(analyzer, text);
    }

    // Prints every term the given analyzer extracts from the text.
    private void testAnalyzer(Analyzer analyzer, String text) throws Exception {
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
        tokenStream.addAttribute(TermAttribute.class);
        while (tokenStream.incrementToken()) {
            TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
            System.out.println(termAttribute.term());
        }
    }
}
执行结果:
总结:英文分词器分词的三个步骤:
<1>切分关键词
<2>去掉停用词
<3>把大写变为小写
单词法分词
cn.itheima.lucene.test.AnalyzerTest.java
public class AnalyzerTest {

    /** Single-character ("unigram") Chinese tokenization via ChineseAnalyzer. */
    @Test
    public void testAnalyzer_CH1() throws Exception {
        Analyzer analyzer = new ChineseAnalyzer();
        String text = "黑马训练营";
        this.testAnalyzer(analyzer, text);
    }
}
执行结果:
二词法分词
cn.itheima.lucene.test.AnalyzerTest.java
public class AnalyzerTest {

    /** Two-character ("bigram") Chinese tokenization via CJKAnalyzer. */
    @Test
    public void testAnalyzer_CH1() throws Exception {
        Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
        String text = "黑马训练营";
        this.testAnalyzer(analyzer, text);
    }
}
执行结果:
IKAnalyzer的使用
<1>拷贝IKAnalyzer3.2.0Stable.jar。
<2>拷贝ext_stopword.dic(必须设置成UTF-8编码,否则无法使用)、IKAnalyzer.cfg.xml。
<3>使用IKAnalyzer进行分词解析。
cn.itheima.lucene.test.AnalyzerTest.java
public class AnalyzerTest {

    /** Dictionary-based Chinese tokenization via IKAnalyzer. */
    @Test
    public void testAnalyzer_CH3() throws Exception {
        Analyzer analyzer = new IKAnalyzer();
        String text = "黑马训练营";
        this.testAnalyzer(analyzer, text);
    }
}
执行结果:
注:之所以能够达到这样的效果,是因为IKAnalyzer的jar包中包含了很多的dic文件,里面设置了大量的中文词汇。
IKAnalyzer还提供了扩展配置,使用户可以自己添加dic文件(包括扩展字典和扩展停止词字典,并且必须是UTF-8编码类型的,项目也必须是UTF-8编码的)。
IKAnalyzer.cfg.xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
	<entry key="ext_dict">/mydict.dic</entry>
	<entry key="ext_stopwords">/ext_stopword.dic</entry>
</properties>
mydict.dic
方立勋
执行结果:
ext_stopword.dic
方立勋
执行结果:
15、高亮
高亮的作用是:1、使关键词高亮 2、控制摘要的大小
cn.itheima.lucene.test.HighlighterTest.java
public class HighlighterTest {

    /**
     * Highlights the query keyword in the title field and trims the text
     * to a short fragment ("abstract") before printing the results.
     */
    @Test
    public void testHighlighter() throws Exception {
        IndexSearcher searcher = new IndexSearcher(LuceneUtils.directory);
        QueryParser parser = new MultiFieldQueryParser(
                Version.LUCENE_30, new String[]{"title", "content"}, LuceneUtils.analyzer);
        Query query = parser.parse("lucene");

        // the scorer decides which terms in the text deserve highlighting
        Scorer scorer = new QueryScorer(query);
        // prefix/suffix wrapped around highlighted terms — HTML output only
        Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
        Highlighter highlighter = new Highlighter(formatter, scorer);
        // limit the returned fragment (abstract) to 20 characters
        highlighter.setTextFragmenter(new SimpleFragmenter(20));

        TopDocs topDocs = searcher.search(query, 3);
        List<Article> results = new ArrayList<Article>();
        for (ScoreDoc hit : topDocs.scoreDocs) {
            Document doc = searcher.doc(hit.doc);
            // re-analyze the stored title and return the best highlighted fragment
            String fragment = highlighter.getBestFragment(LuceneUtils.analyzer, "title", doc.get("title"));
            doc.getField("title").setValue(fragment);
            results.add(DocumentUtils.document2Article(doc));
        }
        for (Article article : results) {
            System.out.println(article.getId() + "\n" + article.getTitle() + "\n" + article.getContent());
        }
    }
}
执行结果:
cn.itheima.lucene.test.HighlighterTest.java
public class HighlighterTest {

    /**
     * Highlights the query keyword in both title and content and trims each
     * to a short fragment before printing the results.
     *
     * Fix: getBestFragment returns null when the keyword does not occur in a
     * field. The original had the null check for the content fragment
     * commented out, so setValue(null) blew up for documents whose content
     * lacked the keyword; both fragments are now guarded.
     */
    @Test
    public void testHighlighter() throws Exception {
        IndexSearcher indexSearcher = new IndexSearcher(LuceneUtils.directory);
        QueryParser queryParser = new MultiFieldQueryParser(
                Version.LUCENE_30, new String[]{"title", "content"}, LuceneUtils.analyzer);
        Query query = queryParser.parse("lucene");

        // scorer picks the terms to highlight
        Scorer scorer = new QueryScorer(query);
        // prefix/suffix wrapped around highlighted terms — HTML output only
        Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
        Highlighter highlighter = new Highlighter(formatter, scorer);
        // limit the abstract to 20 characters
        Fragmenter fragmenter = new SimpleFragmenter(20);
        highlighter.setTextFragmenter(fragmenter);

        TopDocs topDocs = indexSearcher.search(query, 3);
        List<Article> articles = new ArrayList<Article>();
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document doc = indexSearcher.doc(scoreDoc.doc);
            String fragment1 = highlighter.getBestFragment(LuceneUtils.analyzer, "title", doc.get("title"));
            String fragment2 = highlighter.getBestFragment(LuceneUtils.analyzer, "content", doc.get("content"));
            // only overwrite a field when a highlighted fragment was produced;
            // a null fragment means the keyword did not appear in that field
            if (fragment1 != null) {
                doc.getField("title").setValue(fragment1);
            }
            if (fragment2 != null) {
                doc.getField("content").setValue(fragment2);
            }
            articles.add(DocumentUtils.document2Article(doc));
        }
        for (Article article : articles) {
            System.out.println(article.getId() + "\n" + article.getTitle() + "\n" + article.getContent());
        }
    }
}
执行结果:
16、分页
cn.itheima.lucene.test.DispageTest.java
public class DispageTest {

    @Test
    public void testDispage() throws Exception {
        testSearch(0, 25);
    }

    /**
     * Prints one page of search results.
     *
     * Fix: the original looped to firstResult + maxResult unconditionally,
     * which throws ArrayIndexOutOfBoundsException whenever fewer hits were
     * returned; the loop is now clamped to the hits actually fetched.
     *
     * @param firstResult index of the first hit on the page
     * @param maxResult   maximum number of hits on the page
     */
    private void testSearch(int firstResult, int maxResult) throws Exception {
        IndexSearcher indexSearcher = new IndexSearcher(LuceneUtils.directory);
        QueryParser queryParser = new MultiFieldQueryParser(
                Version.LUCENE_30, new String[]{"title", "content"}, LuceneUtils.analyzer);
        Query query = queryParser.parse("lucene");
        TopDocs topDocs = indexSearcher.search(query, 25);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        List<Article> articles = new ArrayList<Article>();
        // clamp to the hits actually returned to avoid running off the array
        int end = Math.min(scoreDocs.length, firstResult + maxResult);
        for (int i = firstResult; i < end; i++) {
            Document doc = indexSearcher.doc(scoreDocs[i].doc);
            articles.add(DocumentUtils.document2Article(doc));
        }
        for (Article article : articles) {
            System.out.println(article.getId() + "\n" + article.getTitle() + "\n" + article.getContent());
        }
    }
}
执行结果:
cn.itheima.lucene.test.DispageTest.java
public class DispageTest {
@Test
public void testDispage () throws Exception{
testSearch(20, 30);
}
private void testSearch(int firstResult, int maxResult) throws Exception {
IndexSearcher indexSearcher = new IndexSearcher(LuceneUtils.directory );
QueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_30 ,new String[]{"title" ,"content" },LuceneUtils.analyzer);
Query query = queryParser.parse( "lucene");
TopDocs topDocs = indexSearcher.search(query,25);
ScoreDoc[] scoreDocs = topDocs. scoreDocs;
List<Article> articles = new ArrayList<Article>();
//为了防止越界,所有有必要取最小值
int length = Math.min(topDocs. totalHits, firstResult + maxResult);
for(int i = firstResult; i < length; i++){
int index = scoreDocs[i].doc ;
Document doc = indexSearcher.doc(index);
Article article = DocumentUtils. document2Article(doc);
articles.add(article);
}
for(Article article : articles){
System. out.println(article.getId() + "\n" + article.getTitle() + "\n" + article.getContent());
}
}
}
执行结果:
17、搜索方式
cn.itheima.lucene.test.LuceneTest.java
public class LuceneTest {
@Test
public void testCreateIndexBatch() throws Exception {
IndexWriter indexWriter = new IndexWriter(LuceneUtils.directory ,LuceneUtils.analyzer,MaxFieldLength. LIMITED);
for(int i = 1; i <= 25; i++){
Article article = new Article();
article.setId(Long. parseLong(i + “”));
article.setTitle( “lucene可以用来做搜索引擎” );
article.setContent( “baidu搜索引擎公司” );
indexWriter.addDocument(DocumentUtils. article2Document(article));
}
indexWriter.close();
}
}
cn.itheima.lucene.utils.DocumentUtils.java
/**
 * Converts between Article and Lucene Document, encoding the numeric id with
 * NumericUtils so that range queries work. A plain String id would compare
 * lexicographically and range queries would return wrong results.
 */
public class DocumentUtils {

    /** Maps an Article onto a Document; the id is prefix-coded for numeric range queries. */
    public static Document article2Document(Article article) {
        Document document = new Document();
        // longToPrefixCoded already returns a String — the original appended a
        // redundant .toString() call, removed here.
        Field field1 = new Field("id", NumericUtils.longToPrefixCoded(article.getId()), Store.YES, Index.NOT_ANALYZED);
        Field field2 = new Field("title", article.getTitle(), Store.YES, Index.ANALYZED);
        Field field3 = new Field("content", article.getContent(), Store.YES, Index.ANALYZED);
        document.add(field1);
        document.add(field2);
        document.add(field3);
        return document;
    }

    /** Rebuilds an Article from a Document, decoding the prefix-coded id. */
    public static Article document2Article(Document document) {
        Article article = new Article();
        article.setId(NumericUtils.prefixCodedToLong(document.get("id")));
        article.setTitle(document.get("title"));
        article.setContent(document.get("content"));
        return article;
    }
}
cn.itheima.lucene.test.QueryTest.java
public class QueryTest {
    /**
     * Query styles demonstrated below:
     *   keyword query (TermQuery)
     *   match-all query
     *   numeric range query
     *   wildcard query        -- important
     *   phrase query
     *   boolean query         -- important
     */
    @Test
    public void testTermQuery() throws Exception {
        // A keyword query wraps one (field, text) pair. No analyzer runs here,
        // so the text must already be lower-case or it will not match the index.
        Term term = new Term("title", "lucene");
        Query query = new TermQuery(term);
        testSearch(query);
    }

    @Test
    public void testQueryAllDocs() throws Exception {
        // matches every document in the index
        Query query = new MatchAllDocsQuery();
        this.testSearch(query);
    }

    @Test
    public void testQueryRange() throws Exception {
        // numeric range query; ids must have been indexed with the
        // NumericUtils prefix coding, a plain String id would not match
        Query query = NumericRangeQuery.newLongRange("id", 5L, 15L, true, true);
        this.testSearch(query);
    }

    @Test
    public void testQueryWildCard() throws Exception {
        // wildcard query: * matches any number of characters, ? exactly one
        Term term = new Term("title", "l*?");
        Query query = new WildcardQuery(term);
        this.testSearch(query);
    }

    @Test
    public void testQueryPharse() throws Exception {
        // phrase query: every term must target the SAME field
        Term term = new Term("title", "lucene");
        Term term2 = new Term("title", "搜索");
        PhraseQuery query = new PhraseQuery();
        // with two or more terms the token positions must be supplied,
        // otherwise nothing matches
        query.add(term, 0);
        query.add(term2, 7);
        this.testSearch(query);
    }

    /**
     * Boolean query:
     *   Occur.MUST      clause is required (and)
     *   Occur.MUST_NOT  clause must not match
     *   Occur.SHOULD    clause is optional (or)
     */
    @Test
    public void testBooleanQuery() throws Exception {
        Term term = new Term("title", "l*");
        Query query = new WildcardQuery(term);
        Term term2 = new Term("content", "baidu");
        Query query2 = new WildcardQuery(term2);
        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(query, Occur.MUST);
        // Fix: the original added `query` twice, so the content clause
        // (query2) was never part of the boolean query.
        booleanQuery.add(query2, Occur.MUST);
        this.testSearch(booleanQuery);
    }

    // Runs the query and prints up to 25 matching Articles.
    private void testSearch(Query query) throws Exception {
        IndexSearcher indexSearcher = new IndexSearcher(LuceneUtils.directory);
        TopDocs topDocs = indexSearcher.search(query, 25);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        List<Article> articles = new ArrayList<Article>();
        for (ScoreDoc scoreDoc : scoreDocs) {
            Document doc = indexSearcher.doc(scoreDoc.doc);
            articles.add(DocumentUtils.document2Article(doc));
        }
        for (Article article : articles) {
            System.out.println(article.getId() + "\n" + article.getTitle() + "\n" + article.getContent());
        }
    }
}
18、score
cn.itheima.lucene.test.LuceneTest.java
public class LuceneTest {

    /** Indexes one Article with a boosted score (use case: paid ranking). */
    @Test
    public void testCreateIndex() throws Exception {
        Article article = new Article();
        article.setId(26L);
        article.setTitle("lucene可以用来做搜索引擎");
        article.setContent("baidu、google搜索引擎公司");
        IndexWriter writer = new IndexWriter(LuceneUtils.directory, LuceneUtils.analyzer, MaxFieldLength.LIMITED);
        Document document = DocumentUtils.article2Document(article);
        // boosting the document raises its relevance score in search results
        document.setBoost(100);
        writer.addDocument(document);
        writer.close();
    }
}
cn.itheima.lucene.test.ScoreTest.java
/**
 * 1、相同的关键词,相同的结构:得分一样
 * 2、相同的结构,不同的关键词:得分不一样(lucene和搜索的得分是不一样的,一般情况下,中文比英文的得分高)
 * 3、不同的结构,相同的关键词:关键词出现的次数越多,得分越高
 * 4、竞价
 */
public class ScoreTest {

    /** Searches title/content for "lucene", printing each hit's relevance score and Article. */
    @Test
    public void testSearch() throws Exception {
        IndexSearcher searcher = new IndexSearcher(LuceneUtils.directory);
        QueryParser parser = new MultiFieldQueryParser(
                Version.LUCENE_30, new String[]{"title", "content"}, LuceneUtils.analyzer);
        Query query = parser.parse("lucene");
        TopDocs topDocs = searcher.search(query, 25);
        List<Article> results = new ArrayList<Article>();
        for (ScoreDoc hit : topDocs.scoreDocs) {
            // print the relevance score of each hit
            System.out.println(hit.score);
            Document doc = searcher.doc(hit.doc);
            results.add(DocumentUtils.document2Article(doc));
        }
        for (Article article : results) {
            System.out.println(article.getId() + "\n" + article.getTitle() + "\n" + article.getContent());
        }
    }
}