Lucene学习笔记之（四）特定项进行搜索

最新推荐文章于 2023-12-26 09:00:49 发布

孤师

最新推荐文章于 2023-12-26 09:00:49 发布

阅读量1.7k

点赞数 1

分类专栏： Lucene 文章标签： lucene 全文检索文档索引搜索

本文链接：https://blog.csdn.net/yang307511977/article/details/52071388

版权

Lucene 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

步骤一：创建maven项目

步骤二：配置pom.xml文件，代码如下：

                <!-- JUnit:
			needed because the test code in this article uses the @Test annotation;
			this dependency provides it. -->
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.12</version>
			<scope>test</scope>
		</dependency>

		<!-- Lucene core.
			The following three artifacts (core, queryparser, analyzers-common) are the
			Lucene jars this example relies on: core is the core library and queryparser
			provides query parsing. Per the author's note, these three are sufficient
			when the indexed files are entirely in English. -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-core</artifactId>
			<version>5.3.1</version>
		</dependency>

		<!-- Query parser -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-queryparser</artifactId>
			<version>5.3.1</version>
		</dependency>

		<!-- Analyzers -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-analyzers-common</artifactId>
			<version>5.3.1</version>
		</dependency>
		<!-- Per the author's note, the two artifacts below are for the case where the
			indexed files are in Chinese: add them on top of the three above.
			The highlighter artifact is optional for this example; it is listed here
			because a later article in the series uses it. -->
		<!-- Chinese word-segmentation analyzer (smartcn) -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-analyzers-smartcn</artifactId>
			<version>5.3.1</version>
		</dependency>

		<!-- Highlighting -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-highlighter</artifactId>
			<version>5.3.1</version>
		</dependency>

步骤三：开始建包写代码

import java.io.File;
import java.io.FileReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Builds a Lucene index over the files of a directory so that they can later be
 * searched by specific terms.
 *
 * <p>Every regular file becomes one document with three fields:
 * {@code contents} (the tokenized file text, not stored), {@code FileName} and
 * {@code fullPath} (both tokenized and stored).
 *
 * <p>The indexer owns an open {@link IndexWriter}; it must be closed (it is
 * {@link AutoCloseable}) so that the added documents are committed and the
 * index write lock is released.
 */
public class Indexer implements AutoCloseable {

	// Directory holding the index; kept so close() can release it.
	private final Directory directory;

	// Writer that adds documents to the index directory.
	private final IndexWriter writer;

	/**
	 * Opens (or creates) the index located at {@code indexDir}.
	 *
	 * @param indexDir path of the directory that holds the index files
	 * @throws Exception if the directory or the index writer cannot be opened
	 */
	private Indexer(String indexDir) throws Exception{

		Directory dir = FSDirectory.open(Paths.get(indexDir));

		IndexWriter opened;
		try {
			Analyzer analyzer = new StandardAnalyzer();
			IndexWriterConfig con = new IndexWriterConfig(analyzer);
			opened = new IndexWriter(dir, con);
		} catch (Exception e) {
			// Do not leak the directory handle when the writer cannot be created.
			try {
				dir.close();
			} catch (Exception closeFailure) {
				e.addSuppressed(closeFailure);
			}
			throw e;
		}

		directory = dir;
		writer = opened;
	}

	/**
	 * Closes the index writer and then the underlying directory.
	 *
	 * @throws Exception if closing the writer or the directory fails
	 */
	@Override
	public void close()throws Exception{

		try {
			writer.close();
		} finally {
			// Release the directory even if closing the writer failed.
			directory.close();
		}
	}


	/**
	 * Indexes every regular file found directly inside {@code dataDir}.
	 * Sub-directories are skipped (they cannot be opened as text).
	 *
	 * @param dataDir directory whose files should be indexed
	 * @return the value of {@code writer.numDocs()} after indexing, i.e. the
	 *         number of documents in the index, which includes documents added
	 *         by earlier runs against the same index directory
	 * @throws IllegalArgumentException if {@code dataDir} cannot be listed
	 *         (it does not exist, is not a directory, or is not readable)
	 * @throws Exception if reading a file or writing to the index fails
	 */
	public int index(String dataDir) throws Exception{

		// listFiles() returns null rather than throwing when the path is unusable.
		File[] entries = new File(dataDir).listFiles();
		if (entries == null) {
			throw new IllegalArgumentException("Cannot list files of data directory: " + dataDir);
		}

		for (File entry : entries) {
			if (entry.isFile()) {
				indexFile(entry);
			}
		}

		return writer.numDocs();

	}

	/**
	 * Adds a single file to the index as one document.
	 *
	 * @param files the file to index
	 * @throws Exception if the file cannot be read or the document cannot be added
	 */
	private void indexFile(File files) throws Exception {

		System.out.println("索引文件："+files.getCanonicalPath());

		Document document = getDocument(files);

		writer.addDocument(document);

	}

	/**
	 * Builds the Lucene document for one file. A document plays the role of a
	 * row in a database table; its fields are the columns.
	 *
	 * @param files the file to describe
	 * @return a document with the {@code contents}, {@code FileName} and
	 *         {@code fullPath} fields
	 * @throws Exception if the canonical path cannot be resolved or the file
	 *         cannot be opened
	 */
	private Document getDocument(File files) throws Exception {

		// Resolve everything that can fail before opening the reader, so a
		// failure here cannot leave an open reader behind.
		String fileName = files.getName();
		String canonicalPath = files.getCanonicalPath();

		Document doc = new Document();

		// Reader-valued field: the text is indexed but not stored.
		// NOTE(review): the reader uses the platform default charset; it is
		// handed to Lucene, which consumes it when the document is added.
		doc.add(new TextField("contents",new FileReader(files)));

		// Field.Store.YES keeps the original value in the index so it can be
		// returned with search results.
		doc.add(new TextField("FileName", fileName, Field.Store.YES));
		doc.add(new TextField("fullPath", canonicalPath,Field.Store.YES));

		return doc;
	}


	/**
	 * Demo entry point: indexes the files under the hard-coded data directory
	 * into the hard-coded index directory and prints the document count and
	 * elapsed time.
	 *
	 * @param args unused
	 */
	public static void main(String[] args){

		// Where the index is written.
		String indexDir = "E:\\luceneDemo4";

		// Where the files to be indexed live.
		String dataDir = "E:\\luceneDemo4\\data";

		int numIndex = 0;

		long start = System.currentTimeMillis();

		// try-with-resources guarantees the writer is closed, which commits the
		// added documents and releases the index write lock.
		try (Indexer indexer = new Indexer(indexDir)) {

			numIndex = indexer.index(dataDir);

		} catch (Exception e) {

			e.printStackTrace();
		}

		long end = System.currentTimeMillis();

		System.out.println("索引了  "+numIndex+"  个文件，花费了  "+(end-start)+"  毫秒");

	}

}

写完索引，接下来就是根据特定项进行检索查询，代码如下：

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * JUnit tests that query an existing Lucene index: a search for one specific
 * term, and (in a separate listing) a parsed query expression.
 *
 * <p>The index is expected at the hard-coded path {@code E:\luceneDemo4}.
 *
 * @author SZQ
 */
public class searchDocumentDingEl {


	// Directory that holds the index files.
	private Directory dir;
	// Reader over the index in dir.
	private IndexReader reader;
	// Searcher that runs queries against reader.
	private IndexSearcher searcher;

	/**
	 * Opens the index directory, a reader over it, and a searcher on that reader.
	 *
	 * @throws Exception if the directory or the index cannot be opened
	 */
	@Before
	public void setUp() throws Exception {

		dir = FSDirectory.open(Paths.get("E:\\luceneDemo4"));

		reader = DirectoryReader.open(dir);

		searcher = new IndexSearcher(reader);

	}

	/**
	 * Releases the reader and the directory opened by {@link #setUp()}.
	 *
	 * <p>JUnit runs this even when {@code setUp} failed part-way, so either
	 * field may still be {@code null}; both are guarded to avoid masking the
	 * original failure with a NullPointerException.
	 *
	 * @throws Exception if closing the reader or the directory fails
	 */
	@After
	public void tearDown() throws Exception {

		try {
			if (reader != null) {
				reader.close();
			}
		} finally {
			if (dir != null) {
				dir.close();
			}
		}
	}

	/**
	 * Searches for one specific term in the {@code contents} field and prints
	 * the stored {@code fullPath} of each hit (see figure 1).
	 *
	 * @throws Exception if the search or the document lookup fails
	 */
	@Test
	public void testTermQuery()throws Exception{

		// Field to search in.
		String searchField = "contents";

		// Term to look for. A TermQuery is built directly from this text
		// (no query parser is involved here).
		String q = "lowercasing"; 

		Term t = new Term(searchField,q);

		Query query = new TermQuery(t);

		// Ask for at most the 10 best hits.
		TopDocs hits = searcher.search(query, 10);

		System.out.println("匹配 '"+q+"'，总共查询到"+hits.totalHits+"个文档");

		// Load each hit's document to read its stored fields.
		for(ScoreDoc scoreDoc:hits.scoreDocs){

			Document doc=searcher.doc(scoreDoc.doc);

			System.out.println(doc.get("fullPath"));
		}
	}
}

如图一:

	/**
	 * Parses a query expression with the classic query parser, runs it against
	 * the {@code contents} field and prints the stored {@code fullPath} of each
	 * hit (see figure 2).
	 *
	 * @throws Exception if the expression cannot be parsed or the search fails
	 */
	@Test
	public void testQueryParser()throws Exception{

		final String q = "Rob* AND separab*";
		final String field = "contents";

		// The parser takes the default field to search and the analyzer to
		// apply to the query text.
		QueryParser queryParser = new QueryParser(field, new StandardAnalyzer());

		// Run the parsed expression, asking for at most 100 hits.
		TopDocs topDocs = searcher.search(queryParser.parse(q), 100);

		System.out.println("匹配 "+q+"查询到"+topDocs.totalHits+"个记录");

		// scoreDocs is the array of returned hits; load each hit's document
		// to read its stored fields.
		ScoreDoc[] matches = topDocs.scoreDocs;
		for (int i = 0; i < matches.length; i++) {

			Document found = searcher.doc(matches[i].doc);

			System.out.println(found.get("fullPath"));
		}

	}

如图二：