Lucene简介（四）（简单使用）

森林公园

于 2019-05-15 12:17:44 发布

阅读量206

点赞数

分类专栏： Java Lucene 文章标签： lucene

本文链接：https://blog.csdn.net/SLN2432713617/article/details/90233536

版权

Java 同时被 2 个专栏收录

22 篇文章 0 订阅

订阅专栏

Lucene

5 篇文章 0 订阅

订阅专栏

0. 对指定目录中的文件进行索引并执行搜索

Lucene 版本为 8.0.0，需要 JDK 8.0 及以上版本。
注意：这里在遍历文档目录时，没有采用递归函数实现，而是使用 java.nio.file.Files 工具类的 walkFileTree 方法，该方式效率更高。使用递归函数时，递归的深度受虚拟机方法栈深度的限制（简单测试了一下，我的机器上递归深度为 36631）；而且与循环相比，递归的成本更高。
下面的索引与搜索程序的重点分别在 Field 类和 Query 类的使用。

1.1 索引程序如下：

import static org.junit.Assert.*;

import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.BasicFileAttributes;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Test;

/**
 * Builds a Lucene index from every file found under a documents directory.
 * <br>The documents originally indexed by the author were the JDK sources,
 * i.e. the unpacked C:\Program Files\Java\jdk1.8.0_201\src.zip
 *
 * <p>Each file becomes one document with these fields:
 * <ul>
 *   <li>{@code title} - file name (without the {@code .java} suffix), analyzed and stored</li>
 *   <li>{@code createdTime} / {@code createdTimeValue} - indexing timestamp (point + stored)</li>
 *   <li>{@code body} / {@code bodyValue} - file content (analyzed) and its raw bytes (stored)</li>
 *   <li>{@code titleHashCode} / {@code titleHashCodeValue} - hash of the title (point + stored)</li>
 *   <li>{@code path} - file path, stored only</li>
 * </ul>
 */
public class Demo3_Indexing {

	/** Suffix stripped from Java source file names before they are used as titles. */
	private static final String JAVA_SUFFIX = ".java";

	@Test
	public void test_temp() throws Exception {
		String title = "String.java";
		System.out.println(title.substring(0, title.indexOf(".java")));
	}

	@Test
	public void test_path() throws Exception {
		System.out.println(Paths.get("doc", new String[0]));
	}

	private static String indexPathStr = "E:\\temp\\lucene-index";
	private static Path indexPath = null;
	private static String docsPathStr = "E:\\temp\\lucene-docs\\src";
	private static Path docsPath = null;
	private static File docsFile = null;
	/** Number of files visited by the last traversal. */
	private static int count = 0;

	// Observed value ranges; both ends are seeded from the first value seen,
	// and stay 0 when nothing was indexed.
	private static boolean titleHashCodeRangeInitialized = false;
	private static int minTitleHashCodeValue = 0;
	private static int maxTitleHashCodeValue = 0;
	private static boolean docCreatedTimeRangeInitialized = false;
	private static long minDocCreatedTimeValue = 0;
	private static long maxDocCreatedTimeValue = 0;

	/**
	 * Widens the recorded title hash code range so that it includes {@code hashCode}.
	 * The first value initializes both ends, so an all-negative data set no longer
	 * reports a maximum of 0.
	 */
	private static void updateMinOrMaxTitleHashCodeValue(int hashCode) {
		if (!titleHashCodeRangeInitialized) {
			titleHashCodeRangeInitialized = true;
			minTitleHashCodeValue = hashCode;
			maxTitleHashCodeValue = hashCode;
			return;
		}
		minTitleHashCodeValue = Math.min(minTitleHashCodeValue, hashCode);
		maxTitleHashCodeValue = Math.max(maxTitleHashCodeValue, hashCode);
	}

	/**
	 * Widens the recorded document creation time range so that it includes
	 * {@code createdTime}. The first value initializes both ends.
	 */
	private static void updateMinOrMaxDocCreatedTimeValue(long createdTime) {
		if (!docCreatedTimeRangeInitialized) {
			docCreatedTimeRangeInitialized = true;
			minDocCreatedTimeValue = createdTime;
			maxDocCreatedTimeValue = createdTime;
			return;
		}
		minDocCreatedTimeValue = Math.min(minDocCreatedTimeValue, createdTime);
		maxDocCreatedTimeValue = Math.max(maxDocCreatedTimeValue, createdTime);
	}

	// Resolves the configured paths when the class is loaded and terminates the
	// JVM if the documents directory is missing.
	static {
		docsPath = Paths.get(docsPathStr);
		if (Files.notExists(docsPath, LinkOption.NOFOLLOW_LINKS)) {
			System.out.println("指定文件目录不存在，docsPath：" + docsPathStr);
			System.out.println("程序退出");
			System.exit(1);
		}
		docsFile = docsPath.toFile();
		// init index path
		indexPath = Paths.get(indexPathStr);
		if (Files.notExists(indexPath, LinkOption.NOFOLLOW_LINKS)) {
			System.out.println("（不要慌，Lucene 会为我们创建的）指定索引文件不存在，indexPath：" + indexPathStr);
		}
	}

	/**
	 * Recursively counts the non-directory entries below {@code file} into {@code count}.
	 * Kept for comparison with {@link #walkPaths(IndexWriter, Path)}.
	 *
	 * @param file a file or directory to count
	 */
	public static void recurveFiles(File file) {
		if (!file.isDirectory()) {
			count++;
			return;
		}
		File[] children = file.listFiles();
		if (children == null) {
			// listFiles() returns null when the directory cannot be read
			return;
		}
		for (File child : children) {
			recurveFiles(child);
		}
	}

	/**
	 * Indexes {@code path} (a single file, or every file below a directory) and then
	 * closes {@code writer}, whether or not the traversal succeeded.
	 *
	 * @param writer index writer receiving the documents; closed by this method
	 * @param path   file or directory to index
	 */
	public static void walkPaths(final IndexWriter writer, Path path) {
		try {
			if (Files.isDirectory(path, LinkOption.NOFOLLOW_LINKS)) {
				Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
					@Override
					public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
						// skip directories
						if (!Files.isDirectory(file, LinkOption.NOFOLLOW_LINKS)) {
							count++;
							doIndexing(writer, file);
						}
						return FileVisitResult.CONTINUE;
					}

					@Override
					public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
						// an unreadable entry must not abort the whole walk, but should be visible
						System.err.println("skipping unreadable path " + file + ": " + exc);
						return FileVisitResult.CONTINUE;
					}
				});
			} else {
				// path is a single file rather than a directory
				count++;
				doIndexing(writer, path);
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			try {
				writer.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Returns {@code fileName} without a trailing {@code .java}; other names are
	 * returned unchanged.
	 */
	private static String stripJavaSuffix(String fileName) {
		if (fileName.endsWith(JAVA_SUFFIX)) {
			return fileName.substring(0, fileName.length() - JAVA_SUFFIX.length());
		}
		return fileName;
	}

	/**
	 * Adds one document describing {@code filePath} to the index. I/O failures are
	 * reported on stderr and the file is skipped.
	 *
	 * @param writer   index writer receiving the document
	 * @param filePath file to index
	 */
	public static void doIndexing(IndexWriter writer, Path filePath) {
		try {
			long docCreatedTime = System.currentTimeMillis();
			// Read the file once; the same bytes feed the analyzed and the stored body
			// field, so no stream is left open if a later step fails.
			byte[] content = Files.readAllBytes(filePath);
			// Java source files are titled by their name without the suffix
			String title = stripJavaSuffix(filePath.getFileName().toString());
			int titleHashCode = title.hashCode();

			Document document = new Document();
			// field 1, 'title'.
			// Alternative: a StringField keeps the raw value as a single token, so a
			// query must match it exactly, including case ('string' would not match
			// 'String'):
//			document.add(new StringField("title", title, Field.Store.YES));
			// A TextField is run through the analyzer instead.
			document.add(new TextField("title", title, Field.Store.YES));
			// field 2, 'createdTime' - LongPoint for range queries plus a stored copy
			document.add(new LongPoint("createdTime", docCreatedTime));
			document.add(new StoredField("createdTimeValue", docCreatedTime));
			// field 3, 'body' - analyzed text (not stored) plus the raw bytes (stored)
			document.add(new TextField("body", new String(content, StandardCharsets.UTF_8), Field.Store.NO));
			document.add(new StoredField("bodyValue", content));
			// field 4, hash code of the title - IntPoint plus a stored copy
			document.add(new IntPoint("titleHashCode", titleHashCode));
			document.add(new StoredField("titleHashCodeValue", titleHashCode));
			// field 5, path
			document.add(new StoredField("path", filePath.toString()));

			writer.addDocument(document);

			// only documents that were actually added contribute to the statistics
			updateMinOrMaxDocCreatedTimeValue(docCreatedTime);
			updateMinOrMaxTitleHashCodeValue(titleHashCode);
		} catch (IOException e) {
			System.err.println("failed to index " + filePath);
			e.printStackTrace();
		}
	}

	/**
	 * Rebuilds the index at the configured index path from the configured documents
	 * directory and prints timing and value-range statistics.
	 */
	public static void main(String[] args) throws IOException {
		System.out.println("beginning...");
		// To time the plain recursive traversal for comparison:
//		recurveFiles(docsFile);
		count = 0;
		long st = System.currentTimeMillis();
		long et;
		try (Directory directory = FSDirectory.open(indexPath)) {
			Analyzer analyzer = new StandardAnalyzer();
			IndexWriterConfig config = new IndexWriterConfig(analyzer);
			// CREATE replaces any index that already exists at the index path
			config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
			IndexWriter writer = new IndexWriter(directory, config);
			// walkPaths closes the writer when it is done
			walkPaths(writer, docsPath);
			et = System.currentTimeMillis();
		}
		System.out.println("indxing file cost time " + (et - st) + "ms, count is " + count);
		System.out.println("title hash code range is from " + minTitleHashCodeValue + " to " + maxTitleHashCodeValue);
		System.out.println("document created time range is from " + minDocCreatedTimeValue
				+ " to " + maxDocCreatedTimeValue);
	}

}

1.2 搜索程序如下：

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.xml.builders.PointRangeQueryBuilder;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PointRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/** 搜索索引 */
public class Demo3_Searching {
	private static String indexPathStr = "E:\\temp\\lucene-index";
	private static Path indexPath = null;
	
	static {
		// init index path
		indexPath = Paths.get(indexPathStr, new String[0]);
		if (Files.notExists(indexPath, new LinkOption[] {LinkOption.NOFOLLOW_LINKS})) {
			System.out.println("指定索引文件不存在，无法执行搜索，indexPath：" + indexPathStr);
			System.out.println("程序退出");
			System.exit(1);
		}
	}
	
	/** 解析字节流，返回其字符串表示 */
	public static String getStringValueFromBytes(byte[] bytes) {
		StringBuilder builder = new StringBuilder();
		if (bytes != null && bytes.length > 0) {
			InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(bytes)
					, StandardCharsets.UTF_8);
			try {
				char[] buf = new char[1024];
				int res = 0;
				while((res = reader.read(buf)) != -1) {
					builder.append(buf, 0, res);
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		return builder.toString();
	}
	
	/** 执行查询操作 */
	public static void doSearching(IndexSearcher searcher, Query query, int histNum) {
		try {
			long startTime = System.currentTimeMillis();
			TopDocs topDocs = searcher.search(query, histNum);
			System.out.println("搜索耗时：" + (System.currentTimeMillis() - startTime) + "毫秒");
			if (topDocs != null) {
				System.out.println("搜索结果为： \n--------------------");
				ScoreDoc[] scoreDocs = topDocs.scoreDocs;
				for(ScoreDoc hit : scoreDocs) {
					Document doc = searcher.doc(hit.doc);
					// available field: title , createdTime / createdTimeValue
					// , body , titleHashCode / titleHashCodeValue
					String title = doc.get("title");
					String path = doc.get("path");
					Field titleHashCodeValue = (Field) doc.getField("titleHashCodeValue");
					Field bodyValue = (Field) doc.getField("bodyValue");					
					String bodyValueStr = getStringValueFromBytes(bodyValue.binaryValue().bytes);
					System.out.println("score=" + hit.score + ", title : " 
							+ title + ", hashCod=" + titleHashCodeValue.stringValue() 
							+ ", \npath : " + path);
					// 输出部分 body 值（因为 body 部分文本太长）
					System.out.println("[body]\n" + bodyValueStr.substring(0,
							(bodyValueStr.length() > 10 ? 10 : bodyValueStr.length())));
					// 打印分割线
					System.out.println("----------");
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
	public static void main(String[] args) throws IOException, ParseException {
		Directory directory = FSDirectory.open(indexPath);
		IndexReader reader = DirectoryReader.open(directory);
		IndexSearcher searcher = new IndexSearcher(reader);
		Analyzer analyzer = new StandardAnalyzer();
		int histNum = 10;
		// 1 该查询方式区分大小写
		Query query = new TermQuery(new Term("body", "string"));
		// 2 查询解析器会将查询语句全部转为小写
//		String fieldName = "title";
//		QueryParser parser = new QueryParser(fieldName, analyzer);
//		Query query = parser.parse("String");
		// 3
		// phraseQuery ,使用完全匹配，并且区分大小写
//		Query query = new PhraseQuery("title", new String[] {"string"});
		// 4 布尔查询（组合多个查询）
//		Query query = new BooleanQuery.Builder()
//				.add(new TermQuery(new Term("body", "string")), Occur.MUST)
//				.add(new TermQuery(new Term("body", "file")), Occur.MUST)
//				.build();
		// 5 数值范围，精确查询
//		Query query = IntPoint.newRangeQuery("titleHashCode", -1808118735, -808118735);
		
		// do searching
		System.out.println("查询语句为：" + query);
		doSearching(searcher, query, histNum);
	}

}

森林公园

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Lucene简介（四）（简单使用）

0. 对指定目录中的文件进行索引并执行搜索Lucene 版本为 8.0.0，需要 JDK 8.0 及以上版本。1.1 索引程序如下：import static org.junit.Assert.*;import java.io.File;import java.io.IOException;import java.io.InputStreamReader;import java...
复制链接

扫一扫