前言
大家好,本章会简单的剖析下lucene的运行原理,并先用个简单的案例来讲述一下lucene的运行,然后再通过实际上用数据库传过来的值来演示lucene的搜索功能。
lucene的运行原理
简单地说,lucene就是先将数据库传过来的值处理成索引文件,然后再根据特定的词到索引文件中进行查找。
第一步:将数据库源文件变成索引文件。
第二步:根据特定的词在索引文件中查找。
那么一个一个的来。
一,将数据库源文件变成索引文件
首先我们先使用的是存储在硬盘上的text文件,其实可以直接认为是从数据库传过来的值
我们的目标就是转化成索引文件:
从基础的开始写起:
获取你要存放索引文件的位置
//存储的位置
FSDirectory dir = FSDirectory.open(Paths.get(indexDir));
接着创建一个分词器,用来对源文件的内容进行分词:
//创建一个配置输出流配置对象(通过一个分词器)
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig conf = new IndexWriterConfig(analyzer);
根据这两个值就可以创建一个输出流,后面的意思就是:
你往这输出流添加多少doc那么就有相应的索引。
//根据存储位置和输出流配置对象,创建一个输出流
IndexWriter indexWriter = new IndexWriter(dir, conf);
就比如说我们打算用来当数据源文件的files,我们只需给doc赋值,然后加入到indexWriter(输出流)里面就可以了
//将源文件的数据加入到输出流里面
File[] files = new File(dataDir).listFiles();
Document doc;
for (File file : files) {
doc = new Document();
doc.add(new TextField("contents",new FileReader(file)));
doc.add(new TextField("fullPath", file.getCanonicalPath(),Field.Store.YES));
doc.add(new TextField("fileName", file.getName(),Field.Store.YES));
indexWriter.addDocument(doc);
}
System.out.println("下面的索引文件有:"+indexWriter.numDocs());
完成这些我们就能看到有以下的文件
通过存储位置实例化一个输入流
//存储的位置
FSDirectory dir = FSDirectory.open(Paths.get(indexDir));
//拿到索引输入流(是借助文件工具类来打开的)
DirectoryReader indexReader = DirectoryReader.open(dir);
然后就用这个indexReader实例化一个对象(因为我们要通过这个去做特定查询)
//用索引输入流实例化一个对象
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
写好查询的内容
String p = "EarlyTerminating-Collector";//查询内容
//根据分词器确认查询的内容
QueryParser queryParser = new QueryParser("contents", analyzer);
Query parse = queryParser.parse(p);
根据indexSearcher对象并一个查询条件就能获取到我们要的内容集合(TopDocs)
然后将其遍历打印
long start = System.currentTimeMillis();
TopDocs topDocs = indexSearcher.search(parse, 10);
long end = System.currentTimeMillis();
System.out.println("匹配 "+p+" ,总共花费"+(end-start)+"毫秒"+"查询到"+topDocs.totalHits+"个记录");
for (ScoreDoc scordoc : topDocs.scoreDocs) {
int docID = scordoc.doc;
Document doc1 = indexSearcher.doc(docID);
System.out.println("通过索引文件:"+doc1.get("fullPath")+"拿数据");
}
全部代码
package com.javaxl.lucene;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
public class myDemo1 {
    /**
     * Demo: builds a Lucene index from the files under {@code dataDir}, then
     * searches it for a fixed phrase and prints the matching file paths.
     *
     * @throws IOException    on index or file I/O failure
     * @throws ParseException when the query string cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        String indexDir = "D:\\temp\\lucene\\demo1";
        String dataDir = "D:\\temp\\lucene\\demo1\\data";
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources closes the directory, writer and reader even when
        // indexing or searching throws (the original leaked them on failure).
        try (FSDirectory dir = FSDirectory.open(Paths.get(indexDir))) {
            // ---- phase 1: index one Document per source file ----
            IndexWriterConfig conf = new IndexWriterConfig(analyzer);
            try (IndexWriter indexWriter = new IndexWriter(dir, conf)) {
                File[] files = new File(dataDir).listFiles();
                // listFiles() returns null when dataDir is missing or not a
                // directory; the original crashed with a bare NPE here.
                if (files == null) {
                    throw new IOException("data directory not readable: " + dataDir);
                }
                for (File file : files) {
                    Document doc = new Document();
                    // NOTE(review): FileReader uses the platform default charset —
                    // confirm the data files match it, or pass an explicit charset.
                    doc.add(new TextField("contents", new FileReader(file)));
                    doc.add(new TextField("fullPath", file.getCanonicalPath(), Field.Store.YES));
                    doc.add(new TextField("fileName", file.getName(), Field.Store.YES));
                    indexWriter.addDocument(doc);
                }
                System.out.println("下面的索引文件有:" + indexWriter.numDocs());
            }
            // ---- phase 2: search the freshly written index ----
            String p = "EarlyTerminating-Collector";
            try (DirectoryReader indexReader = DirectoryReader.open(dir)) {
                IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                QueryParser queryParser = new QueryParser("contents", analyzer);
                Query parse = queryParser.parse(p);
                long start = System.currentTimeMillis();
                TopDocs topDocs = indexSearcher.search(parse, 10);
                long end = System.currentTimeMillis();
                System.out.println("匹配 "+p+" ,总共花费"+(end-start)+"毫秒"+"查询到"+topDocs.totalHits+"个记录");
                for (ScoreDoc scordoc : topDocs.scoreDocs) {
                    int docID = scordoc.doc;
                    Document doc1 = indexSearcher.doc(docID);
                    System.out.println("通过索引文件:"+doc1.get("fullPath")+"拿数据");
                }
            }
        }
    }
}
查看分词器
实战
我们先获取数据库的数据,并转成索引文件
package com.javaxl.blog.web;
import java.io.IOException;
import java.nio.file.Paths;
import java.sql.SQLException;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import com.javaxl.blog.dao.BlogDao;
import com.javaxl.blog.util.PropertiesUtil;
/**
* 构建lucene索引
* @author Administrator
* 1。构建索引 IndexWriter
* 2、读取索引文件,获取命中片段
* 3、使得命中片段高亮显示
*
*/
public class IndexStarter {
    private static BlogDao blogDao = new BlogDao();

    /**
     * Rebuilds the Lucene index from every blog row in the database.
     * One Document per row: "id" and "url" as un-tokenized StringFields,
     * "title" as a tokenized TextField so individual words are searchable.
     */
    public static void main(String[] args) {
        IndexWriterConfig conf = new IndexWriterConfig(new SmartChineseAnalyzer());
        Directory d = null;
        IndexWriter indexWriter = null;
        try {
            d = FSDirectory.open(Paths.get(PropertiesUtil.getValue("indexPath")));
            indexWriter = new IndexWriter(d, conf);
            // index every row the DAO returns
            List<Map<String, Object>> list = blogDao.list(null, null);
            for (Map<String, Object> map : list) {
                Document doc = new Document();
                doc.add(new StringField("id", (String) map.get("id"), Field.Store.YES));
                // TextField tokenizes the sentence; StringField stores it verbatim
                doc.add(new TextField("title", (String) map.get("title"), Field.Store.YES));
                doc.add(new StringField("url", (String) map.get("url"), Field.Store.YES));
                indexWriter.addDocument(doc);
            }
        } catch (IOException | InstantiationException | IllegalAccessException | SQLException e) {
            // multi-catch replaces four identical catch blocks
            e.printStackTrace();
        } finally {
            // close writer first, then the directory (the original never closed d)
            try {
                if (indexWriter != null) {
                    indexWriter.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            try {
                if (d != null) {
                    d.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
那么我们在前台访问后台的时候给模糊查询进行一个索引查询就行
package com.javaxl.blog.web;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.servlet.http.HttpServletRequest;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.store.Directory;
import org.apache.struts2.ServletActionContext;
import com.javaxl.blog.dao.BlogDao;
import com.javaxl.blog.util.LuceneUtil;
import com.javaxl.blog.util.PropertiesUtil;
import com.javaxl.blog.util.StringUtils;
/**
* IndexReader
* IndexSearcher
* Highlighter
* @author Administrator
*
*/
public class BlogAction {
    private String title;
    private BlogDao blogDao = new BlogDao();

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * Lists blogs: plain DB listing when no title filter is given, otherwise a
     * Lucene search over the "title" field with hit highlighting.
     *
     * @return the "blogList" result name in all cases
     */
    public String list() {
        try {
            HttpServletRequest request = ServletActionContext.getRequest();
            if (StringUtils.isBlank(title)) {
                List<Map<String, Object>> blogList = this.blogDao.list(title, null);
                request.setAttribute("blogList", blogList);
            } else {
                Directory directory = LuceneUtil.getDirectory(PropertiesUtil.getValue("indexPath"));
                DirectoryReader reader = LuceneUtil.getDirectoryReader(directory);
                try {
                    IndexSearcher searcher = LuceneUtil.getIndexSearcher(reader);
                    SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
                    // tokenize the search phrase and match it against the indexed titles
                    Query query = new QueryParser("title", analyzer).parse(title);
                    Highlighter highlighter = LuceneUtil.getHighlighter(query, "title");
                    TopDocs topDocs = searcher.search(query, 100);
                    // collect the hit documents into view-friendly maps
                    List<Map<String, Object>> blogList = new ArrayList<>();
                    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                        Map<String, Object> map = new HashMap<>();
                        Document doc = searcher.doc(scoreDoc.doc);
                        map.put("id", doc.get("id"));
                        String titleHighlighter = doc.get("title");
                        if (StringUtils.isNotBlank(titleHighlighter)) {
                            // NOTE(review): getBestFragment can return null when no
                            // fragment matches — confirm the view tolerates a null title
                            titleHighlighter = highlighter.getBestFragment(analyzer, "title", titleHighlighter);
                        }
                        map.put("title", titleHighlighter);
                        map.put("url", doc.get("url"));
                        blogList.add(map);
                    }
                    request.setAttribute("blogList", blogList);
                } finally {
                    // the original leaked the reader and directory on every request;
                    // reuse the project's own close helper for consistency
                    LuceneUtil.close(reader, directory);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return "blogList";
    }
}
实战解析
package com.javaxl.lucene;
import java.io.IOException;
import java.nio.file.Paths;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryTermScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
import org.apache.lucene.util.packed.DirectReader;
import com.javaxl.blog.dao.BlogDao;
import com.javaxl.blog.util.StringUtils;
import com.opensymphony.xwork2.config.providers.DirectedGraph;
public class myDemo2 {
    /**
     * Demo: indexes all blog rows from the database, then searches the "title"
     * field and prints each hit with the matched words wrapped in a red span.
     */
    public static void main(String[] args) throws IOException, InstantiationException, IllegalAccessException, SQLException, ParseException, InvalidTokenOffsetsException {
        String indexDir = "D:\\temp\\lucene\\demo1";
        // one analyzer shared by the indexing and the query-parsing phase
        SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
        // try-with-resources closes the directory, writer and reader even when an
        // exception is thrown mid-way (the original leaked them on failure paths)
        try (FSDirectory dir = FSDirectory.open(Paths.get(indexDir))) {
            // ---- phase 1: write one Document per blog row ----
            IndexWriterConfig conf = new IndexWriterConfig(analyzer);
            try (IndexWriter indexWriter = new IndexWriter(dir, conf)) {
                BlogDao bd = new BlogDao();
                List<Map<String, Object>> lists = bd.list(null, null);
                for (Map<String, Object> list : lists) {
                    Document doc = new Document();
                    doc.add(new StringField("id", (String) list.get("id"), Field.Store.YES));
                    // TextField tokenizes the title so single words can match
                    doc.add(new TextField("title", (String) list.get("title"), Field.Store.YES));
                    doc.add(new StringField("url", (String) list.get("url"), Field.Store.YES));
                    indexWriter.addDocument(doc);
                }
            }
            // ---- phase 2: search the index and highlight the hits ----
            try (DirectoryReader indexReader = DirectoryReader.open(dir)) {
                IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                QueryParser query = new QueryParser("title", analyzer);
                Query parse = query.parse("方法");
                // wrap each matched term in a red <span>
                Formatter formatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
                Scorer fragmentScorer = new QueryTermScorer(parse, "title");
                Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
                highlighter.setTextFragmenter(new SimpleFragmenter(200));
                TopDocs topDocs = indexSearcher.search(parse, 100);
                // collect hits into printable maps
                List<Map<String, Object>> blogList = new ArrayList<>();
                for (ScoreDoc s : topDocs.scoreDocs) {
                    Map<String, Object> map = new HashMap<>();
                    Document doc1 = indexSearcher.doc(s.doc);
                    map.put("id", doc1.get("id"));
                    String titleHighlighter = doc1.get("title");
                    if (StringUtils.isNotBlank(titleHighlighter)) {
                        // NOTE(review): getBestFragment returns null when nothing
                        // matches the fragment — confirm that is acceptable here
                        titleHighlighter = highlighter.getBestFragment(analyzer, "title", titleHighlighter);
                    }
                    map.put("title", titleHighlighter);
                    map.put("url", doc1.get("url"));
                    blogList.add(map);
                }
                for (Map<String, Object> map11 : blogList) {
                    System.out.println(map11);
                }
            }
        }
    }
}
实战工具类
luceneUtil.java
package com.javaxl.blog.util;
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryTermScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
/**
* lucene工具类
* @author Administrator
*
*/
public class LuceneUtil {
    /**
     * Opens the on-disk index directory at {@code path}.
     *
     * @param path filesystem path of the index folder
     * @return the opened Directory, or null when opening fails
     */
    public static Directory getDirectory(String path) {
        Directory directory = null;
        try {
            directory = FSDirectory.open(Paths.get(path));
        } catch (IOException e) {
            e.printStackTrace();
        }
        return directory;
    }

    /**
     * Creates an in-memory index directory.
     *
     * NOTE(review): RAMDirectory is deprecated in recent Lucene versions in
     * favor of ByteBuffersDirectory — confirm the Lucene version before migrating.
     *
     * @return a new RAM-backed Directory
     */
    public static Directory getRAMDirectory() {
        Directory directory = new RAMDirectory();
        return directory;
    }

    /**
     * Opens a reader over an existing index.
     *
     * @param directory the index directory to read
     * @return the reader, or null when opening fails
     */
    public static DirectoryReader getDirectoryReader(Directory directory) {
        DirectoryReader reader = null;
        try {
            reader = DirectoryReader.open(directory);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return reader;
    }

    /**
     * Wraps a reader in a searcher.
     *
     * @param reader an open index reader
     * @return a searcher over that reader
     */
    public static IndexSearcher getIndexSearcher(DirectoryReader reader) {
        IndexSearcher indexSearcher = new IndexSearcher(reader);
        return indexSearcher;
    }

    /**
     * Creates an IndexWriter that appends to (or creates) the given directory.
     *
     * @param directory target index directory
     * @param analyzer  analyzer used to tokenize indexed text
     * @return the writer, or null when creation fails
     */
    public static IndexWriter getIndexWriter(Directory directory, Analyzer analyzer)
    {
        IndexWriter iwriter = null;
        try {
            IndexWriterConfig config = new IndexWriterConfig(analyzer);
            config.setOpenMode(OpenMode.CREATE_OR_APPEND);
            // Sort sort=new Sort(new SortField("content", Type.STRING));
            // config.setIndexSort(sort);//排序
            // commit pending changes automatically when the writer is closed
            config.setCommitOnClose(true);
            // config.setMergeScheduler(new ConcurrentMergeScheduler());
            // config.setIndexDeletionPolicy(new
            // SnapshotDeletionPolicy(NoDeletionPolicy.INSTANCE));
            iwriter = new IndexWriter(directory, config);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return iwriter;
    }

    /**
     * Best-effort close of a writer and its directory. Null arguments are ignored.
     *
     * @param indexWriter writer to close, may be null
     * @param directory   directory to close, may be null
     */
    public static void close(IndexWriter indexWriter, Directory directory) {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (IOException e) {
                // best-effort close: log the failure instead of silently
                // discarding it (the original only nulled a local parameter)
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Best-effort close of a reader and its directory. Null arguments are ignored.
     *
     * @param reader    reader to close, may be null
     * @param directory directory to close, may be null
     */
    public static void close(DirectoryReader reader, Directory directory) {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Builds a highlighter that wraps terms of {@code query} occurring in
     * {@code fieldName} in a red span, with 200-char fragments.
     *
     * @param query     the parsed search query
     * @param fieldName field whose matches should be highlighted
     * @return a configured Highlighter
     */
    public static Highlighter getHighlighter(Query query, String fieldName)
    {
        Formatter formatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
        Scorer fragmentScorer = new QueryTermScorer(query, fieldName);
        Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
        highlighter.setTextFragmenter(new SimpleFragmenter(200));
        return highlighter;
    }
}
propertiesUtils
package com.javaxl.blog.util;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
/**
* properties工具类
* @author user
*
*/
public class PropertiesUtil {
    /**
     * Looks up {@code key} in /lucene.properties on the classpath.
     *
     * @param key property name to look up
     * @return the property value, or null when the key — or the resource
     *         itself — is missing
     */
    public static String getValue(String key){
        Properties prop = new Properties();
        // try-with-resources closes the stream (the original leaked it);
        // PropertiesUtil.class avoids instantiating the class just to call getClass()
        try (InputStream in = PropertiesUtil.class.getResourceAsStream("/lucene.properties")) {
            if (in == null) {
                // resource missing from the classpath: the original crashed
                // with a NullPointerException inside prop.load(null)
                return null;
            }
            prop.load(in);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return prop.getProperty(key);
    }
}