前言
大家好,本章会简单的剖析下lucene的运行原理,并先用个简单的案例来讲述一下lucene的运行,然后再通过实际上用数据库传过来的值来演示lucene的搜索功能。
lucene的运行原理
简单地说,lucene就是先将数据库传过来的值处理成索引文件,然后再根据特定的词到索引文件中进行查找。
第一步:将数据库源文件变成索引文件。
第二步:根据特定的词在索引文件中查找。
那么一个一个的来。
一,将数据库源文件变成索引文件
首先我们先使用的是存储在硬盘上的text文件,其实可以直接认为是从数据库传过来的值
我们的目标就是转化成索引文件:
从基础的开始写起:
获取你要存放索引文件的位置
//存储的位置
FSDirectory dir = FSDirectory.open(Paths.get(indexDir));
接着创建一个分词器,用来对源文件的内容进行分词:
//创建一个配置输出流配置对象(通过一个分词器)
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig conf = new IndexWriterConfig(analyzer);
根据这两个值就可以创建一个输出流,后面的意思就是:
你往这输出流添加多少doc那么就有相应的索引。
//根据存储位置和输出流配置对象,创建一个输出流
IndexWriter indexWriter = new IndexWriter(dir, conf);
就比如说我们打算用来当数据源文件的files,我们只需给doc赋值,然后加入到indexWriter(输出流)里面就可以了
//将源文件的数据加入到输出流里面
File[] files = new File(dataDir).listFiles();
Document doc;
for (File file : files) {
doc = new Document();
doc.add(new TextField("contents",new FileReader(file)));
doc.add(new TextField("fullPath", file.getCanonicalPath(),Field.Store.YES));
doc.add(new TextField("fileName", file.getName(),Field.Store.YES));
indexWriter.addDocument(doc);
}
System.out.println("下面的索引文件有:"+indexWriter.numDocs());
完成这些我们就能看到有以下的文件
通过存储位置实例化一个输入流
//存储的位置
FSDirectory dir = FSDirectory.open(Paths.get(indexDir));
//拿到索引输入流(是借助文件工具类来打开的)
DirectoryReader indexReader = DirectoryReader.open(dir);
然后就用这个indexReader实例化一个对象(因为我们要通过这个去做特定查询)
//用索引输入流实例化一个对象
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
写好查询的内容
String p = "EarlyTerminating-Collector";//查询内容
//根据分词器确认查询的内容
QueryParser queryParser = new QueryParser("contents", analyzer);
Query parse = queryParser.parse(p);
根据indexSearcher对象并一个查询条件就能获取到我们要的内容集合(TopDocs)
然后将其遍历打印
long start = System.currentTimeMillis();
TopDocs topDocs = indexSearcher.search(parse, 10);
long end = System.currentTimeMillis();
System.out.println("匹配 "+p+" ,总共花费"+(end-start)+"毫秒"+"查询到"+topDocs.totalHits+"个记录");
for (ScoreDoc scordoc : topDocs.scoreDocs) {
int docID = scordoc.doc;
Document doc1 = indexSearcher.doc(docID);
System.out.println("通过索引文件:"+doc1.get("fullPath")+"拿数据");
}
全部代码
package com.javaxl.lucene;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
public class myDemo1 {
    /**
     * Demo: builds a Lucene index from the files under {@code dataDir}, then
     * searches it for a fixed phrase and prints the matching file paths.
     *
     * @throws IOException    on index or file I/O failure
     * @throws ParseException when the query string cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        String indexDir = "D:\\temp\\lucene\\demo1";
        String dataDir = "D:\\temp\\lucene\\demo1\\data";
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources closes the directory, writer and reader even when
        // indexing or searching throws (the original leaked them on failure).
        try (FSDirectory dir = FSDirectory.open(Paths.get(indexDir))) {
            // ---- phase 1: index one Document per source file ----
            IndexWriterConfig conf = new IndexWriterConfig(analyzer);
            try (IndexWriter indexWriter = new IndexWriter(dir, conf)) {
                File[] files = new File(dataDir).listFiles();
                // listFiles() returns null when dataDir is missing or not a
                // directory; the original crashed with a bare NPE here.
                if (files == null) {
                    throw new IOException("data directory not readable: " + dataDir);
                }
                for (File file : files) {
                    Document doc = new Document();
                    // NOTE(review): FileReader uses the platform default charset —
                    // confirm the data files match it, or pass an explicit charset.
                    doc.add(new TextField("contents", new FileReader(file)));
                    doc.add(new TextField("fullPath", file.getCanonicalPath(), Field.Store.YES));
                    doc.add(new TextField("fileName", file.getName(), Field.Store.YES));
                    indexWriter.addDocument(doc);
                }
                System.out.println("下面的索引文件有:" + indexWriter.numDocs());
            }
            // ---- phase 2: search the freshly written index ----
            String p = "EarlyTerminating-Collector";
            try (DirectoryReader indexReader = DirectoryReader.open(dir)) {
                IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                QueryParser queryParser = new QueryParser("contents", analyzer);
                Query parse = queryParser.parse(p);
                long start = System.currentTimeMillis();
                TopDocs topDocs = indexSearcher.search(parse, 10);
                long end = System.currentTimeMillis();
                System.out.println("匹配 "+p+" ,总共花费"+(end-start)+"毫秒"+"查询到"+topDocs.totalHits+"个记录");
                for (ScoreDoc scordoc : topDocs.scoreDocs) {
                    int docID = scordoc.doc;
                    Document doc1 = indexSearcher.doc(docID);
                    System.out.println("通过索引文件:"+doc1.get("fullPath")+"拿数据");
                }
            }
        }
    }
}
查看分词器
实战
我们先获取数据库的数据,并转成索引文件
package com.javaxl.blog.web;
import java.io.IOException;
import java.nio.file.Paths;
import java.sql.SQLException;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import com.javaxl.blog.dao.BlogDao;
import com.javaxl.blog.util.PropertiesUtil;
/**
* 构建lucene索引
* @author Administrator
* 1。构建索引 IndexWriter
* 2、读取索引文件,获取命中片段
* 3、使得命中片段高亮显示
*
*/
public class IndexStarter {
    private static BlogDao blogDao = new BlogDao();

    /**
     * Rebuilds the Lucene index from every blog row in the database.
     * One Document per row: "id" and "url" as un-tokenized StringFields,
     * "title" as a tokenized TextField so individual words are searchable.
     */
    public static void main(String[] args) {
        IndexWriterConfig conf = new IndexWriterConfig(new SmartChineseAnalyzer());
        Directory d = null;
        IndexWriter indexWriter = null;
        try {
            d = FSDirectory.open(Paths.get(PropertiesUtil.getValue("indexPath")));
            indexWriter = new IndexWriter(d, conf);
            // index every row the DAO returns
            List<Map<String, Object>> list = blogDao.list(null, null);
            for (Map<String, Object> map : list) {
                Document doc = new Document();
                doc.add(new StringField("id", (String) map.get("id"), Field.Store.YES));
                // TextField tokenizes the sentence; StringField stores it verbatim
                doc.add(new TextField("title", (String) map.get("title"), Field.Store.YES));
                doc.add(new StringField("url", (String) map.get("url"), Field.Store.YES));
                indexWriter.addDocument(doc);
            }
        } catch (IOException | InstantiationException | IllegalAccessException | SQLException e) {
            // multi-catch replaces four identical catch blocks
            e.printStackTrace();
        } finally {
            // close writer first, then the directory (the original never closed d)
            try {
                if (indexWriter != null) {
                    indexWriter.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            try {
                if (d != null) {
                    d.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
那么我们在前台访问后台的时候给模糊查询进行一个索引查询就行
package com.javaxl.blog.web;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.servlet.http.HttpServletRequest;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.store.Directory;
import org.apache.struts2.ServletActionContext;
import com.javaxl.blog.dao.BlogDao;
import com.javaxl.blog.util.LuceneUtil;
import com.javaxl.blog.util.PropertiesUtil;
import com.javaxl.blog.util.StringUtils;
/**
* IndexReader
* IndexSearcher
* Highlighter
* @author Administrator
*
*/
public class BlogAction {
    private String title;
    private BlogDao blogDao = new BlogDao();

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * Lists blogs: plain DB listing when no title filter is given, otherwise a
     * Lucene search over the "title" field with hit highlighting.
     *
     * @return the "blogList" result name in all cases
     */
    public String list() {
        try {
            HttpServletRequest request = ServletActionContext.getRequest();
            if (StringUtils.isBlank(title)) {
                List<Map<String, Object>> blogList = this.blogDao.list(title, null);
                request.setAttribute("blogList", blogList);
            } else {
                Directory directory = LuceneUtil.getDirectory(PropertiesUtil.getValue("indexPath"));
                DirectoryReader reader = LuceneUtil.getDirectoryReader(directory);
                try {
                    IndexSearcher searcher = LuceneUtil.getIndexSearcher(reader);
                    SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
                    // tokenize the search phrase and match it against the indexed titles
                    Query query = new QueryParser("title", analyzer).parse(title);
                    Highlighter highlighter = LuceneUtil.getHighlighter(query, "title");
                    TopDocs topDocs = searcher.search(query, 100);
                    // collect the hit documents into view-friendly maps
                    List<Map<String, Object>> blogList = new ArrayList<>();
                    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                        Map<String, Object> map = new HashMap<>();
                        Document doc = searcher.doc(scoreDoc.doc);
                        map.put("id", doc.get("id"));
                        String titleHighlighter = doc.get("title");
                        if (StringUtils.isNotBlank(titleHighlighter)) {
                            // NOTE(review): getBestFragment can return null when no
                            // fragment matches — confirm the view tolerates a null title
                            titleHighlighter = highlighter.getBestFragment(analyzer, "title", titleHighlighter);
                        }
                        map.put("title", titleHighlighter);
                        map.put("url", doc.get("url"));
                        blogList.add(map);
                    }
                    request.setAttribute("blogList", blogList);
                } finally {
                    // the original leaked the reader and directory on every request;
                    // reuse the project's own close helper for consistency
                    LuceneUtil.close(reader, directory);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return "blogList";
    }
}
实战解析
package com.javaxl.lucene;
import java.io.IOException;
import java.nio.file.Paths;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryTermScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
import org.apache.lucene.util.packed.DirectReader;
import com.javaxl.blog.dao.BlogDao;
import com.javaxl.blog.util.StringUtils;
import com.opensymphony.xwork2.config.providers.DirectedGraph;
public class myDemo2 {
    /**
     * Demo: indexes all blog rows from the database, then searches the "title"
     * field and prints each hit with the matched words wrapped in a red span.
     */
    public static void main(String[] args) throws IOException, InstantiationException, IllegalAccessException, SQLException, ParseException, InvalidTokenOffsetsException {
        String indexDir = "D:\\temp\\lucene\\demo1";
        // one analyzer shared by the indexing and the query-parsing phase
        SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
        // try-with-resources closes the directory, writer and reader even when an
        // exception is thrown mid-way (the original leaked them on failure paths)
        try (FSDirectory dir = FSDirectory.open(Paths.get(indexDir))) {
            // ---- phase 1: write one Document per blog row ----
            IndexWriterConfig conf = new IndexWriterConfig(analyzer);
            try (IndexWriter indexWriter = new IndexWriter(dir, conf)) {
                BlogDao bd = new BlogDao();
                List<Map<String, Object>> lists = bd.list(null, null);
                for (Map<String, Object> list : lists) {
                    Document doc = new Document();
                    doc.add(new StringField("id", (String) list.get("id"), Field.Store.YES));
                    // TextField tokenizes the title so single words can match
                    doc.add(new TextField("title", (String) list.get("title"), Field.Store.YES));
                    doc.add(new StringField("url", (String) list.get("url"), Field.Store.YES));
                    indexWriter.addDocument(doc);
                }
            }
            // ---- phase 2: search the index and highlight the hits ----
            try (DirectoryReader indexReader = DirectoryReader.open(dir)) {
                IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                QueryParser query = new QueryParser("title", analyzer);
                Query parse = query.parse("方法");
                // wrap each matched term in a red <span>
                Formatter formatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
                Scorer fragmentScorer = new QueryTermScorer(parse, "title");
                Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
                highlighter.setTextFragmenter(new SimpleFragmenter(200));
                TopDocs topDocs = indexSearcher.search(parse, 100);
                // collect hits into printable maps
                List<Map<String, Object>> blogList = new ArrayList<>();
                for (ScoreDoc s : topDocs.scoreDocs) {
                    Map<String, Object> map = new HashMap<>();
                    Document doc1 = indexSearcher.doc(s.doc);
                    map.put("id", doc1.get("id"));
                    String titleHighlighter = doc1.get("title");
                    if (StringUtils.isNotBlank(titleHighlighter)) {
                        // NOTE(review): getBestFragment returns null when nothing
                        // matches the fragment — confirm that is acceptable here
                        titleHighlighter = highlighter.getBestFragment(analyzer, "title", titleHighlighter);
                    }
                    map.put("title", titleHighlighter);
                    map.put("url", doc1.get("url"));
                    blogList.add(map);
                }
                for (Map<String, Object> map11 : blogList) {
                    System.out.println(map11);
                }
            }
        }
    }
}
实战工具类
luceneUtil.java
package com.javaxl.blog.util;
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryTermScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
/**
* lucene工具类
* @author Administrator
*
*/
public class LuceneUtil {
    /**
     * Opens the on-disk index directory at {@code path}.
     *
     * @param path filesystem path of the index folder
     * @return the opened Directory, or null when opening fails
     */
    public static Directory getDirectory(String path) {
        Directory directory = null;
        try {
            directory = FSDirectory.open(Paths.get(path));
        } catch (IOException e) {
            e.printStackTrace();
        }
        return directory;
    }

    /**
     * Creates an in-memory index directory.
     *
     * NOTE(review): RAMDirectory is deprecated in recent Lucene versions in
     * favor of ByteBuffersDirectory — confirm the Lucene version before migrating.
     *
     * @return a new RAM-backed Directory
     */
    public static Directory getRAMDirectory() {
        Directory directory = new RAMDirectory();
        return directory;
    }

    /**
     * Opens a reader over an existing index.
     *
     * @param directory the index directory to read
     * @return the reader, or null when opening fails
     */
    public static DirectoryReader getDirectoryReader(Directory directory) {
        DirectoryReader reader = null;
        try {
            reader = DirectoryReader.open(directory);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return reader;
    }

    /**
     * Wraps a reader in a searcher.
     *
     * @param reader an open index reader
     * @return a searcher over that reader
     */
    public static IndexSearcher getIndexSearcher(DirectoryReader reader) {
        IndexSearcher indexSearcher = new IndexSearcher(reader);
        return indexSearcher;
    }

    /**
     * Creates an IndexWriter that appends to (or creates) the given directory.
     *
     * @param directory target index directory
     * @param analyzer  analyzer used to tokenize indexed text
     * @return the writer, or null when creation fails
     */
    public static IndexWriter getIndexWriter(Directory directory, Analyzer analyzer)
    {
        IndexWriter iwriter = null;
        try {
            IndexWriterConfig config = new IndexWriterConfig(analyzer);
            config.setOpenMode(OpenMode.CREATE_OR_APPEND);
            // Sort sort=new Sort(new SortField("content", Type.STRING));
            // config.setIndexSort(sort);//排序
            // commit pending changes automatically when the writer is closed
            config.setCommitOnClose(true);
            // config.setMergeScheduler(new ConcurrentMergeScheduler());
            // config.setIndexDeletionPolicy(new
            // SnapshotDeletionPolicy(NoDeletionPolicy.INSTANCE));
            iwriter = new IndexWriter(directory, config);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return iwriter;
    }

    /**
     * Best-effort close of a writer and its directory. Null arguments are ignored.
     *
     * @param indexWriter writer to close, may be null
     * @param directory   directory to close, may be null
     */
    public static void close(IndexWriter indexWriter, Directory directory) {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (IOException e) {
                // best-effort close: log the failure instead of silently
                // discarding it (the original only nulled a local parameter)
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Best-effort close of a reader and its directory. Null arguments are ignored.
     *
     * @param reader    reader to close, may be null
     * @param directory directory to close, may be null
     */
    public static void close(DirectoryReader reader, Directory directory) {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Builds a highlighter that wraps terms of {@code query} occurring in
     * {@code fieldName} in a red span, with 200-char fragments.
     *
     * @param query     the parsed search query
     * @param fieldName field whose matches should be highlighted
     * @return a configured Highlighter
     */
    public static Highlighter getHighlighter(Query query, String fieldName)
    {
        Formatter formatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
        Scorer fragmentScorer = new QueryTermScorer(query, fieldName);
        Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
        highlighter.setTextFragmenter(new SimpleFragmenter(200));
        return highlighter;
    }
}
propertiesUtils
package com.javaxl.blog.util;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
/**
* properties工具类
* @author user
*
*/
public class PropertiesUtil {
    /**
     * Looks up {@code key} in /lucene.properties on the classpath.
     *
     * @param key property name to look up
     * @return the property value, or null when the key — or the resource
     *         itself — is missing
     */
    public static String getValue(String key){
        Properties prop = new Properties();
        // try-with-resources closes the stream (the original leaked it);
        // PropertiesUtil.class avoids instantiating the class just to call getClass()
        try (InputStream in = PropertiesUtil.class.getResourceAsStream("/lucene.properties")) {
            if (in == null) {
                // resource missing from the classpath: the original crashed
                // with a NullPointerException inside prop.load(null)
                return null;
            }
            prop.load(in);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return prop.getProperty(key);
    }
}