搜索引擎lucene入门

lucene是什么

Lucene是apache软件基金会发布的一个开放源代码的全文检索引擎工具包,由资深全文检索专家Doug Cutting所撰写,它是一个全文检索引擎的架构,提供了完整的创建索引和查询索引,以及部分文本分析的引擎,Lucene的目的是为软件开发人员提供一个简单易用的工具包,以方便在目标系统中实现全文检索的功能,或者是以此为基础建立起完整的全文检索引擎,Lucene在全文检索领域是一个经典的祖先,现在很多检索引擎都是在其基础上创建的,思想是相通的。

搜索:我们正常的数据库搜索就是通过 like 关键字来实现的,如果用户需要同时搜索多个列段的关键字,那么模糊查就会显的很短板,用户多输入一个字或者少输入一个子都可能会导致效果不一样

全文索引:然而通过全文搜索,它不再局限于列段关键字,而是数据库所有内容,并且能通过操作实现然我们搜索到的信息实现高亮效果,比如我们在百度的搜索效果:
在这里插入图片描述

lucene实现搜索原理

首先Lucene要创建一个索引库,然后搜素的内容传入进来,lucene对内容进行检索,然后生成一个个索引文件,然后通过lucene解析器把索引文件解析,然后把解析完的内容放入TopDocs对象中,最后我们在给内容加上高亮效果,最终索引的内容机会以高亮的效果呈现
在这里插入图片描述

lucene入门

lucene官网http://lucene.apache.org/
最新版本是要求jdk1.8以上

lucene所需pom依赖

<properties>
	<lucene.version>7.1.0</lucene.version>
</properties>
	<dependencies>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-core</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-queryparser</artifactId>
			<version>${lucene.version}</version>
		</dependency>

		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-analyzers-smartcn</artifactId>
			<version>${lucene.version}</version>
		</dependency>

		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-highlighter</artifactId>
			<version>${lucene.version}</version>
		</dependency>
	</dependencies>

这是我们下面测试需要用到的两个工具类

IndexCreate

package com.javaxl.lucene;

import java.io.File;
import java.io.FileReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

/**
 * 配合Demo1.java进行lucene的helloword实现
 * @author Administrator
 *
 */
public class IndexCreate {
	private IndexWriter indexWriter;
	
	/**
	 * 1、构造方法 实例化IndexWriter
	 * @param indexDir
	 * @throws Exception
	 */
	public IndexCreate(String indexDir) throws Exception{
//		获取索引文件的存放地址对象
		FSDirectory dir = FSDirectory.open(Paths.get(indexDir));
//		标准分词器(针对英文)
		Analyzer analyzer = new StandardAnalyzer();
//		索引输出流配置对象
		IndexWriterConfig conf = new IndexWriterConfig(analyzer); 
		indexWriter = new IndexWriter(dir, conf);
	}
	
	/**
	 * 2、关闭索引输出流
	 * @throws Exception
	 */
	public void closeIndexWriter()  throws Exception{
		indexWriter.close();
	}
	
	/**
	 * 3、索引指定路径下的所有文件
	 * @param dataDir
	 * @return
	 * @throws Exception
	 */
	public int index(String dataDir) throws Exception{
		File[] files = new File(dataDir).listFiles();
		for (File file : files) {
			indexFile(file);
		}
		return indexWriter.numDocs();
	}
	
	/**
	 * 4、索引指定的文件
	 * @param file
	 * @throws Exception
	 */
	private void indexFile(File file) throws Exception{
		System.out.println("被索引文件的全路径:"+file.getCanonicalPath());
		Document doc = getDocument(file);
		indexWriter.addDocument(doc);
	}
	
	/**
	 * 5、获取文档(索引文件中包含的重要信息,key-value的形式)
	 * @param file
	 * @return
	 * @throws Exception
	 */
	private Document getDocument(File file) throws Exception{
		Document doc = new Document();
		doc.add(new TextField("contents", new FileReader(file)));
//		Field.Store.YES是否存储到硬盘
		doc.add(new TextField("fullPath", file.getCanonicalPath(),Field.Store.YES));
		doc.add(new TextField("fileName", file.getName(),Field.Store.YES));
		return doc;
	}
}

IndexUse

package com.javaxl.lucene;

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

/**
 * 配合Demo2.java进行lucene的helloword实现
 * @author Administrator
 *
 */
public class IndexUse {
	/**
	 * 通过关键字在索引目录中查询
	 * @param indexDir	索引文件所在目录
	 * @param q	关键字
	 */
	public static void search(String indexDir, String q) throws Exception{
		FSDirectory indexDirectory = FSDirectory.open(Paths.get(indexDir));
//		注意:索引输入流不是new出来的,是通过目录读取工具类打开的
		IndexReader indexReader = DirectoryReader.open(indexDirectory);
//		获取索引搜索对象
		IndexSearcher indexSearcher = new IndexSearcher(indexReader);
		Analyzer analyzer = new StandardAnalyzer();
		QueryParser queryParser = new QueryParser("contents", analyzer);
//		获取符合关键字的查询对象
		Query query = queryParser.parse(q);
		
		long start=System.currentTimeMillis();
//		获取关键字出现的前十次
		TopDocs topDocs = indexSearcher.search(query , 10);
		long end=System.currentTimeMillis();
		System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+topDocs.totalHits+"个记录");
		
		for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
			int docID = scoreDoc.doc;
//			索引搜索对象通过文档下标获取文档
			Document doc = indexSearcher.doc(docID);
			System.out.println("通过索引文件:"+doc.get("fullPath")+"拿数据");
		}
		indexReader.close();
	}
}

下面以一个一个的案例来说明lucene每一步的工作原理
下面所有用到的测试数据文件都是事先准备好了
Demo1生成索引:我首先准备好了数据源的text文件,然后引入我的数据文件,生成索引文件
在这里插入图片描述

package com.javaxl.lucene;

/**
 * 生成索引测试
 * @author Administrator
 *
 */
public class Demo1 {
	public static void main(String[] args) {
//		索引文件将要存放的位置
		String indexDir = "E:\\Demo\\lucene\\demo1";
//		数据源地址
		String dataDir = "E:\\Demo\\lucene\\demo1\\data";
		IndexCreate ic = null; 
		try {
			
			
			ic = new IndexCreate(indexDir);
			long start = System.currentTimeMillis();
			int num = ic.index(dataDir);
			long end = System.currentTimeMillis();
			System.out.println("检索指定路径下"+num+"个文件,一共花费了"+(end-start)+"毫秒");
		} catch (Exception e) {
			e.printStackTrace();
		}finally {
			try {
				ic.closeIndexWriter();
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
	}
}

运行完后就生成了对应的索引文件
在这里插入图片描述
然后我们对这些生成的索引文件进行搜索

package com.javaxl.lucene;

/**
 * 查询索引测试
 * @author Administrator
 *
 */
public class Demo2 {
	public static void main(String[] args) {
		String indexDir = "E:\\Demo\\lucene\\demo1";
		String q = "EarlyTerminating-Collector";
		try {
			IndexUse.search(indexDir, q);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

然后就检测到了两个文件里有这个关键词
在这里插入图片描述

lucene的查询方法

特定项查询和表达式查询,表达式查询也就是我们选择编译器来查询

package com.javaxl.lucene;

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.junit.Before;
import org.junit.Test;

/**
 * 特定项搜索
 * 查询表达式(queryParser)
 * @author Administrator
 *
 */
public class Demo5 {
	@Before
	public void setUp() {
		// 索引文件将要存放的位置
		String indexDir = "E:\\Demo\\lucene\\demo4";
		// 数据源地址
		String dataDir = "E:\\Demo\\lucene\\demo4\\data";
		IndexCreate ic = null;
		try {
			ic = new IndexCreate(indexDir);
			long start = System.currentTimeMillis();
			int num = ic.index(dataDir);
			long end = System.currentTimeMillis();
			System.out.println("检索指定路径下" + num + "个文件,一共花费了" + (end - start) + "毫秒");
			
			
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			try {
				ic.closeIndexWriter();
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
	}
	
	/**
	 * 特定项搜索
	 */
	@Test
	public void testTermQuery() {
		String indexDir = "E:\\Demo\\lucene\\demo4";
		
		String fld = "contents";
		String text = "indexformattoooldexception";
//		特定项片段名和关键字
		Term t  = new Term(fld , text);
		TermQuery tq = new TermQuery(t);
		try {
			FSDirectory indexDirectory = FSDirectory.open(Paths.get(indexDir));
//			注意:索引输入流不是new出来的,是通过目录读取工具类打开的
			IndexReader indexReader = DirectoryReader.open(indexDirectory);
//			获取索引搜索对象
			IndexSearcher is = new IndexSearcher(indexReader);
			
			
			TopDocs hits = is.search(tq, 100);
//			System.out.println(hits.totalHits);
			for(ScoreDoc scoreDoc: hits.scoreDocs) {
				Document doc = is.doc(scoreDoc.doc);
				System.out.println("文件"+doc.get("fullPath")+"中含有该关键字");
				
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * 查询表达式(queryParser)
	 */
	@Test
	public void testQueryParser() {
		String indexDir = "E:\\Demo\\lucene\\demo4";
//		获取查询解析器(通过哪种分词器去解析哪种片段)
		QueryParser queryParser = new QueryParser("contents", new StandardAnalyzer());
		try {
			FSDirectory indexDirectory = FSDirectory.open(Paths.get(indexDir));
//			注意:索引输入流不是new出来的,是通过目录读取工具类打开的
			IndexReader indexReader = DirectoryReader.open(indexDirectory);
//			获取索引搜索对象
			IndexSearcher is = new IndexSearcher(indexReader);
			
//			由解析器去解析对应的关键字
			TopDocs hits = is.search(queryParser.parse("indexformattoooldexception") , 100);
			for(ScoreDoc scoreDoc: hits.scoreDocs) {
				Document doc = is.doc(scoreDoc.doc);
				System.out.println("文件"+doc.get("fullPath")+"中含有该关键字");
				
			}
		} catch (IOException e) {
			e.printStackTrace();
		} catch (ParseException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
	}
	
}

指定范文查询,比如id 1-10这种类型
指定字母开头的查询,这里是区分大小写的

package com.javaxl.lucene;

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.junit.Before;
import org.junit.Test;

/**
 * 指定数字范围查询
 * 指定字符串开头字母查询(prefixQuery)
 * @author Administrator
 *
 */
public class Demo6 {
	private int ids[]={1,2,3};
	private String citys[]={"qingdao","nanjing","shanghai"};
	private String descs[]={
			"Qingdao is a beautiful city.",
			"Nanjing is a city of culture.",
			"Shanghai is a bustling city."
	};
	private FSDirectory dir;
	
	/**
	 * 每次都生成索引文件
	 * @throws Exception
	 */
	@Before
	public void setUp() throws Exception {
		dir  = FSDirectory.open(Paths.get("E:\\Demo\\lucene\\demo4\\indexDir"));
		IndexWriter indexWriter = getIndexWriter();
		for (int i = 0; i < ids.length; i++) {
			Document doc = new Document();
			doc.add(new IntField("id", ids[i], Field.Store.YES));
			doc.add(new StringField("city", citys[i], Field.Store.YES));
			doc.add(new TextField("desc", descs[i], Field.Store.NO));
			indexWriter.addDocument(doc);
		}
		indexWriter.close();
	}
	
	/**
	 * 获取索引输出流
	 * @return
	 * @throws Exception
	 */
	private IndexWriter getIndexWriter()  throws Exception{
		Analyzer analyzer = new StandardAnalyzer();
		IndexWriterConfig conf = new IndexWriterConfig(analyzer);
		return new IndexWriter(dir, conf );
	}
	
	/**
	 * 指定数字范围查询
	 * @throws Exception
	 */
	@Test
	public void testNumericRangeQuery()throws Exception{
		IndexReader reader = DirectoryReader.open(dir);
		IndexSearcher is = new IndexSearcher(reader);
		
		NumericRangeQuery<Integer> query=NumericRangeQuery.newIntRange("id", 1, 2, true, true);
		TopDocs hits=is.search(query, 10);
		for(ScoreDoc scoreDoc:hits.scoreDocs){
			Document doc=is.doc(scoreDoc.doc);
			System.out.println(doc.get("id"));
			System.out.println(doc.get("city"));
			System.out.println(doc.get("desc"));
		}		
	}
	
	/**
	 * 指定字符串开头字母查询(prefixQuery)
	 * @throws Exception
	 */
	@Test
	public void testPrefixQuery()throws Exception{
		IndexReader reader = DirectoryReader.open(dir);
		IndexSearcher is = new IndexSearcher(reader);
		
		PrefixQuery query=new PrefixQuery(new Term("city","n"));
		TopDocs hits=is.search(query, 10);
		for(ScoreDoc scoreDoc:hits.scoreDocs){
			Document doc=is.doc(scoreDoc.doc);
			System.out.println(doc.get("id"));
			System.out.println(doc.get("city"));
			System.out.println(doc.get("desc"));
		}	
	}
	
	@Test
	public void testBooleanQuery()throws Exception{
		IndexReader reader = DirectoryReader.open(dir);
		IndexSearcher is = new IndexSearcher(reader);
		
		NumericRangeQuery<Integer> query1=NumericRangeQuery.newIntRange("id", 1, 2, true, true);
		PrefixQuery query2=new PrefixQuery(new Term("city","s"));
		BooleanQuery.Builder booleanQuery=new BooleanQuery.Builder();
		booleanQuery.add(query1,BooleanClause.Occur.MUST);
		booleanQuery.add(query2,BooleanClause.Occur.MUST);
		TopDocs hits=is.search(booleanQuery.build(), 10);
		for(ScoreDoc scoreDoc:hits.scoreDocs){
			Document doc=is.doc(scoreDoc.doc);
			System.out.println(doc.get("id"));
			System.out.println(doc.get("city"));
			System.out.println(doc.get("desc"));
		}	
	}
}

组合查询,并添加高亮效果,这种查询方式也是我们最常用的

package com.javaxl.lucene;

import java.io.StringReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.FSDirectory;
import org.junit.Before;
import org.junit.Test;

public class Demo7 {
	private Integer ids[] = { 1, 2, 3 };
	private String citys[] = { "青岛", "南京", "上海" };
	// private String descs[]={
	// "青岛是个美丽的城市。",
	// "南京是个有文化的城市。",
	// "上海市个繁华的城市。"
	// };
	private String descs[] = { "青岛是个美丽的城市。",
			"南京是一个文化的城市南京,简称宁,是江苏省会,地处中国东部地区,长江下游,濒江近海。全市下辖11个区,总面积6597平方公里,2013年建成区面积752.83平方公里,常住人口818.78万,其中城镇人口659.1万人。[1-4] “江南佳丽地,金陵帝王州”,南京拥有着6000多年文明史、近2600年建城史和近500年的建都史,是中国四大古都之一,有“六朝古都”、“十朝都会”之称,是中华文明的重要发祥地,历史上曾数次庇佑华夏之正朔,长期是中国南方的政治、经济、文化中心,拥有厚重的文化底蕴和丰富的历史遗存。[5-7] 南京是国家重要的科教中心,自古以来就是一座崇文重教的城市,有“天下文枢”、“东南第一学”的美誉。截至2013年,南京有高等院校75所,其中211高校8所,仅次于北京上海;国家重点实验室25所、国家重点学科169个、两院院士83人,均居中国第三。[8-10]",
			"上海市个繁华的城市。" };

	private FSDirectory dir;

	/**
	 * 每次都生成索引文件
	 * 
	 * @throws Exception
	 */
	@Before
	public void setUp() throws Exception {
		dir = FSDirectory.open(Paths.get("E:\\Demo\\lucene\\demo4\\indexDir"));
		IndexWriter indexWriter = getIndexWriter();
		for (int i = 0; i < ids.length; i++) {
			Document doc = new Document();
			doc.add(new IntField("id", ids[i], Field.Store.YES));
			doc.add(new StringField("city", citys[i], Field.Store.YES));
			doc.add(new TextField("desc", descs[i], Field.Store.YES));
			indexWriter.addDocument(doc);
		}
		indexWriter.close();
	}

	/**
	 * 获取索引输出流
	 * 
	 * @return
	 * @throws Exception
	 */
	private IndexWriter getIndexWriter() throws Exception {
//		Analyzer analyzer = new StandardAnalyzer();
		Analyzer analyzer = new SmartChineseAnalyzer();
		IndexWriterConfig conf = new IndexWriterConfig(analyzer);
		return new IndexWriter(dir, conf);
	}

	/**
	 * luke查看索引生成
	 * 
	 * @throws Exception
	 */
	@Test
	public void testIndexCreate() throws Exception {

	}

	/**
	 * 测试高亮
	 * 
	 * @throws Exception
	 */
	@Test
	public void testHeight() throws Exception {
		IndexReader reader = DirectoryReader.open(dir);
		IndexSearcher searcher = new IndexSearcher(reader);

		SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
		QueryParser parser = new QueryParser("desc", analyzer);
		// Query query = parser.parse("南京文化");
		Query query = parser.parse("南京文明");
		TopDocs hits = searcher.search(query, 100);

		// 查询得分项
		QueryScorer queryScorer = new QueryScorer(query);
		// 得分项对应的内容片段
		SimpleSpanFragmenter fragmenter = new SimpleSpanFragmenter(queryScorer);
		// 高亮显示的样式
		SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span color='red'><b>", "</b></span>");
		// 高亮显示对象
		Highlighter highlighter = new Highlighter(htmlFormatter, queryScorer);
		// 设置需要高亮显示对应的内容片段
		highlighter.setTextFragmenter(fragmenter);

		for (ScoreDoc scoreDoc : hits.scoreDocs) {
			Document doc = searcher.doc(scoreDoc.doc);
			String desc = doc.get("desc");
			if (desc != null) {
				// tokenstream是从doucment的域(field)中抽取的一个个分词而组成的一个数据流,用于分词。
				TokenStream tokenStream = analyzer.tokenStream("desc", new StringReader(desc));
				System.out.println("高亮显示的片段:" + highlighter.getBestFragment(tokenStream, desc));
			}
			System.out.println("所有内容:" + desc);
		}

	}

}

可以看到,在我指定的字段都加上了区分的样式
在这里插入图片描述

lucene项目实践

框架就是用的struts2,框架的配置就不发了
lucene.properties

url=jdbc:mysql://localhost:3306/test?autoReconnect=true
user=root
pwd=123
driver=com.mysql.jdbc.Driver
indexPath=E://Demo/lucene/blogCrawler/lucene

LuceneUtil

package com.javaxl.blog.util;

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryTermScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;

/**
 * lucene工具类
 * @author Administrator
 *
 */
public class LuceneUtil {

	/**
	 * 获取索引文件存放的文件夹对象
	 * 
	 * @param path
	 * @return
	 */
	public static Directory getDirectory(String path) {
		Directory directory = null;
		try {
			directory = FSDirectory.open(Paths.get(path));
		} catch (IOException e) {
			e.printStackTrace();
		}
		return directory;
	}

	/**
	 * 索引文件存放在内存
	 * 
	 * @return
	 */
	public static Directory getRAMDirectory() {
		Directory directory = new RAMDirectory();
		return directory;
	}

	/**
	 * 文件夹读取对象
	 * 
	 * @param directory
	 * @return
	 */
	public static DirectoryReader getDirectoryReader(Directory directory) {
		DirectoryReader reader = null;
		try {
			reader = DirectoryReader.open(directory);
		} catch (IOException e) {
			e.printStackTrace();
		}
		return reader;
	}

	/**
	 * 文件索引对象
	 * 
	 * @param reader
	 * @return
	 */
	public static IndexSearcher getIndexSearcher(DirectoryReader reader) {
		IndexSearcher indexSearcher = new IndexSearcher(reader);
		return indexSearcher;
	}

	/**
	 * 写入索引对象
	 * 
	 * @param directory
	 * @param analyzer
	 * @return
	 */
	public static IndexWriter getIndexWriter(Directory directory, Analyzer analyzer)

	{
		IndexWriter iwriter = null;
		try {
			IndexWriterConfig config = new IndexWriterConfig(analyzer);
			config.setOpenMode(OpenMode.CREATE_OR_APPEND);
			// Sort sort=new Sort(new SortField("content", Type.STRING));
			// config.setIndexSort(sort);//排序
			config.setCommitOnClose(true);
			// 自动提交
			// config.setMergeScheduler(new ConcurrentMergeScheduler());
			// config.setIndexDeletionPolicy(new
			// SnapshotDeletionPolicy(NoDeletionPolicy.INSTANCE));
			iwriter = new IndexWriter(directory, config);
		} catch (IOException e) {
			e.printStackTrace();
		}
		return iwriter;
	}

	/**
	 * 关闭索引文件生成对象以及文件夹对象
	 * 
	 * @param indexWriter
	 * @param directory
	 */
	public static void close(IndexWriter indexWriter, Directory directory) {
		if (indexWriter != null) {
			try {
				indexWriter.close();
			} catch (IOException e) {
				indexWriter = null;
			}
		}
		if (directory != null) {
			try {
				directory.close();
			} catch (IOException e) {
				directory = null;
			}
		}
	}

	/**
	 * 关闭索引文件读取对象以及文件夹对象
	 * 
	 * @param reader
	 * @param directory
	 */
	public static void close(DirectoryReader reader, Directory directory) {
		if (reader != null) {
			try {
				reader.close();
			} catch (IOException e) {
				reader = null;
			}
		}
		if (directory != null) {
			try {
				directory.close();
			} catch (IOException e) {
				directory = null;
			}
		}

	}

	/**
	 * 高亮标签
	 * 
	 * @param query
	 * @param fieldName
	 * @return
	 */

	public static Highlighter getHighlighter(Query query, String fieldName)

	{
		Formatter formatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
		Scorer fragmentScorer = new QueryTermScorer(query, fieldName);
		Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
		highlighter.setTextFragmenter(new SimpleFragmenter(200));
		return highlighter;
	}
}

PropertiesUtil

package com.javaxl.blog.util;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

/**
 * properties工具类
 * @author user
 *
 */
public class PropertiesUtil {

	/**
	 * 根据key获取value值
	 * @param key
	 * @return
	 */
	public static String getValue(String key){
		Properties prop=new Properties();
		InputStream in=new PropertiesUtil().getClass().getResourceAsStream("/lucene.properties");
		try {
			prop.load(in);
		} catch (IOException e) {
			e.printStackTrace();
		}
		return prop.getProperty(key);
	}
}

BolgDao

package com.javaxl.blog.dao;

import java.sql.SQLException;
import java.util.List;
import java.util.Map;

import com.javaxl.blog.util.JsonBaseDao;
import com.javaxl.blog.util.PageBean;
import com.javaxl.blog.util.StringUtils;


public class BlogDao extends JsonBaseDao{
	public List<Map<String,Object>> list(String title, PageBean pageBean) throws InstantiationException, IllegalAccessException, SQLException{
		String sql = "select * from t_lucene_crawler_blog where 1=1";
		if(StringUtils.isNotBlank(title)) {
			sql += " and title like '%"+title+"%'";
		}
		return super.executeQuery(sql, pageBean);
	}
	
	public int save(Map<String,String[]> paMap) throws InstantiationException, IllegalAccessException, SQLException, NoSuchFieldException, SecurityException, IllegalArgumentException{
		String sql = "insert into t_lucene_crawler_blog values(?,?,?,?,0)";
		return super.executeUpdate(sql, new String[] {"id","title","content","url"}, paMap);
	}
	
}

IndexStarter,这个就是创建索引文件的,第一次运行时需要先创建索引文件

package com.javaxl.blog.web;

import java.io.IOException;
import java.nio.file.Paths;
import java.sql.SQLException;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.javaxl.blog.dao.BlogDao;
import com.javaxl.blog.util.PropertiesUtil;


/**
 * 构建lucene索引
 * @author Administrator
 * 1。构建索引	IndexWriter
 * 2、读取索引文件,获取命中片段
 * 3、使得命中片段高亮显示
 *
 */
public class IndexStarter {
	private static BlogDao blogDao = new BlogDao();
	public static void main(String[] args) {
		IndexWriterConfig conf = new IndexWriterConfig(new SmartChineseAnalyzer());
		Directory d;
		IndexWriter indexWriter = null;
		try {
			d = FSDirectory.open(Paths.get(PropertiesUtil.getValue("indexPath")));
			indexWriter = new IndexWriter(d , conf );
			
//			为数据库中的所有数据构建索引
			List<Map<String, Object>> list = blogDao.list(null, null);
			for (Map<String, Object> map : list) {
				Document doc = new Document();
				doc.add(new StringField("id", (String) map.get("id"), Field.Store.YES));
//				TextField用于对一句话分词处理	java培训机构
				doc.add(new TextField("title", (String) map.get("title"), Field.Store.YES));
				doc.add(new StringField("url", (String) map.get("url"), Field.Store.YES));
				indexWriter.addDocument(doc);
			}
			
		} catch (IOException e) {
			e.printStackTrace();
		} catch (InstantiationException e) {
			e.printStackTrace();
		} catch (IllegalAccessException e) {
			e.printStackTrace();
		} catch (SQLException e) {
			e.printStackTrace();
		}finally {
			try {
				if(indexWriter!= null) {
					indexWriter.close();
				}
			} catch (IOException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
			}
		}
	}
}

BlogAction

package com.javaxl.blog.web;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.servlet.http.HttpServletRequest;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.store.Directory;
import org.apache.struts2.ServletActionContext;

import com.javaxl.blog.dao.BlogDao;
import com.javaxl.blog.util.LuceneUtil;
import com.javaxl.blog.util.PropertiesUtil;
import com.javaxl.blog.util.StringUtils;

/**
 * IndexReader
 * IndexSearcher
 * Highlighter
 * @author Administrator
 *
 */
public class BlogAction {
	private String title;
	private BlogDao blogDao = new BlogDao();

	public String getTitle() {
		return title;
	}

	public void setTitle(String title) {
		this.title = title;
	}

	public String list() {
		try {
			HttpServletRequest request = ServletActionContext.getRequest();
			if (StringUtils.isBlank(title)) {
				List<Map<String, Object>> blogList = this.blogDao.list(title, null);
				request.setAttribute("blogList", blogList);
			}else {
				Directory directory = LuceneUtil.getDirectory(PropertiesUtil.getValue("indexPath"));
				DirectoryReader reader = LuceneUtil.getDirectoryReader(directory);
				IndexSearcher searcher = LuceneUtil.getIndexSearcher(reader);
				SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
//				拿一句话到索引目中的索引文件中的词库进行关键词碰撞
				Query query = new QueryParser("title", analyzer).parse(title);
				Highlighter highlighter = LuceneUtil.getHighlighter(query, "title");
				
				TopDocs topDocs = searcher.search(query , 100);
				//处理得分命中的文档
				List<Map<String, Object>> blogList = new ArrayList<>();
				Map<String, Object> map = null;
				ScoreDoc[] scoreDocs = topDocs.scoreDocs;
				for (ScoreDoc scoreDoc : scoreDocs) {
					map = new HashMap<>();
					Document doc = searcher.doc(scoreDoc.doc);
					map.put("id", doc.get("id"));
					String titleHighlighter = doc.get("title");
					if(StringUtils.isNotBlank(titleHighlighter)) {
						titleHighlighter = highlighter.getBestFragment(analyzer, "title", titleHighlighter);
					}
					map.put("title", titleHighlighter);
					map.put("url", doc.get("url"));
					blogList.add(map);
				}
				request.setAttribute("blogList", blogList);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return "blogList";
	}
}

BlogActionBak

package com.javaxl.blog.web;

import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.servlet.http.HttpServletRequest;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.struts2.ServletActionContext;

import com.javaxl.blog.dao.BlogDao;
import com.javaxl.blog.util.PropertiesUtil;
import com.javaxl.blog.util.StringUtils;


/**
 * IndexReader
 * IndexSearcher
 * Highlighter
 * @author Administrator
 *
 */
public class BlogActionBak {
	private String title;
	private BlogDao blogDao = new BlogDao();

	public String getTitle() {
		return title;
	}

	public void setTitle(String title) {
		this.title = title;
	}

	public String execute() {
		try {
			HttpServletRequest request = ServletActionContext.getRequest();
			if (StringUtils.isBlank(title)) {
				List<Map<String, Object>> blogList = this.blogDao.list(title, null);
				request.setAttribute("blogList", blogList);
			}
			else {
//				拿到中文解析器
				SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
				IndexReader indexReader = DirectoryReader.open(FSDirectory.open(Paths.get(PropertiesUtil.getValue("indexPath"))));
				IndexSearcher searcher = new IndexSearcher(indexReader);
//				拿一句话到索引目中的索引文件中的词库进行关键词碰撞
				Query query = new QueryParser("title", analyzer).parse(title);
				TopDocs topDocs = searcher.search(query , 100);
				
//				将碰撞出来的关键词给点亮
				QueryScorer queryScorer = new QueryScorer(query);
//				以什么形式点亮关键词
				Formatter formatter = new SimpleHTMLFormatter("<span style='color:red;'><b>", "</span></b>");
				Highlighter highlighter = new Highlighter(formatter , queryScorer);
				
				List<Map<String, Object>> blogList = new ArrayList<>();
				Map<String, Object> map = null;
				ScoreDoc[] scoreDocs = topDocs.scoreDocs;
				for (ScoreDoc scoreDoc : scoreDocs) {
					map = new HashMap<>();
					Document doc = searcher.doc(scoreDoc.doc);
					map.put("id", doc.get("id"));
					String titleHighlighter = doc.get("title");
					if(StringUtils.isNotBlank(titleHighlighter)) {
						titleHighlighter = highlighter.getBestFragment(analyzer, "title", titleHighlighter);
					}
					map.put("title", titleHighlighter);
					map.put("url", doc.get("url"));
					blogList.add(map);
				}
				request.setAttribute("blogList", blogList);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return "blogList";
	}
}

blogList.jsp,就是一个简单的展示数据的jsp

<%@ page language="java" contentType="text/html; charset=UTF-8"
    pageEncoding="UTF-8"%>
    <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c" %>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Insert title here</title>
</head>
<body>
<form action="${pageContext.request.contextPath}/sy/blog_list.action"
		method="post">
		博客标题:<input type="text" name="title"> <input type="submit"
			value="确定">
	</form>
	<button id="add">添加</button>
	<button id="refresh">刷新全局索引</button>
	<table border="1" width="100%">
		<tr>
			<td>编号</td>
			<td>名称</td>
			<td>价格</td>
			<td>操作</td>
		</tr>
		<c:forEach items="${blogList }" var="blog">
			<tr>
				<td>${blog.id }</td>
				<td>${blog.title }</td>
				<td><a href="${blog.url }">${blog.title }</a></td>
				<td>
					<a href="">修改</a>
					<a href="">删除</a>
				</td>
			</tr>
		</c:forEach>
	</table>
</body>
</html>

最后呈现的效果
在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

不愿秃头的阳某

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值