Lucene
是Apache软件基金会发布的一个开放源代码的全文检索引擎工具包,提供了完整的创建索引和查询索引的引擎,以及部分文本分析的引擎;它是根据关键字来搜索的文本搜索工具,只能在某个网站内部搜索文本内容,不能跨网站搜索。
Lucene中存的是一系列的二进制压缩文件和一些控制文件,位于计算机的硬盘上,
统称为索引库,索引库由两部分组成:
(1)原始记录 存放的是原始记录信息,Lucene为存入的内容分配一个唯一的编号
(2)词汇表 按照一定的拆分策略(即分词器)将原始记录中的每个字符拆开后,存入供搜索的表,存放的是经过分词器拆分出来的词汇和该词汇在原始记录表中的编号
为什么网站内部有些地方要用Lucene,而不全用SQL搜索
(1)SQL只能针对数据库表搜索,不能直接针对硬盘上的文本搜索
(2)SQL没有相关度排名
(3)SQL搜索结果没有关键字高亮显示
(4)SQL需要数据库的支持,数据库本身的内存开销较大,例如:Oracle
(5)SQL搜索有时较慢,尤其是数据库不在本地时,超慢,例如:Oracle
使用Lucene的流程
创建索引库:
1) 创建JavaBean对象
2) 创建Document对象
3) 将JavaBean对象所有的属性值,均放到Document对象中去,属性名可以和JavaBean相同或不同
4) 创建IndexWriter对象
5) 将Document对象通过IndexWriter对象写入索引库中
6) 关闭IndexWriter对象
根据关键字查询索引库中的内容:
1) 创建IndexSearcher对象
2) 创建QueryParser对象
3) 创建Query对象来封装关键字
4) 用IndexSearcher对象去索引库中查询符合条件的前N条记录(下面的示例代码中N为10),不足N条记录的以实际为准
5) 获取符合条件的编号
6) 用indexSearcher对象去索引库中查询编号对应的Document对象
7) 将Document对象中的所有属性取出,再封装回JavaBean对象中去,并加入到集合中保存,以备后用
/**
 * Builds one sample {@code Article}, copies its three properties into a Lucene
 * {@code Document} and writes that document into the file-system index.
 *
 * @throws Exception if the index directory cannot be opened or the document cannot be written
 */
@Test
public void createIndexDB() throws Exception{
    Article article = new Article(1,"培训"," Java培训机构");

    // Every bean property becomes a stored, analyzed field of the same name.
    Document document = new Document();
    document.add(new Field("id",article.getId().toString(),Store.YES,Index.ANALYZED));
    document.add(new Field("title",article.getTitle(),Store.YES,Index.ANALYZED));
    document.add(new Field("content",article.getContent(),Store.YES,Index.ANALYZED));

    Directory directory = FSDirectory.open(new File("E:/LuceneDBDBDBDBDBDBDBDBDB"));
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    MaxFieldLength maxFieldLength = MaxFieldLength.LIMITED;

    IndexWriter indexWriter = new IndexWriter(directory,analyzer,maxFieldLength);
    try {
        indexWriter.addDocument(document);
    } finally {
        // Close the writer even when addDocument fails, so it is never left open.
        indexWriter.close();
    }
}
/**
 * Searches the "content" field of the file-system index for a keyword, rebuilds an
 * {@code Article} from each of the (at most 10) hits and prints them.
 *
 * @throws Exception if the index cannot be opened, the keyword cannot be parsed,
 *                   or a stored "id" value is not an integer
 */
@Test
public void findIndexDB() throws Exception{
    List<Article> articleList = new ArrayList<Article>();
    String keywords = "传";

    Directory directory = FSDirectory.open(new File("E:/LuceneDBDBDBDBDBDBDBDBDB"));
    Version version = Version.LUCENE_30;
    Analyzer analyzer = new StandardAnalyzer(version);
    QueryParser queryParser = new QueryParser(version,"content",analyzer);
    Query query = queryParser.parse(keywords);

    IndexSearcher indexSearcher = new IndexSearcher(directory);
    try {
        TopDocs topDocs = indexSearcher.search(query,10);
        for(ScoreDoc scoreDoc : topDocs.scoreDocs){
            // scoreDoc.doc is the internal document number used to load the stored fields.
            Document document = indexSearcher.doc(scoreDoc.doc);
            String id = document.get("id");
            String title = document.get("title");
            String content = document.get("content");
            articleList.add(new Article(Integer.parseInt(id),title,content));
        }
    } finally {
        // The original never closed the searcher; release it once all hits are read.
        indexSearcher.close();
    }

    for(Article article : articleList){
        System.out.println(article.getId()+":"+article.getTitle()+":"+article.getContent());
    }
}
创建LuceneUtil工具类,使用反射,封装通用的方法
/**
 * Shared Lucene settings (directory, analyzer, version, max field length) plus
 * reflection-based converters between JavaBeans and Lucene {@code Document}s.
 */
public class LuceneUtil {
    private static Directory directory ;
    private static Analyzer analyzer ;
    private static Version version;
    private static MaxFieldLength maxFieldLength;

    // Initialise the shared settings once; any failure aborts class loading.
    static{
        try {
            directory = FSDirectory.open(new File("E:/LuceneDBDBDBDBDBDBDBDBDB"));
            version = Version.LUCENE_30;
            analyzer = new StandardAnalyzer(version);
            maxFieldLength = MaxFieldLength.LIMITED;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /** @return the shared index directory */
    public static Directory getDirectory() {
        return directory;
    }

    /** @return the shared analyzer */
    public static Analyzer getAnalyzer() {
        return analyzer;
    }

    /** @return the Lucene compatibility version used by this utility */
    public static Version getVersion() {
        return version;
    }

    /** @return the shared maximum field length setting */
    public static MaxFieldLength getMaxFieldLength() {
        return maxFieldLength;
    }

    /**
     * Converts a JavaBean into a {@code Document}: each declared instance field is read
     * through its {@code getXxx()} method and added as a stored, analyzed field named
     * after the bean field.
     *
     * <p>Static and synthetic fields are skipped because they have no bean getter, and
     * properties whose getter returns {@code null} are left out of the document instead
     * of triggering a {@code NullPointerException}.
     *
     * @param obj the bean to convert; must not be {@code null}
     * @return a document holding one field per non-null bean property
     * @throws Exception if a getter is missing or cannot be invoked
     */
    public static Document javabean2documemt(Object obj) throws Exception{
        Document document = new Document();
        Class<?> clazz = obj.getClass();
        for(java.lang.reflect.Field field : clazz.getDeclaredFields()){
            if(field.isSynthetic() || java.lang.reflect.Modifier.isStatic(field.getModifiers())){
                continue;
            }
            String fieldName = field.getName();
            // Use a fixed locale so the getter name does not depend on the default
            // locale's case-mapping rules.
            String init = fieldName.substring(0,1).toUpperCase(java.util.Locale.ENGLISH);
            String methodName = "get" + init + fieldName.substring(1);
            Method method = clazz.getDeclaredMethod(methodName);
            Object returnValue = method.invoke(obj);
            if(returnValue == null){
                continue;
            }
            document.add(new Field(fieldName,returnValue.toString(),Store.YES,Index.ANALYZED));
        }
        return document;
    }

    /**
     * Correctly spelled alias of {@link #javabean2documemt(Object)}; the original
     * misspelled name is kept so existing callers keep working.
     *
     * @param obj the bean to convert; must not be {@code null}
     * @return a document holding one field per non-null bean property
     * @throws Exception if a getter is missing or cannot be invoked
     */
    public static Document javabean2document(Object obj) throws Exception{
        return javabean2documemt(obj);
    }

    /**
     * Creates a new instance of {@code clazz} and fills each declared field with the
     * value stored in the document under the same name, using {@code BeanUtils.setProperty}.
     *
     * @param document the document to read stored values from
     * @param clazz    the bean class; must have an accessible no-argument constructor
     * @return the populated bean
     * @throws Exception if the bean cannot be instantiated or a property cannot be set
     */
    public static Object document2javabean(Document document,Class<?> clazz) throws Exception{
        Object obj = clazz.newInstance();
        java.lang.reflect.Field[] reflectFields = clazz.getDeclaredFields();
        for(java.lang.reflect.Field field : reflectFields){
            field.setAccessible(true);
            String fieldName = field.getName();
            String fieldValue = document.get(fieldName);
            BeanUtils.setProperty(obj,fieldName,fieldValue);
        }
        return obj;
    }
}
.
使用LuceneUtil工具类,重构
/**
 * The create/search example rewritten on top of {@code LuceneUtil}.
 */
public class SecondLucene {

    /**
     * Converts one sample {@code Article} into a document and writes it to the shared index.
     *
     * @throws Exception if the bean cannot be converted or the index cannot be written
     */
    @Test
    public void createIndexDB() throws Exception{
        Article article = new Article(1,"Java培训","Java培训机构");
        Document document = LuceneUtil.javabean2documemt(article);
        IndexWriter indexWriter = new IndexWriter(LuceneUtil.getDirectory(),LuceneUtil.getAnalyzer(),LuceneUtil.getMaxFieldLength());
        try {
            indexWriter.addDocument(document);
        } finally {
            // Close the writer even when addDocument fails.
            indexWriter.close();
        }
    }

    /**
     * Searches the "content" field for a keyword, converts the (at most 10) hits back
     * into {@code Article} beans and prints them.
     *
     * @throws Exception if the keyword cannot be parsed or the index cannot be read
     */
    @Test
    public void findIndexDB() throws Exception{
        List<Article> articleList = new ArrayList<Article>();
        String keywords = "传";
        QueryParser queryParser = new QueryParser(LuceneUtil.getVersion(),"content",LuceneUtil.getAnalyzer());
        Query query = queryParser.parse(keywords);
        IndexSearcher indexSearcher = new IndexSearcher(LuceneUtil.getDirectory());
        try {
            TopDocs topDocs = indexSearcher.search(query,10);
            for(ScoreDoc scoreDoc : topDocs.scoreDocs){
                Document document = indexSearcher.doc(scoreDoc.doc);
                Article article = (Article) LuceneUtil.document2javabean(document,Article.class);
                articleList.add(article);
            }
        } finally {
            // The original never closed the searcher; release it once all hits are read.
            indexSearcher.close();
        }
        for(Article article : articleList){
            System.out.println(article.getId()+":"+article.getTitle()+":"+article.getContent());
        }
    }
}
完成CURD操作
/**
 * Add / update / delete / search examples against the shared index from {@code LuceneUtil}.
 * Every writer and searcher is closed in a {@code finally} block so a failing operation
 * does not leave it open.
 */
public class LuceneCURD {

    /** Opens a writer on the shared directory with the shared analyzer and field-length limit. */
    private IndexWriter openWriter() throws Exception{
        return new IndexWriter(LuceneUtil.getDirectory(),LuceneUtil.getAnalyzer(),LuceneUtil.getMaxFieldLength());
    }

    /**
     * Adds one sample article to the index.
     *
     * @throws Exception if the bean cannot be converted or the index cannot be written
     */
    @Test
    public void addIndexDB() throws Exception{
        Article article = new Article(1,"培训","传ava培训机构");
        Document document = LuceneUtil.javabean2documemt(article);
        IndexWriter indexWriter = openWriter();
        try {
            indexWriter.addDocument(document);
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Replaces the document(s) whose "id" term equals 1 with a new version of the article.
     *
     * @throws Exception if the bean cannot be converted or the index cannot be written
     */
    @Test
    public void updateIndexDB() throws Exception{
        Integer id = 1;
        Article article = new Article(1,"培训","广州传智是一个Java培训机构");
        Document document = LuceneUtil.javabean2documemt(article);
        Term term = new Term("id",id.toString());
        IndexWriter indexWriter = openWriter();
        try {
            indexWriter.updateDocument(term,document);
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Deletes the document(s) whose "id" term equals 1.
     *
     * @throws Exception if the index cannot be written
     */
    @Test
    public void deleteIndexDB() throws Exception{
        Integer id = 1;
        Term term = new Term("id",id.toString());
        IndexWriter indexWriter = openWriter();
        try {
            indexWriter.deleteDocuments(term);
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Deletes every document in the index.
     *
     * @throws Exception if the index cannot be written
     */
    @Test
    public void deleteAllIndexDB() throws Exception{
        IndexWriter indexWriter = openWriter();
        try {
            indexWriter.deleteAll();
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Searches the "content" field for a keyword, converts the (at most 10) hits back
     * into {@code Article} beans and prints them.
     *
     * @throws Exception if the keyword cannot be parsed or the index cannot be read
     */
    @Test
    public void searchIndexDB() throws Exception{
        List<Article> articleList = new ArrayList<Article>();
        String keywords = "传";
        QueryParser queryParser = new QueryParser(LuceneUtil.getVersion(),"content",LuceneUtil.getAnalyzer());
        Query query = queryParser.parse(keywords);
        IndexSearcher indexSearcher = new IndexSearcher(LuceneUtil.getDirectory());
        try {
            TopDocs topDocs = indexSearcher.search(query,10);
            for(ScoreDoc scoreDoc : topDocs.scoreDocs){
                Document document = indexSearcher.doc(scoreDoc.doc);
                Article article = (Article) LuceneUtil.document2javabean(document,Article.class);
                articleList.add(article);
            }
        } finally {
            // The original never closed the searcher; release it once all hits are read.
            indexSearcher.close();
        }
        for(Article article : articleList){
            System.out.println(article.getId()+":"+article.getTitle()+":"+article.getContent());
        }
    }
}
索引库优化
默认情况下,向索引库中增加一个Document对象时,索引库会自动添加一个扩展名为*.cfs的二进制压缩文件;如果向索引库中存入的Document对象过多,那么*.cfs文件的数量也会不断增加,同时索引库的容量也会不断增加,影响索引库的大小。
索引库优化方案
1 合并cfs文件,合并后的cfs文件是二进制压缩文件,解决的是文件大小和数量的问题
try {
    indexWriter.addDocument(document);
    // Merge the index files right after the write (the step this note illustrates).
    indexWriter.optimize();
} finally {
    // Close the writer even when adding or optimizing fails.
    indexWriter.close();
}
2设定合并因子,自动合并cfs文件,默认10个cfs文件合并成一个cfs文件
try {
    // Configure the merge factor BEFORE adding documents so it is already in effect
    // for this write; the original set it only after the document had been added.
    indexWriter.setMergeFactor(3);
    indexWriter.addDocument(document);
} finally {
    // Close the writer even when the add fails.
    indexWriter.close();
}
3 使用RAMDirectory,类似于内存索引库,解决的是读取索引库文件的速度问题;它以空间换时间,提高了速度,但不能持久保存,因此启动时要将硬盘中的索引库加载到内存中的索引库,退出时再将内存中的索引库保存到硬盘中的索引库,且内容不能重复。
Article article = new Article(1,"培训","传Java培训机构");
// LuceneUtil declares this converter as javabean2documemt; the original snippet
// called a differently spelled name that the utility class does not define.
Document document = LuceneUtil.javabean2documemt(article);

// Load the on-disk index into memory first.
Directory fsDirectory = FSDirectory.open(new File("E:/indexDBDBDBDBDBDBDBDB"));
Directory ramDirectory = new RAMDirectory(fsDirectory);

// Do the write against the in-memory copy.
IndexWriter ramIndexWriter = new IndexWriter(ramDirectory,LuceneUtil.getAnalyzer(),LuceneUtil.getMaxFieldLength());
try {
    ramIndexWriter.addDocument(document);
} finally {
    ramIndexWriter.close();
}

// Only now recreate the on-disk index (create == true) and copy the in-memory index
// back, so a failure in the in-memory step above leaves the disk index untouched.
IndexWriter fsIndexWriter = new IndexWriter(fsDirectory,LuceneUtil.getAnalyzer(),true,LuceneUtil.getMaxFieldLength());
try {
    fsIndexWriter.addIndexesNoOptimize(ramDirectory);
} finally {
    fsIndexWriter.close();
}
搜索结果高亮
将与关键字相同的字符用红色显示
String keywords = "培训";
List<Article> articleList = new ArrayList<Article>();
QueryParser queryParser = new QueryParser(LuceneUtil.getVersion(),"content",LuceneUtil.getAnalyzer());
Query query = queryParser.parse(keywords);
IndexSearcher indexSearcher = new IndexSearcher(LuceneUtil.getDirectory());
TopDocs topDocs = indexSearcher.search(query,1000000);
Formatter formatter = new SimpleHTMLFormatter("<font color='red'>","</font>");
Scorer scorer = new QueryScorer(query);
Highlighter highlighter = new Highlighter(formatter,scorer);
for(int i=0;i<topDocs.scoreDocs.length;i++){
ScoreDoc scoreDoc = topDocs.scoreDocs[i];
int no = scoreDoc.doc;
Document document = indexSearcher.doc(no);
String highlighterContent = highlighter.getBestFragment(LuceneUtil.getAnalyzer(),"content",document.get("content"));
document.getField("content").setValue(highlighterContent);
Article article = (Article) LuceneUtil.document2javabean(document,Article.class);
articleList.add(article);
}
for(Article article : articleList){
System.out.println(article);
}
}
搜索结果摘要
搜索结果内容太多,我们只想显示前几个字符, 必须与高亮一起使用
String keywords = "培训";
List<Article> articleList = new ArrayList<Article>();
QueryParser queryParser = new QueryParser(LuceneUtil.getVersion(),"content",LuceneUtil.getAnalyzer());
Query query = queryParser.parse(keywords);
IndexSearcher indexSearcher = new IndexSearcher(LuceneUtil.getDirectory());
TopDocs topDocs = indexSearcher.search(query,1000000);
Formatter formatter = new SimpleHTMLFormatter("<font color='red'>","</font>");
Scorer scorer = new QueryScorer(query);
Highlighter highlighter = new Highlighter(formatter,scorer);
Fragmenter fragmenter = new SimpleFragmenter(4);
highlighter.setTextFragmenter(fragmenter);
for(int i=0;i<topDocs.scoreDocs.length;i++){
ScoreDoc scoreDoc = topDocs.scoreDocs[i];
int no = scoreDoc.doc;
Document document = indexSearcher.doc(no);
String highlighterContent = highlighter.getBestFragment(LuceneUtil.getAnalyzer(),"content",document.get("content"));
document.getField("content").setValue(highlighterContent);
Article article = (Article) LuceneUtil.document2javabean(document,Article.class);
articleList.add(article);
}
for(Article article : articleList){
System.out.println(article);
}
}
搜索结果排序
Lucene是按相关度得分排序的,得分高排在前,得分低排在后,如果相关度得分相同,按插入索引库的先后次序排序
在Lucene中设置相关度得分
IndexWriter indexWriter = new IndexWriter(LuceneUtil.getDirectory(),LuceneUtil.getAnalyzer(),LuceneUtil.getMaxFieldLength());
try {
    // Set the boost on the document before it is handed to the writer.
    document.setBoost(20F);
    indexWriter.addDocument(document);
} finally {
    // Close the writer even when the add fails.
    indexWriter.close();
}
按单个字段排序
// Sort on the "id" field read as an int, with the reverse flag set.
SortField idSortField = new SortField("id",SortField.INT,true);
Sort sort = new Sort(idSortField);
// No filter (null); request up to 1000000 hits in the given sort order.
TopDocs topDocs = indexSearcher.search(query,null,1000000,sort);
按多个字段排序
// Sort keys in priority order: "count" first, then "id"; both read as ints with the reverse flag set.
SortField countSortField = new SortField("count",SortField.INT,true);
SortField idSortField = new SortField("id",SortField.INT,true);
Sort sort = new Sort(countSortField,idSortField);
// No filter (null); request up to 1000000 hits in the given sort order.
TopDocs topDocs = indexSearcher.search(query,null,1000000,sort);