1. Concept Overview
The system currently holds a large volume of message records. Each message is small, roughly 2,000-3,000 bytes, but there are a great many of them. These messages are unstructured data, and querying such a mass of unstructured data is currently slow; full-text search technology makes it possible to manage this unstructured data efficiently.
Full-text search is a retrieval technique in which an indexing program scans every word in a document and builds an index entry for it, recording how many times and where the word occurs. When a user issues a query, the search program looks the query terms up in this pre-built index and returns the matches to the user. The process is similar to looking up a character in a dictionary through the dictionary's index table.
2. How Full-Text Search Works
From this definition, full-text search breaks down into two broad phases: index creation and index search.
Index creation: extracting terms from the raw data and building the index from them.
Index search: taking a user's query, searching the previously built index, and returning the results.
Index creation:
1) Start with a set of documents to be indexed.
2) Lexical analysis and linguistic processing turn each document into a sequence of terms.
3) Index construction builds the term dictionary and the inverted index (per-term posting lists).
4) Index storage writes the index to disk.
Index search:
1) The user enters a query string.
2) Lexical and linguistic analysis turn the query string into a sequence of terms.
3) Syntactic analysis builds a query tree from those terms.
4) Index storage reads the index from disk into memory.
5) The query tree is run against the index to obtain the posting list of documents for each term (Term); intersecting, subtracting, and merging these lists yields the result documents (see the sketch after this list).
6) The result documents are ranked by their relevance to the query.
7) The ranked results are returned to the user.
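To make the dictionary, the posting lists, and the intersection in step 5 concrete, here is a minimal, self-contained sketch (Java 8 syntax) of an inverted index over the three example documents used later in this article. It is illustrative only: the class and method names are made up, and this is not how Lucene is implemented internally.

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

public class MiniInvertedIndex {
    // Dictionary: term -> sorted set of document ids (the posting list)
    private final Map<String, TreeSet<Integer>> postings = new HashMap<>();

    // Index creation: split a document into terms and record the doc id under each term
    public void add(int docId, String text) {
        for (String term : text.toLowerCase().split("\\s+")) {
            postings.computeIfAbsent(term, t -> new TreeSet<>()).add(docId);
        }
    }

    // Index search: an AND query is the intersection of the terms' posting lists
    public Set<Integer> searchAll(String... terms) {
        Set<Integer> result = null;
        for (String term : terms) {
            Set<Integer> hits = postings.getOrDefault(term.toLowerCase(), new TreeSet<>());
            if (result == null) {
                result = new TreeSet<>(hits);
            } else {
                result.retainAll(hits); // posting-list intersection
            }
        }
        return result == null ? Collections.<Integer>emptySet() : result;
    }

    public static void main(String[] args) {
        MiniInvertedIndex index = new MiniInvertedIndex();
        index.add(1, "hello java");
        index.add(2, "hello java world");
        index.add(3, "hello lucene world");
        // Documents containing both "hello" and "world": prints [2, 3]
        System.out.println(index.searchAll("hello", "world"));
    }
}

A real engine also records term frequencies and positions in each posting so that step 6 can rank results by relevance, but the dictionary-plus-posting-list shape is the same.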
3. Sample Code
HelloWorld example
package com.liuzm.lucene.day1;
import java.io.File;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
public class HelloWorld {
// Sample data
private String doc1 = "hello java";
private String doc2 = "hello java world";
private String doc3 = "hello lucene world";
private String path = "D:/DevelopTools/eclipse4ee/workspace/lucene/firstIndex";
private Version matchVersion = Version.LUCENE_47;
/**
 * Creating the index.
 * Directory : abstraction over the Lucene index storage
 * Analyzer  : analyzer that determines how text is split into terms
 * Document  : the basic unit stored in Lucene, comparable to a table row
 * Field     : one field (column) of that row
 * @throws Exception
 */
@Test
public void createIndex() throws Exception {
// Output directory for the index
Directory directory = FSDirectory.open(new File(path));
// Create the analyzer
Analyzer analyzer = new StandardAnalyzer(matchVersion);
// Configuration object for the index writer
IndexWriterConfig config = new IndexWriterConfig(matchVersion, analyzer);
// Rebuild the index from scratch each run, discarding any existing index
config.setOpenMode(OpenMode.CREATE);
// Create the index writer
IndexWriter indexWriter = new IndexWriter(directory, config);
// Build documents and add them to the index
FieldType fieldType = new FieldType();
// Configure the field type
fieldType.setIndexed(true); // searchable
fieldType.setStored(true); // original value kept in the stored data
fieldType.setTokenized(true); // value is tokenized
// Create the documents
Document document1 = new Document();
document1.add(new Field("title", "doc1", fieldType));
document1.add(new Field("content", doc1, fieldType));
// document1.add(new TextField("title", "doc1", Store.YES));
// document1.add(new TextField("content", doc1, Store.YES));
indexWriter.addDocument(document1);
Document document2 = new Document();
document2.add(new TextField("title", "doc2", Store.YES));
document2.add(new TextField("content", doc2, Store.YES));
indexWriter.addDocument(document2);
Document document3 = new Document();
document3.add(new TextField("title", "doc3", Store.YES));
document3.add(new TextField("content", doc3, Store.YES));
indexWriter.addDocument(document3);
// Close the writer, which flushes the index to disk
indexWriter.close();
}
/**
 * Searching the index
*
* @throws Exception
*/
@Test
public void searchIndex() throws Exception {
// Query keyword
String queryStr = "hello";
// Directory containing the index to search
Directory directory = FSDirectory.open(new File(path));
// Create the analyzer
Analyzer analyzer = new StandardAnalyzer(matchVersion);
// Index reader
DirectoryReader directoryReader = DirectoryReader.open(directory);
//IndexReader indexReader = IndexReader.open(directory);
// Create the index searcher
IndexSearcher indexSearcher = new IndexSearcher(directoryReader);
//IndexSearcher indexSearcher = new IndexSearcher(indexReader);
// Create a query parser
QueryParser parser = new QueryParser(matchVersion, "content", analyzer);
// Build the query object
Query query = parser.parse(queryStr);
// Run the query, retrieving at most the top 100 matching documents
TopDocs topDocs = indexSearcher.search(query, 100);
// Print the total hit count
System.out.println(topDocs.totalHits);
// Retrieve the matching documents
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
// Iterate over the hits
for (ScoreDoc scoreDoc : scoreDocs) {
// Lucene's internal document id
int docId = scoreDoc.doc;
// Fetch the stored Document
Document document = indexSearcher.doc(docId);
System.out.println(document);
System.out.println("docId: " + docId);
System.out.println("title: " + document.get("title"));
System.out.println("content : " + document.get("content"));
System.out.println();
}
}
}
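These listings target Lucene 4.7 and JUnit 4. As a rough guide (my assumption, not stated in the original), you would need lucene-core, lucene-analyzers-common (for StandardAnalyzer), and lucene-queryparser on the classpath; the IndexCRUD and analyzer examples below additionally use lucene-analyzers-smartcn for SmartChineseAnalyzer.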
CRUD operations on the index
package com.liuzm.lucene.day1;
import java.io.File;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
public class IndexCRUD {
private String doc1 = "hello java";
private String doc2 = "hello java world";
private String doc3 = "hello lucene world";
private String path = "D:/DevelopTools/eclipse4ee/workspace/lucene/CRUDIndex";
private Version matchVersion = Version.LUCENE_47;
@Test
public void createIndex() throws Exception {
// Output directory for the index
Directory directory = FSDirectory.open(new File(path));
// Create the analyzer
Analyzer analyzer = new SmartChineseAnalyzer(matchVersion);
// Index writer configuration
IndexWriterConfig config = new IndexWriterConfig(matchVersion, analyzer);
config.setOpenMode(OpenMode.CREATE);
// Create the index writer
IndexWriter indexWriter = new IndexWriter(directory, config);
// Build documents and add them to the index
FieldType fieldType = new FieldType();
// Store the original value (note: this FieldType is not actually used below; the TextField instances already index, tokenize, and store)
fieldType.setStored(true);
// Create the documents
Document document1 = new Document();
document1.add(new TextField("id", "1", Store.YES));
document1.add(new TextField("title", "doc1", Store.YES));
document1.add(new TextField("content", doc1, Store.YES));
indexWriter.addDocument(document1);
Document document2 = new Document();
document2.add(new TextField("id", "2", Store.YES));
document2.add(new TextField("title", "doc2", Store.YES));
document2.add(new TextField("content", doc2, Store.YES));
indexWriter.addDocument(document2);
Document document3 = new Document();
document3.add(new TextField("id", "3", Store.YES));
document3.add(new TextField("title", "doc3", Store.YES));
document3.add(new TextField("content", doc3, Store.YES));
indexWriter.addDocument(document3);
// Close the writer, committing the index
indexWriter.close();
}
/**
 * Deleting documents from the index
*
* @throws Exception
*/
@Test
public void deleteIndex() throws Exception {
// Index directory
Directory directory = FSDirectory.open(new File(path));
// Analyzer
Analyzer analyzer = new SmartChineseAnalyzer(matchVersion);
// Index writer configuration
IndexWriterConfig config = new IndexWriterConfig(matchVersion, analyzer);
// Index writer
IndexWriter indexWriter = new IndexWriter(directory, config);
// Option 1: delete documents containing an exact term
// indexWriter.deleteDocuments(new Term("id","2"));
// Option 2: delete every document matching a query
QueryParser parser = new QueryParser(matchVersion, "id", analyzer);
// Build the query object
Query query = parser.parse("3");
indexWriter.deleteDocuments(query);
// Close the writer
indexWriter.close();
}
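// Note on the two delete styles above: deleteDocuments(Term) removes the
// documents whose field contains exactly that term, while deleteDocuments(Query)
// removes every document the query matches; in both cases the deletions only
// become visible to new readers once the writer commits or is closed.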
/**
 * Updating documents in the index
*
* @throws Exception
*/
@Test
public void updateIndex() throws Exception {
// Index directory
Directory directory = FSDirectory.open(new File(path));
// Analyzer
Analyzer analyzer = new SmartChineseAnalyzer(matchVersion);
// Index writer configuration
IndexWriterConfig config = new IndexWriterConfig(matchVersion, analyzer);
// Create the index writer
IndexWriter indexWriter = new IndexWriter(directory, config);
// Prepare the replacement document
Document document = new Document();
document.add(new TextField("id", "2", Store.YES));
document.add(new TextField("title", "doc2", Store.YES));
document.add(new TextField("content", "hello更改后的doc2", Store.YES));
//document.add(new TextField("content", "更改后的doc2", Store.YES)); // why does a value starting with Chinese seem to fail here???
// updateDocument performs the update as a delete followed by an add
indexWriter.updateDocument(new Term("id","2"), document);
// Close the writer
indexWriter.close();
}
@Test
public void searchIndex() throws Exception {
// Search keyword
String queryStr = "hello";
// Directory containing the index
Directory directory = FSDirectory.open(new File(path));
// Create the analyzer
Analyzer analyzer = new SmartChineseAnalyzer(matchVersion);
// Index reader (DirectoryReader.open replaces the deprecated IndexReader.open)
IndexReader indexReader = DirectoryReader.open(directory);
// Create the index searcher
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
// Create a query parser
QueryParser queryParser = new QueryParser(matchVersion, "content", analyzer);
// Build the query object
Query query = queryParser.parse(queryStr);
// Run the query, retrieving at most the top 100 matching documents
TopDocs topDocs = indexSearcher.search(query, 100);
System.out.println(topDocs.totalHits);
// Retrieve the matching documents
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
for (ScoreDoc scoreDoc : scoreDocs) {
int docId = scoreDoc.doc;
Document doc = indexSearcher.doc(docId);
System.out.println("docId :" + docId);
System.out.println("id: " + doc.get("id"));
System.out.println("title : " + doc.get("title"));
System.out.println("content: " + doc.get("content"));
System.out.println();
}
}
}
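One thing worth noting about updateIndex: updateDocument(Term, Document) is implemented as a delete followed by an add, so the replaced document receives a new internal docId, and any field that is not re-added to the new Document is simply gone.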
Analyzer tests
package com.liuzm.lucene.day1;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.junit.Test;
/**
 * Analyzer comparison tests
* @author Administrator
*
*/
public class AnalyzerTest {
private String en = "ho my lady gaga";
private String cn = "迅雷不及掩耳盗铃儿响叮当仁不让";
private String str = "源代码教育FullText框架的学习,哈哈";
private Version matchVersion = Version.LUCENE_47;
public void testAnalyzer(Analyzer analyzer, String str){
try {
// Get the token stream for the text to analyze ("name" is just a placeholder field name)
TokenStream tokenStream = analyzer.tokenStream("name", str);
// The term attribute exposes the text of the current token
CharTermAttribute termAttr = tokenStream.addAttribute(CharTermAttribute.class);
// The stream must be reset before it is consumed
tokenStream.reset();
// Iterate over the tokens and print each one
while(tokenStream.incrementToken()){
System.out.println(termAttr.toString());
}
// Honor the TokenStream contract: end and close the stream
tokenStream.end();
tokenStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
@Test
public void testStandardAnalyzer() throws Exception {
testAnalyzer(new StandardAnalyzer(matchVersion ), cn);
}
@Test
public void testSimpleAnalyzer() throws Exception {
testAnalyzer(new SimpleAnalyzer(matchVersion), cn);
}
@Test
public void testChineseAnalyzer() throws Exception {
testAnalyzer(new ChineseAnalyzer(), cn);
}
/*
 * Bigram segmentation
*/
@Test
public void testCJKAnalyzer() throws Exception {
testAnalyzer(new CJKAnalyzer(matchVersion), cn);
}
@Test
public void testClassicAnalyzer() throws Exception {
testAnalyzer(new ClassicAnalyzer(matchVersion), cn);
}
/**
 * Dictionary-based segmentation
* @throws Exception
*/
@Test
public void testSmartCnAnalyzer1() throws Exception {
CharArraySet stopWords = new CharArraySet(matchVersion, 10, true);
stopWords.add("的");
stopWords.add(",");
testAnalyzer(new SmartChineseAnalyzer(matchVersion,stopWords ), str);
}
@Test
public void testSmartCnAnalyzer2() throws Exception {
Reader reader = IOUtils.getDecodingReader(this.getClass().getClassLoader().getResourceAsStream("stopwords.txt"),IOUtils.CHARSET_UTF_8);
CharArraySet stopWords = CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(reader, "//", Version.LUCENE_47));
testAnalyzer(new SmartChineseAnalyzer(matchVersion,stopWords ), str);
}
}
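For the Chinese sample strings, the analyzers behave very differently: StandardAnalyzer falls back to emitting one token per Chinese character, CJKAnalyzer emits overlapping two-character tokens (the bigram segmentation noted above), and SmartChineseAnalyzer segments against a built-in dictionary and statistical model, which generally yields the most natural terms for Chinese text.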