1. Concept Overview
The system currently holds a large volume of message records. Each message is small, roughly 2,000-3,000 bytes, but there are a great many of them. These messages are unstructured data, and querying such a mass of unstructured data is currently slow; full-text search technology makes it possible to manage this unstructured data efficiently.
Full-text search is a retrieval technique in which an indexing program scans every word in a document and builds an index entry for it, recording how many times and where the word occurs. When a user issues a query, the search program looks the query terms up in this pre-built index and returns the matches to the user. The process is similar to looking up a character in a dictionary through the dictionary's index table.
2. How Full-Text Search Works
From this definition, full-text search breaks down into two broad phases: index creation and index search.
Index creation: extracting terms from the raw data and building the index from them.
Index search: taking a user's query, searching the previously built index, and returning the results.
Index creation:
1) Start with a set of documents to be indexed.
2) Lexical analysis and linguistic processing turn each document into a sequence of terms.
3) Index construction builds the term dictionary and the inverted index (per-term posting lists).
4) Index storage writes the index to disk.
Index search:
1) The user enters a query string.
2) Lexical and linguistic analysis turn the query string into a sequence of terms.
3) Syntactic analysis builds a query tree from those terms.
4) Index storage reads the index from disk into memory.
5) The query tree is run against the index to obtain the posting list of documents for each term (Term); intersecting, subtracting, and merging these lists yields the result documents (see the sketch after this list).
6) The result documents are ranked by their relevance to the query.
7) The ranked results are returned to the user.
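To make the dictionary, the posting lists, and the intersection in step 5 concrete, here is a minimal, self-contained sketch (Java 8 syntax) of an inverted index over the three example documents used later in this article. It is illustrative only: the class and method names are made up, and this is not how Lucene is implemented internally.

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

public class MiniInvertedIndex {
    // Dictionary: term -> sorted set of document ids (the posting list)
    private final Map<String, TreeSet<Integer>> postings = new HashMap<>();

    // Index creation: split a document into terms and record the doc id under each term
    public void add(int docId, String text) {
        for (String term : text.toLowerCase().split("\\s+")) {
            postings.computeIfAbsent(term, t -> new TreeSet<>()).add(docId);
        }
    }

    // Index search: an AND query is the intersection of the terms' posting lists
    public Set<Integer> searchAll(String... terms) {
        Set<Integer> result = null;
        for (String term : terms) {
            Set<Integer> hits = postings.getOrDefault(term.toLowerCase(), new TreeSet<>());
            if (result == null) {
                result = new TreeSet<>(hits);
            } else {
                result.retainAll(hits); // posting-list intersection
            }
        }
        return result == null ? Collections.<Integer>emptySet() : result;
    }

    public static void main(String[] args) {
        MiniInvertedIndex index = new MiniInvertedIndex();
        index.add(1, "hello java");
        index.add(2, "hello java world");
        index.add(3, "hello lucene world");
        // Documents containing both "hello" and "world": prints [2, 3]
        System.out.println(index.searchAll("hello", "world"));
    }
}

A real engine also records term frequencies and positions in each posting so that step 6 can rank results by relevance, but the dictionary-plus-posting-list shape is the same.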
3. Sample Code
HelloWorld example
package com.liuzm.lucene.day1;
import java.io.File;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
public class HelloWorld {
// Sample data
private String doc1 = "hello java";
private String doc2 = "hello java world";
private String doc3 = "hello lucene world";
private String path = "D:/DevelopTools/eclipse4ee/workspace/lucene/firstIndex";
private Version matchVersion = Version.LUCENE_47;
/**
 * Creating the index.
 * Directory : abstraction over the Lucene index storage
 * Analyzer  : analyzer that determines how text is split into terms
 * Document  : the basic unit stored in Lucene, comparable to a table row
 * Field     : one field (column) of that row
 * @throws Exception
 */
@Test
public void createIndex() throws Exception {
// Output directory for the index
Directory directory = FSDirectory.open(new File(path));
// Create the analyzer
Analyzer analyzer = new StandardAnalyzer(matchVersion);
// Configuration object for the index writer
IndexWriterConfig config = new IndexWriterConfig(matchVersion, analyzer);
// Rebuild the index from scratch each run, discarding any existing index
config.setOpenMode(OpenMode.CREATE);
// Create the index writer
IndexWriter indexWriter = new IndexWriter(directory, config);
// Build documents and add them to the index
FieldType fieldType = new FieldType();
// Configure the field type
fieldType.setIndexed(true); // searchable
fieldType.setStored(true); // original value kept in the stored data
fieldType.setTokenized(true); // value is tokenized
// Create the documents
Document document1 = new Document();
document1.add(new Field("title", "doc1", fieldType));
document1.add(new Field("content", doc1, fieldType));
// document1.add(new TextField("title", "doc1", Store.YES));
// document1.add(new TextField("content", doc1, Store.YES));
indexWriter.addDocument(document1);
Document document2 = new Document();
document2.add(new TextField("title", "doc2", Store.YES));
document2.add(new TextField("content", doc2, Store.YES));
indexWriter.addDocument(document2);
Document document3 = new Document();
document3.add(new TextField("title", "doc3", Store.YES));
document3.add(new TextField("content", doc3, Store.YES));
indexWriter.addDocument(document3);
// Close the writer, which flushes the index to disk
indexWriter.close();
}
/**
 * Searching the index
*
* @throws Exception
*/
@Test
public void searchIndex() throws Exception {
// Query keyword
String queryStr = "hello";
// Directory containing the index to search
Directory directory = FSDirectory.open(new File(path));
// Create the analyzer
Analyzer analyzer = new StandardAnalyzer(matchVersion);
// Index reader
DirectoryReader directoryReader = DirectoryReader.open(directory);
//IndexReader indexReader = IndexReader.open(directory);
// Create the index searcher
IndexSearcher indexSearcher = new IndexSearcher(directoryReader);
//IndexSearcher indexSearcher = new IndexSearcher(indexReader);
// Create a query parser
QueryParser parser = new QueryParser(matchVersion, "content", analyzer);
// Build the query object
Query query = parser.parse(queryStr);
// Run the query, retrieving at most the top 100 matching documents
TopDocs topDocs = indexSearcher.search(query, 100);
// Print the total hit count
System.out.println(topDocs.totalHits);
// Retrieve the matching documents
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
// Iterate over the hits
for (ScoreDoc scoreDoc : scoreDocs) {
// Lucene's internal document id
int docId = scoreDoc.doc;
// Fetch the stored Document
Document document = indexSearcher.doc(docId);
System.out.println(document);
System.out.println("docId: " + docId);
System.out.println("title: " + document.get("title"));
System.out.println("content : " + document.get("content"));
System.out.println();
}
}
}
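These listings target Lucene 4.7 and JUnit 4. As a rough guide (my assumption, not stated in the original), you would need lucene-core, lucene-analyzers-common (for StandardAnalyzer), and lucene-queryparser on the classpath; the IndexCRUD and analyzer examples below additionally use lucene-analyzers-smartcn for SmartChineseAnalyzer.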
CRUD operations on the index
package com.liuzm.lucene.day1;
import java.io.File;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
public class IndexCRUD {
private String doc1 = "hello java";
private String doc2 = "hello java world";
private String doc3 = "hello lucene world";
private String path = "D:/DevelopTools/eclipse4ee/workspace/lucene/CRUDIndex";
private Version matchVersion = Version.LUCENE_47;
@Test
public void createIndex() throws Exception {
// Output directory for the index
Directory directory = FSDirectory.open(new File(path));
// Create the analyzer
Analyzer analyzer = new SmartChineseAnalyzer(matchVersion);
// Index writer configuration
IndexWriterConfig config = new IndexWriterConfig(matchVersion, analyzer);
config.setOpenMode(OpenMode.CREATE);
// Create the index writer
IndexWriter indexWriter = new IndexWriter(directory, config);
// Build documents and add them to the index
FieldType fieldType = new FieldType();
// Store the original value (note: this FieldType is not actually used below; the TextField instances already index, tokenize, and store)
fieldType.setStored(true);
// Create the documents
Document document1 = new Document();
document1.add(new TextField("id", "1", Store.YES));
document1.add(new TextField("title", "doc1", Store.YES));
document1.add(new TextField("content", doc1, Store.YES));
indexWriter.addDocument(document1);
Document document2 = new Document();
document2.add(new TextField("id", "2", Store.YES));
document2.add(new TextField("title", "doc2", Store.YES));
document2.add(new TextField("content", doc2, Store.YES));
indexWriter.addDocument(document2);
Document document3 = new Document();
document3.add(new TextField("id", "3", Store.YES));
document3.add(new TextField("title", "doc3", Store.YES));
document3.add(new TextField("content", doc3, Store.YES));
indexWriter.addDocument(document3);
// Close the writer, committing the index
indexWriter.close();
}
/**
 * Deleting documents from the index
*
* @throws Exception
*/
@Test
public void deleteIndex() throws Exception {
// Index directory
Directory directory = FSDirectory.open(new File(path));
// Analyzer
Analyzer analyzer = new SmartChineseAnalyzer(matchVersion);
// Index writer configuration
IndexWriterConfig config = new IndexWriterConfig(matchVersion, analyzer);
// Index writer
IndexWriter indexWriter = new IndexWriter(directory, config);
// Option 1: delete documents containing an exact term
// indexWriter.deleteDocuments(new Term("id","2"));
// Option 2: delete every document matching a query
QueryParser parser = new QueryParser(matchVersion, "id", analyzer);
// Build the query object
Query query = parser.parse("3");
indexWriter.deleteDocuments(query);
// Close the writer
indexWriter.close();
}
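// Note on the two delete styles above: deleteDocuments(Term) removes the
// documents whose field contains exactly that term, while deleteDocuments(Query)
// removes every document the query matches; in both cases the deletions only
// become visible to new readers once the writer commits or is closed.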
/**
 * Updating documents in the index
*
* @throws Exception
*/
@Test
public void updateIndex() throws Exception {
// Index directory
Directory directory = FSDirectory.open(new File(path));
// Analyzer
Analyzer analyzer = new SmartChineseAnalyzer(matchVersion);
// Index writer configuration
IndexWriterConfig config = new IndexWriterConfig(matchVersion, analyzer);
// Create the index writer
IndexWriter indexWriter = new IndexWriter(directory, config);
// Prepare the replacement document
Document document = new Document();
document.add(new TextField("id", "2", Store.YES));
document.add(new TextField("title", "doc2", Store.YES));
document.add(new TextField("content", "hello更改后的doc2", Store.YES));
//document.add(new TextField("content", "更改后的doc2", Store.YES)); // why does a value starting with Chinese seem to fail here???
// updateDocument performs the update as a delete followed by an add
indexWriter.updateDocument(new Term("id","2"), document);
// Close the writer
indexWriter.close();
}
@Test
public void searchIndex() throws Exception {
// Search keyword
String queryStr = "hello";
// Directory containing the index
Directory directory = FSDirectory.open(new File(path));
// Create the analyzer
Analyzer analyzer = new SmartChineseAnalyzer(matchVersion);
// Index reader (DirectoryReader.open replaces the deprecated IndexReader.open)
IndexReader indexReader = DirectoryReader.open(directory);
// Create the index searcher
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
// Create a query parser
QueryParser queryParser = new QueryParser(matchVersion, "content", analyzer);
// Build the query object
Query query = queryParser.parse(queryStr);
// Run the query, retrieving at most the top 100 matching documents
TopDocs topDocs = indexSearcher.search(query, 100);
System.out.println(topDocs.totalHits);
// Retrieve the matching documents
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
for (ScoreDoc scoreDoc : scoreDocs) {
int docId = scoreDoc.doc;
Document doc = indexSearcher.doc(docId);
System.out.println("docId :" + docId);
System.out.println("id: " + doc.get("id"));
System.out.println("title : " + doc.get("title"));
System.out.println("content: " + doc.get("content"));
System.out.println();
}
}
}
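One thing worth noting about updateIndex: updateDocument(Term, Document) is implemented as a delete followed by an add, so the replaced document receives a new internal docId, and any field that is not re-added to the new Document is simply gone.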
Analyzer tests
package com.liuzm.lucene.day1;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.junit.Test;
/**
 * Analyzer comparison tests
* @author Administrator
*
*/
public class AnalyzerTest {
private String en = "ho my lady gaga";
private String cn = "迅雷不及掩耳盗铃儿响叮当仁不让";
private String str = "源代码教育FullText框架的学习,哈哈";
private Version matchVersion = Version.LUCENE_47;
public void testAnalyzer(Analyzer analyzer, String str){
try {
// Get the token stream for the text to analyze ("name" is just a placeholder field name)
TokenStream tokenStream = analyzer.tokenStream("name", str);
// The term attribute exposes the text of the current token
CharTermAttribute termAttr = tokenStream.addAttribute(CharTermAttribute.class);
// The stream must be reset before it is consumed
tokenStream.reset();
// Iterate over the tokens and print each one
while(tokenStream.incrementToken()){
System.out.println(termAttr.toString());
}
// Honor the TokenStream contract: end and close the stream
tokenStream.end();
tokenStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
@Test
public void testStandardAnalyzer() throws Exception {
testAnalyzer(new StandardAnalyzer(matchVersion ), cn);
}
@Test
public void testSimpleAnalyzer() throws Exception {
testAnalyzer(new SimpleAnalyzer(matchVersion), cn);
}
@Test
public void testChineseAnalyzer() throws Exception {
testAnalyzer(new ChineseAnalyzer(), cn);
}
/*
 * Bigram segmentation
*/
@Test
public void testCJKAnalyzer() throws Exception {
testAnalyzer(new CJKAnalyzer(matchVersion), cn);
}
@Test
public void testClassicAnalyzer() throws Exception {
testAnalyzer(new ClassicAnalyzer(matchVersion), cn);
}
/**
 * Dictionary-based segmentation
* @throws Exception
*/
@Test
public void testSmartCnAnalyzer1() throws Exception {
CharArraySet stopWords = new CharArraySet(matchVersion, 10, true);
stopWords.add("的");
stopWords.add(",");
testAnalyzer(new SmartChineseAnalyzer(matchVersion,stopWords ), str);
}
@Test
public void testSmartCnAnalyzer2() throws Exception {
Reader reader = IOUtils.getDecodingReader(this.getClass().getClassLoader().getResourceAsStream("stopwords.txt"),IOUtils.CHARSET_UTF_8);
CharArraySet stopWords = CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(reader, "//", Version.LUCENE_47));
testAnalyzer(new SmartChineseAnalyzer(matchVersion,stopWords ), str);
}
}
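For the Chinese sample strings, the analyzers behave very differently: StandardAnalyzer falls back to emitting one token per Chinese character, CJKAnalyzer emits overlapping two-character tokens (the bigram segmentation noted above), and SmartChineseAnalyzer segments against a built-in dictionary and statistical model, which generally yields the most natural terms for Chinese text.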