大数据 第十章 ElasticSearch(一)

ES基础入门

lucene知识

原理图
在这里插入图片描述
lucene快的原因:全文检索:在添加数据的时候,会对数据进行分词,将分词后的词建立索引,存储到索引库中,然后再将真正的内容即文档,保存到文档区域。
在查找时,将查询条件分词,先在索引库中查找,如果找到,会返回一个文档id,然后根据文档id,再到存储文档的区域查找真正的内容。
虽然在添加数据时,需要花费额外的时间和存储空间来建立索引,但是在查找的时候能极大提升查询速度。
lucene是一个单机版程序,ES是一个集群,底层用的是lucene,提供更方便的API。

java API使用lucene demo

<!-- Lucene core library -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>6.6.0</version>
        </dependency>

        <!-- Lucene analyzers (tokenizers): includes the standard English-oriented analyzers, but none for Chinese -->
        <!-- NOTE(review): the Java examples use IKAnalyzer, which none of the artifacts listed here appear to provide; presumably a separate IK Analyzer dependency is required - confirm -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>6.6.0</version>
        </dependency>

        <!-- Query parser -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>6.6.0</version>
        </dependency>

        <!-- Additional query types -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queries</artifactId>
            <version>6.6.0</version>
        </dependency>

        <!-- Keyword highlighting -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-highlighter</artifactId>
            <version>6.6.0</version>
        </dependency>

        <!-- Lucene demo module -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-demo</artifactId>
            <version>6.6.0</version>
        </dependency>
/**
 * Simple article entity that can be converted to and from a Lucene {@link Document}.
 *
 * <p>Field mapping used by {@link #toDocument()}:
 * <ul>
 *   <li>{@code id} - indexed as a {@code LongPoint} (exact/range queries) and stored separately</li>
 *   <li>{@code title}, {@code content} - tokenized, indexed and stored</li>
 *   <li>{@code author} - indexed as a single untokenized term and stored</li>
 *   <li>{@code url} - stored only, not searchable</li>
 * </ul>
 */
public class Article {

	private Long id;

	private String title;

	private String content;

	private String author;

	private String url;

	/** Creates an empty article; populate it through the setters. */
	public Article() {
	}

	/**
	 * Creates a fully populated article.
	 *
	 * @param id      numeric identifier
	 * @param title   article title
	 * @param content article body
	 * @param author  author name
	 * @param url     source URL
	 */
	public Article(Long id, String title, String content, String author,
			String url) {
		this.id = id;
		this.title = title;
		this.content = content;
		this.author = author;
		this.url = url;
	}

	public Long getId() {
		return id;
	}

	public void setId(Long id) {
		this.id = id;
	}

	public String getTitle() {
		return title;
	}

	public void setTitle(String title) {
		this.title = title;
	}

	public String getContent() {
		return content;
	}

	public void setContent(String content) {
		this.content = content;
	}

	public String getAuthor() {
		return author;
	}

	public void setAuthor(String author) {
		this.author = author;
	}

	public String getUrl() {
		return url;
	}

	public void setUrl(String url) {
		this.url = url;
	}

	/**
	 * Converts this article into a Lucene document.
	 *
	 * <p>Properties that are {@code null} are left out of the document instead of
	 * failing, so a partially populated article can still be indexed.
	 *
	 * @return a new document holding every non-null property of this article
	 */
	public Document toDocument() {
		Document doc = new Document();

		if (id != null) {
			// LongPoint only builds the index entry; it does not store the value,
			// so a StoredField with the same name keeps the id retrievable.
			doc.add(new LongPoint("id", id));
			doc.add(new StoredField("id", id));
		}
		if (title != null) {
			// TextField: analyzed (tokenized), indexed, and stored (Store.YES).
			doc.add(new TextField("title", title, Store.YES));
		}
		if (content != null) {
			// Same as title; Store.NO would index the text without storing it.
			doc.add(new TextField("content", content, Store.YES));
		}
		if (author != null) {
			// StringField: indexed as one exact term (not tokenized) and stored.
			doc.add(new StringField("author", author, Store.YES));
		}
		if (url != null) {
			// StoredField: stored only - neither tokenized nor indexed.
			doc.add(new StoredField("url", url));
		}
		return doc;
	}

	/**
	 * Rebuilds an article from a Lucene document.
	 *
	 * @param doc document to read the fields {@code id}, {@code title},
	 *            {@code content}, {@code author} and {@code url} from
	 * @return the article; properties missing from the document (including
	 *         {@code id}) are {@code null}
	 * @throws NumberFormatException if the {@code id} value is present but is not a valid long
	 */
	public static Article parseArticle(Document doc) {
		// Document.get yields null for an absent field; Long.parseLong(null) would throw.
		String storedId = doc.get("id");
		Long id = (storedId == null) ? null : Long.valueOf(storedId);
		return new Article(id, doc.get("title"), doc.get("content"),
				doc.get("author"), doc.get("url"));
	}

	@Override
	public String toString() {
		return "id : " + id + " , title : " + title + " , content : " + content + " , author : " + author + " , url : " + url;
	}

}
/**
 * Lucene walkthrough: create, delete, update and several kinds of search
 * against a single on-disk index.
 *
 * <p>Every directory, analyzer, writer and reader is opened in a
 * try-with-resources statement so it is released even when an operation throws.
 * All examples use {@code IKAnalyzer(true)} so that documents are tokenized the
 * same way at index time and at query time.
 */
public class HelloWorld {

    /** Location of the on-disk Lucene index shared by every example in this class. */
    private static final String INDEX_PATH = "/Users/zx/Documents/dev/lucene/index";

    /** Maximum number of hits requested by each search example. */
    private static final int MAX_HITS = 10;

    /**
     * Writes a single article into the index.
     *
     * @throws IOException if the index cannot be opened or written
     */
    @Test
    public void testCreate() throws IOException {
        Article article = new Article();
        article.setId(108L);
        article.setAuthor("老王");
        article.setTitle("学习大数据");
        article.setContent("学数据,迎娶丁老师!");
        article.setUrl("http://www.edu360.cn/a10011");

        // IKAnalyzer is used rather than StandardAnalyzer, which splits Chinese text
        // one character at a time. The 'true' flag presumably enables IK's smart
        // segmentation mode - confirm against the IK Analyzer documentation.
        try (FSDirectory fsDirectory = FSDirectory.open(Paths.get(INDEX_PATH));
             Analyzer analyzer = new IKAnalyzer(true);
             IndexWriter indexWriter = new IndexWriter(fsDirectory, new IndexWriterConfig(analyzer))) {
            // Convert the entity to a Lucene document and hand it to the writer.
            indexWriter.addDocument(article.toDocument());
            indexWriter.commit();
        }
    }

    /**
     * Parses a query string against the {@code content} field and prints the matches.
     *
     * @throws IOException    if the index cannot be read
     * @throws ParseException if the query string cannot be parsed
     */
    @Test
    public void testSearch() throws IOException, ParseException {
        try (Analyzer analyzer = new IKAnalyzer(true)) {
            // The parser analyzes the query text with the same analyzer used for indexing.
            QueryParser parser = new QueryParser("content", analyzer);
            Query query = parser.parse("数据");

            // Alternative: a TermQuery treats its text as one fixed term (no analysis),
            // e.g. new TermQuery(new Term("url", "http://www.edu360.cn/a10010")).
            printHits(query);
        }
    }

    /**
     * Deletes the document whose {@code id} equals 105.
     *
     * @throws IOException    if the index cannot be opened or written
     * @throws ParseException never thrown by the current body; kept for signature compatibility
     */
    @Test
    public void testDelete() throws IOException, ParseException {
        try (FSDirectory fsDirectory = FSDirectory.open(Paths.get(INDEX_PATH));
             Analyzer analyzer = new IKAnalyzer(true);
             IndexWriter indexWriter = new IndexWriter(fsDirectory, new IndexWriterConfig(analyzer))) {

            // Other ways to select the documents to delete:
            //  - by exact term, which must match an indexed term verbatim (no analysis):
            //      indexWriter.deleteDocuments(new Term("content", "学好"));
            //  - by a parsed query: new QueryParser("title", analyzer).parse("大数据老师")
            //  - by id range: LongPoint.newRangeQuery("id", 99L, 120L)
            // The id queries work because id is indexed as a LongPoint.
            Query query = LongPoint.newExactQuery("id", 105L);

            indexWriter.deleteDocuments(query);
            indexWriter.commit();
        }
    }

    /**
     * Replaces documents by deleting those that match a term and adding a new one;
     * an update in Lucene is a delete followed by an insert.
     *
     * @throws IOException    if the index cannot be opened or written
     * @throws ParseException never thrown by the current body; kept for signature compatibility
     */
    @Test
    public void testUpdate() throws IOException, ParseException {
        Article article = new Article();
        article.setId(106L);
        article.setAuthor("老王");
        article.setTitle("学好大数据,要找赵老师");
        article.setContent("迎娶白富美,走上人生巅峰!!!");
        article.setUrl("http://www.edu360.cn/a111");

        // Must be the same analyzer as testCreate and the search examples; otherwise the
        // replacement document is tokenized differently from the queries run against it.
        try (FSDirectory fsDirectory = FSDirectory.open(Paths.get(INDEX_PATH));
             Analyzer analyzer = new IKAnalyzer(true);
             IndexWriter indexWriter = new IndexWriter(fsDirectory, new IndexWriterConfig(analyzer))) {

            // NOTE(review): the term selects on author, so every document by this author
            // is removed before the new one is added - confirm this is intended.
            indexWriter.updateDocument(new Term("author", "老王"), article.toDocument());
            indexWriter.commit();
        }
    }

    /**
     * Searches several fields at once with a single query string.
     *
     * @throws IOException    if the index cannot be read
     * @throws ParseException if the query string cannot be parsed
     */
    @Test
    public void testMultiField() throws IOException, ParseException {
        String[] fields = {"title", "content"};
        try (Analyzer analyzer = new IKAnalyzer(true)) {
            // Parser that expands the query across all listed fields.
            MultiFieldQueryParser queryParser = new MultiFieldQueryParser(fields, analyzer);
            printHits(queryParser.parse("老师"));
        }
    }

    /**
     * Lists documents using a match-all query (up to {@link #MAX_HITS} hits).
     *
     * @throws IOException    if the index cannot be read
     * @throws ParseException never thrown by the current body; kept for signature compatibility
     */
    @Test
    public void testMatchAll() throws IOException, ParseException {
        printHits(new MatchAllDocsQuery());
    }

    /**
     * Boolean query combining several clauses: title must contain the term
     * "老师" and content must not contain the term "丁".
     *
     * @throws Exception if the index cannot be read
     */
    @Test
    public void testBooleanQuery() throws Exception {
        Query query1 = new TermQuery(new Term("title", "老师"));
        Query query2 = new TermQuery(new Term("content", "丁"));
        BooleanClause bc1 = new BooleanClause(query1, BooleanClause.Occur.MUST);
        BooleanClause bc2 = new BooleanClause(query2, BooleanClause.Occur.MUST_NOT);
        BooleanQuery boolQuery = new BooleanQuery.Builder().add(bc1).add(bc2).build();
        System.out.println(boolQuery);

        printHits(boolQuery);
    }

    /**
     * Uses the query parser syntax ({@code field:term OR field:term}).
     *
     * @throws Exception if the index cannot be read or the query cannot be parsed
     */
    @Test
    public void testQueryParser() throws Exception {
        try (Analyzer analyzer = new IKAnalyzer(true)) {
            // Argument 1: default search field; argument 2: analyzer.
            QueryParser queryParser = new QueryParser("title", analyzer);

            // A plain keyword such as "数据" would search the default field instead.
            Query query = queryParser.parse("title:学好 OR title:学习");
            System.out.println(query);

            printHits(query);
        }
    }

    /**
     * Range query on the numeric {@code id} field, from 107 to 108.
     *
     * @throws Exception if the index cannot be read
     */
    @Test
    public void testRangeQuery() throws Exception {
        Query query = LongPoint.newRangeQuery("id", 107L, 108L);
        System.out.println(query);

        printHits(query);
    }

    /**
     * Runs {@code query} against the index at {@link #INDEX_PATH} and prints each
     * hit as an {@link Article}.
     *
     * @param query the query to execute
     * @throws IOException if the index cannot be opened or read
     */
    private static void printHits(Query query) throws IOException {
        try (FSDirectory fsDirectory = FSDirectory.open(Paths.get(INDEX_PATH));
             DirectoryReader directoryReader = DirectoryReader.open(fsDirectory)) {
            IndexSearcher indexSearcher = new IndexSearcher(directoryReader);

            // The search yields internal document ids and scores only ...
            TopDocs topDocs = indexSearcher.search(query, MAX_HITS);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                // ... so the stored fields are loaded per hit and mapped back to the entity.
                Document document = indexSearcher.doc(scoreDoc.doc);
                System.out.println(Article.parseArticle(document));
            }
        }
    }
}

lucene的查询种类:

  1. term查询,不分词,严格匹配内容
  2. 分词查找
  3. 多字段查询
  4. Boolean查询
  5. 范围查询
  6. 确切值查询
  7. 查询所有

分词器:标准分词器StandardAnalyzer会把中文的每个字都单独分成一个词,中文最好的是IKAnalyzer,能够对中文智能分词。为了更智能,可以加上自己的扩展词库,在resources下新建IKAnalyzer.cfg.xml,进行扩展词和停止词(禁用关键字)的设置。

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">  
<properties>  
	<comment>IK Analyzer 扩展配置</comment>
	<!-- Users can configure their own extension dictionaries here -->
	<entry key="ext_dict">ext.dic;</entry> 
	
	<!-- Users can configure their own extension stop-word dictionaries here -->
	<entry key="ext_stopwords">stopword.dic;</entry> 
	
</properties>
  • 20
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值