【ES】elasticsearch基于新闻的高亮搜索

基于新闻的高亮搜索

1. 自定义analyzer
# 自定义分词器, news_analyzer是自定义的分词器的名字
PUT news
{
  "settings": {
    "analysis": {
      "analyzer": {
        "news_analyzer" : {
          "char_filter": ["html_strip"],
          "tokenizer": "keyword",
          "filter": [
             "my_lexicon_filter"
          ]
        },
        "news_search_analyzer": {
          "char_filter": ["html_strip"],
          "tokenizer": "keyword",
          "filter": ["lowercase"]
        }
      },
      "filter": {
        "my_lexicon_filter": {
          "type": "pinyin",
          "keep_first_letter": true,
          "keep_full_pinyin": false,
          "keep_none_chinese": false,
          "keep_separate_first_letter": false,
          "keep_joined_full_pinyin": true,
          "keep_none_chinese_in_joined_full_pinyin": true,
          "none_chinese_pinyin_tokenize": false,
          "limit_first_letter_length": 16,
          "keep_original": true
        } 
      }
    }
  }
}
2. 定义mappings
PUT news/_mapping
{
    "properties": {
      "id": {
        "type": "long"
      },
      "title": {
        "type": "text",
        "analyzer": "hanlp_index"
      },
      "url": {
        "type": "keyword"
      },
      "content": {
        "type": "text",
        "analyzer": "hanlp_index"
      },
      "tags": {
        "type": "completion",
        "analyzer": "news_analyzer",
        "search_analyzer": "news_search_analyzer"
      }
    }
}

设置mappings的时候,可以指定 "dynamic": false,意思是如果写入的文档中包含 mappings 里没有定义的字段,该字段的数据仍会保存在 _source 中并随查询结果返回,但不会被索引(也就不能按该字段进行搜索)。

3. 导入mysql的数据集

1.将news.sql导入mysql数据库

2.将mysql驱动包放在D:\elasticsearch\logstash-7.4.2\logstash-core\lib\jars目录下

3.将logstash-mysql-news.conf放在D:\elasticsearch\logstash-7.4.2\config目录下

4.进到logstash的bin目录下,执行:logstash.bat -f D:\elasticsearch\logstash-7.4.2\config\logstash-mysql-news.conf命令,开始导入数据。

4.编写suggestion与query

搜索要使用的suggestion

GET news/_search
{
  "_source": ["id"], 
  "suggest": {
    "tags_suggest": {
      "prefix": "中",
      "completion": {
        "field": "tags",
        "skip_duplicates": true,
        "size": 10
      }
    }
  }
}

**注:** 在使用 suggestion 的时候,设置 "skip_duplicates": true 表示如果出现相同的建议,只会保留一个。

搜索要使用的query

GET news/_search
{
  "_source": ["url"], 
  "query": {
    "multi_match": {
      "query": "中国赴塞尔维亚抗疫专家",
      "fields": ["title", "content"]
    }
  },
  "highlight": {
    "post_tags": "</span>",
    "pre_tags": "<span>",
    "fields": {
      "title": {},
      "content": {}
    }
  }
}
5.依赖
<dependencies>
	<dependency>
		<groupId>org.springframework.boot</groupId>
		<artifactId>spring-boot-starter-web</artifactId>
	</dependency>
<!-- https://mvnrepository.com/artifact/org.springframework.data/spring-data-elasticsearch -->
	<dependency>
         <groupId>org.springframework.boot</groupId>
         <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
    </dependency>
<!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
	<dependency>
		<groupId>com.alibaba</groupId>
		<artifactId>fastjson</artifactId>
		<version>1.2.62</version>
	</dependency>
</dependencies>
6.编写ElasticsearchConfig
package com.qf.config;

import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.core.convert.support.DefaultConversionService;
import org.springframework.data.elasticsearch.config.ElasticsearchConfigurationSupport;
import org.springframework.data.elasticsearch.core.ElasticsearchEntityMapper;
import org.springframework.data.elasticsearch.core.ElasticsearchTemplate;
import org.springframework.data.elasticsearch.core.EntityMapper;
import java.net.InetAddress;
import java.net.UnknownHostException;

/**
 * Spring configuration that wires a low-level Elasticsearch {@link TransportClient}
 * and an {@link ElasticsearchTemplate} on top of it.
 *
 * NOTE(review): the TransportClient is deprecated in ES 7.x and removed in 8.x;
 * the REST high-level client is the recommended replacement — consider migrating.
 */
@Configuration
public class ElasticsearchConfig extends ElasticsearchConfigurationSupport {
    /**
     * Builds the transport client connected to a local single-node cluster.
     *
     * @return a {@link Client} talking to 127.0.0.1:9300 (the transport port,
     *         not the 9200 REST port)
     * @throws UnknownHostException if the loopback address cannot be resolved
     */
    @Bean
    public Client elasticsearchClient() throws UnknownHostException {
        // "elasticsearch" is the default cluster name; it must match the
        // server's cluster.name or the client will reject the connection.
        Settings settings = Settings.builder().put("cluster.name", "elasticsearch").build();
        TransportClient client = new PreBuiltTransportClient(settings);
        client.addTransportAddress(new TransportAddress(InetAddress.getByName("127.0.0.1"), 9300));
        return client;
    }

    /**
     * Exposes the template under both conventional bean names so either
     * {@code @Qualifier("elasticsearchTemplate")} or
     * {@code @Qualifier("elasticsearchOperations")} injection works.
     */
    @Bean(name = {"elasticsearchOperations", "elasticsearchTemplate"})
    public ElasticsearchTemplate elasticsearchTemplate() throws UnknownHostException {
        return new ElasticsearchTemplate(elasticsearchClient(), entityMapper());
    }

    // Override the default Jackson-based mapper with the ElasticsearchEntityMapper
    // so entity mapping honors Spring Data Elasticsearch annotations/conversions.
    @Bean
    @Override
    public EntityMapper entityMapper() {
        ElasticsearchEntityMapper entityMapper = new ElasticsearchEntityMapper(elasticsearchMappingContext(),
                new DefaultConversionService());
        entityMapper.setConversions(elasticsearchCustomConversions());
        return entityMapper;
    }
}
7.POJO类的编写
package com.qf.entity;

import org.springframework.data.elasticsearch.annotations.Document;

/**
 * Entity mapped to the "news" index; instances are deserialized from the
 * {@code _source} of search hits (see NewsSearchController).
 *
 * NOTE(review): the index mapping declares {@code id} as type "long" but this
 * field is {@code Integer} — ids above Integer.MAX_VALUE would overflow; confirm
 * the id range before relying on this. The mapping's "tags" completion field is
 * intentionally absent here: it is only used by the suggester, never fetched.
 */
@Document(indexName = "news", type = "_doc")
public class News {

    // document id (mapped as "long" in the index — see class note)
    private Integer id;
    // original article URL (mapped as "keyword")
    private String url;
    // article title; may contain highlight markup after a search
    private String title;
    // article body; may contain highlight markup after a search
    private String content;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }
}
8. 编写NewsTipController
package com.qf.controller;

import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.search.suggest.Suggest;
import org.elasticsearch.search.suggest.SuggestBuilder;
import org.elasticsearch.search.suggest.completion.CompletionSuggestion;
import org.elasticsearch.search.suggest.completion.CompletionSuggestionBuilder;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.data.elasticsearch.core.ElasticsearchTemplate;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

// News search-as-you-type suggestions (completion suggester on the "tags" field)
@RestController
@RequestMapping("/tip")
public class NewsTipController {

    private ElasticsearchTemplate elasticsearchTemplate;

    public NewsTipController(@Qualifier("elasticsearchTemplate")
                                     ElasticsearchTemplate elasticsearchTemplate) {
        this.elasticsearchTemplate = elasticsearchTemplate;
    }

    /**
     * Returns up to 10 distinct completion suggestions whose prefix matches
     * {@code text}.
     *
     * NOTE(review): the name "movieSuggest" looks copied from a movie-search
     * example — "newsSuggest" would match this project; renaming is safe since
     * the route is derived from @GetMapping, not the method name.
     *
     * @param text the user's partial input (prefix to complete)
     * @return de-duplicated suggestion strings (unordered, since a HashSet is used)
     */
    @GetMapping
    public Set<String> movieSuggest(String text) {
        /**
         * Step 1: build the CompletionSuggestionBuilder against the "tags" field.
         */
        CompletionSuggestionBuilder titlePrefixSuggest = new CompletionSuggestionBuilder("tags")
                .prefix(text)
                .size(10)   // maximum number of suggestions to return
                .skipDuplicates(true);  // collapse identical suggestions into one

        /**
         * Step 2: wrap it in a SuggestBuilder, which aggregates all named suggestions.
         */
        SuggestBuilder suggestBuilder = new SuggestBuilder()
                .addSuggestion("tag_prefix_suggestion", titlePrefixSuggest);


        /**
         * Step 3: build the search request.
         */
        SearchRequestBuilder searchRequestBuilder = elasticsearchTemplate.getClient()
                .prepareSearch("news")   // index to search in
                .suggest(suggestBuilder);


        // Execute the request synchronously.
        SearchResponse response = searchRequestBuilder.get();

        Suggest suggestResult = response.getSuggest();  // suggestion section of the response

        // Look up our named suggestion in the result.
        // NOTE(review): raw Suggest.Suggestion / List<Object> below lose type
        // safety; the instanceof check compensates, but the generic form
        // Suggestion<? extends Entry<? extends Option>> would be cleaner.
        Suggest.Suggestion suggestion = suggestResult.getSuggestion("tag_prefix_suggestion");

        Set<String> suggestionResult = new HashSet<>();

        List<Object> list = suggestion.getEntries();
        if(null != list && list.size() > 0){
            Object object = list.get(0);
            if(object instanceof CompletionSuggestion.Entry) {
                CompletionSuggestion.Entry resultEntry = (CompletionSuggestion.Entry)object;
                List<CompletionSuggestion.Entry.Option> options = resultEntry.getOptions();
                if(null != options && options.size() > 0) {
                    for(CompletionSuggestion.Entry.Option opt : options) {
                        Text txt = opt.getText();
                        suggestionResult.add(txt.toString());
                    }
                }
            }
        }

        return suggestionResult;
    }
}
9.编写NewsSearchController
package com.qf.controller;

import com.alibaba.fastjson.JSON;
import com.qf.entity.News;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
import org.elasticsearch.index.search.MultiMatchQuery;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.data.elasticsearch.core.ElasticsearchTemplate;
import org.springframework.data.elasticsearch.core.ResultsExtractor;
import org.springframework.data.elasticsearch.core.query.FetchSourceFilter;
import org.springframework.data.elasticsearch.core.query.NativeSearchQueryBuilder;
import org.springframework.data.elasticsearch.core.query.SearchQuery;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import org.elasticsearch.common.text.Text;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

// 新闻内容搜索以及前端高亮显示
@RestController
@RequestMapping("/news")
public class NewsSearchController {

    private ElasticsearchTemplate elasticsearchTemplate;

    public NewsSearchController(@Qualifier("elasticsearchTemplate")
                                     ElasticsearchTemplate elasticsearchTemplate) {
        this.elasticsearchTemplate = elasticsearchTemplate;
    }

    /**
     * GET news/_search
     * {
     *   "_source": ["url"],
     *   "query": {
     *     "multi_match": {
     *       "query": "中国",
     *       "fields": ["title", "content"]
     *     }
     *   },
     *   "highlight": {
     *     "pre_tags": "<font color='red'>",
     *     "post_tags": "</font>",
     *     "fields": {
     *       "title": {},
     *       "content": {}
     *     }
     *   }
     * }
     */
    @GetMapping("/search")
    public List<News> searchNews(String searchText) {

        MultiMatchQueryBuilder multiMatchQuery = new MultiMatchQueryBuilder(searchText, "title", "content");

        HighlightBuilder highlightBuilder = new HighlightBuilder()
                .preTags("<font color='red'>")
                .postTags("</font>")
                .field("title")
                .field("content");

        SearchQuery searchQuery = new NativeSearchQueryBuilder()
                .withIndices("news")
                .withQuery(multiMatchQuery)
                .withHighlightBuilder(highlightBuilder)
                .withSourceFilter(new FetchSourceFilter(new String[]{"url", "id", "title"}, new String[]{}))
                .build();

        /**
         * query() 方法的返回值就是实现了 ResultsExtractor 接口的 extract 的这个方法的返回值
         */
        return elasticsearchTemplate.query(searchQuery, new NewsResultsExtractor());
    }

    class NewsResultsExtractor implements ResultsExtractor<List<News>> {

        // 方法的 response, 就是查询之后的结果;但是我们需要处理成为 List<News>
        @Override
        public List<News> extract(SearchResponse response) {
            // 获取命中的搜索
            SearchHit[] hits = response.getHits().getHits();

            List<News> newsList = new ArrayList<>();

            for(SearchHit hit : hits) {
                // 获取到新闻的json数据,但是对于当前我们本身的案例来说,我们只取了 url
                String newsJson = hit.getSourceAsString();
                // 对news的json数据的反序列化
                News news = JSON.parseObject(newsJson, News.class);

                // 获取高亮的字段部分
                Map<String, HighlightField> highlightFieldMap = hit.getHighlightFields();

                HighlightField titleField  = highlightFieldMap.get("title");
                HighlightField contentField  = highlightFieldMap.get("content");

                // 处理title部分
                if(null != titleField) {
                    StringBuffer titles = new StringBuffer();
                    // 高亮的处理是将 包含了关键字的内容进行分段 截取,所以返回值为一个数组
                    Text[] titleFragments = titleField.getFragments();
                    if(null != titleFragments && titleFragments.length > 0) {
                        for (Text text : titleFragments) {
                            titles.append(text.toString());
                        }
                    }

                    news.setTitle(titles.toString());
                }

                if(null != contentField) {
                    StringBuffer contents = new StringBuffer();
                    // 高亮的处理是将 包含了关键字的内容进行分段 截取,所以返回值为一个数组
                    Text[] contentFragments = contentField.getFragments();
                    if(null != contentFragments && contentFragments.length > 0) {
                        for (Text text : contentFragments) {
                            contents.append(text.toString());
                        }
                    }

                    news.setContent(contents.toString());
                }

                newsList.add(news);
            }

            return newsList;
        }
    }
}
10.前端的实现
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <link rel="stylesheet" href="css/bootstrap.min.css">
    <link rel="stylesheet" href="css/jquery-ui.min.css">
    <script src="js/jquery-3.5.0.js"></script>
    <script src="js/jquery-ui.min.js"></script>
    <script src="js/bootstrap.min.js"></script>
    <script src="js/vue.js"></script>
    <script src="js/axios.min.js"></script>
    <style>
        .desc-text {
            height: 50px;
            overflow: hidden;
        }
        a,a:link, a:visited, a:hover, a:active {
            text-decoration: none;
        }
    </style>
</head>
<body>
<div class="container-fluid">
    <div class="row mt-3 pb-3 mb-3" style="border-bottom: 1px solid #e2e3e5;">
        <div class="col-10">
            <form class="form-inline" onsubmit="javascript: return false;">
                <div class="form-group col-6">
                    <input class="form-control col" id="search-text" onkeyup="searchNews()">
                </div>
                <button type="submit" class="btn btn-primary col-1">搜索一下</button>
            </form>
        </div>
    </div>
    <div id="app">
        <!-- v-html is required to render the <font> highlight tags returned by
             the server; the content comes from our own index, not user input. -->
        <div v-for="n in news" class="row mb-3" :key="n.id">
            <div class="col-10">
                <h4><a target="_blank" :href="n.url"><span v-html="n.title"></span></a></h4>
                <p v-html="n.content">
                </p>
            </div>
        </div>
    </div>
</div>
</body>
<script>
    var vm = new Vue({
        el: '#app',
        data() {
            return {
                news: []
            }
        }
    })

    // jQuery UI autocomplete backed by the /tip suggestion endpoint.
    $('#search-text').autocomplete({
        delay: 300,  // wait 300ms after the last keystroke before querying
        source: function(request, cb) {
            $.ajax({
                url: 'tip',
                data: {text: request.term},
                type: 'get',
                dataType: 'json',
                success: function(_data) {
                    // _data is already an array of suggestion strings.
                    cb(_data || []);
                }
            })
        },
        minLength: 1   // minimum characters typed before requesting suggestions
                       // (was "minlength" — jQuery UI silently ignores the
                       // misspelled option and falls back to its default)
    })

    // Fetch search results and replace the list reactively.
    function searchNews() {
        let searchText = $('#search-text').val();
        if (searchText && searchText.trim()) {
            // Pass the text via params so axios URL-encodes it; raw string
            // concatenation breaks on characters like & or #.
            axios.get('news/search', { params: { searchText: searchText } })
                .then(res => {
                    vm.news = res.data;
                })
        }
    }
</script>
</html>
11.页面效果

在这里插入图片描述

length: 1 // 最低输入多少个字母就往服务器端发送请求
})

function searchNews() {

        let searchText = $('#search-text').val();  //拿到搜索内容
        if(searchText && searchText.trim()) {
            vm.news = [];
            axios.get('news/search?searchText=' + searchText)
                .then(res => {
                for(let i = 0; i < res.data.length; i++) {
                vm.news.push(res.data[i])
            }
        })
        }
}
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
如果您下载了本程序,但是该程序存在问题无法运行,那么您可以选择退款或者寻求我们的帮助(如果找我们帮助的话,是需要追加额外费用的)。另外,您不会使用资源的话(这种情况不支持退款),也可以找我们帮助(需要追加额外费用) 爬虫(Web Crawler)是一种自动化程序,用于从互联网上收集信息。其主要功能是访问网页、提取数据并存储,以便后续分析或展示。爬虫通常由搜索引擎、数据挖掘工具、监测系统等应用于网络数据抓取的场景。 爬虫的工作流程包括以下几个关键步骤: URL收集: 爬虫从一个或多个初始URL开始,递归或迭代地发现新的URL,构建一个URL队列。这些URL可以通过链接分析、站点地图、搜索引擎等方式获取。 请求网页: 爬虫使用HTTP或其他协议向目标URL发起请求,获取网页的HTML内容。这通常通过HTTP请求库实现,如Python中的Requests库。 解析内容: 爬虫对获取的HTML进行解析,提取有用的信息。常用的解析工具有正则表达式、XPath、Beautiful Soup等。这些工具帮助爬虫定位和提取目标数据,如文本、图片、链接等。 数据存储: 爬虫将提取的数据存储到数据库、文件或其他存储介质中,以备后续分析或展示。常用的存储形式包括关系型数据库、NoSQL数据库、JSON文件等。 遵守规则: 为避免对网站造成过大负担或触发反爬虫机制,爬虫需要遵守网站的robots.txt协议,限制访问频率和深度,并模拟人类访问行为,如设置User-Agent。 反爬虫应对: 由于爬虫的存在,一些网站采取了反爬虫措施,如验证码、IP封锁等。爬虫工程师需要设计相应的策略来应对这些挑战。 爬虫在各个领域都有广泛的应用,包括搜索引擎索引、数据挖掘、价格监测、新闻聚合等。然而,使用爬虫需要遵守法律和伦理规范,尊重网站的使用政策,并确保对被访问网站的服务器负责。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值