Elasticsearch (爬虫)集成

1、部分效果展示

  • 同步到es

    es_list

  • 高亮查询

    list_high

2、ELK简介

  • “ELK”是三个开源项目的首字母缩写,这三个项目分别是:Elasticsearch、Logstash 和 Kibana。
  • Elasticsearch 是一个搜索和分析引擎。Logstash 是服务器端数据处理管道,能够同时从多个来源采集数据,转换数据,然后将数据发送到诸如 Elasticsearch 等“存储库”中。Kibana 则可以让用户在 Elasticsearch 中使用图形和图表对数据进行可视化。

3、启动es

  • 启动VMware虚拟机,在命令行切换到es用户,执行 /opt/elasticsearch-7.9.3/bin/elasticsearch -d

4、后端集成

4.1、导入依赖与yml配置
  • pom.xml
<!-- Inherit from the Spring Boot parent POM so dependency versions are managed in one place -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.5.5</version>
        <!-- relativePath: where Maven looks for the parent POM. An empty <relativePath/> skips the default ../pom.xml lookup, so the parent is resolved from the repositories instead. -->
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

     <dependencies>
        <!-- WebMagic crawler; its slf4j-log4j12 binding is excluded -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.8.0</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.8.0</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <!-- Elasticsearch (Spring Data Elasticsearch starter) -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>

        <!-- MyBatis-Plus starter -->
        <dependency>
            <groupId>com.baomidou</groupId>
            <artifactId>mybatis-plus-boot-starter</artifactId>
            <version>3.5.3</version>
        </dependency>

        <!-- Lombok -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.20</version>
        </dependency>

        <!-- Web starter (controller layer) -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>

        <!-- Unit testing (JUnit 4) -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13.2</version>
            <scope>test</scope>
        </dependency>

        <!-- Spring Boot test support -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
        </dependency>

        <!-- Hutool utility library -->
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>5.8.11</version>
        </dependency>

        <!-- Pagination plugin (PageHelper) -->
        <dependency>
            <groupId>com.github.pagehelper</groupId>
            <artifactId>pagehelper-spring-boot-starter</artifactId>
            <version>1.4.1</version>
        </dependency>
    </dependencies>

XML

  • application.yml
# HTTP port the application listens on
server:
    port: 8989

spring:
    # MySQL connection used by MyBatis-Plus
    datasource:
        driver-class-name: com.mysql.cj.jdbc.Driver
        url: jdbc:mysql://localhost:3306/eee?serverTimezone=UTC&useUnicode=true&characterEncoding=utf-8
        username: root
        password: root

    elasticsearch:
        rest:
            # Address (host:port) of the Elasticsearch node
            uris: 192.168.126.133:9200

# MyBatis-Plus integration
mybatis-plus:
    # Mapper XML files are loaded from any mapper/ directory on the classpath
    mapper-locations: classpath*:mapper/*.xml
    configuration:
        # MyBatis logging implementation that writes to standard output
        log-impl: org.apache.ibatis.logging.stdout.StdOutImpl

YAML

4.2、爬取房源信息
package com.llh;

import com.llh.domain.WebHouse;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;

import java.util.ArrayList;
import java.util.List;

/**
 * WebMagic {@link PageProcessor} that scrapes rental listings from a result page
 * and publishes them as a {@code List<WebHouse>} under the result key {@code "house"}.
 *
 * User: sunjunfu
 * DateTime: 2023/9/23 8:54
 */
public class House implements PageProcessor {
    /** Crawler settings: 3 retries and a sleep time of 100 between requests. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * Extracts title, price, layout and broker of every listing on the page and
     * stores the resulting list in the page's result items under {@code "house"}.
     *
     * @param page the downloaded page to parse
     */
    @Override
    public void process(Page page) {
        // Title
        List<String> titles = page.getHtml().xpath("//div[@class='zu-info']/h3/a/b/text()").all();
        // Price
        List<String> prices = page.getHtml().xpath("//div[@class='zu-side']/p/strong/b/text()").all();
        // Broker (raw text of the details line; the name is extracted below)
        List<String> brokers = page.getHtml().xpath("//p[@class='details-item tag']/text()").all();
        // Layout: first and second <b> of the details line (rooms / halls)
        List<String> b1s = page.getHtml().xpath("//p[@class='details-item tag']/b[1]/text()").all();
        List<String> b2s = page.getHtml().xpath("//p[@class='details-item tag']/b[2]/text()").all();
        System.err.println(titles);
        System.err.println(prices);
        System.err.println(brokers);
        System.err.println(b1s);
        System.err.println(b2s);

        // The five XPath queries are independent, so their result lists may differ in
        // length if the page markup changes; only process rows present in all of them.
        int count = Math.min(titles.size(),
                Math.min(prices.size(),
                        Math.min(brokers.size(), Math.min(b1s.size(), b2s.size()))));

        List<WebHouse> list = new ArrayList<>(count);

        for (int i = 0; i < count; i++) {
            // Populate the entity
            WebHouse house = new WebHouse();
            house.setTitle(titles.get(i));
            house.setPrice(prices.get(i));
            // Join the two layout numbers into e.g. "3室1厅"
            house.setArea(b1s.get(i) + "室" + b2s.get(i) + "厅");
            house.setBroker(extractBroker(brokers.get(i)));

            list.add(house);
        }
        System.out.println(list);
        page.putField("house", list);
    }

    /**
     * Returns the second space-separated token of the trimmed text, which is where
     * the broker name sits; falls back to the whole trimmed text when there is no
     * second token instead of throwing.
     */
    private static String extractBroker(String raw) {
        String trimmed = raw.trim();
        String[] parts = trimmed.split(" ");
        return parts.length > 1 ? parts[1] : trimmed;
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Runs the crawler once against the listing page and reports how many listings were scraped. */
    public static void main(String[] args) {
        Spider.create(new House())
              .addUrl("https://bj.zu.anjuke.com/?t=1&from=0&comm_exist=on&kw=3%E5%B1%85%E5%AE%A4")
              .addPipeline(new Pipeline() {
                  @Override
                  public void process(ResultItems resultItems, Task task) {
                      // "house" holds the List<WebHouse> stored by House#process;
                      // reading it as a single WebHouse would fail with ClassCastException.
                      List<WebHouse> houses = resultItems.get("house");
                      if (houses != null) {
                          System.out.println("Scraped " + houses.size() + " listings");
                      }
                  }
              })
              .thread(5)
              .run();
    }
}

Java

4.3、同步到es
  • 同步之前,需要在实体类上添加 @Document 注解,并为要查询的字段指定类型和分词器

    house

  • 使用 Spring Data Elasticsearch 提供的 Repository 接口完成同步
package com.llh.repository;

import com.llh.domain.WebHouse;
import org.springframework.data.repository.CrudRepository;
import org.springframework.stereotype.Repository;

/**
 * Spring Data repository for {@link WebHouse} documents keyed by an {@code Integer} id.
 * Declares no methods of its own; all operations are inherited from {@link CrudRepository}.
 *
 * User: sunjunfu
 * DateTime: 2023/9/23 9:56
 */
@Repository
public interface WebHouseRepository extends CrudRepository<WebHouse,Integer> {
}

Java

  • 在测试类中爬取数据,并将其同步到 es
@Test
    public void test3(){
        String listingUrl = "https://bj.zu.anjuke.com/?t=1&from=0&comm_exist=on&kw=3%E5%B1%85%E5%AE%A4";

        // For every crawled page: write the scraped listings to the database first,
        // then push the same list to Elasticsearch.
        Pipeline persistListings = (resultItems, task) -> {
            List<WebHouse> scraped = resultItems.get("house");
            webHouseService.saveBatch(scraped);
            webHouseRepository.saveAll(scraped);
        };

        Spider.create(new House())
              .addUrl(listingUrl)
              .addPipeline(persistListings)
              .thread(5)
              .run();
    }

Java

4.4、编写测试接口
  • WebHouseController.java
package com.llh.controller;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import com.llh.domain.WebHouse;
import com.llh.form.HouseForm;
import com.llh.repository.WebHouseRepository;
import com.llh.service.WebHouseService;
import com.llh.utils.Result;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Pageable;
import org.springframework.data.elasticsearch.core.ElasticsearchRestTemplate;
import org.springframework.data.elasticsearch.core.SearchHit;
import org.springframework.data.elasticsearch.core.SearchHits;
import org.springframework.data.elasticsearch.core.query.NativeSearchQuery;
import org.springframework.data.elasticsearch.core.query.NativeSearchQueryBuilder;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import javax.annotation.PostConstruct;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * REST controller exposing a paged, highlighted title search over the
 * {@link WebHouse} documents stored in Elasticsearch.
 *
 * User: sunjunfu
 * DateTime: 2023/9/23 10:19
 */
@RestController
@RequestMapping("/house")
public class WebHouseController {
    /** Page size used when the request carries no positive page size. */
    private static final int DEFAULT_PAGE_SIZE = 10;

    @Autowired
    WebHouseService webHouseService;

    @Autowired
    WebHouseRepository webHouseRepository;

    @Autowired
    ElasticsearchRestTemplate elasticsearchRestTemplate;

    /** On startup, loads every listing from the service and saves them all to Elasticsearch. */
    @PostConstruct
    public void init(){
        List<WebHouse> houses = webHouseService.list();
        webHouseRepository.saveAll(houses);
    }

    /**
     * Searches listings by title and returns one page of results with the matched
     * title terms wrapped in a red {@code <font>} tag.
     *
     * @param house request parameters: optional {@code title} filter plus 1-based
     *              {@code pageNum} and {@code pageSize}; a missing or non-positive
     *              {@code pageNum} selects the first page and a missing or
     *              non-positive {@code pageSize} falls back to {@link #DEFAULT_PAGE_SIZE}
     * @return a successful {@link Result} wrapping the total hit count and the page of listings
     */
    @RequestMapping("/list")
    public Result list(WebHouse house){

        // Highlight matches in the title field.
        // NOTE(review): the highlighted fragment is raw HTML built around the stored
        // title; clients rendering it as HTML should confirm the title is trusted or escaped.
        HighlightBuilder.Field[] highlightSpecs = new HighlightBuilder.Field[1];
        highlightSpecs[0] = new HighlightBuilder.Field("title")
              .preTags("<font style='color: red'>")
              .postTags("</font>");

        // Paging: the request is 1-based, PageRequest is 0-based. Clamp instead of
        // letting PageRequest.of reject a negative index or a size below one.
        Integer requestedPage = house.getPageNum();
        Integer requestedSize = house.getPageSize();
        int pageIndex = (requestedPage == null || requestedPage < 1) ? 0 : requestedPage - 1;
        int pageSize = (requestedSize == null || requestedSize < 1) ? DEFAULT_PAGE_SIZE : requestedSize;
        Pageable page = PageRequest.of(pageIndex, pageSize);

        // Compound query; with no title filter the bool query has no clauses.
        BoolQueryBuilder boolQueryBuilder = QueryBuilders.boolQuery();
        if(StrUtil.isNotBlank(house.getTitle())){
            // Full-text match on the title field
            MatchQueryBuilder matchQuery = QueryBuilders.matchQuery("title", house.getTitle());
            // Must be satisfied
            boolQueryBuilder.must(matchQuery);
        }

        // Assemble the search request
        NativeSearchQuery query = new NativeSearchQueryBuilder()
              .withQuery(boolQueryBuilder)
              .withPageable(page)
              .withHighlightFields(highlightSpecs)
              .build();
        // Run the search
        SearchHits<WebHouse> search = elasticsearchRestTemplate.search(query, WebHouse.class);
        // Total number of hits
        long totalHits = search.getTotalHits();
        List<WebHouse> list = new ArrayList<>();

        for (SearchHit<WebHouse> houseSearchHit : search) {
            WebHouse webHouse = houseSearchHit.getContent();

            // Replace the plain title with its highlighted fragment when there is one
            Map<String, List<String>> highlightFields = houseSearchHit.getHighlightFields();
            List<String> title = highlightFields.get("title");
            if(CollUtil.isNotEmpty(title)){
                webHouse.setTitle(title.get(0));
            }
            list.add(webHouse);
        }

        // Wrap total count and page content
        HouseForm houseForm = new HouseForm(totalHits, list);
        return Result.success(houseForm);
    }

}

Java

5、前端页面

5.1、编写展示页面
  • ListView.vue
<script>
import axios from "axios";
import qs from "qs";

// Listing page: title search form, result table with highlighted titles, and pager.
export default {
    data(){
        return{
            // Query parameters posted to /house/list
            formInline:{
                title:'',
                pageNum:1,
                pageSize:6
            },
            tableData:[],
            total:0
        }
    },
    methods:{
        // Pager callback: load the page the user selected
        page(newPageNum) {
            this.formInline.pageNum = newPageNum
            this.onSubmit()
        },
        // Search button: a new query always starts from the first page, otherwise a
        // search issued from page N could land past the end of the new result set
        search() {
            this.formInline.pageNum = 1
            this.onSubmit()
        },
        // Post the current form as form-encoded data and refresh table and total
        onSubmit(){
            axios.post("/house/list",qs.stringify(this.formInline)).then(res=>{
                this.tableData = res.data.data.list
                this.total = res.data.data.total
            })
        }
    },
    created() {
        this.onSubmit();
    }
}
</script>

<template>
<div>
    <el-form :inline="true" :model="formInline" class="demo-form-inline">
        <el-form-item label="标题">
            <el-input v-model="formInline.title" placeholder="标题" clearable="clearable"></el-input>
        </el-form-item>
        
        <el-form-item>
            <el-button type="primary" @click="search">查询</el-button>
        </el-form-item>
    </el-form>
    
    <el-table
        :data="tableData"
        border
        style="width: 100%;">
        <el-table-column
            prop="id"
            label="编号"
            width="180">
        </el-table-column>
        <el-table-column
            prop="title"
            label="标题"
            width="180">
            <!-- The title may contain highlight markup from the backend, so it is rendered as HTML -->
            <template v-slot="scope">
                <span v-html="scope.row.title"></span>
            </template>
        </el-table-column>
        <el-table-column
            prop="price"
            label="价格/月"
            width="180">
        </el-table-column>
        <el-table-column
            prop="area"
            label="规格"
            width="180">
        </el-table-column>
        <el-table-column
            prop="broker"
            label="中间人"
            width="180">
        </el-table-column>
    </el-table>
    
    <el-pagination
        background
        layout="prev, pager, next"
        :current-page="formInline.pageNum"
        :page-size="formInline.pageSize"
        :total="total"
        @current-change="page">
    </el-pagination>
    
</div>
</template>

<style scoped>

</style>

JavaScript

  • 8
    点赞
  • 11
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值