java-爬虫-es

seeyoutlb

已于 2022-12-14 20:27:17 修改

阅读量950

点赞数 1

分类专栏：中间件　文章标签：java 爬虫 elasticsearch

于 2022-12-14 20:26:43 首次发布

本文链接：https://blog.csdn.net/User_bie/article/details/128282885

版权

中间件专栏收录该内容

7 篇文章

订阅专栏

文章目录

文献：https://www.kuangstudy.com/bbs/1354069127022583809

1.数据来源：数据库、mq、爬虫

2.爬虫：获取想要的页面数据

1.导入依赖

jsoup：适合抓取并解析 HTML 网页内容
tika：适合从文档、音频、视频等文件中提取文本和元数据（本身不负责抓取网页）

<!-- jsoup: HTML parser used by the crawler code below to fetch and query the search page. -->
<!-- NOTE(review): 1.10.2 looks like an old release; check for a newer jsoup version before reuse. -->
<dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>

2.爬取核心部分编码

/** Manual smoke test: crawls the JD results for "vue" and prints every parsed product. */
public static void main(String[] args) throws Exception {
        HtmlParseUtil parser = new HtmlParseUtil();
        for (Content product : parser.parseJD("vue")) {
            System.out.println(product);
        }
    }

    /**
     * Fetches the JD search result page for a keyword and extracts the listed products.
     *
     * @param keywords search keyword; it is URL-encoded before being placed in the query string
     * @return one {@link Content} per {@code <li>} found under the {@code J_goodsList} element;
     *         an empty list when the page has no such element
     * @throws IllegalArgumentException if {@code keywords} is {@code null}
     * @throws Exception if the page cannot be fetched or parsed
     */
    public List<Content> parseJD(String keywords) throws Exception {
        if (keywords == null) {
            throw new IllegalArgumentException("keywords must not be null");
        }
        // Request URL, e.g. https://search.jd.com/Search?keyword=java
        // Encode the keyword so spaces, '&', '#' and non-ASCII text stay a single query value.
        String url = "https://search.jd.com/Search?keyword="
                + java.net.URLEncoder.encode(keywords, "UTF-8");
        // Fetch and parse the page (30 s timeout); the result mirrors the browser's document object.
        Document document = Jsoup.parse(new URL(url), 30000);

        // Container element of the product list.
        Element jGoodsListElement = document.getElementById("J_goodsList");
        if (jGoodsListElement == null) {
            // The returned page has no product list, so there is nothing to extract.
            return new ArrayList<>();
        }
        // Every <li> below the container is treated as one product.
        Elements li = jGoodsListElement.getElementsByTag("li");
        List<Content> goodsList = new ArrayList<>(li.size());
        for (Element el : li) {
            String img = el.getElementsByTag("img").eq(0).attr("source-data-lazy-img");
            String price = el.getElementsByClass("p-price").eq(0).text();
            String title = el.getElementsByClass("p-name").eq(0).text();
            goodsList.add(new Content().setImg(img).setTitle(title).setPrice(price));
        }
        return goodsList;
    }

3.测试解析成功

4.封装对象

package com.bie.pojo;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.Accessors;

/**
 * One product entry scraped from a JD search result page.
 *
 * <p>Lombok generates the accessors, {@code equals}/{@code hashCode}/{@code toString}
 * ({@code @Data}) and both a no-args and an all-args constructor.
 * {@code @Accessors(chain = true)} makes every setter return {@code this}, which is what
 * allows {@code new Content().setImg(..).setTitle(..).setPrice(..)}.
 *
 * @author bjh
 * @date 2022/12/12
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
@Accessors(chain = true)
public class Content {

    /** Value of the product image's {@code source-data-lazy-img} attribute, as read by HtmlParseUtil. */
    private String img;
    /** Text of the product's {@code p-name} element. */
    private String title;
    /** Text of the product's {@code p-price} element, kept as the raw page string. */
    private String price;
}

5.引入es配置类

package com.bie.config;

import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

/**
 * Spring configuration that exposes the Elasticsearch high-level REST client as a bean.
 *
 * @author bjh
 * @date 2022/12/9
 */

@Configuration
public class ElasticSearchClientConfig {

    /** Address of the single Elasticsearch node this client talks to. */
    private static final String ES_HOST = "192.168.229.132";
    private static final int ES_PORT = 9200;
    private static final String ES_SCHEME = "http";

    /**
     * Builds the client bean for the node described by the constants above.
     *
     * @return a new {@link RestHighLevelClient}
     */
    @Bean
    public RestHighLevelClient restHighLevelClient() {
        HttpHost node = new HttpHost(ES_HOST, ES_PORT, ES_SCHEME);
        return new RestHighLevelClient(RestClient.builder(node));
    }
}

6.将HtmlParseUtil注册到spring

package com.bie.utils;

import com.bie.pojo.Content;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;

import java.net.URL;
import java.util.ArrayList;
import java.util.List;

/**
 * Crawls JD search result pages with jsoup and maps each listed product to a {@link Content}.
 *
 * <p>Annotated with {@code @Component} so Spring can manage and inject it.
 *
 * @author bjh
 * @date 2022/12/12
 */
@Component
public class HtmlParseUtil {

    /** Search endpoint; the URL-encoded keyword is appended to it. */
    private static final String SEARCH_URL = "https://search.jd.com/Search?keyword=";

    /** Timeout in milliseconds for fetching the page. */
    private static final int TIMEOUT_MILLIS = 30000;

    /** Manual smoke test: crawls the results for "vue" and prints every parsed product. */
    public static void main(String[] args) throws Exception {
        new HtmlParseUtil().parseJD("vue").forEach(System.out::println);
    }

    /**
     * Fetches the JD search result page for a keyword and extracts the listed products.
     *
     * @param keywords search keyword; it is URL-encoded before being placed in the query string
     * @return one {@link Content} per {@code <li>} found under the {@code J_goodsList} element;
     *         an empty list when the page has no such element
     * @throws IllegalArgumentException if {@code keywords} is {@code null}
     * @throws Exception if the page cannot be fetched or parsed
     */
    public List<Content> parseJD(String keywords) throws Exception {
        if (keywords == null) {
            throw new IllegalArgumentException("keywords must not be null");
        }
        // Request URL, e.g. https://search.jd.com/Search?keyword=java
        // Encode the keyword so spaces, '&', '#' and non-ASCII text stay a single query value.
        String url = SEARCH_URL + java.net.URLEncoder.encode(keywords, "UTF-8");
        // Fetch and parse the page; the result mirrors the browser's document object.
        Document document = Jsoup.parse(new URL(url), TIMEOUT_MILLIS);

        // Container element of the product list.
        Element jGoodsListElement = document.getElementById("J_goodsList");
        if (jGoodsListElement == null) {
            // The returned page has no product list, so there is nothing to extract.
            return new ArrayList<>();
        }
        // Every <li> below the container is treated as one product.
        Elements li = jGoodsListElement.getElementsByTag("li");
        List<Content> goodsList = new ArrayList<>(li.size());
        for (Element el : li) {
            String img = el.getElementsByTag("img").eq(0).attr("source-data-lazy-img");
            String price = el.getElementsByClass("p-price").eq(0).text();
            String title = el.getElementsByClass("p-name").eq(0).text();
            goodsList.add(new Content().setImg(img).setTitle(title).setPrice(price));
        }
        return goodsList;
    }
}

7.爬取的数据入es库

1.在es创建索引"jd_goods"（例如在 Kibana Dev Tools 中执行 `PUT /jd_goods`）
在这里插入图片描述

2.编写controller

package com.bie.controller;

import com.bie.service.ContentService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RestController;

/**
 * REST controller exposing the crawl-and-index operation of {@link ContentService}.
 *
 * @author bjh
 * @date 2022/12/12
 */
@RestController
public class ContentController {

    @Autowired
    private ContentService contentService;

    /**
     * Handles {@code GET /parse/{keyword}} by delegating to
     * {@link ContentService#parseContent(String)}.
     *
     * @param keyword search keyword taken from the request path
     * @return the value returned by {@code parseContent}
     * @throws Exception whatever the service call throws
     */
    @GetMapping("/parse/{keyword}")
    public boolean parse(@PathVariable("keyword") String keyword) throws Exception {
        final boolean stored = contentService.parseContent(keyword);
        return stored;
    }

}

3.编写service

package com.bie.service;

import com.alibaba.fastjson.JSON;
import com.bie.pojo.Content;
import com.bie.utils.HtmlParseUtil;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.util.List;

/**
 * Crawls JD products for a keyword and bulk-indexes them into the {@code jd_goods} index.
 *
 * @author bjh
 * @date 2022/12/12
 */
@Service
public class ContentService {

    @Autowired
    private RestHighLevelClient restHighLevelClient;

    /** Injected crawler bean; HtmlParseUtil is a Spring {@code @Component}, so it is not built with {@code new}. */
    @Autowired
    private HtmlParseUtil htmlParseUtil;

    /**
     * Crawls the search results for {@code keywords} and stores every product in Elasticsearch.
     *
     * @param keywords search keyword passed on to {@link HtmlParseUtil#parseJD(String)}
     * @return {@code true} when the bulk request reports no failures; {@code false} when it
     *         reports failures or when the crawl returned no products
     * @throws Exception if crawling or the bulk request fails
     */
    public boolean parseContent(String keywords) throws Exception {
        // Crawl and parse the result page.
        List<Content> contents = htmlParseUtil.parseJD(keywords);
        if (contents.isEmpty()) {
            // A bulk request without any action is rejected by the client, so stop here.
            return false;
        }

        // Collect one index action per product into a single bulk request.
        BulkRequest bulkRequest = new BulkRequest();
        bulkRequest.timeout(TimeValue.timeValueSeconds(1));
        for (Content content : contents) {
            bulkRequest.add(new IndexRequest("jd_goods")
                    .source(JSON.toJSONString(content), XContentType.JSON));
        }

        BulkResponse bulkResponse = this.restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
        return !bulkResponse.hasFailures();
    }

}

4.测试
爬取java相关：
访问：http://localhost:9090/parse/java
在这里插入图片描述

爬取vue相关：
访问：http://localhost:9090/parse/vue
在这里插入图片描述

8.空白文件夹初始化vue

初始化vue项目

# 1. Create an empty working directory (-p: no error if it already exists)
mkdir -p /home/test && cd /home/test
# 2. Initialise npm
npm init
# 3. Install vue
npm install vue
# Download address of the vue-min.js file; copy its contents into vue-min.js:
#   https://cdn.staticfile.org/vue/2.6.11/vue.min.js

# 4. Install axios for HTTP communication
npm install axios
# 5. Copy the initialised vue directory into the boot project