Java爬虫——Spring Boot + JPA + HttpClient + Jsoup 爬取京东产品信息(Oracle)

网页端不登录可访问的网页,爬虫显示登录页面怎么办?

  • httpGet.setHeader("User-Agent","值");
  • 该值可在浏览器中按F12打开开发者工具,在任意请求的Request Headers中查看User-Agent获取

一、添加Maven依赖

<dependencies>
    <!-- Spring MVC (web starter) -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
        <version>2.6.0</version>
    </dependency>

    <!-- Spring Data JPA -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-jpa</artifactId>
        <version>2.6.0</version>
    </dependency>

    <!-- Oracle JDBC driver -->
    <!-- NOTE(review): confirm this artifact is resolvable from your configured repositories -->
    <dependency>
        <groupId>com.oracle</groupId>
        <artifactId>ojdbc6</artifactId>
        <version>11.2.0.4</version>
    </dependency>

    <!-- HttpClient, used to fetch pages and images -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.13</version>
    </dependency>

    <!-- jsoup, used to parse HTML -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.13.1</version>
    </dependency>

    <!-- commons-lang3 utilities, e.g. StringUtils -->
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.11</version>
    </dependency>


    <!-- Logging: Spring Boot defaults to slf4j + logback -->
    <!-- Version pinned like the other Spring Boot starters above, so the
         fragment also resolves when no parent POM / BOM manages versions. -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-logging</artifactId>
        <version>2.6.0</version>
    </dependency>

</dependencies>

二、添加application.properties配置文件

# Oracle datasource configuration
spring.datasource.driver-class-name=oracle.jdbc.OracleDriver
spring.datasource.url=jdbc:oracle:thin:@localhost:1521:ORCL
spring.datasource.username=root
# Placeholder value (literally "database password"); replace with the real password
spring.datasource.password=数据库密码

# JPA configuration
spring.jpa.database=oracle
spring.jpa.show-sql=true
# Update or create the table structure
spring.jpa.hibernate.ddl-auto=update

三、代码实现

1. 编写实体类

1.1 JdItem.class
package com.crawler.entities;

import org.hibernate.annotations.Comment;

import javax.persistence.*;
import java.util.Date;

/**
 * JPA entity for one crawled JD product, mapped to the table
 * {@code crawler_jd_item}.
 */
@Entity
@Table(name = "crawler_jd_item")
public class JdItem {

    /** Primary key, taken from the sequence {@code jditem_id_seq}. */
    @Id
    @GeneratedValue(strategy = GenerationType.SEQUENCE, generator = "idSeqGenerator")
    @SequenceGenerator(
            name = "idSeqGenerator",
            sequenceName = "jditem_id_seq",
            initialValue = 1,
            allocationSize = 1)
    @Comment("主键")
    private long id;

    /** Product number (SKU). */
    @Comment("商品编号")
    private long sku;

    /** Product title. */
    @Comment("商品标题")
    private String title;

    /** Product price. */
    @Comment("商品价格")
    private Double price;

    /** Product picture. */
    @Comment("商品图片")
    private String picture;

    /** Address of the product detail page. */
    @Comment("商品详细地址")
    private String url;

    /** Creation time. */
    @Comment("创建时间")
    private Date createTime;

    /** Last update time. */
    @Comment("更新时间")
    private Date updateTime;

    /** No-argument constructor. */
    public JdItem() {
    }

    /**
     * Creates an item with every field except {@code id} and {@code sku} set.
     *
     * @param title      product title
     * @param price      product price
     * @param picture    product picture
     * @param url        address of the product detail page
     * @param createTime creation time
     * @param updateTime last update time
     */
    public JdItem(String title, Double price, String picture, String url,
                  Date createTime, Date updateTime) {
        this.title = title;
        this.price = price;
        this.picture = picture;
        this.url = url;
        this.createTime = createTime;
        this.updateTime = updateTime;
    }

    // Add getters and setters for all fields here
}

2. 编写dao层

2.1 JdItemDao.java
package com.crawler.daos;


import com.crawler.entities.JdItem;
import org.springframework.data.jpa.repository.JpaRepository;

/**
 * Repository for {@link JdItem} entities.
 * Because JPA is used, it extends
 * {@code JpaRepository<entity class, entity primary-key type>}.
 */
public interface JdItemDao extends JpaRepository<JdItem, Long> {
}

3. 编写service层

3.1 service接口 JdItemService.java
package com.crawler.services;


import com.crawler.entities.JdItem;

import java.util.List;

/**
 * Service operations for crawled {@link JdItem} products.
 */
public interface JdItemService {

    /**
     * Saves a product.
     *
     * @param jdItem the product to save
     */
    void save(JdItem jdItem);

    /**
     * Queries products by the conditions carried in the given item.
     *
     * @param jdItem the item holding the query conditions
     * @return the matching products
     */
    List<JdItem> findAll(JdItem jdItem);

}
3.2 service实现类 JdItemServiceImpl.java
package com.crawler.services.impl;

import com.crawler.daos.JdItemDao;
import com.crawler.entities.JdItem;
import com.crawler.services.JdItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.data.domain.ExampleMatcher;
import org.springframework.stereotype.Component;

import java.util.List;

/**
 * {@link JdItemService} implementation backed by {@link JdItemDao}.
 */
@Component
public class JdItemServiceImpl implements JdItemService {

    @Autowired
    private JdItemDao jdItemDao;

    /**
     * Saves the given product through the repository.
     *
     * @param jdItem the product to save
     */
    @Override
    public void save(JdItem jdItem) {
        this.jdItemDao.save(jdItem);
    }

    /**
     * Queries products by example, using the given item as the probe.
     *
     * <p>Query-by-example skips {@code null} properties but always includes
     * primitive ones. {@code JdItem.id} and {@code JdItem.sku} are primitive
     * {@code long}s, so an unset value (0) would otherwise be turned into
     * {@code id = 0} / {@code sku = 0} conditions and the query would match
     * nothing. Those paths are therefore ignored while they still hold 0.
     *
     * @param jdItem the probe holding the query conditions
     * @return the matching products
     */
    @Override
    public List<JdItem> findAll(JdItem jdItem) {
        // Relies on the standard getId()/getSku() accessors of JdItem.
        ExampleMatcher matcher = ExampleMatcher.matching();
        if (jdItem.getId() == 0) {
            matcher = matcher.withIgnorePaths("id");
        }
        if (jdItem.getSku() == 0) {
            matcher = matcher.withIgnorePaths("sku");
        }

        Example<JdItem> example = Example.of(jdItem, matcher);
        return this.jdItemDao.findAll(example);
    }
}

4. 封装HttpClient工具类

4.1 HttpUtils.java
package com.crawler.utils;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;

/**
 * HttpClient helper that downloads HTML pages and images through a shared
 * connection pool.
 */
@Component
public class HttpUtils {

    /** Directory the downloaded images are written to. */
    private static final String IMAGE_DIR = "./crawler_jd/images/";

    // Connection pool manager shared by all requests
    private final PoolingHttpClientConnectionManager cm;

    // Client built once on top of the pool and reused for every request
    private final CloseableHttpClient httpClient;

    /**
     * Initialises the connection pool manager and the client that uses it.
     */
    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in total
        this.cm.setMaxTotal(100);
        // Maximum number of connections per host (route)
        this.cm.setDefaultMaxPerRoute(10);
        this.httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
    }

    /**
     * Downloads the page at the given address.
     *
     * @param url address of the page
     * @return the response body, or an empty string when the status is not 200
     *         or the response has no body
     * @throws IOException if the request or reading the body fails
     */
    public String doGetHtml(String url) throws IOException {
        // try-with-resources closes the response on every path, including the
        // successful one, so the pooled connection is always released
        try (CloseableHttpResponse response = getResponse(url)) {
            if (response.getStatusLine().getStatusCode() == 200
                    && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "utf8");
            }
            return "";
        }
    }

    /**
     * Downloads the image at the given address into {@code ./crawler_jd/images/}
     * under a random UUID-based file name that keeps the URL's extension
     * (e.g. {@code .jpg}).
     *
     * @param imageUrl address of the image
     * @return the generated file name, or an empty string when the status is
     *         not 200 or the response has no body
     * @throws IOException if the request fails, the target directory cannot be
     *                     created, or the file cannot be written
     */
    public String doGetImage(String imageUrl) throws IOException {
        try (CloseableHttpResponse response = getResponse(imageUrl)) {
            HttpEntity entity = response.getEntity();
            if (response.getStatusLine().getStatusCode() != 200 || entity == null) {
                return "";
            }

            // Rename the image: random UUID + original extension
            String picName = UUID.randomUUID().toString() + extensionOf(imageUrl);

            // Create the target directory on first use instead of failing
            File imageDir = new File(IMAGE_DIR);
            if (!imageDir.isDirectory() && !imageDir.mkdirs() && !imageDir.isDirectory()) {
                throw new IOException("Cannot create image directory: " + imageDir.getAbsolutePath());
            }

            // The stream is closed even when writing fails
            try (OutputStream outputStream = new FileOutputStream(new File(imageDir, picName))) {
                entity.writeTo(outputStream);
            }
            return picName;
        }
    }

    /**
     * Extracts the file extension (including the dot) from the last path
     * segment of a URL, ignoring any query string.
     *
     * @return the extension, or an empty string when the last segment has none
     */
    private static String extensionOf(String imageUrl) {
        int queryStart = imageUrl.indexOf('?');
        String path = queryStart >= 0 ? imageUrl.substring(0, queryStart) : imageUrl;
        int dot = path.lastIndexOf('.');
        // A dot before the last '/' belongs to the host or a directory name
        return dot > path.lastIndexOf('/') ? path.substring(dot) : "";
    }

    /**
     * Sends a GET request to the given address. The caller must close the
     * returned response.
     */
    private CloseableHttpResponse getResponse(String url) throws IOException {
        // Create the GET request for the address
        HttpGet httpGet = new HttpGet(url);
        // Apply the timeout configuration
        httpGet.setConfig(this.getConfig());
        // Send a browser User-Agent; without it the site may answer with its login page
        httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36");
        // Execute the request and hand back the response
        return this.httpClient.execute(httpGet);
    }

    /**
     * Builds the per-request timeout configuration.
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000) // max time to establish a connection
                .setConnectionRequestTimeout(500) // max time to obtain a connection from the pool
                .setSocketTimeout(10*1000) // max time for data transfer
                .build();
    }

}

5. 实现数据爬虫

5.1 JdItemTask.java
package com.crawler.tasks;

import com.crawler.entities.JdItem;
import com.crawler.services.JdItemService;
import com.crawler.utils.HttpUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.util.Date;
import java.util.List;

/**
 * Scheduled task that crawls JD search result pages and stores every product
 * that is not in the database yet.
 */
@Component
public class JdItemTask {

    private static final Logger logger = LoggerFactory.getLogger(JdItemTask.class);

    @Autowired
    private HttpUtils httpUtils;

    @Autowired
    private JdItemService jdItemService;

    /**
     * Crawls the search result pages. {@code fixedDelay} is the pause between
     * the end of one run and the start of the next.
     *
     * @throws Exception if downloading a page or an image fails
     */
    @Scheduled(fixedDelay = 100*1000)
    public void jdItemTask() throws Exception{

        // Base search address; the page number is appended to it
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=7e272b81e0c24f478b9d4721de77b53d&page=";

        // Walk the result pages 1, 3, 5, 7, 9
        // NOTE(review): the step of 2 presumably follows the site's page numbering - confirm
        for (int pageValue = 1; pageValue < 10; pageValue = pageValue + 2) {
            String content = httpUtils.doGetHtml(url + pageValue);
            // Parse the page and store the products it lists
            this.parseHtml(content);
        }

        logger.info("京东搜索手机数据抓取测试完成......");

    }

    /**
     * Parses one search result page and saves every product that is complete
     * and not stored yet. List entries without a usable sku, title or price
     * are skipped instead of aborting the whole page.
     *
     * @param content HTML of the search result page
     * @throws Exception if downloading a product image fails
     */
    private void parseHtml(String content) throws Exception{
        Document doc = Jsoup.parse(content);
        Elements itemElements = doc.select("div#J_goodsList > ul > li");

        for (Element element : itemElements) {
            logger.info("遍历li下面的元素: {}", element.text());

            // Product number (sku)
            Long sku = parseSku(element);
            if (sku == null) {
                continue;
            }
            JdItem jdItem = new JdItem();
            jdItem.setSku(sku);

            // Skip products that were already crawled and stored
            if (!jdItemService.findAll(jdItem).isEmpty()) {
                continue;
            }

            // Product title
            Element titleElement = element.select(".p-name>a>em").first();
            if (titleElement == null) {
                logger.warn("Skipping sku {}: no title element found", sku);
                continue;
            }
            jdItem.setTitle(titleElement.text());

            // Product price
            Double price = parsePrice(element);
            if (price == null) {
                logger.warn("Skipping sku {}: no parsable price found", sku);
                continue;
            }
            jdItem.setPrice(price);

            // Address of the product detail page; attr() yields "" when the link is missing
            jdItem.setUrl(element.select(".p-name>a").attr("href"));

            // Product picture, downloaded last so skipped items never leave files behind
            String picUrl = element.select(".p-img>a>img").attr("data-lazy-img");
            jdItem.setPicture(downloadPicture(picUrl));

            jdItem.setCreateTime(new Date());
            jdItem.setUpdateTime(jdItem.getCreateTime());

            jdItemService.save(jdItem);
        }

    }

    /**
     * Reads the {@code data-sku} attribute of a list entry.
     *
     * @return the sku, or {@code null} when it is missing or not numeric
     */
    private static Long parseSku(Element element) {
        String sku = element.select("[data-sku]").attr("data-sku").trim();
        if (sku.isEmpty()) {
            logger.warn("Skipping list entry without data-sku");
            return null;
        }
        try {
            return Long.valueOf(sku);
        } catch (NumberFormatException e) {
            logger.warn("Skipping list entry with non-numeric data-sku '{}'", sku);
            return null;
        }
    }

    /**
     * Reads the price of a list entry.
     *
     * @return the price, or {@code null} when it is missing or not numeric
     */
    private static Double parsePrice(Element element) {
        Element priceElement = element.select(".p-price>strong>i").first();
        if (priceElement == null) {
            return null;
        }
        try {
            return Double.valueOf(priceElement.text().trim());
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Downloads the product picture.
     *
     * @param picUrl value of the {@code data-lazy-img} attribute
     * @return the stored file name, or an empty string when there is no
     *         picture address or the download returned nothing
     */
    private String downloadPicture(String picUrl) throws Exception {
        if (picUrl.isEmpty()) {
            return "";
        }
        // Prefix the scheme unless the address already carries one
        String absoluteUrl = picUrl.startsWith("http") ? picUrl : "https:" + picUrl;
        return httpUtils.doGetImage(absoluteUrl);
    }
}

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

沐木金

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值