java爬虫（三）- 京东案例

子非我鱼

已于 2024-01-13 15:25:27 修改

阅读量574

点赞数 2

分类专栏： # java # 爬虫 # 后端　文章标签： java 爬虫

于 2021-06-25 18:50:22 首次发布

本文链接：https://blog.csdn.net/qq_45752401/article/details/118226444

版权

java 同时被 3 个专栏收录

45 篇文章 3 订阅

订阅专栏

后端

40 篇文章 1 订阅

订阅专栏

爬虫

6 篇文章 1 订阅

订阅专栏

在这里插入图片描述

📫 作者简介：「子非我鱼」，专注于研究全栈
🔥 三连支持：欢迎 ❤️关注、👍点赞、👉收藏三连，支持一下博主~

文章目录

步骤一：导入数据库

-- Create the `crawler` database first, then create this table inside it.
-- `price` is a fixed-point decimal so that prices such as 1999.50 keep their fractional part.
CREATE TABLE `jd_item` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `spu` bigint(15) DEFAULT NULL COMMENT '商品集合id',
  `sku` bigint(15) DEFAULT NULL COMMENT '商品最小品类单元id',
  `title` varchar(100) DEFAULT NULL COMMENT '商品标题',
  `price` decimal(10,2) DEFAULT NULL COMMENT '商品价格',
  `pic` varchar(200) DEFAULT NULL COMMENT '商品图片',
  `url` varchar(200) DEFAULT NULL COMMENT '商品详情地址',
  `created` datetime DEFAULT NULL COMMENT '创建时间',
  `updated` datetime DEFAULT NULL COMMENT '更新时间',
  PRIMARY KEY (`id`),
  KEY `sku` (`sku`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='京东商品表';

步骤二：导入所需依赖

<dependencies>
        <!-- Spring MVC / embedded web server starter -->
        <!-- NOTE(review): no <version> here, so this presumably relies on a Spring Boot parent/BOM declared outside this snippet; confirm -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- MySQL JDBC driver (Connector/J) -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.11</version>
        </dependency>

        <!-- Apache HttpClient, used to download pages and images -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>

        <!-- Jsoup, used to parse the downloaded HTML -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
    
      <!-- tk.mybatis common Mapper Spring Boot starter -->
        <dependency>
            <groupId>tk.mybatis</groupId>
            <artifactId>mapper-spring-boot-starter</artifactId>
            <version>2.0.2</version>
        </dependency>

        <!-- Utility library (Apache Commons Lang 3) -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
    </dependencies>

步骤三：配置application.yml文件

# HTTP port the application listens on.
server:
  port: 81
spring:
  datasource:
    # Connector/J 8.x driver class; the legacy com.mysql.jdbc.Driver name is deprecated there.
    driver-class-name: com.mysql.cj.jdbc.Driver
    # NOTE(review): step 1 says to create a database named `crawler`, but this URL points at
    # `myqxin`; make sure the jd_item table lives in the database named here.
    url: jdbc:mysql://localhost:3306/myqxin?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC
    username: root
    password: 9527

步骤四：创建pojo实体类

/**
 * Entity mapped to the {@code jd_item} table; accessors are generated by Lombok's {@code @Data}.
 */
@Table(name = "jd_item")
@Data
public class Item {
    // Primary key, generated by the database (IDENTITY strategy)
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;
    // Standard product unit (id of the product group)
    private Long spu;
    // Stock keeping unit (id of the smallest sellable variant)
    private Long sku;
    // Product title
    private String title;
    // Product price
    // NOTE(review): Double is lossy for money; confirm whether BigDecimal would be more appropriate
    private Double price;
    // Product image
    private String pic;
    // URL of the product detail page
    private String url;
    // Creation time
    private Date created;
    // Last update time
    private Date updated;
}

步骤五：创建Mapper和Service

Mapper

/**
 * @author: myqxin
 * @Desc: Mapper interface for {@link Item}; declares no methods of its own and only inherits
 *        from the generic {@code Mapper<Item>} (presumably tk.mybatis's common Mapper, given the
 *        mapper-spring-boot-starter dependency; confirm against the imports).
 * @create: 2021-06-25 14:24
 **/
// The annotation is fully qualified because its simple name clashes with the extended Mapper<T> type.
@org.apache.ibatis.annotations.Mapper
public interface ItemMapper extends Mapper<Item> {
}

Service

@Service
public class ItemServiceImpl implements ItemService {

}

步骤六：创建启动类

/**
 * Application entry point; scheduling is enabled so that {@code @Scheduled} methods are run.
 */
@SpringBootApplication
@EnableScheduling   // turn on support for scheduled tasks
public class MySpringBootStarter {

    /**
     * Boots the Spring application.
     *
     * @param args command-line arguments, passed through to Spring Boot unchanged
     */
    public static void main(String[] args) {
        final Class<?> primarySource = MySpringBootStarter.class;
        SpringApplication.run(primarySource, args);
    }
}

步骤七：创建HttpUtils工具类

package com.czxy.music.web.config;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.UUID;

/**
 * @author: myqxin
 * @Desc: HTTP helper built on Apache HttpClient with a pooled connection manager. Downloads
 *        pages as text and images to a local directory.
 * @create: 2021-06-25 14:39
 **/
@Component
public class HttpUtils {
    /** Directory that downloaded images are written into. */
    private static final String IMAGE_DIR = "C:\\Users\\myqxin\\Desktop\\Jdimages\\";

    /** Connection pool shared by every client this helper builds. */
    private final PoolingHttpClientConnectionManager pm;


    public HttpUtils() {
        this.pm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the pool
        this.pm.setMaxTotal(100);
        // Maximum number of connections per route (host)
        this.pm.setDefaultMaxPerRoute(10);
    }

    /**
     * Downloads the page at the given address.
     *
     * @param url address to fetch
     * @return the response body decoded as UTF-8, or an empty string when the status is not 200,
     *         the response has no entity, or an I/O error occurs
     * @throws IllegalArgumentException if {@code url} is not a valid URI (raised by {@link HttpGet})
     */
    public String doGetHtml(String url) {
        // Build a client on top of the shared connection pool
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.pm).build();
        HttpGet httpGet = newGet(url);

        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            // Only a 200 response that actually carries a body counts as success
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // Return the page content
                return EntityUtils.toString(response.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(response);
        }
        // Download failed: return an empty string
        return "";
    }

    /**
     * Downloads an image and stores it in the image directory under a random UUID-based name.
     *
     * @param url address of the image
     * @return the generated file name (UUID plus the suffix taken from the URL), or an empty
     *         string when the status is not 200, the response has no entity, or an I/O error occurs
     * @throws IllegalArgumentException if {@code url} is not a valid URI (raised by {@link HttpGet})
     */
    public String doGetImage(String url) {
        // Build a client on top of the shared connection pool
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.pm).build();
        HttpGet httpGet = newGet(url);

        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            // Only a 200 response that actually carries a body counts as success
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // Take the suffix from the last '.' onwards; a URL without any '.' gets no suffix
                int dot = url.lastIndexOf('.');
                String extName = dot >= 0 ? url.substring(dot) : "";
                // Rename the image to a random, collision-free name
                String picName = UUID.randomUUID().toString() + extName;
                // try-with-resources guarantees the file handle is released even if the copy fails
                try (FileOutputStream outputStream = new FileOutputStream(new File(IMAGE_DIR + picName))) {
                    response.getEntity().writeTo(outputStream);
                }
                // Return the image file name
                return picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(response);
        }
        // Download failed: return an empty string
        return "";
    }

    /**
     * Creates a GET request for the address with the timeouts and headers applied.
     *
     * @param url address to request
     * @return the configured request
     */
    private HttpGet newGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(getConfig());
        // Send a browser-like User-Agent
        setHeaders(httpGet);
        return httpGet;
    }

    /**
     * Closes the response if there is one, printing (not propagating) any close failure.
     *
     * @param response response to close, may be {@code null}
     */
    private void closeQuietly(CloseableHttpResponse response) {
        if (response == null) {
            return;
        }
        try {
            response.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the request configuration (timeouts).
     *
     * @return the request configuration
     */
    private RequestConfig getConfig() {
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(1000)    // max time to establish a connection, in milliseconds
                .setConnectionRequestTimeout(500)   // max time to obtain a connection from the pool, in milliseconds
                .setSocketTimeout(10000)    // max time for data transfer, in milliseconds
                .build();
        return config;
    }

    /**
     * Sets the request headers.
     *
     * @param httpGet request to decorate
     */
    private void setHeaders(HttpGet httpGet) {
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
    }

}

注意：使用 HttpClient 爬取数据时，为了防止被网站拦截，应该设置请求头（例如 User-Agent），把请求伪装成浏览器访问。

步骤八：实现爬取京东手机信息数据（https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&pvid=cab755112a5e463e9ec1c356ac31fd1c&s=56&click=0&page=1）

package com.czxy.music.web.controller;

import com.czxy.music.pojo.Item;
import com.czxy.music.service.ItemService;
import com.czxy.music.web.config.HttpUtils;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import javax.annotation.Resource;
import java.util.Date;
import java.util.List;

/**
 * @author: myqxin
 * @Desc: Scheduled crawler that downloads JD phone search result pages and builds an
 *        {@link Item} for every SKU that is not yet known to {@link ItemService}.
 * @create: 2021-06-25 15:28
 **/
@Component
public class ItemTask {

    @Resource
    private HttpUtils httpUtils;
    @Resource
    private ItemService itemService;

    private static final ObjectMapper MAPPER = new ObjectMapper();


    /**
     * Crawls the search result pages; runs again 100 seconds after the previous run finished.
     */
    @Scheduled(fixedDelay = 100 * 1000)
    public void itemTask() {
        // Base search address; the page number is appended below
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&pvid=cab755112a5e463e9ec1c356ac31fd1c&s=56&click=0&page=";
        // Requests page=1 and page=3
        for (int i = 1; i < 5; i = i + 2) {
            String html = httpUtils.doGetHtml(url + i);
            // Parse the page and process the products found on it
            parse(html);
        }

        System.out.println("手机页面数据抓取完成");
    }

    /**
     * Extracts every SPU/SKU pair from a search result page and builds an item for each SKU
     * that {@link ItemService#findAll} does not already return. Entries whose ids are missing or
     * not numeric are skipped instead of aborting the whole run.
     *
     * @param html search result page markup; an empty string simply yields no items
     */
    private void parse(String html) {
        Document doc = Jsoup.parse(html);

        // One <li> per SPU
        Elements spuEles = doc.select("div#J_goodsList > ul > li");
        for (Element spuEle : spuEles) {
            // Skip list entries without a usable data-spu value
            Long spu = parseId(spuEle.attr("data-spu"));
            if (spu == null) {
                continue;
            }

            // One li.ps-item per SKU of this SPU
            for (Element skuEle : spuEle.select("li.ps-item")) {
                Element skuHolder = skuEle.select("[data-sku]").first();
                Long sku = skuHolder == null ? null : parseId(skuHolder.attr("data-sku"));
                if (sku == null) {
                    continue;
                }

                // Look the SKU up first; an already known product is not processed again
                Item item = new Item();
                item.setSku(sku);
                List<Item> list = itemService.findAll(item);
                if (!list.isEmpty()) {
                    continue;
                }
                item.setSpu(spu);

                // Detail page address
                String itemUrl = "https://item.jd.com/" + sku + ".html";
                item.setUrl(itemUrl);

                // Image: the lazy-load attribute holds a protocol-relative address.
                // Without it there is nothing to download, so the name stays empty
                // (the same value doGetImage returns on failure).
                Element imgEle = skuEle.select("img[data-sku]").first();
                String lazyImg = imgEle == null ? "" : imgEle.attr("data-lazy-img");
                String picName = StringUtils.isBlank(lazyImg) ? "" : httpUtils.doGetImage("https:" + lazyImg);
                item.setPic(picName);

                item.setPrice(fetchPrice(sku));

                // Title comes from the detail page
                String itemInfo = this.httpUtils.doGetHtml(item.getUrl());
                String title = Jsoup.parse(itemInfo).select("div.sku-name").text();
                item.setTitle(title);

                item.setCreated(new Date());
                item.setUpdated(item.getCreated());

                // NOTE(review): the item is only printed here and never persisted; call the
                // service's save method at this point once it is confirmed on ItemService.
                System.err.println(item);
            }
        }

    }

    /**
     * Fetches the price of a SKU from the price endpoint.
     *
     * @param sku SKU id
     * @return the value of the {@code p} field of the first array element, or 0 when the download
     *         failed, the response cannot be parsed, or it does not have that shape
     */
    private double fetchPrice(long sku) {
        String priceJson = this.httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
        if (StringUtils.isBlank(priceJson)) {
            return 0;
        }
        try {
            // path() never returns null, so an unexpected shape yields 0 instead of an NPE
            return MAPPER.readTree(priceJson).path(0).path("p").asDouble();
        } catch (JsonProcessingException e) {
            e.printStackTrace();
            return 0;
        }
    }

    /**
     * Parses a numeric id attribute.
     *
     * @param raw attribute value, may be blank
     * @return the parsed id, or {@code null} when the value is blank or not a valid long
     */
    private static Long parseId(String raw) {
        if (StringUtils.isBlank(raw)) {
            return null;
        }
        try {
            return Long.parseLong(raw);
        } catch (NumberFormatException e) {
            return null;
        }
    }

}