Crawler basics, HttpClient, Jsoup, and a crawler case study

Introduction to web crawlers

A web crawler is a program or script that automatically fetches information from the World Wide Web according to a set of rules.

In the era of big data, gathering information is an important task, and the amount of data on the internet is enormous. Relying on manual collection alone is not only inefficient and tedious, it also drives up the cost of gathering data. How to automatically and efficiently obtain the information we are interested in from the internet and put it to use is an important problem, and crawler technology was created to solve it.

Functionally, a crawler is generally made up of three parts: data collection, processing, and storage. A crawler starts from the URLs of one or more seed pages and obtains the URLs found on them; while fetching pages, it keeps extracting new URLs from the current page and adding them to a queue, until a stop condition defined by the system is met.
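
A minimal sketch of that loop, assuming jsoup (introduced later in this article) is on the classpath; the class and variable names here are illustrative only, not part of the original:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.net.URL;
import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;

public class SimpleCrawler {

    public static void main(String[] args) throws Exception {
        Queue<String> queue = new ArrayDeque<>();   // URLs waiting to be fetched
        Set<String> visited = new HashSet<>();      // URLs already fetched
        queue.add("http://www.itcast.cn");          // seed URL

        int maxPages = 10;                          // stop condition
        while (!queue.isEmpty() && visited.size() < maxPages) {
            String url = queue.poll();
            if (!visited.add(url)) {
                continue;                           // already crawled, skip
            }

            // 1. data collection: fetch and parse the page
            Document doc = Jsoup.parse(new URL(url), 10000);

            // 2. data processing: extract whatever we are interested in, e.g. the title
            System.out.println(doc.title());

            // 3. pull new URLs from the current page and add them to the queue
            for (Element link : doc.select("a[href]")) {
                queue.add(link.absUrl("href"));
            }

            // data storage (database, files, ...) would happen here
        }
    }
}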

Fetching data with HttpClient

pom

<dependencies>
        <!-- HttpClient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.5</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <!-- logging -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
        </dependency>
        <!-- utilities; we use its FileUtils class -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
        <!--jsoup-->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>
        <!-- string handling utilities -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.7</version>
        </dependency>
    </dependencies>

log4j.properties

# DEBUG is the most verbose log level; A1 is the appender configured below
log4j.rootLogger=DEBUG,A1
log4j.logger.com.itheima = DEBUG

# console output
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
# output pattern
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n

GET request

package com.itheima;

import org.apache.commons.io.FileUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.File;
import java.io.IOException;

public class HttpClientTest {

    public static void main(String[] args) {

        // 1. Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // 2. Create the HttpGet object with the target URL
        HttpGet httpGet = new HttpGet("http://java.itcast.cn/?javaeezly");

        CloseableHttpResponse response = null;

        // 3. Send the request
        try {
            response = httpClient.execute(httpGet);

            // Treat a 200 status code as success
            if (response.getStatusLine().getStatusCode() == 200){

                // The first argument is the response entity (body)
                String content = EntityUtils.toString(response.getEntity(), "utf-8");

                // Write the content to a file
                // The fourth argument controls whether to append
                FileUtils.writeStringToFile(new File("D:\\itcast.html"),content,"utf-8",true);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }finally {
            if (response != null){
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

            if (httpClient != null){
                try {
                    httpClient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }


    }
}

POST request

package com.itheima;

import org.apache.commons.io.FileUtils;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

public class HttpClientTest {

    public static void main(String[] args) throws UnsupportedEncodingException {

        // 1. Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // 2. Create the HttpPost object with the target URL
        HttpPost httpPost = new HttpPost("http://java.itcast.cn/?javaeezly");


        List<NameValuePair> params = new ArrayList<>();

        params.add(new BasicNameValuePair("keys","java"));

        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf-8");

        // Set the form entity on the httpPost request
        httpPost.setEntity(formEntity);

        CloseableHttpResponse response = null;

        // 3. Send the request
        try {
            response = httpClient.execute(httpPost);

            // Treat a 200 status code as success
            if (response.getStatusLine().getStatusCode() == 200){

                // The first argument is the response entity (body)
                String content = EntityUtils.toString(response.getEntity(), "utf-8");

                // Write the content to a file
                // The fourth argument controls whether to append
                FileUtils.writeStringToFile(new File("D:\\itcast_post.html"),content,"utf-8",true);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }finally {
            if (response != null){
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

            if (httpClient != null){
                try {
                    httpClient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }


    }
}

Connection pool

package com.itheima;

import org.apache.commons.io.FileUtils;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

public class HttpClientTest {

    public static void main(String[] args) throws UnsupportedEncodingException {

        // Create the pooling connection manager
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();

        // Maximum total number of connections
        cm.setMaxTotal(100);

        // Maximum number of connections per host (route)
        cm.setDefaultMaxPerRoute(20);



        // 1. Create the HttpClient object backed by the connection pool
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();

        // 2. Create the HttpPost object with the target URL
        HttpPost httpPost = new HttpPost("http://java.itcast.cn/?javaeezly");

        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(1000)            // max time to establish the connection
                .setConnectionRequestTimeout(500)   // max time to obtain a connection from the pool
                .setSocketTimeout(10 * 1000)        // max time for data transfer
                .build();



        httpPost.setConfig(config);


        List<NameValuePair> params = new ArrayList<>();

        params.add(new BasicNameValuePair("keys","java"));

        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf-8");

        // Set the form entity on the httpPost request
        httpPost.setEntity(formEntity);

        CloseableHttpResponse response = null;

        // 3. Send the request
        try {
            response = httpClient.execute(httpPost);

            // Treat a 200 status code as success
            if (response.getStatusLine().getStatusCode() == 200){

                // The first argument is the response entity (body)
                String content = EntityUtils.toString(response.getEntity(), "utf-8");

                // Write the content to a file
                // The fourth argument controls whether to append
                FileUtils.writeStringToFile(new File("D:\\itcast_pool.html"),content,"utf-8",true);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }finally {
            if (response != null){
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
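
            // Note: only the response is closed here. The pooled httpClient is
            // deliberately not closed, because closing it would shut down the
            // whole connection pool.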

        }


    }
}

Parsing data with Jsoup

After the page has been fetched, it still needs to be parsed. We could parse it with string-processing utilities or with regular expressions, but both approaches carry a high development cost, so we use a library designed specifically for parsing HTML pages.

jsoup is a Java HTML parser that can parse HTML directly from a URL or from a string of HTML text.

The main features of jsoup are:

  1. Parse HTML from a URL, a file, or a string;
  2. Find and extract data using the DOM or CSS selectors;
  3. Manipulate HTML elements, attributes, and text.

The tests below parse HTML from a URL, from a string, and from a file:

package com.itheima;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

public class JsoupTest {

    // Parse from a URL
    @Test
    public void test1() throws IOException {
        Document document = Jsoup.parse(new URL("http://www.itcast.cn"), 10000);

        String title = document.getElementsByTag("title").text();
        System.out.println(title);
    }



    // Parse from a string
    @Test
    public void test2() throws IOException {

        // Read the file contents into a string with the utility class
        String content = FileUtils.readFileToString(new File("D:\\itcast.html"), "utf-8");

        Document document = Jsoup.parse(content);

        String title = document.getElementsByTag("title").text();
        System.out.println(title);
    }


    // Parse from a file
    @Test
    public void test3() throws IOException {


        Document document = Jsoup.parse(new File("D:\\itcast.html"),"utf-8");

        String title = document.getElementsByTag("title").text();
        System.out.println(title);
    }
}

DOM-style parsing and data extraction

The HTML to parse

<html>
 <head> 
  <title>传智播客官网-一样的教育,不一样的品质</title> 
 </head> 
 <body>
	<div class="city">
		<h3 id="city_bj" class="city_center">北京中心</h3>
		<fb:img src="/2018czgw/images/slogan.jpg" class="slogan"/>
		<div class="city_in">
			<div class="city_con" style="display: none;">
				<ul>
					<li id="test" class="class_a class_b">
						<a href="http://www.itcast.cn" target="_blank">
							<span class="s_name">北京</span>
						</a>
					</li>
					<li>
						<a href="http://sh.itcast.cn" target="_blank">
							<span class="s_name">上海</span>
						</a>
					</li>
					<li>
						<a href="http://gz.itcast.cn" target="_blank">
							<span abc="123" class="s_name">广州</span>
						</a>
					</li>
					<ul>
						<li>天津</li>
					</ul>					
				</ul>
			</div>
		</div>
	</div>
 </body>
</html>
package com.itheima;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.net.URL;

public class JsoupTest2 {

    @Test
    public void test1() throws IOException {
        Document document = Jsoup.parse(new File("D:\\itcast.html"), "utf-8");

        // Get elements

        // By id
        String cityBj = document.getElementById("city_bj").text();
        System.out.println(cityBj);

        // By tag name
        // Multiple matches: take the first with get(0), or use .first()
        String title = document.getElementsByTag("span").get(0).text();
        System.out.println(title);

        // By class
        String sName = document.getElementsByClass("s_name").text();
        System.out.println(sName);

        // By attribute, e.g. <span abc="123" class="s_name">广州</span>
        // class is also an attribute, but it usually matches too many elements to be useful this way
        String text = document.getElementsByAttribute("abc").text();
        System.out.println(text);
    }


    @Test
    public void test2() throws IOException {
        Document document = Jsoup.parse(new File("D:\\itcast.html"), "utf-8");

        Element cityBj = document.getElementById("city_bj");

        // Get the id from the element
        // city_bj
        String id = cityBj.id();
        System.out.println(id);

        // Get the class name from the element
        // className() returns all class names in one string; classNames() returns them as a de-duplicated set
        // city_center
        String s = cityBj.className();
        System.out.println(s);

        // Get the value of an attribute with attr()
        // city_center
        String aClass = cityBj.attr("class");
        System.out.println(aClass);

        // Get all of the element's attributes
        // id="city_bj" class="city_center"
        Attributes attributes = cityBj.attributes();
        System.out.println(attributes);

        String text = cityBj.text();

        // html() returns the element's inner HTML, not the element's own tag
        String html = cityBj.html();

    }

}

Getting elements with CSS selectors

package com.itheima;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;

import java.io.File;
import java.io.IOException;

public class JsoupTest3 {

    @Test
    public void test1() throws IOException {

        Document document = Jsoup.parse(new File("D:\\itcast.html"), "utf-8");

        // Selector

        // tagname: find elements by tag name, e.g. title
        String title = document.select("title").text();
        System.out.println(title);

        // another tag example: h3
        String h3 = document.select("h3").text();
        System.out.println(h3);

        // #id: find elements by id, e.g. #city_bj
        String text = document.select("#city_bj").text();
        System.out.println(text);

        // .class: find elements by class name, e.g. .city_center
        String text1 = document.select(".city_center").text();
        System.out.println(text1);

        // [attribute]: find elements by attribute, e.g. [abc]
        String abc = document.select("[abc]").text();
        System.out.println(abc);


        // [attr=value]: find elements by attribute value, e.g. [abc=123]
        String attrValue = document.select("[abc=123]").text();
        System.out.println(attrValue);


    }


    @Test
    public void test2() throws IOException {

        Document document = Jsoup.parse(new File("D:\\itcast.html"), "utf-8");

        // Combining selectors

        // el#id: element + id, e.g. h3#city_bj
        String text = document.select("h3#city_bj").text();
        System.out.println(text);

        // el.class: element + class, e.g. li.class_a
        String text1 = document.select("li.class_a").text();
        System.out.println(text1);

        // el[attr]: element + attribute name, e.g. span[abc]
        String text2 = document.select("span[abc]").text();
        System.out.println(text2);

        // any combination, e.g. span[abc].s_name
        String text3 = document.select("span[abc].s_name").text();
        System.out.println(text3);

        // ancestor child: find descendants of an element, e.g. .city_con li finds all li under .city_con
        String text4 = document.select(".city_con li").text();
        System.out.println(text4);

        // parent > child: find direct children of a parent, e.g. .city_con > ul > li
        String text5 = document.select(".city_con > ul > li").text();
        System.out.println(text5);

        // parent > *: find all direct children of a parent
        String text6 = document.select(".city_con > *").text();
        System.out.println(text6);


    }
}

Crawler case study: scraping JD phone listings

This case does not capture every piece of data you might want; it is simply an exercise in applying the material above.

pom

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<parent>
		<groupId>org.springframework.boot</groupId>
		<artifactId>spring-boot-starter-parent</artifactId>
		<version>2.6.3</version>
		<relativePath/> <!-- lookup parent from repository -->
	</parent>
	<groupId>com.itheima</groupId>
	<artifactId>crawler-jdphone</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<name>crawler-jdphone</name>
	<description>Demo project for Spring Boot</description>
	<properties>
		<java.version>1.8</java.version>
	</properties>
	<dependencies>
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-web</artifactId>
		</dependency>
		<dependency>
			<groupId>org.mybatis.spring.boot</groupId>
			<artifactId>mybatis-spring-boot-starter</artifactId>
			<version>2.2.1</version>
		</dependency>

		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
			<version>4.5.13</version>
		</dependency>
		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.11.2</version>
		</dependency>
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-lang3</artifactId>
			<version>3.9</version>
		</dependency>


		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-test</artifactId>
			<scope>test</scope>
		</dependency>
	</dependencies>

	<build>
		<plugins>
			<plugin>
				<groupId>org.springframework.boot</groupId>
				<artifactId>spring-boot-maven-plugin</artifactId>
			</plugin>
		</plugins>
	</build>

</project>

Create the database table

DROP TABLE if exists jd_item;

CREATE TABLE `jd_item` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `spu` bigint(15) DEFAULT NULL COMMENT 'product group (SPU) id',
  `sku` bigint(15) DEFAULT NULL COMMENT 'smallest sellable unit (SKU) id',
  `title` varchar(1000) DEFAULT NULL COMMENT 'product title',
  `price` float(10,0) DEFAULT NULL COMMENT 'product price',
  `pic` varchar(200) DEFAULT NULL COMMENT 'product image',
  `url` varchar(1500) DEFAULT NULL COMMENT 'product detail page url',
  `created` datetime DEFAULT NULL COMMENT 'creation time',
  `updated` datetime DEFAULT NULL COMMENT 'update time',
  PRIMARY KEY (`id`),
  KEY `sku` (`sku`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=217 DEFAULT CHARSET=utf8 COMMENT='JD products';
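
Domain class (JdItem)

The JdItem domain class used by the mapper and crawler below is not shown in the original. A minimal sketch matching the table above and the fields the crawler sets (the exact original class may differ):

package com.itheima.crawlerjdphone.domain;

import java.time.LocalDateTime;

// Domain object mapped to the jd_item table
public class JdItem {

    private Long id;               // primary key
    private Long spu;              // product group (SPU) id
    private Long sku;              // SKU id
    private String title;          // product title
    private Double price;          // product price
    private String pic;            // image file name
    private String url;            // product detail page url
    private LocalDateTime created; // creation time
    private LocalDateTime updated; // update time

    public Long getId() { return id; }
    public void setId(Long id) { this.id = id; }
    public Long getSpu() { return spu; }
    public void setSpu(Long spu) { this.spu = spu; }
    public Long getSku() { return sku; }
    public void setSku(Long sku) { this.sku = sku; }
    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public Double getPrice() { return price; }
    public void setPrice(Double price) { this.price = price; }
    public String getPic() { return pic; }
    public void setPic(String pic) { this.pic = pic; }
    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }
    public LocalDateTime getCreated() { return created; }
    public void setCreated(LocalDateTime created) { this.created = created; }
    public LocalDateTime getUpdated() { return updated; }
    public void setUpdated(LocalDateTime updated) { this.updated = updated; }
}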

application.yml

spring:
  datasource:
    driver-class-name: com.mysql.cj.jdbc.Driver
    url: jdbc:mysql://localhost:3306/crawler
    username: root
    password: root
mybatis:
  type-aliases-package: com.itheima.crawlerjdphone.domain

Spring Boot application class

package com.itheima.crawlerjdphone;

import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.mybatis.spring.annotation.MapperScan;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.annotation.Bean;

@SpringBootApplication
@MapperScan("com.itheima.crawlerjdphone.mapper")
public class CrawlerJdphoneApplication {

	public static void main(String[] args) {
		SpringApplication.run(CrawlerJdphoneApplication.class, args);
	}


	// Create the HttpClient pooling connection manager as a Spring bean
	@Bean
	public PoolingHttpClientConnectionManager createCm(){
		PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();

		cm.setMaxTotal(100);
		cm.setDefaultMaxPerRoute(20);

		return cm;
	}

}

HttpClient utility class

package com.itheima.crawlerjdphone.utils;

import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

@Component
public class HttpClientUtils {

    @Autowired
    private PoolingHttpClientConnectionManager cm;

    public CloseableHttpClient getHttpClient(){
        CloseableHttpClient httpClient = HttpClients.custom()
                .setConnectionManager(cm)
                .build();

        return httpClient;
    }

}

dao

package com.itheima.crawlerjdphone.mapper;

import com.itheima.crawlerjdphone.domain.JdItem;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Select;

import java.util.List;

public interface JdMapper {

    @Select("select * from jd_item")
    List<JdItem> findAll();

    @Insert({
            "<script>",
            "insert into jd_item(spu,sku,title,price,created,updated) values ",
            // collection 和 value对应   item代表循环中List的这一项IpMap
            "<foreach collection='jdItemList' item='item' separator=','>",
            "(#{item.spu}, #{item.sku}, #{item.title},#{item.price},#{item.created},#{item.updated})",
            "</foreach>",
            "</script>"
    })
    void insert(@Param("jdItemList") List<JdItem> jdItemList);
}

Crawler core class

package com.itheima.crawlerjdphone.crawler;

import com.itheima.crawlerjdphone.domain.JdItem;
import com.itheima.crawlerjdphone.mapper.JdMapper;
import com.itheima.crawlerjdphone.utils.HttpClientUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

// Core crawler implementation
@Component
public class JdCrawler {

    @Autowired
    private HttpClientUtils httpClientUtils;

    @Autowired
    private JdMapper jdMapper;

    String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&pvid=2047cee03dfe4b85b4d05bf4d30a27b6&s=1&click=0&page=";



    public void run() {

        CloseableHttpClient httpClient = httpClientUtils.getHttpClient();
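
        // The page parameter is stepped by 2 here: JD search results appear to
        // use odd page numbers for each results page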
        for (int i = 1; i < 10; i = i + 2) {

            HttpGet httpGet = new HttpGet(url + i);
            httpGet.setHeader("user-agent", " Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36");

            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);

                if (response.getStatusLine().getStatusCode() == 200) {
                    String content = EntityUtils.toString(response.getEntity(), "utf-8");

                    Document document = Jsoup.parse(content);

                    // Parse the page and extract the product data


                    Elements elements = document.select(".gl-warp.clearfix > *");

                    List<JdItem> jdItemList = new ArrayList<>();
                    for (Element element : elements) {
                        // data-spu may be missing; attr() then returns "", so default to 0
                        String spu = element.attr("data-spu");
                        if (spu.isEmpty()) {
                            spu = "0";
                        }
                        JdItem jdItem = new JdItem();
                        jdItem.setSpu(Long.valueOf(spu));

                        String sku = element.attr("data-sku");
                        jdItem.setSku(Long.valueOf(sku));

                        String price = element.select(".p-price i").text();
                        jdItem.setPrice(Double.valueOf(price));

                        String title = element.select(".p-name em").text();
                        jdItem.setTitle(title);

                        // Product image
                        String picUrl = "http:" + element.select("div.p-img img").first().attr("source-data-lazy-img");
                        // Download the image and return its file name
                        String pic = download(picUrl);
                        jdItem.setPic(pic);
                        // Product detail url, built by concatenating the sku
                        jdItem.setUrl("https://item.jd.com/" + jdItem.getSku() + ".html");

                        jdItem.setCreated(LocalDateTime.now());
                        jdItem.setUpdated(LocalDateTime.now());
                        jdItemList.add(jdItem);

                    }

                    jdMapper.insert(jdItemList);


                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    }


    private String download(String picUrl) {
        // Get an HttpClient from the pool
        CloseableHttpClient httpClient = httpClientUtils.getHttpClient();

        HttpGet httpGet = new HttpGet(picUrl);
        httpGet.setHeader("user-agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" +
                        " Chrome/73.0.3683.86 Safari/537.36");

        CloseableHttpResponse response = null;
        try {

            response = httpClient.execute(httpGet);

            if (response.getStatusLine().getStatusCode() == 200) {
                // Build a unique file name from a UUID plus the original extension
                String uuid = UUID.randomUUID().toString();
                String extName = picUrl.substring(picUrl.lastIndexOf("."));
                String picName = uuid + extName;

                // Write the image to disk (the D:\images directory must already exist)
                FileOutputStream outputStream = new FileOutputStream(new File("D:\\images\\" + picName));
                response.getEntity().writeTo(outputStream);
                outputStream.close();

                // Return the image file name
                return picName;
            }

        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (response != null) {
                    response.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }
}
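
Triggering the crawler

The original never shows how run() is invoked. One possible approach (an assumption, not part of the source) is to drive it with Spring's task scheduling: add @EnableScheduling to CrawlerJdphoneApplication and register a small scheduled task such as the sketch below.

package com.itheima.crawlerjdphone.crawler;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

// Periodically kicks off the crawler. Requires @EnableScheduling
// on the Spring Boot application class.
@Component
public class JdCrawlerTask {

    @Autowired
    private JdCrawler jdCrawler;

    // Start 5 seconds after boot, then wait 100 seconds after each run completes
    @Scheduled(initialDelay = 5 * 1000, fixedDelay = 100 * 1000)
    public void crawl() {
        jdCrawler.run();
    }
}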

