简单Java爬虫案例-HttpClient4.X+Jsoup爬取表情包

让线程再跑一会

已于 2023-03-31 08:27:46 修改

阅读量784

点赞数 2

分类专栏：Java爬虫 | 文章标签：java、网络爬虫、爬虫 | Powered by 金山文档

于 2023-03-23 19:36:02 首次发布

本文链接：https://blog.csdn.net/m0_64261982/article/details/129737334

版权

Java爬虫专栏收录该内容

4 篇文章 0 订阅

订阅专栏

🐔🐔🐔作为一名真爱粉怎么能没有ikun的表情包?🐔🐔🐔

🍓使用到的技术

HttpClient4.x 也就是 org.apache.http.xxx 这个版本

Jsoup 1.15.3

坤图来源：斗图王

🍒主要思路

借助HttpClient通过GET方法请求资源

将返回的实体对象转为源码(html)

使用Jsoup解析出源码中图片地址

下载图片到本地

🍎坑

报错 “Target host is not specified”：原因是请求的 URL 不完整。需要把资源地址写完整，URL 必须带上协议名称（http:// 或 https://）。

🍇爬取流程

查看网页结构及编码

查看编码

查看网页结构

查看结构发现，我们可以用css选择器轻松解析出来

即 #post_container > div > li > div > a > img

# 代表id选择器

> 代表父子层级关系（子元素选择器，只匹配直接子元素）

编写请求和返回实体的代码

尽可能打包成方法，方便代码复用

/**
 * Sends an HTTP GET request for {@code url} through the {@code httpClient}
 * declared elsewhere and hands back the entity of the response.
 *
 * @param url address to request
 * @return the entity obtained from the response via {@code getEntity()}
 * @throws IOException if executing the request fails
 */
public static HttpEntity getEntityByHttpGetMethod(String url) throws IOException {
        // Build the GET request and execute it in one step.
        HttpResponse response = httpClient.execute(new HttpGet(url));
        // The caller works with the response entity only.
        return response.getEntity();
    }

编写保存图片的代码

读写时使用 BufferedInputStream / BufferedOutputStream 做缓冲，可以减少底层 I/O 的调用次数，下载图片的效率会更好一些

/**
 * Downloads the resource at {@code url} and writes its bytes to {@code savePath}.
 *
 * @param url      address of the image to fetch
 * @param savePath path of the file to create or overwrite
 * @throws IOException if the request fails or the file cannot be written
 */
public static void saveImg(String url,String savePath) throws IOException {
        // try-with-resources closes both streams even when a read or write throws,
        // so a failed download no longer leaks the response stream or the file handle.
        // The response content is the INPUT side; the local file is the output side.
        try (BufferedInputStream inputStream =
                     new BufferedInputStream(getEntityByHttpGetMethod(url).getContent());
             BufferedOutputStream outputStream =
                     new BufferedOutputStream(new FileOutputStream(new File(savePath)))) {
            // Copy in chunks of at most 1 KB.
            byte[] buffer = new byte[1024];
            int len;
            while ((len = inputStream.read(buffer, 0, buffer.length)) != -1) {
                outputStream.write(buffer, 0, len);
            }
        }
    }

🍈完整代码

package com.ikun;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.URLDecoder;
import java.net.URLEncoder;

/**
 * Sample crawler built on HttpClient 4.x and Jsoup: fetches a search result
 * page, extracts the image addresses from it and downloads the images to a
 * local directory.
 */
public class CrawlerImg {
    /** Upper bound on the number of images downloaded in one run. */
    private static final int MAX_DOWNLOADS = 100;

    private static final HttpClient httpClient = HttpClients.custom().build();
    /** Directory prefix for downloaded images; the file name is appended directly to it. */
    private static final String savePath = "D:\\ikun\\";

    /**
     * Fetches the search page, selects the image elements and downloads up to
     * {@link #MAX_DOWNLOADS} of them.
     *
     * @param args unused
     * @throws IOException if the page or any image cannot be fetched or saved
     */
    public static void main(String[] args) throws IOException {
        String url = "https://www.doutuwang.com/?s=%E8%94%A1%E5%BE%90%E5%9D%A4";
        // Alternative source:
        //String url = "https://www.doutub.com/search/%E8%94%A1%E5%BE%90%E5%9D%A4/1";

        // Fetch the page and decode it as UTF-8.
        HttpEntity httpEntity = getEntityByHttpGetMethod(url);
        String html = EntityUtils.toString(httpEntity,"utf-8");

        // Passing the page URL as base URI lets Jsoup resolve relative image addresses.
        Document doc = Jsoup.parse(html, url);
        Elements elements = doc.select("#post_container > div > li > div > a > img");
        System.out.println("共发现资源数目 "+elements.size());

        int count = 0;
        for (Element element : elements){
            // Stop before starting a download beyond the limit, so that every
            // completed download (including the last one) is reported below.
            if (count == MAX_DOWNLOADS) {
                break;
            }
            count++;
            String picUrl = resolveImageUrl(element);
            System.out.println(picUrl);

            String extension = picUrl.contains("gif") ? ".gif" : ".jpg";
            saveImg(picUrl, savePath + "00" + count + extension);
            System.out.println("已下载到"+savePath);
        }
        System.out.println("共下载"+count+"张");
    }

    /**
     * Returns the absolute address of an image element. HttpClient rejects
     * addresses without a scheme ("Target host is not specified"), so the
     * {@code src} attribute is resolved against the document's base URI first;
     * the raw attribute value is used only when resolution yields nothing.
     */
    private static String resolveImageUrl(Element img) {
        String absolute = img.absUrl("src");
        return absolute.isEmpty() ? img.attr("src") : absolute;
    }

    /**
     * Sends an HTTP GET request for {@code url} and returns the response entity.
     *
     * @param url address to request
     * @return the entity obtained from the response via {@code getEntity()}
     * @throws IOException if executing the request fails
     */
    public static HttpEntity getEntityByHttpGetMethod(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        HttpResponse httpResponse = httpClient.execute(httpGet);
        return httpResponse.getEntity();
    }

    /**
     * Downloads the resource at {@code url} and writes its bytes to {@code savePath},
     * creating the parent directory when it does not exist yet.
     *
     * @param url      address of the image to fetch
     * @param savePath path of the file to create or overwrite
     * @throws IOException if the directory cannot be created, the request fails,
     *                     the response carries no body, or the file cannot be written
     */
    public static void saveImg(String url,String savePath) throws IOException {
        // Prepare the target directory before opening the response, so a failure
        // here leaves no open response stream behind.
        File target = new File(savePath);
        File parent = target.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
            throw new IOException("Cannot create directory " + parent);
        }

        HttpEntity entity = getEntityByHttpGetMethod(url);
        if (entity == null) {
            throw new IOException("No response body for " + url);
        }

        // try-with-resources closes both streams even when a read or write throws.
        try (BufferedInputStream inputStream = new BufferedInputStream(entity.getContent());
             BufferedOutputStream outputStream =
                     new BufferedOutputStream(new FileOutputStream(target))) {
            // Copy in chunks of at most 1 KB.
            byte[] buffer = new byte[1024];
            int len;
            while ((len = inputStream.read(buffer, 0, buffer.length)) != -1) {
                outputStream.write(buffer, 0, len);
            }
        }
    }
}