使用Java实现爬虫
话不多说，直接开干。
1.1 引入依赖
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.8</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
1.2 编写主要代码
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
/**
 * Entry point: crawls the Douban Top-250 movie list (10 pages of 25 entries),
 * extracts each movie's title and poster image URL, and hands the image
 * downloads to a fixed-size thread pool via {@link DownloadRunnable}.
 */
public class Test {

    /** Movies per result page on Douban's top-250 listing. */
    private static final int PAGE_SIZE = 25;
    /** Pages to crawl: 10 * 25 covers the full top 250. */
    private static final int PAGE_COUNT = 10;
    /** Target directory for downloaded poster images. */
    private static final String IMG_DIR = "/Users/XXX/Desktop/img";

    public static void main(String[] args) throws Exception {
        // Fixed pool of 20 workers; unbounded queue holds pending downloads.
        ThreadPoolExecutor executor = new ThreadPoolExecutor(
                20, 20, 1, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());

        int rank = 1; // running movie rank across all pages
        for (int page = 0; page < PAGE_COUNT; page++) {
            // Pagination via the "start" offset: 0, 25, 50, ...
            String url = "https://movie.douban.com/top250?start=" + (page * PAGE_SIZE) + "&filter=";
            // Fetch and parse the page (Jsoup returns a browser-like Document).
            // The User-Agent header avoids the site's basic bot filtering.
            Document document = Jsoup.connect(url)
                    .header("User-Agent",
                            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0")
                    .get();
            // Every <li> inside the "article" container is one movie entry.
            for (Element article : document.getElementsByClass("article")) {
                for (Element item : article.getElementsByTag("li")) {
                    // Title comes from the poster <img alt="...">; prefix with rank.
                    String title = rank + "-" + item.getElementsByTag("img").attr("alt");
                    String imgUrl = item.getElementsByTag("img").attr("src");
                    executor.submit(new DownloadRunnable(imgUrl, IMG_DIR, title));
                    rank++;
                }
            }
        }
        // Stop accepting new tasks, then wait for queued downloads to finish
        // so main() does not return while downloads are still in flight.
        executor.shutdown();
        executor.awaitTermination(10, TimeUnit.MINUTES);
    }
}
1.3 编写多线程下载任务
import org.springframework.util.StringUtils;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLDecoder;
/**
 * Runnable task that downloads a single file over HTTP into a target
 * directory, deriving the file name from the Content-Disposition header,
 * the URL path, or a caller-supplied title (in that order of preference).
 */
public class DownloadRunnable implements Runnable {

    private final String urlPath;          // source URL of the file to download
    private final String targetDirectory;  // destination directory on disk
    private final String title;            // optional base name override (may be null/empty)

    public DownloadRunnable(String urlPath, String targetDirectory, String title) {
        this.urlPath = urlPath;
        this.targetDirectory = targetDirectory;
        this.title = title;
    }

    @Override
    public void run() {
        try {
            download(urlPath, targetDirectory, title);
        } catch (Exception e) {
            // Include the URL so a failed task is traceable in the pool's logs.
            throw new RuntimeException("download failed: " + urlPath, e);
        }
    }

    /**
     * Downloads {@code urlPath} into {@code targetDirectory}.
     *
     * @param urlPath         source URL
     * @param targetDirectory destination directory (must exist)
     * @param title           optional base name for the saved file
     * @throws Exception on connection or I/O failure
     */
    public void download(String urlPath, String targetDirectory, String title) throws Exception {
        System.out.println("url:" + urlPath);
        URL url = new URL(urlPath);
        HttpURLConnection http = (HttpURLConnection) url.openConnection();
        http.setConnectTimeout(3000);
        // Also bound the read: without this a stalled server hangs the worker forever.
        http.setReadTimeout(10000);
        // Set a User-Agent to avoid being blocked by basic bot filtering.
        http.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)");
        try {
            String contentType = http.getContentType();
            System.out.println("contentType: " + contentType);
            // Reported size (-1 when the server omits Content-Length).
            long length = http.getContentLengthLong();
            System.out.println("文件大小:" + (length / 1024) + "KB");
            String fileName = getFileName(http, urlPath, title);

            int count = 0; // number of chunks copied
            // try-with-resources guarantees both streams close even if the copy fails.
            try (InputStream in = http.getInputStream();
                 OutputStream out = new FileOutputStream(new File(targetDirectory, fileName))) {
                byte[] buff = new byte[1024 * 10];
                int len;
                while ((len = in.read(buff)) != -1) {
                    out.write(buff, 0, len);
                    ++count;
                }
            }
            System.out.println("count:" + count);
        } finally {
            // Always release the connection, even when the download fails.
            http.disconnect();
        }
    }

    /**
     * Resolves the name to save the file under: Content-Disposition header
     * first, then the last URL path segment; if {@code title} is non-empty it
     * replaces the base name while keeping the original extension.
     */
    private String getFileName(HttpURLConnection http, String urlPath, String title)
            throws UnsupportedEncodingException {
        String headerField = http.getHeaderField("Content-Disposition");
        String fileName = null;
        if (headerField != null) {
            String decoded = URLDecoder.decode(headerField, "UTF-8");
            // Expected shape: attachment; filename="xxx" — guard against malformed headers.
            String[] parts = decoded.split(";");
            if (parts.length > 1 && parts[1].contains("=")) {
                fileName = parts[1].split("=")[1].replaceAll("\"", "");
                System.out.println("文件名是: " + fileName);
            }
        }
        if (fileName == null || fileName.isEmpty()) {
            // Fall back to the last segment of the URL path.
            String[] arr = urlPath.split("/");
            fileName = arr[arr.length - 1];
            System.out.println("url中获取文件名:" + fileName);
        }
        if (title != null && !title.isEmpty()) {
            // Keep the real extension (lastIndexOf handles multi-dot names and
            // no-dot names, which the old split("\\.")[1] approach crashed on).
            int dot = fileName.lastIndexOf('.');
            fileName = (dot >= 0) ? title + fileName.substring(dot) : title;
        }
        return fileName;
    }
}