Java爬虫网络图片

最新推荐文章于 2024-08-12 16:47:32 发布

此生0517

最新推荐文章于 2024-08-12 16:47:32 发布

阅读量693

点赞数 2

文章标签： java 爬虫 开发语言

本文链接：https://blog.csdn.net/mds0517/article/details/139694161

版权

可以根据图片 URL 将网络图片下载到本地。

package DYBZ;

import java.io.BufferedInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads images over HTTP(S): either a single image by URL, or every PNG referenced by
 * an {@code <img src="...">} tag in an HTML document.
 *
 * <p>All paths, the cookie and the user agent below are sample values and are expected to
 * be replaced for real use.
 */
public class ImageDownloader {

    /** Directory that {@link #download_Html_IMG(String)} writes into; it must already exist. */
    private static final String IMAGE_DIRECTORY = "E:\\MDS\\JavaWork\\Book2\\src\\main\\resources\\image\\";

    /**
     * Matches an {@code <img>} tag whose double-quoted {@code src} ends in {@code .png}
     * (case-insensitive); group 1 is the whole {@code src} value. May need adjusting to
     * the actual HTML structure being crawled.
     */
    private static final Pattern PNG_IMG_SRC =
            Pattern.compile("<img[^>]+src=\"([^\"]+\\.png)\"[^>]*>", Pattern.CASE_INSENSITIVE);

    // NOTE(review): this is a captured browser session cookie and it is sent to EVERY host
    // that downloadImage() contacts. Replace it with the cookie of the site being crawled,
    // or drop the header if the site does not need one.
    private static final String COOKIE = "PHPSESSID=jv5tbv9kid1i099l4r12qikls1; Hm_lvt_862e8e4f50ca4af123854e8434f8698a=1715242922; Hm_lvt_a0b498a32fffa7c376b36f470e5b5efa=1715242922; Hm_lpvt_862e8e4f50ca4af123854e8434f8698a=1715313174; Hm_lpvt_a0b498a32fffa7c376b36f470e5b5efa=1715313174; UNCLICKPOP_11_zzlm=22; cf_clearance=QfPUubAZCnQrv1cDkKmxdX_3QvW37VIrheOBBg2kPMk-1715390374-1.0.1.1-n2o4TEFDEpgkr6tAB2QXhQKJpMIAZjjWfykqOagGGOoXlWaNdrNiS1mo1fHKC0y9soAMrK56g6wmN0BdMjkbCw";

    private static final String USER_AGENT = "Mozilla/5.0 (Linux; Android 11; MI 6 Build/RQ3A.222001.001) AppleWebKit/558.32 (KHTML, like Gecko) Version/4.0 Chrome/114.0.5172.87 Mobile Safari/547.24";

    /** Without explicit timeouts HttpURLConnection waits forever on a stalled server. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10_000;
    private static final int READ_TIMEOUT_MILLIS = 30_000;

    /**
     * Sample entry point: downloads one hard-coded image to a hard-coded location.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String imageUrl = "https://tse4-mm.cn.bing.net/th/id/OIP-C.jb-OE259cWU7Y_29TRf7bAHaEK?w=329&h=185&c=7&r=0&o=5&pid=1.7"; // replace with the real image URL
        String destinationFilePath = "E:\\MDS\\JavaWork\\Book2\\src\\main\\resources\\image.png"; // replace with the real target path and file name
        downloadImage(imageUrl, destinationFilePath);
    }

    /**
     * Downloads the image at {@code imageUrl} and writes it to {@code destinationFilePath}.
     *
     * <p>The file is only written when the server answers {@code 200 OK}. This method never
     * throws: any failure is printed via {@link Exception#printStackTrace()}. Note that the
     * returned code reflects the HTTP response only; if writing the file fails after a
     * {@code 200} was received, {@code 200} is still returned.
     *
     * @param imageUrl            absolute http/https URL of the image
     * @param destinationFilePath path of the file to create or overwrite
     * @return the HTTP response code, or {@code 0} if no response was obtained
     *         (malformed URL, non-HTTP URL, connection failure, ...)
     */
    public static int downloadImage(String imageUrl, String destinationFilePath) {
        System.out.println("Downloading image from: " + imageUrl);
        System.out.println("Destination file path: " + destinationFilePath);
        int responseCode = 0;
        HttpURLConnection httpConn = null;
        try {
            URL url = new URL(imageUrl);
            httpConn = (HttpURLConnection) url.openConnection();
            httpConn.setRequestMethod("GET");
            httpConn.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            httpConn.setReadTimeout(READ_TIMEOUT_MILLIS);
            // Request headers (only needed when the target site demands them).
            httpConn.setRequestProperty("Cookie", COOKIE);
            httpConn.setRequestProperty("User-Agent", USER_AGENT);
            httpConn.connect();
            responseCode = httpConn.getResponseCode();
            if (responseCode == HttpURLConnection.HTTP_OK) {
                // try-with-resources closes both streams even when the copy fails midway.
                try (InputStream inputStream = new BufferedInputStream(httpConn.getInputStream());
                     FileOutputStream outputStream = new FileOutputStream(destinationFilePath)) {
                    byte[] buffer = new byte[4096];
                    int bytesRead;
                    while ((bytesRead = inputStream.read(buffer)) != -1) {
                        outputStream.write(buffer, 0, bytesRead);
                    }
                }
                System.out.println("Image downloaded successfully.");
            } else {
                System.out.println("GET request not worked");
            }
        } catch (Exception e) {
            // Deliberately broad: callers rely on this method reporting failure through the
            // return value instead of throwing (this also covers non-HTTP URLs, where the
            // cast above fails).
            e.printStackTrace();
        } finally {
            // Release the connection on every path, not only after a successful download.
            if (httpConn != null) {
                httpConn.disconnect();
            }
        }
        return responseCode;
    }

    /**
     * Downloads every PNG image referenced by an {@code <img src="...">} tag in {@code html}
     * into the image directory, printing {@code src:responseCode} for each one.
     *
     * <p>Each {@code src} value is passed to {@link #downloadImage(String, String)} as-is.
     * Images hosted on the crawled server may have a relative {@code src} without a domain;
     * those must be completed by the caller beforehand, otherwise the download reports
     * code {@code 0}.
     *
     * @param html the HTML text to scan
     * @throws Exception declared for backward compatibility with existing callers
     */
    public void download_Html_IMG(String html) throws Exception {
        Matcher matcher = PNG_IMG_SRC.matcher(html);
        List<String> imageSources = new ArrayList<>();
        while (matcher.find()) {
            // group(1) is the full src attribute value (URL or path), not just a file name.
            imageSources.add(matcher.group(1));
        }
        for (String imageUrl : imageSources) {
            // Save under the last path segment only: appending a whole URL such as
            // "https://host/dir/a.png" to the directory would produce an invalid path.
            String destinationFilePath = IMAGE_DIRECTORY + fileNameOf(imageUrl);

            int code = downloadImage(imageUrl, destinationFilePath);
            System.out.print(imageUrl + ":" + code + "\t");
        }
        System.out.println("\n");
    }

    /** Returns the part of {@code source} after its last {@code '/'}, or all of it if there is none. */
    private static String fileNameOf(String source) {
        int lastSlash = source.lastIndexOf('/');
        return lastSlash >= 0 ? source.substring(lastSlash + 1) : source;
    }

}