使用Java写个简单的爬虫:爬取图片并保存到本地

本文介绍了如何使用Java通过Jsoup库抓取豆瓣电影Top250的页面内容,利用多线程并发下载电影图片,包括引入依赖、连接网页、解析HTML以及处理下载任务。
摘要由CSDN通过智能技术生成

使用Java实现爬虫

话不多说,直接开干。

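        1.1引入依赖
        (注意:1.3 中的下载任务还导入了 org.springframework.util.StringUtils,若不是 Spring 项目需额外引入 spring-core 依赖;下面的 httpclient 在本文代码中并未直接使用。)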
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.8</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>
         1.2编写主要代码
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;



/**
 * Entry point of the crawler: walks the ten pages of the Douban Top 250 movie
 * list, extracts each poster image URL and hands it to a thread pool that
 * downloads the images in parallel.
 */
public class Test {
    /**
     * Crawls the list pages sequentially and submits one {@link DownloadRunnable}
     * per list entry that carries an image.
     *
     * @param args unused
     * @throws Exception if fetching or parsing one of the list pages fails
     */
    public static void main(String[] args) throws Exception {
        ThreadPoolExecutor executor = new ThreadPoolExecutor(20, 20, 1, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
        try {
            // Running rank of the movie, used as a prefix of the saved file name.
            int count = 1;

            for (int i = 0; i < 10; i++) {
                // Paging offset: every page lists 25 movies.
                int pageNum = i * 25;
                String url = "https://movie.douban.com/top250?start=" + pageNum + "&filter=";
                // Fetch and parse the page; a browser-like User-Agent is sent with the request.
                Document document = Jsoup.connect(url).header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0").get();
                // Every <li> below an element of class "article" is treated as one list entry.
                Elements name = document.getElementsByClass("article");
                for (Element element : name) {
                    Elements li = element.getElementsByTag("li");

                    for (Element element1 : li) {
                        Elements img = element1.getElementsByTag("img");
                        // File title: "<rank>-<alt text of the image>".
                        String title = count + "-" + img.attr("alt");
                        // Image address.
                        String attr = img.attr("src");
                        count++;
                        if (attr.isEmpty()) {
                            // No image in this entry: an empty URL would only make the task fail.
                            continue;
                        }
                        executor.submit(new DownloadRunnable(attr, "/Users/XXX/Desktop/img", title));
                    }

                }
            }
        } finally {
            // Always release the pool: its core threads are non-daemon and would keep
            // the JVM alive forever if a page fetch above threw. Already queued
            // downloads still run to completion after shutdown().
            executor.shutdown();
        }
    }

}
1.3 编写多线程下载任务
import org.springframework.util.StringUtils;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLDecoder;


/**
 * Task that downloads a single resource over HTTP(S) and stores it in a local
 * directory. Intended to be submitted to an executor; it can also be used
 * directly through {@link #download(String, String, String)}.
 */
public class DownloadRunnable implements Runnable{
    /** Maximum time to wait for the connection to be established, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 3000;
    /** Maximum time to wait for data on an open connection, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 30000;
    /** File name used when neither the response nor the URL yields a usable one. */
    private static final String FALLBACK_FILE_NAME = "download";

    private final String urlPath;
    private final String targetDirectory;
    private final String title;

    /**
     * @param urlPath         absolute URL of the resource to download
     * @param targetDirectory directory the file is written to (created if missing)
     * @param title           base name for the saved file; when null or empty the
     *                        name reported by the server or found in the URL is used
     */
    public DownloadRunnable(String urlPath, String targetDirectory, String title) {
        this.urlPath = urlPath;
        this.targetDirectory = targetDirectory;
        this.title = title;
    }

    /**
     * Runs the download configured in the constructor.
     *
     * @throws RuntimeException wrapping the original cause if the download fails
     */
    @Override
    public void run() {
        try {
            download(urlPath,targetDirectory,title);
        } catch (Exception e) {
            // When the task is started through ExecutorService.submit the exception
            // ends up inside a Future that callers may never inspect, so report it
            // here as well before rethrowing.
            System.err.println("download failed: " + urlPath + " -> " + e);
            throw new RuntimeException("download failed: " + urlPath, e);
        }
    }

    /**
     * Downloads {@code urlPath} into {@code targetDirectory}.
     *
     * @param urlPath         absolute HTTP(S) URL of the resource
     * @param targetDirectory directory to store the file in; created when it does not exist
     * @param title           optional base name of the saved file (the extension is
     *                        taken from the server-provided or URL-derived name)
     * @throws Exception if the URL is malformed, the target directory cannot be
     *                   created, or the transfer fails
     */
    public void download(String urlPath , String targetDirectory,String title) throws Exception {
        System.out.println("url:"+ urlPath);
        URL url = new URL(urlPath);
        HttpURLConnection http = (HttpURLConnection)url.openConnection();
        try {
            http.setConnectTimeout(CONNECT_TIMEOUT_MS);
            // Without a read timeout a stalled server would block the worker thread forever.
            http.setReadTimeout(READ_TIMEOUT_MS);
            // Send a browser-like User-Agent to avoid being rejected.
            http.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)");
            String contentType = http.getContentType();
            System.out.println("contentType: "+ contentType);
            // Size announced by the server (-1 when unknown).
            long length = http.getContentLengthLong();
            System.out.println("文件大小:"+(length / 1024)+"KB");

            String fileName = getFileName(http , urlPath,title);
            File target = new File(targetDirectory, fileName);
            File parent = target.getParentFile();
            if (parent != null && !parent.isDirectory() && !parent.mkdirs() && !parent.isDirectory()) {
                throw new IOException("cannot create target directory: " + parent);
            }

            byte[] buff = new byte[1024*10];
            int count = 0; // number of chunks read
            // try-with-resources closes both streams even when the transfer fails midway.
            try (InputStream inputStream = http.getInputStream();
                 OutputStream out = new FileOutputStream(target)) {
                int len;
                while((len = inputStream.read(buff)) != -1) {
                    out.write(buff, 0, len);
                    ++count;
                }
            }
            System.out.println("count:"+ count);
        } finally {
            http.disconnect();
        }
    }

    /**
     * Determines the local file name: the name from the Content-Disposition header
     * if present, otherwise the last path segment of the URL. When a title is given
     * it replaces the base name and only the extension of the detected name is kept.
     * The result never contains path separators.
     */
    private String getFileName(HttpURLConnection http , String urlPath,String title) throws UnsupportedEncodingException {
        String fileName = fileNameFromContentDisposition(http.getHeaderField("Content-Disposition"));
        if (fileName != null) {
            System.out.println("文件名是: "+ fileName);
        } else {
            fileName = fileNameFromUrl(urlPath);
            System.out.println("url中获取文件名:"+ fileName);
        }
        if (title != null && !title.isEmpty()){
            // Use the LAST dot so "a.b.jpg" yields ".jpg", and tolerate names without extension.
            int dot = fileName.lastIndexOf('.');
            boolean hasExtension = dot >= 0 && dot < fileName.length() - 1;
            fileName = hasExtension ? title + fileName.substring(dot) : title;
        }
        return sanitizeFileName(fileName);
    }

    /** Extracts the filename / filename* parameter of a Content-Disposition header, or null if absent. */
    private String fileNameFromContentDisposition(String headerField) throws UnsupportedEncodingException {
        if (headerField == null) {
            return null;
        }
        String decoded;
        try {
            decoded = URLDecoder.decode(headerField, "UTF-8");
        } catch (IllegalArgumentException e) {
            // A stray '%' is not valid percent-encoding; fall back to the raw header.
            decoded = headerField;
        }
        for (String part : decoded.split(";")) {
            int eq = part.indexOf('=');
            if (eq < 0) {
                continue; // e.g. the bare "attachment" / "inline" disposition type
            }
            String key = part.substring(0, eq).trim();
            String value = part.substring(eq + 1).trim().replace("\"", "");
            if (key.equalsIgnoreCase("filename*")) {
                // Extended form: charset'language'value
                int marker = value.indexOf("''");
                if (marker >= 0) {
                    value = value.substring(marker + 2);
                }
            } else if (!key.equalsIgnoreCase("filename")) {
                continue;
            }
            if (!value.isEmpty()) {
                return value;
            }
        }
        return null;
    }

    /** Returns the last path segment of the URL, ignoring any query string or fragment. */
    private String fileNameFromUrl(String urlPath) {
        String path = urlPath;
        int query = path.indexOf('?');
        if (query >= 0) {
            path = path.substring(0, query);
        }
        int fragment = path.indexOf('#');
        if (fragment >= 0) {
            path = path.substring(0, fragment);
        }
        String[] arr = path.split("/");
        return arr.length == 0 ? "" : arr[arr.length - 1];
    }

    /** Replaces path separators and control characters so the name cannot leave the target directory. */
    private String sanitizeFileName(String fileName) {
        String cleaned = fileName.replaceAll("[\\\\/\\p{Cntrl}]", "_").trim();
        if (cleaned.isEmpty() || ".".equals(cleaned) || "..".equals(cleaned)) {
            return FALLBACK_FILE_NAME;
        }
        return cleaned;
    }
}
1.4直接运行看效果

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值