刚开始学习 Java 爬虫,使用 Jsoup 和 HttpClient 写一个小 demo 进行入门学习。
1.引入依赖
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.6</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
2.编写保存图片实体类
/**
 * Value object describing one downloadable picture (its title and image URL).
 * <p>
 * NOTE: Lombok's {@code @Data} alone does not generate the two-argument
 * constructor that the downloader calls ({@code new Picture(title, url)}),
 * which would fail to compile. The accessors, constructors and the
 * equals/hashCode/toString contract {@code @Data} would have produced are
 * therefore written out explicitly.
 */
public class Picture {

    /** File name the image is saved under (page title + file extension). */
    private String title;
    /** Absolute URL of the full-size image. */
    private String url;

    public Picture() {
    }

    public Picture(String title, String url) {
        this.title = title;
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof Picture)) {
            return false;
        }
        Picture other = (Picture) o;
        return Objects.equals(title, other.title) && Objects.equals(url, other.url);
    }

    @Override
    public int hashCode() {
        return Objects.hash(title, url);
    }

    @Override
    public String toString() {
        return "Picture(title=" + title + ", url=" + url + ")";
    }
}
3.实现方法
package com.teset.demo.utils;
import com.teset.demo.entity.Picture;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.*;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.stream.Collectors;
/**
 * Wallpaper crawler: walks paginated category listings on the target site,
 * opens each picture's detail page, and streams the full-size image to disk
 * under {@code D:/bizhi/<category>/}.
 *
 * <p>Not thread-safe: {@code referer}, {@code rootPath} and {@code picUrl}
 * are per-category mutable state shared across methods.
 *
 * @Author Administrator
 * @Date 2021/7/30 14:57
 */
public class BizhiDownloader {

    private CloseableHttpClient httpClient;
    /** Category listing URL currently being crawled; sent as the Referer header (anti-hotlinking). */
    private String referer;
    /** Directory the current category's images are written to. */
    private String rootPath;
    /** Site root (scheme + host, no trailing '/'); detail hrefs and img srcs are host-absolute paths. */
    private String picUrl;

    public BizhiDownloader() {
        httpClient = HttpClientUtil.getHttpClient();
    }

    /**
     * Crawls each category URL: creates a local directory for it, then walks
     * the first 10 listing pages and downloads every picture found.
     *
     * @param urlList category listing URLs, each expected to end with '/'
     */
    public void start(List<String> urlList) {
        urlList.forEach(url -> {
            this.referer = url;
            // Directory name = last path segment of the URL. (The previous
            // hard-coded substring(22, len-1) produced names containing '/',
            // e.g. "m/4kmeinv", so mkdir() silently failed.)
            String trimmed = url.endsWith("/") ? url.substring(0, url.length() - 1) : url;
            String dirName = trimmed.substring(trimmed.lastIndexOf('/') + 1);
            File path = new File("D:/bizhi/", dirName);
            if (!path.exists()) {
                // mkdirs: also creates D:/bizhi itself when missing.
                path.mkdirs();
            }
            rootPath = path.toString();
            // Site root (scheme + host): detail hrefs like "/tupian/x.html"
            // are host-absolute, so they must be resolved against the host,
            // not against the category URL (which produced ".../4kmeinv//tupian/...").
            int pathStart = url.indexOf('/', url.indexOf("//") + 2);
            picUrl = pathStart > 0 ? url.substring(0, pathStart) : url;
            // Only fetch the first few listing pages; page 1 has no "index_" suffix.
            for (int i = 1; i <= 10; i++) {
                if (i == 1) {
                    this.page(url);
                } else {
                    this.page(url + "index_" + i + ".html");
                }
            }
        });
    }

    /**
     * Fetches one listing page and downloads every picture it links to.
     *
     * @param url absolute URL of the listing page
     */
    public void page(String url) {
        System.out.println("url:" + url);
        String html = this.getHtml(url);
        Map<String, String> picMap = this.extractTitleUrl(html);
        if (picMap == null) {
            return;
        }
        this.getPictureHtml(picMap);
    }

    /**
     * Visits each picture's detail page and downloads its full-size image,
     * pausing between pictures to throttle the crawl.
     *
     * @param picMap title -> host-absolute detail-page href
     */
    public void getPictureHtml(Map<String, String> picMap) {
        picMap.forEach((title, href) -> {
            String html = this.getHtml(picUrl + href);
            if (html == null) {
                // Fetch failed; skip this picture.
                return;
            }
            Picture picture = this.extractPictureUrl(html);
            if (picture == null) {
                // Page layout not as expected; skip rather than NPE.
                return;
            }
            System.out.println("开始下载");
            this.download(picture);
            try {
                // Be polite to the server: half-second pause per picture.
                Thread.sleep(500);
                System.out.println("爬取完一张图片,休息0.5秒。");
            } catch (InterruptedException e) {
                // Restore the interrupt flag instead of swallowing it.
                Thread.currentThread().interrupt();
            }
        });
    }

    /**
     * Parses a detail page into a {@link Picture}: the H1 text plus the
     * image's file extension becomes the file name; the img src (host-absolute)
     * is resolved against the site root.
     *
     * @return the picture, or null when the expected elements are missing
     */
    private Picture extractPictureUrl(String html) {
        // Note: the second argument of Jsoup.parse(String, String) is a base
        // URI, not a charset — the old code passed "GBK" there by mistake.
        // The html String is already decoded, so no charset is needed here.
        Document doc = Jsoup.parse(html);
        Elements h1s = doc.getElementsByTag("H1");
        Elements imgBoxes = doc.getElementsByAttributeValue("id", "img");
        if (h1s.isEmpty() || imgBoxes.isEmpty()) {
            return null;
        }
        String src = imgBoxes.first().getElementsByTag("img").attr("src");
        int dot = src.lastIndexOf('.');
        // Append the image's file extension to the title to form the file name.
        String title = h1s.first().text() + (dot >= 0 ? src.substring(dot) : "");
        return new Picture(title, picUrl + src);
    }

    /**
     * GETs a page and returns its body decoded as GBK.
     *
     * @param url absolute URL to fetch
     * @return the page HTML, or null when the status is not 200 or an I/O error occurs
     */
    private String getHtml(String url) {
        String html = null;
        HttpGet get = new HttpGet(url);
        get.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36");
        get.setHeader("referer", url);
        try (CloseableHttpResponse response = httpClient.execute(get)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == 200) {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    // The site serves GBK-encoded pages.
                    html = EntityUtils.toString(entity, "GBK");
                }
            } else {
                System.out.println(statusCode);
            }
        } catch (IOException e) {
            // ClientProtocolException extends IOException; one catch suffices.
            e.printStackTrace();
        }
        return html;
    }

    /**
     * Extracts title -> detail-page href pairs from a listing page.
     *
     * @param html listing page HTML, may be null
     * @return map of picture title (img alt text) to href, or null when html is null
     */
    private Map<String, String> extractTitleUrl(String html) {
        if (html == null) {
            return null;
        }
        // (Old code passed "UTF-8" as the base-URI argument — meaningless here.)
        Document doc = Jsoup.parse(html);
        Elements pictures = doc.select("ul.clearfix > li");
        // First <a> of each <li> carries both the thumbnail and the detail link.
        Elements anchors = pictures.stream()
                .map(pic -> pic.getElementsByTag("a").first())
                .collect(Collectors.toCollection(Elements::new));
        // Merge function keeps the first entry: duplicate alt texts would
        // otherwise make Collectors.toMap throw IllegalStateException.
        return anchors.stream().collect(Collectors.toMap(
                pic -> pic.getElementsByTag("img").first().attr("alt"),
                pic -> pic.attr("href"),
                (first, second) -> first));
    }

    /**
     * Downloads one image to {@code rootPath/title}, rotating the User-Agent
     * and sending the category page as Referer to pass the anti-hotlink check.
     *
     * @param picture title (file name) and absolute image URL
     */
    public void download(Picture picture) {
        HttpGet get = new HttpGet(picture.getUrl());
        Random rand = new Random();
        get.setHeader("User-Agent", HeaderUtil.headers[rand.nextInt(HeaderUtil.headers.length)]);
        get.setHeader("referer", referer);
        System.out.println(referer);
        HttpEntity entity = null;
        try (CloseableHttpResponse response = httpClient.execute(get)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == 200) {
                entity = response.getEntity();
                if (entity != null) {
                    File picFile = new File(rootPath, picture.getTitle());
                    try (OutputStream out = new BufferedOutputStream(new FileOutputStream(picFile))) {
                        entity.writeTo(out);
                        System.out.println("下载完毕:" + picFile.getAbsolutePath());
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                // Fully consume the entity so the pooled connection can be reused.
                EntityUtils.consume(entity);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
4.启动
public static void main(String[] args) {
    // Seed the crawl queue with the category pages to visit.
    List<String> urlList = new ArrayList<>();
    urlList.add("https://pic.netbian.com/");
    urlList.add("https://pic.netbian.com/4kmeinv/");
    urlList.add("https://pic.netbian.com/4kdongman/");
    // Launch the crawler over the initial queue.
    new BizhiDownloader().start(urlList);
}