从游民星空爬取每周壁纸

荒江钓叟

于 2019-09-22 15:11:26 发布

阅读量2.1k

点赞数 2

分类专栏：爬虫文章标签：爬虫

本文链接：https://blog.csdn.net/u010663919/article/details/101158211

版权

爬虫专栏收录该内容

1 篇文章 0 订阅

订阅专栏

从游民星空爬取每周壁纸

从游民星空爬取每周壁纸

从游民星空爬取每周壁纸

我比较喜欢游民星空的每周壁纸，但一张一张点击保存比较费劲。刚好最近在学习爬虫，所以就用 HttpClient 写了一个小爬虫来批量爬取每周壁纸。有些地方可能写得比较笨拙，如果哪位码友有好的建议，欢迎留言。

1，新建一个Maven项目

pom.xml文件引用jar包如下：

<!-- Dependencies of the crawler project. The classes listed in this article import
     httpclient, fastjson and commons-lang; jsoup, guava and javatuples are declared
     here but are not referenced by any of the code shown. -->
<dependencies>
        <!-- Apache HttpClient: HttpGet/HttpPost, CloseableHttpClient, EntityUtils. -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.8</version>
        </dependency>
        <!-- jsoup: not imported by the classes shown in this article. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>
        <!-- fastjson: used only for JSON.toJSONString when building the list request. -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.24</version>
        </dependency>
        <!-- commons-lang 2.x: provides org.apache.commons.lang.StringUtils. -->
        <dependency>
            <groupId>commons-lang</groupId>
            <artifactId>commons-lang</artifactId>
            <version>2.6</version>
        </dependency>
        <!-- guava: not imported by the classes shown in this article. -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>27.0.1-jre</version>
        </dependency>
        <!-- javatuples: not imported by the classes shown; the spider imports
             javafx.util.Pair instead. NOTE(review): confirm which Pair is intended. -->
        <dependency>
            <groupId>org.javatuples</groupId>
            <artifactId>javatuples</artifactId>
            <version>1.2</version>
        </dependency>
    </dependencies>

2，HttpClientUtils工具类

package com.demo.utils;

import com.alibaba.fastjson.JSON;
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;

import static java.util.regex.Pattern.compile;    

public class HttpClientUtils {

    public static HttpPost getHttpPost(String url) {
        HttpPost httpPost = new HttpPost(url);
        String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0";
        httpPost.setHeader(new BasicHeader("Accept-Language", "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2"));
        httpPost.setHeader(new BasicHeader("User-Agent", userAgent));
        return httpPost;
    }

    public static HttpGet getHttpGET(String url) {
        HttpGet httpGet = new HttpGet(url);
        String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0";
        httpGet.setHeader(new BasicHeader("Accept-Language", "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2"));
        httpGet.setHeader(new BasicHeader("User-Agent", userAgent));
        return httpGet;
    }




    public static String getContent(HttpPost httpPost) throws IOException {
        String context = "";
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpResponse httpResponse = httpClient.execute(httpPost);
        HttpEntity entity = httpResponse.getEntity();
        if (entity != null) {
            context = EntityUtils.toString(entity, "utf-8");
        }
        EntityUtils.consume(entity);
        httpClient.close();
        return context;
    }
}`在这里插入代码片`

2，DownLoadUtils 工具类

package com.demo.utils;

import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;


/**
 * Downloads a single remote resource to a local file.
 */
public class DownLoadUtils {

    /**
     * Fetches {@code url} with a GET request and writes the response body to
     * {@code path/name}.
     *
     * <p>If {@code url} contains {@code '?'}, only the part after the last
     * {@code '?'} is requested. The crawler passes viewer links of the form
     * {@code .../showimage/id_gamersky.shtml?http://img1.../picture.jpg}, where
     * the real image address follows the question mark.
     *
     * <p>NOTE(review): the HTTP status code is not checked, so an error page
     * returned by the server would be saved under the image name.
     *
     * @param url  resource URL, optionally wrapped in a viewer link as described above
     * @param path directory to write into; created if it does not exist
     * @param name file name to use; when blank, the last path segment of the
     *             effective URL is used
     * @throws Exception if the request fails or the file cannot be written
     */
    public static void download(String url, String path, String name) throws Exception {
        String[] queryParts = url.split("\\?");
        url = queryParts[queryParts.length - 1];

        if (StringUtils.isBlank(name)) {
            String[] segments = url.split("/");
            name = segments[segments.length - 1];
        }

        File target = new File(path, name);
        File directory = target.getParentFile();
        if (directory != null && !directory.isDirectory()) {
            // Best effort: if creation fails, opening the FileOutputStream below
            // reports the problem with a FileNotFoundException.
            directory.mkdirs();
        }

        HttpGet httpGET = HttpClientUtils.getHttpGET(url);
        // try-with-resources closes the client, the response stream and the file
        // on every path, including when read() or write() throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            HttpResponse httpResponse = httpClient.execute(httpGET);
            HttpEntity entity = httpResponse.getEntity();
            if (entity == null) {
                return;
            }
            try (InputStream inputStream = entity.getContent();
                 FileOutputStream fout = new FileOutputStream(target)) {
                byte[] buffer = new byte[8192];
                int len;
                while ((len = inputStream.read(buffer)) != -1) {
                    fout.write(buffer, 0, len);
                }
            } finally {
                EntityUtils.consume(entity);
            }
        }
    }
}

3，GameSkySpider爬虫

package com.demo.spider;

import com.alibaba.fastjson.JSON;
import com.zcl.utils.DownLoadUtils;
import com.zcl.utils.HttpClientUtils;
import javafx.util.Pair;
import org.apache.commons.lang.StringUtils;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.message.BasicNameValuePair;

import java.io.File;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.util.regex.Pattern.compile;


/**
 * Crawler for the weekly wallpaper column on gamersky.com.
 *
 * <p>Flow: request one page of the column listing ({@link #page}), extract the
 * article links and the total number of listing pages ({@link #items}), fetch
 * every page of every article ({@link #item}, {@link #itemMaxPageSize}),
 * extract the full-size image links ({@link #image}) and download them.
 */
public class GameSkySpider {

    /**
     * Callback name sent with the listing request.
     * NOTE(review): presumably captured from a real browser request; confirm it
     * does not need to be regenerated.
     */
    private static final String JSONP_CALLBACK = "jQuery183007489476026424047_1569080090816";

    /** File extension of article pages; page N of an article is {@code <id>_N.shtml}. */
    private static final String PAGE_SUFFIX = ".shtml";

    /** Matches an anchor from {@code <a href=} up to the following {@code target=}. */
    private static final Pattern ITEM_LINK = compile("<a href=.*?target=");

    /** Captures the number that follows the {@code "totalPages"} key. */
    private static final Pattern TOTAL_PAGES = compile("\"totalPages\":\\s*(\\d+)");

    /** Image anchor written as {@code <a target="_blank" href="...">}. */
    private static final Pattern IMAGE_LINK_TARGET_FIRST =
            compile("<a target=\"_blank\" href=\".*?><img class=");

    /** Image anchor written as {@code <a href="..." target="_blank">}. */
    private static final Pattern IMAGE_LINK_HREF_FIRST =
            compile("<a href=.*? target=\"_blank\"><img class");

    /**
     * Requests one page of the wallpaper column listing.
     *
     * <p>The browser issues the equivalent request as
     * <pre>
     * https://db2.gamersky.com/LabelJsonpAjax.aspx?callback=jQuery183007489476026424047_1569080090816&amp;jsondata=
     * {@code {"type":"updatenodelabel","isCache":true,"cacheTime":600,"nodeId":"20117","isNodeId":"true","page":6}}&amp;_=
     * </pre>
     * This method sends the same three parameters as a UTF-8 form-encoded POST.
     *
     * @param url  address of the listing endpoint
     * @param page 1-based listing page to request
     * @return the raw response body
     * @throws Exception if the request fails
     */
    public static String page(String url, int page) throws Exception {
        Map<String, Object> query = new HashMap<>();
        query.put("type", "updatenodelabel");
        query.put("isCache", true);
        query.put("cacheTime", 600);
        query.put("nodeId", "20117");
        query.put("isNodeId", "true");
        query.put("page", page);

        List<NameValuePair> form = new ArrayList<>();
        form.add(new BasicNameValuePair("jsondata", JSON.toJSONString(query)));
        form.add(new BasicNameValuePair("callback", JSONP_CALLBACK));
        form.add(new BasicNameValuePair("_", System.currentTimeMillis() + ""));

        HttpPost httpPost = HttpClientUtils.getHttpPost(url);
        httpPost.setEntity(new UrlEncodedFormEntity(form, "utf-8"));
        return HttpClientUtils.getContent(httpPost);
    }

    /**
     * Extracts the article links and the total listing page count from a
     * listing response.
     *
     * <p>The response is a JSONP call whose {@code body} field holds escaped
     * HTML, for example (shortened):
     * <pre>
     * jQuery...({"status":"ok","totalPages":29,"body":"...
     *   &lt;div class=\"img\"&gt;&lt;a href=\"https://www.gamersky.com/ent/201803/1028766.shtml\" target=\"_blank\"&gt;
     *   ..."})
     * </pre>
     * The literal escape sequences for CR, LF and TAB and all remaining
     * backslashes are stripped before matching.
     *
     * @param context raw listing response; may be null or blank
     * @return a pair of (total pages, distinct article URLs ending in
     *         {@code shtml}, in order of first appearance); (0, empty list)
     *         when {@code context} is null or blank
     */
    public static Pair<Integer, List<String>> items(String context) {
        if (StringUtils.isBlank(context)) {
            // The original passed null straight to the matcher and threw NPE.
            return new Pair<Integer, List<String>>(0, new ArrayList<String>());
        }
        String html = context.replace("\\r", "").replace("\\n", "").replace("\\t", "").replace("\\", "");

        // LinkedHashSet: de-duplicates (each article is linked from both its
        // thumbnail and its title) while keeping the listing order.
        Set<String> links = new LinkedHashSet<>();
        Matcher linkMatcher = ITEM_LINK.matcher(html);
        while (linkMatcher.find()) {
            String[] parts = linkMatcher.group().split("\"");
            if (parts.length > 1 && parts[1].endsWith("shtml")) {
                links.add(parts[1]);
            }
        }

        int totalPage = 0;
        Matcher totalMatcher = TOTAL_PAGES.matcher(html);
        while (totalMatcher.find()) {
            totalPage = Integer.parseInt(totalMatcher.group(1));
        }

        return new Pair<Integer, List<String>>(totalPage, new ArrayList<String>(links));
    }

    /**
     * Fetches the HTML of one article page.
     *
     * @param url address of the article page
     * @return the response body
     * @throws Exception if the request fails
     */
    public static String item(String url) throws Exception {
        return HttpClientUtils.getContent(HttpClientUtils.getHttpPost(url));
    }

    /**
     * Determines how many pages an article has by scanning its pagination links.
     *
     * <p>Pagination markup looks like
     * <pre>{@code
     * <a href="https://www.gamersky.com/ent/201801/1002394.shtml">1</a>
     * <a href="https://www.gamersky.com/ent/201801/1002394_2.shtml">2</a>
     * <a href="https://www.gamersky.com/ent/201801/1002394_3.shtml">3</a>
     * }</pre>
     * The links are looked up first with {@code https} replaced by {@code http}
     * and, if none are found, with the URL as given.
     *
     * @param content HTML of the article's first page
     * @param url     address of the article's first page
     * @return the largest page number found, or 0 when the page has no
     *         pagination links or an argument is null
     */
    public static int itemMaxPageSize(String content, String url) {
        if (content == null || url == null) {
            return 0;
        }
        String base = url.replace(PAGE_SUFFIX, "");
        int httpMax = getMax(content, base.replace("https", "http"));
        return httpMax == 0 ? getMax(content, base) : httpMax;
    }

    /**
     * Returns the largest N for which {@code content} contains a link
     * {@code <a href="newUrl_N.shtml">}, or 0 if there is none.
     */
    private static int getMax(String content, String newUrl) {
        // Pattern.quote: the URL contains '.', which must match literally.
        Matcher matcher = compile("<a href=\"" + Pattern.quote(newUrl) + "_(\\d+)\\.shtml\">").matcher(content);
        int max = 0;
        while (matcher.find()) {
            max = Math.max(max, Integer.parseInt(matcher.group(1)));
        }
        return max;
    }

    /**
     * Extracts the full-size image links from one article page.
     *
     * <p>Each picture is wrapped in an anchor such as
     * <pre>{@code
     * <a href="http://www.gamersky.com/showimage/id_gamersky.shtml?http://img1.gamersky.com/.../gamersky_01origin_01_20181131846728.jpg"
     *    target="_blank"><img class="picact" alt="..." src="http://img1.gamersky.com/.../gamersky_01small_02_20181131846AD1.jpg" border="0"></a>
     * }</pre>
     * The anchor's {@code href} is collected, not the thumbnail {@code src}.
     * Anchors written as {@code <a target="_blank" href="...">} are tried first;
     * the {@code <a href="..." target="_blank">} form is used only when the
     * first form yields nothing.
     *
     * @param context HTML of an article page; may be null
     * @return distinct links ending in {@code .jpg}, in order of appearance;
     *         empty when none are found or {@code context} is null
     */
    public static List<String> image(String context) {
        if (context == null) {
            return new ArrayList<>();
        }
        // Splitting the match on '"' puts the href value at index 3 for the
        // target-first form and at index 1 for the href-first form.
        Set<String> found = collectJpgLinks(IMAGE_LINK_TARGET_FIRST.matcher(context), 3);
        if (found.isEmpty()) {
            found = collectJpgLinks(IMAGE_LINK_HREF_FIRST.matcher(context), 1);
        }
        return new ArrayList<>(found);
    }

    /** Collects the quote-delimited field {@code hrefIndex} of every match that ends in {@code .jpg}. */
    private static Set<String> collectJpgLinks(Matcher matcher, int hrefIndex) {
        Set<String> links = new LinkedHashSet<>();
        while (matcher.find()) {
            String[] parts = matcher.group().split("\"");
            if (parts.length > hrefIndex && parts[hrefIndex].endsWith(".jpg")) {
                links.add(parts[hrefIndex]);
            }
        }
        return links;
    }

    /**
     * Crawls every listing page of the column and downloads all wallpapers
     * whose file name is not already present in the download directory.
     *
     * @throws Exception if any request or file write fails
     */
    public static void spider() throws Exception {
        String path = "D:\\迅雷下载\\游民星空";
        String url = "https://db2.gamersky.com/LabelJsonpAjax.aspx";

        // Seed the "seen" set with the names of files that are already on disk.
        Set<String> set = new HashSet<>();
        File[] files = new File(path).listFiles();
        if (null != files) {
            for (File fileImage : files) {
                set.add(fileImage.getName());
            }
        }

        int imageNum = 1;
        Pair<Integer, List<String>> pair = items(page(url, 1));
        imageNum = downLoad(path, set, imageNum, 1, pair.getValue());
        Integer key = pair.getKey();
        for (int i = 2; i <= key; i++) {
            List<String> itemsUrl = items(page(url, i)).getValue();
            imageNum = downLoad(path, set, imageNum, i, itemsUrl);
        }
    }

    /**
     * Downloads every not-yet-seen image of every not-yet-seen article in
     * {@code itemsUrl}.
     *
     * @param path     download directory
     * @param set      names already handled: article URLs visited in this run
     *                 and image file names (existing on disk or downloaded in
     *                 this run); updated in place
     * @param imageNum running counter used in the progress output
     * @param i        listing page number, used in the progress output
     * @param itemsUrl article URLs found on that listing page; may be null
     * @return the updated image counter
     */
    private static int downLoad(String path, Set<String> set, int imageNum, int i, List<String> itemsUrl) throws Exception {
        if (null != itemsUrl) {
            for (String itemUrl : itemsUrl) {
                if (!set.add(itemUrl)) {
                    continue;
                }
                String firstPage = item(itemUrl);
                // An article without pagination links reports 0 pages but still
                // has its first page, so always process at least one.
                int pageCount = Math.max(1, itemMaxPageSize(firstPage, itemUrl));
                for (int j = 1; j <= pageCount; j++) {
                    // Fetch page j itself; the original re-parsed the first
                    // page on every iteration and never reached pages 2..N.
                    String pageContent = j == 1 ? firstPage : item(pageUrl(itemUrl, j));
                    for (String imageUrl : image(pageContent)) {
                        // Compare by file name, because that is what the set
                        // was seeded with from the download directory.
                        String fileName = imageUrl.substring(imageUrl.lastIndexOf('/') + 1);
                        if (!set.add(fileName)) {
                            continue;
                        }
                        DownLoadUtils.download(imageUrl, path, null);
                        System.out.println("下载第" + imageNum + "张");
                        imageNum++;
                    }
                }
            }
        }
        System.out.println("下载第" + i + "页");
        return imageNum;
    }

    /** Builds the address of page {@code pageNo} of an article: {@code x.shtml} becomes {@code x_N.shtml}. */
    private static String pageUrl(String itemUrl, int pageNo) {
        int suffixAt = itemUrl.lastIndexOf(PAGE_SUFFIX);
        if (pageNo <= 1 || suffixAt < 0) {
            return itemUrl;
        }
        return itemUrl.substring(0, suffixAt) + "_" + pageNo + itemUrl.substring(suffixAt);
    }

    public static void main(String[] args) throws Exception {
        spider();
    }
}