使用 Java 抓取某网站 AE 模板的链接地址

所用maven依赖:

  <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>4.5.15</version>
        </dependency>
        <dependency>
            <groupId>cn.wanghaomiao</groupId>
            <artifactId>JsoupXpath</artifactId>
            <version>0.1.1</version>
        </dependency>

代码示例:




package com.xiaoyun;
import cn.hutool.http.HttpUtil;
import cn.wanghaomiao.xpath.exception.NoSuchAxisException;
import cn.wanghaomiao.xpath.exception.NoSuchFunctionException;
import cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException;
import cn.wanghaomiao.xpath.model.JXDocument;

import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class TestXpath {
    //目标页面
    private static String html = "https://www.newcger.com/aemoban/list_1.html";
    //最大页码
    private static String pageMaxNum = "";

    private static List<String> listHtml = new LinkedList<>();


    public static void main(String[] args) {
        long startTime = System.currentTimeMillis();

        getPanDown();
        //4066 - parallel
        //16610 - No parallel
        System.out.println("所用时间= " + (System.currentTimeMillis() - startTime));

    }

    private static void getPanDown() {
        //初始化获取最大页码
        init(html);

        StringBuilder sb = new StringBuilder("https://www.newcger.com/aemoban/list_");
        //获取所有页
//        int pageNum = Integer.parseInt(pageMaxNum);
        //获取前5页
        int pageNum = 5;

        for (int i = 1; i <= pageNum; i++) {
            String sHtml = sb.append(i).append(".html").toString();
            //获取所有list所有页面
            listHtml.add(sHtml);
        }

        listHtml.stream().parallel().forEach(html -> {
            String xpath = "//a[@class='tit']/@href";
            String xpathText = "//a[@class='tit']/text()";
            List<Object> nodeHtmlList = getHttpFormXpath(html, xpath);
            List<String> list = nodeHtmlList.stream().parallel().map(nodeHtml -> {
                String panDown = "//div[@class='fd_div']/ul//li/a/@href";
                String panDownPassword = "//div[@class='fd_div']/ul//li/a/text()";
                //获取网盘下载链接
                List<Object> panHtml = getHttpFormXpath(nodeHtml.toString(), panDown);
                //获取网盘提取码
                List<Object> panPassword = getHttpFormXpath(nodeHtml.toString(), panDownPassword);
                //两个网盘链接,一个百度 + 获取提取码 +一个ctFile
                return panHtml.get(0) + "," + panPassword.get(0).toString() + "," + panHtml.get(1);
            }).collect(Collectors.toList());
            //获取当前标题
            List<Object> nodeHtmlTitleList = getHttpFormXpath(html, xpathText);
            //key-当前标题,value-网盘链接
            Map<Object, String> map = listToMap(nodeHtmlTitleList, list);
            System.out.println(map);
        });
    }

    public static <K, V> Map<K, V> listToMap(List<K> keys, List<V> values) {
        return keys.stream().collect(Collectors.toMap(key -> key, key -> values.get(keys.indexOf(key))));
    }


    private static void init(String html) {
        String pageNumXpath = "//div[@class='pageNavByNum']/form/label/text()";
        List<Object> httpFormXpath = getHttpFormXpath(html, pageNumXpath);
        String num = httpFormXpath.get(0).toString();
        pageMaxNum = Pattern.compile("[^(0-9)]").matcher(num).replaceAll("").trim();
        System.out.println(pageMaxNum);

    }

    private static List<Object> getHttpFormXpath(String html, String xpathName) {
        String body = HttpUtil.createGet(html).execute().body();
        JXDocument jxDocument = new JXDocument(body);
        List<Object> sel = new ArrayList<>();
        try {
            sel = jxDocument.sel(xpathName);
        } catch (NoSuchAxisException e) {
            e.printStackTrace();
        } catch (NoSuchFunctionException e) {
            e.printStackTrace();
        } catch (XpathSyntaxErrorException e) {
            e.printStackTrace();
        }
        return sel;
    }
}

同理,抓取另一个视频网站:

package com.xiaoyun;

import cn.hutool.core.codec.Base64;
import cn.hutool.http.HttpUtil;
import cn.wanghaomiao.xpath.exception.NoSuchAxisException;
import cn.wanghaomiao.xpath.exception.NoSuchFunctionException;
import cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException;
import cn.wanghaomiao.xpath.model.JXDocument;
import lombok.Data;

import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class TestXpath02 {

    private static Map<String, String> cacheHtml = new HashMap();

    private static Map<String, AvDownEntity> cacheAvDown = new HashMap();

    private static List<String> list = Arrays.asList(
            "aHR0cHM6Ly93d3cuMTEyM3dvLmNvbS9WaWRlb0FWLw=="
//            "aHR0cHM6Ly93d3cuMTEyM3dvLmNvbS9WaWRlbzEv",
//            "aHR0cHM6Ly93d3cuMTEyM3dvLmNvbS9WaWRlbzIv",
//            "aHR0cHM6Ly93d3cuMTEyM3dvLmNvbS9WaWRlbzMv",
//            "aHR0cHM6Ly93d3cuMTEyM3dvLmNvbS9WaWRlbzQv"
    );
    private static Set<String> newSet = new HashSet<>();


    private static String domain = "aHR0cHM6Ly93d3cuMTEyM3dvLmNvbQ==";


    public static void main(String[] args) {


        //初始化所有地址
        init();

        System.out.println(newSet);


        runHttp();

        System.out.println(cacheAvDown);


    }

    private static void runHttp() {
        Iterator<String> iterator = newSet.iterator();
        while (iterator.hasNext()) {
            String domainName = iterator.next();
            if (Optional.ofNullable(domainName).isPresent()) {

                String listHtmlXpath = "//h4[@class='title text-overflow']/a/@href";
                String listHtmlPhotoXpath = "//a[@class='stui-vodlist__thumb lazyload']/@data-original";
                List<Object> httpFormXpathString = getHttpFormXpath(domainName, listHtmlXpath);
                List<Object> httpFormXpathPhotoString = getHttpFormXpath(domainName, listHtmlPhotoXpath);

                List<String> htmlList = httpFormXpathString.stream().map(o -> {
                    String domainDecode = Base64.decodeStr(domain);
                    String htmlUrl = domainDecode + o.toString();
                    return htmlUrl;
                }).collect(Collectors.toList());

                Map<String, Object> map = listToMap(htmlList, httpFormXpathPhotoString);

                map.forEach((key, value) -> {
                    AvDownEntity avDownEntity = new AvDownEntity();
                    avDownEntity.setDomainUrl(key);
                    avDownEntity.setPrice(value.toString());
                    cacheAvDown.put(key, avDownEntity);
                });

                htmlList.stream().parallel().forEach(html -> {
                    String xpathVideo = "//ul[@class='stui-content__playlist clearfix']/li/a/@title";
                    String xpathDown = "//ul[@class='stui-content__playlist clearfix']/li/a/@href";
                    List<Object> httpFormXpathVideo = getHttpFormXpath(html, xpathVideo);
                    List<Object> httpFormXpathDown = getHttpFormXpath(html, xpathDown);

                    AvDownEntity avDownEntity = cacheAvDown.get(html);
                    avDownEntity.setDownUrl(httpFormXpathDown.get(0).toString());
                    avDownEntity.setTitle(httpFormXpathVideo.get(0).toString());
                    cacheAvDown.put(html, avDownEntity);

                    System.out.println(avDownEntity);

                });
            }
        }
    }

    private static void init() {

        String endNumXpath = "//li[last()]/a/@href";
        list.stream().parallel().forEach(domainHtml -> {

            String domainHtmlDecode = Base64.decodeStr(domainHtml);

            List<Object> listObj = getHttpFormXpath(domainHtmlDecode, endNumXpath);
            String[] split = listObj.get(3).toString().split("/");
            String str = split[split.length - 1];
            str = Pattern.compile("[^(0-9)]").matcher(str).replaceAll("").trim();
            int num = Integer.parseInt(str);

            for (int i = 0; i < num; i++) {
                String htmlNew = domainHtmlDecode + "index_" + i + ".html";
                newSet.add(htmlNew);
            }
        });
    }

    public static <K, V> Map<K, V> listToMap(List<K> keys, List<V> values) {
        return keys.stream().collect(Collectors.toMap(key -> key, key -> values.get(keys.indexOf(key))));
    }

    private static List<Object> getHttpFormXpath(String html, String xpathName) {
        String body = null;
        if (cacheHtml.get(html) != null) {
            body = cacheHtml.get(html);
        } else {
            try {
                Thread.sleep(200);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            body = HttpUtil.
                    createGet(html).
                    execute().body();

            cacheHtml.put(html, body);
        }

        JXDocument jxDocument = new JXDocument(body);
        List<Object> sel = new ArrayList<>();
        try {
            sel = jxDocument.sel(xpathName);
        } catch (NoSuchAxisException e) {
            e.printStackTrace();
        } catch (NoSuchFunctionException e) {
            e.printStackTrace();
        } catch (XpathSyntaxErrorException e) {
            e.printStackTrace();
        }
        return sel;
    }

    @Data
    static class AvDownEntity {
        private String domainUrl;
        private String title;
        private String downUrl;
        private String price;
    }
}

获取公网 IP 地址:

 // URL -> downloaded response body, so each page is fetched at most once.
 private static final Map<String, String> cacheMap = new HashMap<>();

    /**
     * Looks up the public IPv4 address and the accompanying details shown on
     * whatismyipaddress.com and prints both selections.
     */
    public static void main(String[] args) {
        final String pageUrl = "https://whatismyipaddress.com";
        final String ipXpath = "//div[@id='ipv4']/a/text()";
        final String detailXpath = "//tr/td[@style='font-size:14px;']/text()";

        // Both queries target the same URL.
        List<Object> ipNodes = getHttpFormXpath(pageUrl, ipXpath);
        List<Object> detailNodes = getHttpFormXpath(pageUrl, detailXpath);

        System.out.println(ipNodes);
        System.out.println(detailNodes);

    }

    /**
     * Evaluates {@code xpathName} against the body of {@code html}. The body is taken
     * from {@code cacheMap} when present; otherwise it is downloaded and cached first.
     *
     * @param html      URL of the page
     * @param xpathName XPath expression to evaluate
     * @return whatever {@code getObjects} selects from the body
     */
    private static List<Object> getHttpFormXpath(String html, String xpathName) {
        String pageBody = cacheMap.get(html);
        if (pageBody == null) {
            // Cache miss: fetch the page and remember its body for later calls.
            pageBody = HttpUtil.createGet(html).execute().body();
            cacheMap.put(html, pageBody);
        }
        return getObjects(xpathName, pageBody);
    }

    /**
     * Parses {@code body} and evaluates {@code xpathName} on it.
     *
     * @param xpathName XPath expression to evaluate
     * @param body      HTML text to parse
     * @return the selected nodes, or an empty list when the expression cannot be
     *         evaluated (the stack trace is printed in that case)
     */
    static List<Object> getObjects(String xpathName, String body) {
        JXDocument parsed = new JXDocument(body);
        try {
            return parsed.sel(xpathName);
        } catch (NoSuchAxisException | NoSuchFunctionException | XpathSyntaxErrorException e) {
            e.printStackTrace();
        }
        // Reached only after a failed evaluation.
        return new ArrayList<>();
    }

改进:加入 Map 缓存,并以页面内容的 MD5 值作为缓存的 key,避免对同一页面重复执行相同的 XPath 查询:

package com.xiaoyun;

import cn.hutool.core.map.MapUtil;
import cn.hutool.crypto.digest.MD5;
import cn.hutool.http.HttpUtil;
import cn.wanghaomiao.xpath.exception.NoSuchAxisException;
import cn.wanghaomiao.xpath.exception.NoSuchFunctionException;
import cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException;
import cn.wanghaomiao.xpath.model.JXDocument;

import java.util.*;

public class IpAddrSelect {

    private static Map<String, String> cacheMap = new HashMap();

    private static Map<String, Map<String, List<Object>>> cacheSelMap = new HashMap();

    public static void main(String[] args) {
        String xpath = "//div[@id='ipv4']/a/text()";
        String xpathInformation = "//tr/td[@style='font-size:14px;']/text()";
        String html = "https://whatismyipaddress.com";
        List<Object> list = getHttpFormXpath(html, xpath);
        List<Object> addr = getHttpFormXpath(html, xpathInformation);
        System.out.println(list);
        System.out.println(addr);
        System.out.println(cacheSelMap);

    }

    public static List<Object> getHttpFormXpath(String html, String xpathName) {
        String body = null;
        if (cacheMap.get(html) != null) {
            body = cacheMap.get(html);
        } else {
            body = HttpUtil.
                    createGet(html).
                    execute().body();
            cacheMap.put(html, body);
        }
        return getObjects(xpathName, body);
    }

    private static List<Object> getObjects(String xpathName, String body) {

        String bodyMd5 = bodyMd5(body);

        Map<String, List<Object>> bodyMap = cacheSelMap.get(bodyMd5);


        List<Object> sel = new ArrayList<>();
        if (!MapUtil.isEmpty(bodyMap)) {
            if (sel != null && sel.size() > 0) {
                sel = bodyMap.get(xpathName);
            } else {
                sel = selAndCachePut(xpathName, body, bodyMap);
            }
        } else {
            Map<String, List<Object>> bodyNewMap = new HashMap<>();
            sel = selAndCachePut(xpathName, body, bodyNewMap);
        }
        return sel;
    }

    //将body转为Md5值,便于key更佳便利
    private static String bodyMd5(String body) {
        MD5 md5 = new MD5();
        return md5.digestHex16(body);
    }

    private static List<Object> selAndCachePut(String xpathName, String body, Map<String, List<Object>> bodyNewMap) {
        List<Object> sel = new ArrayList<>();
        try {
            JXDocument jxDocument = new JXDocument(body);
            sel = jxDocument.sel(xpathName);
            bodyNewMap.put(xpathName, sel);
            String bodyMd5 = bodyMd5(body);
            cacheSelMap.put(bodyMd5, bodyNewMap);
        } catch (NoSuchAxisException e) {
            e.printStackTrace();
        } catch (NoSuchFunctionException e) {
            e.printStackTrace();
        } catch (XpathSyntaxErrorException e) {
            e.printStackTrace();
        }
        return sel;
    }
}



  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值