Java实现逆向破解返回状态码521的JS反爬网页

爬取网页时,是否也遇到过响应状态码始终为521的情况?如果返回的网页源代码类似以下内容,则说明该站点使用了基于JS动态生成Cookie的反爬措施。

<script>document.cookie=('_')+('_')+('j')+('s')+('l')+('_')+('c')+('l')+('e')+('a')+('r')+('a')+('n')+('c')+('e')+('_')+('s')+('=')+((+true)+'')+(7+'')+(-~{}+'')+((2<<1)+'')+(1+[0]-(1)+'')+((1+[4]>>1)+'')+((1|2)+'')+((+true)+'')+((+[])+'')+(~~[]+'')+('.')+(2+7+'')+(-~[6]+'')+(1+4+'')+('|')+('-')+(-~{}+'')+('|')+('K')+('C')+('a')+('J')+('p')+('j')+('k')+('i')+('Y')+('X')+(1+2+'')+('U')+('z')+(1+7+'')+('i')+('K')+((2<<1)+'')+('%')+(2+'')+('B')+('y')+('o')+('Y')+('B')+('B')+('q')+('E')+('Q')+('w')+('%')+(3+'')+('D')+(';')+(' ')+('M')+('a')+('x')+('-')+('a')+('g')+('e')+('=')+(1+2+'')+(2+4+'')+(~~{}+'')+(~~false+'')+(';')+(' ')+('P')+('a')+('t')+('h')+('=')+('/')+(';')+(' ')+('S')+('a')+('m')+('e')+('S')+('i')+('t')+('e')+('=')+('N')+('o')+('n')+('e')+(';')+(' ')+('S')+('e')+('c')+('u')+('r')+('e');location.href=location.pathname+location.search</script>

具体原理这里不再赘述,下面给出Java实现代码:

import cn.hutool.crypto.digest.DigestUtil;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import lombok.Data;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.junit.Test;

import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import java.io.File;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates fetching a page that is protected by a two-stage JavaScript
 * cookie challenge (the kind that answers with HTTP status 521).
 *
 * <p>Stage one is a small script that builds a {@code __jsl_clearance_s}
 * cookie from obfuscated string concatenation and then reloads the page.
 * Stage two is a script ending in {@code go({...})}; its JSON argument
 * describes a hash puzzle whose solution is the final cookie value.
 *
 * <p>The class is not thread-safe; each call to {@link #fetch()} works on its
 * own cookie map.
 */
public class Fetch521 {
    /** Name of the cookie both challenge stages ask the client to set. */
    private static final String CLEARANCE_COOKIE = "__jsl_clearance_s";

    /** Reload statement the first-stage script appends after the cookie assignment. */
    private static final String RELOAD_STATEMENT = "location.href=location.pathname+location.search";

    /** Text whose presence identifies the second-stage (hash puzzle) script. */
    private static final String GO_MARKER = "};go({\"bts\":[";

    /** Captures the JSON argument passed to {@code go(...)}; compiled once and reused. */
    private static final Pattern GO_ARGUMENT = Pattern.compile("};go\\((.*?)\\)");

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";

    /** Upper bound on challenge rounds so a misbehaving site cannot loop us forever. */
    private static final int MAX_ROUNDS = 30;

    /** Body and HTTP status code of one response. */
    @Data
    private static class Result {
        private String js;
        private int statusCode;

    }

    /**
     * Requests {@code url} with the given cookies and returns the response body and status.
     *
     * @param url         address to fetch
     * @param cookies     cookies to send; when {@code fetchCookie} is true the cookies set by
     *                    the response are merged into this map
     * @param fetchCookie whether to copy the response cookies into {@code cookies}
     * @return the response body and status code; if the request fails with an I/O error the
     *         body is the empty string and the status code is 0 (never {@code null})
     */
    private Result fetchBodyAndCookie(String url, Map<String, String> cookies, boolean fetchCookie) {
        Result result = new Result();
        // Default to an empty body so callers never see null when the request fails.
        result.setJs("");

        Connection connect = Jsoup.connect(url)
                .ignoreHttpErrors(true)
                .ignoreContentType(true)
                .timeout(100000)
                .maxBodySize(0)
                .userAgent(USER_AGENT);
        if (!cookies.isEmpty()) {
            connect.cookies(cookies);
        }

        try {
            Connection.Response response = connect.execute();
            if (fetchCookie) {
                cookies.putAll(response.cookies());
            }
            result.setJs(response.body());
            result.setStatusCode(response.statusCode());
        } catch (java.io.IOException e) {
            // Previously the exception was swallowed and the null response was dereferenced.
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Evaluates a first-stage challenge expression and stores the clearance cookie it yields.
     *
     * <p>SECURITY NOTE(review): {@code js} comes from the remote server and is executed by the
     * JVM's script engine, which may expose Java classes to the script. Only run this against
     * hosts you trust, or replace the evaluation with a sandboxed/restricted interpreter.
     *
     * @param js      the script expression produced by {@link #formatJS(String)}
     * @param cookies map that receives the {@code __jsl_clearance_s} entry when one is found
     * @return the extracted cookie value, or the empty string if none could be extracted
     */
    private String fetchCookie(String js, Map<String, String> cookies) {
        if (StringUtils.isBlank(js)) {
            return "";
        }
        ScriptEngine engine = new ScriptEngineManager().getEngineByName("JavaScript");
        if (engine == null) {
            // getEngineByName returns null when no matching engine is installed
            // (the bundled Nashorn engine was removed in JDK 15).
            System.err.println("No JavaScript script engine available; cannot evaluate the challenge script");
            return "";
        }

        String cookieValue = "";
        try {
            Object evaluated = engine.eval(js);
            if (evaluated != null) {
                // The script yields "name=value; Max-age=...; Path=/..."; keep only the value.
                String firstAttribute = StringUtils.substringBefore(evaluated.toString(), ";");
                cookieValue = StringUtils.substringAfter(firstAttribute, CLEARANCE_COOKIE + "=");
                if (!cookieValue.isEmpty()) {
                    // Do not overwrite an existing cookie with an empty value.
                    cookies.put(CLEARANCE_COOKIE, cookieValue);
                }
            }
        } catch (ScriptException e) {
            e.printStackTrace();
        }
        return cookieValue;
    }

    /**
     * Strips the script tags, the {@code document.cookie=} assignment and the reload statement
     * so that only the cookie-building expression is left.
     */
    private String formatJS(String js) {
        return js.replace(RELOAD_STATEMENT, "")
                .replace("<script>", "")
                .replace("</script>", "")
                .replace("document.cookie=", "");
    }

    /** Tells whether {@code body} looks like the first-stage script that sets the cookie directly. */
    private static boolean isCookieScript(String body) {
        String trimmed = body.trim();
        return trimmed.startsWith("<script>document.cookie=")
                || trimmed.endsWith(RELOAD_STATEMENT + "</script>");
    }

    /**
     * Hashes {@code value} with the algorithm named by the challenge.
     *
     * @return the hex digest, or {@code null} if the algorithm is not md5, sha1 or sha256
     */
    private static String digest(String algorithm, String value) {
        if ("md5".equalsIgnoreCase(algorithm)) {
            return DigestUtil.md5Hex(value);
        }
        if ("sha1".equalsIgnoreCase(algorithm)) {
            return DigestUtil.sha1Hex(value);
        }
        if ("sha256".equalsIgnoreCase(algorithm)) {
            return DigestUtil.sha256Hex(value);
        }
        return null;
    }

    /**
     * Solves the second-stage puzzle: find two characters from {@code chars} such that
     * {@code hash(bts[0] + c1 + c2 + bts[1])} equals {@code ct}.
     *
     * @param script the second-stage script containing the {@code go({...})} call
     * @return the cookie value that satisfies the puzzle, or {@code null} if the script has no
     *         usable {@code go} argument, names an unsupported hash, or has no solution
     */
    private String solveGoChallenge(String script) {
        Matcher matcher = GO_ARGUMENT.matcher(script);
        if (!matcher.find()) {
            return null;
        }
        JSONObject go = JSON.parseObject(matcher.group(1));
        if (go == null || go.getJSONArray("bts") == null || go.getJSONArray("bts").size() < 2) {
            return null;
        }
        String chars = go.getString("chars");
        String algorithm = go.getString("ha");
        String expected = go.getString("ct");
        if (chars == null || expected == null || digest(algorithm, "") == null) {
            return null;
        }

        // Hoisted out of the loops: these do not change per candidate.
        String prefix = String.valueOf(go.getJSONArray("bts").get(0));
        String suffix = String.valueOf(go.getJSONArray("bts").get(1));

        for (int i = 0; i < chars.length(); i++) {
            for (int j = 0; j < chars.length(); j++) {
                String candidate = prefix + chars.charAt(i) + chars.charAt(j) + suffix;
                if (expected.equalsIgnoreCase(digest(algorithm, candidate))) {
                    return candidate;
                }
            }
        }
        return null;
    }

    /**
     * Fetches the protected page, answering challenge scripts until the server returns
     * something that is not a challenge (or the round limit is reached).
     */
    @Test
    public void fetch() {
        String url = "https://xxxxxxx/public/5921/108921040.html";
        Map<String, String> cookies = new HashMap<>(8);

        // Only the very first request collects the cookies set by the server.
        Result result = fetchBodyAndCookie(url, cookies, true);

        for (int round = 0; round < MAX_ROUNDS; round++) {
            String body = result.getJs();

            if (body.contains(GO_MARKER)) {
                String clearance = solveGoChallenge(formatJS(body));
                if (clearance != null) {
                    cookies.put(CLEARANCE_COOKIE, clearance);
                }
                // When unsolved, simply request again: the server issues a fresh puzzle.
            } else if (isCookieScript(body)) {
                fetchCookie(formatJS(body), cookies);
            } else {
                // Neither challenge stage: this is the real page (or a failed request
                // with an empty body), so there is nothing left to solve.
                break;
            }

            result = fetchBodyAndCookie(url, cookies, false);
        }
    }
}

评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值