爬取网页时是否也遇到过状态码始终返回521?如果网页源代码返回类似以下内容,则该站使用了反爬措施。
<script>document.cookie=('_')+('_')+('j')+('s')+('l')+('_')+('c')+('l')+('e')+('a')+('r')+('a')+('n')+('c')+('e')+('_')+('s')+('=')+((+true)+'')+(7+'')+(-~{}+'')+((2<<1)+'')+(1+[0]-(1)+'')+((1+[4]>>1)+'')+((1|2)+'')+((+true)+'')+((+[])+'')+(~~[]+'')+('.')+(2+7+'')+(-~[6]+'')+(1+4+'')+('|')+('-')+(-~{}+'')+('|')+('K')+('C')+('a')+('J')+('p')+('j')+('k')+('i')+('Y')+('X')+(1+2+'')+('U')+('z')+(1+7+'')+('i')+('K')+((2<<1)+'')+('%')+(2+'')+('B')+('y')+('o')+('Y')+('B')+('B')+('q')+('E')+('Q')+('w')+('%')+(3+'')+('D')+(';')+(' ')+('M')+('a')+('x')+('-')+('a')+('g')+('e')+('=')+(1+2+'')+(2+4+'')+(~~{}+'')+(~~false+'')+(';')+(' ')+('P')+('a')+('t')+('h')+('=')+('/')+(';')+(' ')+('S')+('a')+('m')+('e')+('S')+('i')+('t')+('e')+('=')+('N')+('o')+('n')+('e')+(';')+(' ')+('S')+('e')+('c')+('u')+('r')+('e');location.href=location.pathname+location.search</script>
具体原理这里不再解释,下面给出 Java 代码:
import cn.hutool.crypto.digest.DigestUtil;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import lombok.Data;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.junit.Test;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches a page that answers with HTTP 521 plus a JavaScript cookie challenge
 * until the real page is served.
 *
 * <p>Two kinds of challenge responses are handled:
 * <ol>
 *   <li>a {@code <script>document.cookie=...;location.href=...</script>} page whose
 *       string expression is evaluated to obtain the {@code __jsl_clearance_s} cookie;</li>
 *   <li>a script ending in a {@code go({...})} call whose JSON argument describes a
 *       small hash puzzle; the cookie value is found by brute force.</li>
 * </ol>
 */
public class Fetch521 {

    /** Name of the cookie both challenge kinds ask the client to set. */
    private static final String CLEARANCE_COOKIE = "__jsl_clearance_s";

    /** Status code sent while a challenge is still pending. */
    private static final int CHALLENGE_STATUS = 521;

    /** Upper bound on challenge round trips before giving up. */
    private static final int MAX_ATTEMPTS = 30;

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";

    /** Statement the cookie script uses to reload the page after setting the cookie. */
    private static final String RELOAD_STATEMENT = "location.href=location.pathname+location.search";

    /** How a cookie-script response starts. */
    private static final String COOKIE_SCRIPT_PREFIX = "<script>document.cookie=('_')+('_')";

    /** Substring that identifies a hash-puzzle response. */
    private static final String GO_MARKER = "};go({\"bts\":[";

    /** Captures the JSON argument of the {@code go(...)} call. */
    private static final Pattern GO_CALL = Pattern.compile("};go\\((.*?)\\)");

    /** Body and status code of one HTTP response. */
    @Data
    private static class Result {
        /** Response body; for challenge responses this is the challenge script. */
        private String js;
        /** HTTP status code of the response. */
        private int statusCode;
    }

    /**
     * Requests {@code url} and returns the response body and status code.
     *
     * @param url         address to request
     * @param cookies     cookies to send; also receives the response cookies when
     *                    {@code fetchCookie} is {@code true}
     * @param fetchCookie whether cookies set by the server are copied into {@code cookies}
     * @return body and status code of the response
     * @throws UncheckedIOException if the request cannot be completed
     */
    private Result fetchBodyAndCookie(String url, Map<String, String> cookies, boolean fetchCookie) {
        Connection connect = Jsoup.connect(url)
                // The challenge arrives with an error status, so errors must not throw.
                .ignoreHttpErrors(true)
                .ignoreContentType(true)
                .timeout(100000)
                .maxBodySize(0)
                .userAgent(USER_AGENT);
        if (!cookies.isEmpty()) {
            connect.cookies(cookies);
        }
        Connection.Response response;
        try {
            response = connect.execute();
        } catch (IOException e) {
            // Fail here with the real cause instead of continuing with a null response.
            throw new UncheckedIOException("Request to " + url + " failed", e);
        }
        if (fetchCookie) {
            cookies.putAll(response.cookies());
        }
        Result result = new Result();
        result.setJs(response.body());
        result.setStatusCode(response.statusCode());
        return result;
    }

    /**
     * Evaluates a cleaned cookie script and stores the clearance cookie it produces.
     *
     * <p>The evaluation result is expected to look like
     * {@code __jsl_clearance_s=<value>; Max-age=...}; the value between the cookie
     * name and the first {@code ;} is extracted.
     *
     * @param js      script text as produced by {@link #formatJS(String)}
     * @param cookies cookie jar that receives the clearance cookie when one is found
     * @return the extracted cookie value, or an empty string if none was produced
     * @throws IllegalStateException if the JVM offers no JavaScript script engine
     */
    private String fetchCookie(String js, Map<String, String> cookies) {
        ScriptEngine engine = new ScriptEngineManager().getEngineByName("JavaScript");
        if (engine == null) {
            // getEngineByName returns null when no engine is registered under that name
            // (e.g. JDKs that no longer bundle Nashorn).
            throw new IllegalStateException("No JavaScript script engine is available on this JVM");
        }
        String cookieValue = "";
        try {
            // NOTE(security): this executes script text received from the remote server.
            // Only run it against sites you trust, or in a sandboxed JVM.
            Object evaluated = engine.eval(js);
            if (evaluated != null) {
                String firstAttribute = StringUtils.substringBefore(evaluated.toString(), ";");
                cookieValue = StringUtils.substringAfter(firstAttribute, CLEARANCE_COOKIE + "=");
                if (!cookieValue.isEmpty()) {
                    // Never overwrite a previously obtained cookie with an empty value.
                    cookies.put(CLEARANCE_COOKIE, cookieValue);
                }
            }
        } catch (ScriptException e) {
            // Best effort: a script that cannot be evaluated simply yields no cookie.
            e.printStackTrace();
        }
        return cookieValue;
    }

    /**
     * Strips the script tags, the {@code document.cookie=} assignment and the reload
     * statement from a challenge response so that the remaining text can be evaluated
     * outside a browser.
     *
     * @param js raw response body
     * @return the body with the browser-only parts removed
     */
    private String formatJS(String js) {
        return js.replace(RELOAD_STATEMENT, "")
                .replace("<script>", "")
                .replace("</script>", "")
                .replace("document.cookie=", "");
    }

    /** Tells whether {@code body} looks like the cookie-script challenge. */
    private static boolean isCookieScript(String body) {
        return body.startsWith(COOKIE_SCRIPT_PREFIX) || body.endsWith(RELOAD_STATEMENT + "</script>");
    }

    /** Hex digest of {@code text} for the given algorithm name, or {@code null} if unsupported. */
    private static String hexDigest(String algorithm, String text) {
        if ("md5".equalsIgnoreCase(algorithm)) {
            return DigestUtil.md5Hex(text);
        }
        if ("sha1".equalsIgnoreCase(algorithm)) {
            return DigestUtil.sha1Hex(text);
        }
        if ("sha256".equalsIgnoreCase(algorithm)) {
            return DigestUtil.sha256Hex(text);
        }
        return null;
    }

    /**
     * Solves the hash puzzle carried by a {@code go({...})} call.
     *
     * <p>The JSON argument supplies {@code bts} (prefix and suffix), {@code chars}
     * (alphabet), {@code ha} (hash algorithm) and {@code ct} (expected digest). The
     * answer is {@code bts[0] + a + b + bts[1]} for the pair of alphabet characters
     * {@code a}, {@code b} whose digest equals {@code ct}.
     *
     * @param js script text containing the {@code go(...)} call
     * @return the cookie value that satisfies the puzzle, or {@code null} if the call
     *         is missing, the algorithm is unsupported, or no pair matches
     */
    private static String solveChallenge(String js) {
        Matcher matcher = GO_CALL.matcher(js);
        if (!matcher.find()) {
            return null;
        }
        JSONObject go = JSON.parseObject(matcher.group(1));
        String chars = go.getString("chars");
        String algorithm = go.getString("ha");
        String expected = go.getString("ct");
        String prefix = String.valueOf(go.getJSONArray("bts").get(0));
        String suffix = String.valueOf(go.getJSONArray("bts").get(1));
        for (int i = 0; i < chars.length(); i++) {
            for (int j = 0; j < chars.length(); j++) {
                String candidate = prefix + chars.charAt(i) + chars.charAt(j) + suffix;
                String digest = hexDigest(algorithm, candidate);
                if (digest == null) {
                    // Unsupported algorithm: no candidate can ever match.
                    return null;
                }
                if (digest.equalsIgnoreCase(expected)) {
                    return candidate;
                }
            }
        }
        return null;
    }

    /**
     * Walks through the challenge sequence for a sample URL: obtains the first
     * clearance cookie from the cookie script, then keeps answering challenges until
     * the server returns something that is no longer a challenge or the attempt
     * budget is used up.
     */
    @Test
    public void fetch() {
        String url = "https://xxxxxxx/public/5921/108921040.html";
        Map<String, String> cookies = new HashMap<>(8);

        // First request: keep the server's own cookies and evaluate the cookie script.
        Result result = fetchBodyAndCookie(url, cookies, true);
        fetchCookie(formatJS(result.getJs()), cookies);

        for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
            result = fetchBodyAndCookie(url, cookies, false);
            String body = result.getJs();

            if (body.contains(GO_MARKER)) {
                String clearance = solveChallenge(formatJS(body));
                if (clearance == null) {
                    // Puzzle not solvable this round; request a fresh one.
                    continue;
                }
                cookies.put(CLEARANCE_COOKIE, clearance);
                result = fetchBodyAndCookie(url, cookies, false);
                body = result.getJs();
                if (!isCookieScript(body)) {
                    // The solved cookie was accepted.
                    break;
                }
            } else if (result.getStatusCode() != CHALLENGE_STATUS && !isCookieScript(body)) {
                // Not a challenge any more: stop instead of refetching and evaluating HTML.
                break;
            }

            // Still facing the cookie script: evaluate it again to refresh the cookie.
            fetchCookie(formatJS(body), cookies);
        }
    }
}