Java爬虫技术之绕过百度云防护抓取网站内容

大家好,我是Coody


最近做文章采集,碰到一个有经过百度云加速的网站,由于打开浏览器需要安全检查,所以针对相关机制做了一下研究,故此封装了一个HTTP工具。


本文已发布至开源中国,由于CSDN用户量巨大且易于被搜索引擎收录,故此分享出来,希望对有需要的朋友有所帮助。


直接贴代码,copy下来可以直接使用






如图:


输入图片说明


首先需要一个Http工具类:HttpHandle


package org.coody.robot.util;


import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;




public class HttpHandle {



// HTTP method names. Conn(...) forwards its `method` argument to
// HttpURLConnection.setRequestMethod(...), and Get/Post pass GET and POST.
// NOTE(review): java.net.HttpURLConnection documents only GET, POST, HEAD,
// OPTIONS, PUT, DELETE and TRACE as legal methods; the remaining names below
// (PATCH, the WebDAV verbs, CONNECT) are presumably rejected with a
// ProtocolException - confirm before relying on them.
public final static String POST="POST";
public final static String GET="GET";
public final static String HEAD="HEAD";
public final static String PUT="PUT";
public final static String CONNECT="CONNECT";
public final static String OPTIONS="OPTIONS";
public final static String DELETE="DELETE";
public final static String PATCH="PATCH";
public final static String PROPFIND="PROPFIND";
public final static String PROPPATCH="PROPPATCH";
public final static String MKCOL="MKCOL";
public final static String COPY="COPY";
public final static String MOVE="MOVE";
public final static String LOCK="LOCK";
public final static String UNLOCK ="UNLOCK";
public final static String TRACE="TRACE";

// Module selector: getConnection(...) uses createConnectionGeneral for this value.
public final static String HTTP_GENERAL="HTTP_GENERAL";

// Module selector: getConnection(...) uses createConnectionJson for this value.
public final static String HTTP_JSON="HTTP_JSON";

// Settings applied to every request made through this handler.
// Public and mutable; also reachable through getConfig()/setConfig().
public HttpConfig config=new HttpConfig();




/**
 * Returns the configuration object used by this handler.
 *
 * @return the live {@link HttpConfig} instance (not a copy), so changes made
 *         through it affect subsequent requests
 */
public HttpConfig getConfig() {
return config;
}


/**
 * Replaces the configuration object used by this handler.
 *
 * @param config the new configuration; it is stored as-is without a null
 *               check, and the request methods dereference it directly
 */
public void setConfig(HttpConfig config) {
this.config = config;
}
/**
 * Mutable per-handler request settings: redirect policy, cookie string,
 * body encoding, timeout, connection module and extra request headers.
 *
 * <p>The enclosing class reads the private fields directly, so the field
 * names are part of the contract between the two classes.
 */
public static class HttpConfig{

    /** Whether redirects are followed; when false the handler disables them on the connection. */
    private boolean allowRedirects=true;

    /** Raw value sent as the {@code Cookie} request header; skipped when empty. */
    private String cookie="";

    /** Charset name used to turn the request body into bytes. */
    private String encode="UTF-8";

    /**
     * Timeout setting. Not read by any code visible in this part of the file;
     * the unit is presumably seconds - TODO confirm against the code that applies it.
     */
    private int timeOut=15;

    /** Selects which connection factory is used (HTTP_GENERAL or HTTP_JSON). */
    private String httpModule=HTTP_GENERAL;

    /** Extra request headers applied on top of the factory defaults. Never null. */
    private Map<String, String> headerMap=new HashMap<String, String>();

    /**
     * Sets the charset name used to encode the request body.
     *
     * @param encode a charset name such as {@code "UTF-8"}
     */
    public void setEncode(String encode) {
        this.encode = encode;
    }

    /**
     * Sets the timeout value.
     *
     * @param timeOut the timeout (unit presumably seconds - TODO confirm)
     */
    public void setTimeOut(int timeOut) {
        this.timeOut = timeOut;
    }

    /**
     * Sets the raw {@code Cookie} header value.
     *
     * @param cookie cookie string in {@code name=value;name=value} form
     */
    public void setCookie(String cookie) {
        this.cookie = cookie;
    }

    /**
     * Replaces the extra-header map. The given map is stored by reference.
     *
     * @param headerMap the new headers; {@code null} is treated as "no extra
     *                  headers" so that the header helpers below keep working
     */
    public void setHeaderMap(Map<String, String> headerMap) {
        // A null map used to make setRequestProperty/setGzip/setKeepAlive throw
        // NullPointerException on their next call; fall back to an empty map.
        this.headerMap = (headerMap != null) ? headerMap : new HashMap<String, String>();
    }

    /**
     * Sets (or overwrites) a single request header.
     *
     * @param fieldName header name
     * @param value     header value
     */
    public void setRequestProperty(String fieldName,String value){
        headerMap.put(fieldName, value);
    }

    /**
     * Enables or disables compressed responses via {@code Accept-Encoding}.
     *
     * @param isGzip {@code true} to advertise gzip/deflate/sdch; {@code false}
     *               to ask for an uncompressed response
     */
    public void setGzip(boolean isGzip){
        // "*" means "any content-coding is acceptable", so it did not actually
        // turn compression off; "identity" is the value that requests no encoding.
        headerMap.put("Accept-Encoding", isGzip ? "gzip, deflate, sdch" : "identity");
    }

    /**
     * Chooses between a persistent and a one-shot connection.
     *
     * @param keepAlive {@code true} sends {@code Connection: keep-alive},
     *                  {@code false} sends {@code Connection: close}
     */
    public void setKeepAlive(boolean keepAlive){
        headerMap.put("Connection", keepAlive ? "keep-alive" : "close");
    }

    /**
     * Sets whether HTTP redirects are followed.
     *
     * @param allowRedirects {@code true} to follow redirects
     */
    public void allowRedirects(boolean allowRedirects){
        this.allowRedirects=allowRedirects;
    }
}

/**
 * Opens a connection for the HTTP_GENERAL module and pre-populates the
 * default browser-like request headers (Referer, Accept, Content-type,
 * User-Agent). The Referer is the origin of {@code url} as computed by
 * {@link #getDomain(String)}.
 *
 * @param url absolute URL to connect to
 * @return the prepared, not yet connected connection, or {@code null} when
 *         the URL is malformed or the connection cannot be created
 */
private HttpURLConnection createConnectionGeneral(String url) {
    try {
        HttpURLConnection conn = (HttpURLConnection) new URL(url)
                .openConnection();
        conn.addRequestProperty("Referer", getDomain(url));
        conn.addRequestProperty(
                "Accept",
                "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*");
        conn.addRequestProperty("Content-type",
                "application/x-www-form-urlencoded");
        conn.addRequestProperty(
                "User-Agent",
                "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
        return conn;
    } catch (Exception e) {
        // Previously swallowed silently, which made a null result impossible to
        // diagnose; report it the same way createConnectionJson does.
        e.printStackTrace();
        return null;
    }
}

/**
 * Opens a connection for the HTTP_JSON module and applies the default
 * request headers: Referer (origin of {@code url}), Accept, Content-type and
 * User-Agent.
 *
 * <p>NOTE(review): despite the method name, the Content-type sent here is
 * {@code application/x-www-form-urlencoded} - confirm whether
 * {@code application/json} was intended.
 *
 * @param url absolute URL to connect to
 * @return the prepared, not yet connected connection, or {@code null} if
 *         anything fails (the exception is printed to stderr)
 */
private HttpURLConnection createConnectionJson(String url) {
    final String acceptTypes =
            "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";
    final String browserAgent =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36";
    try {
        URL target = new URL(url);
        HttpURLConnection connection = (HttpURLConnection) target.openConnection();
        connection.addRequestProperty("Referer", getDomain(url));
        connection.addRequestProperty("Accept", acceptTypes);
        connection.addRequestProperty("Content-type", "application/x-www-form-urlencoded");
        connection.addRequestProperty("User-Agent", browserAgent);
        return connection;
    } catch (Exception failure) {
        failure.printStackTrace();
        return null;
    }
}








/**
 * Derives the origin of a URL ({@code scheme://host[:port]/}); the connection
 * factories use it as the default Referer.
 *
 * <p>The port is included only when it is given explicitly in the URL and is
 * not 80.
 *
 * @param urlStr absolute URL, e.g. {@code http://example.com:8080/a/b}
 * @return the origin with a trailing slash, e.g.
 *         {@code http://example.com:8080/}, or {@code null} when
 *         {@code urlStr} cannot be parsed as a URI (the exception is printed
 *         to stderr)
 */
public static String getDomain(String urlStr){
    try {
        URI uri=new URI(urlStr);
        StringBuilder origin=new StringBuilder();
        origin.append(uri.getScheme()).append("://").append(uri.getHost());
        int port=uri.getPort();
        if(port>0&&port!=80){
            // The port separator is ':'; the previous "/" produced
            // "http://host/8080/", i.e. a path segment instead of a port.
            origin.append(':').append(port);
        }
        return origin.append('/').toString();
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}
/**
 * Merges two cookie strings of the form {@code name=value;name=value}.
 * Entries from {@code newCookie} override entries with the same name from
 * {@code oldCookie}; entries without a name or without a value are dropped.
 *
 * @param oldCookie the cookies collected so far; may be {@code null} or empty
 * @param newCookie the cookies to merge in; when {@code null},
 *                  {@code oldCookie} is returned unchanged
 * @return the merged cookies, each entry terminated by {@code ';'}, in order
 *         of first appearance
 */
private static String mergeCookie(String oldCookie, String newCookie) {
    if (newCookie == null) {
        return oldCookie;
    }
    // Insertion-ordered so the result is deterministic and an updated cookie
    // keeps its original position.
    Map<String, String> merged = new java.util.LinkedHashMap<String, String>();
    String[] sources = { oldCookie, newCookie };
    for (String source : sources) {
        if (source == null || source.isEmpty()) {
            continue;
        }
        for (String pair : source.split(";")) {
            // Split on the FIRST '=' only: values may contain '=' themselves
            // (e.g. base64 padding), which split("=") silently dropped.
            int eq = pair.indexOf('=');
            if (eq < 0) {
                continue;
            }
            // Trim so that "a=1; b=2" and "b=3" refer to the same cookie "b"
            // rather than to " b" and "b".
            String name = pair.substring(0, eq).trim();
            String value = pair.substring(eq + 1).trim();
            if (name.isEmpty() || value.isEmpty()) {
                continue;
            }
            merged.put(name, value);
        }
    }
    StringBuilder out = new StringBuilder();
    for (Map.Entry<String, String> entry : merged.entrySet()) {
        out.append(entry.getKey()).append("=").append(entry.getValue()).append(";");
    }
    return out.toString();
}

/**
 * Creates a connection using the factory that matches the configured module.
 *
 * @param url absolute URL to connect to
 * @return the connection built for HTTP_GENERAL or HTTP_JSON, or
 *         {@code null} when the configured module is neither of them (or the
 *         selected factory itself returned {@code null})
 */
private HttpURLConnection getConnection(String url) {
    final String module = config.httpModule;
    if (module.equals(HTTP_GENERAL)) {
        return createConnectionGeneral(url);
    } else if (module.equals(HTTP_JSON)) {
        return createConnectionJson(url);
    } else {
        return null;
    }
}

/**
 * Issues a GET request; shorthand for {@code Conn(url, GET, null)}.
 *
 * @param url the URL to request
 * @return whatever {@code Conn} returns for this request; {@code Conn}
 *         returns {@code null} when no connection could be created
 */
public HttpEntity Get(String url){
return Conn(url, GET, null);
}

/**
 * Issues a POST request; shorthand for {@code Conn(url, POST, data)}.
 *
 * @param url  the URL to request
 * @param data the request body; {@code Conn} encodes it with the configured
 *             charset and dereferences it without a null check
 * @return whatever {@code Conn} returns for this request; {@code Conn}
 *         returns {@code null} when no connection could be created
 */
public HttpEntity Post(String url,String data){
return Conn(url, POST, data);
}

public HttpEntity Conn(String url, String method,
String postData){
if(url.contains(" ")){
url=url.replace(" ", "%20");
}
HttpURLConnection conn = getConnection(url);
if (conn == null) {
return null;
}
if (!StringUtil.isNullOrEmpty(config.headerMap)) {
for (String key : config.headerMap.keySet()) {
conn.setRequestProperty(key, config.headerMap.get(key));
key = null;
}
}
if(!config.allowRedirects){
conn.setInstanceFollowRedirects(false);
}
if (!StringUtil.isNullOrEmpty(config.cookie)) {
conn.setRequestProperty("Cookie", config.cookie);
}
try {
conn.setRequestMethod(method);
if (method.equalsIgnoreCase(POST)||method.equalsIgnoreCase(PUT)) {
conn.setDoOutput(true);
byte [] postByte=postData.getBytes(config.encode);
conn.setRequestProperty("Content-Length", String.valueOf(postByte.length));
conn.getOutputStream().write(postByte);
conn.connect();
conn.getOutputStream().flush();
conn.getOutputStream().close();
}
} catch (Exception e) {
e.printStackTrace();
  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值