Java爬虫防屏蔽_Java爬虫技术之绕过百度云防护抓取网站内容

如图:

425ba2cc3f2d469abdec4ece793f9e6d.png

首先需要一个 HTTP 工具类:HttpHandle

package org.coody.robot.util;

import java.io.ByteArrayOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.net.HttpURLConnection;

import java.net.URI;

import java.net.URL;

import java.util.HashMap;

import java.util.Map;

public class HttpHandle {

// Request-method name constants (HTTP verbs followed by WebDAV verbs).
// NOTE(review): java.net.HttpURLConnection.setRequestMethod only accepts
// GET, POST, HEAD, OPTIONS, PUT, DELETE and TRACE; the remaining names are
// presumably here for completeness -- confirm before passing them to Conn.
public static final String POST = "POST";
public static final String GET = "GET";
public static final String HEAD = "HEAD";
public static final String PUT = "PUT";
public static final String CONNECT = "CONNECT";
public static final String OPTIONS = "OPTIONS";
public static final String DELETE = "DELETE";
public static final String PATCH = "PATCH";
public static final String PROPFIND = "PROPFIND";
public static final String PROPPATCH = "PROPPATCH";
public static final String MKCOL = "MKCOL";
public static final String COPY = "COPY";
public static final String MOVE = "MOVE";
public static final String LOCK = "LOCK";
public static final String UNLOCK = "UNLOCK";
public static final String TRACE = "TRACE";

// Connection-profile identifiers; getConnection picks a connection factory
// by comparing HttpConfig.httpModule against these values.
public static final String HTTP_GENERAL = "HTTP_GENERAL";
public static final String HTTP_JSON = "HTTP_JSON";

/**
 * Settings read by this handle when it builds a request (headers, cookie,
 * redirect policy, body charset, connection profile). The field is public,
 * so it can also be replaced without going through {@link #setConfig}.
 */
public HttpConfig config = new HttpConfig();

/**
 * Returns the configuration object this handle currently uses.
 *
 * @return the live configuration instance (not a copy)
 */
public HttpConfig getConfig() {
    return this.config;
}

/**
 * Replaces the configuration used for subsequent requests.
 *
 * @param config the configuration to use; stored as-is without validation
 */
public void setConfig(HttpConfig config) {
    this.config = config;
}

/**
 * Mutable request settings for an {@code HttpHandle}.
 *
 * <p>The enclosing class reads the private fields directly: {@code headerMap}
 * entries are applied as request headers, {@code cookie} is sent as the
 * {@code Cookie} header when set, {@code allowRedirects} controls redirect
 * following, {@code encode} is the charset used to encode the request body,
 * and {@code httpModule} selects the connection profile.
 */
public static class HttpConfig {

    /** When {@code false}, automatic redirect following is disabled on the connection. */
    private boolean allowRedirects = true;

    /** Raw {@code Cookie} header value; an empty string means no cookie is sent. */
    private String cookie = "";

    /** Charset name used to turn the request body into bytes. */
    private String encode = "UTF-8";

    /** Timeout value. NOTE(review): presumably seconds -- confirm where it is consumed. */
    private int timeOut = 15;

    /** Connection profile, one of the {@code HTTP_*} profile constants. */
    private String httpModule = HTTP_GENERAL;

    /**
     * Extra request headers, header name to header value.
     * Typed as String-to-String because the enclosing class iterates the keys
     * as {@code String} and passes the values to {@code setRequestProperty}.
     */
    private Map<String, String> headerMap = new HashMap<String, String>();

    /**
     * Sets the charset used to encode the request body.
     *
     * @param encode charset name, e.g. {@code "UTF-8"}
     */
    public void setEncode(String encode) {
        this.encode = encode;
    }

    /**
     * Sets the timeout value.
     *
     * @param timeOut timeout (unit not established in this class; presumably seconds)
     */
    public void setTimeOut(int timeOut) {
        this.timeOut = timeOut;
    }

    /**
     * Sets the raw {@code Cookie} header value to send.
     *
     * @param cookie cookie string such as {@code "a=1;b=2"}
     */
    public void setCookie(String cookie) {
        this.cookie = cookie;
    }

    /**
     * Replaces the whole set of extra request headers.
     *
     * @param headerMap header name to value; the map is stored by reference.
     *                  {@code null} is treated as "no extra headers" so that
     *                  later calls to {@link #setRequestProperty},
     *                  {@link #setGzip} or {@link #setKeepAlive} cannot fail
     *                  with a {@code NullPointerException}.
     */
    public void setHeaderMap(Map<String, String> headerMap) {
        this.headerMap = (headerMap == null) ? new HashMap<String, String>() : headerMap;
    }

    /**
     * Sets (or overwrites) a single request header.
     *
     * @param fieldName header name
     * @param value     header value
     */
    public void setRequestProperty(String fieldName, String value) {
        headerMap.put(fieldName, value);
    }

    /**
     * Chooses the {@code Accept-Encoding} header to advertise.
     *
     * @param isGzip {@code true} to advertise {@code gzip, deflate, sdch};
     *               {@code false} to advertise {@code *}
     */
    public void setGzip(boolean isGzip) {
        headerMap.put("Accept-Encoding", isGzip ? "gzip, deflate, sdch" : "*");
    }

    /**
     * Chooses the {@code Connection} header to send.
     *
     * @param keepAlive {@code true} for {@code keep-alive}, {@code false} for {@code close}
     */
    public void setKeepAlive(boolean keepAlive) {
        headerMap.put("Connection", keepAlive ? "keep-alive" : "close");
    }

    /**
     * Enables or disables automatic redirect following.
     *
     * @param allowRedirects {@code false} to stop the connection from following redirects
     */
    public void allowRedirects(boolean allowRedirects) {
        this.allowRedirects = allowRedirects;
    }
}

/**
 * Opens an (unconnected) HTTP connection for the "general" profile and
 * pre-populates browser-like default headers: {@code Referer} (site root of
 * the target URL), {@code Accept}, {@code Content-type} and {@code User-Agent}.
 *
 * @param url absolute URL to open
 * @return the prepared connection, or {@code null} if the URL is malformed,
 *         is not an HTTP(S) URL, or the connection could not be created
 */
private HttpURLConnection createConnectionGeneral(String url) {
    try {
        HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
        conn.addRequestProperty("Referer", getDomain(url));
        conn.addRequestProperty(
                "Accept",
                "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*");
        conn.addRequestProperty("Content-type", "application/x-www-form-urlencoded");
        conn.addRequestProperty(
                "User-Agent",
                "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
        return conn;
    } catch (Exception e) {
        // Report the failure the same way createConnectionJson does instead of
        // swallowing it; callers still receive null and decide what to do.
        e.printStackTrace();
        return null;
    }
}

/**
 * Opens an (unconnected) HTTP connection for the "JSON" profile with the
 * default {@code Referer}, {@code Accept}, {@code Content-type} and
 * {@code User-Agent} headers already added.
 *
 * <p>NOTE(review): the headers set here are the same as in the general
 * profile, including a form-urlencoded {@code Content-type}; confirm whether
 * a JSON content type was intended.
 *
 * @param url absolute URL to open
 * @return the prepared connection, or {@code null} (after printing the stack
 *         trace) when anything goes wrong while creating it
 */
private HttpURLConnection createConnectionJson(String url) {
    final String acceptHeader =
            "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";
    final String userAgentHeader =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36";

    HttpURLConnection connection;
    try {
        connection = (HttpURLConnection) new URL(url).openConnection();
        connection.addRequestProperty("Referer", getDomain(url));
        connection.addRequestProperty("Accept", acceptHeader);
        connection.addRequestProperty("Content-type", "application/x-www-form-urlencoded");
        connection.addRequestProperty("User-Agent", userAgentHeader);
    } catch (Exception failure) {
        failure.printStackTrace();
        return null;
    }
    return connection;
}

/**
 * Derives the site root of a URL ({@code scheme://host[:port]/}), used as the
 * default {@code Referer} value.
 *
 * <p>The port is included only when it is explicit and is not the default for
 * the scheme (80 for http, 443 for https). It is joined with {@code ':'} so
 * the result is itself a valid URL.
 *
 * @param urlStr absolute URL, e.g. {@code "http://example.com:8080/a?b=1"}
 * @return the site root with a trailing slash, e.g.
 *         {@code "http://example.com:8080/"}; {@code null} when the input
 *         cannot be parsed as a URI or has no scheme or host
 */
public static String getDomain(String urlStr) {
    try {
        URI uri = new URI(urlStr);
        String scheme = uri.getScheme();
        String host = uri.getHost();
        if (scheme == null || host == null) {
            // Relative or opaque URIs have no site root to report.
            return null;
        }

        int port = uri.getPort();
        boolean defaultPort = ("http".equalsIgnoreCase(scheme) && port == 80)
                || ("https".equalsIgnoreCase(scheme) && port == 443);

        StringBuilder result = new StringBuilder();
        result.append(scheme).append("://").append(host);
        if (port > 0 && !defaultPort) {
            result.append(':').append(port);
        }
        result.append('/');
        return result.toString();
    } catch (Exception e) {
        // Covers URISyntaxException and a null urlStr; callers treat null as
        // "no domain available".
        e.printStackTrace();
        return null;
    }
}

/**
 * Merges two cookie strings into one, with entries from {@code newCookie}
 * overriding same-named entries from {@code oldCookie}.
 *
 * <p>Each input is split on {@code ';'} and every segment is read as
 * {@code name=value}, splitting at the first {@code '='} only so that values
 * containing or ending in {@code '='} (for example base64 padding) survive
 * intact. Names and values are trimmed so {@code "a=1; b=2"} and
 * {@code "b=3"} refer to the same cookie {@code b}. Segments without
 * {@code '='}, or with an empty name or value, are ignored. Segments are not
 * otherwise filtered, so attribute pairs such as {@code path=/} are kept like
 * any other pair.
 *
 * @param oldCookie previously stored cookie string; may be null or empty
 * @param newCookie newly received cookie string; may be null
 * @return {@code oldCookie} unchanged when {@code newCookie} is null;
 *         otherwise the merged pairs rendered as {@code name=value;} with a
 *         trailing semicolon, in unspecified order
 */
private static String mergeCookie(String oldCookie, String newCookie) {
    if (newCookie == null) {
        return oldCookie;
    }

    Map<String, String> cookieMap = new HashMap<String, String>();
    // Old first, new second: later puts overwrite earlier ones.
    String[] cookies = { oldCookie, newCookie };
    for (String currCookie : cookies) {
        if (StringUtil.isNullOrEmpty(currCookie)) {
            continue;
        }
        for (String cook : currCookie.split(";")) {
            int separator = cook.indexOf('=');
            if (separator < 0) {
                continue;
            }
            String name = cook.substring(0, separator).trim();
            String value = cook.substring(separator + 1).trim();
            if (name.isEmpty() || value.isEmpty()) {
                continue;
            }
            // Retained from the original implementation; presumably returns the
            // index of the first null/empty argument, or -1 -- verify in StringUtil.
            if (StringUtil.findNull(name, value) > -1) {
                continue;
            }
            cookieMap.put(name, value);
        }
    }

    StringBuilder merged = new StringBuilder();
    for (Map.Entry<String, String> entry : cookieMap.entrySet()) {
        merged.append(entry.getKey()).append("=").append(entry.getValue()).append(";");
    }
    return merged.toString();
}

/**
 * Creates a connection for {@code url} using the profile named by
 * {@code config.httpModule}.
 *
 * @param url absolute URL to open
 * @return the prepared connection from the matching factory; {@code null}
 *         when the profile is neither {@code HTTP_GENERAL} nor
 *         {@code HTTP_JSON}, or when the factory itself returns null
 */
private HttpURLConnection getConnection(String url) {
    final String module = config.httpModule;
    if (module.equals(HTTP_GENERAL)) {
        return createConnectionGeneral(url);
    }
    return module.equals(HTTP_JSON) ? createConnectionJson(url) : null;
}

/**
 * Sends a GET request to {@code url}; shorthand for
 * {@code Conn(url, GET, null)}.
 *
 * @param url target URL
 * @return whatever {@code Conn} returns for the request (it returns
 *         {@code null} when no connection could be created)
 */
public HttpEntity Get(String url) {
    final HttpEntity response = Conn(url, GET, null);
    return response;
}

/**
 * Sends a POST request to {@code url} with {@code data} as the request body;
 * shorthand for {@code Conn(url, POST, data)}.
 *
 * @param url  target URL
 * @param data request body; {@code Conn} encodes it with the configured charset
 * @return whatever {@code Conn} returns for the request (it returns
 *         {@code null} when no connection could be created)
 */
public HttpEntity Post(String url, String data) {
    final HttpEntity response = Conn(url, POST, data);
    return response;
}

public HttpEntity Conn(String url, String method,

String postData){

if(url.contains(" ")){

url=url.replace(" ", "%20");

}

HttpURLConnection conn = getConnection(url);

if (conn == null) {

return null;

}

if (!StringUtil.isNullOrEmpty(config.headerMap)) {

for (String key : config.headerMap.keySet()) {

conn.setRequestProperty(key, config.headerMap.get(key));

key = null;

}

}

if(!config.allowRedirects){

conn.setInstanceFollowRedirects(false);

}

if (!StringUtil.isNullOrEmpty(config.cookie)) {

conn.setRequestProperty("Cookie", config.cookie);

}

try {

conn.setRequestMethod(method);

if (method.equalsIgnoreCase(POST)||method.equalsIgnoreCase(PUT)) {

conn.setDoOutput(true);

byte [] postByte=postData.getBytes(config.encode);

conn.setRequestProperty("Content-Length", String.valueOf(postByte.length));

conn.getOutputStream().write(postByte);

conn.connect();

conn.getOutputStream().flush();

conn

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值