Java爬虫技术之绕过百度云防护抓取网站内容

如图:

输入图片说明

首先需要一个Http工具类:HttpHandle

package org.coody.robot.util;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;


/**
 * Small HTTP client built on {@link HttpURLConnection}.
 *
 * <p>An instance carries a mutable {@link HttpConfig} (headers, cookie, charset,
 * timeout, redirect policy). Cookies returned by the server are merged back into
 * the config after every request, so consecutive calls on the same instance
 * behave like one browser session. Instances are not synchronized.
 */
public class HttpHandle {

	public final static String POST="POST";
	public final static String GET="GET";
	public final static String HEAD="HEAD";
	public final static String PUT="PUT";
	public final static String CONNECT="CONNECT";
	public final static String OPTIONS="OPTIONS";
	public final static String DELETE="DELETE";
	public final static String PATCH="PATCH";
	public final static String PROPFIND="PROPFIND";
	public final static String PROPPATCH="PROPPATCH";
	public final static String MKCOL="MKCOL";
	public final static String COPY="COPY";
	public final static String MOVE="MOVE";
	public final static String LOCK="LOCK";
	public final static String UNLOCK ="UNLOCK";
	public final static String TRACE="TRACE";

	/** Connection profile selectors used by {@link HttpConfig}. */
	public final static String HTTP_GENERAL="HTTP_GENERAL";

	public final static String HTTP_JSON="HTTP_JSON";

	/** Per-instance request settings; public because existing callers access it directly. */
	public HttpConfig config=new HttpConfig();

	public HttpConfig getConfig() {
		return config;
	}

	public void setConfig(HttpConfig config) {
		this.config = config;
	}

	/** Mutable request settings applied to every request issued by the owning handle. */
	public static class HttpConfig{

		private boolean allowRedirects=true;

		private String cookie="";

		private String encode="UTF-8";

		/** Connect timeout in seconds. */
		private int timeOut=15;

		private String httpModule=HTTP_GENERAL;

		private Map<String, String> headerMap=new HashMap<String, String>();

		public void setEncode(String encode) {
			this.encode = encode;
		}

		public void setTimeOut(int timeOut) {
			this.timeOut = timeOut;
		}

		public void setCookie(String cookie) {
			this.cookie = cookie;
		}

		/** Replaces all extra headers; {@code null} is treated as "no extra headers". */
		public void setHeaderMap(Map<String, String> headerMap) {
			// Keep the map non-null so the header setters below never hit an NPE.
			this.headerMap = headerMap == null ? new HashMap<String, String>() : headerMap;
		}

		/** Sets (or overrides) a single request header. */
		public void setRequestProperty(String fieldName,String value){
			headerMap.put(fieldName, value);
		}

		/** Advertises gzip support to the server, or any encoding when disabled. */
		public void setGzip(boolean isGzip){
			headerMap.put("Accept-Encoding", isGzip ? "gzip, deflate, sdch" : "*");
		}

		/** Chooses between a keep-alive and a close {@code Connection} header. */
		public void setKeepAlive(boolean keepAlive){
			headerMap.put("Connection", keepAlive ? "keep-alive" : "close");
		}

		/** Enables or disables automatic following of HTTP redirects. */
		public void allowRedirects(boolean allowRedirects){
			this.allowRedirects=allowRedirects;
		}
	}

	/**
	 * Opens a connection carrying the browser-like default headers shared by all
	 * connection profiles.
	 */
	private HttpURLConnection openWithDefaults(String url) throws IOException {
		HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
		conn.addRequestProperty("Referer", getDomain(url));
		conn.addRequestProperty(
				"Accept",
				"image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*");
		conn.addRequestProperty("Content-type",
				"application/x-www-form-urlencoded");
		conn.addRequestProperty(
				"User-Agent",
				"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
		return conn;
	}

	/** Creates a connection for the {@link #HTTP_GENERAL} profile, or {@code null} on failure. */
	private HttpURLConnection createConnectionGeneral(String url) {
		try {
			return openWithDefaults(url);
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Creates a connection for the {@link #HTTP_JSON} profile, or {@code null} on failure.
	 * NOTE(review): this profile has always sent the same form-urlencoded headers as the
	 * general one; confirm whether a JSON content type was intended.
	 */
	private HttpURLConnection createConnectionJson(String url) {
		try {
			return openWithDefaults(url);
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Returns the origin of a URL ({@code scheme://host[:port]/}), used as the default Referer.
	 *
	 * @param urlStr absolute URL
	 * @return origin with a trailing slash, or {@code null} if the URL cannot be parsed
	 */
	public static String getDomain(String urlStr){
		try {
			URI uri=new URI(urlStr);
			String scheme=uri.getScheme();
			int port=uri.getPort();
			boolean defaultPort=("http".equalsIgnoreCase(scheme)&&port==80)
					||("https".equalsIgnoreCase(scheme)&&port==443);
			StringBuilder result=new StringBuilder();
			result.append(scheme).append("://").append(uri.getHost());
			if(port>0&&!defaultPort){
				// The port belongs to the authority and is separated by ':', not '/'.
				result.append(':').append(port);
			}
			result.append('/');
			return result.toString();
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Merges two {@code name=value;name=value} cookie strings; entries from
	 * {@code newCookie} override entries with the same name in {@code oldCookie}.
	 * Pairs with an empty name or value are dropped.
	 *
	 * @return merged cookies in {@code name=value;} form, or {@code oldCookie}
	 *         unchanged when {@code newCookie} is {@code null}
	 */
	private static String mergeCookie(String oldCookie, String newCookie) {
		if (newCookie == null) {
			return oldCookie;
		}
		Map<String, String> cookieMap = new HashMap<String, String>();
		String[] sources = { oldCookie, newCookie };
		for (String source : sources) {
			if (source == null || source.length() == 0) {
				continue;
			}
			for (String pair : source.split(";")) {
				// Split on the first '=' only: values may contain '=' themselves
				// (e.g. base64 padding), which String.split("=") would silently drop.
				int eq = pair.indexOf('=');
				if (eq < 0) {
					continue;
				}
				String name = pair.substring(0, eq).trim();
				String value = pair.substring(eq + 1).trim();
				if (name.length() == 0 || value.length() == 0) {
					continue;
				}
				cookieMap.put(name, value);
			}
		}
		StringBuilder merged = new StringBuilder();
		for (Map.Entry<String, String> entry : cookieMap.entrySet()) {
			merged.append(entry.getKey()).append("=").append(entry.getValue()).append(";");
		}
		return merged.toString();
	}

	/**
	 * Appends the {@code name=value} part of one {@code Set-Cookie} header to
	 * {@code target}, terminated by ';'. Attributes such as path, expires and
	 * domain are not cookies and must not be echoed back to the server.
	 */
	private static void appendCookiePair(StringBuilder target, String setCookieValue) {
		if (setCookieValue == null) {
			return;
		}
		int end = setCookieValue.indexOf(';');
		String pair = (end < 0 ? setCookieValue : setCookieValue.substring(0, end)).trim();
		if (pair.length() > 0) {
			target.append(pair).append(';');
		}
	}

	/** Creates a connection for the configured profile; {@code null} if it cannot be created. */
	private HttpURLConnection getConnection(String url) {
		if(HTTP_GENERAL.equals(config.httpModule)){
			return createConnectionGeneral(url);
		}
		if(HTTP_JSON.equals(config.httpModule)){
			return createConnectionJson(url);
		}
		return null;
	}

	/** Issues a GET request. See {@link #Conn(String, String, String)}. */
	public HttpEntity Get(String url){
		return Conn(url, GET, null);
	}

	/** Issues a POST request with {@code data} as the body. See {@link #Conn(String, String, String)}. */
	public HttpEntity Post(String url,String data){
		return Conn(url, POST, data);
	}

	/**
	 * Sends the request body for POST/PUT. A {@code null} body is sent as an empty one.
	 */
	private void writeBody(HttpURLConnection conn, String postData) throws IOException {
		conn.setDoOutput(true);
		byte[] body=(postData==null?"":postData).getBytes(config.encode);
		conn.setRequestProperty("Content-Length", String.valueOf(body.length));
		java.io.OutputStream out=conn.getOutputStream();
		try {
			out.write(body);
			out.flush();
		} finally {
			out.close();
		}
	}

	/**
	 * Performs an HTTP request and captures status, headers, cookies and body.
	 *
	 * @param url      target URL; spaces are percent-encoded
	 * @param method   HTTP method, e.g. {@link #GET} or {@link #POST}
	 * @param postData request body for POST/PUT; ignored for other methods
	 * @return the response, or {@code null} if {@code url} is {@code null} or no
	 *         connection could be created. If the exchange fails midway the returned
	 *         entity keeps its default code of -1.
	 */
	public HttpEntity Conn(String url, String method,
			String postData){
		if(url==null){
			return null;
		}
		url=url.replace(" ", "%20");
		HttpURLConnection conn = getConnection(url);
		if (conn == null) {
			return null;
		}
		// Must be set before any I/O; the request body below already opens the socket.
		conn.setConnectTimeout(config.timeOut*1000);
		if (config.headerMap != null) {
			for (Map.Entry<String, String> header : config.headerMap.entrySet()) {
				conn.setRequestProperty(header.getKey(), header.getValue());
			}
		}
		if(!config.allowRedirects){
			conn.setInstanceFollowRedirects(false);
		}
		if (config.cookie != null && config.cookie.length() > 0) {
			conn.setRequestProperty("Cookie", config.cookie);
		}
		try {
			conn.setRequestMethod(method);
			if (method.equalsIgnoreCase(POST)||method.equalsIgnoreCase(PUT)) {
				writeBody(conn, postData);
			}
		} catch (Exception e) {
			// Best effort as before: report and still try to read whatever the server answers.
			e.printStackTrace();
		}
		InputStream ins = null;
		HttpEntity hEntity = new HttpEntity();
		try {
			int status=conn.getResponseCode();
			// Only 4xx/5xx responses expose their body through the error stream;
			// for 2xx/3xx getErrorStream() returns null and the body would be lost.
			ins=status>=HttpURLConnection.HTTP_BAD_REQUEST?conn.getErrorStream():conn.getInputStream();
			hEntity.setCode(status);
			Map<String,String> headMap=new HashMap<String, String>();
			StringBuilder cookie = new StringBuilder();
			String key;
			for (int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++) {
				headMap.put(key, conn.getHeaderField(key));
				if (key.equalsIgnoreCase("set-cookie")) {
					appendCookiePair(cookie, conn.getHeaderField(i));
				}
			}
			config.cookie = mergeCookie(config.cookie, cookie.toString());
			byte[] b = toByte(ins);
			// Decompress only when the server says the body is gzip. Advertising gzip
			// in the request does not oblige the server to compress the response.
			String contentEncoding=conn.getContentEncoding();
			if(contentEncoding!=null&&contentEncoding.toLowerCase(java.util.Locale.ROOT).contains("gzip")){
				b = GZIPUtils.uncompress(b);
			}
			hEntity.setEncode(config.encode);
			hEntity.setBye(b);
			hEntity.setCookie(config.cookie);
			hEntity.setHeadMap(headMap);
		} catch (Exception e) {
			e.printStackTrace();
		}finally{
			if(ins!=null){
				try {
					ins.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing the response stream fails.
				}
			}
		}
		return hEntity;
	}

	/**
	 * Reads a stream to the end.
	 *
	 * @return all bytes, or {@code null} if {@code ins} is {@code null} or reading fails
	 */
	private byte[] toByte(InputStream ins) {
		if(ins==null){
			return null;
		}
		ByteArrayOutputStream swapStream = new ByteArrayOutputStream();
		try {
			byte[] buff = new byte[1024];
			int rc;
			while ((rc = ins.read(buff, 0, buff.length)) > 0) {
				swapStream.write(buff, 0, rc);
			}
			return swapStream.toByteArray();
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
	}

}

其次需要一个Http响应对象类:HttpEntity

package org.coody.robot.util;

import java.util.HashMap;
import java.util.Map;

/**
 * Holder for an HTTP response: status code, headers, cookie string and raw body.
 * The body is decoded to text lazily with the configured charset.
 */
public class HttpEntity {

	/** Decoded body; cached after the first successful {@link #getHtml()} call. */
	private String html;
	/** Raw response body. */
	private byte[] bye;
	/** Cookies in {@code name=value;name=value} form. */
	private String cookie;
	/** HTTP status code; -1 until a response has been received. */
	private Integer code=-1;
	private Map<String,String> headMap;
	/** Charset name used to decode the body. */
	private String encode="UTF-8";

	public Map<String, String> getHeadMap() {
		return headMap;
	}

	public void setHeadMap(Map<String, String> headMap) {
		this.headMap = headMap;
	}

	/**
	 * Returns the body decoded with the configured charset.
	 *
	 * @return decoded text, or {@code null} if there is no body or the charset is unsupported
	 */
	public String getHtml() {
		if(html!=null){
			return html;
		}
		if(bye==null){
			return null;
		}
		try {
			html=new String(bye, encode);
			return html;
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Returns the body as text, optionally gunzipping it first.
	 *
	 * @param isGzip {@code true} if the stored bytes are gzip-compressed;
	 *               {@code false} behaves exactly like {@link #getHtml()}
	 * @return decoded text, or {@code null} if there is no body or decoding fails
	 */
	public String getHtml(boolean isGzip) {
		if(!isGzip){
			// The flag used to be ignored, so plain bodies were always pushed through gunzip.
			return getHtml();
		}
		if(bye==null){
			return null;
		}
		try {
			byte[] plain=GZIPUtils.uncompress(bye);
			if(plain==null){
				return null;
			}
			return new String(plain, encode);
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
	}

	public String getEncode() {
		return encode;
	}

	public void setEncode(String encode) {
		this.encode = encode;
	}

	public void setHtml(String html) {
		this.html = html;
	}

	public Integer getCode() {
		return code;
	}

	public void setCode(Integer code) {
		this.code = code;
	}

	public String getCookie() {
		return cookie;
	}

	public void setCookie(String cookie) {
		this.cookie = cookie;
	}

	public byte[] getBye() {
		return bye;
	}

	public void setBye(byte[] bye) {
		this.bye = bye;
	}

	/**
	 * Parses the cookie string into a name/value map.
	 *
	 * @return map of trimmed names to trimmed values; pairs without '=' or with an
	 *         empty name or value are skipped. {@code null} if no cookie string is set.
	 */
	public Map<String, String> getCookieMap() {
		if (cookie == null) {
			return null;
		}
		Map<String, String> cookieMap = new HashMap<String, String>();
		for (String pair : cookie.split(";")) {
			// Split on the first '=' only. String.split("=") drops trailing empty
			// tokens and therefore truncated values ending in '=' (base64 padding).
			int eq = pair.indexOf('=');
			if (eq < 0) {
				continue;
			}
			String name = pair.substring(0, eq).trim();
			String value = pair.substring(eq + 1);
			if (name.length() == 0 || value.length() == 0) {
				continue;
			}
			cookieMap.put(name, value.trim());
		}
		return cookieMap;
	}
}

某些网站的响应内容经过了Gzip压缩,因此还需要一个Gzip压缩/解压工具类GZIPUtils

package org.coody.robot.util;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;  
  
  
  
/** 
 *  
 * @author wenqi5 
 *  
 */  
/**
 * GZIP compression helpers working on in-memory byte arrays.
 *
 * <p>All methods report failure by returning {@code null}; they never return
 * partially processed data.
 */
public class GZIPUtils {

    public static final String GZIP_ENCODE_UTF_8 = "UTF-8";

    /**
     * Compresses the UTF-8 bytes of a string.
     *
     * @param str text to compress
     * @return GZIP data, or {@code null} if {@code str} is {@code null} or empty
     */
    public static byte[] compress(String str) {
        if (str == null) {
            return null;
        }
        try {
            return compress(str.getBytes(GZIP_ENCODE_UTF_8));
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Compresses a byte array.
     *
     * @param data bytes to compress
     * @return GZIP data, or {@code null} if {@code data} is {@code null}/empty or compression fails
     */
    public static byte[] compress(byte[] data) {
        if (data == null || data.length == 0) {
            return null;
        }
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        GZIPOutputStream gzip = null;
        try {
            gzip = new GZIPOutputStream(out);
            gzip.write(data);
            // finish() writes the GZIP trailer, so the buffer is complete before it is copied.
            gzip.finish();
            return out.toByteArray();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            if (gzip != null) {
                try {
                    gzip.close();
                } catch (IOException ignored) {
                    // The data has already been captured; a failed close changes nothing.
                }
            }
        }
    }

    /**
     * Decompresses GZIP data.
     *
     * @param bytes GZIP-compressed bytes
     * @return the decompressed bytes, or {@code null} if {@code bytes} is
     *         {@code null}/empty or is not valid (or is truncated) GZIP data
     */
    public static byte[] uncompress(byte[] bytes) {
        if (bytes == null || bytes.length == 0) {
            return null;
        }
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        GZIPInputStream ungzip = null;
        try {
            ungzip = new GZIPInputStream(new ByteArrayInputStream(bytes));
            byte[] buffer = new byte[4096];
            int n;
            while ((n = ungzip.read(buffer)) >= 0) {
                out.write(buffer, 0, n);
            }
            return out.toByteArray();
        } catch (IOException e) {
            // Report instead of silently handing back an empty/partial array that
            // callers cannot tell apart from a successful result.
            e.printStackTrace();
            return null;
        } finally {
            if (ungzip != null) {
                try {
                    // Closing releases the Inflater's native memory right away.
                    ungzip.close();
                } catch (IOException ignored) {
                    // In-memory source; nothing to recover.
                }
            }
        }
    }
}

以上类均依赖一个StringUtil,笔者比较懒,也没有拆分出来

package org.coody.robot.util;

import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//import oracle.sql.CLOB;

/**
 * Static helpers for strings, simple parsing, collections and weighted random picks.
 *
 * <p>Most methods signal "nothing usable" by returning {@code null} rather than
 * throwing; see the individual method contracts.
 */
public class StringUtil {

	// Compiled once; Pattern instances are immutable and safe to share.
	private static final Pattern BLANK_PATTERN = Pattern.compile("\\s*|\t|\r|\n");
	private static final Pattern MOBILE_PATTERN = Pattern
			.compile("^((13[0-9])|(15[^4,\\D])|(17[^4,\\D])|(18[0,5-9]))\\d{8}$");
	private static final Pattern LEGAL_PATTERN = Pattern.compile("[A-Za-z0-9_]{3,16}");
	private static final Pattern EMAIL_PATTERN = Pattern.compile(
			"^([a-zA-Z0-9_\\-\\.]+)@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([a-zA-Z0-9\\-]+\\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\\]?)$");
	private static final Pattern MD5_PATTERN = Pattern.compile("[A-Za-z0-9_]{16,40}");

	/**
	 * Converts each element to an Integer via its string form.
	 *
	 * @return array of the same length where unparsable or null elements stay
	 *         {@code null}; {@code null} if the input is null/empty
	 */
	public static Integer[] getIntegerParas(Object[] objs) {
		if (isNullOrEmpty(objs)) {
			return null;
		}
		Integer[] ints = new Integer[objs.length];
		for (int i = 0; i < objs.length; i++) {
			try {
				ints[i] = Integer.valueOf(objs[i].toString());
			} catch (Exception ignored) {
				// Best effort: the slot stays null for values that are not integers.
			}
		}
		return ints;
	}

	/**
	 * Repeats {@code baseStr} {@code size} times, joined by {@code mosaicChr}.
	 *
	 * @return the joined string, or {@code null} if nothing was produced
	 */
	public static String getByMosaicChr(String baseStr, String mosaicChr, Integer size) {
		List<String> list = new ArrayList<String>();
		if (!isNullOrEmpty(baseStr)) {
			for (int i = 0; i < size; i++) {
				list.add(baseStr);
			}
		}
		return collectionMosaic(list, mosaicChr);
	}

	/**
	 * Splits {@code src} on a literal separator, keeping empty tokens
	 * (including a trailing one).
	 *
	 * @param src       source string
	 * @param separator literal separator (not a regex); may be longer than one character
	 * @return the tokens
	 */
	public static String[] splitToStringArray(String src, String separator) {
		List<String> parts = new ArrayList<String>();
		// Advance by the separator's real length; at least 1 so an empty separator terminates.
		int step = Math.max(1, separator.length());
		int i = 0;
		while (i <= src.length()) {
			int j = src.indexOf(separator, i);
			if (j < 0) {
				j = src.length();
			}
			parts.add(src.substring(i, j));
			i = j + step;
		}
		return parts.toArray(new String[parts.size()]);
	}

	/**
	 * Splits {@code src} on a literal separator and parses every token as an Integer.
	 *
	 * @throws NumberFormatException if a token is not an integer
	 */
	public static Integer[] splitToIntgArray(String src, String separator) {
		String[] arr = splitToStringArray(src, separator);
		Integer[] intArr = new Integer[arr.length];
		for (int i = 0; i < arr.length; i++) {
			intArr[i] = Integer.valueOf(arr[i]);
		}
		return intArr;
	}

	/**
	 * Splits {@code src} on a literal separator and parses every token as an int.
	 *
	 * @throws NumberFormatException if a token is not an integer
	 */
	public static int[] splitToIntArray(String src, String separator) {
		String[] arr = splitToStringArray(src, separator);
		int[] intArr = new int[arr.length];
		for (int i = 0; i < arr.length; i++) {
			intArr[i] = Integer.parseInt(arr[i]);
		}
		return intArr;
	}

	/** Builds a comma-separated list of {@code size} question marks, e.g. {@code ?,?,?}. */
	public static String getInPara(Integer size) {
		return getByMosaicChr("?", ",", size);
	}

	/**
	 * Returns the text between the first {@code firstTxt} and the next {@code lastTxt}.
	 *
	 * @return the enclosed text, or {@code ""} if either marker is missing or an argument is null
	 */
	public static String textCutCenter(String allTxt, String firstTxt, String lastTxt) {
		if (allTxt == null || firstTxt == null || lastTxt == null) {
			return "";
		}
		int start = allTxt.indexOf(firstTxt);
		if (start == -1) {
			return "";
		}
		start += firstTxt.length();
		int end = allTxt.indexOf(lastTxt, start);
		if (end == -1) {
			return "";
		}
		return allTxt.substring(start, end);
	}

	/**
	 * Returns every text fragment enclosed by {@code firstTxt} ... {@code lastTxt},
	 * scanning left to right without overlap.
	 *
	 * @return the fragments in order (possibly empty), or {@code null} if an argument is null
	 */
	public static List<String> textCutCenters(String allTxt, String firstTxt, String lastTxt) {
		if (allTxt == null || firstTxt == null || lastTxt == null) {
			return null;
		}
		List<String> results = new ArrayList<String>();
		if (firstTxt.length() == 0 && lastTxt.length() == 0) {
			// Two empty markers would match forever without consuming input.
			return results;
		}
		int from = 0;
		while (true) {
			int start = allTxt.indexOf(firstTxt, from);
			if (start < 0) {
				break;
			}
			start += firstTxt.length();
			int end = allTxt.indexOf(lastTxt, start);
			if (end < 0) {
				break;
			}
			results.add(allTxt.substring(start, end));
			// Resume after the closing marker, i.e. skip lastTxt's length, not firstTxt's.
			from = end + lastTxt.length();
		}
		return results;
	}

	/**
	 * Encodes every code point as {@code &#<decimal>}.
	 * NOTE(review): no terminating ';' is emitted, as before; confirm consumers expect that.
	 */
	public static String convertToUnicode(String source) {
		StringBuilder result = new StringBuilder();
		for (int i = 0; i < source.length();) {
			int codePoint = source.codePointAt(i);
			result.append("&#").append(codePoint);
			// Step over both halves of a surrogate pair so it is emitted only once.
			i += Character.charCount(codePoint);
		}
		return result.toString();
	}

	/** Parses the string form of {@code obj} as an Integer; {@code null} if empty or unparsable. */
	public static Integer toInteger(Object obj) {
		if (isNullOrEmpty(obj)) {
			return null;
		}
		try {
			return Integer.valueOf(obj.toString());
		} catch (Exception e) {
			return null;
		}
	}

	/** Returns {@code obj.toString()}, or {@code null} if {@code obj} is null/empty. */
	public static String toString(Object obj) {
		if (isNullOrEmpty(obj)) {
			return null;
		}
		try {
			return obj.toString();
		} catch (Exception e) {
			return null;
		}
	}

	/** Parses the string form of {@code obj} as a Double; {@code null} if empty or unparsable. */
	public static Double toDouble(Object obj) {
		if (isNullOrEmpty(obj)) {
			return null;
		}
		try {
			return Double.valueOf(obj.toString());
		} catch (Exception e) {
			return null;
		}
	}

	/** Parses the string form of {@code obj} as a Float; {@code null} if empty or unparsable. */
	public static Float toFloat(Object obj) {
		if (isNullOrEmpty(obj)) {
			return null;
		}
		try {
			return Float.valueOf(obj.toString());
		} catch (Exception e) {
			return null;
		}
	}

	/** Parses the string form of {@code obj} as a Long; {@code null} if empty or unparsable. */
	public static Long toLong(Object obj) {
		if (isNullOrEmpty(obj)) {
			return null;
		}
		try {
			return Long.valueOf(obj.toString());
		} catch (Exception e) {
			return null;
		}
	}

	/** Returns a pseudo-random integer in {@code [start, end]} (both inclusive). */
	public static Integer getRanDom(int start, int end) {
		return (int) (Math.random() * (end - start + 1)) + start;
	}

	/**
	 * Returns a pseudo-random float between {@code start} and {@code end}, with as many
	 * decimals as the more precise of the two bounds (judged by their string form).
	 */
	public static float getRanDom(Float start, Float end) {
		int decimals = Math.max(decimalCount(start), decimalCount(end));
		double scale = Math.pow(10, decimals);
		Float scaledStart = (float) (start * scale);
		Float scaledEnd = (float) (end * scale);
		return (float) (getRanDom(scaledStart.intValue(), scaledEnd.intValue()) / scale);
	}

	/** Number of digits after the '.' in the string form of {@code value}; 1 if there is no single '.'. */
	private static int decimalCount(Float value) {
		String[] tabs = String.valueOf(value).split("\\.");
		return tabs.length == 2 ? tabs[1].length() : 1;
	}

	/** Removes all whitespace; returns {@code ""} for {@code null}. */
	public static String replaceBlank(String str) {
		if (str == null) {
			return "";
		}
		return BLANK_PATTERN.matcher(str).replaceAll("");
	}

	/** Tells whether the whole of {@code val} matches the regex {@code matcher}. */
	public static Boolean isMatcher(String val, String matcher) {
		return Pattern.compile(matcher).matcher(val).matches();
	}

	/** Tells whether {@code mobile} matches the built-in 11-digit mobile number pattern. */
	public static boolean isMobile(String mobile) {
		if (isNullOrEmpty(mobile)) {
			return false;
		}
		return MOBILE_PATTERN.matcher(mobile).matches();
	}

	/** Tells whether {@code str} is 3-16 characters of letters, digits or underscore. */
	public static boolean isLegal(String str) {
		if (isNullOrEmpty(str)) {
			return false;
		}
		return LEGAL_PATTERN.matcher(str).matches();
	}

	/** Tells whether {@code email} matches the built-in e-mail pattern. */
	public static boolean isEmail(String email) {
		if (isNullOrEmpty(email)) {
			return false;
		}
		return EMAIL_PATTERN.matcher(email).matches();
	}

	/** Tells whether {@code md5} is 16-40 characters of letters, digits or underscore. */
	public static boolean isMd5(String md5) {
		if (isNullOrEmpty(md5)) {
			return false;
		}
		return MD5_PATTERN.matcher(md5).matches();
	}

	/** Returns {@code true} if every argument is null/empty (or there are none). */
	public static boolean isAllNull(Object... obj) {
		if (obj == null || obj.length == 0) {
			return true;
		}
		for (int i = 0; i < obj.length; i++) {
			if (!isNullOrEmpty(obj[i])) {
				return false;
			}
		}
		return true;
	}

	/** Returns {@code true} if the list is null or every element is null/empty. */
	public static boolean isAllNull(List<Object> objs) {
		if (objs == null) {
			return true;
		}
		return isAllNull(objs.toArray());
	}

	/**
	 * Joins the non-empty elements of an array.
	 *
	 * @return the joined string, or {@code null} if the array is null/empty
	 */
	public static String collectionMosaic(Object[] objs, String mosaicChr) {
		if (isNullOrEmpty(objs)) {
			return null;
		}
		List<Object> objList = Arrays.asList(objs);
		return collectionMosaic(objList, mosaicChr);
	}

	/** Joins the decimal form of every int with {@code mosaicChr}. */
	public static String collectionMosaic(int[] intObjs, String mosaicChr) {
		Object[] objs = new Object[intObjs.length];
		for (int i = 0; i < intObjs.length; i++) {
			objs[i] = String.valueOf(intObjs[i]);
		}
		return collectionMosaic(objs, mosaicChr);
	}

	/**
	 * Joins the non-empty arguments with {@code mosaicChr}.
	 *
	 * @return the joined string, or {@code null} if there are no arguments
	 */
	public static String collectionMosaic(String mosaicChr, Object... objs) {
		if (objs == null) {
			return null;
		}
		List<Object> objList = Arrays.asList(objs);
		return collectionMosaic(objList, mosaicChr);
	}

	/**
	 * Joins the non-empty elements of a list with {@code mosaicChr}.
	 *
	 * @param objs      elements; null/empty ones are skipped
	 * @param mosaicChr separator placed between emitted elements
	 * @return the joined string (no leading or trailing separator), or {@code null}
	 *         if the list is null or has no elements
	 */
	public static String collectionMosaic(List<?> objs, String mosaicChr) {
		if (objs == null || objs.isEmpty()) {
			return null;
		}
		StringBuilder sb = new StringBuilder();
		boolean first = true;
		for (Object obj : objs) {
			if (isNullOrEmpty(obj)) {
				continue;
			}
			// Separator goes *before* every element but the first one actually emitted,
			// so skipped elements can never leave a dangling separator behind.
			if (!first) {
				sb.append(mosaicChr);
			}
			sb.append(obj);
			first = false;
		}
		return sb.toString();
	}

	/** Same as {@link #getByMosaicChr(String, String, Integer)}; kept for existing callers. */
	public static String getStringSByMosaicChr(String baseStr, String mosaicChr, Integer size) {
		return getByMosaicChr(baseStr, mosaicChr, size);
	}

	/**
	 * Splits {@code text} and returns the non-empty tokens.
	 *
	 * @param mosaiChr separator, interpreted as a regular expression by {@link String#split(String)}
	 * @return non-empty tokens, or {@code null} if an argument is null
	 */
	public static List<String> splitByMosaic(String text, String mosaiChr) {
		if (text == null || mosaiChr == null) {
			return null;
		}
		List<String> list = new ArrayList<String>();
		for (String token : text.split(mosaiChr)) {
			if (!isNullOrEmpty(token)) {
				list.add(token);
			}
		}
		return list;
	}

	/**
	 * Splits {@code text} and returns the tokens that parse as integers.
	 *
	 * @param mosaiChr separator, interpreted as a regular expression
	 * @return parsed values (unparsable tokens are skipped), or {@code null} if an argument is null
	 */
	public static List<Integer> splitByMosaicInteger(String text, String mosaiChr) {
		if (text == null || mosaiChr == null) {
			return null;
		}
		List<Integer> list = new ArrayList<Integer>();
		for (String token : text.split(mosaiChr)) {
			if (isNullOrEmpty(token)) {
				continue;
			}
			try {
				list.add(Integer.valueOf(token));
			} catch (Exception ignored) {
				// Best effort: tokens that are not integers are skipped.
			}
		}
		return list;
	}

	/**
	 * Splits {@code text} and parses each token as an Integer, position by position.
	 *
	 * @param mosaiChr separator, interpreted as a regular expression
	 * @return array aligned with the tokens; empty or unparsable tokens leave a
	 *         {@code null} slot. {@code null} if an argument is null
	 */
	public static Integer[] splitByMosaicIntegers(String text, String mosaiChr) {
		if (text == null || mosaiChr == null) {
			return null;
		}
		String[] tab = text.split(mosaiChr);
		Integer[] list = new Integer[tab.length];
		for (int i = 0; i < tab.length; i++) {
			if (isNullOrEmpty(tab[i])) {
				continue;
			}
			try {
				list[i] = Integer.valueOf(tab[i]);
			} catch (Exception ignored) {
				// Best effort: the slot stays null.
			}
		}
		return list;
	}

	/**
	 * Finds all non-empty matches of a regex (compiled with DOTALL).
	 *
	 * @return matches in order, or {@code null} if the pattern is invalid or an argument is null
	 */
	public static List<String> doMatcher(String context, String pat) {
		try {
			List<String> matches = new ArrayList<String>();
			Matcher matcher = Pattern.compile(pat, Pattern.DOTALL).matcher(context);
			// find() advances past zero-length matches on its own; restarting with
			// find(end) would retry the same position forever for patterns like "a*".
			while (matcher.find()) {
				String hit = matcher.group(0);
				if (!isNullOrEmpty(hit)) {
					matches.add(hit);
				}
			}
			return matches;
		} catch (Exception e) {
			return null;
		}
	}

	/** Returns the first non-empty regex match, or {@code null} if there is none. */
	public static String doMatcherFirst(String context, String pat) {
		List<String> strs = doMatcher(context, pat);
		if (StringUtil.isNullOrEmpty(strs)) {
			return null;
		}
		return strs.get(0);
	}

	/**
	 * Tells whether a value is "empty": {@code null}, a zero-length CharSequence,
	 * an empty Collection or Map, or an Object array with no non-empty element.
	 * Any other object (including primitive arrays) counts as non-empty.
	 */
	public static boolean isNullOrEmpty(Object obj) {
		try {
			if (obj == null) {
				return true;
			}
			if (obj instanceof CharSequence) {
				return ((CharSequence) obj).length() == 0;
			}
			if (obj instanceof Collection) {
				return ((Collection<?>) obj).isEmpty();
			}
			if (obj instanceof Map) {
				return ((Map<?, ?>) obj).isEmpty();
			}
			if (obj instanceof Object[]) {
				for (Object element : (Object[]) obj) {
					if (!isNullOrEmpty(element)) {
						return false;
					}
				}
				return true;
			}
			return false;
		} catch (Exception e) {
			return true;
		}
	}

	/**
	 * Returns the index of the first null/empty argument, 0 if all arguments are
	 * empty (or there are none), or -1 if none is empty.
	 */
	public static Integer findNull(Object... objs) {
		if (isNullOrEmpty(objs)) {
			return 0;
		}
		for (int i = 0; i < objs.length; i++) {
			if (isNullOrEmpty(objs[i])) {
				return i;
			}
		}
		return -1;
	}

	/** Returns {@code true} if any argument is null/empty. */
	public static boolean hasNull(Object... objs) {
		return findNull(objs) > -1;
	}

	/** Tells whether {@code str} parses as an int (range-limited; decimals do not count). */
	public static Boolean isNumber(String str) {
		if (isNullOrEmpty(str)) {
			return false;
		}
		try {
			Integer.valueOf(str);
			return true;
		} catch (Exception e) {
			return false;
		}
	}

	/** Concatenates all array elements without a separator. */
	public static String argsToString(String[] args) {
		StringBuilder sb = new StringBuilder();
		for (String tmp : args) {
			sb.append(tmp);
		}
		return sb.toString();
	}

	/** Splits a string into one-character strings; {@code null} for null/empty input. */
	public static String[] splitString(String str) {
		if (isNullOrEmpty(str)) {
			return null;
		}
		String[] finalStrs = new String[str.length()];
		for (int i = 0; i < str.length(); i++) {
			finalStrs[i] = str.substring(i, i + 1);
		}
		return finalStrs;
	}

	/**
	 * Concatenates the arguments, writing the literal {@code null} for each
	 * null/empty one.
	 *
	 * @return the concatenation, or {@code ""} if all arguments are empty
	 */
	public static String getString(Object... objs) {
		if (isNullOrEmpty(objs)) {
			return "";
		}
		StringBuilder sb = new StringBuilder();
		for (Object obj : objs) {
			if (isNullOrEmpty(obj)) {
				// Placeholder replaces the value; previously the value was appended as well.
				sb.append("null");
				continue;
			}
			sb.append(String.valueOf(obj));
		}
		return sb.toString();
	}

	/** Returns the characters of {@code str} in sorted order; {@code ""} for null/empty input. */
	public static String stringSort(String str) {
		if (isNullOrEmpty(str)) {
			return "";
		}
		String[] strs = splitString(str);
		Arrays.sort(strs);
		return argsToString(strs);
	}

	/**
	 * Returns the elements of {@code needList} that are missing from {@code actualList}.
	 *
	 * @param needList   elements that are required
	 * @param actualList elements that are present; {@code null} is treated as empty
	 * @return the missing elements, or {@code null} if nothing is missing
	 */
	public static List<?> collisionList(List<?> needList, List<?> actualList) {
		if (needList == null) {
			return null;
		}
		List<Object> list = new ArrayList<Object>();
		for (Object o : needList) {
			if (actualList != null && actualList.contains(o)) {
				continue;
			}
			list.add(o);
		}
		if (isNullOrEmpty(list)) {
			return null;
		}
		return list;
	}

	/** Widens every Integer to a Long; {@code null} for a null/empty list. */
	public static List<Long> integerListToLong(List<Integer> ids) {
		if (isNullOrEmpty(ids)) {
			return null;
		}
		List<Long> list = new ArrayList<Long>(ids.size());
		for (Integer id : ids) {
			list.add(Long.valueOf(id.longValue()));
		}
		return list;
	}

	/**
	 * Returns the elements of {@code allList} that do not occur in {@code conflictList}.
	 *
	 * @param allList      elements expected to be present
	 * @param conflictList elements actually present
	 * @return the missing elements; {@code allList} itself if {@code conflictList} is
	 *         empty; {@code null} if {@code allList} is empty or nothing is missing
	 */
	public static List<?> listConflict(List<?> allList, List<?> conflictList) {
		if (isNullOrEmpty(allList)) {
			return null;
		}
		if (isNullOrEmpty(conflictList)) {
			return allList;
		}
		List<Object> list = new ArrayList<Object>();
		for (Object obj : allList) {
			if (conflictList.contains(obj)) {
				continue;
			}
			list.add(obj);
		}
		if (isNullOrEmpty(list)) {
			return null;
		}
		return list;
	}

	/**
	 * Weighted random pick: returns index {@code i} with probability proportional to
	 * {@code prs[i]}. Falls back to 0 when no index qualifies (e.g. all weights are 0).
	 */
	public static Integer bambooParse(Integer... prs) {
		int total = 0;
		for (Integer pr : prs) {
			total += pr;
		}
		int random = getRanDom(1, total);
		int running = 0;
		for (int i = 0; i < prs.length; i++) {
			running += prs[i];
			if (random <= running) {
				return i;
			}
		}
		return 0;
	}

	/** Sums the arguments; -1 if there are none. */
	public static Integer SumInteger(Integer... sums) {
		if (isNullOrEmpty(sums)) {
			return -1;
		}
		int total = 0;
		for (Integer tmp : sums) {
			total += tmp;
		}
		return total;
	}

	/**
	 * Weighted random pick over integer weights.
	 *
	 * @param chances weight of each candidate
	 * @return index of the chosen candidate, or -1 if there are no weights or none qualifies
	 */
	public static Integer getBambooIndex(Integer... chances) {
		if (isNullOrEmpty(chances)) {
			return -1;
		}
		int random = getRanDom(1, SumInteger(chances));
		int running = 0;
		for (int i = 0; i < chances.length; i++) {
			running += chances[i];
			if (random <= running) {
				return i;
			}
		}
		return -1;
	}

	/** Returns a copy without null/empty elements; {@code null} if nothing remains. */
	public static List<?> removeEmpty(List<?> list) {
		if (StringUtil.isNullOrEmpty(list)) {
			return null;
		}
		List<Object> newList = new ArrayList<Object>(list.size());
		for (Object obj : list) {
			if (isNullOrEmpty(obj)) {
				continue;
			}
			newList.add(obj);
		}
		if (isNullOrEmpty(newList)) {
			return null;
		}
		return newList;
	}

	/**
	 * Weighted random pick over fractional weights. The weights are scaled to
	 * integers by the largest number of decimals found in any weight's string form.
	 *
	 * @param chanceSources weight of each candidate; the array is not modified
	 * @return index of the chosen candidate, or -1 if there are no weights or none qualifies
	 */
	public static Integer getBambooIndex(Float... chanceSources) {
		if (isNullOrEmpty(chanceSources)) {
			return -1;
		}
		// Scale by the MOST precise weight; scaling by whichever weight happens to
		// come last truncates finer-grained weights to 0.
		int decimals = 0;
		for (Float chance : chanceSources) {
			String[] tabs = String.valueOf(chance).split("\\.");
			if (tabs.length == 2) {
				decimals = Math.max(decimals, tabs[1].length());
			}
		}
		int multiple = (int) Math.pow(10, decimals);
		Integer[] chanceInts = new Integer[chanceSources.length];
		for (int i = 0; i < chanceSources.length; i++) {
			// Round rather than truncate so float noise (e.g. 6.9999995) does not lose a unit.
			chanceInts[i] = Math.round(chanceSources[i] * multiple);
		}
		return getBambooIndex(chanceInts);
	}

	/** Returns {@code f1 - f2}, computed on the decimal string forms to avoid binary float noise. */
	public static Float floatCut(Float f1, Float f2) {
		BigDecimal b1 = new BigDecimal(Float.toString(f1));
		BigDecimal b2 = new BigDecimal(Float.toString(f2));
		return b1.subtract(b2).floatValue();
	}

	/**
	 * Returns the text after the last '.' of a URL.
	 *
	 * @return the suffix, or {@code ""} if the URL is empty or contains no usable '.'
	 */
	public static String getSuffix(String url) {
		if (isNullOrEmpty(url)) {
			return "";
		}
		String[] tab = url.split("\\.");
		if (tab.length > 1) {
			return tab[tab.length - 1];
		}
		return "";
	}

}

为了方便我们使用,特意为百度云防护的网站封装了一个工具类RobotHttpHandle,维护了Cookie机制

package org.coody.robot.rote;

import java.util.Date;

import javax.script.Invocable;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;

import org.coody.robot.util.HttpEntity;
import org.coody.robot.util.HttpHandle;
import org.coody.robot.util.StringUtil;


public class RobotHttpHandle {

	public static String cookie="";
	
	
	public RobotHttpHandle(){
		
	}
	
	
	private HttpEntity initCookie(String url){
		try {
			String baseURL=HttpHandle.getDomain(url);
			HttpHandle http=new HttpHandle();
			http.config.setRequestProperty("If-Modified-Since", new Date().toString());
			http.config.setRequestProperty("Cache-Control", "max-age=0");
			http.config.setRequestProperty("Upgrade-Insecure-Requests", "1");
			http.config.setKeepAlive(true);
			HttpEntity entity = http.Get(baseURL);
			System.out.println(entity.getCookie());
			String html = entity.getHtml();
			String temp = html.replace(" ", "");
			String jschl_vc = StringUtil.textCutCenter(temp, "jschl_vc\"value=\"", "\"");
			String pass = StringUtil.textCutCenter(temp, "pass\"value=\"", "\"");

			String funcCode = StringUtil.textCutCenter(html, "setTimeout(function(){", "f.submit();");

			funcCode = funcCode.replace("a.value", "a");
			funcCode = funcCode.replace("  ", " ");
			String[] tabs = funcCode.split("\n");
			funcCode = tabs[1];
			funcCode += "\r\nt=\"" + baseURL + "\";";
			funcCode += "\r\nr = t.match(/https?:\\/\\//)[0];";
			funcCode += "\r\nt = t.substr(r.length);";
			funcCode += "\r\nt = t.substr(0, t.length - 1);";
			funcCode += tabs[8];
			funcCode += "\r\n return a;";

			funcCode = "function jschl_answer(){\r\n" + funcCode + "\r\n}";

			ScriptEngineManager manager = new ScriptEngineManager();
			ScriptEngine engine = manager.getEngineByName("js");
			engine.eval(funcCode);
			Invocable invocable = (Invocable) engine;
			Double jschl_answer = (Double) invocable.invokeFunction("jschl_answer");
			url=baseURL+"/cdn-cgi/l/chk_jschl?jschl_vc="+jschl_vc+"&pass="+pass+"&jschl_answer="+jschl_answer.intValue();
			http.config.allowRedirects(false);
			System.out.println(url);
			Thread.sleep(3800l);
			http.config.setGzip(true);
			entity=http.Get(url);
			cookie=entity.getCookie();
			if(!cookie.contains("cf_clearance")){
				return null;
			}
			return entity;
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
	}
	
	public HttpEntity Get(String url){
		if(cookie!=null&&!"".equals(cookie)){
			loadCookie(url);
		}
		HttpHandle http=new HttpHandle();
		http.config.setRequestProperty("If-Modified-Since", new Date().toString());
		http.config.setRequestProperty("Cache-Control", "max-age=0");
		http.config.setRequestProperty("Upgrade-Insecure-Requests", "1");
		http.config.setKeepAlive(true);
		http.config.setCookie(cookie);
		HttpEntity entity=http.Get(url);
		if(entity.getCode()!=200){
			loadCookie(url);
			http.config.setCookie(cookie);
			entity=http.Get(url);
		}
		return entity;
	}
	
	public void loadCookie(String url){
		cookie=null;
		HttpEntity entity=initCookie(url);
		while(entity==null){
			entity=initCookie(url);
		}
	}
	
	
	public static void main(String[] args) throws NoSuchMethodException, ScriptException, InterruptedException {
		HttpEntity entity=new RobotHttpHandle().Get("http://www.myexception.cn/");
		System.out.println(entity.getHtml());
	}
}

使用方式:

	HttpEntity entity=new RobotHttpHandle().Get("http://www.myexception.cn/");
	System.out.println(entity.getHtml());

如图:

输入图片说明

转载于:https://my.oschina.net/hooker/blog/1547276

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值