HttpClient抓取解析网站支持多种验证方式

工作中需要抓取多个项目的数据,而且各项目的验证方式不同,包括HTTP标准验证(NTLM、BASIC)和非标准验证(表单POST登录),于是写了个较通用的抓数框架,支持多线程。用到的开源框架有HttpClient 4.2.3、Jsoup、JSONObject(json-lib)、Spring 3.0。注意:HttpClient不同版本的API差异较大。
设计思路:采用bean加Spring配置文件的方式配置多个项目的属性,实现各项目的自动登录;自定义解析类实现通用接口或继承抽象类;最后通过url传参(m_parse)指定解析类,利用反射实例化该类,从而实现方法的通用。

<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="
http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd">

<!-- Login settings for the remote sites; one map entry per site, several sites may be configured -->
<bean class="com.berheley.bi.grp.fetch.handler.HttpHandler" init-method="init">
<property name="maxTotal" value="400"/>
<property name="maxRoute" value="200"/>
<property name="cnTimeOut" value="60000"/>
<property name="soTimeOut" value="60000"/>
<property name="attributes">
<map>
<entry key="60.28.43.164"> <!-- domain name or IP address, with the port number if any -->
<bean class="com.berheley.bi.grp.fetch.pojo.HttpAttributes">
<property name="packPath" value="com.berheley.bi.grp.fetch.custom.business"/>
<property name="domain" value="60.28.43.164"/>
<property name="port" value="80"/>
<property name="loginUrl" value=""/> <!-- full URL the login form is submitted to, e.g. http://www.iteye.com/login.jsp -->
<property name="errorUrl" value=""/> <!-- address requested after a failed login, e.g. /error.jsp -->
<property name="scheme" value="NTLM"/>
<property name="params">
<map>
<entry key="username" value="登录名"/>
<entry key="password" value="密码"/>
</map>
</property>
</bean>
</entry>
</map>
</property>
</bean>
</beans>


import java.util.Map;

import org.apache.http.client.HttpClient;

/**
 * Settings of one remote site: how to reach it, how to log in, and where its page parsers live.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $
 * Created: 2014-02-26 13:49:02
 */
public class HttpAttributes {

    /** Client used to send requests to this site. */
    private HttpClient httpClient;

    /** Package that contains the classes parsing pages of this site. */
    private String packPath;

    /** Domain name or IP address. */
    private String domain = "";

    /** Full URL the login form is submitted to, e.g. http://www.iteye.com/login.jsp */
    private String loginUrl = "";

    /** Address requested after a failed login, e.g. /error.jsp */
    private String errorUrl = "";

    /** Port number. */
    private int port = 80;

    /** Login parameters. */
    private Map<String, String> params;

    /** Authentication scheme. */
    private String scheme;

    // ---- connection ----

    public HttpClient getHttpClient() {
        return this.httpClient;
    }

    public void setHttpClient(HttpClient httpClient) {
        this.httpClient = httpClient;
    }

    public String getDomain() {
        return this.domain;
    }

    public void setDomain(String domain) {
        this.domain = domain;
    }

    public int getPort() {
        return this.port;
    }

    public void setPort(int port) {
        this.port = port;
    }

    // ---- login ----

    public String getScheme() {
        return this.scheme;
    }

    public void setScheme(String scheme) {
        this.scheme = scheme;
    }

    public String getLoginUrl() {
        return this.loginUrl;
    }

    public void setLoginUrl(String loginUrl) {
        this.loginUrl = loginUrl;
    }

    public String getErrorUrl() {
        return this.errorUrl;
    }

    public void setErrorUrl(String errorUrl) {
        this.errorUrl = errorUrl;
    }

    public Map<String, String> getParams() {
        return this.params;
    }

    public void setParams(Map<String, String> params) {
        this.params = params;
    }

    // ---- parsing ----

    public String getPackPath() {
        return this.packPath;
    }

    public void setPackPath(String packPath) {
        this.packPath = packPath;
    }
}

import java.util.Map;

/**
 * Common contract implemented by every response parser.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $
 * Created: 2014-02-26 13:53:10
 * @param <T> type of the value produced by the parser
 */
public interface IParse<T> {

    /**
     * Produces the parse result.
     *
     * @param params request parameters; these carry the custom parameters of the url
     *               whose names start with {@code m_}
     * @return the parse result
     */
    T process(Map<String, Object> params);
}

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Base class for parsers of HTML pages. Extend it to parse a concrete page; helpers shared by
 * several HTML parsers can be added here.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $
 * Created: 2014-02-19 13:53:53
 * @param <T> type of the value produced by the parser
 */
public abstract class HtmlParse<T> implements IParse<T> {

    /** Logger available to subclasses. */
    protected Log log = LogFactory.getLog(HtmlParse.class);

    /** The page, parsed by Jsoup. */
    protected Document doc;

    /**
     * Parses the given HTML text and keeps the resulting document for subclasses.
     *
     * @param doc HTML text of the page
     */
    public HtmlParse(String doc) {
        Document parsedPage = Jsoup.parse(doc);
        this.doc = parsedPage;
    }
}

import net.sf.json.JSONObject;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Base class for parsers of JSON responses. Extend it to parse a concrete response; helpers
 * shared by several JSON parsers can be added here.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $
 * Created: 2014-02-19 13:53:53
 * @param <T> type of the value produced by the parser
 */
public abstract class JsonParse<T> implements IParse<T> {

    /** Logger available to subclasses. */
    protected Log log = LogFactory.getLog(JsonParse.class);

    /** The response, converted to a JSON object. */
    protected JSONObject doc;

    /**
     * Converts the given JSON text and keeps the resulting object for subclasses.
     *
     * @param doc JSON text of the response
     */
    public JsonParse(String doc) {
        JSONObject parsedJson = JSONObject.fromObject(doc);
        this.doc = parsedJson;
    }
}

package com.berheley.bi.grp.fetch.parse;

import java.util.Map;

/**
 * Common contract implemented by every response parser.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $
 * Created: 2014-02-26 13:53:10
 * @param <T> type of the value produced by the parser
 */
public interface IParse<T> {

    /**
     * Produces the parse result.
     *
     * @param params request parameters; these carry the custom parameters of the url
     *               whose names start with {@code m_}
     * @return the parse result
     */
    T process(Map<String, Object> params);
}

import java.util.Map;

import net.sf.json.JSONObject;

import com.berheley.bi.grp.fetch.parse.HtmlParse;


public class FyxxInfoHtmlParse extends HtmlParse<String>{

public FyxxInfoHtmlParse(String doc) {
super(doc);
}

@Override
public String process(Map<String, Object> params) {
JSONObject jsonObj = new JSONObject();

//价位无
String tfj_rentcost = doc.getElementById("tfj_rentcost").val(); //租金

String tfj_buildingarea = doc.getElementById("tfj_buildingarea")==null?"":doc.getElementById("tfj_buildingarea").val();//面积

String tfj_standardstorey = doc.getElementById("tfj_standardstorey").val();// 标准层高

String tfj_floorloading = doc.getElementById("tfj_floorloading_d").val();//楼面承重 tfj_floorloading_d

String tfj_phone = doc.getElementById("tfj_phone").val();//业主单位联系方式

String tfj_propertycost = doc.getElementById("tfj_propertycost").val();//物业

String tfj_watercost = doc.getElementById("tfj_watercost").val();//水

String tfj_eleccost = doc.getElementById("tfj_eleccost").val();//电


jsonObj.put("rentcost", tfj_rentcost);
jsonObj.put("buildingarea", tfj_buildingarea);
jsonObj.put("standardstorey", tfj_standardstorey);
jsonObj.put("floorloading", tfj_floorloading);
jsonObj.put("phone", tfj_phone);
jsonObj.put("propertycost", tfj_propertycost);
jsonObj.put("watercost", tfj_watercost);
jsonObj.put("eleccost", tfj_eleccost);

jsonObj.put("success", true);
return jsonObj.toString();
}

}

/**
 * Constants shared by the fetch classes.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $
 * Created: 2014-02-28 14:37:32
 */
public final class HttpConstant {

    // ---- login scheme name ----

    /** Name of the form-POST login scheme. */
    public static final String POST = "POST";

    // ---- keys of the custom request parameters ----

    /** Key of the url to fetch. */
    public static final String URL = "m_url";

    /** Key of the parser class name. */
    public static final String PARSE = "m_parse";

    // ---- charset names ----

    public static final String GBK = "gbk";

    public static final String UTF8 = "UTF-8";
}

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Constructor;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.entity.ContentType;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;

import com.berheley.bi.basic.exp.BusinessException;
import com.berheley.bi.grp.fetch.parse.IParse;
import com.berheley.bi.grp.fetch.pojo.HttpAttributes;

/**
 * Static helpers for sending requests, reading responses and preparing request parameters.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $
 * Created: 2014-02-19 13:53:18
 */
public final class HttpUtils {

    private static final Log log = LogFactory.getLog(HttpUtils.class);

    /**
     * Sends a GET request.
     *
     * @param httpclient client used to send the request
     * @param url        address to request
     * @return the response, never {@code null}
     * @throws IllegalStateException if the request fails with an I/O or protocol error
     */
    public static HttpResponse httpGet(HttpClient httpclient, String url) {
        HttpResponse response = execute(httpclient, new HttpGet(url), null);
        log.info("get status: " + response.getStatusLine());
        return response;
    }

    /**
     * Sends a GET request within an execution context.
     *
     * @param httpclient client used to send the request
     * @param url        address to request
     * @param context    execution context, e.g. {@code new BasicHttpContext()}; after the call it
     *                   gives access to the request that was finally executed
     * @return the response, never {@code null}
     * @throws IllegalStateException if the request fails with an I/O or protocol error
     */
    public static HttpResponse httpGet(HttpClient httpclient, String url, HttpContext context) {
        HttpResponse response = execute(httpclient, new HttpGet(url), context);
        log.info("get status: " + response.getStatusLine());
        return response;
    }

    /**
     * Sends a GET request and, for sites using the form-POST login scheme, logs in and repeats
     * the request when the first attempt ended on the configured error address.
     *
     * @param httpclient client used to send the requests
     * @param url        address to request
     * @param context    execution context, e.g. {@code new BasicHttpContext()}
     * @param attributes settings of the site (scheme, error address, login address, login parameters)
     * @return the response of the last GET request
     * @throws IllegalStateException if a request fails with an I/O or protocol error
     */
    public static HttpResponse httpGetByScheme(HttpClient httpclient, String url, HttpContext context, HttpAttributes attributes) {
        HttpResponse response = httpGet(httpclient, url, context);
        HttpUriRequest req = (HttpUriRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST);
        log.info("get请求跳转地址: " + req.getURI());
        boolean formLogin = HttpConstant.POST.equalsIgnoreCase(attributes.getScheme());
        if (formLogin && attributes.getErrorUrl().equalsIgnoreCase(req.getURI().toString())) {
            // Both intermediate responses are discarded; consume them so their pooled
            // connections are released instead of leaking.
            release(response);
            release(httpPost(httpclient, attributes.getLoginUrl(), getPairs(attributes.getParams())));
            response = httpGet(httpclient, url, context);
        }
        log.info("get status: " + response.getStatusLine());
        return response;
    }

    /**
     * Submits a form with a POST request; the form is encoded as GBK.
     *
     * @param httpclient client used to send the request
     * @param url        address to post to
     * @param params     form fields
     * @return the response, never {@code null}
     * @throws IllegalStateException if the request fails with an I/O or protocol error
     */
    public static HttpResponse httpPost(HttpClient httpclient, String url, List<NameValuePair> params) {
        HttpPost httpost = new HttpPost(url);
        httpost.setEntity(new UrlEncodedFormEntity(params, Charset.forName(HttpConstant.GBK)));
        HttpResponse response = execute(httpclient, httpost, null);
        log.info("post status: " + response.getStatusLine());
        return response;
    }

    /**
     * Returns the target host stored in the execution context.
     *
     * @param context execution context of a request
     * @return value of the {@code HTTP_TARGET_HOST} attribute
     */
    public static HttpHost getHttpHost(HttpContext context) {
        return (HttpHost) context.getAttribute(ExecutionContext.HTTP_TARGET_HOST);
    }

    /**
     * Returns the request stored in the execution context.
     *
     * @param context execution context of a request
     * @return value of the {@code HTTP_REQUEST} attribute
     */
    public static HttpUriRequest getHttpUriRequest(HttpContext context) {
        return (HttpUriRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST);
    }

    /**
     * Converts a map into a list of form fields.
     *
     * @param params field names and values; {@code null} yields an empty list
     * @return one name/value pair per map entry, using {@code toString()} of key and value
     */
    public static List<NameValuePair> getPairs(Map<?, ?> params) {
        List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>();
        if (params != null) {
            for (Map.Entry<?, ?> entry : params.entrySet()) {
                nameValuePairs.add(new BasicNameValuePair(entry.getKey().toString(), entry.getValue().toString()));
            }
        }
        return nameValuePairs;
    }

    /**
     * Reads the response body as text and releases the response.
     *
     * <p>The charset comes from the response's content type, falling back to GBK when the
     * content type names none. Lines are concatenated without their line terminators. If
     * reading fails midway, the error is logged and the text read so far is returned.
     *
     * @param response response to read
     * @return the body text; empty when the response has no body
     */
    public static String entityToString(HttpResponse response) {
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return "";
        }
        ContentType contentType = ContentType.getOrDefault(entity);
        Charset charset = contentType.getCharset();
        if (charset == null) {
            charset = Charset.forName(HttpConstant.GBK);
        }
        StringBuilder sb = new StringBuilder();
        InputStream is = null;
        try {
            is = entity.getContent();
            if (is != null) {
                BufferedReader br = new BufferedReader(new InputStreamReader(is, charset));
                String line = null;
                while ((line = br.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (IOException e) {
            log.error("failed to read response body", e);
        } finally {
            try {
                if (is != null) {
                    is.close();
                }
                EntityUtils.consume(entity);
            } catch (IOException e) {
                log.warn("failed to release response", e);
            }
        }
        return sb.toString();
    }

    /**
     * Creates a parser by reflection, using its public constructor that takes one {@code String}.
     *
     * @param packPath  package of the parser class
     * @param parseBean simple name of the parser class
     * @param text      response text handed to the parser's constructor
     * @return the new parser
     * @throws BusinessException if the class cannot be loaded or instantiated
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public static IParse<String> newInstance(String packPath, String parseBean, String text) throws BusinessException {
        IParse<String> parse = null;
        try {
            Class clazz = Class.forName(packPath + "." + parseBean);
            Constructor constructor = clazz.getConstructor(String.class);
            parse = (IParse) constructor.newInstance(text);
        } catch (Exception e) {
            throw new BusinessException("网页解析类初始化错误 " + e.getMessage(), e);
        }
        return parse;
    }

    /**
     * Splits the custom {@code m_} parameters off the url stored under {@link HttpConstant#URL}.
     *
     * <p>Every query parameter whose name starts with {@code m_} is removed from the url and
     * put into {@code params} (name to raw value, empty string when it has no value). The
     * remaining query is URL-encoded as UTF-8, keeping {@code =} and {@code &} as separators.
     *
     * @param params request parameters; must contain {@link HttpConstant#URL}; receives the
     *               extracted {@code m_} parameters
     * @return the url to request, without the {@code m_} parameters
     */
    public static String initParams(Map<String, Object> params) {
        String url = params.get(HttpConstant.URL).toString();
        int index = url.indexOf('?');
        if (index == -1) {
            return url;
        }
        String urlPath = url.substring(0, index + 1);
        String[] pairs = url.substring(index + 1).split("&");
        // Rebuild the query from the parameters that stay. This also drops an m_ parameter
        // that is first in the query, and avoids treating parameter text as a regex.
        StringBuilder remaining = new StringBuilder();
        for (int i = 0; i < pairs.length; i++) {
            String pair = pairs[i];
            if (pair.startsWith("m_")) {
                // Limit 2 keeps '=' characters inside the value.
                String[] nameAndValue = pair.split("=", 2);
                params.put(nameAndValue[0], nameAndValue.length > 1 ? nameAndValue[1] : "");
                continue;
            }
            if (remaining.length() > 0) {
                remaining.append('&');
            }
            remaining.append(pair);
        }
        String paramStr = urlEncoder(remaining.toString());
        paramStr = paramStr.replace("%3D", "=").replace("%26", "&");
        return urlPath + paramStr;
    }

    /**
     * URL-encodes a string as UTF-8.
     *
     * @param paramStr text to encode
     * @return the encoded text, or the input unchanged if the encoding is unsupported
     */
    public static String urlEncoder(String paramStr) {
        try {
            paramStr = URLEncoder.encode(paramStr, HttpConstant.UTF8);
        } catch (UnsupportedEncodingException e) {
            log.error("url编码错误", e);
        }
        return paramStr;
    }

    /**
     * Executes a request; on failure aborts it and reports the cause instead of returning null.
     */
    private static HttpResponse execute(HttpClient httpclient, HttpUriRequest request, HttpContext context) {
        try {
            return context == null ? httpclient.execute(request) : httpclient.execute(request, context);
        } catch (IOException e) {
            request.abort();
            throw new IllegalStateException(request.getMethod() + " request failed: " + request.getURI(), e);
        }
    }

    /**
     * Consumes the body of a response that is not going to be read, releasing its connection.
     */
    private static void release(HttpResponse response) {
        if (response == null) {
            return;
        }
        try {
            EntityUtils.consume(response.getEntity());
        } catch (IOException e) {
            log.warn("failed to release response", e);
        }
    }
}

import java.io.IOException;

import jcifs.ntlmssp.NtlmFlags;
import jcifs.ntlmssp.Type1Message;
import jcifs.ntlmssp.Type2Message;
import jcifs.ntlmssp.Type3Message;
import jcifs.util.Base64;

import org.apache.http.impl.auth.NTLMEngine;
import org.apache.http.impl.auth.NTLMEngineException;

/**
*
* 类功能描述:JCIFS实现NTLM windows域验证
*
* @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
* @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $
* Create: 2014-2-26 下午01:55:31
*/
public final class JCIFSEngine implements NTLMEngine {

private static final int TYPE_1_FLAGS = NtlmFlags.NTLMSSP_NEGOTIATE_56 | NtlmFlags.NTLMSSP_NEGOTIATE_128 | NtlmFlags.NTLMSSP_NEGOTIATE_NTLM2
| NtlmFlags.NTLMSSP_NEGOTIATE_ALWAYS_SIGN | NtlmFlags.NTLMSSP_REQUEST_TARGET;

public String generateType1Msg(final String domain, final String workstation) throws NTLMEngineException {
final Type1Message type1Message = new Type1Message(TYPE_1_FLAGS, domain, workstation);
return Base64.encode(type1Message.toByteArray());
}

public String generateType3Msg(final String username, final String password, final String domain, final String workstation, final String challenge)
throws NTLMEngineException {
Type2Message type2Message;
try {
type2Message = new Type2Message(Base64.decode(challenge));
} catch (final IOException exception) {
throw new NTLMEngineException("Invalid NTLM type 2 message", exception);
}
final int type2Flags = type2Message.getFlags();
final int type3Flags = type2Flags & (0xffffffff ^ (NtlmFlags.NTLMSSP_TARGET_TYPE_DOMAIN | NtlmFlags.NTLMSSP_TARGET_TYPE_SERVER));
final Type3Message type3Message = new Type3Message(type2Message, password, domain, username, workstation, type3Flags);
return Base64.encode(type3Message.toByteArray());
}

}

import org.apache.http.auth.AuthScheme;
import org.apache.http.auth.AuthSchemeFactory;
import org.apache.http.impl.auth.NTLMScheme;
import org.apache.http.params.HttpParams;

/**
*
* 类功能描述:NTLM windows域验证
*
* @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
* @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $
* Create: 2014-2-26 下午01:54:40
*/
public class NTLMSchemeFactory implements AuthSchemeFactory {

public AuthScheme newInstance(final HttpParams params) {
return new NTLMScheme(new JCIFSEngine());
}

}

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.NTCredentials;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.auth.params.AuthPNames;
import org.apache.http.client.HttpClient;
import org.apache.http.client.params.AuthPolicy;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.HttpParams;

import com.berheley.bi.grp.fetch.ntlm.NTLMSchemeFactory;
import com.berheley.bi.grp.fetch.pojo.HttpAttributes;
import com.berheley.bi.grp.fetch.util.HttpConstant;

/**
 * Creates and holds the HTTP clients used to log in to and fetch from the configured sites.
 *
 * <p>Intended to be configured as a bean: set the properties, then call {@link #init()}
 * (e.g. as the bean's init-method), which applies the settings and creates one client per site.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $
 * Created: 2014-02-26 13:49:45
 */
public class HttpHandler {

    private Log log = LogFactory.getLog(HttpHandler.class);

    // Upper limit of connections in the pool
    private int maxTotal = 400;

    // Upper limit of concurrent connections per route (target server)
    private int maxRoute = 200;

    // Connect timeout in milliseconds
    private int cnTimeOut = 60000;

    // Socket (data transfer) timeout in milliseconds
    private int soTimeOut = 60000;

    // Client shared by all sites that use the form-POST login scheme
    private HttpClient httpClient;

    // Connection parameters
    private HttpParams httpParams;

    // Connection manager shared by all clients
    private ClientConnectionManager connectionManager;

    // key: host as it appears in the request url, value: settings of that site
    private Map<String, HttpAttributes> attributes;

    /**
     * Creates the handler with default settings. Properties set afterwards take effect
     * when {@link #init()} is called.
     */
    public HttpHandler() {
        httpParams = this.getHp();
        connectionManager = this.getCm();
        httpClient = new DefaultHttpClient(connectionManager, httpParams);
    }

    public int getMaxTotal() {
        return maxTotal;
    }

    public void setMaxTotal(int maxTotal) {
        this.maxTotal = maxTotal;
    }

    public int getMaxRoute() {
        return maxRoute;
    }

    public void setMaxRoute(int maxRoute) {
        this.maxRoute = maxRoute;
    }

    public int getCnTimeOut() {
        return cnTimeOut;
    }

    public void setCnTimeOut(int cnTimeOut) {
        this.cnTimeOut = cnTimeOut;
    }

    public int getSoTimeOut() {
        return soTimeOut;
    }

    public void setSoTimeOut(int soTimeOut) {
        this.soTimeOut = soTimeOut;
    }

    public HttpParams getHttpParams() {
        return httpParams;
    }

    public void setHttpParams(HttpParams httpParams) {
        this.httpParams = httpParams;
    }

    public ClientConnectionManager getConnectionManager() {
        return connectionManager;
    }

    public void setConnectionManager(ClientConnectionManager connectionManager) {
        this.connectionManager = connectionManager;
    }

    public Map<String, HttpAttributes> getAttributes() {
        return attributes;
    }

    public void setAttributes(Map<String, HttpAttributes> attributes) {
        this.attributes = attributes;
    }

    /**
     * Applies the configured settings and creates the client of every configured site.
     *
     * <p>NTLM and BASIC sites each get their own client carrying the site's credentials;
     * form-POST sites share one client. A site with any other scheme gets no client and a
     * warning is logged.
     */
    public void init() {
        // The constructor ran before the properties were injected, so the timeouts and pool
        // limits have to be applied (again) here.
        applySettings();
        this.httpClient = newClient();
        if (attributes == null) {
            log.warn("no site attributes configured");
            return;
        }
        for (Entry<String, HttpAttributes> entry : attributes.entrySet()) {
            HttpAttributes site = entry.getValue();
            String scheme = site.getScheme();
            if (AuthPolicy.NTLM.equalsIgnoreCase(scheme)) {
                DefaultHttpClient client = newClient();
                List<String> authpref = new ArrayList<String>();
                authpref.add(AuthPolicy.NTLM);
                // Written to this client's own parameters only (see newClient()).
                client.getParams().setParameter(AuthPNames.TARGET_AUTH_PREF, authpref);
                client.getAuthSchemes().register(AuthPolicy.NTLM, new NTLMSchemeFactory());
                NTCredentials creds = new NTCredentials(site.getParams().get("username"), site.getParams().get("password"), "", "");
                client.getCredentialsProvider().setCredentials(AuthScope.ANY, creds);
                site.setHttpClient(client);
            } else if (AuthPolicy.BASIC.equalsIgnoreCase(scheme)) {
                DefaultHttpClient client = newClient();
                client.getCredentialsProvider().setCredentials(new AuthScope(site.getDomain(), site.getPort()),
                        new UsernamePasswordCredentials(site.getParams().get("username"), site.getParams().get("password")));
                site.setHttpClient(client);
            } else if (HttpConstant.POST.equalsIgnoreCase(scheme)) {
                site.setHttpClient(this.httpClient);
            } else {
                log.warn("unsupported scheme '" + scheme + "' for site " + entry.getKey() + ", no client created");
            }
        }
        log.info("初始化 HttpClient");
    }

    /**
     * Writes the timeout and pool-size properties into the connection parameters and the pool.
     */
    private void applySettings() {
        httpParams.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, cnTimeOut);
        httpParams.setParameter(CoreConnectionPNames.SO_TIMEOUT, soTimeOut);
        if (connectionManager instanceof PoolingClientConnectionManager) {
            PoolingClientConnectionManager pool = (PoolingClientConnectionManager) connectionManager;
            pool.setMaxTotal(maxTotal);
            pool.setDefaultMaxPerRoute(maxRoute);
        }
    }

    /**
     * Creates a client on the shared connection manager. The client reads the shared
     * connection parameters as defaults but stores its own parameter changes locally, so a
     * setting made for one site (such as the NTLM auth preference) does not affect the others.
     */
    private DefaultHttpClient newClient() {
        HttpParams clientParams = new org.apache.http.params.DefaultedHttpParams(new BasicHttpParams(), httpParams);
        return new DefaultHttpClient(connectionManager, clientParams);
    }

    /**
     * Builds the connection parameters from the current timeout properties.
     *
     * @return new connection parameters
     */
    private HttpParams getHp() {
        HttpParams params = new BasicHttpParams();
        params.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, cnTimeOut);
        params.setParameter(CoreConnectionPNames.SO_TIMEOUT, soTimeOut);
        return params;
    }

    /**
     * Builds the pooling connection manager from the current pool-size properties.
     *
     * @return new connection manager for http (port 80) and https (port 443)
     */
    private ClientConnectionManager getCm() {
        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
        // https needs its default port 443 and an SSL socket factory.
        schemeRegistry.register(new Scheme("https", 443, org.apache.http.conn.ssl.SSLSocketFactory.getSocketFactory()));
        PoolingClientConnectionManager cm = new PoolingClientConnectionManager(schemeRegistry);
        cm.setMaxTotal(maxTotal);
        cm.setDefaultMaxPerRoute(maxRoute);
        return cm;
    }

    /**
     * Looks up the site settings for a url.
     *
     * <p>The lookup key is the part of the url between {@code ://} and the first {@code /},
     * {@code ?} or {@code #} (the whole remainder when none follows), i.e. the host including
     * the port when the url carries one.
     *
     * @param url url about to be requested
     * @return the settings of the matching site, or {@code null} when none is configured
     */
    public HttpAttributes getHttpAttributes(String url) {
        if (url == null || attributes == null) {
            return null;
        }
        int schemeEnd = url.indexOf("://");
        String authority = schemeEnd == -1 ? url : url.substring(schemeEnd + 3);
        int end = authority.length();
        char[] terminators = { '/', '?', '#' };
        for (int i = 0; i < terminators.length; i++) {
            int position = authority.indexOf(terminators[i]);
            if (position != -1 && position < end) {
                end = position;
            }
        }
        return attributes.get(authority.substring(0, end));
    }
}

import java.util.Map;

import com.berheley.bi.basic.exp.BusinessException;

/**
 * Service that fetches data from a remote site and parses it.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $
 * Created: 2014-02-26 13:56:24
 */
public interface IFetchService {

    /**
     * Fetches a page and parses it.
     *
     * @param params request parameters. The key {@code m_url} is mandatory and holds the full
     *               url of the request including its query; that query must in turn contain
     *               the parameter {@code m_parse}
     * @return the parse result
     * @throws BusinessException if fetching or parsing fails
     */
    String findDate(Map<String, Object> params) throws BusinessException;
}

import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpResponse;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import com.berheley.bi.basic.exp.BusinessException;
import com.berheley.bi.grp.fetch.handler.HttpHandler;
import com.berheley.bi.grp.fetch.parse.IParse;
import com.berheley.bi.grp.fetch.pojo.HttpAttributes;
import com.berheley.bi.grp.fetch.util.HttpConstant;
import com.berheley.bi.grp.fetch.util.HttpUtils;

/**
 * Default implementation of the fetch-and-parse service.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $
 * Created: 2014-02-26 13:56:57
 */
@Service
public class FetchService implements IFetchService {

    private Log log = LogFactory.getLog(FetchService.class);

    @Autowired
    private HttpHandler httpHandler;

    /**
     * Requests the url stored under {@code m_url}, then hands the response text to the parser
     * class named by {@code m_parse} and returns what that parser produces.
     *
     * @param params request parameters; must contain {@code m_url}, whose query must contain
     *               {@code m_parse}
     * @return the parse result
     * @throws BusinessException if no site is configured for the url, the site has no client,
     *                           {@code m_parse} is missing, or the parser cannot be created
     */
    @Override
    public String findDate(Map<String, Object> params) throws BusinessException {
        String url = HttpUtils.initParams(params);
        HttpAttributes attributes = httpHandler.getHttpAttributes(url);
        if (attributes == null) {
            throw invalidRequest("No site configuration found for url: " + url);
        }
        if (attributes.getHttpClient() == null) {
            throw invalidRequest("No http client initialised for url: " + url);
        }
        Object parseName = params.get(HttpConstant.PARSE);
        if (parseName == null) {
            throw invalidRequest("Missing parameter " + HttpConstant.PARSE + " in url: " + url);
        }
        HttpContext localContext = new BasicHttpContext();
        // The client is used through its interface; no downcast to a concrete class is needed.
        HttpResponse response = HttpUtils.httpGetByScheme(attributes.getHttpClient(), url, localContext, attributes);
        String result = HttpUtils.entityToString(response);
        IParse<String> parse = HttpUtils.newInstance(attributes.getPackPath(), parseName.toString(), result);
        String json = parse.process(params);
        log.info(json);
        return json;
    }

    /**
     * Builds the exception reported for an unusable request or configuration.
     */
    private BusinessException invalidRequest(String message) {
        // NOTE(review): only the (String, Throwable) constructor of BusinessException is
        // visible in this code base, hence the wrapped cause.
        return new BusinessException(message, new IllegalArgumentException(message));
    }

}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值