爬取互动百科数据
package com.companyName.ott.core.fetch.service.impl;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import org.apache.commons.lang.StringUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import com.companyName.ott.common.util.PropertiesUtil;
import com.companyName.ott.core.exception.ErrorCode;
import com.companyName.ott.core.exception.FetchException;
import com.companyName.ott.core.fetch.network.FetchNetwork;
import com.companyName.ott.core.fetch.service.InfoFetcher;
import com.companyName.ott.core.fetch.vo.InfoResp;
/**
 * Information fetcher/parser implementation backed by the Hudong Baike
 * dictionary HTTP API.
 * <p>
 * {@link #fetchInfoByKeyWord(String)} downloads the raw XML for a keyword and
 * {@link #parseResp(String)} converts that XML into an {@link InfoResp}.
 *
 * @version 0.0.1
 * @since jdk1.5
 * @author cgh
 * @copyright © 2012, companyNamefly Corporation. All rights reserved.
 *
 */
@Service("hudonInfoFetchServiceImpl")
public class HuDonInfoFetchServiceImpl extends AbstractInfoFetchService {

    private static final Logger logger = LoggerFactory
            .getLogger(HuDonInfoFetchServiceImpl.class);

    // NOTE(review): the base URL (including the appkey) is hard-coded; the
    // original code carried a commented-out lookup of the
    // "hudon.fetch.baseUrl" property via PropertiesUtil. Confirm whether the
    // value should come from configuration again.
    private static final String DEFAULT_BASE_URL =
            "http://api.hudong.com/dict.do?from=aidufei&appkey=e5ec12b9d&type=62";

    // Query parameter that carries the (already URL-encoded) keyword.
    private static final String TITLE_PARAM = "title=";

    // Connect and read timeout, in milliseconds, passed to FetchNetwork.
    private static final int TIMEOUT_MILLIS = 3000;

    /**
     * Fetches the raw XML response for the given keyword.
     *
     * @param keyWord
     *            keyword to look up; must not be blank
     * @return the response content returned by {@link FetchNetwork}
     * @throws FetchException
     *             with {@code PARAMETER_EMPTY} when the keyword is blank,
     *             {@code PARAMETER_INVALID} when it cannot be URL-encoded, or
     *             whatever {@link FetchNetwork} reports for the request itself
     */
    @Override
    public String fetchInfoByKeyWord(String keyWord) throws FetchException {
        // 1. Validate the argument.
        if (StringUtils.isBlank(keyWord)) {
            throw new FetchException(ErrorCode.PARAMETER_EMPTY,
                    "keyWord is empty");
        }
        String title;
        try {
            title = URLEncoder.encode(keyWord, "utf-8");
        } catch (UnsupportedEncodingException e) {
            throw new FetchException(ErrorCode.PARAMETER_INVALID, e);
        }

        // 2. Build the request URL.
        String fetchUrl = parseUrl(title);
        logger.debug("Hudon info fetch url:{}", fetchUrl);

        // 3. Fetch the data.
        String xmlContent = FetchNetwork.postRequest(fetchUrl, null, null,
                "xml", TIMEOUT_MILLIS, TIMEOUT_MILLIS);
        logger.debug("Hudon response content:{}", xmlContent);
        return xmlContent;
    }

    /**
     * Parses the XML returned by the Hudong API.
     * <p>
     * The document is read as follows:
     * <ol>
     * <li>{@code /response/isexist} must be present; the value {@code 1}
     * means an entry exists.</li>
     * <li>{@code /response/title} is the entry title.</li>
     * <li>{@code /response/summary} is the entry description.</li>
     * </ol>
     * Title and summary are only read when the entry exists.
     *
     * @param rtnStr
     *            raw XML content
     * @return the parsed response
     * @throws FetchException
     *             with {@code RESPONSE_DATA_INVALID} when the content is
     *             blank, is not well-formed XML, or lacks the
     *             {@code isexist} element
     */
    @Override
    public InfoResp parseResp(String rtnStr) throws FetchException {
        // A null input used to surface as a NullPointerException from
        // StringReader; report it as invalid response data instead.
        if (StringUtils.isBlank(rtnStr)) {
            throw new FetchException(ErrorCode.RESPONSE_DATA_INVALID,
                    "response content is empty");
        }

        Document doc;
        try {
            doc = newHardenedReader().read(new StringReader(rtnStr));
        } catch (DocumentException e) {
            throw new FetchException(ErrorCode.RESPONSE_DATA_INVALID, e);
        }

        Node existNode = doc.selectSingleNode("/response/isexist");
        if (existNode == null) {
            throw new FetchException(ErrorCode.RESPONSE_DATA_INVALID,
                    "返回内容格式不正确:" + rtnStr);
        }

        InfoResp resp = new InfoResp();
        if (!"1".equals(existNode.getStringValue().trim())) {
            resp.setExist(false);
            return resp;
        }

        resp.setExist(true);
        Node titleNode = doc.selectSingleNode("/response/title");
        if (titleNode != null) {
            resp.setTitle(titleNode.getStringValue());
        }
        Node summaryNode = doc.selectSingleNode("/response/summary");
        if (summaryNode != null) {
            resp.setSummary(summaryNode.getStringValue());
        }
        return resp;
    }

    /**
     * Creates a SAX reader that does not resolve external entities or the
     * external DTD subset, because the XML comes from a remote server.
     *
     * @return a configured reader
     * @throws FetchException
     *             when the underlying parser rejects the hardening features;
     *             reported as {@code RESPONSE_DATA_INVALID} because no more
     *             specific error code exists
     */
    private SAXReader newHardenedReader() throws FetchException {
        SAXReader reader = new SAXReader();
        try {
            reader.setFeature(
                    "http://xml.org/sax/features/external-general-entities",
                    false);
            reader.setFeature(
                    "http://xml.org/sax/features/external-parameter-entities",
                    false);
        } catch (org.xml.sax.SAXException e) {
            // Fail closed rather than parse remote XML with an unhardened parser.
            throw new FetchException(ErrorCode.RESPONSE_DATA_INVALID, e);
        }
        return reader;
    }

    /**
     * Builds the request URL by adding the encoded keyword to the base URL.
     * <p>
     * When the base URL already contains {@code title=}, the keyword is
     * inserted right after it; otherwise a {@code title} parameter is
     * appended with the appropriate {@code ?} or {@code &} separator.
     *
     * @param title
     *            URL-encoded keyword
     * @return the complete request URL
     * @throws FetchException
     *             with {@code PARAMETER_EMPTY} when the base URL is blank
     */
    private String parseUrl(String title) throws FetchException {
        String baseUrl = DEFAULT_BASE_URL;
        // Kept so the check still applies if the base URL is read from
        // configuration again.
        if (StringUtils.isBlank(baseUrl)) {
            throw new FetchException(ErrorCode.PARAMETER_EMPTY,
                    "sysconfig.properties文件中hudon.fetch.baseUrl参数未配置或为空");
        }

        int titleAt = baseUrl.indexOf(TITLE_PARAM);
        if (titleAt > -1) {
            // The original used "+=" here, which returned the base URL
            // followed by the rebuilt URL; the rebuilt URL alone is intended.
            int valueAt = titleAt + TITLE_PARAM.length();
            return baseUrl.substring(0, valueAt) + title
                    + baseUrl.substring(valueAt);
        }

        StringBuilder url = new StringBuilder(baseUrl);
        if (baseUrl.indexOf('?') < 0) {
            url.append('?');
        } else if (!baseUrl.endsWith("&") && !baseUrl.endsWith("?")) {
            url.append('&');
        }
        return url.append(TITLE_PARAM).append(title).toString();
    }

    /**
     * Manual smoke test: fetches one keyword and prints the raw response.
     *
     * @param args
     *            unused
     * @throws FetchException
     *             when the fetch fails
     */
    public static void main(String[] args) throws FetchException {
        InfoFetcher fetchService = new HuDonInfoFetchServiceImpl();
        System.out.println(fetchService.fetchInfoByKeyWord("钓鱼岛"));
    }
}
package com.companyName.ott.core.fetch.service.impl;
import com.companyName.ott.core.exception.FetchException;
import com.companyName.ott.core.fetch.service.InfoFetchService;
import com.companyName.ott.core.fetch.vo.InfoResp;
/**
 * Base class for information services: it wires the fetch step and the parse
 * step of {@link InfoFetchService} together and leaves both steps to
 * subclasses.
 *
 * @version 0.0.1
 * @since jdk1.5
 * @author cgh
 * @copyright © 2012, companyNamefly Corporation. All rights reserved.
 *
 */
public abstract class AbstractInfoFetchService implements InfoFetchService {

    /**
     * Fetches the raw content for a keyword and parses it.
     *
     * @param keyWord
     *            keyword handed to {@code fetchInfoByKeyWord}
     * @return the result of {@code parseResp} applied to the fetched content
     * @throws FetchException
     *             propagated from either the fetch or the parse step
     */
    @Override
    public InfoResp fetchInfo(String keyWord) throws FetchException {
        // Fetch first, then hand the raw content straight to the parser.
        return parseResp(fetchInfoByKeyWord(keyWord));
    }
}
//http://api.hudong.com/dict.do?from=aidufei&appkey=e5ec12b9d&type=62&title=%E9%92%93%E9%B1%BC%E5%B2%9B
package com.companyName.ott.core.exception;
/**
 * Checked exception raised by the fetch layer; carries a numeric error code
 * in addition to the usual message or cause.
 *
 * @version 3.0.0
 * @since jdk1.5
 * @copyright © 2012, companyName Corporation. All rights reserved.
 *
 */
public class FetchException extends Exception {

    /** Serialization version UID. */
    private static final long serialVersionUID = 8243127099991355146L;

    /** Numeric error code supplied at construction time. */
    private int code;

    /**
     * Creates an exception described by a message.
     *
     * @param code
     *            error code
     * @param msg
     *            detail message
     */
    public FetchException(int code, String msg) {
        super(msg);
        this.code = code;
    }

    /**
     * Creates an exception that wraps another exception as its cause.
     *
     * @param code
     *            error code
     * @param ex
     *            the exception that caused this one
     */
    public FetchException(int code, Exception ex) {
        super(ex);
        this.code = code;
    }

    /**
     * Returns the error code given to the constructor.
     *
     * @return the error code
     */
    public int getErrorCode() {
        return this.code;
    }
}
package com.companyName.ott.core.fetch.service;
import com.companyName.ott.core.exception.FetchException;
import com.companyName.ott.core.fetch.vo.InfoResp;
/**
 * Top-level interface for information fetching and parsing; it combines
 * {@link InfoFetcher} and {@link InfoParser} and adds a single entry point
 * that goes from keyword to {@link InfoResp}.
 *
 * @version 0.0.1
 * @since jdk1.5
 * @author cgh
 * @copyright © 2012, companyNamefly Corporation. All rights reserved.
 *
 */
public interface InfoFetchService extends InfoFetcher, InfoParser {
/**
 * Fetches information for a keyword.
 *
 * @param keyWord
 *            the keyword to look up
 * @return the information found for the keyword
 * @throws FetchException
 *             when fetching or parsing fails
 */
InfoResp fetchInfo(String keyWord) throws FetchException;
}
package com.companyName.ott.core.fetch.service;
import com.companyName.ott.core.exception.FetchException;
/**
 * Information fetching interface.
 *
 * @version 0.0.1
 * @since jdk1.5
 * @author cgh
 * @copyright © 2012, companyNamefly Corporation. All rights reserved.
 *
 */
public interface InfoFetcher {
/**
 * Fetches network content for a keyword.
 *
 * @param keyWord
 *            the keyword to look up
 * @return the fetched content as a string
 * @throws FetchException
 *             when the fetch fails
 */
String fetchInfoByKeyWord(String keyWord) throws FetchException;
}
package com.companyName.ott.core.fetch.service;
import com.companyName.ott.core.exception.FetchException;
import com.companyName.ott.core.fetch.vo.InfoResp;
/**
 * Information parsing interface.
 *
 * @version 0.0.1
 * @since jdk1.5
 * @author cgh
 * @copyright © 2012, companyNamefly Corporation. All rights reserved.
 *
 */
public interface InfoParser {
/**
 * Parses the content returned by a fetch (xml/json/html); the actual
 * parsing is left to the implementing business class.
 *
 * @param rtnStr
 *            the string content returned by the fetch
 * @return the parsed response
 * @throws FetchException
 *             when the content cannot be parsed
 */
InfoResp parseResp(String rtnStr) throws FetchException;
}
package com.companyName.ott.core.fetch.vo;
import java.io.Serializable;
/**
 * Value object holding the result of an information lookup: whether an entry
 * exists and, if so, its title and summary.
 */
public class InfoResp implements Serializable {

    /** Serialization version UID. */
    private static final long serialVersionUID = -4860412938020873566L;

    /** The search term the request was made for. */
    private String title;

    /** Whether an entry exists. */
    private boolean isExist;

    /** Description text. */
    private String summary;

    /** @return the title, or {@code null} when none was set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return {@code true} when an entry exists */
    public boolean isExist() {
        return this.isExist;
    }

    /** @param isExist whether an entry exists */
    public void setExist(boolean isExist) {
        this.isExist = isExist;
    }

    /** @return the summary, or {@code null} when none was set */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the summary to store */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /**
     * Renders all three fields, one per line, between square brackets.
     *
     * @return a multi-line, human-readable dump of this object
     */
    @Override
    public String toString() {
        return "[\n"
                + " isExist:" + this.isExist + "\n"
                + " title:" + this.title + "\n"
                + " summary:" + this.summary + "\n"
                + "]";
    }
}
package com.companyName.ott.core.exception;
/**
 * Error codes used by the fetch layer.
 *
 * @version 3.0.0
 * @since jdk1.5
 * @author companyNamefly
 * @copyright © 2012, companyNamefly Corporation. All rights reserved.
 *
 */
public class ErrorCode {

    // NOTE(review): this class does not implement Serializable, so this UID
    // has no effect; kept unchanged.
    private static final long serialVersionUID = -1679458253208555786L;

    /** A parameter has the wrong type. */
    public static final int PARAMETER_TYPE_ERROR = 2000;

    /** A required parameter is empty. */
    public static final int PARAMETER_EMPTY = 2001;

    /** A required parameter is invalid. */
    public static final int PARAMETER_INVALID = 2002;

    /** The data in the server response is invalid. */
    public static final int RESPONSE_DATA_INVALID = 2003;

    /** Network error. */
    public static final int NETWORK_ERROR = 3000;
}
package com.companyName.ott.core.fetch.network;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Iterator;
import java.util.Map;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import com.companyName.ott.core.exception.ErrorCode;
import com.companyName.ott.core.exception.FetchException;
/**
 * Sends HTTP POST requests and returns the response body as a string.
 *
 * @version 0.0.1
 * @since jdk1.5
 * @author cgh
 * @copyright © 2012, companyNamefly Corporation. All rights reserved.
 *
 */
public class FetchNetwork {

    // Charset used for the request content and for decoding the response.
    private static final String CONTENT_CHARSET = "UTF-8";

    // Default connection timeout in milliseconds.
    private static final int CONNECTION_TIMEOUT = 3000;

    // Default read (socket) timeout in milliseconds.
    private static final int READ_DATA_TIMEOUT = 3000;

    /**
     * Sends a POST request with the default timeouts (3 seconds to connect,
     * 3 seconds to read).
     *
     * @param url
     *            request URL
     * @param params
     *            request parameters; may be {@code null} or empty
     * @param cookies
     *            cookies to send; may be {@code null} or empty
     * @param protocol
     *            {@code "xml"} to read the response line by line, anything
     *            else to read the raw body
     * @return the response content
     * @throws FetchException
     *             with {@code NETWORK_ERROR} when the request fails or the
     *             status is not 200
     */
    public static String postRequest(String url, Map<String, String> params,
            Map<String, String> cookies, String protocol)
            throws FetchException {
        return postRequest(url, params, cookies, protocol, CONNECTION_TIMEOUT,
                READ_DATA_TIMEOUT);
    }

    /**
     * Sends a POST request.
     *
     * @param url
     *            request URL
     * @param params
     *            request parameters; may be {@code null} or empty
     * @param cookies
     *            cookies to send; may be {@code null} or empty
     * @param protocol
     *            {@code "xml"} to read the response line by line (line
     *            terminators are dropped), anything else to read the raw body
     * @param connTimeOut
     *            connection timeout in milliseconds
     * @param readTimeOut
     *            read timeout in milliseconds
     * @return the response content decoded as UTF-8; an empty string when the
     *         server sent no body
     * @throws FetchException
     *             with {@code NETWORK_ERROR} when the request fails or the
     *             status is not 200; for I/O failures the original exception
     *             is attached as the cause
     */
    public static String postRequest(String url, Map<String, String> params,
            Map<String, String> cookies, String protocol, int connTimeOut,
            int readTimeOut) throws FetchException {
        HttpClient httpClient = new HttpClient();
        PostMethod postMethod = new PostMethod(url);

        // Request parameters.
        if (params != null && !params.isEmpty()) {
            NameValuePair[] data = new NameValuePair[params.size()];
            int i = 0;
            for (Map.Entry<String, String> entry : params.entrySet()) {
                data[i++] = new NameValuePair(entry.getKey(),
                        entry.getValue());
            }
            postMethod.setRequestBody(data);
        }

        // Cookies are sent as a hand-built header, so HttpClient's own cookie
        // handling is switched off for this request.
        if (cookies != null && !cookies.isEmpty()) {
            StringBuilder buffer = new StringBuilder(128);
            for (Map.Entry<String, String> entry : cookies.entrySet()) {
                buffer.append(entry.getKey()).append("=")
                        .append(entry.getValue()).append("; ");
            }
            postMethod.getParams().setCookiePolicy(
                    CookiePolicy.IGNORE_COOKIES);
            postMethod.setRequestHeader("Cookie", buffer.toString());
        }

        postMethod.setRequestHeader("User-Agent", "Java Fetch SDK Client");

        // Connection and read timeouts.
        httpClient.getHttpConnectionManager().getParams()
                .setConnectionTimeout(connTimeOut);
        httpClient.getHttpConnectionManager().getParams()
                .setSoTimeout(readTimeOut);

        // Content charset.
        postMethod.getParams().setParameter(
                HttpMethodParams.HTTP_CONTENT_CHARSET, CONTENT_CHARSET);
        // Use the library's default retry handler.
        postMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());

        try {
            int statusCode = httpClient.executeMethod(postMethod);
            if (statusCode != HttpStatus.SC_OK) {
                throw new FetchException(ErrorCode.NETWORK_ERROR, "Request ["
                        + url + "] failed:" + postMethod.getStatusLine());
            }
            if ("xml".equals(protocol)) {
                return readLinesJoined(postMethod.getResponseBodyAsStream());
            }
            byte[] responseBody = postMethod.getResponseBody();
            return responseBody == null ? "" : new String(responseBody,
                    CONTENT_CHARSET);
        } catch (IOException e) {
            // Covers HttpException too (an IOException subclass in Commons
            // HttpClient 3.x); both were reported identically before.
            throw networkFailure(url, e);
        } finally {
            // Release the connection.
            postMethod.releaseConnection();
        }
    }

    /**
     * Reads the stream as UTF-8 text and concatenates its lines without line
     * terminators.
     * <p>
     * Decoding with an explicit charset replaces the earlier
     * platform-charset decode / re-encode / UTF-8 decode round trip, which
     * corrupted non-ASCII text whenever the platform default was not UTF-8.
     *
     * @param body
     *            response stream; may be {@code null} when there is no body
     * @return the joined lines, or an empty string for a {@code null} stream
     * @throws IOException
     *             when reading fails
     */
    private static String readLinesJoined(InputStream body)
            throws IOException {
        if (body == null) {
            return "";
        }
        // Not closed here: the caller releases the connection, which owns
        // the underlying stream.
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                body, CONTENT_CHARSET));
        StringBuilder content = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line);
        }
        return content.toString();
    }

    /**
     * Wraps an I/O failure into a {@code NETWORK_ERROR} exception, keeping
     * the original message format and attaching the cause.
     *
     * @param url
     *            the URL that was requested
     * @param cause
     *            the underlying failure
     * @return the exception to throw
     */
    private static FetchException networkFailure(String url,
            IOException cause) {
        FetchException failure = new FetchException(ErrorCode.NETWORK_ERROR,
                "Request [" + url + "] failed:" + cause.getMessage());
        // The (int, String) constructor leaves the cause unset, so it can be
        // attached afterwards.
        failure.initCause(cause);
        return failure;
    }
}