把以前在百度空间收集的文章搬到javaeye了,主要用到的lib就是commons-httpclient和htmlparser,在此记录下一些关键的代码片段。
jar包清单
commons-codec-1.3.jar
commons-httpclient-3.1.jar
commons-lang.jar
commons-logging-1.1.jar
htmlparser.jar
log4j-1.2.15.jar
slf4j-api-1.5.8.jar
slf4j-log4j12-1.5.8.jar
扩展 org.apache.commons.httpclient.HttpClient,覆盖其executeMethod方法处理cookie
package util;
import java.io.IOException;
import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpState;
public class HttpClientEx extends HttpClient {
private HttpState httpState = new HttpState(); // http状态对象,主要保存cookie
private String cookie = "";
public int executeMethod(HttpMethod httpMethod) throws IOException, HttpException {
String cookie = this.getCookie();
String uri = httpMethod.getURI().getHost();
httpState.addCookie(new Cookie(uri, "cookie", cookie, "/", null, false));
this.setState(httpState);
int statues = super.executeMethod(httpMethod);
Header[] headerArray = httpMethod.getResponseHeaders();
for (Header h : headerArray) {
if (h.getName().trim().equalsIgnoreCase("Set-Cookie")) {
if (!this.getCookie().equals("")) { // 如果值不为空
this.setCookie(this.getCookie() + ";" + h.getValue());
} else {
this.setCookie(h.getValue());
}
}
}
return statues;
}
public String getCookie() {
return cookie;
}
public void setCookie(String cookie) {
this.cookie = cookie;
}
}
使用GET方法请求url并读取响应内容
// Fetch the blog list page with a GET request and decode the body using the
// charset reported by the response.
String url = HTTP_HI_BAIDU_COM + USER_ID + "/blog";
HttpClient client = new HttpClientEx();
GetMethod getMethod = new GetMethod(url);
String body;
try {
    client.executeMethod(getMethod);
    body = new String(getMethod.getResponseBody(), getMethod.getResponseCharSet());
} finally {
    // Always hand the connection back, even when the request or the read fails.
    getMethod.releaseConnection();
}
logger.debug("日志列表页面\n{}", body);
分析html页面中的div元素
// Parse the downloaded page and collect the tags matched by the "div" name filter.
Parser parser = Parser.createParser(body, getMethod.getResponseCharSet());
NodeFilter filter = new TagNameFilter("div");
NodeList nodeList = parser.parse(filter);
// Walk the matches and log every div whose id attribute is "m_blog".
for (int index = 0, total = nodeList.size(); index < total; index++) {
    Div div = (Div) nodeList.elementAt(index);
    String id = div.getAttribute("id");
    if (id == null || !id.equals("m_blog")) {
        continue;
    }
    logger.debug("id为m_blog的div内容\n{}", div.toHtml());
}
查找含有特定文字的节点集合
// Collect the nodes under div that contain the text "类别" ("category").
// NOTE(review): case-sensitivity and recursion depth depend on htmlparser's
// searchFor defaults — confirm against the library documentation.
NodeList searchFor = div.searchFor("类别");
设置User-Agent和post数据字符编码
// User-Agent string sent with requests (an MSIE 8.0 browser signature).
private static final String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; iCafeMedia; InfoPath.2)";
// Charset used for request content such as POST data.
private static final String CHARSET = "UTF-8";
// Build host-level parameters carrying the User-Agent and content charset.
HostParams params = new HostParams();
params.setParameter(HttpMethodParams.USER_AGENT,USER_AGENT);
params.setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, CHARSET);
// NOTE(review): setParams swaps in this new parameter object — confirm no
// previously configured host-level parameters need to be preserved.
client.getHostConfiguration().setParams(params);
使用POST方法向url提交表单数据
// Submit the login form with a POST request.
String url = HOST + "/login";
PostMethod postMethod = new PostMethod(url);
postMethod.setParameter("name", "fangwei");
postMethod.setParameter("password", "******");
try {
    client.executeMethod(postMethod);
} finally {
    // Release the connection whether or not the request succeeded, so it can be reused.
    postMethod.releaseConnection();
}