这个属于前一个例子的实际应用版本,用来读取真实的页面并进行正则解析。
- package com.laozizhu.apache.httpclient;
- import java.net.Socket;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.apache.http.ConnectionReuseStrategy;
- import org.apache.http.HttpHost;
- import org.apache.http.HttpResponse;
- import org.apache.http.HttpVersion;
- import org.apache.http.impl.DefaultConnectionReuseStrategy;
- import org.apache.http.impl.DefaultHttpClientConnection;
- import org.apache.http.message.BasicHttpRequest;
- import org.apache.http.params.BasicHttpParams;
- import org.apache.http.params.HttpParams;
- import org.apache.http.params.HttpProtocolParams;
- import org.apache.http.protocol.BasicHttpContext;
- import org.apache.http.protocol.BasicHttpProcessor;
- import org.apache.http.protocol.ExecutionContext;
- import org.apache.http.protocol.HttpContext;
- import org.apache.http.protocol.HttpRequestExecutor;
- import org.apache.http.protocol.RequestConnControl;
- import org.apache.http.protocol.RequestContent;
- import org.apache.http.protocol.RequestExpectContinue;
- import org.apache.http.protocol.RequestTargetHost;
- import org.apache.http.protocol.RequestUserAgent;
- import org.apache.http.util.EntityUtils;
- /**
- * HttpClient使用例子:读取CSDN的所有投票状态
- *
- * @author 老紫竹(java2000.net)
- */
- public class HttpGet {
- public static void main(String[] args) throws Exception {
- HttpParams params = new BasicHttpParams();
- // HTTP 协议的版本,1.1/1.0/0.9
- HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);
- // 字符集
- HttpProtocolParams.setContentCharset(params, "UTF-8");
- // 伪装的浏览器类型
- // IE7 是
- // Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)
- //
- // Firefox3.03
- // Mozilla/5.0 (Windows; U; Windows NT 5.2; zh-CN; rv:1.9.0.3)
- // Gecko/2008092417 Firefox/3.0.3
- //
- HttpProtocolParams.setUserAgent(params, "HttpComponents/1.1");
- HttpProtocolParams.setUseExpectContinue(params, true);
- BasicHttpProcessor httpproc = new BasicHttpProcessor();
- httpproc.addInterceptor(new RequestContent());
- httpproc.addInterceptor(new RequestTargetHost());
- httpproc.addInterceptor(new RequestConnControl());
- httpproc.addInterceptor(new RequestUserAgent());
- httpproc.addInterceptor(new RequestExpectContinue());
- HttpRequestExecutor httpexecutor = new HttpRequestExecutor();
- HttpContext context = new BasicHttpContext(null);
- HttpHost host = new HttpHost("vote.csdn.net", 80);
- DefaultHttpClientConnection conn = new DefaultHttpClientConnection();
- ConnectionReuseStrategy connStrategy = new DefaultConnectionReuseStrategy();
- context.setAttribute(ExecutionContext.HTTP_CONNECTION, conn);
- context.setAttribute(ExecutionContext.HTTP_TARGET_HOST, host);
- System.out.println("<table>");
- try {
- // 这个85是因为目前有85页,如果有更多的页,需要手工修改或者传参数进来
- for (int i = 1; i <= 85; i++) {
- if (!conn.isOpen()) {
- Socket socket = new Socket(host.getHostName(), host.getPort());
- conn.bind(socket, params);
- }
- BasicHttpRequest request = new BasicHttpRequest("GET",
- "http://vote.csdn.net/VoteList.aspx?page=" + i);
- context.setAttribute(ExecutionContext.HTTP_REQUEST, request);
- request.setParams(params);
- httpexecutor.preProcess(request, httpproc, context);
- HttpResponse response = httpexecutor.execute(request, conn, context);
- response.setParams(params);
- httpexecutor.postProcess(response, httpproc, context);
- // 返回码
- if (response.getStatusLine().getStatusCode() != 200) {
- break;
- }
- parseData(EntityUtils.toString(response.getEntity()));
- if (!connStrategy.keepAlive(response, context)) {
- conn.close();
- }
- }
- } finally {
- conn.close();
- }
- System.out.println("</table>");
- }
- static final Pattern p = Pattern
- .compile(
- "<h4>.*?<a href=.*?voteid=(//d+)/">(.*?)</a></h4>.*?发起人:<a href=.*?>(.*?)</a>.*?<a href=.*?>(//d+) 人投票</a>",
- Pattern.DOTALL);
- /**
- * 解析页面,得到投票编号,题目,发起人和参与人数
- *
- * @param msg
- */
- public static void parseData(String msg) {
- String[] parts = msg.split("div class=/"kimi_modifysty/">");
- Matcher m;
- for (String s : parts) {
- m = p.matcher(s);
- if (m.find()) {
- System.out.println("<tr><td>" + m.group(1)
- + "</td><td><a href='http://vote.csdn.net/VotePost.aspx?voteid=" + m.group(1) + "'>"
- + m.group(2).replace(",", ",") + "</a></td><td>" + m.group(3) + "</td><td>"
- + m.group(4) + "</td></tr>");
- }
- }
- }
- }