用httpclient-4.0.jar,httpcore-4.0.1.jar抓取页面内容的方法
public class ParsePage {
/**
 * Fetches the page at the given URL with an HTTP GET, decodes the response
 * body as UTF-8 and returns the result of passing that text to
 * {@code parseResult}.
 *
 * NOTE(review): {@code parseResult} is not defined in the visible part of this
 * class; its contract should be confirmed against its definition.
 *
 * @param url the address of the page to fetch
 * @return the value produced by {@code parseResult} for the decoded body; an
 *         empty string is passed to it when the response carries no entity
 * @throws Exception if the request fails or the body cannot be read
 */
public String parseURLContent(String url)
        throws Exception {
    HttpClient client = new DefaultHttpClient();
    try {
        HttpUriRequest request = new HttpGet(url);
        HttpResponse response = client.execute(request);
        HttpEntity entity = response.getEntity();
        String resultString = "";
        // Responses without a body have no entity to read.
        if (entity != null) {
            InputStream in = entity.getContent();
            try {
                java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
                byte[] buffer = new byte[8192];
                int count;
                // Copy only the bytes actually read; the rest of the buffer
                // holds zeros or leftovers from the previous iteration.
                while ((count = in.read(buffer)) != -1) {
                    body.write(buffer, 0, count);
                }
                // Decode once, after everything is collected, so a multi-byte
                // UTF-8 sequence can never be cut in half at a read boundary.
                resultString = body.toString("UTF-8");
            } finally {
                in.close();
            }
        }
        return this.parseResult(resultString);
    } finally {
        // The client is local to this call, so release its connections here.
        client.getConnectionManager().shutdown();
    }
}
/**
 * Extracts the first {@code <title>...</title>} element from the given text.
 * The match is non-greedy and may span line breaks.
 *
 * @param result the text to search
 * @return the first matching element including its tags, or an empty string
 *         when the text contains no such element
 */
private String getResponseMessage(String result) {
    Matcher titleMatcher =
            Pattern.compile("<title>.*?</title>", Pattern.DOTALL).matcher(result);
    // Only the first occurrence is ever reported.
    if (!titleMatcher.find()) {
        return "";
    }
    return titleMatcher.group();
}