正则表达式 HTML Apache
学习 Java 的正则表达式：抓取网页并解析 HTML 中的部分内容。
Java代码 收藏代码
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
/**
 * Demo that downloads a web page with Apache Commons HttpClient and extracts
 * link entries from it using regular expressions.
 */
public class HttpClientDemo {

    /**
     * Matches the {@code <ul class="d2_9">} list: from the opening tag through
     * a {@code <li>} link entry followed by a bracketed suffix, plus trailing text.
     */
    private static final Pattern LIST_PATTERN = Pattern.compile(
            "<ul class=\"d2_9\">([\\s\\S]*<li>)<a.*href='(.*)'.*>(.+?)</a> \\[(.*)\\]</li>([\\s].*)");

    /**
     * Matches a single link entry: group 1 is the {@code href} value, group 2 the
     * anchor text, group 3 the text inside the trailing square brackets.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a.*href='(.*)'.*>(.+?)</a> \\[(.*)\\]");

    /**
     * Fetches the body of {@code url} with an HTTP GET request.
     *
     * @param url address of the page to download
     * @return the response body as a string; an empty string when the response
     *         carries no body; {@code null} when the status code is not 200
     *         (the status line is then written to {@code System.err})
     * @throws Exception if executing the request or reading the response fails
     */
    public static String getHTML(String url) throws Exception {
        HttpClient httpClient = new HttpClient();
        GetMethod getMethod = new GetMethod(url);
        try {
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: " + getMethod.getStatusLine());
                return null;
            }
            // Read the content.
            byte[] responseBody = getMethod.getResponseBody();
            if (responseBody == null) {
                // No body available: avoid a NullPointerException in new String(...).
                return "";
            }
            // NOTE(review): decoding uses the platform default charset; the charset
            // declared by the response is not consulted here.
            return new String(responseBody);
        } finally {
            // Release the connection on every path, including the non-200 return
            // and any exception thrown above.
            getMethod.releaseConnection();
        }
    }

    /**
     * Downloads {@code url}, collects every match of the list pattern, then prints
     * the address, title and bracketed text of each link found in those matches,
     * followed by the number of links printed. Does nothing further when the page
     * could not be fetched.
     *
     * @param url address of the page to scrape
     * @throws Exception if fetching the page fails
     */
    public static void test(String url) throws Exception {
        String html = getHTML(url);
        if (html == null) {
            // getHTML has already reported the failure on System.err.
            return;
        }

        // Concatenate every region that matches the list pattern.
        StringBuilder listHtml = new StringBuilder();
        Matcher listMatcher = LIST_PATTERN.matcher(html);
        while (listMatcher.find()) {
            listHtml.append(listMatcher.group());
        }

        // Extract the link address, title and bracketed text from the collected regions.
        int count = 0;
        Matcher linkMatcher = LINK_PATTERN.matcher(listHtml.toString());
        while (linkMatcher.find()) {
            System.out.println("地址:" + linkMatcher.group(1));
            System.out.println("标题:" + linkMatcher.group(2));
            System.out.println("时间:" + linkMatcher.group(3));
            count++;
        }
        System.out.println("抓取条数:" + count);
    }

    /**
     * Runs the demo against a fixed page.
     *
     * @param args ignored
     * @throws Exception if fetching the page fails
     */
    public static void main(String[] args) throws Exception {
        String url = "http://cpc.people.com.cn/GB/194302/194306/index.html";
        test(url);
    }
}