随便写了一个抓取网页内容的方法,使用Java的URL类和HttpClient包抓取网页中特定id的内容数据。代码的核心是正则匹配,因此抓取速度较慢;当需要匹配大量数据内容时,以下方法不适用。
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;
/**
 * Scrapes a fixed set of labelled {@code <span>} fields from a numbered range of pages and
 * appends them to a text file, one page per line.
 *
 * <p>For every id in the configured range the page is requested once with Commons HttpClient.
 * Pages whose reported content length is not greater than {@link #MIN_CONTENT_LENGTH} are
 * skipped. For the remaining pages the first match of each field pattern is written to
 * {@link #OUTPUT_FILE}, fields separated by a single space and the record terminated by CRLF.
 */
public class CatchWebContent {

    /** Id prefixes of the {@code <span>} elements to extract, in output order. */
    private static final String[] FIELD_IDS = { "lblPost", "lblSecondName",
            "lblFirstName", "lblBirthday", "lblBirthCity", "lblHometown", "lblSex" };

    /** One pre-compiled pattern per entry of {@link #FIELD_IDS}; group 1 is the span text. */
    private static final Pattern[] FIELD_PATTERNS = compileFieldPatterns();

    /** Page address without the trailing numeric id. */
    private static final String BASE_URL = "http://www.XXX.cn/XXX/XXX/XXX.aspx?Id=";

    /** File that receives the extracted records (opened in append mode). */
    private static final String OUTPUT_FILE = "D://Temp/chuguo.txt";

    /** Charset used to decode the response body. */
    private static final String PAGE_CHARSET = "gb2312";

    /** First page id to fetch (inclusive). */
    private static final int FIRST_ID = 5;

    /** Last page id to fetch (exclusive). */
    private static final int END_ID = 3654;

    /** Responses whose reported length is not greater than this are skipped. */
    private static final long MIN_CONTENT_LENGTH = 100;

    /**
     * Walks the id range and scrapes each page in turn. A failure on one page is reported on
     * standard output and does not stop the remaining pages from being processed.
     *
     * @param args ignored
     */
    public static void main(String args[]) {
        // A single client is shared by all requests so the underlying connection can be
        // reused; each request releases its connection when it is done.
        HttpClient httpClient = new HttpClient();
        for (int id = FIRST_ID; id < END_ID; id++) {
            String htmlurl = BASE_URL + id;
            try {
                URL url = new URL(htmlurl);
                System.out.println(url);
                scrapePage(httpClient, url);
            } catch (MalformedURLException e) {
                System.out.println("url illegal: " + htmlurl);
            }
        }
    }

    /** Builds the field patterns once; they do not depend on the page being scraped. */
    private static Pattern[] compileFieldPatterns() {
        Pattern[] patterns = new Pattern[FIELD_IDS.length];
        for (int i = 0; i < FIELD_IDS.length; i++) {
            String regex = "<span\\s*id=\"" + FIELD_IDS[i] + ".*?\">(.*?)</span>";
            // CANON_EQ is kept so matching behaves exactly as before.
            patterns[i] = Pattern.compile(regex, Pattern.CANON_EQ);
        }
        return patterns;
    }

    /** Fetches one page, extracts its fields and appends them to the output file. */
    private static void scrapePage(HttpClient httpClient, URL url) {
        GetMethod getMethod = new GetMethod(url.toString());
        try {
            httpClient.executeMethod(getMethod);
            // NOTE(review): a response without a Content-Length header presumably reports a
            // negative length here and is therefore skipped as well - confirm this is wanted.
            long contentLength = getMethod.getResponseContentLength();
            if (contentLength <= MIN_CONTENT_LENGTH) {
                return;
            }
            try {
                String record = extractFields(readBody(getMethod));
                // A page on which no field matched produces no record at all (previously
                // this case dereferenced a writer that had never been opened).
                if (record.length() > 0) {
                    appendRecord(record);
                }
            } catch (UnsupportedEncodingException e) {
                System.out.println("encoding error: " + e);
            } catch (IOException e) {
                System.out.println("read file error: " + e);
            }
        } catch (HttpException e1) {
            System.out.println("http error: " + e1);
        } catch (IOException e1) {
            System.out.println("io error: " + e1);
        } finally {
            // Always hand the connection back, including on the early return above.
            getMethod.releaseConnection();
        }
    }

    /**
     * Reads the response body of an executed request into one string. Line terminators are
     * dropped, so a field that spans several lines in the markup can still be matched.
     */
    private static String readBody(GetMethod getMethod) throws IOException {
        java.io.InputStream body = getMethod.getResponseBodyAsStream();
        if (body == null) {
            return "";
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(body, PAGE_CHARSET));
        try {
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
            return page.toString();
        } finally {
            reader.close();
        }
    }

    /**
     * Returns the first match of every field pattern, each followed by one space, in
     * {@link #FIELD_IDS} order. Fields that do not occur are left out; the result is empty
     * when nothing matched.
     */
    private static String extractFields(String html) {
        StringBuilder record = new StringBuilder();
        for (Pattern fieldPattern : FIELD_PATTERNS) {
            Matcher matcher = fieldPattern.matcher(html);
            if (matcher.find()) {
                record.append(matcher.group(1)).append(' ');
            }
        }
        return record.toString();
    }

    /** Appends one CRLF-terminated record to the output file and closes the file again. */
    private static void appendRecord(String record) throws IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(OUTPUT_FILE, true));
        try {
            writer.write(record);
            writer.write("\r\n");
        } finally {
            writer.close();
        }
    }
}