项目需要,写了个简单的程序爬取山东省市区县乡镇区划信息。
依赖的jar包来源于httpcomponents-client-4.2.5-bin.zip;
package org.apache.http.examples.test;
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
/**
 * Crawls administrative-division pages from the National Bureau of Statistics site,
 * starting at the page named by {@link #file}, and collects the links found in
 * city, county and township table rows into a tab-separated text file.
 *
 * <p>For every matching row the link text that starts with {@link #pre} is emitted
 * followed by a tab; for every other link the href, a tab, the link text and a CRLF
 * are emitted and the link is then followed recursively. Output is echoed to stdout
 * and accumulated in {@link #sb}, which {@link #main} writes to {@code fileName + ".txt"}.
 *
 * <p>NOTE(review): the original header comment says this crawls Shandong province,
 * but {@link #pre} is "12"; presumably Shandong would need a different prefix —
 * confirm which province is intended.
 */
public class DailySign {
    /** Base URL that every request path is appended to. */
    public static final String URL_GET = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2014/";
    /** Number of requests issued so far; used to pause once every 200 requests. */
    public static int count = 0;
    /** Prefix that names the start page and identifies "code" link texts. */
    public static String pre = "12";
    /** Path of the first page to request, relative to {@link #URL_GET}. */
    public static String file = pre + ".html";
    /** Base name of the output file; ".txt" is appended in {@link #main}. */
    public static String fileName = pre + ".html";
    /** Output writer, opened in {@link #main}. */
    public static FileWriter writer = null;
    /** Accumulates everything that will be written to the output file. */
    public static StringBuffer sb = new StringBuffer();
    /** Matches a single-quoted anchor: group 1 is the href, group 2 the link text. */
    public static Pattern p = Pattern.compile("<a href='([^>]*)'>([^<]*)</a>");

    /** Charset the pages are decoded with. */
    private static final String PAGE_CHARSET = "GBK";
    /** Line terminators recognised when splitting a page (same set as BufferedReader.readLine). */
    private static final String LINE_BREAK = "\r\n|\r|\n";

    /**
     * Runs the crawl and writes the collected text to {@code fileName + ".txt"}.
     *
     * @param args unused
     * @throws Exception if opening the output file or any request fails
     */
    public static void main(String[] args) throws Exception {
        writer = new FileWriter(fileName + ".txt");
        try {
            get("", file);
        } finally {
            // Flush whatever was collected and close the file even when the crawl aborts.
            try {
                writer.write(sb.toString());
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Requests {@code URL_GET + prefix + req}, then parses the returned page and follows
     * the links found in it.
     *
     * @param prefix path fragment inserted between the base URL and {@code req}
     * @param req    path of the page to request
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException             on a connection or read failure
     * @throws InterruptedException    if the throttling sleep is interrupted
     */
    public static void get(String prefix, String req)
            throws ClientProtocolException, IOException, InterruptedException {
        count++;
        // Pause periodically so the site does not reject a burst of requests.
        if (count % 200 == 0) {
            Thread.sleep(1000);
        }
        String body;
        DefaultHttpClient client = new DefaultHttpClient();
        try {
            HttpGet pageGet = new HttpGet(URL_GET + prefix + req);
            HttpResponse pageResponse = client.execute(pageGet);
            body = readBody(pageResponse);
        } finally {
            // Release the connection before recursing so no client outlives its request.
            client.getConnectionManager().shutdown();
        }
        if (body != null) {
            parsePage(body);
        }
    }

    /**
     * Reads the response content, emits the links found in city/county/township rows
     * and follows them.
     *
     * @param response response to process; its entity is fully consumed
     * @throws IOException                  on a read failure or a failing nested request
     * @throws UnsupportedEncodingException if the page charset is not supported
     * @throws InterruptedException         if a nested request's throttling sleep is interrupted
     */
    public static void showResult(HttpResponse response)
            throws IOException, UnsupportedEncodingException, InterruptedException {
        String body = readBody(response);
        if (body != null) {
            parsePage(body);
        }
    }

    /**
     * Fully reads and decodes the response body, releasing the underlying stream.
     * Returns null when there is no entity or the status is not 2xx.
     */
    private static String readBody(HttpResponse response) throws IOException {
        int status = response.getStatusLine().getStatusCode();
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }
        if (status < 200 || status >= 300) {
            // Do not parse error pages; still drain the entity so the connection is released.
            EntityUtils.consume(entity);
            System.err.println("Skipping response with HTTP status " + status);
            return null;
        }
        byte[] raw = EntityUtils.toByteArray(entity);
        if (raw == null) {
            return null;
        }
        // Decode exactly once. Re-encoding with the platform charset and decoding as
        // UTF-8 would corrupt the text on any non-UTF-8 platform.
        return new String(raw, PAGE_CHARSET);
    }

    /** Scans a decoded page line by line, records matching links and recurses into them. */
    private static void parsePage(String body) throws IOException, InterruptedException {
        String[] lines = body.split(LINE_BREAK);
        for (int i = 0; i < lines.length; i++) {
            String line = lines[i];
            boolean countyRow = line.startsWith("<tr class='countytr'>");
            if (!countyRow
                    && !line.startsWith("<tr class='citytr'>")
                    && !line.startsWith("<tr class='towntr'>")) {
                continue;
            }
            Matcher m = p.matcher(line);
            while (m.find()) {
                String code = m.group(1);
                String name = m.group(2);
                if (name.startsWith(pre)) {
                    // Link text is the numeric code: emit it as the first column.
                    System.out.print(name + "\t");
                    sb.append(name).append("\t");
                } else {
                    System.out.print(code + "\t");
                    sb.append(code).append("\t");
                    sb.append(name).append("\r\n");
                    System.out.println(name);
                    String prefix = "";
                    // For county rows the request path is prefixed with characters 3-4 of
                    // the href. Presumably these are the province digits of a link shaped
                    // like "01/120101.html" — TODO confirm against the site.
                    if (countyRow && code.length() >= 5) {
                        prefix = "/" + code.substring(3, 5);
                    }
                    // NOTE(review): township-row links are followed with an empty prefix;
                    // verify that these resolve to valid URLs.
                    get(prefix, "/" + code);
                }
            }
        }
    }
}