从国家统计局爬取山东省市区县乡镇

项目需要，写了个简单的程序，用来爬取山东省市区县乡镇区划信息。

依赖的jar包来源于httpcomponents-client-4.2.5-bin.zip;


package org.apache.http.examples.test;
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
//从国家统计局爬取山东省市区县乡镇
public class DailySign {
 
    public static final String URL_GET = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2014/";
     
    public static int count = 0;
    
    public static String pre="12";
    public static String file = pre+".html";
    public static String fileName = pre+".html";

    public static FileWriter writer = null;
    public static StringBuffer sb = new StringBuffer();
    
    public static Pattern p = Pattern.compile("<a href='([^>]*)'>([^<]*)</a>");
 
    public static void main(String[] args) throws Exception {
        writer = new FileWriter(fileName+".txt");
        get("",file); 
        try {
        	writer.write(sb.toString());
    		if(writer!=null){
    			writer.close();
    		}
		} catch (IOException e) {
			e.printStackTrace();
		}
    }
 
    public static void get(String prefix, String req) throws ClientProtocolException, IOException, InterruptedException {
        count++;
        //休眠,防止大量请求被网站拒绝
        if(count % 200 == 0){
            Thread.sleep(1000);
        }
        DefaultHttpClient client = new DefaultHttpClient();
        HttpGet signGet = new HttpGet(URL_GET + prefix + req);
        // 执行签到请求
        HttpResponse signResponse = client.execute(signGet);
        // 处理响应
        showResult(signResponse);
    }
 
    /**
     * 读取相应内容并输出
     * @throws InterruptedException 
     */
    public static void showResult(HttpResponse response) throws IOException, UnsupportedEncodingException, InterruptedException {
        int status = response.getStatusLine().getStatusCode();
        HttpEntity entity = response.getEntity();
        InputStream instream = null;
        if (entity != null) {
            instream = entity.getContent();
            BufferedReader reader = new BufferedReader(new InputStreamReader(instream,"GBK"));
            String line = null;
            while ((line = reader.readLine()) != null) {
                line = new String(line.getBytes(), "UTF-8");
                if (line.startsWith("<tr class='citytr'>") || line.startsWith("<tr class='countytr'>")
                        || line.startsWith("<tr class='towntr'>")) {
                    Matcher m = p.matcher(line);
                    while (m.find()) {
                        String code = m.group(1);
                        String name = m.group(2);
                        if (name.startsWith(pre)) {
                            System.out.print(name + "\t");
                            sb.append(name + "\t");
                        } else {
                            System.out.print(code + "\t");
                            sb.append(code + "\t");
                            sb.append(name+"\r\n");
                            System.out.println(name);
                            String prefix = "";
                            if (line.startsWith("<tr class='countytr'>")) {
                                prefix = "/"+code.substring(3, 5);
                            }
                            //递归
                            get(prefix, "/"+code);
                        }
                    }
                }                
            }
            instream.close();
            EntityUtils.consume(entity);
        }         
    }
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值