Java 获取行政区划编码(省、市、区县、街道/镇、居委会,共 5 级)

背景

打算做一个省市区多级联动的功能,但在网上没有找到符合自己要求的数据,便着手写了一个简单的爬虫来抓取数据。由于时间仓促,代码写得比较粗糙(嵌套循环过多),可自行优化。复制代码即可直接运行。

程序大约需要一个多小时才能运行完成。

如果不想自己重新抓取,文章最后附有 2 份不同数据结构的 SQL 文件,可直接使用。

部分城市没有区县这一级(因此不足 5 级),例如广东省东莞市的下一级直接是镇。

数据来源:国家统计局《统计用区划代码和城乡划分代码》(2021 年) http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/

运行环境:JDK 8。
 


代码

<!-- 依赖包(下面的代码还用到了 hutool、lombok、commons-lang3 和 slf4j,需自行引入对应依赖) -->
<dependency>
   <groupId>org.jsoup</groupId>
   <artifactId>jsoup</artifactId>
   <version>1.11.3</version>
</dependency>

import cn.hutool.core.date.DateUtil;


import cn.hutool.core.util.IdUtil;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpUtil;
import lombok.var;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Date;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Crawler for the administrative division code pages published under {@link #baseUrl}.
 *
 * <p>The crawl walks the page hierarchy top-down: the index page (rows of class
 * {@code provincetr}), a province page ({@code citytr}), a city page ({@code countytr}, or
 * {@code towntr} directly when the city has no county level), a county page ({@code towntr})
 * and finally a town page ({@code villagetr}). Results are only printed to standard output;
 * the {@code TODO} markers show where persistence is meant to be added.
 *
 * <p>Everything runs on the calling thread, one request at a time.
 */
public class Test {

    private static final Logger log = LoggerFactory.getLogger(Test.class);

    /** Timeout applied to every HTTP request, in milliseconds. */
    private static final int TIMEOUT_MS = 15000;

    /** Pause before the first retry of a failed request, in milliseconds (5 seconds). */
    private static final long FIRST_RETRY_DELAY_MS = 1000L * 5;

    /** Pause before the second and last retry, in milliseconds (12 minutes). */
    private static final long SECOND_RETRY_DELAY_MS = 1000L * 60 * 12;

    /** Number of page requests started through {@link #getHtml(String)}. */
    private static final AtomicInteger atomicInteger = new AtomicInteger(0);

    /** Number of failed request attempts seen by {@link #getHtml(String)}. */
    private static final AtomicInteger atomicIntegerErrorNum = new AtomicInteger(0);

    /**
     * Root URL of the data set; every relative link found on a page is resolved against it.
     * Left public and non-final so existing code that reads or reassigns it keeps working.
     */
    public static String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";


    public static void main(String[] args) {

        // Crawl everything:
        // test(null, null);

        // Crawl one city only
        test("广东省", "中山市");
    }

    /**
     * Downloads a page, retrying up to two times when the request throws.
     *
     * <p>The first retry happens after {@link #FIRST_RETRY_DELAY_MS}, the second after
     * {@link #SECOND_RETRY_DELAY_MS}. Every failed attempt is counted in
     * {@link #atomicIntegerErrorNum}.
     *
     * @param url absolute URL of the page
     * @return the response body
     * @throws InterruptedException if the thread is interrupted while waiting to retry; the
     *                              interrupt flag is restored before the exception is rethrown
     * @throws Exception            the failure of the third attempt, if all attempts fail
     */
    private static String getHtml(String url) throws Exception {
        log.info("调用接口次数 :{}", atomicInteger.incrementAndGet());
        log.info("请求开始时间 ==>{}", DateUtil.formatDateTime(new Date()));
        log.info(url);

        String threadName = Thread.currentThread().getName();
        String html;
        try {
            html = getHtml2(url);
        } catch (Exception firstFailure) {
            // Pass the exception to the logger instead of printStackTrace() so the stack trace
            // ends up in the same place as the rest of the log output.
            log.warn("{} = 超时。。睡5秒再重试", threadName, firstFailure);
            log.info("error num = {}", atomicIntegerErrorNum.incrementAndGet());
            pause(FIRST_RETRY_DELAY_MS);
            log.info("{} = 超时重试", threadName);

            try {
                // Retry #1
                html = getHtml2(url);
            } catch (Exception secondFailure) {
                log.warn("{} = 第二次超时。。睡12分钟再重试", threadName, secondFailure);
                // The second failure is an error too; count it before reporting the counter.
                log.info("error num = {}", atomicIntegerErrorNum.incrementAndGet());
                pause(SECOND_RETRY_DELAY_MS);
                log.info("{} 第二次超时重试", threadName);

                // Retry #2; a failure here propagates to the caller.
                html = getHtml2(url);
            }
        }
        log.info("请求结束时间 ==>{}", DateUtil.formatDateTime(new Date()));

        System.out.println("");
        System.out.println("");
        return html;
    }

    /**
     * Sleeps for the given time and keeps the thread's interrupt flag set when interrupted, so
     * that callers which catch {@link Exception} broadly can still notice the interruption.
     *
     * @param millis time to sleep, in milliseconds
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    private static void pause(long millis) throws InterruptedException {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw e;
        }
    }

    /**
     * Performs a single GET request with browser-like headers and returns the response body.
     *
     * @param url absolute URL of the page
     * @return the response body
     * @throws Exception if the request fails
     */
    private static String getHtml2(String url) throws Exception {
        // NOTE(review): the hard-coded If-None-Match value is kept from the original code. If it
        // ever matches a page's real ETag the server may answer 304 with an empty body - confirm
        // that this header is really wanted.
        return HttpRequest.get(url)
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36")
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
                // A fresh pair of random cookie values is generated for every request.
                .header("Cookie", "_trs_uv=" + IdUtil.simpleUUID() + "; SF_cookie_1=" + IdUtil.fastUUID())
                .header("If-None-Match", "f32-5d4bccaa05a80-gzip")
                .timeout(TIMEOUT_MS)
                .execute().body();
    }

    /**
     * Crawls the data set and prints what it finds, optionally limited to one province and one
     * city.
     *
     * @param appendProvide name of the province to crawl; blank or {@code null} crawls all
     *                      provinces (and then {@code appendCity} is ignored)
     * @param appendCity    name of the city to crawl inside {@code appendProvide}; blank or
     *                      {@code null} crawls every city of that province
     */
    public static void test(String appendProvide, String appendCity) {

        System.out.println("===============开始抓取数据==================");

        // Resolve the index page against baseUrl so it always matches the pages crawled below,
        // and use an explicit timeout so a stalled connection cannot block the crawl forever.
        String html = HttpUtil.get(baseUrl + "index.html", TIMEOUT_MS);
        Elements provinceRows = Jsoup.parse(html).getElementsByClass("provincetr");
        if (provinceRows.isEmpty()) {
            log.warn("首页没有解析到任何省份数据(provincetr),请检查页面是否可以正常访问");
        }

        String startDateStr = DateUtil.formatDateTime(new Date());
        long startDate = System.currentTimeMillis();

        // A single thread is enough; each index row holds several provinces.
        for (int rowIndex = 0; rowIndex < provinceRows.size(); rowIndex++) {
            if (Thread.currentThread().isInterrupted()) {
                log.warn("线程已被中断,停止抓取");
                break;
            }
            startProvide(provinceRows.get(rowIndex).children(), appendProvide, appendCity);
        }

        System.out.println("耗时 =》");
        System.out.println((System.currentTimeMillis() - startDate) / 1000);

        System.out.println("开始时间 ==> " + startDateStr);
        System.out.println("结束时间 ==>" + DateUtil.formatDateTime(new Date()));
    }

    /**
     * Crawls every province linked from one row of the index page.
     *
     * <p>A failure inside one province is logged and does not stop the remaining provinces of
     * the row.
     *
     * @param provinceCells the cells of one {@code provincetr} row
     * @param appendProvide name of the province to crawl; blank or {@code null} crawls all
     * @param appendCity    city filter, only applied together with {@code appendProvide}
     */
    private static void startProvide(Elements provinceCells, String appendProvide, String appendCity) {
        // Level 1: provinces (provincetr)
        for (int cellIndex = 0; cellIndex < provinceCells.size(); cellIndex++) {
            if (Thread.currentThread().isInterrupted()) {
                return;
            }

            Elements anchor = provinceCells.get(cellIndex).select("a");
            String provideCodeUrl = anchor.attr("href");
            String provideName = anchor.text();

            if (StringUtils.isBlank(provideCodeUrl)) {
                continue;
            }

            boolean crawlAll = StringUtils.isBlank(appendProvide);
            if (!crawlAll && !appendProvide.equals(provideName)) {
                continue;
            }

            try {
                // Without a province filter the city filter is ignored.
                provide(provideCodeUrl, provideName, crawlAll ? null : appendCity);
            } catch (Exception e) {
                log.error("抓取省份 {} 失败", provideName, e);
            }
        }
    }

    /**
     * Crawls one province: downloads its page and walks the cities listed there.
     *
     * @param provideCodeUrl link of the province page relative to {@link #baseUrl}; the part
     *                       before the first dot is used as the province code
     * @param provideName    province name as shown on the index page
     * @param appendCity     name of the only city to crawl; blank or {@code null} crawls all
     * @throws Exception if a page cannot be downloaded after all retries
     */
    public static void provide(String provideCodeUrl, String provideName, String appendCity) throws Exception {
        String provideCode = provideCodeUrl.split("\\.")[0];

        String cityHtml = getHtml(baseUrl + provideCodeUrl);
        Elements cityRows = Jsoup.parse(cityHtml).select(".citytr");

        // TODO persist the province
        city(cityRows, provideName, provideCode, appendCity);
    }

    /**
     * Returns the link(s) inside the second cell of a table row.
     *
     * @param rows  table rows
     * @param index index of the row to inspect
     * @return the {@code a} elements of the row's second {@code td}; empty when the row has
     *         fewer than two cells, so that {@code attr}/{@code text} yield empty strings
     */
    private static Elements secondCellLink(Elements rows, int index) {
        Elements cells = rows.get(index).select("td");
        if (cells.size() < 2) {
            return new Elements();
        }
        return cells.get(1).select("a");
    }

    /**
     * Walks the cities of a province.
     *
     * @param selectCityClass the {@code citytr} rows of the province page
     * @param provideName     province name
     * @param provideCode     province code, used as the first URL path segment
     * @param appendCity      name of the only city to crawl; blank or {@code null} crawls all
     * @throws Exception if a page cannot be downloaded after all retries
     */
    private static void city(Elements selectCityClass, String provideName, String provideCode, String appendCity) throws Exception {
        // Level 2: cities (citytr)
        for (int cityIndex = 0; cityIndex < selectCityClass.size(); cityIndex++) {
            Elements anchor = secondCellLink(selectCityClass, cityIndex);
            String gotoCountyUrl = anchor.attr("href");
            String cityName = anchor.text();

            System.out.println("城市 = " + cityName);

            if (StringUtils.isBlank(gotoCountyUrl)) {
                continue;
            }

            if (StringUtils.isBlank(appendCity) || appendCity.equals(cityName)) {
                crawlCity(gotoCountyUrl, provideName, provideCode);
            }
        }
    }

    /**
     * Crawls one city page. When the page lists counties they are walked next; when it lists no
     * county rows, its town rows are walked directly.
     *
     * @param gotoCountyUrl link of the city page relative to {@link #baseUrl}
     * @param provideName   province name
     * @param provideCode   province code
     * @throws Exception if a page cannot be downloaded after all retries
     */
    private static void crawlCity(String gotoCountyUrl, String provideName, String provideCode) throws Exception {
        Document cityDoc = null;
        String cityHtml = getHtml(baseUrl + gotoCountyUrl);
        Elements countyRows = Jsoup.parse(cityHtml).select(".countytr");

        // TODO persist the city. The original draft prepared two values for this step: the city
        // code (file name of gotoCountyUrl without extension) right-padded with '0' to 6
        // characters, and the province name used in place of the city name "市辖区".

        if (countyRows.size() == 0) {
            // No county level: the town links on this page are relative to the province folder.
            town(Jsoup.parse(cityHtml).select(".towntr"), provideName, provideCode + "/");
        } else {
            county(countyRows, provideName, provideCode);
        }
    }

    /**
     * Walks the counties of a city and crawls the towns of each.
     *
     * @param countyClass the {@code countytr} rows of the city page
     * @param provideName province name
     * @param provideCode province code
     * @throws Exception if a page cannot be downloaded after all retries
     */
    private static void county(Elements countyClass, String provideName, String provideCode) throws Exception {
        // Level 3: counties / districts (countytr)
        for (int countyIndex = 0; countyIndex < countyClass.size(); countyIndex++) {
            Elements anchor = secondCellLink(countyClass, countyIndex);
            String gotoTownUrl = anchor.attr("href");
            String countyName = anchor.text();
            System.out.println("县区 = " + countyName);

            if (StringUtils.isBlank(gotoTownUrl)) {
                continue;
            }

            // TODO persist the county

            String countyHtml = getHtml(baseUrl + provideCode + "/" + gotoTownUrl);
            Elements townRows = Jsoup.parse(countyHtml).select(".towntr");

            // Town links on the county page are relative to the folder the county page is in,
            // i.e. the first path segment of the county link.
            String countyFolder = gotoTownUrl.split("/")[0];
            town(townRows, provideName, provideCode + "/" + countyFolder + "/");
        }
    }

    /**
     * Walks towns / sub-districts and prints the village-level entries of each.
     *
     * @param townClass   the {@code towntr} rows to walk
     * @param provideName province name
     * @param parentPath  path below {@link #baseUrl}, ending with {@code /}, against which the
     *                    links of {@code townClass} are resolved
     * @throws Exception if a page cannot be downloaded after all retries
     */
    private static void town(Elements townClass, String provideName, String parentPath) throws Exception {
        // Level 4: towns / sub-districts (towntr)
        for (int townIndex = 0; townIndex < townClass.size(); townIndex++) {
            Elements anchor = secondCellLink(townClass, townIndex);
            String gotoVillageHref = anchor.attr("href");
            String townName = anchor.text();
            System.out.println("街道。镇 = " + townName);

            if (StringUtils.isBlank(gotoVillageHref)) {
                continue;
            }

            // TODO persist the town

            // Level 5: neighbourhood / village committees (villagetr)
            String villageHtml = getHtml(baseUrl + parentPath + gotoVillageHref);
            Elements villageRows = Jsoup.parse(villageHtml).select(".villagetr");

            for (int villageIndex = 0; villageIndex < villageRows.size(); villageIndex++) {
                Elements tds = villageRows.get(villageIndex).select("td");
                if (tds.size() < 3) {
                    // Skip rows that lack the three expected cells instead of failing the crawl.
                    log.warn("跳过格式不正确的居委会数据行: {}", villageRows.get(villageIndex).text());
                    continue;
                }
                String villageCode = tds.get(0).text();
                String villageType = tds.get(1).text();
                String villageName = tds.get(2).text();
                System.out.println(villageCode + " " + villageType + " " + villageName);

                // TODO persist the village
            }
        }
    }

}

附录

2021全国行政区域4级,5级(不含港澳台)-MySQL文档类资源-CSDN下载

免责声明:本文章仅用于学习参考

  • 4
    点赞
  • 19
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值