爬虫模式-JAVA获取省市区编码

起因

准备做一个省市区三级联动的功能,在网上查找了很多资源,但符合要求的很少(大多缺失港澳台数据,也不方便判断直辖市、特别行政区)。下面的代码复制后即可直接运行。

数据来源:国家统计局《统计用区划代码和城乡划分代码》(2021年) http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/
运行环境:jdk8;

代码

jar 包

<!-- Dependencies -->
    <!-- jsoup: parses the fetched HTML pages (Jsoup.parse / select) -->
    <dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
    </dependency>
    
    <!-- hutool: HTTP client (HttpUtil / HttpRequest) plus DateUtil and IdUtil helpers -->
    <dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>4.5.11</version>
    </dependency>
    
    <!-- No version given here; presumably managed by the Spring Boot parent POM (confirm) -->
    <dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    
    <!-- lombok: the service code uses lombok.var for local type inference on JDK 8 -->
    <dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <optional>true</optional>
    </dependency>
    <!-- Used by the test class (SpringRunner / @SpringBootTest). -->
    <!-- NOTE(review): no <scope>test</scope> is declared; confirm whether main sources need it -->
    <dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-test</artifactId>
    </dependency>
    
    <!-- mybatis-plus: provides the ServiceImpl / IService base types used by the service layer -->
    <dependency>
    <groupId>com.baomidou</groupId>
    <artifactId>mybatis-plus-boot-starter</artifactId>
    <version>3.3.0</version>
    </dependency>
    
    <!-- MySQL JDBC driver -->
    <dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.35</version>
    </dependency>
    
    <!-- mybatis-plus code generator; presumably only needed to generate the
         controller/service/mapper/entity layers mentioned in the article (confirm) -->
    <dependency>
    <groupId>com.baomidou</groupId>
    <artifactId>mybatis-plus-generator</artifactId>
    <version>3.3.0</version>
    </dependency>
    <!-- Velocity; presumably the template engine used by the generator above (confirm) -->
    <dependency>
    <groupId>org.apache.velocity</groupId>
    <artifactId>velocity-engine-core</artifactId>
    <version>2.1</version>
    </dependency>

服务实现类 ChinasServiceImpl

package com.hn.yuan.city.service.impl;

import cn.hutool.core.date.DateUtil;
import cn.hutool.core.util.IdUtil;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpUtil;
import com.hn.yuan.city.entity.Chinas;
import com.hn.yuan.city.mapper.ChinasMapper;
import com.hn.yuan.city.service.ChinasService;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.hn.yuan.reptileCity.demoTest.Test;
import lombok.var;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.Date;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * <p>
 * 服务实现类
 * </p>
 *
 * @author XIAOCAO
 * 
 */
@Service
public class ChinasServiceImpl extends ServiceImpl<ChinasMapper, Chinas> implements ChinasService {


    @Autowired
    private ChinasMapper chinasMapper;

    /*什么情况下使用AtomicInteger
    1、作为多个线程同时使用的原子计数器。
    2、在比较和交换操作中实现非阻塞算法。
    https://www.jianshu.com/p/073096a729f6
     */
    public String addChinas(Chinas chinas) {
        chinasMapper.insert(chinas);
        return "成功";
    }

    private static final Logger log = LoggerFactory.getLogger(Test.class);

    //AtomicInteger类是系统底层保护的int类型,通过提供执行方法的控制进行值的原子操作
    private static AtomicInteger atomicInteger = new AtomicInteger(0);
    private static AtomicInteger atomicIntegerErrorNum = new AtomicInteger(0);


    public void test(String appendProvide, String appendCity) {

        System.out.println("===============开始抓取数据=================");
        var url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html";
        String html = HttpUtil.get(url);
        var htmlDoc = Jsoup.parse(html);
        var selectClasses = htmlDoc.getElementsByClass("provincetr");
        var startDateStr = DateUtil.formatDateTime(new Date());
        long startDate = System.currentTimeMillis();
//        ExecutorService executorService = new ThreadPoolExecutor(4, 5, 1L, TimeUnit.SECONDS, new ArrayBlockingQueue(4), Executors.defaultThreadFactory());
        for (int i = 0; i < selectClasses.size(); i++) {
            int finalI = i;
//            executorService.execute(() -> {
//                System.out.println(Thread.currentThread().getName() + " " + "--->开始爬数据");
            try {
                //一个线程跑就行
                startProvide(selectClasses, finalI, baseUrl, appendProvide, appendCity);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        System.out.println("耗时 =》");
        System.out.println((System.currentTimeMillis() - startDate) / 1000);

        System.out.println("开始时间 ==> " + startDateStr);
        System.out.println("结束时间 ==>" + DateUtil.formatDateTime(new Date()));

    }


    private String getHtml(String url) throws Exception {

        //以原子方式将当前值递增1并在递增后返回新值。它相当于i ++操作。
        atomicInteger.incrementAndGet();

        log.info("调用接口次数 :" + atomicInteger.get());
        log.info("请求开始时间 ==>" + DateUtil.formatDateTime(new Date()));
        log.info(url);
        String html = null;
        try {
            html = getHtml2(url);
        } catch (Exception e) {
            e.printStackTrace();
            log.info(Thread.currentThread().getName() + " = 超时。。睡5秒再重试");
            Thread.sleep(1000 * 5);
            log.info(Thread.currentThread().getName() + " = 超时重试");
            atomicIntegerErrorNum.incrementAndGet();
            log.info("error num = " + atomicIntegerErrorNum.get());

            try {
                //重试1次
                html = getHtml2(url);
            } catch (Exception e2) {
                e2.printStackTrace();

                log.info("error num = " + atomicIntegerErrorNum.get());
                log.info(Thread.currentThread().getName() + " 第二次超时重试");
                log.info(Thread.currentThread().getName() + " = 第二次超时。。睡12分钟再重试");
                log.info("error num = " + atomicIntegerErrorNum.get());

                Thread.sleep(1000 * 60 * 12);

                //重试2次
                html = getHtml2(url);

            }

        }
        log.info("请求结束时间 ==>" + DateUtil.formatDateTime(new Date()));

        System.out.println("");
        System.out.println("");
        return html;

    }

    private String getHtml2(String url) throws Exception {

        //链式构建请求
        return HttpRequest.get(url)
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36")//头信息,多个头信息多次调用此方法即可
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")//头信息,多个头信息多次调用此方法即可
                .header("Cookie", "_trs_uv=" + IdUtil.simpleUUID() + "; SF_cookie_1=" + IdUtil.fastUUID())
                .header("If-None-Match", "f32-5d4bccaa05a80-gzip")
//                    .header("If-Modified-Since",new Date())
//                    .form(paramMap)//表单内容
                .timeout(15000)//超时,毫秒
                .execute().body();

    }

    //国家统计局链接
    public static String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";


    /**
     * @param selectClasses
     * @param i
     * @param baseUrl
     * @param appendProvide 需要查询的省  ,null表示查询所有
     * @throws Exception
     */
    private void startProvide(Elements selectClasses, Integer i, String baseUrl, String appendProvide, String appendCity) throws Exception {

        var provideCodes = selectClasses.get(i).children();
        //1.省份 provincetr
        for (int provideCodeIndex = 0; provideCodeIndex < provideCodes.size(); provideCodeIndex++) {
            var provideCodeUrl = provideCodes.get(provideCodeIndex).select("a").attr("href");
            var provideName = provideCodes.get(provideCodeIndex).select("a").text();
//            System.out.println("省份 = " + provideName);
            if (!StringUtils.isBlank(provideCodeUrl)) {
                if (StringUtils.isBlank(appendProvide)) {
                    provide(provideCodeUrl, provideName, null);
                } else {
                    if (appendProvide.equals(provideName)) {
                        System.out.println("要的省份 = " + provideName);
                        provide(provideCodeUrl, provideName, appendCity);
                    }
                }
            }
        }
    }


    /**
     * 通过省,市获取下面的数据
     *
     * @param provideCodeUrl
     */
    public void provide(String provideCodeUrl, String provideName, String appendCity) throws Exception {
        var provideCode = provideCodeUrl.split("\\.")[0];

//                    String gotoCityHtml = HttpUtil.get(baseUrl + provideCodeUrl);
        String gotoCityHtml = getHtml(baseUrl + provideCodeUrl);
        var cityHtmlDoc = Jsoup.parse(gotoCityHtml);
        Elements selectCityClass = cityHtmlDoc.select(".citytr");

        //拼接省code码插入数据库
        var len = provideCode.length();
        var provideCode2 = provideCode;
        if (len < 6) {
            len = 6 - len;
            for (int l = 0; l < len; l++) {
                provideCode2 += "0";
            }
        }

        if (StringUtils.isBlank(appendCity)) {
            //todo 保存到数据库  保存省
            System.out.println("保存数据库的id:" + provideCode2 + "省名称:" + provideName + "code码:" + provideCode2 + "level:" + 1);
            Chinas chinas = new Chinas().setCode(Long.valueOf(provideCode2)).setName(provideName).setPid(Long.valueOf(0)).setLevel("1");
            this.addChinas(chinas);
            city(selectCityClass, provideName, provideCode, null);
        } else {
            city(selectCityClass, provideName, provideCode, appendCity);
        }

    }


    /**
     * 城市
     *
     * @param selectCityClass
     * @param provideName
     * @param provideCode
     * @throws Exception
     */
    private void city(Elements selectCityClass, String provideName, String provideCode, String appendCity) throws Exception {
        //2.城市  citytr
        for (int cityIndex = 0; cityIndex < selectCityClass.size(); cityIndex++) {
            var gotoCountyUrl = selectCityClass.get(cityIndex).select("td").get(1).select("a").attr("href");
            var cityName = selectCityClass.get(cityIndex).select("td").get(1).select("a").text();
            if (StringUtils.isBlank(gotoCountyUrl)) {
                continue;
            }

            if (StringUtils.isBlank(appendCity)) {
                this.appendCity(gotoCountyUrl, provideName, provideCode, cityName);
            } else {
                if (appendCity.equals(cityName)) {
                    this.appendCity(gotoCountyUrl, provideName, provideCode, cityName);
                }
            }


        }
    }


    private void appendCity(String gotoCountyUrl, String provideName, String provideCode, String cityName) throws Exception {
        //String countytr = HttpUtil.get(baseUrl + gotoCountyUrl);
        boolean flag = true;
        String countytr = getHtml(baseUrl + gotoCountyUrl);
        var countytrDoc = Jsoup.parse(countytr);
        Elements countyClass = countytrDoc.select(".countytr");
        if (countyClass.size() == 0) {
            flag = false;
            countyClass = countytrDoc.select(".towntr");
        }

        var strCityUrl = gotoCountyUrl.split("/");
        var cityCode = strCityUrl[strCityUrl.length - 1].split("\\.")[0];
        var cityCodeLen = cityCode.length();
        var cityCode2 = cityCode;
        if (cityCodeLen < 6) {
            cityCodeLen = 6 - cityCodeLen;
            for (int l = 0; l < cityCodeLen; l++) {
                cityCode2 += "0";
            }
        }
        if (cityName.equals("市辖区")) {
            cityName = provideName;
        }
        cityCode = cityCode2;

        //拼接省code码插入数据库
        var len = provideCode.length();
        var provideCode2 = provideCode;
        if (len < 6) {
            len = 6 - len;
            for (int l = 0; l < len; l++) {
                provideCode2 += "0";
            }
        }
        if (!flag) {
            System.out.println("直辖市,进入此方法");
        } else {
            //todo 保存到数据库   保存城市
            System.out.println("保存数据库的市id:" + cityCode + " 市名称:" + cityName + " code码:" + provideCode2 + "level:" + 2);
            Chinas chinas = new Chinas().setCode(Long.valueOf(cityCode)).setName(cityName).setPid(Long.valueOf(provideCode2)).setLevel("2");
            this.addChinas(chinas);
            county(countyClass,
                    provideName,
                    provideCode, cityCode);
        }

    }


    /**
     * 街道
     *
     * @param countyClass
     * @param provideCode
     */
    private void county(Elements countyClass, String provideName, String provideCode, String cityCode) throws Exception {
        //3.县区  countytr
        for (int county = 0; county < countyClass.size(); county++) {
            var gotoTownUrl = countyClass.get(county).select("td").get(1).select("a").attr("href");
            var countyName = countyClass.get(county).select("td").get(1).select("a").text();
            if (StringUtils.isBlank(gotoTownUrl)) {
                continue;
            }

            String gotoTownUrls = StringUtils.substringBetween(gotoTownUrl, "/", ".");

            //todo 保存到数据库
            System.out.println("保存数据库的县 区数据id:" + gotoTownUrls + " 县区名称:" + countyName + " code码:" + cityCode + "level:" + 3);
            Chinas chinas = new Chinas().setCode(Long.valueOf(gotoTownUrls)).setName(countyName).setPid(Long.valueOf(cityCode)).setLevel("3");
            this.addChinas(chinas);
        }
    }

}

服务类 ChinasService

package com.hn.yuan.city.service;

import com.hn.yuan.city.entity.Chinas;
import com.baomidou.mybatisplus.extension.service.IService;

/**
 * Service contract for the administrative-division crawler.
 *
 * @author XIAOCAO
 */
public interface ChinasService extends IService<Chinas> {

    /**
     * Starts a crawl of the division pages.
     *
     * @param appendProvide name of the province to crawl, e.g. "河南省";
     *                      {@code null} crawls all provinces
     * @param appendCity    name of the city to crawl, e.g. "郑州市";
     *                      {@code null} crawls all cities
     */
    void test(String appendProvide, String appendCity);
}

测试类 testDemo

package com.hn.yuan.city;

import com.hn.yuan.city.service.ChinasService;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;


@RunWith(SpringRunner.class)
@SpringBootTest
public class testDemo {

    @Autowired
    private ChinasService chinasService;

    /**
     * Runs the crawler inside the Spring context.
     * To crawl a single city instead, pass names such as "河南省" and "郑州市".
     */
    @Test
    public void tests() {
        // null for both filters: fetch every province and every city
        final String provinceFilter = null;
        final String cityFilter = null;
        chinasService.test(provinceFilter, cityFilter);
    }

}

注意:controller、service、mapper、entity 层均由 mybatis-plus 代码生成器自动生成;本文只贴出了 service 层的代码,controller、mapper、entity 的代码未列出。
全面省市区县居委5级详情看转载:https://blog.csdn.net/qq_15421685/article/details/124754314

Mysql数据表数据包含港澳台

https://pan.baidu.com/s/1CwnmUGSPFzhZdwIWNTwd_Q?pwd=1102
提取码:1102

在这里插入图片描述

各位看官》创作不易,点个赞!!!
诸君共勉:万事开头难,只要不放弃。

免责声明:本文章仅用于学习参考


实战模式-Vue+Java后台实现省市区三级联动
实战模式-微信小程序java后台+mysql实现省市区三级联动
微信小程序组件简易实现省市区三级联动
免责声明:本文章仅用于学习参考

  • 4
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值