JAVA爬虫爬取国家统计局行政区划数据(2021年最新数据)

一、引入jsoup依赖

        <!-- jsoup: HTML fetching and parsing. Pinned to a release newer than 1.14.2 because
             earlier versions (including the previously used 1.9.2) are covered by a published
             denial-of-service advisory for parsing untrusted HTML. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version>
        </dependency>

二、爬虫程序

@Service
public class JavaJsoupService {

    /**
     * Name of the only province this example crawls; all five administrative levels
     * underneath it are collected.
     */
    private static final String TARGET_PROVINCE_NAME = "宁夏回族自治区";

    /** Per-request timeout handed to jsoup, in milliseconds (200 * 2000 = 400 000 ms). */
    private static final int TIMEOUT_MILLIS = 200 * 2000;

    /** File suffix of every page; on the index page the href is "<2-digit code>.html". */
    private static final String HTML_SUFFIX = ".html";

    /** Zero padding that replaces ".html" to turn a province href into a 12-digit area code. */
    private static final String PROVINCE_CODE_PADDING = "0000000000";

    /** County-level placeholder row ("municipal district") that is not stored or descended into. */
    private static final String SKIPPED_COUNTY_NAME = "市辖区";

    /**
     * Fetches and parses one page.
     *
     * @param url absolute page address
     * @return the parsed document, or {@code null} when the request fails; callers treat
     *         {@code null} as "this page has no rows"
     * @throws IllegalArgumentException if {@code url} is {@code null} or empty
     */
    private static Document connect(String url) {
        if (url == null || url.isEmpty()) {
            throw new IllegalArgumentException("无效的url");
        }
        try {
            return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
        } catch (IOException e) {
            // Best effort: report the failure (including its cause, which may be a timeout
            // rather than a missing page) and let the caller continue with the other pages.
            System.out.println(url + "地址不存在: " + e);
            return null;
        }
    }

    /**
     * Crawls the province index page. Only the province named by
     * {@link #TARGET_PROVINCE_NAME} is collected, together with everything below it.
     *
     * @param url address of the index page, e.g.
     *            http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html
     * @return the province followed by all of its descendants, parents before children;
     *         empty when the page cannot be fetched or the province is not listed
     */
    public List<SysArea> getProvinces(String url) {
        List<SysArea> sysAreas = new ArrayList<>();
        Document page = connect(url);
        if (page == null) {
            return sysAreas;
        }
        for (Element link : page.select("tr.provincetr a")) {
            String name = link.text();
            if (!TARGET_PROVINCE_NAME.equals(name)) {
                continue;
            }
            // The relative href is "<2-digit code>.html"; padding it yields the 12-digit code.
            String provinceCode = link.attr("href").replace(HTML_SUFFIX, PROVINCE_CODE_PADDING);
            sysAreas.add(newArea(provinceCode, name, "0", "1", name));

            // Resolve against the page's base URI instead of rewriting the parent URL by
            // string replacement, so a start URL that does not end in "index.html" still works.
            String provinceUrl = link.absUrl("href");
            if (provinceUrl.isEmpty()) {
                continue;
            }
            System.err.println("++++++++++++++++++++++++++开始获取"+ name +"下属市区行政区划信息++++++++++++++++++++++++");
            sysAreas.addAll(getCityAreaCode(provinceUrl, provinceCode, name));
        }
        return sysAreas;
    }

    /**
     * Collects the cities of one province and, recursively, everything below them.
     *
     * @param provinceUrl address of the province page
     * @param parentCode  area code of the province (stored as each city's parent code)
     * @param upAreaName  full name of the province, used as prefix of each city's full name
     * @return cities and their descendants; empty when the page cannot be fetched
     */
    public static List<SysArea> getCityAreaCode(String provinceUrl,String parentCode,String upAreaName){
        List<SysArea> sysAreas = new ArrayList<>();
        Document page = connect(provinceUrl);
        if (page == null) {
            return sysAreas;
        }
        for (Element row : page.select("tr.citytr")) {
            List<String> cells = cellTexts(row);
            if (cells.size() < 2) {
                continue; // malformed row: code or name cell missing
            }
            String code = cells.get(0);
            String name = cells.get(1);
            String fullName = upAreaName + name;
            sysAreas.add(newArea(code, name, parentCode, "2", fullName));

            // To restrict the crawl to a single city, filter on `name` before descending.
            String cityUrl = childPageUrl(row);
            if (cityUrl.isEmpty()) {
                continue; // no link means there is no child page to crawl
            }
            System.err.println("-------------------开始获取"+name+"下属区县行政区划信息-----------------------");
            sysAreas.addAll(getDownAreaCode(cityUrl, code, fullName));
        }
        return sysAreas;
    }

    /**
     * Collects the counties/districts of one city and, recursively, everything below them.
     * The placeholder row named by {@link #SKIPPED_COUNTY_NAME} is ignored.
     *
     * @param cityUrl    address of the city page
     * @param parentCode area code of the city (stored as each county's parent code)
     * @param upAreaName full name of the city, used as prefix of each county's full name
     * @return counties and their descendants; empty when the page cannot be fetched
     */
    public static List<SysArea> getDownAreaCode(String cityUrl,String parentCode,String upAreaName){
        List<SysArea> sysAreas = new ArrayList<>();
        Document page = connect(cityUrl);
        if (page == null) {
            return sysAreas;
        }
        for (Element row : page.select("tr.countytr")) {
            List<String> cells = cellTexts(row);
            if (cells.size() < 2) {
                continue; // malformed row: code or name cell missing
            }
            String code = cells.get(0);
            String name = cells.get(1);
            if (SKIPPED_COUNTY_NAME.equals(name)) {
                continue;
            }
            String fullName = upAreaName + name;
            sysAreas.add(newArea(code, name, parentCode, "3", fullName));

            String downUrl = childPageUrl(row);
            if (downUrl.isEmpty()) {
                continue; // county row without a link: nothing below it to crawl
            }
            System.err.println("====================开始获取"+name+"下属区划信息");
            sysAreas.addAll(getCountryAreaCodeList(downUrl, code, fullName));
        }
        return sysAreas;
    }

    /**
     * Collects the towns/townships of one county and the villages below them.
     *
     * @param downUrl    address of the county page
     * @param parentCode area code of the county (stored as each town's parent code)
     * @param upAreaName full name of the county, used as prefix of each town's full name
     * @return towns and their villages; empty when the page cannot be fetched
     */
    public static List<SysArea> getCountryAreaCodeList(String downUrl,String parentCode,String upAreaName){
        List<SysArea> sysAreas = new ArrayList<>();
        Document page = connect(downUrl);
        if (page == null) {
            return sysAreas;
        }
        for (Element row : page.select("tr.towntr")) {
            List<String> cells = cellTexts(row);
            if (cells.size() < 2) {
                continue; // malformed row: code or name cell missing
            }
            String code = cells.get(0);
            String name = cells.get(1);
            String fullName = upAreaName + name;
            sysAreas.add(newArea(code, name, parentCode, "4", fullName));

            String countryUrl = childPageUrl(row);
            if (countryUrl.isEmpty()) {
                continue; // town row without a link: nothing below it to crawl
            }
            System.err.println("====================开始获取"+name+"下属区划信息");
            sysAreas.addAll(getVillageAreaCodeList(countryUrl, code, fullName));
        }
        return sysAreas;
    }

    /**
     * Collects the villages/communities of one town. Village rows carry three cells;
     * the first is used as the area code and the third as the name (the middle cell is
     * not stored).
     *
     * @param countryUrl address of the town page
     * @param parentCode area code of the town (stored as each village's parent code)
     * @param upAreaName full name of the town, used as prefix of each village's full name
     * @return the villages; empty when the page cannot be fetched
     */
    public static List<SysArea> getVillageAreaCodeList(String countryUrl,String parentCode,String upAreaName){
        List<SysArea> villageAreaCodeList = new ArrayList<>();
        Document page = connect(countryUrl);
        if (page == null) {
            return villageAreaCodeList;
        }
        for (Element row : page.select("tr.villagetr")) {
            List<String> cells = cellTexts(row);
            if (cells.size() < 3) {
                continue; // malformed row: expected code, middle cell and name
            }
            String name = cells.get(2);
            villageAreaCodeList.add(newArea(cells.get(0), name, parentCode, "5", upAreaName + name));
        }
        return villageAreaCodeList;
    }

    /**
     * Returns the text of every {@code <td>} in a table row, in document order. Reading
     * the cells individually (rather than splitting the concatenated row text on spaces)
     * keeps names that contain a space intact and lets callers detect short rows.
     */
    private static List<String> cellTexts(Element row) {
        Elements cells = row.select("td");
        List<String> texts = new ArrayList<>(cells.size());
        for (Element cell : cells) {
            texts.add(cell.text());
        }
        return texts;
    }

    /**
     * Returns the absolute address of the child page linked from a row, or an empty
     * string when the row has no link or the link cannot be resolved.
     */
    private static String childPageUrl(Element row) {
        Element link = row.select("a").first();
        return link == null ? "" : link.absUrl("href");
    }

    /** Builds one record; status and delete flag are always "1" and the id equals the area code. */
    private static SysArea newArea(String code, String name, String parentCode, String level, String fullName) {
        SysArea area = new SysArea();
        area.setAreaCode(code);
        area.setId(area.getAreaCode());
        area.setAreaName(name);
        area.setParentCode(parentCode);
        area.setLevel(level);
        area.setDelFlag("1");
        area.setStatus("1");
        area.setFullName(fullName);
        return area;
    }
}

三、单元测试

// Demonstrates the bulk insert: crawl the 2021 index page, then write every row in one mapper call.
@Test
    public void 爬虫批量写入数据() {
        final String indexPageUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html";

        // Crawl first, and dump the result as JSON so it can be inspected before it is stored.
        final List<SysArea> crawledAreas = javaJsoupService.getProvinces(indexPageUrl);
        final String crawledJson = JSON.toJSONString(crawledAreas);
        System.err.println("爬虫相应数据为:" + crawledJson);

        // Persist everything with a single batch statement and report the affected row count.
        final int insertedRows = surveyCommonMapper.insertAreaInfo(crawledAreas);
        System.err.println("插入数据条数:" + insertedRows);
    }

四、批量插入数据库

<!--
  Test statement: batch-inserts the administrative-division rows collected by the crawler.
  Takes a java.util.List and renders ONE multi-row INSERT into sys_area_20220304: the foreach
  emits one parenthesised value tuple per list element (id, areaCode, areaName, parentCode,
  fullName, level, status, delFlag), joined by commas.
  NOTE(review): with an empty list the foreach renders nothing after "values", which looks
  like it would produce invalid SQL; confirm callers never pass an empty list.
-->
    <insert id="insertAreaInfo" parameterType="java.util.List">
        insert into sys_area_20220304(id, area_code, area_name,parent_code, full_name,level,status,del_flag) values
        <foreach collection="list" item="item" index="index" separator=",">
            (#{item.id},#{item.areaCode},#{item.areaName},#{item.parentCode},#{item.fullName},#{item.level},#{item.status},#{item.delFlag})
        </foreach>
    </insert>
  • 2
    点赞
  • 17
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
您好,可以使用Java的Jsoup库来爬取国家统计局行政区划信息。具体步骤如下:

1. 打开国家统计局行政区划页面:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/
2. 使用Jsoup库获取页面内容,并解析出需要的信息。
3. 遍历解析出的信息,可以将其存储到数据库或者文件中。

以下是示例代码:

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

public class Main {
    public static void main(String[] args) throws IOException {
        // 打开国家统计局行政区划页面
        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/";
        Document doc = Jsoup.connect(url).get();

        // 解析出需要的信息
        Elements provinces = doc.select("tr.provincetr td a");
        for (Element province : provinces) {
            String provinceName = province.text();
            String provinceUrl = url + province.attr("href");
            System.out.println(provinceName + " " + provinceUrl);

            Document provinceDoc = Jsoup.connect(provinceUrl).get();
            Elements cities = provinceDoc.select("tr.citytr td a");
            for (Element city : cities) {
                String cityName = city.text();
                String cityUrl = url + city.attr("href");
                System.out.println("\t" + cityName + " " + cityUrl);

                Document cityDoc = Jsoup.connect(cityUrl).get();
                Elements counties = cityDoc.select("tr.countytr td a");
                for (Element county : counties) {
                    String countyName = county.text();
                    String countyUrl = url + county.attr("href");
                    System.out.println("\t\t" + countyName + " " + countyUrl);
                }
            }
        }
    }
}
```

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值