Java 爬取2023年国家统计局的数据,更新全国的省市区地址和编码

2023年国家统计局: http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/
每年会更新最新的区域信息

一、准备 MySQL 表结构(可根据自己的需求修改)

-- Administrative-area table that receives the crawled rows; adapt it to your own needs.
CREATE TABLE `sys_area_new` (
  `id` char(32) NOT NULL DEFAULT '' COMMENT 'ID',
  `is_deleted` bit(1) DEFAULT NULL COMMENT '已删除',
  `create_date` bigint(20) DEFAULT NULL COMMENT '创建日期',
  `edit_date` bigint(20) DEFAULT NULL COMMENT '编辑日期',
  `creator` varchar(100) DEFAULT NULL COMMENT '创建者',
  `editor` varchar(100) DEFAULT NULL COMMENT '编辑者',
  -- Stores the cascade path ("/province/city/county/town/village"). A village-level
  -- path is 44 characters long, so the former char(32) was too short for deep crawls.
  `parent` varchar(100) DEFAULT NULL COMMENT '级联关系',
  `level` varchar(20) DEFAULT NULL COMMENT '层级',
  `code` varchar(50) DEFAULT NULL COMMENT '编号',
  `name` varchar(100) DEFAULT NULL COMMENT '名称',
  `full_name` varchar(100) DEFAULT NULL COMMENT '全称',
  `is_leaf` bit(1) DEFAULT NULL COMMENT '叶子节点',
  `parentcode` varchar(50) DEFAULT NULL COMMENT '父级地区编码',
  -- bit(1) can only hold 0 or 1, so the "void" state is documented as 0 (it used to say 2).
  `yn` bit(1) DEFAULT b'1' COMMENT '状态:1启用 0已作废',
  `remark` varchar(255) DEFAULT NULL COMMENT '备注',
  PRIMARY KEY (`id`) USING BTREE,
  KEY `idx_sys_area` (`parent`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC COMMENT='地区';

二、引入pom依赖

<dependencies>
        <!-- WebMagic crawler core -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
<!--           Check whether your project conflicts with any of the transitive dependencies below; -->
<!--           if it does, enable the matching exclusions. -->
<!--            <exclusions>-->
<!--                <exclusion>-->
<!--                    <groupId>org.slf4j</groupId>-->
<!--                    <artifactId>slf4j-api</artifactId>-->
<!--                </exclusion>-->
<!--                <exclusion>-->
<!--                    <groupId>org.slf4j</groupId>-->
<!--                    <artifactId>slf4j-reload4j</artifactId>-->
<!--                </exclusion>-->
<!--                <exclusion>-->
<!--                    <groupId>org.slf4j</groupId>-->
<!--                    <artifactId>slf4j-log4j12</artifactId>-->
<!--                </exclusion>-->
<!--                <exclusion>-->
<!--                    <groupId>org.apache.commons</groupId>-->
<!--                    <artifactId>commons-lang3</artifactId>-->
<!--                </exclusion>-->
<!--                <exclusion>-->
<!--                    <groupId>commons-io</groupId>-->
<!--                    <artifactId>commons-io</artifactId>-->
<!--                </exclusion>-->
<!--                <exclusion>-->
<!--                    <groupId>org.apache.httpcomponents</groupId>-->
<!--                    <artifactId>httpclient</artifactId>-->
<!--                </exclusion>-->
<!--                <exclusion>-->
<!--                    <groupId>org.jsoup</groupId>-->
<!--                    <artifactId>jsoup</artifactId>-->
<!--                </exclusion>-->
<!--                <exclusion>-->
<!--                    <groupId>com.alibaba</groupId>-->
<!--                    <artifactId>fastjson</artifactId>-->
<!--                </exclusion>-->
<!--                <exclusion>-->
<!--                    <groupId>commons-collections</groupId>-->
<!--                    <artifactId>commons-collections</artifactId>-->
<!--                </exclusion>-->
<!--            </exclusions>-->
        </dependency>

        <!-- WebMagic extension module -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
            <!-- Enable to exclude the Redis (jedis) dependency -->
<!--            <exclusions>-->
<!--                <exclusion>-->
<!--                    <groupId>redis.clients</groupId>-->
<!--                    <artifactId>jedis</artifactId>-->
<!--                </exclusion>-->
<!--            </exclusions>-->
        </dependency>
        <!-- fastjson: 1.2.83 is the 1.x release carrying the autoType deserialization fixes; -->
        <!-- the previously pinned 1.2.9 predates them. -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.83</version>
        </dependency>
        <!-- lombok: only needed at compile time (annotation processing), hence "provided" -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.8</version>
            <scope>provided</scope>
        </dependency>
        <!-- spring-context: supplies the @Component annotation used by SqlPipeline -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context</artifactId>
            <version>4.3.20.RELEASE</version>
        </dependency>

        <!-- MySQL JDBC driver (5.x line, driver class com.mysql.jdbc.Driver) -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.34</version>
        </dependency>


    </dependencies>

三、Java 代码实现

1、AreaSpider.java 实现 PageProcessor 接口

package org.example;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import static java.util.regex.Pattern.compile;

/**
 * WebMagic {@link PageProcessor} that crawls the administrative-division pages under
 * {@link #SPIDER_URL}, level by level (province, city, county, town, village).
 *
 * <p>Every processed page publishes a {@code List<Map<String, Object>>} under the result
 * field {@code "area"}. Each map carries the keys {@code C_NAME}, {@code C_CODE},
 * {@code C_LEVEL}, {@code C_CASCADE}, {@code C_FULL_NAME}, {@code C_PARENT_CODE} and
 * {@code C_YEAR}.
 */
public class AreaSpider implements PageProcessor {

    /** Common URL prefix shared by every crawled page. */
    private static final String SPIDER_URL = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/";

    /** Entry page that lists the provinces. */
    private static final String INDEX_URL = SPIDER_URL + "index.html";

    /** Matches text made of digits only (including the empty string); compiled once, used to skip code cells. */
    private static final java.util.regex.Pattern DIGITS_ONLY = compile("[0-9]*");

    /** Crawler settings: browser-like user agent, 60 s timeout, UTF-8, 10 retries, 100 ms sleep between requests. */
    private final Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36")
            .setTimeOut(1000 * 60)
            .setCharset("UTF-8")
            .setRetryTimes(10)
            .setSleepTime(100);

    /**
     * Year marker written into every record ({@code C_YEAR}); used to tell crawl batches apart.
     */
    private final String year;

    /**
     * Names of the levels below province that should be crawled.
     * Example: {@code "city,county"} crawls from province level down to county level.
     * Levels are looked up with {@link String#contains(CharSequence)}.
     */
    private final String contain;

    /**
     * @param year    batch marker stored with every record
     * @param contain level names to crawl below province level, e.g. {@code "city,county"}
     */
    public AreaSpider(String year, String contain) {
        this.year = year;
        this.contain = contain;
    }

    /**
     * Decides from the URL depth which level the page belongs to, extracts its records into
     * the {@code "area"} result field and queues the child pages when the next level is enabled.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        page.setCharset("UTF-8");
        String url = page.getUrl().toString();
        // Path below SPIDER_URL without the ".html" suffix; its segment count tells the level.
        String[] pathParts = url.replace(SPIDER_URL, "").replace(".html", "").split("/");

        // Plain equality: the URL must not be interpreted as a regular expression.
        if (INDEX_URL.equals(url)) {
            page.putField("area", extractProvinces(page));
        }
        if (pathParts.length == 1 && !pathParts[0].equals("index") && contain.contains("city")) {
            page.putField("area", extractCities(page));
        }
        if (pathParts.length == 2 && contain.contains("county")) {
            page.putField("area", extractCounties(page));
        }
        if (pathParts.length == 3 && contain.contains("town")) {
            page.putField("area", extractTowns(page));
        }
        if (pathParts.length == 4 && contain.contains("village")) {
            page.putField("area", extractVillages(page));
        }
    }

    /** Builds one record map; {@code C_FULL_NAME} is filled with the plain name. */
    private Map<String, Object> buildArea(String name, String code, String level, String cascade, Long parentCode) {
        Map<String, Object> area = new HashMap<>();
        area.put("C_NAME", name);
        area.put("C_CODE", code);
        area.put("C_LEVEL", level);
        area.put("C_CASCADE", cascade);
        area.put("C_FULL_NAME", name);
        area.put("C_PARENT_CODE", parentCode);
        area.put("C_YEAR", year);
        return area;
    }

    /** Extracts the province records from the index page and queues the city pages if enabled. */
    private List<Map<String, Object>> extractProvinces(Page page) {
        return page.getHtml().xpath("//tr[@class='provincetr']/td").nodes().stream()
                // Only cells that actually carry a link describe a province.
                .filter(cell -> !cell.links().all().isEmpty())
                .map(cell -> {
                    String name = cell.xpath("//a/text()").toString();
                    String link = cell.links().all().get(0);
                    if (contain.contains("city")) {
                        page.addTargetRequest(link);
                    }
                    // The link file name is the 2-character province prefix, e.g. "11" -> "110000".
                    String areaCode = link.replace(SPIDER_URL, "").replace(".html", "") + "0000";
                    return buildArea(name, areaCode, "province", "/", 0L);
                })
                .collect(Collectors.toList());
    }

    /** Extracts the city records from a province page and queues the county pages if enabled. */
    private List<Map<String, Object>> extractCities(Page page) {
        List<Map<String, Object>> cities = new ArrayList<>();
        for (Selectable cell : page.getHtml().xpath("//tr[@class='citytr']/td").nodes()) {
            String name = cell.xpath("//a/text()").toString();
            // Skip cells without link text (would otherwise raise a NullPointerException)
            // and cells whose text is purely numeric, i.e. the code column.
            if (name == null || DIGITS_ONLY.matcher(name).matches()) {
                continue;
            }
            List<String> links = cell.links().all();
            if (links.isEmpty()) {
                continue;
            }
            String link = links.get(0);
            if (contain.contains("county")) {
                page.addTargetRequest(link);
            }
            String[] linkParts = link.replace(SPIDER_URL, "").replace(".html", "").split("/");
            String parentId = linkParts[0] + "0000";
            String areaCode = linkParts[linkParts.length - 1] + "00";
            cities.add(buildArea(name, areaCode, "city",
                    "/" + parentId + "/" + areaCode, Long.valueOf(parentId)));
        }
        return cities;
    }

    /**
     * Reads the code and the name from a (code cell, name cell) pair and queues the linked
     * child page when {@code childLevel} is enabled.
     *
     * @return a two-element array: {@code [code, name]}
     */
    private String[] readCodeAndName(Page page, Selectable codeCell, Selectable nameCell, String childLevel) {
        List<String> codeTexts = codeCell.xpath("//*/text()").all();
        List<String> nameTexts = nameCell.xpath("//*/text()").all();
        String areaCode = codeTexts.get(0);
        String areaName = nameTexts.get(0);
        // More than one text entry: presumably the cell wraps its text in a link, so the
        // second entry holds the value and the cell has a child page to follow.
        if (codeTexts.size() > 1) {
            areaCode = codeTexts.get(1);
            areaName = nameTexts.get(1);
            if (contain.contains(childLevel)) {
                page.addTargetRequest(codeCell.links().all().get(0));
            }
        }
        return new String[] {areaCode, areaName};
    }

    /** Extracts the county records from a city page and queues the town pages if enabled. */
    private List<Map<String, Object>> extractCounties(Page page) {
        List<Map<String, Object>> counties = new ArrayList<>();
        List<Selectable> cells = page.getHtml().xpath("//tr[@class='countytr']/td").nodes();
        // Some cities administer towns directly; their rows use the "towntr" class instead.
        // Evaluated once here instead of re-running the XPath query on every loop iteration.
        boolean townRowsOnly = cells.isEmpty();
        if (townRowsOnly) {
            cells = page.getHtml().xpath("//tr[@class='towntr']/td").nodes().stream()
                    .filter(cell -> !cell.links().all().isEmpty())
                    .collect(Collectors.toList());
        }
        // Regular counties keep 6 characters of the code, directly administered towns keep 9.
        int codeLength = townRowsOnly ? 9 : 6;

        // Cells come in (code, name) pairs; the bound protects against a dangling last cell.
        for (int i = 0; i + 1 < cells.size(); i += 2) {
            String[] codeAndName = readCodeAndName(page, cells.get(i), cells.get(i + 1), "town");
            String areaCode = codeAndName[0].substring(0, codeLength);
            String parentId = areaCode.substring(0, 4) + "00";
            counties.add(buildArea(codeAndName[1], areaCode, "county",
                    "/" + areaCode.substring(0, 2) + "0000/" + parentId + "/" + areaCode,
                    Long.valueOf(parentId)));
        }
        return counties;
    }

    /** Extracts the town records from a county page and queues the village pages if enabled. */
    private List<Map<String, Object>> extractTowns(Page page) {
        List<Map<String, Object>> towns = new ArrayList<>();
        List<Selectable> cells = page.getHtml().xpath("//tr[@class='towntr']/td").nodes();
        // Cells come in (code, name) pairs.
        for (int i = 0; i + 1 < cells.size(); i += 2) {
            String[] codeAndName = readCodeAndName(page, cells.get(i), cells.get(i + 1), "village");
            String areaCode = codeAndName[0].substring(0, 9);
            String parentId = areaCode.substring(0, 6);
            towns.add(buildArea(codeAndName[1], areaCode, "town",
                    "/" + areaCode.substring(0, 2) + "0000/" + areaCode.substring(0, 4) + "00/"
                            + parentId + "/" + areaCode,
                    Long.valueOf(parentId)));
        }
        return towns;
    }

    /** Extracts the village records from a town page; villages are the last level, nothing is queued. */
    private List<Map<String, Object>> extractVillages(Page page) {
        List<Map<String, Object>> villages = new ArrayList<>();
        List<Selectable> cells = page.getHtml().xpath("//tr[@class='villagetr']/td").nodes();
        // Cells come in triples; the first is read as the code and the third as the name.
        for (int i = 0; i + 2 < cells.size(); i += 3) {
            String areaCode = cells.get(i).xpath("//*/text()").get();
            String areaName = cells.get(i + 2).xpath("//*/text()").get();
            String parentId = areaCode.substring(0, 9);
            villages.add(buildArea(areaName, areaCode, "village",
                    "/" + areaCode.substring(0, 2) + "0000/" + areaCode.substring(0, 4) + "00/"
                            + areaCode.substring(0, 6) + "/" + parentId + "/" + areaCode,
                    Long.valueOf(parentId)));
        }
        return villages;
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Starts a 16-thread crawl from the index page and persists the records through {@code SqlPipeline}. */
    public static void main(String[] args) {
        Spider.create(new AreaSpider("2022", "city, county")).addUrl(INDEX_URL)
                .addPipeline(new SqlPipeline()).thread(16).run();
    }
}

2、新建 SqlPipeline 类,实现 Pipeline 接口,将爬取到的数据固化到数据库(当然也可以输出到 excel 文件等)

package org.example;

import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Date;
import java.util.List;
import java.util.Map;

/**
 * WebMagic {@link Pipeline} that writes the records found under the {@code "area"} result
 * field into the {@code sys_area_new} table through a single shared JDBC connection.
 */
@Component
@Slf4j
public class SqlPipeline implements Pipeline {
    // Demo connection settings; replace them with your own before running.
    static String driver = "com.mysql.jdbc.Driver";
    static String url = "jdbc:mysql://localhost:3306/test_area?characterEncoding=utf8&useSSL=false&serverTimezone=Asia/Shanghai";
    static String username = "root";
    static String password = "root";
    static Connection conn = null;

    /**
     * Insert with an explicit column list, so it keeps working if the table's column order
     * changes. {@code is_deleted} is fixed to 0 and {@code yn} to 1; the 13 placeholders are
     * bound in {@link #process(ResultItems, Task)}.
     */
    private static final String INSERT_SQL = "insert into sys_area_new "
            + "(`id`, `is_deleted`, `create_date`, `edit_date`, `creator`, `editor`, `parent`, `level`, "
            + "`code`, `name`, `full_name`, `is_leaf`, `parentcode`, `yn`, `remark`) "
            + "VALUES (?, 0, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 1, ?)";

    static {
        try {
            Class.forName(driver); // load the JDBC driver class
            conn = DriverManager.getConnection(url, username, password);
        } catch (ClassNotFoundException | SQLException e) {
            // conn stays null; process() reports this instead of failing with a NullPointerException
            e.printStackTrace();
        }
    }

    /** Returns the value's string form, or {@code null} for a missing value (stored as SQL NULL). */
    private static String text(Object value) {
        return value == null ? null : value.toString();
    }

    /**
     * Inserts every record of the page into {@code sys_area_new}. A row that fails to insert
     * is reported on standard output and the remaining rows are still processed.
     *
     * @param resultItems results of one page; the {@code "area"} field holds the record maps
     * @param task        the running crawl task (not used)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<Map<String, Object>> area = resultItems.get("area");
        // The field is absent when the page matched no extraction stage.
        if (area == null || area.isEmpty()) {
            System.out.println(resultItems.getRequest().getUrl() + " 此页面未爬取数据,请稍后重试!");
            return;
        }
        System.out.println("地区总数:" + area.size());
        if (conn == null) {
            System.out.println("插入数据库错误:数据库连接未建立");
            return;
        }

        // One statement per page, closed automatically; previously one was leaked per row.
        try (PreparedStatement preparedStatement = conn.prepareStatement(INSERT_SQL)) {
            for (Map<String, Object> row : area) {
                System.out.println(row);
                try {
                    long now = System.currentTimeMillis();
                    preparedStatement.setString(1, UUIDUtils.generate());      // id
                    preparedStatement.setLong(2, now);                         // create_date (bigint)
                    preparedStatement.setLong(3, now);                         // edit_date (bigint)
                    preparedStatement.setString(4, "");                        // creator
                    preparedStatement.setString(5, "");                        // editor
                    preparedStatement.setString(6, text(row.get("C_CASCADE")));      // parent: cascade path
                    preparedStatement.setString(7, text(row.get("C_LEVEL")));        // level: province, city, county...
                    preparedStatement.setString(8, text(row.get("C_CODE")));         // code
                    preparedStatement.setString(9, text(row.get("C_NAME")));         // name
                    preparedStatement.setString(10, text(row.get("C_FULL_NAME")));   // full_name
                    preparedStatement.setNull(11, java.sql.Types.BIT);               // is_leaf: not determined here
                    preparedStatement.setString(12, text(row.get("C_PARENT_CODE"))); // parentcode
                    preparedStatement.setString(13, text(row.get("C_YEAR")));        // remark: batch year

                    preparedStatement.executeUpdate();
                } catch (SQLException e) {
                    System.out.println("插入数据库错误:" + e);
                }
            }
        } catch (SQLException e) {
            System.out.println("插入数据库错误:" + e);
        }
    }
}

四、完整代码

码云地址:https://gitee.com/tsui2021/area-demo.git

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

来lai

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值