java爬取省市区数据(仅供学习)

仅供学习、禁止恶意攻击!

一、效果

链接:http://preview.www.mca.gov.cn/article/sj/xzqh/2020/2020/202101041104.html

 

获取数据:

二、引入依赖,主要是jsoup(下文代码还使用了 lombok 和 commons-lang3,需另行引入)

        <!-- jsoup: fetches the page and parses its HTML -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>
        <!-- fastjson: converts the collected region list to JSON for printing -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.68</version>
        </dependency>

 

三、定义数据对象

package com.yarm.test.model;

import lombok.Data;

/**
 * One administrative region (province, city, or district/county) together with
 * the code of the region it belongs to.
 *
 * <p>Accessors are not written by hand; the class relies on Lombok's {@code @Data}.
 *
 * Created by IntelliJ IDEA.
 * User: niki.yang
 * Date: 2021/1/13
 */
@Data
public class ChinaRegionsInfo {
    /**
     * Administrative region code.
     */
    private String code;

    /**
     * Administrative region name.
     */
    private String name;

    /**
     * Administrative region type: 1 = province, 2 = city, 3 = district or county.
     */
    private Integer type;

    /**
     * Code of the parent (next level up) administrative region.
     */
    private String parentCode;
}

四、执行爬取逻辑

package com.yarm.test;

import com.alibaba.fastjson.JSONObject;
import com.yarm.test.model.ChinaRegionsInfo;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

/**
 * Downloads the administrative-division table published at {@link #URL}, turns every
 * data row into a {@link ChinaRegionsInfo} (province, city, or district/county) and
 * prints the resulting list as JSON on standard output.
 *
 * <p>The row classification depends on the markup of that specific page (a CSS class on
 * the code cell and a {@code <span>} inside the name cell); if the page layout changes,
 * the constants below have to be revisited.
 *
 * Created by IntelliJ IDEA.
 * User: niki.yang
 * Date: 2021/1/13
 */
public class GetReginData {
    // Page to scrape.
    // Alternative source page (disabled):
//    private static final String URL = "http://www.mca.gov.cn//article/sj/xzqh/2020/202006/202008310601.shtml";
    private static final String URL = "http://preview.www.mca.gov.cn/article/sj/xzqh/2020/2020/202101041104.html";

    /** Index of the first table row that is inspected; earlier rows are skipped (presumably title/header rows). */
    private static final int FIRST_DATA_ROW = 3;

    /** CSS class carried by the code cell of province and city rows; other rows are treated as districts/counties. */
    private static final String PROVINCE_OR_CITY_CLASS = "xl7030796";

    /** Values stored in {@link ChinaRegionsInfo#setType(Integer)}. */
    private static final int TYPE_PROVINCE = 1;
    private static final int TYPE_CITY = 2;
    private static final int TYPE_DISTRICT = 3;

    /**
     * Number of leading code characters a district must share with a city to be filed under it.
     * NOTE(review): assumes six-digit GB/T 2260 style codes whose first four digits identify
     * the city — confirm against the published table.
     */
    private static final int CITY_PREFIX_LENGTH = 4;

    /**
     * Fetches the page, extracts all regions and prints them as JSON.
     *
     * @param args unused
     * @throws IOException if the page cannot be downloaded
     */
    public static void main(String[] args) throws IOException {
        Document document = Jsoup.connect(URL).get();
        List<ChinaRegionsInfo> regionsInfoList = parseRegions(document);
        System.out.println(JSONObject.toJSON(regionsInfoList));
    }

    /**
     * Walks the rows of the first {@code <tbody>} in the document and builds the region list.
     *
     * @param document parsed HTML page
     * @return regions in page order; empty when the page has no {@code <tbody>}
     */
    private static List<ChinaRegionsInfo> parseRegions(Document document) {
        List<ChinaRegionsInfo> regions = new ArrayList<>();

        // first() yields null on an empty result instead of throwing like get(0).
        Element tableBody = document.getElementsByTag("tbody").first();
        if (tableBody == null) {
            return regions;
        }

        String provinceCode = ""; // code of the most recent province row
        String cityCode = "";     // code of the most recent city row within that province
        Elements rows = tableBody.getElementsByTag("tr");
        for (int i = FIRST_DATA_ROW; i < rows.size(); i++) {
            Elements cells = rows.get(i).getElementsByTag("td");
            if (cells.size() < 3) {
                continue;
            }
            Element codeCell = cells.get(1);
            Element nameCell = cells.get(2);
            String code = codeCell.text();
            if (StringUtils.isEmpty(code)) {
                continue;
            }
            String name = nameCell.text();

            if (!codeCell.classNames().contains(PROVINCE_OR_CITY_CLASS)) {
                // District or county.
                String parentCode = resolveDistrictParent(code, cityCode, provinceCode);
                regions.add(newRegion(code, name, TYPE_DISTRICT, parentCode));
            } else if (!nameCell.getElementsByTag("span").isEmpty()) {
                // City: its name cell contains a <span> element. Looking for the element
                // (rather than the substring "span" in the cell's HTML) avoids false hits
                // on attributes such as colspan/rowspan.
                regions.add(newRegion(code, name, TYPE_CITY, provinceCode));
                cityCode = code;
            } else {
                // Province.
                regions.add(newRegion(code, name, TYPE_PROVINCE, ""));
                provinceCode = code;
                // A new province starts: forget the previous province's last city so that
                // districts listed directly under this province are not attached to it.
                cityCode = "";
            }
        }
        return regions;
    }

    /**
     * Picks the parent of a district/county row: the current city when the district's code
     * shares the city's leading {@link #CITY_PREFIX_LENGTH} characters, otherwise the province.
     *
     * @param code         district/county code
     * @param cityCode     code of the most recent city row, or empty if none in this province
     * @param provinceCode code of the most recent province row
     * @return the code to store as the district's parent
     */
    private static String resolveDistrictParent(String code, String cityCode, String provinceCode) {
        boolean belongsToCity = StringUtils.isNotEmpty(cityCode)
                && code.length() >= CITY_PREFIX_LENGTH
                && cityCode.length() >= CITY_PREFIX_LENGTH
                && code.regionMatches(0, cityCode, 0, CITY_PREFIX_LENGTH);
        return belongsToCity ? cityCode : provinceCode;
    }

    /** Builds a populated {@link ChinaRegionsInfo}. */
    private static ChinaRegionsInfo newRegion(String code, String name, int type, String parentCode) {
        ChinaRegionsInfo region = new ChinaRegionsInfo();
        region.setCode(code);
        region.setName(name);
        region.setType(type);
        region.setParentCode(parentCode);
        return region;
    }

}

五、运行结果

 

  • 3
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值