仅供学习、禁止恶意攻击!
一、效果
链接:http://preview.www.mca.gov.cn/article/sj/xzqh/2020/2020/202101041104.html
获取数据:
二、引入依赖,主要是jsoup(解析HTML)和fastjson(输出JSON);示例代码还用到了lombok和commons-lang3,需另行引入
<!-- HTML parsing (jsoup): used by the crawler to fetch the page and walk its table -->
<!-- NOTE(review): the sample code also imports lombok and commons-lang3, which are not declared in this snippet -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.2</version>
</dependency>
<!-- JSON serialization (fastjson): used by the crawler to print the collected list -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.68</version>
</dependency>
三、定义数据对象
package com.yarm.test.model;
import lombok.Data;
/**
* Plain data holder describing one administrative region (province, city, or district/county).
* The class declares fields only; accessors come from Lombok's {@code @Data}.
*
* Created by IntelliJ IDEA.
* User: niki.yang
* Date: 2021/1/13
*/
@Data
public class ChinaRegionsInfo {
/**
* Administrative region code.
*/
private String code;
/**
* Administrative region name.
*/
private String name;
/**
* Administrative region type: 1 = province, 2 = city, 3 = district or county.
*/
private Integer type;
/**
* Code of the parent (next level up) administrative region.
*/
private String parentCode;
}
四、执行爬取逻辑
package com.yarm.test;
import com.alibaba.fastjson.JSONObject;
import com.yarm.test.model.ChinaRegionsInfo;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
/**
 * Crawls the administrative-division table published at {@code URL}, converts every data row
 * into a {@link ChinaRegionsInfo} and prints the resulting list as JSON on standard output.
 *
 * <p>Row classification relies on the markup of the source page: province and city rows carry
 * the CSS class {@code xl7030796} on their code cell, and city names are additionally wrapped
 * with a {@code <span>} element; every other non-empty row is treated as a district/county.
 *
 * Created by IntelliJ IDEA.
 * User: niki.yang
 * Date: 2021/1/13
 */
public class GetReginData {
    // Address of the page to crawl.
    // Earlier edition of the same table:
    // private static final String URL = "http://www.mca.gov.cn//article/sj/xzqh/2020/202006/202008310601.shtml";
    private static final String URL = "http://preview.www.mca.gov.cn/article/sj/xzqh/2020/2020/202101041104.html";

    /** Number of leading table rows that are skipped before data rows are read. */
    private static final int HEADER_ROW_COUNT = 3;

    /** CSS class found on the code cell of province and city rows. */
    private static final String PROVINCE_OR_CITY_CELL_CLASS = "xl7030796";

    /** Values stored in {@code ChinaRegionsInfo.type}. */
    private static final int TYPE_PROVINCE = 1;
    private static final int TYPE_CITY = 2;
    private static final int TYPE_DISTRICT = 3;

    /**
     * Downloads the page, parses the region table and prints the regions as JSON.
     *
     * @param args unused
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        // Fetch the page.
        Document document = Jsoup.connect(URL).get();
        List<ChinaRegionsInfo> regionsInfoList = parseRegions(document);
        // Print the result.
        System.out.println(JSONObject.toJSON(regionsInfoList));
    }

    /**
     * Extracts all regions from the first {@code <tbody>} of the document.
     *
     * @param document parsed HTML page
     * @return regions in page order; empty when the page contains no {@code <tbody>}
     */
    private static List<ChinaRegionsInfo> parseRegions(Document document) {
        List<ChinaRegionsInfo> regions = new ArrayList<>();

        // first() yields null on an empty selection, whereas get(0) would throw
        // IndexOutOfBoundsException and make any null check unreachable.
        Element tableBody = document.getElementsByTag("tbody").first();
        if (tableBody == null) {
            System.err.println("No <tbody> element found at " + URL + "; nothing to parse.");
            return regions;
        }

        String provinceCode = ""; // code of the province currently being read
        String cityCode = "";     // code of the city currently being read, "" if none yet

        Elements rows = tableBody.getElementsByTag("tr");
        for (int i = HEADER_ROW_COUNT; i < rows.size(); i++) {
            Elements cells = rows.get(i).getElementsByTag("td");
            if (cells.size() < 3) {
                continue;
            }
            Element codeCell = cells.get(1); // administrative region code
            Element nameCell = cells.get(2); // administrative region name
            String code = codeCell.text();
            if (StringUtils.isEmpty(code)) {
                continue;
            }
            String name = nameCell.text();

            if (codeCell.hasClass(PROVINCE_OR_CITY_CELL_CLASS)) {
                // Look for a real <span> element rather than the substring "span" in the
                // serialized HTML, which would also match attributes such as "colspan".
                boolean isCity = !nameCell.getElementsByTag("span").isEmpty();
                if (isCity) {
                    regions.add(newRegion(code, name, TYPE_CITY, provinceCode));
                    cityCode = code;
                } else {
                    regions.add(newRegion(code, name, TYPE_PROVINCE, ""));
                    provinceCode = code;
                    // A new province starts: forget the previous province's last city so that
                    // districts listed directly under this province are not attached to it.
                    cityCode = "";
                }
            } else {
                // District or county: parent is the current city, or the province when the
                // province lists its districts without an intermediate city row.
                String parentCode = StringUtils.isNotEmpty(cityCode) ? cityCode : provinceCode;
                regions.add(newRegion(code, name, TYPE_DISTRICT, parentCode));
            }
        }
        return regions;
    }

    /** Builds a populated {@link ChinaRegionsInfo}. */
    private static ChinaRegionsInfo newRegion(String code, String name, int type, String parentCode) {
        ChinaRegionsInfo region = new ChinaRegionsInfo();
        region.setCode(code);
        region.setName(name);
        region.setType(type);
        region.setParentCode(parentCode);
        return region;
    }
}
五、运行结果