统计局链接:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/index.html
2022年10月亲测可用。
效果演示:
控制台输出:
存入数据库:
一、爬取准备工作
- 本次爬取要用到的工具为Jsoup,引入Jsoup pom依赖:
<!-- jsoup: HTML parsing library used by the crawler below -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
- 实体类:使用实体类接收,后续根据各自需求自行存入数据库或输出文件。
属性 | 注释 |
---|---|
id | 本级代码 |
pid | 父级代码(省的父级代码为0) |
name | 名称 |
treeLevel | 层级 |
leaf | 是否为叶子节点(0:否 1:是) |
sort | 排序 |
/**
 * One administrative region record produced by the crawler.
 *
 * <p>Accessors, {@code equals}/{@code hashCode} and {@code toString} are generated by Lombok;
 * setters are chainable because of {@code @Accessors(chain = true)}.
 */
@Data
@EqualsAndHashCode(callSuper = false)
@Accessors(chain = true)
public class SysRegion implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Code of this region. */
    private Long id;

    /** Code of the parent region; a province uses 0. */
    private Long pid;

    /** Display name of the region. */
    private String name;

    /** Depth of the region in the hierarchy. */
    private Integer treeLevel;

    /** Leaf flag: 0 = not a leaf, 1 = leaf. */
    private Integer leaf;

    /** Ordering value of the region. */
    private Long sort;
}
PS:数据库建表语句,如果后续有需求可以插入库中。
-- Administrative region table; one row per region at any level of the hierarchy.
-- The `name` column no longer overrides the character set: it now inherits the
-- table default (utf8mb4), so names containing 4-byte characters can be stored.
CREATE TABLE `sys_region` (
  `id` bigint(20) NOT NULL COMMENT 'id',
  `pid` bigint(20) DEFAULT NULL COMMENT '上级ID,一级为0',
  `name` varchar(100) DEFAULT NULL COMMENT '名称',
  `tree_level` tinyint(4) DEFAULT NULL COMMENT '层级',
  `leaf` tinyint(4) DEFAULT NULL COMMENT '是否叶子节点 0:否 1:是',
  `sort` bigint(20) DEFAULT NULL COMMENT '排序',
  PRIMARY KEY (`id`) USING BTREE,
  -- Secondary indexes for looking up children of a parent and all rows of one level.
  KEY `pid_idx` (`pid`),
  KEY `tree_level_idx` (`tree_level`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC COMMENT='行政区域';
二、代码
前置条件:
代码粘贴即用,使用前要先修改需要爬取的省份,如果爬取全国所有省份,使用ArrayList的无参构造方法即可。
// 爬取“北京市”行政区划信息
private static final List<String> PROVINCE_LIST = new ArrayList<>(Arrays.asList("北京市"));
// 爬取全国所有省份信息
private static final List<String> PROVINCE_LIST = new ArrayList<>();
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Crawls the administrative division code pages of the National Bureau of Statistics and
 * collects every region found as a {@code SysRegion}, walking five page levels:
 * province, city, district, street and community.
 *
 * <p>Each region is printed to standard output and appended to an in-memory result list.
 * The {@code leaf} flag is 0 for every level that this crawler descends from and 1 only for
 * communities, which are the last level crawled.
 *
 * <p>Originally written by Mark Haktiong on 2022/10/23.
 */
public class RegionCodeCrawling {

    /**
     * Root URL of the 2020 edition of the division code pages; every page URL is built from it.
     */
    private static final String LINK = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/";

    /**
     * Names of the provinces to crawl. Fill this in before running; separate several names
     * with commas. An empty list means that every province is crawled.
     */
    private static final List<String> PROVINCE_LIST = new ArrayList<>(Arrays.asList("北京市"));

    /**
     * Accumulates every region found during the crawl.
     */
    private static final List<SysRegion> REGION_RESULT_LIST = new ArrayList<>();

    /**
     * How many times a single page is requested before the crawl gives up.
     */
    private static final int MAX_ATTEMPTS = 3;

    /**
     * Pause between two attempts for the same page, in milliseconds.
     */
    private static final long RETRY_DELAY_MILLIS = 1000L;

    /**
     * Entry point: starts the crawl at the province level (tree level 1).
     *
     * @param args unused
     * @throws IOException if a page still cannot be fetched after all attempts
     */
    public static void main(String[] args) throws IOException {
        int treeLevel = 1;
        getProvince(treeLevel);
    }

    /**
     * Crawls the province index page and, for each selected province, its cities.
     *
     * <pre>
     * &lt;tr class="provincetr"&gt;
     *   &lt;td&gt;
     *     &lt;a href="11.html"&gt;北京市&lt;br&gt;&lt;/a&gt;
     *   &lt;/td&gt;
     * &lt;/tr&gt;
     * </pre>
     *
     * @param treeLevel tree level assigned to provinces
     * @throws IOException if a page cannot be fetched
     */
    private static void getProvince(Integer treeLevel) throws IOException {
        Document document = fetch(LINK);
        // Declared as long because SysRegion.sort is a Long; an int cannot be boxed to Long.
        long sort = 0;
        for (Element provincetr : document.select(".provincetr")) {
            for (Element td : provincetr.children()) {
                Elements a = td.select("a");
                String href = a.attr("href"); // e.g. 11.html
                String name = a.text();       // e.g. 浙江省
                if (href.isEmpty()) {
                    // Cell without a link: nothing to crawl here.
                    continue;
                }
                if (!PROVINCE_LIST.isEmpty() && !PROVINCE_LIST.contains(name)) {
                    // Province was not requested.
                    continue;
                }
                // The province code is the file name of the link without its extension.
                Long code = Long.valueOf(href.split("\\.")[0]);
                addRegion(code, 0L, name, treeLevel, 0, ++sort);
                getCity(href, treeLevel, code);
            }
        }
    }

    /**
     * Crawls the cities of one province and, for each city, its districts.
     *
     * <pre>
     * &lt;tr class="citytr"&gt;
     *   &lt;td&gt;&lt;a href="33/3309.html"&gt;330900000000&lt;/a&gt;&lt;/td&gt;
     *   &lt;td&gt;&lt;a href="33/3309.html"&gt;舟山市&lt;/a&gt;&lt;/td&gt;
     * &lt;/tr&gt;
     * </pre>
     *
     * @param cityHref     link of the province page, relative to {@link #LINK}
     * @param treeLevel    tree level of the parent province
     * @param provinceCode code of the parent province
     * @throws IOException if a page cannot be fetched
     */
    private static void getCity(String cityHref, Integer treeLevel, Long provinceCode) throws IOException {
        int level = treeLevel + 1;
        Document document = fetch(LINK + cityHref);
        long sort = 0;
        for (Element citytr : document.select(".citytr")) {
            Element codetd = citytr.child(0);
            Element nametd = citytr.child(1);
            String href = codetd.select("a").attr("href"); // e.g. 33/3308.html
            String code = codetd.select("a").text();       // e.g. 330800000000
            String name = nametd.select("a").text();       // e.g. 衢州市
            addRegion(Long.valueOf(code), provinceCode, name, level, 0, ++sort);
            getDistrict(href, level, code);
        }
    }

    /**
     * Crawls the districts of one city and, for each district, its streets.
     * Rows without a link are skipped and not recorded.
     *
     * <pre>
     * &lt;tr class="countytr"&gt;
     *   &lt;td&gt;&lt;a href="02/330203.html"&gt;330203000000&lt;/a&gt;&lt;/td&gt;
     *   &lt;td&gt;&lt;a href="02/330203.html"&gt;海曙区&lt;/a&gt;&lt;/td&gt;
     * &lt;/tr&gt;
     * </pre>
     *
     * <p>NOTE(review): only {@code .countytr} rows are read; if some city pages list streets
     * directly without a district level, those streets are not crawled. Verify against the site.
     *
     * @param districtHref link of the city page, relative to {@link #LINK}
     * @param treeLevel    tree level of the parent city
     * @param cityCode     code of the parent city
     * @throws IOException if a page cannot be fetched
     */
    private static void getDistrict(String districtHref, Integer treeLevel, String cityCode) throws IOException {
        int level = treeLevel + 1;
        Document document = fetch(LINK + districtHref);
        long sort = 0;
        for (Element countytr : document.select(".countytr")) {
            Element codetd = countytr.child(0);
            Element nametd = countytr.child(1);
            String href = codetd.select("a").attr("href"); // e.g. 02/330281.html
            if (href.isEmpty()) {
                // Row without a link (the "市辖区" placeholder entry): skip it.
                continue;
            }
            String code = codetd.select("a").text(); // e.g. 330281000000
            String name = nametd.select("a").text(); // e.g. 余姚市
            addRegion(Long.valueOf(code), Long.valueOf(cityCode), name, level, 0, ++sort);
            getStreet(href, level, code);
        }
    }

    /**
     * Crawls the streets of one district and, for each street, its communities.
     *
     * <pre>
     * &lt;tr class="towntr"&gt;
     *   &lt;td&gt;&lt;a href="12/330212001.html"&gt;330212001000&lt;/a&gt;&lt;/td&gt;
     *   &lt;td&gt;&lt;a href="12/330212001.html"&gt;下应街道&lt;/a&gt;&lt;/td&gt;
     * &lt;/tr&gt;
     * </pre>
     *
     * @param streetHref link of the district page, relative to the province directory
     * @param treeLevel  tree level of the parent district
     * @param streetCode code of the parent district; its first two digits name the province
     *                   directory
     * @throws IOException if a page cannot be fetched
     */
    private static void getStreet(String streetHref, Integer treeLevel, String streetCode) throws IOException {
        int level = treeLevel + 1;
        String provinceDir = streetCode.substring(0, 2);
        Document document = fetch(LINK + provinceDir + "/" + streetHref);
        long sort = 0;
        for (Element towntr : document.select(".towntr")) {
            Element codetd = towntr.child(0);
            Element nametd = towntr.child(1);
            String href = codetd.select("a").attr("href"); // e.g. 12/330212001.html
            String code = codetd.select("a").text();       // e.g. 330212001000
            String name = nametd.select("a").text();       // e.g. 下应街道
            addRegion(Long.valueOf(code), Long.valueOf(streetCode), name, level, 0, ++sort);
            getCommunity(href, level, code);
        }
    }

    /**
     * Crawls the communities of one street. Communities are the last level crawled, so they
     * are the only regions recorded as leaves.
     *
     * <pre>
     * &lt;tr class="villagetr"&gt;
     *   &lt;td&gt;330212001209&lt;/td&gt;
     *   &lt;td&gt;112&lt;/td&gt;
     *   &lt;td&gt;胜利村村委会&lt;/td&gt;
     * &lt;/tr&gt;
     * </pre>
     *
     * @param communityHref link of the street page, relative to the city directory
     * @param treeLevel     tree level of the parent street
     * @param communityCode code of the parent street; digits 1-2 and 3-4 name the province and
     *                      city directories
     * @throws IOException if a page cannot be fetched
     */
    private static void getCommunity(String communityHref, Integer treeLevel, String communityCode) throws IOException {
        int level = treeLevel + 1;
        String provinceDir = communityCode.substring(0, 2);
        String cityDir = communityCode.substring(2, 4);
        Document document = fetch(LINK + provinceDir + "/" + cityDir + "/" + communityHref);
        long sort = 0;
        for (Element villagetr : document.select(".villagetr")) {
            // Column 0 holds the code and column 2 the name; column 1 is not used.
            String code = villagetr.child(0).text(); // e.g. 330212001005
            String name = villagetr.child(2).text(); // e.g. 东兴社区居委会
            addRegion(Long.valueOf(code), Long.valueOf(communityCode), name, level, 1, ++sort);
        }
    }

    /**
     * Builds one region, prints it and appends it to the result list.
     *
     * @param id        code of the region
     * @param pid       code of the parent region
     * @param name      name of the region
     * @param treeLevel tree level of the region
     * @param leaf      1 if the region is a leaf, otherwise 0
     * @param sort      1-based position of the region within its page
     */
    private static void addRegion(Long id, Long pid, String name, Integer treeLevel, int leaf, long sort) {
        SysRegion region = new SysRegion();
        region.setId(id);
        region.setPid(pid);
        region.setName(name);
        region.setTreeLevel(treeLevel);
        region.setLeaf(leaf);
        region.setSort(sort);
        System.out.println(region);
        REGION_RESULT_LIST.add(region);
    }

    /**
     * Downloads and parses one page, retrying on I/O failure so that a single transient
     * network error does not abort the whole crawl.
     *
     * @param url absolute URL of the page
     * @return the parsed page
     * @throws IOException the failure of the last attempt, or an exception reporting that the
     *                     thread was interrupted while waiting to retry
     */
    private static Document fetch(String url) throws IOException {
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                return Jsoup.connect(url).get();
            } catch (IOException e) {
                lastFailure = e;
            }
            if (attempt < MAX_ATTEMPTS) {
                try {
                    Thread.sleep(RETRY_DELAY_MILLIS);
                } catch (InterruptedException interrupted) {
                    // Restore the interrupt flag and stop retrying.
                    Thread.currentThread().interrupt();
                    throw new IOException("Interrupted while waiting to retry " + url, interrupted);
                }
            }
        }
        throw lastFailure;
    }
}
三、注意事项
- 如果一次性要爬取多个省,统一存入list,可能造成内存使用过多。建议即爬即入库,或分开爬取一些省份。
- 代码只对省份进行了过滤,如果需要爬取某个省,某个市,你也可以模仿省份,在爬取之前加IF判断,将市区过滤出来。
- 某些情况下可能由于网络波动,产生 Read timed out 的错误,可以重新启动再试。
- 政府的网站重构概率小,一段时间使用本代码应该是没有任何问题,如果您发现代码不能使用了,可自行重构,或联系我更新,感谢支持。