pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the crawler project: inherits from spring-boot-starter-parent,
     pulls in jsoup for HTML parsing and resolves artifacts through the Aliyun mirror. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.4.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>org.example</groupId>
    <artifactId>crawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <!-- NOTE(review): this value differs from the parent version (2.1.4.RELEASE) declared above;
             confirm which Spring Boot line is actually intended. -->
        <spring-boot.version>2.3.11.RELEASE</spring-boot.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
        <maven-jar-plugin.version>3.1.1</maven-jar-plugin.version>
        <!-- NOTE(review): the version properties below are not referenced anywhere in this POM;
             presumably they are consumed by child modules. Verify before removing. -->
        <druid.version>1.2.6</druid.version>
        <knife4j.version>3.0.2</knife4j.version>
        <swagger-annotations.version>1.5.22</swagger-annotations.version>
        <fastjson.version>1.2.76</fastjson.version>
        <poi.version>4.1.2</poi.version>
        <velocity.version>1.7</velocity.version>
        <jwt.version>0.9.1</jwt.version>
        <mybatis-plus.version>3.4.3</mybatis-plus.version>
        <hutool.version>5.6.5</hutool.version>
        <feign.version>2.2.6.RELEASE</feign.version>
        <feign-okhttp.version>11.0</feign-okhttp.version>
        <spring-boot-admin.version>2.3.1</spring-boot-admin.version>
    </properties>
    <!-- NOTE(review): "pom" packaging does not bind the compile/jar goals to the default lifecycle;
         confirm this file is meant to be a parent/aggregator POM rather than the module that
         holds the Java sources. -->
    <packaging>pom</packaging>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <!-- jsoup HTML parser library @ https://jsoup.org/ -->
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>${java.version}</source>
                    <target>${java.version}</target>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
        </plugins>
        <resources>
            <resource>
                <!-- Package the application.yml found in this directory -->
                <directory>src/main/resources</directory>
                <!-- Enable filtering, i.e. placeholders in these resources are replaced with filter values -->
                <filtering>true</filtering>
            </resource>
        </resources>
    </build>
    <!-- The mirror is addressed over HTTPS: Maven 3.8.1 and later block plain-http repositories
         by default, which made the original http:// URLs unusable on current Maven versions. -->
    <repositories>
        <repository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
        </repository>
    </repositories>
    <pluginRepositories>
        <pluginRepository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </pluginRepository>
    </pluginRepositories>
</project>
逻辑代码
/**
 * Crawler for the administrative-division code tables served under {@code BASE_URL}.
 *
 * <p>Every {@code crawl*} method downloads a single page with Jsoup and converts its table
 * rows into {@link AddressInfo} records. Levels are encoded as strings: "1" province,
 * "2" city, "3" county, "4" town and "5" village.
 */
@Component
public class StatisticsBureauCrawler {
    private static final String BASE_URL = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022"; // root URL of the code tables
    public static final String[] FOLDER_NAMES = {"province", "city", "county", "town"}; // folder names
    private static final String[] PAGE_SUFFIXES = {"index.html", ".html", ".html", ".html"}; // page suffix per level index

    /** Connection/read timeout handed to Jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Downloads and parses the page at {@code url}.
     *
     * @param url absolute page URL
     * @return the parsed document
     * @throws Exception if the page cannot be fetched or parsed
     */
    private Document getHtmlDoc(String url) throws Exception {
        return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
    }

    /** Builds {@code BASE_URL/seg1/seg2/...} without any page suffix. */
    private static String pagePath(String... segments) {
        return BASE_URL + "/" + String.join("/", segments);
    }

    /**
     * Converts every table row matching {@code rowSelector} into an {@link AddressInfo}.
     *
     * <p>A row is read from its anchors when it has any (the entry then links to a child page
     * and {@code linkStatus} is {@code true}); otherwise it is read from its plain cells and
     * {@code linkStatus} is {@code false}. The code is the first cell and the name the last one.
     * Rows with fewer than two cells are skipped instead of raising an index error.
     *
     * @param doc         page to read
     * @param rowSelector CSS selector of the rows to convert
     * @param level       level string stored on each record
     * @param parentCode  parent code stored on each record
     * @return the converted rows, in document order
     */
    private List<AddressInfo> parseRows(Document doc, String rowSelector, String level, String parentCode) {
        List<AddressInfo> result = new ArrayList<>();
        for (Element row : doc.select(rowSelector)) {
            Elements cells = row.select("td > a");
            boolean linked = !cells.isEmpty();
            if (!linked) {
                cells = row.select("td");
            }
            if (cells.size() < 2) {
                continue;
            }
            AddressInfo info = new AddressInfo();
            info.setCode(cells.get(0).text().trim());
            // NOTE(review): village rows are expected to carry an extra middle column
            // (a classification code); taking the last cell yields the name for both
            // two- and three-column rows. Confirm against the live page.
            info.setName(cells.get(cells.size() - 1).text().trim());
            info.setLevel(level);
            info.setParentCode(parentCode);
            info.setLinkStatus(linked);
            result.add(info);
        }
        return result;
    }

    /**
     * Parses the province index page. The code is taken from each link's {@code href}
     * with the ".html" suffix removed; the parent code is fixed to "0".
     */
    private List<AddressInfo> parseProvinceList(Document doc) {
        List<AddressInfo> provinces = new ArrayList<>();
        for (Element link : doc.select("tr.provincetr > td > a")) {
            AddressInfo info = new AddressInfo();
            info.setCode(link.attr("href").replace(".html", ""));
            info.setName(link.text());
            info.setLevel("1");
            info.setParentCode("0");
            provinces.add(info);
        }
        return provinces;
    }

    /** Parses the city rows ({@code tr.citytr}) of a province page. */
    private List<AddressInfo> parseCityList(Document doc, String parentCode) {
        return parseRows(doc, "tr.citytr", "2", parentCode);
    }

    /** Parses the county rows ({@code tr.countytr}) of a city page. */
    private List<AddressInfo> parseCountyList(Document doc, String parentCode) {
        return parseRows(doc, "tr.countytr", "3", parentCode);
    }

    /** Parses the town rows ({@code tr.towntr}, plus any {@code tr.villagetr}) of a county page. */
    private List<AddressInfo> parseTownList(Document doc, String parentCode) {
        return parseRows(doc, "tr.towntr, tr.villagetr", "4", parentCode);
    }

    /**
     * Parses the village rows ({@code tr.villagetr}) of a town page.
     *
     * <p>The previous implementation only accepted rows that contained anchors while reading
     * the plain cells and marking the result as unlinked, so unlinked rows were dropped.
     * Rows are now accepted whether or not they contain anchors.
     */
    private List<AddressInfo> parseVillagetrList(Document doc, String parentCode) {
        return parseRows(doc, "tr.villagetr", "5", parentCode);
    }

    /**
     * Recursively loads the subtree below {@code code}, down to town level.
     *
     * <p>Page URLs follow the same layout the {@code crawl*} methods use, and each recursion
     * step advances to the child's own level (the previous version re-used {@code folderName}
     * and therefore never descended). Children without a link, and town-level children, are
     * not expanded.
     *
     * @param folderName level of {@code code} as a decimal string: "1" province, "2" city,
     *                   "3" county; other in-range values yield an empty list
     * @param code       code of the entry whose children are wanted
     * @return the children of {@code code}, each with its own sub list populated where available
     * @throws Exception if a page cannot be fetched; also {@link NumberFormatException} /
     *                   {@link ArrayIndexOutOfBoundsException} for an invalid {@code folderName}
     */
    List<AddressInfo> parseInfoList(String folderName, String code) throws Exception {
        // Validates folderName up front, exactly like the original lookup did.
        String pageSuffix = PAGE_SUFFIXES[Integer.parseInt(folderName)];
        List<AddressInfo> list = new ArrayList<>();
        switch (folderName) {
            case "1":
                list.addAll(parseCityList(getHtmlDoc(pagePath(code) + pageSuffix), code));
                break;
            case "2":
                list.addAll(parseCountyList(
                        getHtmlDoc(pagePath(code.substring(0, 2), code.substring(0, 4)) + pageSuffix), code));
                break;
            case "3":
                list.addAll(parseTownList(
                        getHtmlDoc(pagePath(code.substring(0, 2), code.substring(2, 4), code.substring(0, 6))
                                + pageSuffix), code));
                break;
            default:
                break;
        }
        // Descend one level for every child that actually links to a page of its own.
        for (AddressInfo child : list) {
            if ("4".equals(child.getLevel()) || !child.isLinkStatus()) {
                continue;
            }
            child.setSubList(parseInfoList(child.getLevel(), child.getCode()));
        }
        return list;
    }

    /**
     * Fetches the villages of a town.
     *
     * @param addressInfo town entry; its code must have at least 9 characters
     * @return the village entries, with the town code as parent code
     * @throws Exception if the page cannot be fetched
     */
    public List<AddressInfo> crawlVillagetrList(AddressInfo addressInfo) throws Exception {
        String code = addressInfo.getCode();
        Document doc = getHtmlDoc(pagePath(code.substring(0, 2), code.substring(2, 4),
                code.substring(4, 6), code.substring(0, 9)) + ".html");
        return parseVillagetrList(doc, code);
    }

    /**
     * Fetches the towns of a county.
     *
     * @param addressInfo county entry; its code must have at least 6 characters
     * @return the town entries, with the county code as parent code
     * @throws Exception if the page cannot be fetched
     */
    public List<AddressInfo> crawlTownList(AddressInfo addressInfo) throws Exception {
        String code = addressInfo.getCode();
        Document doc = getHtmlDoc(pagePath(code.substring(0, 2), code.substring(2, 4),
                code.substring(0, 6)) + ".html");
        return parseTownList(doc, code);
    }

    /**
     * Fetches the counties of a city.
     *
     * @param addressInfo city entry; its parent code names the folder and the first four
     *                    characters of its code name the page
     * @return the county entries, with the city code as parent code
     * @throws Exception if the page cannot be fetched
     */
    public List<AddressInfo> crawlCountyList(AddressInfo addressInfo) throws Exception {
        String code = addressInfo.getCode();
        Document doc = getHtmlDoc(pagePath(addressInfo.getParentCode(), code.substring(0, 4)) + ".html");
        return parseCountyList(doc, code);
    }

    /**
     * Fetches the cities of a province.
     *
     * @param code province code, used as the page name
     * @return the city entries, with {@code code} as parent code
     * @throws Exception if the page cannot be fetched
     */
    public List<AddressInfo> crawlCityList(String code) throws Exception {
        Document doc = getHtmlDoc(pagePath(code) + ".html");
        return parseCityList(doc, code);
    }

    /**
     * Fetches the province list from the index page.
     *
     * @return the province entries
     * @throws Exception if the page cannot be fetched
     */
    public List<AddressInfo> crawlProvinceList() throws Exception {
        Document doc = getHtmlDoc(BASE_URL + "/index.html");
        return parseProvinceList(doc);
    }
}
package org.example;
import java.io.Serializable;
import java.util.List;
/**
 * Mutable bean describing one administrative-division entry together with its
 * optional child entries.
 */
public class AddressInfo implements Serializable {

    /** Division code. */
    private String code;

    /** Display name. */
    private String name;

    /** Level of the entry, kept as a string. */
    private String level;

    /** Code of the parent division. */
    private String parentCode;

    /** Link flag; {@code false} until set. */
    private boolean linkStatus;

    /** Child entries; {@code null} until set. */
    private List<AddressInfo> subList;

    // ---- code ----

    /** @return the division code, or {@code null} if unset */
    public String getCode() {
        return this.code;
    }

    /** @param code the division code to store */
    public void setCode(final String code) {
        this.code = code;
    }

    // ---- name ----

    /** @return the display name, or {@code null} if unset */
    public String getName() {
        return this.name;
    }

    /** @param name the display name to store */
    public void setName(final String name) {
        this.name = name;
    }

    // ---- level ----

    /** @return the level string, or {@code null} if unset */
    public String getLevel() {
        return this.level;
    }

    /** @param level the level string to store */
    public void setLevel(final String level) {
        this.level = level;
    }

    // ---- parent code ----

    /** @return the parent division code, or {@code null} if unset */
    public String getParentCode() {
        return this.parentCode;
    }

    /** @param parentCode the parent division code to store */
    public void setParentCode(final String parentCode) {
        this.parentCode = parentCode;
    }

    // ---- link flag ----

    /** @return the stored link flag */
    public boolean isLinkStatus() {
        return this.linkStatus;
    }

    /** @param linkStatus the link flag to store */
    public void setLinkStatus(final boolean linkStatus) {
        this.linkStatus = linkStatus;
    }

    // ---- children ----

    /** @return the very list instance that was stored, or {@code null} if unset */
    public List<AddressInfo> getSubList() {
        return this.subList;
    }

    /** @param subList the child entries to store (kept by reference, not copied) */
    public void setSubList(final List<AddressInfo> subList) {
        this.subList = subList;
    }
}
package org.example;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import javax.annotation.PostConstruct;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
/**
 * Spring Boot entry point. Once the context has wired the crawler, {@link #run()} walks
 * province, city, county, town and village pages and writes one
 * {@code code,name,level,parentCode} line per entry to {@code address.txt}.
 */
@SpringBootApplication
public class Application {
    @Autowired
    private StatisticsBureauCrawler crawler;

    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }

    /**
     * Crawls the whole hierarchy and dumps it to {@code address.txt} in the working directory.
     *
     * <p>The file is written as UTF-8 (the previous {@code FileWriter} used the platform default
     * charset) and is closed, and thereby flushed, even when a crawl step throws.
     * Province, city and county lines are echoed to standard output as well; town and village
     * lines are only written to the file.
     *
     * @throws Exception if any page cannot be fetched or the file cannot be written
     */
    @PostConstruct
    public void run() throws Exception {
        // Fetch the province list before touching the file so that a failure here
        // leaves an existing address.txt untouched, as before.
        List<AddressInfo> provinceList = crawler.crawlProvinceList();
        try (BufferedWriter writer = Files.newBufferedWriter(Paths.get("address.txt"), StandardCharsets.UTF_8)) {
            for (AddressInfo province : provinceList) {
                writeRecord(writer, province, true);
                for (AddressInfo city : crawler.crawlCityList(province.getCode())) {
                    writeRecord(writer, city, true);
                    for (AddressInfo county : crawler.crawlCountyList(city)) {
                        writeRecord(writer, county, true);
                        if (!county.isLinkStatus()) {
                            continue; // county has no town page
                        }
                        for (AddressInfo town : crawler.crawlTownList(county)) {
                            writeRecord(writer, town, false);
                            if (!town.isLinkStatus()) {
                                continue; // town has no village page
                            }
                            for (AddressInfo village : crawler.crawlVillagetrList(town)) {
                                writeRecord(writer, village, false);
                            }
                        }
                    }
                }
            }
        }
    }

    /**
     * Writes one entry as a {@code code,name,level,parentCode} line terminated by {@code \n}.
     *
     * @param writer destination
     * @param info   entry to write
     * @param echo   whether to print the same line to standard output first
     * @throws IOException if the line cannot be written
     */
    private static void writeRecord(BufferedWriter writer, AddressInfo info, boolean echo) throws IOException {
        String line = info.getCode() + "," + info.getName() + "," + info.getLevel() + "," + info.getParentCode();
        if (echo) {
            System.out.println(line);
        }
        writer.write(line + "\n");
    }
}
代码详情请参考 gitee:https://gitee.com/wanderers-in-hubei/crawler.git