爬取地址:https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/
数据库建表语句(MySQL 8.0+,因为使用了 utf8mb4_0900_ai_ci 排序规则):
-- Administrative-division table filled by the CityUtils crawler in this note.
-- One row per province / city / county / town. PARENT holds the CODE of the
-- enclosing division; provinces are stored with PARENT = '0'.
CREATE TABLE IF NOT EXISTS `area` (
    `CODE` varchar(255) NOT NULL COMMENT '行政区域编号',
    `NAME` varchar(500) DEFAULT NULL COMMENT '名称',
    `PARENT` varchar(255) DEFAULT NULL COMMENT '父行政区域',
    -- Integer display width (INT(10)) is deprecated in MySQL 8.0.17+, so it is omitted.
    `LEVEL` INT DEFAULT NULL COMMENT '层级:1省 2市 3区县 4街道/镇',
    PRIMARY KEY (`CODE`),
    -- Supports "children of X" lookups without a full table scan.
    KEY `area_parent_idx` (`PARENT`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
# 引入依赖(添加到 Maven 的 pom.xml)
<!-- Hutool all-in-one bundle; the crawler below uses cn.hutool.http.HttpUtil from it. -->
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.8.20</version>
    <scope>compile</scope>
</dependency>
# Java 代码
import cn.hutool.http.HttpUtil;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.Locale;
public class CityUtils {
public static final String baseUrl = "https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/";
/**
* 省
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
String url = baseUrl;
String str = HttpUtil.get(url).toUpperCase();
String[] arrs = str.split("<A");
for (String s : arrs) {
if (s.indexOf("HREF") != -1 && s.indexOf(".HTML") != -1) {
String a = s.substring(7, s.indexOf("\">"));
String areaid = a.replace(".HTML", "") + "0000";
String name = s.substring(s.indexOf("\">") + 2, s.indexOf("<BR />"));
saveCity(areaid, name, "0", 1);
System.out.println("爬取:" + areaid + "---" + name);
boolean success = false;
while (!success) {
try {
readShi(a, areaid);
success = true;
} catch (Exception e) {
e.printStackTrace();
success = false;
Thread.sleep(60000);
}
}
}
}
}
/**
* 市
* @param url
* @param paretid
* @throws Exception
*/
public static void readShi(String url, String paretid) throws Exception {
String content = HttpUtil.get(baseUrl + url).toUpperCase().toUpperCase();
String[] citys = content.split("CITYTR");
for (int c = 1, len = citys.length; c < len; c++) {
String[] strs = citys[c].split("<A HREF=\"");
String cityUrl = null;
String areaid = "";
String areaname = "";
for (int si = 1; si < 3; si++) {
if (si == 1) {//取链接和编码
cityUrl = strs[si].substring(0, strs[si].indexOf("\">"));
areaid = strs[si].substring(strs[si].indexOf("\">") + 2, strs[si].indexOf("</A>"));
} else {
areaname = strs[si].substring(strs[si].indexOf("\">") + 2, strs[si].indexOf("</A>"));
System.out.println("爬取:" + strs[si].substring(strs[si].indexOf("\">") + 2, strs[si].indexOf("</A>")));
}
}
saveCity(areaid, areaname, paretid, 2);
boolean success = false;
while (!success) {
try {
readXian(cityUrl.substring(0, cityUrl.indexOf("/") + 1), cityUrl, areaid);
success = true;
} catch (IOException e) {
e.printStackTrace();
success = false;
Thread.sleep(60000);
}
}
}
}
/**
* 区、县
* @param prix
* @param url
* @param paretid
* @throws Exception
*/
public static void readXian(String prix, String url, String paretid) throws Exception {
String content = HttpUtil.get(baseUrl + url).toUpperCase().toUpperCase();
String[] citys = content.split("COUNTYTR");
for (int i = 1; i < citys.length; i++) {
String cityUrl = null;
String areaid = "";
String areaname = "";
if (citys[i].indexOf("<A HREF=\"") == -1) {
areaid = citys[i].substring(6, 18);
areaname = citys[i].substring(citys[i].indexOf("</TD><TD>") + 9, citys[i].lastIndexOf("</TD>"));
} else {
String[] strs = citys[i].split("<A HREF=\"");
for (int si = 1; si < 3; si++) {
if (si == 1) {//取链接和编码
cityUrl = strs[si].substring(0, strs[si].indexOf("\">"));
areaid = strs[si].substring(strs[si].indexOf("\">") + 2, strs[si].indexOf("</A>"));
} else {
areaname = strs[si].substring(strs[si].indexOf("\">") + 2, strs[si].indexOf("</A>"));
}
}
}
saveCity(areaid, areaname, paretid, 3);
boolean success = false;
while (!success) {
try {
if (null != cityUrl) {
readZhen(prix, cityUrl, areaid);
}
success = true;
} catch (IOException e) {
e.printStackTrace();
success = false;
Thread.sleep(60000);
}
}
}
}
/**
* 街道、镇
* @param prix
* @param url
* @param paretid
* @throws Exception
*/
public static void readZhen(String prix, String url, String paretid) throws Exception {
String content = HttpUtil.get(baseUrl+prix + url).toUpperCase().toUpperCase();
String[] citys = content.split("TOWNTR");
for (int i = 1; i < citys.length; i++) {
String[] strs = citys[i].split("<A HREF=\"");
String cityUrl = null;
String areaid = "";
String areaname = "";
for (int si = 1; si < 3; si++) {
if (si == 1) {//取链接和编码
cityUrl = strs[si].substring(0, strs[si].indexOf("\">"));
areaid = strs[si].substring(strs[si].indexOf("\">") + 2, strs[si].indexOf("</A>"));
} else {
areaname = strs[si].substring(strs[si].indexOf("\">") + 2, strs[si].indexOf("</A>"));
}
}
saveCity(areaid, areaname, paretid, 4);
}
}
public static void saveCity(String areaid, String areaname, String paretid, int level) {
try {
String URL = "jdbc:mysql://127.0.0.1:3306/area?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC&useSSL=false";
String USER = "root";
String PASSWORD = "*******";
Class.forName("com.mysql.jdbc.Driver");
Connection conn = DriverManager.getConnection(URL, USER, PASSWORD);
String s = "insert into area(code,`name`,parent,`level`) values(?,?,?,?)";
PreparedStatement pst = conn.prepareStatement(s);
pst.setString(1, level > 3 ? areaid.substring(0, 9) : areaid.substring(0, 6));
pst.setString(2, areaname);
pst.setString(3, paretid.replace("000000", ""));
pst.setInt(4, level);
pst.execute();
//关闭资源
pst.close();
conn.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}