背景
打算做一个省市区级联的功能,由于网上没有找到符合自己要求的数据,便着手尝试写一个简单的爬虫进行数据抓取。由于时间仓促,代码写得比较简单粗糙(嵌套循环过多),可自行优化。补齐依赖后复制代码即可直接运行;
全量抓取大约需要一个多小时才能运行完成;
如果不想自己重新抓取,文章的最后有2份不同数据结构格式的sql,可直接使用;
部分城市没有完整的5级数据,例如广东省东莞市的下一级直接是镇。
数据来源:国家统计局标准 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/
运行环境:jdk8;
代码
<!-- 依赖包(下面只列出了 jsoup;代码还用到 hutool、lombok、commons-lang3、slf4j,需自行引入) -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
import cn.hutool.core.date.DateUtil;
import cn.hutool.core.util.IdUtil;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpUtil;
import lombok.var;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Date;
import java.util.concurrent.atomic.AtomicInteger;
public class Test {
// Logger used by the static helpers of this class.
private static final Logger log = LoggerFactory.getLogger(Test.class);
// Running count of getHtml(...) invocations; reported in the "调用接口次数" log line.
private static AtomicInteger atomicInteger = new AtomicInteger(0);
// Running count of failed requests recorded by getHtml(...); reported in the "error num" log lines.
private static AtomicInteger atomicIntegerErrorNum = new AtomicInteger(0);
/**
 * Entry point: crawls the division data for one province/city pair.
 *
 * <p>To crawl every province instead, call {@code test(null, null)}.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    // Restrict the crawl to a single city of a single province.
    final String province = "广东省";
    final String cityOfProvince = "中山市";
    test(province, cityOfProvince);
}
/**
 * Downloads the page at {@code url}, retrying when the request throws.
 *
 * <p>The request itself is delegated to {@code getHtml2}. If it fails, the
 * method waits 5 seconds and tries again; if that fails too, it waits
 * 12 minutes and tries one last time. A failure of the last attempt is
 * propagated to the caller.
 *
 * <p>Every call increments the call counter and every failed attempt
 * increments the error counter; both values are written to the log.
 *
 * @param url absolute URL of the page to download
 * @return the response body returned by {@code getHtml2}
 * @throws Exception the failure of the final attempt, or an
 *                   {@link InterruptedException} if a back-off sleep is interrupted
 */
private static String getHtml(String url) throws Exception {
    // Use the value returned by the increment so the logged number belongs to this call.
    int callNo = atomicInteger.incrementAndGet();
    log.info("调用接口次数 :" + callNo);
    log.info("请求开始时间 ==>" + DateUtil.formatDateTime(new Date()));
    log.info(url);

    // Pause before retry #1 and retry #2; the matching notice is logged before sleeping.
    final long[] retryDelaysMillis = {1000L * 5, 1000L * 60 * 12};
    final String[] retryNotices = {" = 超时。。睡5秒再重试", " = 第二次超时。。睡12分钟再重试"};

    String html;
    for (int attempt = 0; ; attempt++) {
        try {
            html = getHtml2(url);
            break;
        } catch (Exception e) {
            int errorNo = atomicIntegerErrorNum.incrementAndGet();
            if (attempt >= retryDelaysMillis.length) {
                // Retries exhausted: let the caller decide what to do with the failure.
                throw e;
            }
            String threadName = Thread.currentThread().getName();
            // Pass the exception to the logger instead of printing the stack trace to stderr.
            log.warn(threadName + retryNotices[attempt], e);
            log.info("error num = " + errorNo);
            Thread.sleep(retryDelaysMillis[attempt]);
            log.info(threadName + " = 超时重试");
        }
    }

    log.info("请求结束时间 ==>" + DateUtil.formatDateTime(new Date()));
    // Two blank lines visually separate consecutive requests in the console output.
    System.out.println("");
    System.out.println("");
    return html;
}
/**
 * Performs a single HTTP GET for {@code url} and returns the response body.
 *
 * <p>The request carries a desktop-browser {@code User-Agent} and
 * {@code Accept} header plus a {@code Cookie} whose two values are freshly
 * generated ids on every call. The timeout is 15 seconds.
 *
 * <p>No {@code If-None-Match} header is sent. A fixed ETag would turn the
 * request into a conditional GET, and a page whose ETag matches would be
 * answered with "304 Not Modified" and an empty body, which the callers
 * would silently parse as a page without rows.
 *
 * @param url absolute URL to request
 * @return the response body as text
 * @throws Exception if the request fails
 */
private static String getHtml2(String url) throws Exception {
    // Build the request fluently; header(...) may be called once per header.
    return HttpRequest.get(url)
            .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36")
            .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
            .header("Cookie", "_trs_uv=" + IdUtil.simpleUUID() + "; SF_cookie_1=" + IdUtil.fastUUID())
            .timeout(15000) // timeout in milliseconds
            .execute().body();
}
// Root URL of the 2021 division-code pages; page paths are appended to it by string concatenation.
public static String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";
/**
 * Crawls the division data starting from the index page.
 *
 * <p>Downloads {@code index.html} below {@code baseUrl}, selects every
 * element with class {@code provincetr} and hands each one to
 * {@code startProvide}. A failure while processing one row is logged and
 * the remaining rows are still processed. Start time, end time and the
 * elapsed seconds are printed at the end.
 *
 * @param appendProvide name of the province to crawl; blank or {@code null} crawls all provinces
 * @param appendCity    name of the city to crawl inside that province; blank or {@code null} crawls all its cities
 */
public static void test(String appendProvide, String appendCity) {
    System.out.println("===============开始抓取数据==================");
    // Derive the index URL from baseUrl so both always point at the same data set.
    var url = baseUrl + "index.html";
    // Use an explicit timeout (same value as the page requests) instead of waiting indefinitely.
    String html = HttpUtil.get(url, 15000);
    var htmlDoc = Jsoup.parse(html);
    var selectClasses = htmlDoc.getElementsByClass("provincetr");
    var startDateStr = DateUtil.formatDateTime(new Date());
    long startDate = System.currentTimeMillis();
    // A single thread is sufficient; rows are processed one after another.
    for (int i = 0; i < selectClasses.size(); i++) {
        try {
            startProvide(selectClasses, i, baseUrl, appendProvide, appendCity);
        } catch (InterruptedException e) {
            // Preserve the interrupt status and stop crawling instead of swallowing the interrupt.
            Thread.currentThread().interrupt();
            log.error("抓取被中断,停止处理。provincetr 行号 = " + i, e);
            break;
        } catch (Exception e) {
            // Keep going with the next row, but record which row failed and why.
            log.error("抓取失败,跳过该行。provincetr 行号 = " + i, e);
        }
    }
    System.out.println("耗时 =》");
    System.out.println((System.currentTimeMillis() - startDate) / 1000);
    System.out.println("开始时间 ==> " + startDateStr);
    System.out.println("结束时间 ==>" + DateUtil.formatDateTime(new Date()));
}
/**
 * Level 1 (province): walks the cells of one {@code provincetr} row and
 * crawls every province that has a link and passes the province filter.
 *
 * @param selectClasses all {@code provincetr} rows of the index page
 * @param i             index of the row to process
 * @param baseUrl       not used by this method
 * @param appendProvide province name to crawl; blank or {@code null} means all provinces
 * @param appendCity    city name forwarded to {@code provide} when a province filter is set
 * @throws Exception if crawling a province fails
 */
private static void startProvide(Elements selectClasses, Integer i, String baseUrl, String appendProvide, String appendCity) throws Exception {
    final boolean everyProvince = StringUtils.isBlank(appendProvide);
    for (var cell : selectClasses.get(i).children()) {
        var link = cell.select("a");
        var href = link.attr("href");
        var name = link.text();
        // Cells without a link carry no province.
        if (StringUtils.isBlank(href)) {
            continue;
        }
        if (everyProvince) {
            // Without a province filter the city filter is ignored as well.
            provide(href, name, null);
        } else if (appendProvide.equals(name)) {
            provide(href, name, appendCity);
        }
    }
}
/**
 * Downloads a province page and crawls the cities listed on it.
 *
 * @param provideCodeUrl href of the province page relative to {@code baseUrl};
 *                       the part before the first dot is used as the province code
 * @param provideName    display name of the province
 * @param appendCity     city name to crawl; blank or {@code null} means all cities
 * @throws Exception if the page cannot be downloaded or a lower level fails
 */
public static void provide(String provideCodeUrl, String provideName, String appendCity) throws Exception {
    String provideCode = provideCodeUrl.split("\\.")[0];
    Elements cityRows = Jsoup.parse(getHtml(baseUrl + provideCodeUrl)).select(".citytr");
    // TODO: persist the province once the full crawl (no city filter) is stored in a database.
    // A blank filter is normalised to null, meaning "every city".
    String cityFilter = StringUtils.isBlank(appendCity) ? null : appendCity;
    city(cityRows, provideName, provideCode, cityFilter);
}
/**
 * Level 2 (city): walks the {@code citytr} rows of a province page and
 * crawls every city that has a link and passes the city filter.
 *
 * @param selectCityClass the {@code citytr} rows of the province page
 * @param provideName     display name of the province
 * @param provideCode     province code
 * @param appendCity      city name to crawl; blank or {@code null} means all cities
 * @throws Exception if crawling a city fails
 */
private static void city(Elements selectCityClass, String provideName, String provideCode, String appendCity) throws Exception {
    final boolean everyCity = StringUtils.isBlank(appendCity);
    for (var row : selectCityClass) {
        // The second cell of a row holds the linked name.
        var link = row.select("td").get(1).select("a");
        var href = link.attr("href");
        var name = link.text();
        System.out.println("城市 = " + name);
        if (StringUtils.isBlank(href)) {
            continue;
        }
        if (everyCity || appendCity.equals(name)) {
            appendCity(href, provideName, provideCode, name);
        }
    }
}
/**
 * Downloads a city page and crawls the level below it.
 *
 * <p>If the page has {@code countytr} rows they are handed to
 * {@code county}. Otherwise the {@code towntr} rows are handed straight to
 * {@code town}, with its flag set to {@code false}.
 *
 * @param gotoCountyUrl href of the city page relative to {@code baseUrl}
 * @param provideName   display name of the province
 * @param provideCode   province code
 * @param cityName      display name of the city
 * @throws Exception if the page cannot be downloaded or a lower level fails
 */
private static void appendCity(String gotoCountyUrl, String provideName, String provideCode, String cityName) throws Exception {
    var cityPage = Jsoup.parse(getHtml(baseUrl + gotoCountyUrl));
    Elements rows = cityPage.select(".countytr");
    final boolean hasCountyLevel = rows.size() != 0;
    if (!hasCountyLevel) {
        // No county rows: the city lists towns directly.
        rows = cityPage.select(".towntr");
    }

    // City code = file name of the href without extension, right-padded with '0' to 6 characters.
    String[] pathParts = gotoCountyUrl.split("/");
    String fileName = pathParts[pathParts.length - 1];
    String cityCode = StringUtils.rightPad(fileName.split("\\.")[0], 6, '0');

    // Rows named "市辖区" take the province name. The value is not read afterwards;
    // it is prepared for the persistence TODO below.
    if (cityName.equals("市辖区")) {
        cityName = provideName;
    }

    if (hasCountyLevel) {
        // TODO: persist the city.
        county(rows, provideName, provideCode);
    } else {
        town(rows, provideName, provideCode, cityCode, false);
    }
}
/**
 * Level 3 (county/district): walks the {@code countytr} rows of a city
 * page, downloads each linked county page and crawls its towns.
 *
 * @param countyClass the {@code countytr} rows of the city page
 * @param provideName display name of the province
 * @param provideCode province code, used as the first path segment of the county page URL
 * @throws Exception if a page cannot be downloaded or a lower level fails
 */
private static void county(Elements countyClass, String provideName, String provideCode) throws Exception {
    for (var row : countyClass) {
        // The second cell of a row holds the linked name.
        var link = row.select("td").get(1).select("a");
        var href = link.attr("href");
        var name = link.text();
        System.out.println("县区 = " + name);
        if (StringUtils.isBlank(href)) {
            continue;
        }
        // TODO: persist the county.
        var countyPage = Jsoup.parse(getHtml(baseUrl + provideCode + "/" + href));
        Elements townRows = countyPage.select(".towntr");
        // The first path segment of the href is the directory the town links are relative to.
        var parentSegment = href.split("/")[0];
        town(townRows, provideName, provideCode, parentSegment, true);
    }
}
/**
 * Level 4 (town/street): walks the {@code towntr} rows, downloads each
 * linked page and prints the code, type and name of every {@code villagetr}
 * row found on it.
 *
 * @param townClass    the {@code towntr} rows to process
 * @param provideName  display name of the province
 * @param provideCode  province code, first path segment of the village page URL
 * @param gotoTownCode path segment inserted between the province code and the
 *                     row's href; only used when {@code flag} is {@code true}
 * @param flag         {@code true} when the rows come from a county page,
 *                     {@code false} when they come directly from a city page
 * @throws Exception if a page cannot be downloaded
 */
private static void town(Elements townClass, String provideName, String provideCode, String gotoTownCode, boolean flag) throws Exception {
    for (var row : townClass) {
        // The second cell of a row holds the linked name.
        var link = row.select("td").get(1).select("a");
        var href = link.attr("href");
        var name = link.text();
        System.out.println("街道。镇 = " + name);
        if (StringUtils.isBlank(href)) {
            continue;
        }
        // TODO: persist the town.
        // Town links are relative to the page that listed them, so the path depends on the parent level.
        String parentPath = flag
                ? provideCode + "/" + gotoTownCode + "/"
                : provideCode + "/";
        var villagePage = Jsoup.parse(getHtml(baseUrl + parentPath + href));
        for (var villageRow : villagePage.select(".villagetr")) {
            var cells = villageRow.select("td");
            var code = cells.get(0).text();
            var type = cells.get(1).text();
            var villageName = cells.get(2).text();
            System.out.println(code + " " + type + " " + villageName);
            // TODO: persist the village.
        }
    }
}
}
附录
2021全国行政区域4级,5级(不含港澳台)-MySQL文档类资源-CSDN下载
免责声明:本文章仅用于学习参考