在工作中,经常做一些有关地区、地址的需求,就是在网页或者App端,展示三级下拉选择省市区。本文旨在帮助我们从国家统计局获取最新的省市区数据,用于项目中。
以下代码支持爬取省、市、区、镇、街道五级数据,设置有2个全局变量(IS_GET_TOEN、IS_GET_STREET),默认只爬取省市区并保存到本地,然后从本地读取爬取到的html网页解析成json对象。也可以转成Excel,自己找插件或者写代码转就可以了,非常方便。
首先说说这个爬虫的几个注意事项:
1.因为爬虫需要多次与远端服务器连接,并发连接时可能会遇到以下错误。该错误跟代码没有关系,跟网络有关系,是连接的时候发生多次重定向导致的。我没花过多时间研究解决办法,目前等一会儿再次执行就可以了。有好的解决方法请在评论区评论一下,感激。
java.io.IOException: Too many redirects occurred trying to load URL
2.如果在读本地html网页文件的时候报错 FileNotFoundException,说明爬下来的网页文件不完整,缺失了一部分,那么可以将根目录下的region文件夹删除,下次启动程序时,会再次自动从远端爬取html网页文件并写入到本地。
3.如果在日志中遇到“无法链接,正在重试~”,就是遇到了上边说的多次重定向问题,这样写入到本地的网页文件可能不完整。如果再次启动程序后成功转出json,需要开发者自行检查转出的json数据是否完整。
package com.lockie.region;/* Copyright © 2020 pyacm.com and/or its affiliates. All rights reserved. */
import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.lockie.region.entity.City;
import com.lockie.region.entity.County;
import com.lockie.region.entity.Province;
import com.lockie.region.entity.Street;
import com.lockie.region.entity.Town;
import com.lockie.region.enums.AreaLevelEnum;
import com.lockie.region.enums.OperationType;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.ThreadPoolExecutor.AbortPolicy;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpStatus;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.helpers.MessageFormatter;
/**
* 国家统计局数据爬虫
* @author lockie
* @date 2021-12-10 15:31
*/
@Slf4j
public class RegionTask {
/** Base URL that relative page links are appended to when crawling. **/
public static final String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/";
/** Index page of the National Bureau of Statistics region listing (entry point of the crawl). **/
public static final String AREA_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html";
/** Set by the crawl loops after a failed download; read by getDocument to decide whether to pause. Shared by all worker threads. **/
private static Boolean isRetry = false;
/** Value handed to jsoup's Connection.timeout(...) for every request. **/
private static final Integer TIMEOUT = 500000;
/** Local directory that downloaded pages are written to and read back from. **/
private static final String REGION = "./region/";
/** NOTE(review): not referenced anywhere in the visible code. **/
private static Integer COUNT = 0;
/** The REGION directory; assigned in the static initializer. **/
private static final File file;
/** The marker file REGION + FLAG; assigned in the static initializer. **/
private static final File flagFile;
/** Shared Gson instance used to print the assembled result. **/
public static final Gson gson = new GsonBuilder().create();
/** File extension of the downloaded pages. **/
public static final String HTML = ".html";
/** Whether to crawl streets (fifth level). **/
public static final Boolean IS_GET_STREET = false;
/** Whether to crawl towns (fourth level). NOTE(review): the name looks like a typo of IS_GET_TOWN; kept because it is public. **/
public static final Boolean IS_GET_TOEN = false;
/** Selector matching anchor tags. **/
public static final String A = "a";
/** Selector matching anchors inside table cells. **/
public static final String TDA = "td a";
/** Name of the link attribute read from anchors. **/
public static final String HREF = "href";
/** Row selector template; both {} placeholders are filled with a level name through MessageFormatter.format. **/
public static final String TEMPLATE = "table.{}table tbody tr.{}tr ";
/** Selector matching table cells; appended to TEMPLATE for the province and village levels. **/
public static final String TD = "td";
/** File name of the marker file inside REGION. **/
public static final String FLAG = "region.txt";
/** Counted down once the write phase has finished or was skipped; main awaits it before reading. **/
public static final CountDownLatch write = new CountDownLatch(1);
/*
 * Prepares the local cache when the class is loaded: creates the REGION directory and,
 * inside it, the marker file REGION + FLAG.
 *
 * The original second check tested the directory again instead of the marker file, so the
 * marker file could never be created (the directory always exists at that point).
 */
static {
    file = new File(REGION);
    flagFile = new File(REGION.concat(FLAG));
    try {
        // mkdirs() reports failure through its return value, not through an exception.
        if (!file.exists() && !file.mkdirs()) {
            log.error("Unable to create region directory, dir = {}", file.getAbsolutePath());
        }
        if (!flagFile.exists()) {
            flagFile.createNewFile();
        }
    } catch (IOException e) {
        log.error("Unable to create region flag file, file = {}", flagFile.getAbsolutePath(), e);
    }
}
/** Thread factory that names the pool's worker threads. **/
public static ThreadFactory threadFactory = new ThreadFactoryBuilder().setNameFormat("thread-pool-%d --- ").build();
/**
 * Worker pool shared by the write and read phases: 50 core threads, at most 200 threads,
 * a bounded queue of 30 pending tasks, and AbortPolicy (submission throws) once both are exhausted.
 **/
public static ExecutorService threadPool = new ThreadPoolExecutor(50, 200, 0L, TimeUnit.SECONDS,
new LinkedBlockingQueue<>(30), threadFactory,new AbortPolicy());
/**
 * Builds a jsoup connection for {@code u} with the crawler's timeout and browser-like headers.
 *
 * <p>The Accept-Encoding header no longer advertises {@code br}: jsoup transparently decodes
 * gzip/deflate bodies but has no brotli decoder, so a server honouring {@code br} would hand
 * back a body that cannot be parsed.
 *
 * @param u absolute URL of the page to request
 * @return a configured, not yet executed connection
 */
private static Connection getConnection(String u) {
    Connection connection = Jsoup.connect(u).timeout(TIMEOUT);
    // NOTE(review): Content-Encoding describes a request body and "UTF-8" is not a content
    // coding; kept as-is because the server evidently tolerates it.
    connection.header(HttpConnection.CONTENT_ENCODING, Charsets.UTF_8.name());
    connection.header("Accept", "*/*");
    connection.header("Accept-Encoding", "gzip, deflate");
    connection.header("Accept-Language", "zh-CN,zh;q=0.9");
    connection.header("User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
    return connection;
}
/**
 * Downloads {@code url} with a GET request and parses the response body.
 *
 * <p>The request is repeated until the server answers with HTTP 200. A two second pause is
 * inserted before the request when the previous download failed ({@code isRetry}) and before
 * every repetition, so a struggling server is not hammered.
 *
 * <p>The document is parsed from the response that was already received. The previous
 * implementation discarded that response and issued a second request with
 * {@code connection.post()}, doubling the traffic and using the wrong HTTP method.
 *
 * @param url absolute URL of the page, may be {@code null}
 * @return the parsed page, or {@code null} when {@code url} is {@code null} or the download
 *         failed; callers are expected to retry on {@code null}
 */
private static Document getDocument(String url) {
    if (url == null) {
        return null;
    }
    try {
        boolean pauseFirst = isRetry;
        Response response;
        do {
            if (pauseFirst) {
                /* two second delay */
                Thread.sleep(2000);
            }
            response = getConnection(url).execute();
            pauseFirst = true;
        } while (response.statusCode() != HttpStatus.SC_OK);
        return response.parse();
    } catch (Exception e) {
        // Every failure (including interruption) is reported as "no document" because the
        // callers loop until they get one; the cause is logged so it is not lost.
        log.info("无法链接,正在重试~ url = {}, cause = {}", url, e.toString());
    }
    return null;
}
/**
 * Crawls one province and stores every downloaded page under the local region directory.
 *
 * <p>Always downloads the city and county pages; town pages are added when
 * {@code IS_GET_TOEN} is set and street pages when {@code IS_GET_STREET} is set as well.
 * Cells without link text are skipped and do not take part in the barrier.
 *
 * <p>The barrier is now reached in a {@code finally} block: previously any exception skipped
 * {@code await()}, which left every other worker (and the thread waiting for the write
 * phase) blocked forever.
 *
 * @param province      cell of the index page that links to the province
 * @param cyclicBarrier barrier shared by all province workers
 */
public static void writeToLocal(Element province,CyclicBarrier cyclicBarrier) {
    String proName = null;
    boolean completed = false;
    try {
        Elements proSelect = province.select(TDA);
        proName = proSelect.text();
        if (StringUtils.isEmpty(proName)) {
            return;
        }
        String proHref = proSelect.attr(HREF);
        Document cityDocument = fetchUntilAvailable(BASE_URL.concat(proHref));
        Elements cities = getCities(cityDocument, proName, proHref, OperationType.WRITE);
        /* write counties */
        for (Element city : cities) {
            writeCity(city, proName);
        }
        completed = true;
    } catch (Exception exception) {
        log.error("Write error,error = {}", exception.toString(), exception);
    } finally {
        // Only provinces with a name are counted as barrier parties.
        if (StringUtils.isNotEmpty(proName)) {
            arriveAtWriteBarrier(cyclicBarrier, proName, completed);
        }
    }
}

/** Waits for the other province workers and reports success once all of them have arrived. */
private static void arriveAtWriteBarrier(CyclicBarrier cyclicBarrier, String proName, boolean completed) {
    try {
        cyclicBarrier.await();
        if (completed) {
            log.info("{} 所有区数据爬取完毕。", proName);
        }
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        log.error("Interrupted while waiting for other provinces, province = {}", proName, e);
    } catch (Exception e) {
        log.error("Barrier failed while waiting for other provinces, province = {}", proName, e);
    }
}

/** Downloads {@code url}, repeating until a document is obtained, and maintains the retry flag. */
private static Document fetchUntilAvailable(String url) {
    Document document;
    do {
        document = getDocument(url);
        isRetry = (document == null);
    } while (document == null);
    return document;
}

/** Returns the part of {@code absoluteUrl} after BASE_URL; it doubles as the local file name. */
private static String relativePath(String absoluteUrl) {
    // Plain prefix handling instead of String.split, which would treat BASE_URL as a regex.
    if (!absoluteUrl.startsWith(BASE_URL)) {
        throw new IllegalStateException("Unexpected page url: " + absoluteUrl);
    }
    return absoluteUrl.substring(BASE_URL.length());
}

/** Stores the county page of one city row and, if enabled, descends into its counties. */
private static void writeCity(Element city, String proName) throws IOException {
    Elements citySelect = city.select(TDA);
    // A usable row carries two links: the code and the name.
    if (citySelect.size() < 2) {
        return;
    }
    String cityName = citySelect.get(1).text();
    String countyUrl = citySelect.get(0).absUrl(HREF);
    String fileName = relativePath(countyUrl);
    Document countyDocument = fetchUntilAvailable(countyUrl);
    Elements counties = getCounties(countyDocument, proName, cityName, fileName, OperationType.WRITE);
    if (!IS_GET_TOEN) {
        return;
    }
    /* write towns */
    for (Element county : counties) {
        writeCounty(county, proName, cityName);
    }
}

/** Stores the town page of one county row and, if enabled, descends into its towns. */
private static void writeCounty(Element county, String proName, String cityName) throws IOException {
    Elements countySelect = county.select(TDA);
    if (countySelect.size() < 2) {
        return;
    }
    String countyName = countySelect.get(1).text();
    String townUrl = countySelect.get(0).absUrl(HREF);
    String countyFileName = relativePath(townUrl);
    Document townDocument = fetchUntilAvailable(townUrl);
    Elements towns = getTowns(townDocument, proName, cityName, countyName, countyFileName, OperationType.WRITE);
    if (!IS_GET_STREET) {
        return;
    }
    /* write streets */
    for (Element town : towns) {
        writeTown(town, proName, cityName, countyName);
    }
}

/** Stores the street page of one town row. */
private static void writeTown(Element town, String proName, String cityName, String countyName)
        throws IOException {
    Elements townSelect = town.select(TDA);
    if (townSelect.size() < 2) {
        return;
    }
    String townName = townSelect.get(1).text();
    String streetUrl = townSelect.get(0).absUrl(HREF);
    String townFileName = relativePath(streetUrl);
    Document streetDocument = fetchUntilAvailable(streetUrl);
    getStreets(streetDocument, proName, cityName, countyName, townName, townFileName, OperationType.WRITE);
}
/**
 * Selects all province entries of the index page by delegating to {@code getElements}
 * at the province level, using the province level name plus {@code HTML} as file name.
 *
 * @param provinceDocument parsed index page
 * @param operationType    operation mode forwarded to {@code getElements}
 * @return whatever {@code getElements} selects for the province level
 * @throws IOException propagated from {@code getElements}
 */
public static Elements getProvinces(Document provinceDocument,OperationType operationType) throws IOException {
    AreaLevelEnum provinceLevel = AreaLevelEnum.PROVINCE;
    String indexFileName = provinceLevel.getLevel().concat(HTML);
    return getElements(provinceDocument, provinceLevel, null, null, null, null, indexFileName, operationType);
}
/**
 * Selects all city rows of a province page by delegating to {@code getElements}
 * at the city level.
 *
 * @param cityDocument  parsed province page listing the cities
 * @param proName       province name, forwarded for logging
 * @param fileName      file name forwarded to {@code getElements}
 * @param operationType operation mode forwarded to {@code getElements}
 * @return whatever {@code getElements} selects for the city level
 * @throws IOException propagated from {@code getElements}
 */
public static Elements getCities(Document cityDocument,String proName,String fileName,OperationType operationType) throws IOException {
    AreaLevelEnum cityLevel = AreaLevelEnum.CITY;
    return getElements(cityDocument, cityLevel, proName, null, null, null, fileName, operationType);
}
/**
 * Selects all district/county rows of a city page by delegating to {@code getElements}
 * at the county level.
 *
 * @param countyDocument parsed city page listing the counties
 * @param proName        province name, forwarded for logging
 * @param cityName       city name, forwarded for logging
 * @param fileName       file name forwarded to {@code getElements}
 * @param operationType  operation mode forwarded to {@code getElements}
 * @return whatever {@code getElements} selects for the county level
 * @throws IOException propagated from {@code getElements}
 */
public static Elements getCounties(Document countyDocument,String proName,String cityName,String fileName,OperationType operationType) throws IOException {
    AreaLevelEnum countyLevel = AreaLevelEnum.COUNTY;
    return getElements(countyDocument, countyLevel, proName, cityName, null, null, fileName, operationType);
}
/**
 * Selects all town rows of a county page by delegating to {@code getElements}
 * at the town level.
 *
 * @param townDocument  parsed county page listing the towns
 * @param proName       province name, forwarded for logging
 * @param cityName      city name, forwarded for logging
 * @param countyName    county name, forwarded for logging
 * @param fileName      file name forwarded to {@code getElements}
 * @param operationType operation mode forwarded to {@code getElements}
 * @return whatever {@code getElements} selects for the town level
 * @throws IOException propagated from {@code getElements}
 */
public static Elements getTowns(Document townDocument,String proName,String cityName,
        String countyName,String fileName,OperationType operationType) throws IOException {
    AreaLevelEnum townLevel = AreaLevelEnum.TOWN;
    return getElements(townDocument, townLevel, proName, cityName, countyName, null, fileName, operationType);
}
/**
 * Selects all street/village entries of a town page by delegating to {@code getElements}
 * at the village level.
 *
 * @param streetDocument parsed town page listing the streets/villages
 * @param proName        province name, forwarded for logging
 * @param cityName       city name, forwarded for logging
 * @param countyName     county name, forwarded for logging
 * @param townName       town name, forwarded for logging
 * @param fileName       file name forwarded to {@code getElements}
 * @param operationType  operation mode forwarded to {@code getElements}
 * @return whatever {@code getElements} selects for the village level
 * @throws IOException propagated from {@code getElements}
 */
public static Elements getStreets(Document streetDocument,String proName,String cityName,
        String countyName,String townName,String fileName,OperationType operationType) throws IOException {
    AreaLevelEnum villageLevel = AreaLevelEnum.VILLAGE;
    return getElements(streetDocument, villageLevel, proName, cityName, countyName, townName, fileName,
            operationType);
}
/**
 * Selects the entries of one administrative level from {@code document} and, in
 * {@code OperationType.WRITE} mode, also stores the page under {@code REGION + fileName}.
 *
 * <p>The selector is {@code TEMPLATE} filled with the level name; for the province and
 * village levels {@code TD} is appended so that cells rather than rows are selected.
 * The name parameters are only used to pick the log message after a page was written.
 *
 * @param document      page to select from; when {@code null} nothing is selected or written
 * @param level         administrative level to select
 * @param proName       province name for logging, may be {@code null}
 * @param cityName      city name for logging, may be {@code null}
 * @param countyName    county name for logging, may be {@code null}
 * @param townName      town name for logging, may be {@code null}
 * @param fileName      path below {@code REGION}; only used in write mode
 * @param operationType write mode stores the page, any other mode only selects
 * @return the selected elements, or {@code null} when {@code document} is {@code null}
 * @throws IOException when the page cannot be written; any failure is logged and rethrown
 */
private static Elements getElements(Document document,AreaLevelEnum level,String proName, String cityName,
        String countyName,
        String townName,
        String fileName,
        OperationType operationType) throws IOException {
    try {
        String levelName = level.getLevel();
        if (null == document) {
            return null;
        }
        String rowSelector = MessageFormatter.format(TEMPLATE, levelName, levelName).getMessage();
        boolean cellLevel = AreaLevelEnum.PROVINCE == level || AreaLevelEnum.VILLAGE == level;
        Elements matched = document.select(cellLevel ? rowSelector.concat(TD) : rowSelector);
        if (OperationType.WRITE != operationType) {
            return matched;
        }

        File target = new File(REGION.concat(fileName));
        File directory = target.getParentFile();
        if (!directory.exists()) {
            directory.mkdirs();
        }
        if (!target.exists()) {
            target.createNewFile();
        }
        FileUtils.writeStringToFile(target, document.html(), "gb2312");
        String path = target.getAbsolutePath();

        // The most specific non-empty name decides which level the message reports.
        log.info("<---------------------------------->");
        if (StringUtils.isNotEmpty(townName)) {
            log.info("{}-{}-{}-{} 所有街道/村 网页数据写入到本地完成~~,文件所在地址->{}",proName,cityName,countyName,townName, path);
        } else if (StringUtils.isNotEmpty(countyName)) {
            log.info("{}-{}-{} 所有镇 网页数据写入到本地完成~~,文件所在地址->{}",proName,cityName,countyName,path);
        } else if (StringUtils.isNotEmpty(cityName)) {
            log.info("{}-{} 所有区/县 网页数据写入到本地完成~~,文件所在地址->{}",proName,cityName,path);
        } else if (StringUtils.isNotEmpty(proName)) {
            log.info("{} 所有市 网页数据写入到本地完成~~,文件所在地址->{}",proName,path);
        } else {
            log.info("所有省 网页数据写入到本地完成~~,文件所在地址:{}",path);
        }
        log.info("<---------------------------------->");
        return matched;
    } catch (Exception e) {
        e.printStackTrace();
        log.error("GetAndWriteProvince method error,error = {}",e);
        throw e;
    }
}
/**
 * Pool task that crawls a single province by handing its index-page element and the shared
 * barrier to {@code writeToLocal}.
 */
private static class SyncWrite implements Runnable {
    private final CyclicBarrier cyclicBarrier;
    private final Element element;

    /**
     * @param element       index-page element of the province to crawl
     * @param cyclicBarrier barrier shared by all province tasks
     */
    public SyncWrite(Element element,CyclicBarrier cyclicBarrier){
        this.cyclicBarrier = cyclicBarrier;
        this.element = element;
    }

    @Override
    public void run() {
        writeToLocal(this.element, this.cyclicBarrier);
    }
}
/**
 * Runs the write phase: downloads the index page and submits one crawl task per province.
 * The {@code write} latch is released when all provinces are stored, when the local
 * directory already holds data, or when the phase cannot be started.
 *
 * <p>Fixes over the previous version:
 * <ul>
 *   <li>a {@code null} directory listing no longer causes a NullPointerException;</li>
 *   <li>the index page is retried like every other page instead of silently giving up,
 *       which left the latch closed and the caller blocked forever;</li>
 *   <li>the barrier is sized by the number of cells that carry a province name (exactly the
 *       ones {@code writeToLocal} awaits for) instead of the guess {@code size() - 1};</li>
 *   <li>an empty result or a start-up failure releases the latch so the caller fails fast.</li>
 * </ul>
 */
public static void write(){
    String[] existing = file.list();
    if (existing != null && existing.length > 1) {
        log.info("所有省市区数据已经写入到本地完毕,若要更新所有省市区文件,请删除该文件夹即可,文件地址:{}",file.getAbsolutePath());
        write.countDown();
        return;
    }
    Document document;
    do {
        document = getDocument(AREA_URL);
        isRetry = (document == null);
    } while (document == null);
    System.out.println("等待所有 省份数据 爬取 完毕 !!!!!!!");
    try {
        Elements provinces = getProvinces(document, OperationType.WRITE);
        // Count the workers that will actually arrive at the barrier.
        int parties = 0;
        if (null != provinces) {
            for (Element province : provinces) {
                if (StringUtils.isNotEmpty(province.select(TDA).text())) {
                    parties++;
                }
            }
        }
        if (parties == 0) {
            log.error("No province found on index page, url = {}", AREA_URL);
            write.countDown();
            return;
        }
        CyclicBarrier cyclicBarrier = new CyclicBarrier(parties, () -> {
            log.info("===================== 所有省数据写入完毕 ========================");
            write.countDown();
        });
        /* write cities */
        for (Element province : provinces) {
            threadPool.submit(new SyncWrite(province, cyclicBarrier));
        }
    } catch (Exception e) {
        log.error("Write phase could not be started,error = {}", e.toString(), e);
        // Without this the caller would wait on the latch forever.
        write.countDown();
    }
}
/**
 * Reads a locally stored page (gb2312 encoded) and parses it into a document.
 *
 * @param fileAddr path of the stored page
 * @return the parsed page, or {@code null} when reading or parsing fails (the failure is logged)
 */
private static Document readStringToDocument(String fileAddr){
    Document parsed = null;
    try {
        File source = new File(fileAddr);
        String html = FileUtils.readFileToString(source, "gb2312");
        parsed = Jsoup.parse(html);
    } catch (Exception e) {
        e.printStackTrace();
        log.error("Read string to document error,error = {}",e);
    }
    return parsed;
}
private static class Area implements Runnable {
private Boolean isGetTown;
private Boolean isGetStreet;
private String url;
private Province province;
private CyclicBarrier cyclicBarrier;
private List<Province> provincesList;
public Area(Boolean isGetTown, Boolean isGetStreet, String url, Province province,CyclicBarrier cyclicBarrier
,List<Province> provincesList) {
this.isGetTown = isGetTown;
this.isGetStreet = isGetStreet;
this.url = url;
this.province = province;
this.cyclicBarrier = cyclicBarrier;
this.provincesList = provincesList;
}
@SneakyThrows
@Override
public void run() {
Province province = this.province;
/** 完成省市区 街道封装 **/
String cityUrl = url.concat(province.getProvinceCode().concat(HTML));
Document document = readStringToDocument(cityUrl);
String cityLevel = AreaLevelEnum.CITY.getLevel();
Elements cityElements = document
.select(MessageFormatter.format(TEMPLATE, cityLevel, cityLevel).getMessage());
List cities = Optional.ofNullable(cityElements).filter(a -> a.size() > 0).map(u -> {
return u.stream().map(k -> {
return Optional.ofNullable(k.select(TDA)).filter(a -> a.size() > 0).map(a -> {
/** 代码 **/
Element codeElement = a.get(0);
/** 地级市名 **/
Element nameElement = a.get(1);
City city = new City();
/** 获得 城市代码 **/
city.setCityCode(codeElement.text().substring(0,6));
city.setCityName(nameElement.text());
city.setLevel(AreaLevelEnum.CITY.getLevel());
/** 区 地址URL **/
String countyUrl = url.concat(codeElement.attr(HREF));
Document countyDocument = readStringToDocument(countyUrl);
String countLevel = AreaLevelEnum.COUNTY.getLevel();
Elements countyElements = countyDocument
.select(MessageFormatter.format(TEMPLATE, countLevel, countLevel).getMessage());
List<County> counties = Optional.ofNullable(countyElements).filter(c -> c.size() > 0).map(x -> {
return x.stream().map(c -> {
return Optional.ofNullable(c.select(TDA)).filter(aq -> aq.size() > 0).map(aq -> {
Element countyCode = aq.get(0);
Element countyName = aq.get(1);
County county = new County();
county.setCountyCode(countyCode.text().substring(0,6));
county.setCountyName(countyName.text());
county.setLevel(AreaLevelEnum.COUNTY.getLevel());
/** 街道地址URL **/
String townUrl = url.concat(countyCode.attr(HREF));
/** 是否读取 镇 **/
if (isGetTown) {
Document townDocument = readStringToDocument(townUrl);
String townLevel = AreaLevelEnum.TOWN.getLevel();
Elements townElements = townDocument.select(
MessageFormatter.format(TEMPLATE, townLevel, townLevel).getMessage());
List<Town> towns = Optional.ofNullable(townElements).filter(w -> w.size() > 0)
.map(e -> {
return e.stream().map(r -> {
return Optional.ofNullable(e.select(TDA))
.filter(qq -> qq.size() > 0).map(qq -> {
Element townCode = qq.get(0);
Element townName = qq.get(1);
Town town = new Town();
town.setLevel(AreaLevelEnum.TOWN.getLevel());
town.setTownCode(townCode.text());
town.setTownName(townName.text());
/** 是否读取 街道 **/
if (isGetStreet) {
String streetUrl = url.concat(townCode.attr(HREF));
Document streetDocument = readStringToDocument(streetUrl);
String streetLevel = AreaLevelEnum.VILLAGE.getLevel();
Elements streetElements = streetDocument.select(
MessageFormatter
.format(TEMPLATE, streetLevel, streetLevel)
.getMessage());
List<Street> streets = Optional
.ofNullable(streetElements)
.filter(t -> t.size() > 0).map(t -> {
return t.stream().map(v -> {
return Optional.ofNullable(v.select(TD))
.filter(we -> we.size() > 0).map(we -> {
Element streetCode = we.get(0);
Element streetTypeCode = we.get(1);
Element streetName = we.get(2);
Street street = new Street();
street.setLevel(
AreaLevelEnum.VILLAGE
.getLevel());
street.setStreetCode(
streetCode.text());
street.setStreetTypeCode(
streetTypeCode.text());
street.setStreetName(
streetName.text());
return street;
}).orElse(null);
}).filter(rs -> null != rs)
.collect(Collectors.toList());
}).orElse(null);
town.setStreets(streets);
}
return town;
}).orElse(null);
}).filter(r -> null != r).collect(Collectors.toList());
}).orElse(null);
county.setTowns(towns);
}
return county;
}).orElse(null);
}).filter(r -> null != r).collect(Collectors.toList());
}).orElse(null);
city.setCounties(counties);
return city;
}).orElse(null);
}).filter(r -> null != r).collect(Collectors.toList());
}).orElse(null);
province.setCities(cities);
System.out.println(province.getProvinceName() + ":" + gson.toJson(province));
provincesList.add(province);
cyclicBarrier.await();
}
}
/**
 * Entry point: makes sure the pages are stored locally, then assembles every province in
 * parallel and prints the sorted result as JSON.
 *
 * <p>Fixes over the previous version:
 * <ul>
 *   <li>a missing province page no longer fails with a NullPointerException before the
 *       null check is reached;</li>
 *   <li>the barrier is sized by the number of tasks actually submitted instead of the guess
 *       {@code size() - 1};</li>
 *   <li>the result list shared by the workers is thread-safe;</li>
 *   <li>the pool is shut down on failure as well, so the JVM can exit.</li>
 * </ul>
 *
 * @param args unused
 */
public static void main(String[] args) {
    try {
        write();
        write.await();
        String proLevel = AreaLevelEnum.PROVINCE.getLevel();
        Document proDocument = readStringToDocument(REGION.concat(File.separator).concat(proLevel).concat(HTML));
        Elements provinces = getProvinces(proDocument, OperationType.READ);

        /* build the province shells first so the barrier can be sized exactly */
        List<Province> pending = Lists.newArrayList();
        if (provinces != null) {
            for (Element element : provinces) {
                Elements a = element.select(A);
                if (StringUtils.isEmpty(a.text())) {
                    continue;
                }
                Province province = new Province();
                province.setLevel(AreaLevelEnum.PROVINCE.getLevel());
                /* the first two characters of the link are the province code */
                province.setProvinceCode(a.attr(HREF).trim().substring(0, 2));
                province.setProvinceName(a.text());
                pending.add(province);
            }
        }
        if (pending.isEmpty()) {
            log.error("No province found in local data, dir = {}", file.getAbsolutePath());
            threadPool.shutdown();
            return;
        }

        // Workers append concurrently, so the list must be thread-safe.
        List<Province> provincesList = Collections.synchronizedList(Lists.<Province>newLinkedList());
        CyclicBarrier barrier = new CyclicBarrier(pending.size(), () -> {
            log.info("所有省份json数据组装完毕!!!!");
            /* sort all provinces by code */
            Collections.sort(provincesList, Comparator.comparing(Province::getProvinceCode));
            System.out.println("<---------- 执行结果开始 --------->");
            System.out.println(gson.toJson(provincesList));
            System.out.println("<---------- 执行结果结束 --------->");
            threadPool.shutdown();
        });
        for (Province province : pending) {
            threadPool.execute(new Area(IS_GET_TOEN, IS_GET_STREET, REGION, province, barrier, provincesList));
        }
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        log.error("Occur error,error = {}", e.toString(), e);
        threadPool.shutdown();
    } catch (Exception e) {
        log.error("Occur error,error = {}", e.toString(), e);
        // Let already running tasks finish but allow the JVM to exit afterwards.
        threadPool.shutdown();
    }
}
}
最终结果:
项目和转出的json文件等审核完毕,我会发布到CSDN,勿催。
省市区Json文件地址:点击下载2020年国家省市区Json文件
爬虫项目地址:点击下载项目