爬取国家统计局省市区数据

    在工作中,经常做一些有关地区、地址的需求,就是在网页或者App端,展示三级下拉选择省市区。本文旨在帮助我们从国家统计局获取最新的省市区数据,用于项目中。

    以下代码支持爬虫省市区镇街道,设置有2个全局变量,默认只爬取省市区保存到本地,然后从本地读取爬虫html网页解析成json对象,也可以转成Excel,自己找插件或者写代码转就可以了,非常方便。       

     首先说说这个爬虫的几个注意项:

  1. 因为爬虫需要多次与远端服务器连接,并发连接会遇到以下错误,跟代码没有关系,跟网络有关系,连接的时候多次重定向导致,没花过多时间研究解决办法,目前等过一会再次执行就可以了。有好的解决方法请在评论区评论一下,感激。

        java.io.IOException: Too many redirects occurred trying to load URL 

    2.如果在读本地html网页文件的时候报错 FileNotFoundException,说明爬下来的网页文件不完整,缺失了一部分,那么可以将根目录下的region文件夹删除,下次启动程序时,会再次自动从远端爬取html网页文件并写入到本地。

    3.如果在日志中看到“无法链接,正在重试~”,说明遇到了上面提到的多次重定向问题,这样写入到本地的网页文件可能不完整。即使再次启动程序后成功转出json,也需要开发者自行检查转出的json数据是否完整。

package com.lockie.region;/* Copyright © 2020 pyacm.com and/or its affiliates. All rights reserved. */

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.lockie.region.entity.City;
import com.lockie.region.entity.County;
import com.lockie.region.entity.Province;
import com.lockie.region.entity.Street;
import com.lockie.region.entity.Town;
import com.lockie.region.enums.AreaLevelEnum;
import com.lockie.region.enums.OperationType;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.ThreadPoolExecutor.AbortPolicy;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpStatus;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.helpers.MessageFormatter;
/**
 * 国家统计局数据爬虫
 * @author lockie
 * @date 2021-12-10 15:31
 */
@Slf4j
public class RegionTask {

    /**
     * Base URL of the 2020 division-code pages. Used to build the province page URLs and
     * stripped from absolute links to obtain the local file name of a page.
     */
    public static final String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/";

    /** URL of the index page (province list) of the National Bureau of Statistics. **/
    public static final String AREA_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html";

    /**
     * Set to true by the crawl loops after a failed fetch and back to false after a successful one;
     * getDocument sleeps while it is true.
     * NOTE(review): read and written by several pool threads without synchronization - confirm
     * whether it should be volatile or tracked per task.
     */
    private static Boolean isRetry = false;

    /** Value handed to Jsoup's Connection.timeout(...) for every request. **/
    private static final Integer TIMEOUT = 500000;

    /** Local directory (relative to the working directory) that receives the downloaded pages. **/
    private static final String REGION = "./region/";

    /** Not referenced by any code visible in this file. **/
    private static Integer COUNT = 0;

    /** The local download directory (REGION); assigned in the static initializer. **/
    private static final File file;

    /** Marker file REGION + FLAG; assigned in the static initializer. **/
    private static final File flagFile;

    /** Shared Gson instance used to print the assembled result as JSON. **/
    public static final Gson gson = new GsonBuilder().create();

    /** File extension appended to level names / province codes to form page file names. **/
    public static final String HTML = ".html";

    /** Whether to crawl streets/villages (fifth level). **/
    public static final Boolean IS_GET_STREET = false;

    /** Whether to crawl towns (fourth level). **/
    public static final Boolean IS_GET_TOEN = false;

    /** Selector matching anchor tags. **/
    public static final String A = "a";

    /** Selector matching anchors inside table cells. **/
    public static final String TDA = "td a";

    /** Name of the link attribute read from anchors. **/
    public static final String HREF = "href";

    /**
     * Selector template for the table rows of one level; both {} placeholders are filled
     * with the level name via MessageFormatter.
     */
    public static final String TEMPLATE = "table.{}table tbody tr.{}tr ";

    /** Selector matching table cells; appended to TEMPLATE or applied to a row. **/
    public static final String TD = "td";

    /** Name of the marker file created inside REGION. **/
    public static final String FLAG = "region.txt";

    /** Released once the download phase is finished (or skipped); main waits on it before reading. **/
    public static final CountDownLatch write = new CountDownLatch(1);

    /*
     * Prepares the local working area: creates the REGION directory and the marker file inside it.
     * With the marker present, a freshly created directory holds exactly one entry, which is why
     * write() treats "more than one entry" as "pages already downloaded".
     */
    static {
        file = new File(REGION);
        flagFile = new File(REGION.concat(FLAG));
        try {
            if (!file.exists()) {
                file.mkdirs();
            }
            // Test the marker itself. The previous check re-tested the directory, which always
            // exists at this point, so the marker file was never created.
            if (!flagFile.exists()) {
                flagFile.createNewFile();
            }
        } catch (IOException e) {
            // NOTE(review): the Lombok-generated logger may not be initialized yet while this
            // initializer runs, so the failure is reported on stderr instead - confirm.
            e.printStackTrace();
        }
    }

    /** Thread factory that names the worker threads "thread-pool-N --- ". **/
    public static ThreadFactory threadFactory = new ThreadFactoryBuilder().setNameFormat("thread-pool-%d --- ").build();

    /**
     * Worker pool shared by the download phase and the JSON assembly phase: 50 core threads,
     * at most 200 threads, and a bounded queue of 30 tasks that fills up once all core threads
     * are busy. AbortPolicy makes submission throw when both the queue and the pool are full.
     **/
    public static ExecutorService threadPool = new ThreadPoolExecutor(50, 200, 0L, TimeUnit.SECONDS,
        new LinkedBlockingQueue<>(30), threadFactory,new AbortPolicy());


    /**
     * Builds a Jsoup connection for {@code u} with the crawler's timeout and browser-like headers.
     *
     * @param u absolute URL of the page to request
     * @return a configured, not yet executed connection
     */
    private static Connection getConnection(String u) {
        return Jsoup.connect(u)
            .timeout(TIMEOUT)
            // NOTE(review): Content-Encoding normally names a compression scheme, not a charset;
            // kept as-is because the server's reaction to removing it is unknown.
            .header(HttpConnection.CONTENT_ENCODING, Charsets.UTF_8.name())
            .header("Accept", "*/*")
            // Only advertise encodings Jsoup can decode; "br" (brotli) was offered before and a
            // brotli-compressed reply would be parsed as garbage.
            .header("Accept-Encoding", "gzip, deflate")
            .header("Accept-Language", "zh-CN,zh;q=0.9")
            .header("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
    }

    /**
     * Downloads {@code url} with a GET request and parses the response into a document.
     *
     * <p>While the shared retry flag is set, the request is delayed by two seconds to give the
     * remote server time to recover. A response whose status is not 200 is requested again after
     * the same delay.
     *
     * @param url absolute URL of the page, may be {@code null}
     * @return the parsed page, or {@code null} when {@code url} is {@code null} or the request failed;
     *         callers are expected to retry on {@code null}
     */
    private static Document getDocument(String url) {
        if (url == null) {
            return null;
        }
        try {
            if (isRetry) {
                // Two-second back-off before repeating a request that failed earlier.
                Thread.sleep(2000);
            }
            Response response = getConnection(url).execute();
            while (response.statusCode() != HttpStatus.SC_OK) {
                Thread.sleep(2000);
                response = getConnection(url).execute();
            }
            // Parse the response that was already received. Previously a second request was sent
            // as POST and its body parsed instead, doubling the load on the server.
            return response.parse();
        } catch (Exception e) {
            log.info("无法链接,正在重试~");
        }
        return null;
    }

    /**
     * Downloads every page below one province cell of the index page and stores it under REGION.
     *
     * <p>City and county pages are always fetched; town and street pages only when
     * {@code IS_GET_TOEN} / {@code IS_GET_STREET} are enabled. Cells without a province link are
     * ignored and do not arrive at the barrier. For every other cell the barrier is reached even
     * when the crawl fails, so the remaining tasks and the waiting main thread are never blocked
     * by a single broken province.
     *
     * @param province      one cell of the province table
     * @param cyclicBarrier barrier shared by all province tasks
     */
    public static void writeToLocal(Element province,CyclicBarrier cyclicBarrier) {
        String proName = null;
        try {
            Elements proSelect = province.select(TDA);
            proName = proSelect.text();
            if (StringUtils.isNotEmpty(proName)) {
                String proHref = proSelect.attr(HREF);
                Document cityDocument = fetchUntilAvailable(BASE_URL.concat(proHref));
                Elements cities = getCities(cityDocument, proName, proHref, OperationType.WRITE);
                for (Element city : cities) {
                    writeCity(city, proName);
                }
                log.info("{} 所有区数据爬取完毕。", proName);
            }
        } catch (Exception exception) {
            log.error("Write error", exception);
        } finally {
            if (StringUtils.isNotEmpty(proName)) {
                arriveAtBarrier(cyclicBarrier, proName);
            }
        }
    }

    /** Stores the county page of one city row and, if enabled, descends into its counties. */
    private static void writeCity(Element city, String proName) throws IOException {
        Elements citySelect = city.select(TDA);
        // A usable row carries two links: the code and the name.
        if (citySelect.size() < 2) {
            return;
        }
        String cityName = citySelect.get(1).text();
        String countyUrl = citySelect.get(0).absUrl(HREF);
        String fileName = relativeFileName(countyUrl);
        Document countyDocument = fetchUntilAvailable(countyUrl);
        Elements counties = getCounties(countyDocument, proName, cityName, fileName, OperationType.WRITE);
        if (IS_GET_TOEN) {
            for (Element county : counties) {
                writeCounty(county, proName, cityName);
            }
        }
    }

    /** Stores the town page of one county row and, if enabled, descends into its towns. */
    private static void writeCounty(Element county, String proName, String cityName) throws IOException {
        Elements countySelect = county.select(TDA);
        if (countySelect.size() < 2) {
            return;
        }
        String countyName = countySelect.get(1).text();
        String townUrl = countySelect.get(0).absUrl(HREF);
        String countyFileName = relativeFileName(townUrl);
        Document townDocument = fetchUntilAvailable(townUrl);
        Elements towns = getTowns(townDocument, proName, cityName, countyName, countyFileName, OperationType.WRITE);
        if (IS_GET_STREET) {
            for (Element town : towns) {
                writeTown(town, proName, cityName, countyName);
            }
        }
    }

    /** Stores the street/village page of one town row. */
    private static void writeTown(Element town, String proName, String cityName, String countyName)
        throws IOException {
        Elements townSelect = town.select(TDA);
        if (townSelect.size() < 2) {
            return;
        }
        String townName = townSelect.get(1).text();
        String streetUrl = townSelect.get(0).absUrl(HREF);
        String townFileName = relativeFileName(streetUrl);
        Document streetDocument = fetchUntilAvailable(streetUrl);
        getStreets(streetDocument, proName, cityName, countyName, townName, townFileName, OperationType.WRITE);
    }

    /** Repeats the download of {@code url} until a document is returned, maintaining the retry flag. */
    private static Document fetchUntilAvailable(String url) {
        Document document = getDocument(url);
        while (document == null) {
            isRetry = true;
            document = getDocument(url);
        }
        isRetry = false;
        return document;
    }

    /** Returns the part of {@code absoluteUrl} after BASE_URL, which doubles as the local file name. */
    private static String relativeFileName(String absoluteUrl) {
        // Plain prefix handling instead of String.split, which would interpret BASE_URL as a regex.
        if (!absoluteUrl.startsWith(BASE_URL)) {
            throw new IllegalArgumentException("Page URL is not below " + BASE_URL + ": " + absoluteUrl);
        }
        return absoluteUrl.substring(BASE_URL.length());
    }

    /** Waits at the barrier and reports, rather than propagates, a broken or interrupted wait. */
    private static void arriveAtBarrier(CyclicBarrier cyclicBarrier, String proName) {
        try {
            cyclicBarrier.await();
        } catch (InterruptedException interrupted) {
            Thread.currentThread().interrupt();
            log.error("Interrupted while waiting for the other provinces, province = {}", proName);
        } catch (Exception barrierFailure) {
            log.error("Barrier wait failed, province = {}", proName, barrierFailure);
        }
    }

    /**
     * Selects the province cells of the index page; in WRITE mode the page is also stored
     * locally under the province level name plus ".html".
     *
     * @param provinceDocument parsed index page
     * @param operationType    WRITE to store the page as well, otherwise only select
     * @return the selected elements, or {@code null} when {@code provinceDocument} is {@code null}
     * @throws IOException when storing the page fails
     */
    public static Elements getProvinces(Document provinceDocument,OperationType operationType) throws IOException {
        final String indexFileName = AreaLevelEnum.PROVINCE.getLevel().concat(HTML);
        return getElements(
            provinceDocument,
            AreaLevelEnum.PROVINCE,
            null,
            null,
            null,
            null,
            indexFileName,
            operationType);
    }

    /**
     * Selects the city rows of a province page; in WRITE mode the page is also stored as {@code fileName}.
     *
     * @param cityDocument  parsed province page listing the cities
     * @param proName       province name, used for logging
     * @param fileName      file name relative to REGION
     * @param operationType WRITE to store the page as well, otherwise only select
     * @return the selected rows, or {@code null} when {@code cityDocument} is {@code null}
     * @throws IOException when storing the page fails
     */
    public static Elements getCities(Document cityDocument,String proName,String fileName,OperationType operationType) throws IOException {
        final AreaLevelEnum cityLevel = AreaLevelEnum.CITY;
        return getElements(cityDocument, cityLevel, proName, null, null, null, fileName, operationType);
    }

    /**
     * Selects the county/district rows of a city page; in WRITE mode the page is also stored as {@code fileName}.
     *
     * @param countyDocument parsed city page listing the counties
     * @param proName        province name, used for logging
     * @param cityName       city name, used for logging
     * @param fileName       file name relative to REGION
     * @param operationType  WRITE to store the page as well, otherwise only select
     * @return the selected rows, or {@code null} when {@code countyDocument} is {@code null}
     * @throws IOException when storing the page fails
     */
    public static Elements getCounties(Document countyDocument,String proName,String cityName,String fileName,OperationType operationType) throws IOException {
        final AreaLevelEnum countyLevel = AreaLevelEnum.COUNTY;
        return getElements(countyDocument, countyLevel, proName, cityName, null, null, fileName, operationType);
    }

    /**
     * Selects the town rows of a county page; in WRITE mode the page is also stored as {@code fileName}.
     *
     * @param townDocument  parsed county page listing the towns
     * @param proName       province name, used for logging
     * @param cityName      city name, used for logging
     * @param countyName    county name, used for logging
     * @param fileName      file name relative to REGION
     * @param operationType WRITE to store the page as well, otherwise only select
     * @return the selected rows, or {@code null} when {@code townDocument} is {@code null}
     * @throws IOException when storing the page fails
     */
    public static Elements getTowns(Document townDocument,String proName,String cityName,
        String countyName,String fileName,OperationType operationType) throws IOException {
        final AreaLevelEnum townLevel = AreaLevelEnum.TOWN;
        return getElements(townDocument, townLevel, proName, cityName, countyName, null, fileName, operationType);
    }

    /**
     * Selects the street/village cells of a town page; in WRITE mode the page is also stored as {@code fileName}.
     *
     * @param streetDocument parsed town page listing the streets/villages
     * @param proName        province name, used for logging
     * @param cityName       city name, used for logging
     * @param countyName     county name, used for logging
     * @param townName       town name, used for logging
     * @param fileName       file name relative to REGION
     * @param operationType  WRITE to store the page as well, otherwise only select
     * @return the selected cells, or {@code null} when {@code streetDocument} is {@code null}
     * @throws IOException when storing the page fails
     */
    public static Elements getStreets(Document streetDocument,String proName,String cityName,
        String countyName,String townName,String fileName,OperationType operationType) throws IOException {
        final AreaLevelEnum villageLevel = AreaLevelEnum.VILLAGE;
        return getElements(
            streetDocument, villageLevel, proName, cityName, countyName, townName, fileName, operationType);
    }


    /**
     * Selects the table entries of {@code level} from {@code document} and, in WRITE mode,
     * stores the page under REGION + {@code fileName}.
     *
     * <p>For the PROVINCE and VILLAGE levels the individual table cells are selected, for all
     * other levels the table rows. The name parameters only decide which progress message is logged.
     *
     * @param document      parsed page, may be {@code null}
     * @param level         administrative level whose table is addressed
     * @param proName       province name or {@code null}
     * @param cityName      city name or {@code null}
     * @param countyName    county name or {@code null}
     * @param townName      town name or {@code null}
     * @param fileName      file name relative to REGION, used in WRITE mode
     * @param operationType WRITE to store the page as well, otherwise only select
     * @return the selected elements, or {@code null} when {@code document} is {@code null}
     * @throws IOException when creating or writing the local file fails
     */
    private static Elements getElements(Document document,AreaLevelEnum level,String proName, String cityName,
        String countyName,
        String townName,
        String fileName,
        OperationType operationType) throws IOException {
        try {
            String levelName = level.getLevel();
            if (document == null) {
                return null;
            }

            String rowSelector = MessageFormatter.format(TEMPLATE, levelName, levelName).getMessage();
            boolean selectCells = AreaLevelEnum.PROVINCE == level || AreaLevelEnum.VILLAGE == level;
            Elements selected = document.select(selectCells ? rowSelector.concat(TD) : rowSelector);

            if (OperationType.WRITE != operationType) {
                return selected;
            }

            File target = new File(REGION.concat(fileName));
            File directory = target.getParentFile();
            if (!directory.exists()) {
                directory.mkdirs();
            }
            if (!target.exists()) {
                target.createNewFile();
            }
            FileUtils.writeStringToFile(target, document.html(), "gb2312");

            String path = target.getAbsolutePath();
            log.info("<---------------------------------->");
            // The most specific non-empty name decides which level is being reported.
            if (StringUtils.isNotEmpty(townName)) {
                log.info("{}-{}-{}-{} 所有街道/村 网页数据写入到本地完成~~,文件所在地址->{}",proName,cityName,countyName,townName, path);
            } else if (StringUtils.isNotEmpty(countyName)) {
                log.info("{}-{}-{} 所有镇 网页数据写入到本地完成~~,文件所在地址->{}",proName,cityName,countyName,path);
            } else if (StringUtils.isNotEmpty(cityName)) {
                log.info("{}-{} 所有区/县 网页数据写入到本地完成~~,文件所在地址->{}",proName,cityName,path);
            } else if (StringUtils.isNotEmpty(proName)) {
                log.info("{} 所有市 网页数据写入到本地完成~~,文件所在地址->{}",proName,path);
            } else {
                log.info("所有省 网页数据写入到本地完成~~,文件所在地址:{}",path);
            }
            log.info("<---------------------------------->");
            return selected;
        } catch (Exception e) {
            e.printStackTrace();
            log.error("GetAndWriteProvince method error,error = {}",e);
            throw e;
        }
    }

    /** Pool task that downloads all pages of one province cell via {@code writeToLocal}. */
    private static class SyncWrite implements Runnable {

        /** Province cell taken from the index page. */
        private final Element provinceCell;

        /** Barrier shared by all province download tasks. */
        private final CyclicBarrier barrier;

        public SyncWrite(Element element,CyclicBarrier cyclicBarrier){
            this.provinceCell = element;
            this.barrier = cyclicBarrier;
        }

        @Override
        public void run() {
            writeToLocal(this.provinceCell, this.barrier);
        }
    }


    /**
     * Starts the download phase: fetches the index page and submits one task per province.
     *
     * <p>When the REGION directory already holds more than one entry the download is skipped.
     * The {@code write} latch is released when the download is skipped, when all province tasks
     * have reached the barrier, or when the download could not be started at all, so a caller
     * waiting on the latch never blocks forever.
     */
    public static void write(){
        // list() returns null when the directory is missing or unreadable; treat that as "nothing cached".
        String[] cached = file.list();
        if (cached != null && cached.length > 1) {
            log.info("所有省市区数据已经写入到本地完毕,若要更新所有省市区文件,请删除该文件夹即可,文件地址:{}",file.getAbsolutePath());
            write.countDown();
            return;
        }
        boolean started = false;
        try {
            Document document = getDocument(AREA_URL);
            System.out.println("等待所有 省份数据 爬取 完毕 !!!!!!!");
            Elements provinces = getProvinces(document,OperationType.WRITE);

            // Only cells carrying a province link arrive at the barrier, so count exactly those
            // instead of assuming the table contains a single empty cell.
            int parties = 0;
            if (null != provinces) {
                for (Element province : provinces) {
                    if (StringUtils.isNotEmpty(province.select(TDA).text())) {
                        parties++;
                    }
                }
            }

            if (parties > 0) {
                CyclicBarrier cyclicBarrier = new CyclicBarrier(parties, () -> {
                    log.info("===================== 所有省数据写入完毕 ========================");
                    write.countDown();
                });
                // Download the city level (and below) of every province in parallel.
                for (Element province : provinces) {
                    threadPool.submit(new SyncWrite(province,cyclicBarrier));
                }
                started = true;
            } else {
                log.error("No province could be read from the index page {}", AREA_URL);
            }
        } catch (Exception e) {
            log.error("Unable to start the download of the province pages", e);
        } finally {
            if (!started) {
                // Nothing will ever trip the barrier, so release the waiting caller here.
                write.countDown();
            }
        }
    }

    /**
     * Reads a locally stored page (decoded as gb2312) and parses it.
     *
     * @param fileAddr path of the local html file
     * @return the parsed page, or {@code null} when the file cannot be read or parsed
     */
    private static Document readStringToDocument(String fileAddr){
        Document parsed = null;
        try {
            File source = new File(fileAddr);
            String html = FileUtils.readFileToString(source,"gb2312");
            parsed = Jsoup.parse(html);
        } catch (Exception e) {
            e.printStackTrace();
            log.error("Read string to document error,error = {}",e);
        }
        return parsed;
    }

    private static class Area implements Runnable {

        private Boolean isGetTown;

        private Boolean isGetStreet;

        private String url;

        private Province province;

        private CyclicBarrier cyclicBarrier;

        private List<Province> provincesList;

        public Area(Boolean isGetTown, Boolean isGetStreet, String url, Province province,CyclicBarrier cyclicBarrier
            ,List<Province> provincesList) {
            this.isGetTown = isGetTown;
            this.isGetStreet = isGetStreet;
            this.url = url;
            this.province = province;
            this.cyclicBarrier = cyclicBarrier;
            this.provincesList = provincesList;
        }

        @SneakyThrows
        @Override
        public void run() {
            Province province = this.province;
            /** 完成省市区 街道封装 **/
            String cityUrl = url.concat(province.getProvinceCode().concat(HTML));
            Document document = readStringToDocument(cityUrl);
            String cityLevel = AreaLevelEnum.CITY.getLevel();
            Elements cityElements = document
                .select(MessageFormatter.format(TEMPLATE, cityLevel, cityLevel).getMessage());
            List cities = Optional.ofNullable(cityElements).filter(a -> a.size() > 0).map(u -> {
                return u.stream().map(k -> {
                    return Optional.ofNullable(k.select(TDA)).filter(a -> a.size() > 0).map(a -> {
                        /** 代码 **/
                        Element codeElement = a.get(0);
                        /** 地级市名 **/
                        Element nameElement = a.get(1);

                        City city = new City();
                        /** 获得 城市代码 **/
                        city.setCityCode(codeElement.text().substring(0,6));
                        city.setCityName(nameElement.text());
                        city.setLevel(AreaLevelEnum.CITY.getLevel());

                        /** 区 地址URL **/
                        String countyUrl = url.concat(codeElement.attr(HREF));
                        Document countyDocument = readStringToDocument(countyUrl);
                        String countLevel = AreaLevelEnum.COUNTY.getLevel();
                        Elements countyElements = countyDocument
                            .select(MessageFormatter.format(TEMPLATE, countLevel, countLevel).getMessage());
                        List<County> counties = Optional.ofNullable(countyElements).filter(c -> c.size() > 0).map(x -> {
                            return x.stream().map(c -> {
                                return Optional.ofNullable(c.select(TDA)).filter(aq -> aq.size() > 0).map(aq -> {
                                    Element countyCode = aq.get(0);
                                    Element countyName = aq.get(1);
                                    County county = new County();
                                    county.setCountyCode(countyCode.text().substring(0,6));
                                    county.setCountyName(countyName.text());
                                    county.setLevel(AreaLevelEnum.COUNTY.getLevel());
                                    /** 街道地址URL **/
                                    String townUrl = url.concat(countyCode.attr(HREF));
                                    /** 是否读取 镇 **/
                                    if (isGetTown) {
                                        Document townDocument = readStringToDocument(townUrl);
                                        String townLevel = AreaLevelEnum.TOWN.getLevel();
                                        Elements townElements = townDocument.select(
                                            MessageFormatter.format(TEMPLATE, townLevel, townLevel).getMessage());
                                        List<Town> towns = Optional.ofNullable(townElements).filter(w -> w.size() > 0)
                                            .map(e -> {
                                                return e.stream().map(r -> {
                                                    return Optional.ofNullable(e.select(TDA))
                                                        .filter(qq -> qq.size() > 0).map(qq -> {
                                                            Element townCode = qq.get(0);
                                                            Element townName = qq.get(1);
                                                            Town town = new Town();
                                                            town.setLevel(AreaLevelEnum.TOWN.getLevel());
                                                            town.setTownCode(townCode.text());
                                                            town.setTownName(townName.text());

                                                            /** 是否读取 街道 **/
                                                            if (isGetStreet) {
                                                                String streetUrl = url.concat(townCode.attr(HREF));
                                                                Document streetDocument = readStringToDocument(streetUrl);
                                                                String streetLevel = AreaLevelEnum.VILLAGE.getLevel();
                                                                Elements streetElements = streetDocument.select(
                                                                    MessageFormatter
                                                                        .format(TEMPLATE, streetLevel, streetLevel)
                                                                        .getMessage());
                                                                List<Street> streets = Optional
                                                                    .ofNullable(streetElements)
                                                                    .filter(t -> t.size() > 0).map(t -> {
                                                                        return t.stream().map(v -> {
                                                                            return Optional.ofNullable(v.select(TD))
                                                                                .filter(we -> we.size() > 0).map(we -> {
                                                                                    Element streetCode = we.get(0);
                                                                                    Element streetTypeCode = we.get(1);
                                                                                    Element streetName = we.get(2);
                                                                                    Street street = new Street();
                                                                                    street.setLevel(
                                                                                        AreaLevelEnum.VILLAGE
                                                                                            .getLevel());
                                                                                    street.setStreetCode(
                                                                                        streetCode.text());
                                                                                    street.setStreetTypeCode(
                                                                                        streetTypeCode.text());
                                                                                    street.setStreetName(
                                                                                        streetName.text());
                                                                                    return street;
                                                                                }).orElse(null);
                                                                        }).filter(rs -> null != rs)
                                                                            .collect(Collectors.toList());
                                                                    }).orElse(null);
                                                                town.setStreets(streets);
                                                            }
                                                            return town;
                                                        }).orElse(null);
                                                }).filter(r -> null != r).collect(Collectors.toList());
                                            }).orElse(null);
                                        county.setTowns(towns);
                                    }
                                    return county;
                                }).orElse(null);
                            }).filter(r -> null != r).collect(Collectors.toList());
                        }).orElse(null);
                        city.setCounties(counties);
                        return city;
                    }).orElse(null);
                }).filter(r -> null != r).collect(Collectors.toList());
            }).orElse(null);
            province.setCities(cities);
            System.out.println(province.getProvinceName() + ":" + gson.toJson(province));
            provincesList.add(province);
            cyclicBarrier.await();
        }
    }



    /**
     * Entry point: downloads the pages if necessary, then assembles every province in
     * parallel and prints the sorted result as JSON.
     *
     * <p>The worker pool is shut down by the barrier action once all provinces are assembled,
     * or immediately when the assembly could not be started, so the JVM can exit in both cases.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        boolean dispatched = false;
        try {
            write();
            write.await();
            String proLevel = AreaLevelEnum.PROVINCE.getLevel();
            Document proDocument = readStringToDocument(REGION.concat(File.separator).concat(proLevel).concat(HTML));
            Elements provinceCells = getProvinces(proDocument, OperationType.READ);

            // Build the province shells first so the barrier can be sized by the number of tasks
            // that will really run; cells without a link produce no task.
            List<Province> pending = Lists.newLinkedList();
            if (provinceCells != null) {
                for (Element cell : provinceCells) {
                    Elements a = cell.select(A);
                    if (StringUtils.isNotEmpty(a.text())) {
                        Province province = new Province();
                        province.setLevel(AreaLevelEnum.PROVINCE.getLevel());
                        // The first two characters of the link are the province code.
                        province.setProvinceCode(a.attr(HREF).trim().substring(0, 2));
                        province.setProvinceName(a.text());
                        pending.add(province);
                    }
                }
            }
            if (pending.isEmpty()) {
                log.error("No province could be read from the local pages under {}", file.getAbsolutePath());
                return;
            }

            List<Province> provincesList = Lists.newLinkedList();
            CyclicBarrier barrier = new CyclicBarrier(pending.size(), () -> {
                log.info("所有省份json数据组装完毕!!!!");
                // Order the provinces by code before printing.
                Collections.sort(provincesList, Comparator.comparing(Province::getProvinceCode));
                System.out.println("<---------- 执行结果开始 --------->");
                System.out.println(gson.toJson(provincesList));
                System.out.println("<---------- 执行结果结束 --------->");
                threadPool.shutdown();
            });
            for (Province province : pending) {
                threadPool.execute(new Area(IS_GET_TOEN, IS_GET_STREET, REGION, province, barrier, provincesList));
            }
            dispatched = true;
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            log.error("Occur error", e);
        } finally {
            if (!dispatched) {
                // The barrier action will never run, so stop the pool here to let the JVM exit.
                threadPool.shutdownNow();
            }
        }
    }
}

 最终结果:

 项目和转出的json文件等审核完毕,我会发布到CSDN,勿催。

省市区Json文件地址:点击下载2020年国家省市区Json文件

爬虫项目地址:点击下载项目

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值