使用 Jsoup 从国家统计局官网爬取省、市、县三级行政区划数据并生成 JSON

1 篇文章 0 订阅
1 篇文章 0 订阅

使用 Jsoup 和多线程从国家统计局官网爬取全国省、市、县三级行政区划数据,并将结果输出为 JSON:

package com.imant.crawler.controller;

import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.imant.crawler.vo.AreaVo;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.*;

/**
 * 用于去www.stats.gov.cn爬取省市县的数据
 */
/**
 * Crawls www.stats.gov.cn for the province / city / county listing and prints it as JSON.
 *
 * <p>The province index page is fetched on the calling thread; each province is then
 * crawled by its own pool task, which returns the populated {@link AreaVo}. Results are
 * handed back through {@link Future}s, so no mutable state is shared between threads.
 */
public class AreaCrawler {

    private static final Gson GSON = new GsonBuilder().create();

    private static final String URL_PREFIX = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/";

    private static final String INDEX_URL = URL_PREFIX + "index.html";

    /** Maximum number of times a single page is requested before the crawl gives up. */
    private static final int MAX_ATTEMPTS = 10;

    /** Base pause between attempts; the pause grows linearly with the attempt number. */
    private static final long RETRY_BACKOFF_MILLIS = 500L;

    /** Number of leading characters of the listed code that are kept as the area code. */
    private static final int AREA_CODE_LENGTH = 6;

    private static final ThreadFactory namedThreadFactory = new ThreadFactoryBuilder()
            .setNameFormat("areaCrawler-pool-%d").build();

    // Worker pool: one task per province.
    private static final ExecutorService pool = new ThreadPoolExecutor(5, 200, 0L,
            TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>(16), namedThreadFactory,
            new ThreadPoolExecutor.AbortPolicy());

    /**
     * Downloads and parses a single page with one HTTP GET.
     *
     * @param url absolute URL of the page
     * @return the parsed document
     * @throws IOException if the request fails or the response status is not 200
     */
    public Document getDocument(String url) throws IOException {
        Connection.Response response = Jsoup.connect(url)
                .timeout(5000)
                .header("Accept", "*/*")
                // NOTE(review): "br" is no longer advertised; jsoup is not expected to decode
                // a Brotli-compressed body -- confirm against the jsoup version in use.
                .header("Accept-Encoding", "gzip")
                .header("Accept-Language", "zh-CN,zh;q=0.9")
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")
                .execute();
        if (response.statusCode() != 200) {
            throw new IOException("Unexpected HTTP status " + response.statusCode() + " for " + url);
        }
        // Parse the body that was already downloaded instead of issuing a second request.
        return response.parse();
    }

    /**
     * Fetches a page, retrying with a growing pause when the request fails.
     *
     * @param url absolute URL of the page
     * @return the parsed document
     * @throws IOException          if every one of the {@link #MAX_ATTEMPTS} attempts failed;
     *                              the last failure is attached as the cause
     * @throws InterruptedException if the thread is interrupted while pausing between attempts
     */
    private Document fetchWithRetry(String url) throws IOException, InterruptedException {
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                return getDocument(url);
            } catch (IOException e) {
                lastFailure = e;
                System.err.println("Fetch failed (attempt " + attempt + "/" + MAX_ATTEMPTS
                        + ") for " + url + ": " + e);
                if (attempt < MAX_ATTEMPTS) {
                    Thread.sleep(RETRY_BACKOFF_MILLIS * attempt);
                }
            }
        }
        throw new IOException("Giving up on " + url + " after " + MAX_ATTEMPTS + " attempts", lastFailure);
    }

    /**
     * Builds an area from a table row whose first cell holds the code and whose second
     * cell holds the name.
     *
     * @param row a {@code tr} element from a city or county table
     * @return the area, or {@code null} when the row has fewer than two cells
     */
    private static AreaVo toArea(Element row) {
        Elements cells = row.select("td");
        if (cells.size() < 2) {
            return null;
        }
        String listedCode = cells.get(0).text();
        AreaVo area = new AreaVo();
        // Keep only the leading characters; shorter text is kept whole rather than failing.
        area.setCode(listedCode.substring(0, Math.min(AREA_CODE_LENGTH, listedCode.length())));
        area.setName(cells.get(1).text());
        return area;
    }

    /**
     * Crawls every province and prints the complete tree, sorted by code, as JSON.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while fetching or waiting
     * @throws ExecutionException   if a province task failed
     * @throws UncheckedIOException if the province index page cannot be loaded
     */
    public static void main(String[] args) throws InterruptedException, ExecutionException {
        AreaCrawler crawler = new AreaCrawler();
        List<AreaVo> provinces = new ArrayList<>();
        try {
            Document indexPage;
            try {
                indexPage = crawler.fetchWithRetry(INDEX_URL);
            } catch (IOException e) {
                throw new UncheckedIOException("Unable to load the province index " + INDEX_URL, e);
            }

            // Submit one task per province cell that carries both a name and a link.
            List<Future<AreaVo>> pending = new ArrayList<>();
            for (Element cell : indexPage.select("table.provincetable tbody tr.provincetr td")) {
                Element link = cell.select("a").first();
                if (StringUtils.isBlank(cell.text()) || link == null) {
                    continue;
                }
                String href = link.attr("href");
                if (href.length() < 2) {
                    continue;
                }
                AreaVo province = new AreaVo();
                province.setName(cell.text());
                // The first two characters of the link are used as the province prefix.
                province.setCode(href.substring(0, 2) + "0000");
                pending.add(pool.submit(new ProvinceTask(crawler, URL_PREFIX + href, province)));
            }

            // Wait for every province; a failed task surfaces here as ExecutionException.
            for (Future<AreaVo> result : pending) {
                provinces.add(result.get());
            }
        } finally {
            // Always stop the pool, otherwise its non-daemon workers keep the JVM alive.
            pool.shutdownNow();
        }
        Collections.sort(provinces, Comparator.comparing(AreaVo::getCode));
        System.out.println(GSON.toJson(provinces));
    }

    /**
     * Pool task that fills one province with its cities and, for each city that links to
     * a county page, the counties of that city.
     */
    private static class ProvinceTask implements Callable<AreaVo> {
        private final AreaCrawler crawler;
        private final String url;
        private final AreaVo areaVo;

        ProvinceTask(AreaCrawler crawler, String url, AreaVo areaVo) {
            this.crawler = crawler;
            this.url = url;
            this.areaVo = areaVo;
        }

        /**
         * @return the province passed to the constructor, with its children populated
         * @throws IOException          if a page could not be fetched after all retries
         * @throws InterruptedException if interrupted while pausing between retries
         */
        @Override
        public AreaVo call() throws IOException, InterruptedException {
            Document cityPage = crawler.fetchWithRetry(url);
            List<AreaVo> cities = new ArrayList<>();
            for (Element cityRow : cityPage.select("table.citytable tbody tr.citytr")) {
                AreaVo city = toArea(cityRow);
                if (city == null) {
                    continue;
                }
                // Rows without a link have no county page to follow.
                Element countyLink = cityRow.select("td a").first();
                if (countyLink != null) {
                    Document countyPage = crawler.fetchWithRetry(URL_PREFIX + countyLink.attr("href"));
                    List<AreaVo> counties = new ArrayList<>();
                    for (Element countyRow : countyPage.select("table tbody tr.countytr")) {
                        AreaVo county = toArea(countyRow);
                        if (county != null) {
                            counties.add(county);
                        }
                    }
                    city.setChildren(counties);
                }
                cities.add(city);
            }
            areaVo.setChildren(cities);
            System.out.println(areaVo.getName() + ":" + GSON.toJson(areaVo));
            return areaVo;
        }
    }
}
package com.imant.crawler.vo;

import lombok.Data;

import java.util.List;

/**
 * One node of the administrative-division tree: a code, a display name and the
 * areas directly beneath it. Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class AreaVo {

    /** Area code. */
    private String code;

    /** Area name. */
    private String name;

    /** Direct child areas; stays {@code null} until a list is set. */
    private List<AreaVo> children;

    /**
     * Renders the node as {@code AreaVo{code='..', name='..', children=..}}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("AreaVo{");
        text.append("code='").append(code).append('\'');
        text.append(", name='").append(name).append('\'');
        text.append(", children=").append(children);
        text.append('}');
        return text.toString();
    }
}
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <!-- HTTP fetching and HTML parsing (Jsoup, Connection, Document, Element, Elements) used by AreaCrawler -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
        <!-- Supplies the @Data annotation on AreaVo -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.0</version>
            <scope>provided</scope>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
        <!-- Supplies StringUtils, used by AreaCrawler to skip blank province cells -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.7</version>
        </dependency>


        <!-- https://mvnrepository.com/artifact/com.google.code.gson/gson -->
        <!-- Supplies Gson / GsonBuilder, used by AreaCrawler to serialize the result to JSON -->
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.5</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
        <!-- Supplies ThreadFactoryBuilder, used by AreaCrawler to name its pool threads -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>25.0-jre</version>
        </dependency>



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值