爬取杭州市区域数据

1、添加jar包

		<dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.3</version>
        </dependency>

2、代码

package com.fy.microservice.government.service.impl;

import com.fy.microservice.government.utils.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Utility for fetching region data.
 *
 * <p>Downloads region listing pages below {@link #ROOT_URL}, extracts the rows of the
 * listing table with jsoup and prints one {@code INSERT INTO `sys_org`} statement per
 * row to standard output. Rows that carry a link are followed recursively.
 *
 * <p>The nesting level of a page is derived purely from the length of its relative
 * path (12, 14 or 17 characters); see {@link #doGet(String)}.
 */
public class AreaUtils {

    /** Common prefix of every page URL requested by this class. */
    private static final String ROOT_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";

    /** Matches strings made of ASCII digits only; compiled once instead of per call. */
    private static final Pattern DIGITS_ONLY = Pattern.compile("[0-9]*");

    /**
     * Starts the crawl at the page {@code 33/3301.html}.
     *
     * @param args unused
     * @throws IOException if a page cannot be fetched or read
     */
    public static void main(String[] args) throws IOException {
        doGet("33/3301.html");

    }

    /**
     * Fetches one listing page and prints an INSERT statement for each of its rows.
     *
     * <p>The URL is built from a prefix chosen by the length of {@code keywords}:
     * 14 characters selects {@code ROOT_URL + "33/"}, 17 characters selects
     * {@code ROOT_URL + "33/01/"}, anything else selects {@code ROOT_URL}.
     * NOTE(review): these prefixes are hard-coded, so links found on pages outside
     * {@code 33/01/} would resolve to the wrong URL - confirm the intended scope.
     *
     * <p>The whole response is parsed as a single document. The connection is released
     * before linked pages are followed, so recursion does not keep connections open.
     *
     * @param keywords relative path of the page, e.g. {@code "33/3301.html"}
     * @return always the empty string
     * @throws IOException if the page cannot be fetched or read
     */
    public static String doGet(String keywords) throws IOException {
        URL url = new URL(urlPrefixFor(keywords) + keywords);
        URLConnection connection = url.openConnection();
        HttpURLConnection httpConnection = (HttpURLConnection) connection;
        Document page = null;
        try {
            int responseCode = httpConnection.getResponseCode();
            if (responseCode == HttpURLConnection.HTTP_OK) {
                System.err.println("成功");
                try (InputStream in = httpConnection.getInputStream()) {
                    // A null charset lets jsoup pick the encoding from the page's meta tag
                    // (falling back to UTF-8) instead of relying on the platform default.
                    page = Jsoup.parse(in, null, url.toString());
                }
            } else {
                System.err.println("失败");
            }
        } finally {
            httpConnection.disconnect();
        }
        if (page != null) {
            printRows(page, keywords);
        }
        return "";
    }

    /**
     * Parses an HTML string and prints an INSERT statement for each listing row in it,
     * following row links through {@link #doGet(String)}.
     *
     * @param html     HTML text containing the listing table
     * @param parentId relative path of the page the HTML came from; its length selects
     *                 the table class and level, and its value is written to the
     *                 {@code pid} column
     * @return always a new, empty map
     * @throws IOException if a linked page cannot be fetched or read
     */
    public static Map<String, String> getData(String html, String parentId) throws IOException {
        printRows(Jsoup.parse(html), parentId);
        return new HashMap<>();
    }

    /**
     * Reports whether a string consists solely of the ASCII digits 0-9.
     *
     * @param str text to test
     * @return {@code true} if every character is a digit; the empty string also
     *         yields {@code true}
     */
    public static boolean xx(String str) {
        return DIGITS_ONLY.matcher(str).matches();
    }

    /** Selects the URL prefix for a relative page path based on the path's length. */
    private static String urlPrefixFor(String keywords) {
        switch (keywords.length()) {
            case 14:
                return ROOT_URL + "33/";
            case 17:
                return ROOT_URL + "33/01/";
            default:
                return ROOT_URL;
        }
    }

    /** Prints one INSERT statement per listing row of {@code doc} and follows row links. */
    private static void printRows(Document doc, String parentId) throws IOException {
        // The CSS class prefix, the level column and the code truncation all depend
        // on the length of the page path.
        String classPrefix = "city";
        String level = "3";
        int codeLength = -1; // -1 keeps the code cell as is
        switch (parentId.length()) {
            case 12:
                classPrefix = "county";
                level = "4";
                codeLength = 6;
                break;
            case 14:
                classPrefix = "town";
                level = "5";
                codeLength = 9;
                break;
            case 17:
                classPrefix = "village";
                level = "6";
                break;
            default:
                break;
        }

        // Pages (or fragments) without the expected table are skipped instead of
        // failing with a NullPointerException.
        Element table = doc.body().getElementsByClass(classPrefix + "table").first();
        if (table == null || table.children().isEmpty()) {
            return;
        }
        // The first child of the table is searched for the data rows.
        Elements rows = table.children().get(0).getElementsByClass(classPrefix + "tr");

        for (int i = 0; i < rows.size(); i++) {
            Elements cells = rows.get(i).getElementsByTag("td");
            if (cells.size() < 2) {
                continue; // not a code/name row
            }

            String code = cells.get(0).text();
            if (codeLength > 0 && code.length() >= codeLength) {
                code = code.substring(0, codeLength);
            }

            String link = cells.get(0).getElementsByTag("a").attr("href");

            String name = cells.get(1).text();
            // When the second cell is purely numeric the name is taken from the third
            // cell, provided the row actually has one.
            if (xx(name) && cells.size() > 2) {
                name = cells.get(2).text();
            }

            // NOTE(review): pid receives the relative page path (e.g. "33/3301.html"),
            // not a region code - confirm this is what sys_org.pid should hold.
            System.out.println("INSERT INTO `sys_org` (`id`, `name`, `pid`, `sort`, `level`, `longcode`, `code`) VALUES (" +
                    code + ",'" + escapeSql(name) + "','" + parentId + "'," + (i + 1) + "," + level + ",'" + code + "','" +
                    code + "');");

            if (StringUtils.isNotBlank(link)) {
                doGet(link);
            }
        }
    }

    /** Escapes backslashes and single quotes so scraped text cannot break the SQL literal. */
    private static String escapeSql(String value) {
        return value.replace("\\", "\\\\").replace("'", "''");
    }

}

3、输出结果

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值