java使用jsoup抓取省市县乡四级数据

原理:使用 jsoup 解析国家统计局网站的前端页面数据,导入自己的数据库。

核心代码
package com.itzhongzi.area;

import com.alibaba.fastjson.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;

/**
 * Crawls the province / city / county / town pages reachable from {@link #SITE_URL} with jsoup
 * and writes the collected hierarchy to a text file as SQL {@code insert} statements.
 *
 * <p>Results, error names and error counters are kept in static fields, so the crawl is meant
 * to be run once per JVM through {@link #main(String[])}.
 */
public class Application {
    /**
     * Index page listing the provinces. The default points at a copy served from localhost;
     * the official page is
     * http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
     */
    public static String SITE_URL = "http://localhost:8080/local/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html";

    /** Output file used when no path is given as the first command-line argument. */
    private static final String DEFAULT_OUTPUT_PATH = "/Users/huanghainongye/Desktop/area/area.txt";

    /**
     * Per-request timeout in milliseconds. jsoup treats 0 as "wait forever", which lets a single
     * stalled connection hang the whole crawl, so a finite value is used instead.
     */
    private static final int TIMEOUT_MILLIS = 30000;

    /** Province codes taken from the link file name are right-padded with '0' to this length. */
    private static final int PROVINCE_CODE_LENGTH = 6;

    private static final List<RegionEntry> regions = new ArrayList<RegionEntry>();
    private static final List<String> errorList = new ArrayList<String>();
    private static int shengNum = 0;
    private static int shiNum = 0;
    private static int xianNum = 0;
    private static int zhenNum = 0;

    /**
     * Runs the crawl and writes the generated SQL.
     *
     * @param args optional; {@code args[0]} overrides the output file path
     */
    public static void main(String[] args) {
        System.out.println("抓取开始:" + new Date());
        getProvince();

        String outputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_PATH;
        writeSqlFile(outputPath, buildInsertStatements());

        System.out.println("抓取完毕:" + new Date());
        System.out.println("省error:" + shengNum + "--" + "市error:" + shiNum +  "---" + "县error: " + xianNum + "----" + "镇error:" + zhenNum);
        System.out.println("错误列表:" + errorList.toString());
    }

    /**
     * Renders every collected region as one {@code insert} statement per line (CRLF-terminated),
     * parents before their children.
     */
    private static String buildInsertStatements() {
        StringBuilder sql = new StringBuilder();
        for (RegionEntry province : regions) {
            String provinceCode = quote(province.getCode());
            sql.append("insert into sys_province values(null,").append(provinceCode)
                    .append(", ").append(quote(province.getName())).append(", 1 );\r\n");
            for (RegionEntry city : province.getSub()) {
                String cityCode = quote(city.getCode());
                sql.append("insert into sys_city values(null,").append(provinceCode)
                        .append(", ").append(cityCode)
                        .append(",").append(quote(city.getName())).append(", 2);\r\n");
                for (RegionEntry county : city.getSub()) {
                    String countyCode = quote(county.getCode());
                    sql.append("insert into sys_county values(null,").append(provinceCode)
                            .append(", ").append(cityCode)
                            .append(", ").append(countyCode)
                            .append(", ").append(quote(county.getName())).append(", 3 );\r\n");
                    for (RegionEntry town : county.getSub()) {
                        sql.append("insert into sys_town values(null,").append(provinceCode)
                                .append(", ").append(cityCode)
                                .append(", ").append(countyCode)
                                .append(", ").append(quote(town.getCode()))
                                .append(",").append(quote(town.getName())).append(", 4 );\r\n");
                    }
                }
            }
        }
        return sql.toString();
    }

    /**
     * Wraps a value in single quotes for use as a SQL string literal, doubling any embedded
     * single quote so scraped text cannot terminate the literal early.
     */
    private static String quote(String value) {
        return "'" + String.valueOf(value).replace("'", "''") + "'";
    }

    /** Writes the SQL text to the given file as UTF-8; a failure is printed, not rethrown. */
    private static void writeSqlFile(String path, String sql) {
        try (FileOutputStream out = new FileOutputStream(new File(path))) {
            // Explicit charset: the names are Chinese and must not depend on the platform default.
            out.write(sql.getBytes(StandardCharsets.UTF_8));
            out.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Downloads and parses one page. */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
    }

    /** Code, name and (when the row is a link) absolute child-page URL of one table row. */
    private static final class Row {
        final String code;
        final String name;
        /** {@code null} when the row carries plain cells instead of links. */
        final String childUrl;

        Row(String code, String name, String childUrl) {
            this.code = code;
            this.name = name;
            this.childUrl = childUrl;
        }
    }

    /**
     * Extracts code and name from a city / county / town row. Rows normally hold two links
     * (code, name); rows without links are read from their {@code td} cells instead.
     *
     * @return the parsed row, or {@code null} when neither two links nor two cells are present
     */
    private static Row parseRow(Element row) {
        Elements anchors = row.select("a");
        if (anchors.size() >= 2) {
            Element codeLink = anchors.get(0);
            return new Row(codeLink.text(), anchors.get(1).text(), codeLink.attr("abs:href"));
        }
        Elements cells = row.select("td");
        if (cells.size() >= 2) {
            return new Row(cells.get(0).text(), cells.get(1).text(), null);
        }
        return null;
    }

    /** Derives the province code from a link such as {@code 11.html}, zero-padded on the right. */
    private static String padProvinceCode(String href) {
        StringBuilder code = new StringBuilder(href.split("\\.")[0]);
        while (code.length() < PROVINCE_CODE_LENGTH) {
            code.append('0');
        }
        return code.toString();
    }

    /**
     * Fetches the province index and, for every province link, crawls its cities.
     * A failing province is recorded in the error list and does not stop the others.
     */
    private static void getProvince() {
        try {
            Document doc = fetch(SITE_URL);
            Elements links = doc.select("tr.provincetr").select("a");
            for (Element link : links) {
                try {
                    RegionEntry province = new RegionEntry();
                    province.setCode(padProvinceCode(link.attr("href")));
                    province.setName(link.text());
                    // Absolute URL of the province page.
                    String absHref = link.attr("abs:href");
                    System.out.println(absHref);
                    getCity(absHref, province);
                    regions.add(province);

                    System.out.println(province);
                    // Also print the province as JSON for inspection.
                    JSONObject json = new JSONObject();
                    json.put("name", province.getName());
                    json.put("code", province.getCode());
                    json.put("subArea", province.getSub());
                    System.out.println(json);
                    System.out.println("-------------------------");
                } catch (Exception exp) {
                    errorList.add(link.text());
                    shengNum++;
                    exp.printStackTrace();
                }
            }
        } catch (Exception exception) {
            // The index page itself failed; count it so the summary does not report zero errors.
            errorList.add("省份错误");
            shengNum++;
            exception.printStackTrace();
        }
    }

    /**
     * Fetches a province page and appends its cities (with their counties) to {@code region}.
     *
     * @param url    absolute URL of the province page
     * @param region province entry that receives the cities
     */
    public static void getCity(String url, RegionEntry region) {
        try {
            System.out.println("getcity");
            Document doc = fetch(url);
            // e.g. <tr class='citytr'><td><a href='65/6501.html'>650100000000</a></td><td><a href='65/6501.html'>乌鲁木齐市</a></td></tr>
            for (Element row : doc.select("tr.citytr")) {
                Row parsed = parseRow(row);
                if (parsed == null) {
                    // Malformed row: record it and keep going with the remaining cities.
                    shiNum++;
                    errorList.add(region.getName());
                    continue;
                }
                RegionEntry city = new RegionEntry();
                city.setCode(parsed.code);
                // A "市辖区" row is stored under the province's own name.
                city.setName("市辖区".equals(parsed.name) ? region.getName() : parsed.name);
                if (parsed.childUrl != null) {
                    getArea(parsed.childUrl, city);
                }
                region.getSub().add(city);
            }
        } catch (Exception exception) {
            shiNum++;
            errorList.add(region.getName());
            exception.printStackTrace();
        }
    }

    /**
     * Fetches a city page and appends its counties (with their towns) to {@code region}.
     *
     * @param url    absolute URL of the city page
     * @param region city entry that receives the counties
     */
    private static void getArea(String url, RegionEntry region) {
        try {
            Document doc = fetch(url);
            // e.g. <tr class='countytr'><td><a href='01/130102.html'>130102000000</a></td><td><a href='01/130102.html'>长安区</a></td></tr>
            for (Element row : doc.select("tr.countytr")) {
                Row parsed = parseRow(row);
                if (parsed == null) {
                    xianNum++;
                    errorList.add(region.getName());
                    continue;
                }
                RegionEntry area = new RegionEntry();
                area.setCode(parsed.code);
                area.setName(parsed.name);
                // Only linked rows have a town page to descend into.
                if (parsed.childUrl != null) {
                    getTown(parsed.childUrl, area);
                }
                region.getSub().add(area);
            }
        } catch (Exception exception) {
            errorList.add(region.getName());
            xianNum++;
            exception.printStackTrace();
        }
    }

    /**
     * Fetches a county page and appends its towns to {@code region}.
     *
     * @param url    absolute URL of the county page
     * @param region county entry that receives the towns
     */
    private static void getTown(String url, RegionEntry region) {
        try {
            Document doc = fetch(url);
            // e.g. <tr class='towntr'><td><a href='07/110107001.html'>110107001000</a></td><td><a href='07/110107001.html'>八宝山街道办事处</a></td></tr>
            for (Element row : doc.select("tr.towntr")) {
                Row parsed = parseRow(row);
                if (parsed == null) {
                    zhenNum++;
                    errorList.add(region.getName());
                    continue;
                }
                RegionEntry town = new RegionEntry();
                town.setCode(parsed.code);
                town.setName(parsed.name);
                region.getSub().add(town);
            }
        } catch (Exception e) {
            // Catch everything, like the sibling methods, so a town-level failure is counted
            // here instead of escaping and aborting the remaining counties of the city.
            zhenNum++;
            errorList.add(region.getName());
            e.printStackTrace();
        }
    }

}

RegionEntry 实体类
package com.itzhongzi.area;

import java.util.ArrayList;
import java.util.List;

/**
 * One administrative region (province, city, county or town) with its code, its name and
 * the regions directly below it.
 */
public class RegionEntry {
    private String code;
    private String name;
    /** Child regions; never {@code null} so callers can iterate or add without a check. */
    private List<RegionEntry> sub = new ArrayList<RegionEntry>();

    /** Creates an entry with no code, no name and an empty child list. */
    public RegionEntry() {
    }

    /**
     * @param code region code
     * @param name region name
     * @param sub  child regions; {@code null} is treated as "no children"
     */
    public RegionEntry(String code, String name, List<RegionEntry> sub) {
        this.code = code;
        this.name = name;
        this.sub = orEmpty(sub);
    }

    /** Replaces a {@code null} list with a fresh, mutable empty one. */
    private static List<RegionEntry> orEmpty(List<RegionEntry> list) {
        return list != null ? list : new ArrayList<RegionEntry>();
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /** @return the live, mutable list of child regions; never {@code null} */
    public List<RegionEntry> getSub() {
        return sub;
    }

    /** @param sub new child list; {@code null} resets it to an empty list */
    public void setSub(List<RegionEntry> sub) {
        this.sub = orEmpty(sub);
    }

    @Override
    public String toString() {
        return "RegionEntry{" +
                "code='" + code + '\'' +
                ", name='" + name + '\'' +
                ", sub=" + sub +
                '}';
    }
}


结果展示

在这里插入图片描述

数据库结构创建
/*
 Navicat Premium Data Transfer

 Source Server         : 华为云测试
 Source Server Type    : MySQL
 Source Server Version : 50725
 Source Host           : 114.115.250.129:3306
 Source Schema         : area

 Target Server Type    : MySQL
 Target Server Version : 50725
 File Encoding         : 65001

 Date: 17/12/2019 10:47:30
*/

SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for sys_city
-- ----------------------------
-- City-level rows. The table is recreated empty, so no AUTO_INCREMENT start value is set
-- (ids begin at 1), and it uses utf8mb4 to match the `SET NAMES utf8mb4` session charset.
DROP TABLE IF EXISTS `sys_city`;
CREATE TABLE `sys_city` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增主键',
  `province_code` varchar(100) NOT NULL COMMENT '省份编码',
  `city_code` varchar(100) NOT NULL COMMENT '市 编码',
  `city_name` varchar(100) NOT NULL COMMENT '市名字',
  `area_num` int(11) NOT NULL COMMENT '1 省 2 市 3 县 4 乡',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='市  的数据';

-- ----------------------------
-- Table structure for sys_county
-- ----------------------------
-- County-level rows. The table is recreated empty, so no AUTO_INCREMENT start value is set
-- (ids begin at 1), and it uses utf8mb4 to match the `SET NAMES utf8mb4` session charset.
DROP TABLE IF EXISTS `sys_county`;
CREATE TABLE `sys_county` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增主键',
  `province_code` varchar(100) NOT NULL COMMENT '省份编码',
  `city_code` varchar(100) NOT NULL COMMENT '市 编码',
  `county_code` varchar(100) NOT NULL COMMENT '县 编码',
  `county_name` varchar(100) NOT NULL COMMENT '县名字',
  `area_num` int(11) NOT NULL COMMENT '1 省 2 市 3 县 4 乡',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='县级的数据';

-- ----------------------------
-- Table structure for sys_province
-- ----------------------------
-- Province-level rows. The table is recreated empty, so no AUTO_INCREMENT start value is set
-- (ids begin at 1), and it uses utf8mb4 to match the `SET NAMES utf8mb4` session charset.
DROP TABLE IF EXISTS `sys_province`;
CREATE TABLE `sys_province` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增主键',
  `province_code` varchar(100) NOT NULL COMMENT '省编码',
  `province_name` varchar(100) NOT NULL COMMENT '省名字',
  `area_num` int(11) NOT NULL COMMENT '1 省 2 市 3 县 4 乡',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='记录各个省的数据';

-- ----------------------------
-- Table structure for sys_town
-- ----------------------------
-- Town-level rows. The table is recreated empty, so no AUTO_INCREMENT start value is set
-- (ids begin at 1), and it uses utf8mb4 to match the `SET NAMES utf8mb4` session charset.
-- Note: the town code/name columns are named `sys_code` / `sys_name` here, unlike the
-- `<level>_code` / `<level>_name` pattern of the other tables; kept for compatibility.
DROP TABLE IF EXISTS `sys_town`;
CREATE TABLE `sys_town` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增主键',
  `province_code` varchar(100) NOT NULL COMMENT '省份编码',
  `city_code` varchar(100) NOT NULL COMMENT '市 编码',
  `county_code` varchar(100) NOT NULL COMMENT '县 编码',
  `sys_code` varchar(100) NOT NULL COMMENT '镇 编码',
  `sys_name` varchar(100) NOT NULL COMMENT '镇名字',
  `area_num` int(11) NOT NULL COMMENT '1 省 2 市 3 县 4 乡',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='镇级的数据';

SET FOREIGN_KEY_CHECKS = 1;

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

ITzhongzi

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值