Java使用Jsoup爬取省市区乡镇的数据源码

Java使用Jsoup爬取省市区乡镇的数据源码

由于业务需求,需要让用户选择地址信息,所以想在网上找一份最新的省、市、区及乡镇数据。可是找到的资源竟然都要积分、C币一类,所以自己就利用 jsoup 写了一个爬虫,爬取国家统计局公布的区划(省、市、区县、乡镇)数据。
1.介绍org.jsoup
jsoup 是一个用 Java 编写的 HTML 解析器,可以直接通过 URL 获取页面,并用类似 CSS 选择器的语法提取其中的数据。
2.Maven依赖

<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
      <dependency>
          <groupId>org.jsoup</groupId>
          <artifactId>jsoup</artifactId>
          <version>1.9.2</version>
      </dependency>

3.编写实体类,用于储存数据
RegionEntry.java

package cn.jiangdoc.utils;

import java.util.ArrayList;
import java.util.List;

public class RegionEntry {
    private String code;
    private String name;
    private List<RegionEntry> sub = new ArrayList<>();

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public List<RegionEntry> getSub() {
        return sub;
    }

    public void setSub(List<RegionEntry> sub) {
        this.sub = sub;
    }

    public RegionEntry(String code, String name, List<RegionEntry> sub) {
        this.code = code;
        this.name = name;
        this.sub = sub;
    }

    public RegionEntry() {
    }
}

4.正式开始我们的爬虫数据
AddressData.java

package cn.jiangdoc.utils;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls the administrative-division listing that starts at {@link #SITE_URL}
 * (province, city, county and town pages) and writes the collected regions to a
 * file as SQL {@code insert} statements.
 *
 * <p>Fetch failures are reported on {@code System.err} and the affected subtree is
 * skipped, so a single bad page does not abort the whole crawl.
 *
 * @author jiangdoc
 * @date 2019-3-16
 */
public class AddressData {
    /** Entry page of the listing; province links are read from it. */
    public static String SITE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

    /** Output file used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "G:\\log\\city.txt";
    /** Province codes are right-padded with zeros to this many characters. */
    private static final int PROVINCE_CODE_LENGTH = 6;
    /** Connect/read timeout handed to jsoup for every request. */
    private static final int TIMEOUT_MILLIS = 10000;
    /** How many times a page is requested before it is given up on. */
    private static final int MAX_ATTEMPTS = 3;
    /** Pause between two attempts for the same page. */
    private static final long RETRY_PAUSE_MILLIS = 2000L;
    /** Pause after each province, to avoid hammering the server. */
    private static final long PROVINCE_PAUSE_MILLIS = 1000L;

    private static List<RegionEntry> regions = new ArrayList<RegionEntry>();

    /**
     * Runs the crawl and writes the SQL script.
     *
     * @param args optional; {@code args[0]} overrides the default output file path
     */
    public static void main(String[] args) {
        System.out.println("抓取开始:" + new Date());
        getProvince();
        String outputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_PATH;
        writeFile(new File(outputPath), buildSql());
        System.out.println("抓取完毕:" + new Date());
    }

    /** Renders every collected region as one {@code insert} statement per line. */
    private static String buildSql() {
        StringBuilder sql = new StringBuilder();
        for (RegionEntry province : regions) {
            String provinceCode = quote(province.getCode());
            sql.append("insert into sys_province values(null,'").append(provinceCode)
                    .append("', '").append(quote(province.getName())).append("', 1 );\r\n");
            for (RegionEntry city : province.getSub()) {
                String cityCode = quote(city.getCode());
                sql.append("insert into sys_city values(null,'").append(provinceCode)
                        .append("', '").append(cityCode)
                        .append("','").append(quote(city.getName())).append("', 2);\r\n");
                for (RegionEntry county : city.getSub()) {
                    String countyCode = quote(county.getCode());
                    sql.append("insert into sys_county values(null,'").append(provinceCode)
                            .append("', '").append(cityCode)
                            .append("', '").append(countyCode)
                            .append("', '").append(quote(county.getName())).append("', 3 );\r\n");
                    for (RegionEntry town : county.getSub()) {
                        sql.append("insert into sys_town values(null,'").append(provinceCode)
                                .append("', '").append(cityCode)
                                .append("', '").append(countyCode)
                                .append("', '").append(quote(town.getCode()))
                                .append("','").append(quote(town.getName())).append("', 4 );\r\n");
                    }
                }
            }
        }
        return sql.toString();
    }

    /**
     * Makes scraped text safe inside a single-quoted SQL literal by doubling single quotes.
     * A {@code null} value is rendered as the text {@code null}, as plain concatenation would.
     */
    private static String quote(String value) {
        return value == null ? "null" : value.replace("'", "''");
    }

    /**
     * Writes {@code content} to {@code target}, creating missing parent directories first.
     * The text is encoded as UTF-8 so the result does not depend on the platform charset.
     */
    private static void writeFile(File target, String content) {
        File parent = target.getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs()) {
            System.err.println("Cannot create output directory: " + parent);
        }
        try (FileOutputStream out = new FileOutputStream(target)) {
            out.write(content.getBytes(StandardCharsets.UTF_8));
            out.flush();
        } catch (IOException e) {
            System.err.println("Failed to write " + target + ": " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Reads the province links from {@link #SITE_URL}, crawls each province and appends it
     * to {@code regions}. Stops early if the thread is interrupted while pausing.
     */
    private static void getProvince() {
        Document doc;
        try {
            doc = fetch(SITE_URL);
        } catch (IOException e) {
            reportFailure(SITE_URL, e);
            return;
        }
        for (Element link : doc.select("tr.provincetr").select("a")) {
            RegionEntry province = new RegionEntry();
            // The link target is "<code>.html"; the part before the first dot is the code.
            province.setCode(padProvinceCode(link.attr("href").split("\\.")[0]));
            province.setName(link.text());
            // Absolute form of the href.
            String absHref = link.attr("abs:href");
            System.out.println(absHref);
            getCity(absHref, province);
            regions.add(province);
            try {
                Thread.sleep(PROVINCE_PAUSE_MILLIS);
            } catch (InterruptedException e) {
                // Restore the flag for whoever owns the thread and stop crawling.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /** Right-pads {@code code} with '0' up to {@link #PROVINCE_CODE_LENGTH} characters. */
    private static String padProvinceCode(String code) {
        StringBuilder padded = new StringBuilder(code);
        while (padded.length() < PROVINCE_CODE_LENGTH) {
            padded.append('0');
        }
        return padded.toString();
    }

    /**
     * Crawls the city rows of one province page and adds them to {@code region}.
     * A city named "市辖区" takes the name of its province instead.
     *
     * @param url    absolute URL of the province page
     * @param region the province that receives the cities
     */
    private static void getCity(String url, RegionEntry region) {
        Document doc = fetchOrNull(url);
        if (doc == null) {
            return;
        }
        // <tr class='citytr'><td><a href='65/6501.html'>650100000000</a></td><td><a href='65/6501.html'>乌鲁木齐市</a></td></tr>
        for (Element row : doc.select("tr.citytr")) {
            RegionEntry city = parseRow(row);
            if (city == null) {
                continue;
            }
            if ("市辖区".equals(city.getName())) {
                city.setName(region.getName());
            }
            getArea(childUrl(row), city);
            region.getSub().add(city);
        }
    }

    /**
     * Crawls the county rows of one city page and adds them to {@code region}.
     * Rows without a link are added as leaves.
     *
     * @param url    absolute URL of the city page
     * @param region the city that receives the counties
     */
    private static void getArea(String url, RegionEntry region) {
        Document doc = fetchOrNull(url);
        if (doc == null) {
            return;
        }
        // <tr class='countytr'><td><a href='01/130102.html'>130102000000</a></td><td><a href='01/130102.html'>长安区</a></td></tr>
        for (Element row : doc.select("tr.countytr")) {
            RegionEntry area = parseRow(row);
            if (area == null) {
                continue;
            }
            getTown(childUrl(row), area);
            region.getSub().add(area);
        }
    }

    /**
     * Crawls the town rows of one county page and adds them to {@code region}.
     *
     * @param url    absolute URL of the county page
     * @param region the county that receives the towns
     */
    private static void getTown(String url, RegionEntry region) {
        Document doc = fetchOrNull(url);
        if (doc == null) {
            return;
        }
        // <tr class='towntr'><td><a href='07/110107001.html'>110107001000</a></td><td><a href='07/110107001.html'>八宝山街道办事处</a></td></tr>
        for (Element row : doc.select("tr.towntr")) {
            RegionEntry town = parseRow(row);
            if (town != null) {
                region.getSub().add(town);
            }
        }
    }

    /**
     * Builds an entry from a table row whose first two cells hold the code and the name.
     * The text is taken from the row's links when it has at least two, otherwise from
     * its {@code td} cells.
     *
     * @return the entry, or {@code null} when the row has fewer than two usable cells
     */
    private static RegionEntry parseRow(Element row) {
        Elements cells = row.select("a");
        if (cells.size() < 2) {
            cells = row.select("td");
        }
        if (cells.size() < 2) {
            return null;
        }
        RegionEntry entry = new RegionEntry();
        entry.setCode(cells.get(0).text());
        entry.setName(cells.get(1).text());
        return entry;
    }

    /** Returns the absolute target of the row's first link, or "" when there is none. */
    private static String childUrl(Element row) {
        Elements anchors = row.select("a");
        return anchors.isEmpty() ? "" : anchors.get(0).attr("abs:href");
    }

    /**
     * Fetches a page for one of the crawl levels.
     *
     * @return the parsed page, or {@code null} when {@code url} is empty (nothing to
     *         descend into) or the page could not be loaded (the failure is reported)
     */
    private static Document fetchOrNull(String url) {
        if (url == null || url.isEmpty()) {
            return null;
        }
        try {
            return fetch(url);
        } catch (IOException e) {
            reportFailure(url, e);
            return null;
        }
    }

    /**
     * Downloads and parses {@code url} with an explicit timeout, trying up to
     * {@link #MAX_ATTEMPTS} times with a pause in between.
     *
     * @throws IOException the failure of the last attempt
     */
    private static Document fetch(String url) throws IOException {
        IOException failure = null;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
            } catch (IOException e) {
                failure = e;
            }
            if (attempt < MAX_ATTEMPTS) {
                try {
                    Thread.sleep(RETRY_PAUSE_MILLIS);
                } catch (InterruptedException e) {
                    // Give up retrying but keep the interrupt visible to the caller.
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
        throw failure;
    }

    /** Reports a page that could not be loaded, including which URL it was. */
    private static void reportFailure(String url, IOException e) {
        System.err.println("Failed to fetch " + url + ": " + e);
        e.printStackTrace();
    }
}

注意:运行中可能会出现连接超时,这很正常;在访问量比较少的时间段运行,成功率会高很多。另外,广东省的东莞市和中山市比较特殊,没有区县一级的划分,市下面直接就是乡镇,所以程序抓取不到这两个市的下级数据,需要手工补充。

自己矫正后的数据的下载地址:点击下载

  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 3
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值