国家地理区域获取(国家统计局抓包)

国家地理区域获取(国家统计局抓包)

  • 国家统计局统计地址

    • http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/
    • 国家统计局统计用区划分
  • 引入java HTML解析器Jsoup

    • <!--java HTML解析器-->
      <dependency>
          <groupId>org.jsoup</groupId>
          <artifactId>jsoup</artifactId>
          <version>1.11.3</version>
      </dependency>
      
  • 编写java程序代码

    •   @Test
          public void getAreaTest() throws IOException {
      
              //创建文件输出流
              FileWriter fileWriter = new FileWriter(new File("d://china_city.sql"));
              //设置访问url(国际统计局的统计地址)
              String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/";
              //使用jsoup获取页面内容
              Document doc = Jsoup.parse(new URL(baseUrl), 100000);
              //获取页面中<tr>标签且class = "provincetr" ,然后查询所有的a标签元素
              Elements provinceTds = doc.select("tr[class=provincetr]").select("a");
              String cityName;
              String parentCode;
              String baseFormat = "insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ({0},{1},{2},{3});";
              //遍历数据
              for (Element element : provinceTds) {
                  //获取标签文本内容
                  cityName = element.text();
                  //获取省级code
                  String href = element.attr("href");
                  int index = href.indexOf(".html");
                  String provinceCode = href.substring(0, index);
                  parentCode = "0";
                  String provincePath = "''";
                  //格式化需要输出的字符串
                  String format = MessageFormat.format(baseFormat, "'" + cityName + "'", "'" + provinceCode + "'", "'" + parentCode + "'", provincePath);
                  //System.out.println(format);
                  //输出流输出内容并刷新
                  fileWriter.write(format);
                  fileWriter.write("\r\n");
                  fileWriter.flush();
                  //获取市级访问url
                  String cityUrl = baseUrl + href;
                  doc = Jsoup.parse(new URL(cityUrl), 100000);
                  获取页面中<tr>标签且class = "citytr" 
                  Elements cityTds = doc.select("tr[class=citytr]");
                  for (Element cityTd : cityTds) {
                      //获取元素中的所有a标签元素
                      Elements tds = cityTd.select("a");
                      //获取城市名
                      cityName = tds.get(1).text();
                      parentCode = provinceCode;
                      //获取城市代码
                      String cityCode = tds.get(0).text();
                      //获取a标签的访问链接
                      href = tds.get(0).attr("href");
                      //格式化字符串
                      format = MessageFormat.format(baseFormat, "'" + cityName + "'", "'" + cityCode + "'", "'" + parentCode + "'", "'" + parentCode + "'");
                      fileWriter.write(format);
                      fileWriter.write("\r\n");
                      fileWriter.flush();
                      //System.out.println(format);
                      String countryUrl = baseUrl + href;
                      doc = Jsoup.parse(new URL(countryUrl), 100000);
                      Elements countryTds = doc.select("tr[class=countytr]");
                      for (Element countryTd : countryTds) {
                          tds = countryTd.select("td");
                          cityName = tds.get(1).text();
                          parentCode = cityCode;
                          String parentPath = provinceCode + "," + parentCode;
                          String countryCode = tds.get(0).text();
                          format = MessageFormat.format(baseFormat, "'" + cityName + "'", "'" + countryCode + "'", "'" + parentCode + "'", "'" + parentPath + "'");
                          fileWriter.write(format);
                          fileWriter.write("\r\n");
                          fileWriter.flush();
                          //System.out.println(format);
                      }
                      try {
                          Thread.sleep(50);
                      } catch (InterruptedException e) {
                          e.printStackTrace();
                      }
                  }
      
              }
              fileWriter.close();
          }
      
  • 获取统计内容(部分)如下

    -- Sample of the generated script (partial). Each row is one administrative area:
    --   province rows: parent_code '0', empty parent_path
    --   city rows:     parent_code = province code, parent_path = province code
    --   county rows:   parent_code = city code, parent_path = 'province code,city code'
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('北京市','11','0','');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('市辖区','110100000000','11','11');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('东城区','110101000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('西城区','110102000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('朝阳区','110105000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('丰台区','110106000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('石景山区','110107000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('海淀区','110108000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('门头沟区','110109000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('房山区','110111000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('通州区','110112000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('顺义区','110113000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('昌平区','110114000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('大兴区','110115000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('怀柔区','110116000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('平谷区','110117000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('密云区','110118000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('延庆区','110119000000','110100000000','11,110100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('天津市','12','0','');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('市辖区','120100000000','12','12');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('和平区','120101000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('河东区','120102000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('河西区','120103000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('南开区','120104000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('河北区','120105000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('红桥区','120106000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('东丽区','120110000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('西青区','120111000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('津南区','120112000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('北辰区','120113000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('武清区','120114000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('宝坻区','120115000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('滨海新区','120116000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('宁河区','120117000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('静海区','120118000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('蓟州区','120119000000','120100000000','12,120100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('河北省','13','0','');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('石家庄市','130100000000','13','13');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('市辖区','130101000000','130100000000','13,130100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('长安区','130102000000','130100000000','13,130100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('桥西区','130104000000','130100000000','13,130100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('新华区','130105000000','130100000000','13,130100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('井陉矿区','130107000000','130100000000','13,130100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('裕华区','130108000000','130100000000','13,130100000000');
    insert into sys_province_city (city_name,city_code,parent_code,parent_path) values ('藁城区','130109000000','130100000000','13,130100000000');
    

以上抓取方式可能有所拙劣,如果大家有更好的方法,可以留言互相参考!

国家统计局抓取的地图省市区划代码和城划分代码(最新2020/06/03),共596071条数据。来源于国家统计局http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/。 数据结构: CREATE TABLE `area` ( `areaid` varchar(255) COLLATE utf8_unicode_ci NOT NULL, `area_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL, `fatherid` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL, `area_type` int(255) DEFAULT NULL COMMENT '区域代码:\r\n100 :城镇,110:城区,111 :主城区,112 :城乡结合区,120 :镇区,121 :镇中心区,122:镇乡结合区,123:特殊区域200 :乡村,210:乡中心区,220:村庄\r\n\r\n', `is_delete` int(255) DEFAULT '0', PRIMARY KEY (`areaid`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 部分数据: INSERT INTO `area` VALUES ('110000000000','北京市',NULL,NULL,0); INSERT INTO `area` VALUES ('110100000000','市辖区','110000000000',NULL,0); INSERT INTO `area` VALUES ('110101000000','东城区','110100000000',NULL,0); INSERT INTO `area` VALUES ('110101001000','东华门街道','110101000000',NULL,0); INSERT INTO `area` VALUES ('110101001001','多福巷社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001002','银闸社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001005','东厂社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001006','智德社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001007','南池子社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001008','黄图岗社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001009','灯市口社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001010','正义路社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001011','甘雨社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001013','台基厂社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001014','韶九社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001015','王府井社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101002000','景山街道','110101000000',NULL,0); INSERT INTO `area` VALUES ('110101002001','隆福寺社区居委会','110101002000',111,0); INSERT INTO `area` VALUES ('110101002002','吉祥社区居
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值