原理: 使用 jsoup 解析国家统计局网站的页面数据,然后导入自己的数据库。
核心代码
package com.itzhongzi.area;
import com.alibaba.fastjson.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
/**
 * Crawls the province / city / county / town pages reachable from
 * {@link #SITE_URL} with jsoup, builds an in-memory tree of {@link RegionEntry}
 * objects and writes one SQL {@code insert} statement per entry to a text file.
 *
 * <p>Usage: {@code java com.itzhongzi.area.Application [outputFile]}. When no
 * argument is given the original hard-coded output path is used.
 *
 * <p>All state is held in static fields and the crawl is strictly sequential;
 * nothing here is written for concurrent use.
 */
public class Application {
    // Upstream address, kept for reference:
    // public static String SITE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html";
    /** Entry page of the crawl (presumably a local mirror of the upstream site). */
    public static String SITE_URL = "http://localhost:8080/local/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html";

    /** Output file used when no path is passed on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "/Users/huanghainongye/Desktop/area/area.txt";
    /** Encoding of the generated SQL file. */
    private static final String OUTPUT_CHARSET = "UTF-8";
    /** Province codes are right-padded with '0' up to this many characters. */
    private static final int PROVINCE_CODE_LENGTH = 6;

    /** Root of the scraped tree: one entry per province. */
    private static List<RegionEntry> regions = new ArrayList<RegionEntry>();
    /** Names of the entries (or a fixed marker) whose page failed to load or parse. */
    private static List<String> errorList = new ArrayList<String>();
    // Failure counters per level: province, city, county, town.
    private static int shengNum = 0;
    private static int shiNum = 0;
    private static int xianNum = 0;
    private static int zhenNum = 0;

    /**
     * Runs the crawl, writes the generated SQL and prints an error summary.
     *
     * @param args optional; {@code args[0]} overrides the output file path
     */
    public static void main(String[] args) {
        System.out.println("抓取开始:" + new Date());
        getProvince();

        String outputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_PATH;
        writeFile(outputPath, buildInsertStatements(regions));

        System.out.println("抓取完毕:" + new Date());
        System.out.println("省error:" + shengNum + "--" + "市error:" + shiNum + "---" + "县error: " + xianNum + "----" + "镇error:" + zhenNum);
        System.out.println("错误列表:" + errorList.toString());
    }

    /**
     * Renders the scraped tree as SQL, one statement per line (CRLF terminated),
     * parents before their children.
     *
     * @param provinces root entries of the tree
     * @return the complete SQL script
     */
    private static String buildInsertStatements(List<RegionEntry> provinces) {
        StringBuilder sql = new StringBuilder();
        for (RegionEntry one : provinces) {
            sql.append("insert into sys_province values(null,'").append(quote(one.getCode()))
                    .append("', '").append(quote(one.getName())).append("', 1 );\r\n");
            for (RegionEntry two : one.getSub()) {
                sql.append("insert into sys_city values(null,'").append(quote(one.getCode()))
                        .append("', '").append(quote(two.getCode()))
                        .append("','").append(quote(two.getName())).append("', 2);\r\n");
                for (RegionEntry three : two.getSub()) {
                    sql.append("insert into sys_county values(null,'").append(quote(one.getCode()))
                            .append("', '").append(quote(two.getCode()))
                            .append("', '").append(quote(three.getCode()))
                            .append("', '").append(quote(three.getName())).append("', 3 );\r\n");
                    for (RegionEntry four : three.getSub()) {
                        sql.append("insert into sys_town values(null,'").append(quote(one.getCode()))
                                .append("', '").append(quote(two.getCode()))
                                .append("', '").append(quote(three.getCode()))
                                .append("', '").append(quote(four.getCode()))
                                .append("','").append(quote(four.getName())).append("', 4 );\r\n");
                    }
                }
            }
        }
        return sql.toString();
    }

    /**
     * Escapes a scraped value so it can sit between single quotes in a MySQL
     * string literal: backslashes and single quotes are doubled. A null value is
     * rendered as the text {@code null}, as a plain {@code append} would do.
     *
     * <p>NOTE(review): doubling backslashes assumes the server does not run with
     * NO_BACKSLASH_ESCAPES - confirm against the target database.
     */
    private static String quote(String value) {
        return String.valueOf(value).replace("\\", "\\\\").replace("'", "''");
    }

    /**
     * Writes {@code content} to {@code path} as UTF-8, creating missing parent
     * directories first. Failures are printed, not rethrown, so the error summary
     * in {@link #main} is still shown.
     */
    private static void writeFile(String path, String content) {
        FileOutputStream out = null;
        try {
            File target = new File(path);
            File parent = target.getParentFile();
            if (parent != null && !parent.exists()) {
                // If this fails, opening the stream below reports the problem.
                parent.mkdirs();
            }
            out = new FileOutputStream(target);
            // Explicit charset: the platform default would garble the Chinese
            // names on machines whose default encoding is not UTF-8.
            out.write(content.getBytes(OUTPUT_CHARSET));
            out.flush();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Downloads and parses one page.
     *
     * @throws IOException if the page cannot be fetched
     */
    private static Document fetch(String url) throws IOException {
        // A timeout of 0 tells jsoup to wait indefinitely for the response.
        return Jsoup.connect(url).timeout(0).get();
    }

    /**
     * Reads code and name from a table row. Rows with at least two links take
     * both values from the links; any other row falls back to its first two
     * {@code td} cells.
     *
     * @throws IndexOutOfBoundsException if the row has fewer than two usable cells
     */
    private static RegionEntry parseRow(Element row) {
        Elements cells = row.select("a");
        if (cells.size() < 2) {
            cells = row.select("td");
        }
        RegionEntry entry = new RegionEntry();
        entry.setCode(cells.get(0).text());
        entry.setName(cells.get(1).text());
        return entry;
    }

    /** Returns the first link of a row, or {@code null} when the row has none. */
    private static Element firstLink(Element row) {
        Elements links = row.select("a");
        return links.isEmpty() ? null : links.get(0);
    }

    /**
     * Scrapes the province list from {@link #SITE_URL} and, for each province,
     * everything below it. A failure inside one province is recorded and the
     * loop continues with the next province.
     */
    private static void getProvince() {
        try {
            Document doc = fetch(SITE_URL);
            Elements links = doc.select("tr.provincetr").select("a");
            for (Element e : links) {
                try {
                    RegionEntry region = new RegionEntry();
                    // The code is the part of the href before the first '.',
                    // right-padded with '0' to PROVINCE_CODE_LENGTH characters.
                    StringBuilder code = new StringBuilder(e.attr("href").split("\\.")[0]);
                    while (code.length() < PROVINCE_CODE_LENGTH) {
                        code.append('0');
                    }
                    region.setCode(code.toString());
                    region.setName(e.text());

                    // Absolute form of the href, resolved against the page URL.
                    String absHref = e.attr("abs:href");
                    System.out.println(absHref);
                    getCity(absHref, region);
                    regions.add(region);
                    System.out.println(region);

                    // Also dump the finished province as JSON.
                    JSONObject jsonobj = new JSONObject();
                    jsonobj.put("name", region.getName());
                    jsonobj.put("code", region.getCode());
                    jsonobj.put("subArea", region.getSub());
                    System.out.println(jsonobj);
                    System.out.println("-------------------------");
                } catch (Exception exp) {
                    errorList.add(e.text());
                    shengNum++;
                    exp.printStackTrace();
                }
            }
        } catch (Exception exception) {
            errorList.add("省份错误");
            exception.printStackTrace();
        }
    }

    /**
     * Scrapes the city rows ({@code tr.citytr}) of one province page and adds
     * them, with their counties and towns, to {@code region}.
     *
     * @param url    absolute URL of the province page
     * @param region province entry that receives the cities
     */
    public static void getCity(String url, RegionEntry region) {
        try {
            System.out.println("getcity");
            Document doc = fetch(url);
            // <tr class='citytr'><td><a href='65/6501.html'>650100000000</a></td><td><a href='65/6501.html'>乌鲁木齐市</a></td></tr>
            for (Element row : doc.select("tr.citytr")) {
                RegionEntry city = parseRow(row);
                if ("市辖区".equals(city.getName())) {
                    // A row named "市辖区" takes the province's name instead; its code is kept.
                    city.setName(region.getName());
                }
                Element link = firstLink(row);
                if (link != null) {
                    getArea(link.attr("abs:href"), city);
                }
                region.getSub().add(city);
            }
        } catch (Exception exception) {
            shiNum++;
            errorList.add(region.getName());
            exception.printStackTrace();
        }
    }

    /**
     * Scrapes the county rows ({@code tr.countytr}) of one city page and adds
     * them to {@code region}. Rows without a link are stored without descending
     * to the town level.
     *
     * @param url    absolute URL of the city page
     * @param region city entry that receives the counties
     */
    private static void getArea(String url, RegionEntry region) {
        try {
            Document doc = fetch(url);
            //<tr class='countytr'><td><a href='01/130102.html'>130102000000</a></td><td><a href='01/130102.html'>长安区</a></td></tr>
            for (Element row : doc.select("tr.countytr")) {
                RegionEntry area = parseRow(row);
                Element link = firstLink(row);
                if (link != null) {
                    getTown(link.attr("abs:href"), area);
                }
                region.getSub().add(area);
            }
        } catch (Exception exception) {
            errorList.add(region.getName());
            xianNum++;
            exception.printStackTrace();
        }
    }

    /**
     * Scrapes the town rows ({@code tr.towntr}) of one county page and adds them
     * to {@code region}.
     *
     * @param url    absolute URL of the county page
     * @param region county entry that receives the towns
     */
    private static void getTown(String url, RegionEntry region) {
        try {
            Document doc = fetch(url);
            //<tr class='towntr'><td><a href='07/110107001.html'>110107001000</a></td><td><a href='07/110107001.html'>八宝山街道办事处</a></td></tr>
            for (Element row : doc.select("tr.towntr")) {
                region.getSub().add(parseRow(row));
            }
        } catch (Exception e) {
            // Catch everything, like the other levels do: a malformed row must be
            // counted here rather than abort the caller's loop over the counties.
            zhenNum++;
            errorList.add(region.getName());
            e.printStackTrace();
        }
    }
}
RegionEntry 实体类
package com.itzhongzi.area;
import java.util.ArrayList;
import java.util.List;
public class RegionEntry {
private String code;
private String name;
private List<RegionEntry> sub = new ArrayList<RegionEntry>();
public String getCode() {
return code;
}
public void setCode(String code) {
this.code = code;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public List<RegionEntry> getSub() {
return sub;
}
public void setSub(List<RegionEntry> sub) {
this.sub = sub;
}
public RegionEntry(String code, String name, List<RegionEntry> sub) {
this.code = code;
this.name = name;
this.sub = sub;
}
public RegionEntry() {
}
@Override
public String toString() {
return "RegionEntry{" +
"code='" + code + '\'' +
", name='" + name + '\'' +
", sub=" + sub +
'}';
}
}
结果展示
数据库结构创建
/*
Navicat Premium Data Transfer
Source Server : 华为云测试
Source Server Type : MySQL
Source Server Version : 50725
Source Host : 114.115.250.129:3306
Source Schema : area
Target Server Type : MySQL
Target Server Version : 50725
File Encoding : 65001
Date: 17/12/2019 10:47:30
*/
-- Use utf8mb4 as the character set of this client connection.
SET NAMES utf8mb4;
-- Turn foreign-key checking off while the tables below are dropped and recreated.
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for sys_city
-- One row per city-level region, keyed to its province by province_code.
-- Column order matters: the generated script inserts positionally
-- (insert into sys_city values(null, province_code, city_code, city_name, 2)).
-- utf8mb4 matches the SET NAMES utf8mb4 at the top of this script; MySQL's
-- "utf8" stores at most 3 bytes per character and rejects 4-byte characters.
-- No AUTO_INCREMENT seed is given, so ids in a freshly created table start at 1.
-- ----------------------------
DROP TABLE IF EXISTS `sys_city`;
CREATE TABLE `sys_city` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增主键',
  `province_code` varchar(100) NOT NULL COMMENT '省份编码',
  `city_code` varchar(100) NOT NULL COMMENT '市 编码',
  `city_name` varchar(100) NOT NULL COMMENT '市名字',
  `area_num` int(11) NOT NULL COMMENT '1 省 2 市 3 县 4 乡',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='市 的数据';
-- ----------------------------
-- Table structure for sys_county
-- One row per county-level region, keyed to its province and city by code.
-- Column order matters: the generated script inserts positionally
-- (insert into sys_county values(null, province_code, city_code, county_code, county_name, 3)).
-- utf8mb4 matches the SET NAMES utf8mb4 at the top of this script; MySQL's
-- "utf8" stores at most 3 bytes per character and rejects 4-byte characters.
-- No AUTO_INCREMENT seed is given, so ids in a freshly created table start at 1.
-- ----------------------------
DROP TABLE IF EXISTS `sys_county`;
CREATE TABLE `sys_county` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增主键',
  `province_code` varchar(100) NOT NULL COMMENT '省份编码',
  `city_code` varchar(100) NOT NULL COMMENT '市 编码',
  `county_code` varchar(100) NOT NULL COMMENT '县 编码',
  `county_name` varchar(100) NOT NULL COMMENT '县名字',
  `area_num` int(11) NOT NULL COMMENT '1 省 2 市 3 县 4 乡',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='县级的数据';
-- ----------------------------
-- Table structure for sys_province
-- One row per province-level region.
-- Column order matters: the generated script inserts positionally
-- (insert into sys_province values(null, province_code, province_name, 1)).
-- utf8mb4 matches the SET NAMES utf8mb4 at the top of this script; MySQL's
-- "utf8" stores at most 3 bytes per character and rejects 4-byte characters.
-- No AUTO_INCREMENT seed is given, so ids in a freshly created table start at 1.
-- ----------------------------
DROP TABLE IF EXISTS `sys_province`;
CREATE TABLE `sys_province` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增主键',
  `province_code` varchar(100) NOT NULL COMMENT '省编码',
  `province_name` varchar(100) NOT NULL COMMENT '省名字',
  `area_num` int(11) NOT NULL COMMENT '1 省 2 市 3 县 4 乡',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='记录各个省的数据';
-- ----------------------------
-- Table structure for sys_town
-- One row per town-level region, keyed to its province, city and county by code.
-- Note that the town's own columns are named sys_code / sys_name.
-- Column order matters: the generated script inserts positionally
-- (insert into sys_town values(null, province_code, city_code, county_code, sys_code, sys_name, 4)).
-- utf8mb4 matches the SET NAMES utf8mb4 at the top of this script; MySQL's
-- "utf8" stores at most 3 bytes per character and rejects 4-byte characters.
-- No AUTO_INCREMENT seed is given, so ids in a freshly created table start at 1.
-- ----------------------------
DROP TABLE IF EXISTS `sys_town`;
CREATE TABLE `sys_town` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增主键',
  `province_code` varchar(100) NOT NULL COMMENT '省份编码',
  `city_code` varchar(100) NOT NULL COMMENT '市 编码',
  `county_code` varchar(100) NOT NULL COMMENT '县 编码',
  `sys_code` varchar(100) NOT NULL COMMENT '镇 编码',
  `sys_name` varchar(100) NOT NULL COMMENT '镇名字',
  `area_num` int(11) NOT NULL COMMENT '1 省 2 市 3 县 4 乡',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='镇级的数据';
-- Turn foreign-key checking back on now that all tables have been recreated.
SET FOREIGN_KEY_CHECKS = 1;