前言:
参考文章:
https://www.cnblogs.com/yangzhilong/p/3530700.html
https://www.cnblogs.com/liushaofeng89/p/4873086.html
最近因为用户反馈省份数据表单有部分缺失,百度了一圈以后决定还是自己拉取一下。省份数据来源于国家统计局,笔者拉取的是2019年的数据,拉取日期为2020-02-25。
省份数据来源:国家统计局
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/
笔者用的是 Java 中的 jsoup,关于 jsoup 的用法,可参考下面这篇文章:https://www.open-open.com/jsoup/
开始
1.准备一张表 region_directory
-- Region hierarchy table: one row per region, with `pid` holding the `id` of the parent row.
-- The crawler in this article inserts first-level (province) rows with pid = 0.
-- NOTE(review): the column is called `name_CN`, but its COMMENT text says it is the English
-- name of the region -- confirm which one is intended.
-- NOTE(review): AUTO_INCREMENT=2421 looks like it was carried over from a dump of an existing
-- table; ids of newly inserted rows will start from that value.
CREATE TABLE `region_directory` (
`id` int(32) NOT NULL AUTO_INCREMENT,
`pid` int(32) DEFAULT NULL COMMENT '父级ID',
`name` varchar(64) DEFAULT NULL COMMENT '地域名称',
`name_CN` varchar(64) DEFAULT NULL COMMENT '地域英文名',
`create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
`update_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改时间',
`create_user` varchar(255) DEFAULT NULL COMMENT '创建人',
`update_user` varchar(255) DEFAULT NULL COMMENT '修改人',
`is_open` char(2) DEFAULT NULL COMMENT '是否开启 (0代表未开启 1代表开启)',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2421 DEFAULT CHARSET=utf8 COMMENT='地域表';
2.需要在pom文件中引入jsoup 的jar 包。
官方现在已有更高的版本,我这边使用的是目前使用人数比较多的版本。
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
3.拉取数据的代码主要在getRegionDirectory 这个接口中。
4.需要注意的一点是:下图中的这个 name 代表的是全国省份一级的名称。我加了一个判断,只先拉取北京市的数据,原因是数据量比较大,如果一次性拉取过多,连接会报502——现在很多网站都会对这类高频请求做防恶意攻击的限制,这里需要注意。
4.1 这就是上述图片中描述的502报错
5.接下来就可以在浏览器上访问拉取数据的接口:
控制台打印一下 获取的数据:
存到数据库中的数据:
6.文章中涉及的所有代码
RegionDirectoryController
package com.bos.controller.basic;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.bos.data.model.RegionDirectoryModel;
import com.bos.data.model.vo.basic.RegionVo;
import com.bos.data.repositories.jpa.setting.RegionDirectoryJPARepository;
import com.google.gson.Gson;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.transaction.interceptor.TransactionAspectSupport;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
/**
* @Author tanghh
* @Date 2020/6/23 10:37
*/
@RestController
@RequestMapping(value = "/region")
public class RegionDirectoryController {
// Logger used to report a failed crawl.
private Logger logger = LoggerFactory.getLogger(RegionDirectoryController.class);
// Repository through which the crawled regions are written to the database.
@Autowired
private RegionDirectoryJPARepository regionDirectoryJPARepository;
// Row CSS classes ordered from the top level (province) to the bottom level (village);
// populated by the first static initializer below.
private static List<String> types = new ArrayList<>();
// Names of cities whose child page lists towns directly; populated by the second static initializer.
private static List<String> specialCitys = new ArrayList<>();
/**
 * CSS class of a province row.
 */
public static final String LEVEL_PROVINCE = "provincetr";
/**
 * CSS class of a city row.
 */
public static final String LEVEL_CITY = "citytr";
/**
 * CSS class of a county / district row.
 */
public static final String LEVEL_COUNTY = "countytr";
/**
 * CSS class of a town / sub-district row.
 */
public static final String LEVEL_TOWN = "towntr";
/**
 * CSS class of a village / neighbourhood-committee row.
 */
public static final String LEVEL_VILLAGE = "villagetr";
// Level mode in which the current depth is derived from the position of the row's CSS class in "types".
public static final int LEVEL_MODE_STRING = 1;
// Level mode in which the current depth is the numeric level stored on the RegionVo node.
public static final int LEVEL_MODE_NUMBER = 2;
// Character set used to decode the downloaded pages.
public static final String CHARSET = "GBK";
static {
types.add(LEVEL_PROVINCE);
types.add(LEVEL_CITY);
types.add(LEVEL_COUNTY);
types.add(LEVEL_TOWN);
types.add(LEVEL_VILLAGE);
}
/**
 * Special cities: they are rendered as LEVEL_CITY rows, but their next level skips LEVEL_COUNTY
 * and goes straight to LEVEL_TOWN.
 * There is too much data to check every city; add any further city found to behave this way here.
 */
static {
specialCitys.add("东莞市");
specialCitys.add("中山市");
specialCitys.add("儋州市");
}
//************************** Adjust the values below to your environment *************************************
/**
 * Index page where the crawl starts.
 */
public static final String webUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html";
/**
 * File the JSON dump is written to by saveJson.
 */
public static final String savePath = "C:/project/latestEbo/ebo-web/ebo/src/main/resources/china.json";
/**
 * Scope of the crawl: either the whole country ("中国") or the name of a single first-level
 * region, e.g. "广东省" or "北京市".
 */
public static final String AREA = "中国";
// Depth at which crawling stops: a node's children are fetched only while its current level is below this value.
public static int TARGET_LEVEL = 3;
/**
 * How the current level is determined while crawling:
 * LEVEL_MODE_STRING -- by the row's CSS class, LEVEL_MODE_NUMBER -- by the numeric level of the node.
 */
public static int LEVEL_MODE = LEVEL_MODE_NUMBER;
//************************** Adjust the values above to your environment *************************************
/**
 * Crawls the administrative divisions starting at {@link #webUrl}, prints the resulting tree as
 * JSON and stores the first three levels (province, city, county) through the repository.
 *
 * <p>Any failure is logged; nothing is propagated to the HTTP caller.
 *
 * <p>NOTE(review): nothing in this class is annotated {@code @Transactional}. Unless a transaction
 * is configured elsewhere, rows saved before a failure are presumably not rolled back -- confirm
 * and add a transactional service if all-or-nothing behaviour is required.
 */
@GetMapping(value = "/getRegionDirectory")
public void getRegionDirectory() {
    try {
        System.out.println("开始抓取,请耐心等待!!!");
        System.out.println("抓取范围:" + AREA + ",抓取模式(1--字符 2--数字):" + LEVEL_MODE + ",抓取层级:" + TARGET_LEVEL + "(模式为字符:1--province,2--city,3--county,4--town,5--village;)");
        long starttime = System.currentTimeMillis();
        RegionVo region = crawlProvinces();
        System.out.println(new Gson().toJson(region));
        persistRegions(region);
        long endtime = System.currentTimeMillis();
        System.out.println("抓取完毕!!!耗时:" + (endtime - starttime) / 1000 / 60 + "min");
    } catch (Exception e) {
        logger.error("获取省份数据失败", e);
        markRollbackOnlyIfInTransaction();
    }
}

/**
 * Builds the region tree by reading the province index page and descending through
 * {@link #parseNext} while the current level is below {@link #TARGET_LEVEL}.
 *
 * @return the root node ("中国") with the crawled provinces as children
 * @throws Exception if a page cannot be downloaded or parsed
 */
private static RegionVo crawlProvinces() throws Exception {
    RegionVo region = new RegionVo("000000000000", "中国", 0);
    region.child = new ArrayList<>();
    Document doc = getDocument(webUrl);
    for (Element row : doc.getElementsByClass(LEVEL_PROVINCE)) {
        for (Element link : row.getElementsByTag("a")) {
            // Absolute URL of the province page, e.g. .../2019/11.html
            String nextUrl = link.attr("abs:href");
            String[] arr = nextUrl.split("/");
            // The file name without extension is the 2-digit province code; pad it to 12 digits.
            String code = arr[arr.length - 1].split("\\.")[0] + "0000000000";
            String name = link.text();
            // Deliberately limited to Beijing: fetching every province in one run made the
            // site answer with HTTP 502.
            if (!"北京市".equals(name)) {
                continue;
            }
            if (!AREA.equals("中国") && !AREA.equals(name)) {
                continue;
            }
            System.out.println(name);
            RegionVo child = new RegionVo(code, name, 1);
            region.child.add(child);
            int currentlevel = LEVEL_MODE == LEVEL_MODE_STRING ? getLevel(LEVEL_PROVINCE) : child.level;
            // More levels are wanted below this province.
            if (currentlevel < TARGET_LEVEL) {
                parseNext(types.get(1), nextUrl, child);
            }
        }
    }
    return region;
}

/**
 * Saves provinces (pid 0), their cities and the cities' counties. Parents are flushed first so
 * that their generated id is available as the children's pid.
 *
 * @param country root of the crawled tree
 */
private void persistRegions(RegionVo country) {
    for (RegionVo province : childrenOf(country)) {
        RegionDirectoryModel provinceModel = regionDirectoryJPARepository.saveAndFlush(
                new RegionDirectoryModel(0, province.name, "汤辉红", "汤辉红"));
        for (RegionVo city : childrenOf(province)) {
            RegionDirectoryModel cityModel = regionDirectoryJPARepository.saveAndFlush(
                    new RegionDirectoryModel(provinceModel.getId(), city.name, "汤辉红", "汤辉红"));
            for (RegionVo county : childrenOf(city)) {
                regionDirectoryJPARepository.save(
                        new RegionDirectoryModel(cityModel.getId(), county.name, "汤辉红", "汤辉红"));
            }
        }
    }
}

/**
 * Returns the children of {@code region}, or an empty list when the node was never expanded
 * (rows without a link, or nodes at the target level, keep {@code child == null}).
 */
private static List<RegionVo> childrenOf(RegionVo region) {
    return region.child == null ? new ArrayList<RegionVo>() : region.child;
}

/**
 * Marks the surrounding transaction rollback-only when there is one. Outside an aspect-managed
 * transaction {@code currentTransactionStatus()} throws, which would otherwise escape from the
 * error handler and hide the original failure.
 */
private void markRollbackOnlyIfInTransaction() {
    try {
        TransactionAspectSupport.currentTransactionStatus().setRollbackOnly();
    } catch (org.springframework.transaction.NoTransactionException noTransaction) {
        logger.warn("No active transaction to roll back; rows saved before the failure are kept");
    }
}
/**
 * Downloads the page at {@code url} and parses it as HTML, decoding it with {@link #CHARSET}.
 * The URL doubles as the base URI so that {@code abs:href} lookups resolve relative links.
 *
 * @param url absolute URL of the page to fetch
 * @return the parsed document
 * @throws IOException if the page cannot be opened or read
 */
private static Document getDocument(String url) throws IOException {
    // The stream was previously never closed, leaking one connection per crawled page.
    // Closing again is harmless should the parser already have closed it.
    try (java.io.InputStream in = new URL(url).openStream()) {
        return Jsoup.parse(in, CHARSET, url);
    }
}
/**
 * Translates a row CSS class into its 1-based depth.
 *
 * @param type one of the LEVEL_ row classes
 * @return the 1-based position of {@code type} in {@code types}; 0 when it is not listed
 */
private static int getLevel(String type) {
    final int position = types.indexOf(type);
    return position + 1;
}
/**
 * Serialises {@code region} (including its children) to JSON and writes it to {@link #savePath},
 * creating the file or replacing its previous content.
 *
 * @param region root of the tree to dump
 * @throws IOException if the file cannot be created or written
 */
private static void saveJson(RegionVo region) throws IOException {
    // try-with-resources closes the file even when the write fails. UTF-8 is set explicitly
    // because the JSON holds Chinese names and the platform default charset varies by machine.
    try (BufferedWriter bw = java.nio.file.Files.newBufferedWriter(
            java.nio.file.Paths.get(savePath), java.nio.charset.StandardCharsets.UTF_8)) {
        bw.write(new Gson().toJson(region));
    }
}
/**
 * Reads one listing page and attaches every row of the requested class to {@code region} as a
 * child, descending into linked rows while the current level is below {@link #TARGET_LEVEL}.
 *
 * @param type   CSS class of the rows to read (one of the LEVEL_ constants)
 * @param url    URL of the page to fetch
 * @param region node that receives the parsed children; its child list is replaced
 * @throws Exception if a page cannot be downloaded or a row does not have the expected cells
 */
public static void parseNext(String type, String url, RegionVo region) throws Exception {
    region.child = new ArrayList<>();
    Elements rows = getDocument(url).getElementsByClass(type);
    final boolean villagePage = LEVEL_VILLAGE.equals(type);
    for (Element row : rows) {
        if (villagePage) {
            // <tr class="villagetr"><td>110101001001</td><td>111</td><td>多福巷社区居委会</td></tr>
            Elements cells = row.getElementsByTag("td");
            String villageCode = cells.get(0).text();
            String villageName = cells.get(2).text();
            RegionVo village = new RegionVo(villageCode, villageName, region.level + 1);
            region.child.add(village);
            System.out.println(space(village.level) + villageName);
            continue;
        }

        // Two row layouts exist above village level:
        //   plain:  <tr class="countytr"><td>130101000000</td><td>市辖区</td></tr>
        //   linked: <tr class="countytr"><td><a href="01/130102.html">130102000000</a></td>
        //           <td><a href="01/130102.html">长安区</a></td></tr>
        Elements links = row.getElementsByTag("a");
        final boolean linked = !links.isEmpty();
        String code;
        String name;
        String nextUrl = null;
        if (linked) {
            // e.g. 13/1301.html, resolved against the page URL
            nextUrl = links.get(0).attr("abs:href");
            code = links.get(0).text();
            name = links.get(1).text();
        } else {
            Elements cells = row.getElementsByTag("td");
            code = cells.get(0).text();
            name = cells.get(1).text();
        }

        RegionVo node = new RegionVo(code, name, region.level + 1);
        region.child.add(node);
        System.out.println(space(node.level) + name);

        int depth = (LEVEL_MODE == LEVEL_MODE_STRING) ? getLevel(type) : node.level;
        if (!linked || depth >= TARGET_LEVEL) {
            continue;
        }

        // Cities such as 东莞市 list towns directly below the city, skipping the county level.
        boolean skipsCounty = LEVEL_MODE == LEVEL_MODE_NUMBER && specialCitys.contains(name);
        String nextType = skipsCounty ? LEVEL_TOWN : types.get(types.indexOf(type) + 1);
        parseNext(nextType, nextUrl, node);
    }
}
/**
 * Builds the indentation used when printing a region at the given depth: one space per level.
 *
 * <p>The previous implementation took a substring of a one-character string and therefore threw
 * {@code StringIndexOutOfBoundsException} for every level from 2 to 5.
 *
 * @param level depth of the region being printed
 * @return {@code level} spaces for levels 1 to 5; an empty string for any other level
 */
private static String space(int level) {
    if (level <= 0 || level > 5) {
        return "";
    }
    StringBuilder indent = new StringBuilder(level);
    for (int i = 0; i < level; i++) {
        indent.append(' ');
    }
    return indent.toString();
}
}
RegionVo
package com.bos.data.model.vo.basic;
import lombok.Data;
import java.util.List;
/**
 * One node of the crawled region tree: a code, a name, its depth and its child nodes.
 * The fields are public because the crawler reads and writes them directly.
 *
 * @Author tanghh
 * @Date 2020/6/23 10:41
 */
@Data
public class RegionVo {
/**
 * Region code.
 */
public String code;
/**
 * Region name.
 */
public String name;
/**
 * Depth of this node; the crawler uses 0 for the root and 1 for provinces.
 */
public int level;
/**
 * Child regions; not set by the constructor, so it stays null until the crawler assigns a list.
 */
public List<RegionVo> child;
// Creates a node without children.
public RegionVo(String code, String name, int level) {
this.code = code;
this.name = name;
this.level = level;
}
}
RegionDirectoryModel
package com.bos.data.model;
import javax.persistence.*;
import java.io.Serializable;
import java.sql.Timestamp;
import java.util.Objects;
/**
 * JPA entity mapped to the {@code region_directory} table: one row per region, linked to its
 * parent through {@code pid}. Mapping annotations sit on the getters.
 *
 * @author luojie 2018/7/4
 */
@Entity
@Table(name = "region_directory", schema = "test", catalog = "")
public class RegionDirectoryModel implements Serializable {
    private Integer id;
    private Integer pid;
    private String name;
    private String nameCn;
    // New entities start with is_open = "0".
    private String isOpen="0";
    private Timestamp createTime;
    private Timestamp updateTime;
    private String createUser;
    private String updateUser;

    /** Primary key, generated by the database identity column. */
    @Id
    @Column(name = "id")
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    /** Region name. */
    @Basic
    @Column(name = "name")
    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /** Value of the {@code name_CN} column. */
    @Basic
    @Column(name = "name_CN")
    public String getNameCn() {
        return nameCn;
    }

    public void setNameCn(String nameCn) {
        this.nameCn = nameCn;
    }

    /** Id of the parent region. */
    @Basic
    @Column(name = "pid")
    public Integer getPid() {
        return pid;
    }

    public void setPid(Integer pid) {
        this.pid = pid;
    }

    /** Value of the {@code is_open} column. */
    @Basic
    @Column(name = "is_open")
    public String getIsOpen() {
        return isOpen;
    }

    public void setIsOpen(String isOpen) {
        this.isOpen = isOpen;
    }

    /** Creation timestamp. */
    @Basic
    @Column(name = "create_time")
    public Timestamp getCreateTime() {
        return createTime;
    }

    public void setCreateTime(Timestamp createTime) {
        this.createTime = createTime;
    }

    /** Last-modification timestamp. */
    @Basic
    @Column(name = "update_time")
    public Timestamp getUpdateTime() {
        return updateTime;
    }

    public void setUpdateTime(Timestamp updateTime) {
        this.updateTime = updateTime;
    }

    /** User who created the row. */
    @Basic
    @Column(name = "create_user")
    public String getCreateUser() {
        return createUser;
    }

    public void setCreateUser(String createUser) {
        this.createUser = createUser;
    }

    /** User who last modified the row. */
    @Basic
    @Column(name = "update_user")
    public String getUpdateUser() {
        return updateUser;
    }

    public void setUpdateUser(String updateUser) {
        this.updateUser = updateUser;
    }

    /** No-argument constructor. */
    public RegionDirectoryModel() {
    }

    /**
     * Creates a region that has not been persisted yet.
     *
     * @param pid        id of the parent region
     * @param name       region name
     * @param createUser user recorded as creator
     * @param updateUser user recorded as last modifier
     */
    public RegionDirectoryModel(Integer pid, String name, String createUser, String updateUser) {
        this.pid = pid;
        this.name = name;
        this.createUser = createUser;
        this.updateUser = updateUser;
    }

    /**
     * Two entities are equal when id, name, nameCn and pid are all equal.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        RegionDirectoryModel that = (RegionDirectoryModel) o;
        // id is a boxed Integer: "==" compares references, so two separately boxed ids with the
        // same value can compare unequal. Compare by value instead.
        return Objects.equals(id, that.id) &&
                Objects.equals(name, that.name) &&
                Objects.equals(nameCn, that.nameCn) &&
                Objects.equals(pid, that.pid);
    }

    /** Hash over the same fields that {@link #equals(Object)} compares. */
    @Override
    public int hashCode() {
        return Objects.hash(id, name, nameCn, pid);
    }
}
本篇文章就到这里,
如果觉得笔者写的不错的话,欢迎评论点赞。
下篇文章贴出所有省份数据。