1、准备maven包
<!-- jsoup: HTML parser; AreaUtils uses Jsoup.parse and the Document/Elements/Element types. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.4</version>
</dependency>
<!-- Apache HttpClient 4.x: AreaUtils.parseUrl downloads each page with CloseableHttpClient/HttpGet. -->
<!-- NOTE(review): AreaUtils also references StringUtils and JSONObject, which are not in the
     org.jsoup or org.apache.http packages; presumably commons-lang(3) and fastjson are
     declared elsewhere in the project - confirm. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.14</version>
</dependency>
2、对象类
/**
 * One node of the administrative-division tree built by {@code AreaUtils}.
 *
 * <p>Explicit accessors are declared because {@code AreaUtils} calls
 * {@code setCode}, {@code setName}, {@code setRuleId}, {@code setSort},
 * {@code setChildren} and {@code getRuleId}; without them the class only
 * compiles when an annotation processor generates them.
 */
public class AreaDto {
    /** Identifier; never assigned by the crawler (presumably a database key - confirm). */
    private Integer id;
    /** Display name of the division as scraped from the page. */
    private String name;
    /** Division code as scraped from the page. */
    private String code;
    /** Identifier of the parent node; never assigned by the crawler. */
    private Integer parentId;
    /** 1-based position of this node among its siblings. */
    private Integer sort;
    /** Status flag; never assigned by the crawler (meaning not visible here). */
    private Integer status;
    /** Hierarchical, zero-padded id produced with {@code RuleIdUtils}. */
    private String ruleId;
    /** Direct children of this node; empty by default. */
    private List<AreaDto> children = new ArrayList<>();

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public Integer getParentId() {
        return parentId;
    }

    public void setParentId(Integer parentId) {
        this.parentId = parentId;
    }

    public Integer getSort() {
        return sort;
    }

    public void setSort(Integer sort) {
        this.sort = sort;
    }

    public Integer getStatus() {
        return status;
    }

    public void setStatus(Integer status) {
        this.status = status;
    }

    public String getRuleId() {
        return ruleId;
    }

    public void setRuleId(String ruleId) {
        this.ruleId = ruleId;
    }

    public List<AreaDto> getChildren() {
        return children;
    }

    public void setChildren(List<AreaDto> children) {
        this.children = children;
    }
}
3、编写工具类
/**
 * Helpers for fixed-width, zero-padded numeric "rule ids" that encode a
 * position in a hierarchy: a child's id is its parent's id followed by a
 * fixed-width sequence number (for example {@code "001002" + "001"}).
 */
public class RuleIdUtils {

    /**
     * Builds the first id of a level: {@code num - 1} zeros followed by {@code 1}.
     *
     * @param num width of the id in characters; must be at least 2
     * @return the id, e.g. {@code "001"} for {@code num == 3}
     * @throws IllegalArgumentException if {@code num < 2}
     */
    public static String getRootRuleId(int num) {
        if (num < 2) {
            throw new IllegalArgumentException("必须大于等于2!");
        }
        StringBuilder sb = new StringBuilder(num);
        for (int i = 0; i < num - 1; i++) {
            sb.append('0');
        }
        sb.append('1');
        return sb.toString();
    }

    /**
     * Builds the id of the first child below {@code parentRuleId}.
     *
     * @param parentRuleId id of the parent node; must not be null
     * @param num width of the child segment; must not be null and at least 2
     * @return {@code parentRuleId} followed by {@link #getRootRuleId(int)}
     * @throws NullPointerException if either argument is null
     * @throws IllegalArgumentException if {@code num < 2}
     */
    public static String getFirstChildRuleId(String parentRuleId, Integer num) {
        // Fail loudly instead of producing "null001" or an anonymous unboxing NPE.
        if (parentRuleId == null) {
            throw new NullPointerException("parentRuleId must not be null");
        }
        if (num == null) {
            throw new NullPointerException("num must not be null");
        }
        return parentRuleId + getRootRuleId(num);
    }

    /**
     * Returns the id that follows {@code RuleId}, keeping its width and zero padding.
     *
     * <p>The increment is done directly on the decimal digits, so the id may be
     * arbitrarily long (it is not limited to the range of {@code long}).
     *
     * @param RuleId current id; a non-empty string of ASCII digits
     * @return the next id with the same length, e.g. {@code "0009" -> "0010"}
     * @throws NullPointerException if {@code RuleId} is null
     * @throws NumberFormatException if {@code RuleId} is empty or contains a non-digit
     * @throws IllegalStateException if every digit is 9, i.e. the width is exhausted
     */
    public static String getNextRuleId(String RuleId) {
        char[] digits = RuleId.toCharArray();
        if (digits.length == 0) {
            throw new NumberFormatException("For input string: \"" + RuleId + "\"");
        }
        for (char c : digits) {
            if (c < '0' || c > '9') {
                throw new NumberFormatException("For input string: \"" + RuleId + "\"");
            }
        }
        // Propagate the carry from the least significant digit.
        int pos = digits.length - 1;
        while (pos >= 0 && digits[pos] == '9') {
            digits[pos] = '0';
            pos--;
        }
        if (pos < 0) {
            // All digits were 9: the next value would need one more character.
            throw new IllegalStateException("超出最大范围!");
        }
        digits[pos]++;
        return new String(digits);
    }

    /** Small manual demo of the three helpers. */
    public static void main(String[] args) {
        System.out.println(getRootRuleId(3));
        System.out.println(getFirstChildRuleId("001002", 3));
        System.out.println(getNextRuleId("001002001"));
    }
}
/**
 * Crawls the province / city / county pages under {@link #REGION_URL} and
 * turns them into a tree of {@code AreaDto} nodes.
 */
public class AreaUtils {
    // Base URL of the statistics bureau's administrative-division pages.
    public static final String REGION_URL="http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022";

    /**
     * Downloads {@code url} and parses the body as HTML.
     *
     * @param url absolute URL of the page to fetch
     * @return the parsed document, or {@code null} when the request fails, the
     *         status is not 200, or the body is empty
     */
    public static Document parseUrl(String url) {
        // try-with-resources closes the response and the client even when
        // execute() or EntityUtils.toString() throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
            int status = response.getStatusLine().getStatusCode();
            if (status != 200) {
                System.err.println("Unexpected HTTP status " + status + " for " + url);
                return null;
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return null;
            }
            String result = EntityUtils.toString(entity, "UTF-8");
            if (StringUtils.isNotEmpty(result)) {
                return Jsoup.parse(result);
            }
            return null;
        } catch (IOException e) {
            System.err.println("Failed to fetch " + url);
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Reads the index page and builds the full tree: provinces, their cities
     * and the cities' counties.
     *
     * @return the province nodes in page order; empty when the index page
     *         cannot be loaded
     */
    public static List<AreaDto> getArea() {
        List<AreaDto> data = new ArrayList<AreaDto>();
        Document doc = parseUrl(REGION_URL + "/index.html");
        if (doc == null) {
            // Same "give up quietly" policy the child loaders use.
            return data;
        }
        String ruleId = "0000";
        int sort = 1;
        Elements links = doc.getElementsByClass("provincetr").select("a[href]");
        for (Element link : links) {
            String href = link.attr("href");
            AreaDto area = new AreaDto();
            // The href looks like "<code>.html"; keep the part before the first dot.
            area.setCode(href.split("\\.")[0]);
            area.setName(link.html().replace("<br>", ""));
            ruleId = RuleIdUtils.getNextRuleId(ruleId);
            area.setRuleId(ruleId);
            area.setSort(sort++);
            getCityChildren(REGION_URL + "/" + href, area, "citytr");
            data.add(area);
        }
        return data;
    }

    /**
     * Loads the rows of a province page into {@code parent} and, for every row
     * that links further, loads that row's county page as well.
     */
    private static void getCityChildren(String link,AreaDto parent,String className) {
        loadChildren(link, parent, className, "countytr");
    }

    /** Loads the rows of a city page into {@code parent}; no further descent. */
    private static void getCountyChildren(String link,AreaDto parent,String className) {
        loadChildren(link, parent, className, null);
    }

    /**
     * Shared row parser for city and county pages.
     *
     * @param link page to fetch
     * @param parent node that receives the parsed rows as children; left
     *        untouched when the page cannot be loaded
     * @param className CSS class of the table rows to read
     * @param childClassName row class of the next level, or {@code null} to
     *        stop at this level
     */
    private static void loadChildren(String link, AreaDto parent, String className,
            String childClassName) {
        Document doc = parseUrl(link);
        if (doc == null) {
            return;
        }
        int sort = 1;
        String ruleId = parent.getRuleId() + "0000";
        List<AreaDto> data = new ArrayList<AreaDto>();
        for (Element row : doc.getElementsByClass(className).select("tr")) {
            Elements tds = row.select("td");
            if (tds.size() != 2) {
                continue;
            }
            Element codeCell = linkOrCell(tds.get(0));
            Element nameCell = linkOrCell(tds.get(1));
            AreaDto area = new AreaDto();
            area.setCode(codeCell.html());
            area.setName(nameCell.html());
            ruleId = RuleIdUtils.getNextRuleId(ruleId);
            area.setRuleId(ruleId);
            area.setSort(sort++);
            // attr() yields "" when the cell had no <a>; skip the pointless request then.
            String href = nameCell.attr("href");
            if (childClassName != null && !href.isEmpty()) {
                getCountyChildren(REGION_URL + "/" + href, area, childClassName);
            }
            data.add(area);
        }
        parent.setChildren(data);
    }

    /** Returns the first {@code <a>} inside the cell, or the cell itself when it has no link. */
    private static Element linkOrCell(Element cell) {
        Element anchor = cell.select("a").first();
        return anchor != null ? anchor : cell;
    }

    /** Crawls everything and prints the tree as JSON. */
    public static void main(String[] args) {
        List<AreaDto> data = getArea();
        System.out.println(JSONObject.toJSONString(data));
    }
}
4、说明
由于省市区数据量很大,开发工具的控制台不一定能完整打印出来,建议边解析边插入数据库。如果嫌麻烦,也可以直接使用现成的 SQL 脚本创建:https://download.csdn.net/download/keng2206/87650437 。