爬虫行政区号并获取电话区号和邮编
1、mysql建表语句
CREATE TABLE `his_sys_plat_region` (
`Id` bigint(20) NOT NULL AUTO_INCREMENT,
`RegionId` bigint(20) NOT NULL DEFAULT '0' COMMENT '地区ID',
`ParentRegionId` bigint(20) NOT NULL DEFAULT '0' COMMENT '上级地区ID',
`RegionName` varchar(50) DEFAULT NULL COMMENT '地区名称',
`RegionNameEN` varchar(50) DEFAULT NULL COMMENT '英文地区名称',
`Depth` int(11) NOT NULL DEFAULT '0' COMMENT '地区层级',
`IdPath` varchar(255) DEFAULT NULL COMMENT '地区ID路径',
`NamePath` varchar(255) DEFAULT NULL COMMENT '地区名称路径',
`PostCode` varchar(20) DEFAULT NULL COMMENT '邮政编码',
`TelephoneCode` varchar(20) DEFAULT NULL COMMENT '电话区号',
`OrderNum` int(11) NOT NULL DEFAULT '0' COMMENT '排序号',
`Lat` varchar(60) DEFAULT NULL COMMENT '地区中心经度',
`Lng` varchar(60) DEFAULT NULL COMMENT '地区中心纬度',
`Active` tinyint(4) NOT NULL DEFAULT '1' COMMENT '是否有效',
`QueryStr` varchar(8000) DEFAULT NULL COMMENT '查询字符串',
`CreateTime` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
`UpdateTime` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
PRIMARY KEY (`Id`),
KEY `idx_his_sys_plat_region_rid` (`RegionId`),
KEY `idx_his_sys_plat_region_pid` (`ParentRegionId`),
KEY `idx_his_sys_plat_region_idp` (`IdPath`)
) ENGINE=InnoDB AUTO_INCREMENT=3194 DEFAULT CHARSET=utf8mb4;
2、对应的实体类
import java.util.Date;
import java.util.Map;
import yb.common.util.InitListMap;
import yb.core.tablemodel.annotation.Column;
import yb.core.tablemodel.annotation.Table;
import yz.his.model.BaseModel;
@Table(name = "his_sys_plat_region")
public class HisSysPlatRegionModel extends BaseModel {
protected transient Map<String, String> fieldDictTypes = InitListMap.toHashMap(new String[] {});
@Column(name = "Id", primaryKey = true, insertable = false)
private long Id;
@Column(name = "RegionId")
private long RegionId = 0L;
@Column(name = "ParentRegionId")
private long ParentRegionId = 0L;
@Column(name = "RegionName")
private String RegionName;
@Column(name = "RegionNameEN")
private String RegionNameEN;
@Column(name = "Depth")
private int Depth = 0;
@Column(name = "IdPath")
private String IdPath;
@Column(name = "NamePath")
private String NamePath;
@Column(name = "PostCode")
private String PostCode;
@Column(name = "TelephoneCode")
private String TelephoneCode;
@Column(name = "OrderNum")
private int OrderNum = 0;
@Column(name = "Lat")
private String Lat;
@Column(name = "Lng")
private String Lng;
@Column(name = "Active")
private int Active = 1;
@Column(name = "QueryStr")
private String QueryStr;
@Column(name = "CreateTime", insertable = false, updateable = false)
private Date CreateTime;
@Column(name = "UpdateTime", insertable = false, updateable = false)
private Date UpdateTime;
public long getId() {
return Id;
}
public void setId(long Id) {
this.Id = Id;
}
public long getRegionId() {
return RegionId;
}
public void setRegionId(long RegionId) {
this.RegionId = RegionId;
}
public long getParentRegionId() {
return ParentRegionId;
}
public void setParentRegionId(long ParentRegionId) {
this.ParentRegionId = ParentRegionId;
}
public String getRegionName() {
return RegionName;
}
public void setRegionName(String RegionName) {
this.RegionName = RegionName;
}
public String getRegionNameEN() {
return RegionNameEN;
}
public void setRegionNameEN(String RegionNameEN) {
this.RegionNameEN = RegionNameEN;
}
public int getDepth() {
return Depth;
}
public void setDepth(int Depth) {
this.Depth = Depth;
}
public String getIdPath() {
return IdPath;
}
public void setIdPath(String IdPath) {
this.IdPath = IdPath;
}
public String getNamePath() {
return NamePath;
}
public void setNamePath(String NamePath) {
this.NamePath = NamePath;
}
public String getPostCode() {
return PostCode;
}
public void setPostCode(String PostCode) {
this.PostCode = PostCode;
}
public String getTelephoneCode() {
return TelephoneCode;
}
public void setTelephoneCode(String TelephoneCode) {
this.TelephoneCode = TelephoneCode;
}
public int getOrderNum() {
return OrderNum;
}
public void setOrderNum(int OrderNum) {
this.OrderNum = OrderNum;
}
public String getLat() {
return Lat;
}
public void setLat(String Lat) {
this.Lat = Lat;
}
public String getLng() {
return Lng;
}
public void setLng(String Lng) {
this.Lng = Lng;
}
public int getActive() {
return Active;
}
public void setActive(int Active) {
this.Active = Active;
}
public String getQueryStr() {
return QueryStr;
}
public void setQueryStr(String QueryStr) {
this.QueryStr = QueryStr;
}
public Date getCreateTime() {
return CreateTime;
}
public void setCreateTime(Date CreateTime) {
this.CreateTime = CreateTime;
}
public Date getUpdateTime() {
return UpdateTime;
}
public void setUpdateTime(Date UpdateTime) {
this.UpdateTime = UpdateTime;
}
@Override
public Map<String, String> getFieldDictTypes() {
return fieldDictTypes;
}
}
3、同步代码
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import yb.common.util.Http;
import yb.common.util.ObjectTransfer;
import yb.common.util.Tester;
import yb.core.bundle.annotation.YB;
import yb.core.business.YBService;
import yb.core.dao.IDao;
import yb.core.loader.BundleLoader;
import yb.module.pinyin.PinyinFormat;
import yb.module.pinyin.PinyinHelper;
import yz.his.common.util.LocationPoint;
import yz.his.common.util.NameKeywordsUtil;
import yz.his.model.sys.HisSysPlatRegionModel;
@YB
public class SyncRegionServer extends YBService {
private static final String PROV = "province";
private static final String CITY = "city";
private static final String REGION = "region";
private String getData(String url) throws Exception {
if (url == null || url.isEmpty())
return "";
String result = "";
Http http = new Http();
result = new String(http.get(url), "utf-8");
return result;
}
private List<HisSysPlatRegionModel> getDataStructure(String str) throws Exception {
final Pattern provMatcher = Pattern.compile(
"<td height=19 class=xl658256 style='height:14.25pt'></td>\\s+" + "<td class=xl708256>(.*)</td>\\s+"
+ "<td class=xl708256>(.*)</td>\\s+" + "<td class=xl658256></td>\\s+"
+ "<td class=xl698256></td>\\s+" + "<td class=xl698256></td>\\s+"
+ "<td class=xl698256></td>\\s+" + "<td class=xl638256></td>\\s+" + "<td class=xl638256></td>",
Pattern.DOTALL);
final Pattern regionMatcher = Pattern.compile("<td height=19 class=xl658256 style='height:14.25pt'></td>\\s+"
+ "<td class=xl718256>(.*)</td>\\s+"
+ "<td class=xl718256><span style='mso-spacerun:yes'> </span>(.*)</td>\\s+"
+ "<td class=xl658256></td>\\s+" + "<td class=xl698256></td>\\s+" + "<td class=xl698256></td>\\s+"
+ "<td class=xl698256></td>\\s+" + "<td class=xl638256></td>\\s+" + "<td class=xl638256></td>",
Pattern.DOTALL);
final Pattern specialMatcher = Pattern.compile("<td height=19 class=xl698256 style='height:14.25pt'></td>\\s+"
+ "<td class=xl708256>(.*)</td>\\s+" + "<td class=xl708256>(.*)</td>\\s+"
+ "<td class=xl698256></td>\\s+" + "<td class=xl698256></td>\\s+" + "<td class=xl698256></td>\\s+"
+ "<td class=xl698256></td>\\s+" + "<td class=xl638256></td>", Pattern.DOTALL);
String[] info = str.split("</tr>");
Map<Long, HisSysPlatRegionModel> entityMap = new HashMap<>();
List<HisSysPlatRegionModel> list = new ArrayList<HisSysPlatRegionModel>();
HisSysPlatRegionModel china = new HisSysPlatRegionModel();
china.setRegionId(86L);
china.setRegionName("中国");
china.setDepth(1);
china.setIdPath("|86|");
china.setNamePath("中国");
list.add(china);
Matcher pm = null;
Matcher cm = null;
Matcher rm = null;
Matcher sm = null;
Matcher matcher = null;
String belong = null;
for (String s : info) {
pm = provMatcher.matcher(s);
rm = regionMatcher.matcher(s);
sm = specialMatcher.matcher(s);
if (pm.find()) {
matcher = pm;
if (!pm.group(2).contains("<")) {
belong = PROV;
} else {
belong = CITY;
}
} else if (rm.find()) {
belong = REGION;
matcher = rm;
} else if (sm.find()) {
belong = PROV;
matcher = sm;
} else {
continue;
}
HisSysPlatRegionModel entity = getEntity(matcher, belong, entityMap);
list.add(entity);
}
return list;
}
private HisSysPlatRegionModel getEntity(Matcher matcher, String belong, Map<Long, HisSysPlatRegionModel> entityMap)
throws Exception {
HisSysPlatRegionModel entity = new HisSysPlatRegionModel();
String regionId = matcher.group(1);
String regionName = matcher.group(2);
entity.setRegionId(ObjectTransfer.longValue(regionId));
Long key = null;
HisSysPlatRegionModel value = null;
LocationPoint location = null;
switch (belong) {
case PROV:
entity.setParentRegionId(86L);
entity.setDepth(2);
entity.setIdPath("|86|" + regionId + "|");
entity.setNamePath("中国|" + regionName);
if (regionName.endsWith("市")) {
List<String> telephoneCodeAndPostCode = getTelephoneCodeAndPostCode(regionName);
System.out.println(ObjectTransfer.print(telephoneCodeAndPostCode));
if (telephoneCodeAndPostCode != null && telephoneCodeAndPostCode.size() == 2) {
entity.setTelephoneCode(telephoneCodeAndPostCode.get(0));
entity.setPostCode(telephoneCodeAndPostCode.get(1));
}
}
break;
case CITY:
regionName = regionName.replace("<span style='mso-spacerun:yes'> </span>", "");
key = ObjectTransfer.longValue(regionId.substring(0, 2).concat("0000"));
if ((value = entityMap.get(key)) != null) {
entity.setParentRegionId(key);
entity.setIdPath(value.getIdPath() + regionId + "|");
entity.setNamePath(value.getNamePath() + "|" + regionName);
entity.setDepth(3);
List<String> telephoneCodeAndPostCode = getTelephoneCodeAndPostCode(regionName);
System.out.println(ObjectTransfer.print(telephoneCodeAndPostCode));
if (telephoneCodeAndPostCode != null && telephoneCodeAndPostCode.size() == 2) {
entity.setTelephoneCode(telephoneCodeAndPostCode.get(0));
entity.setPostCode(telephoneCodeAndPostCode.get(1));
}
}
break;
case REGION:
key = ObjectTransfer.longValue(regionId.substring(0, 4).concat("00"));
if ((value = entityMap.get(key)) != null) {
entity.setDepth(4);
} else {
key = ObjectTransfer.longValue(regionId.substring(0, 2).concat("0000"));
value = entityMap.get(key);
entity.setDepth(3);
}
entity.setParentRegionId(key);
entity.setIdPath(value.getIdPath() + regionId + "|");
entity.setNamePath(value.getNamePath() + "|" + regionName);
break;
default:
break;
}
entity.setRegionName(regionName);
String queryStr = getQueryStr(regionName);
entity.setQueryStr(queryStr);
entityMap.put(ObjectTransfer.longValue(regionId), entity);
return entity;
}
private String getQueryStr(String word) {
if (word == null || word.isEmpty()) {
return "";
}
StringBuffer sb = new StringBuffer(word);
Map<String, String> map = NameKeywordsUtil.pinyinOfName(word);
for (Entry<String, String> entry : map.entrySet()) {
if (entry == null || entry.getKey() == null)
continue;
sb.append("|" + entry.getKey());
}
return sb.toString();
}
private void start() throws Exception {
IDao dao = dao("his");
String url = "http://www.mca.gov.cn/article/sj/xzqh/2019/2019/201911250933.html";
String data = getData(url);
List<HisSysPlatRegionModel> resultList = getDataStructure(data);
dao.insertList(resultList);
}
public List<String> getTelephoneCodeAndPostCode(String area) throws Exception {
if (area == null || area.isEmpty()) {
return null;
}
String search = "";
if (area.endsWith("市"))
search = area.replace("市", "");
if (area.endsWith("盟"))
search = area.replace("盟", "");
if (area.endsWith("地区"))
search = area.replace("地区", "");
if (area.endsWith("州"))
search = area.substring(0, 2);
String convertToPinyinString = PinyinHelper.convertToPinyinString(search, "", PinyinFormat.WITHOUT_TONE);
String url = "http://quhao.tianqi.com/" + convertToPinyinString + "/";
Pattern tcPattern = Pattern
.compile("<td><a title=\"(.*)是哪里的区号\" target=\"_blank\" href=\"/(.*)/\">(.*)</a></td>");
Pattern pcPattern = Pattern
.compile("<td><a title=\"(.*)是哪里的邮编\" target=\"_blank\" href=\"http://youbian.tianqi.com/"
+ convertToPinyinString + "/\">(.*)</a>");
Http http = new Http();
byte[] bs = http.get(url);
String result = new String(bs, "utf-8");
Matcher tcMatcher = tcPattern.matcher(result);
Matcher pcMatcher = pcPattern.matcher(result);
List<String> list = new ArrayList<>();
if (tcMatcher.find()) {
list.add(tcMatcher.group(1));
} else {
String url2 = "https://www.chahaoba.com/%E4%B8%AD%E5%9B%BD%E7%94%B5%E8%AF%9D%E5%8C%BA%E5%8F%B7?search="
+ area;
byte[] bs2 = http.get(url2);
String tc = new String(bs2, "utf-8");
Pattern pattern = Pattern.compile("<ul><li> 国内拨打:<a href=\"/21\" title=\"21\">(.*)</a></li>");
Matcher matcher = pattern.matcher(tc);
if (matcher.find()) {
list.add(matcher.group(1));
}
list.add("");
return list;
}
if (pcMatcher.find()) {
list.add(pcMatcher.group(1));
}
return list;
}
public static void main(String[] args) {
try {
Tester.init();
BundleLoader loader = Tester.initContext(3000, 3000);
SyncRegionServer server = loader.getBean(SyncRegionServer.class);
long startTime = System.currentTimeMillis();
server.start();
System.out.println("共计用时:" + (System.currentTimeMillis() - startTime) / 1000);
} catch (Exception e) {
e.printStackTrace();
} finally {
Tester.close();
}
}
}
4、实际数据库效果