爬虫行政区号并获取电话区号和邮编

爬虫行政区号并获取电话区号和邮编

1、mysql建表语句

CREATE TABLE `his_sys_plat_region` (
  `Id` bigint(20) NOT NULL AUTO_INCREMENT,
  `RegionId` bigint(20) NOT NULL DEFAULT '0' COMMENT '地区ID',
  `ParentRegionId` bigint(20) NOT NULL DEFAULT '0' COMMENT '上级地区ID',
  `RegionName` varchar(50) DEFAULT NULL COMMENT '地区名称',
  `RegionNameEN` varchar(50) DEFAULT NULL COMMENT '英文地区名称',
  `Depth` int(11) NOT NULL DEFAULT '0' COMMENT '地区层级',
  `IdPath` varchar(255) DEFAULT NULL COMMENT '地区ID路径',
  `NamePath` varchar(255) DEFAULT NULL COMMENT '地区名称路径',
  `PostCode` varchar(20) DEFAULT NULL COMMENT '邮政编码',
  `TelephoneCode` varchar(20) DEFAULT NULL COMMENT '电话区号',
  `OrderNum` int(11) NOT NULL DEFAULT '0' COMMENT '排序号',
  `Lat` varchar(60) DEFAULT NULL COMMENT '地区中心经度',
  `Lng` varchar(60) DEFAULT NULL COMMENT '地区中心纬度',
  `Active` tinyint(4) NOT NULL DEFAULT '1' COMMENT '是否有效',
  `QueryStr` varchar(8000) DEFAULT NULL COMMENT '查询字符串',
  `CreateTime` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
  `UpdateTime` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
  PRIMARY KEY (`Id`),
  KEY `idx_his_sys_plat_region_rid` (`RegionId`),
  KEY `idx_his_sys_plat_region_pid` (`ParentRegionId`),
  KEY `idx_his_sys_plat_region_idp` (`IdPath`)
) ENGINE=InnoDB AUTO_INCREMENT=3194 DEFAULT CHARSET=utf8mb4;

2、对应的实体类

import java.util.Date;
import java.util.Map;

import yb.common.util.InitListMap;
import yb.core.tablemodel.annotation.Column;
import yb.core.tablemodel.annotation.Table;
import yz.his.model.BaseModel;

/**
 * 数据库表his_sys_plat_region的model类
 * 
 * @author 系统自动生成
 * @create 2019-12-20
 */
@Table(name = "his_sys_plat_region")
public class HisSysPlatRegionModel extends BaseModel {
	protected transient Map<String, String> fieldDictTypes = InitListMap.toHashMap(new String[] {});

	@Column(name = "Id", primaryKey = true, insertable = false)
	private long Id;

	@Column(name = "RegionId")
	private long RegionId = 0L; // 地区ID

	@Column(name = "ParentRegionId")
	private long ParentRegionId = 0L; // 上级地区ID

	@Column(name = "RegionName")
	private String RegionName; // 地区名称

	@Column(name = "RegionNameEN")
	private String RegionNameEN; // 英文地区名称

	@Column(name = "Depth")
	private int Depth = 0; // 地区层级

	@Column(name = "IdPath")
	private String IdPath; // 地区ID路径

	@Column(name = "NamePath")
	private String NamePath; // 地区名称路径

	@Column(name = "PostCode")
	private String PostCode; // 邮政编码

	@Column(name = "TelephoneCode")
	private String TelephoneCode; // 电话区号

	@Column(name = "OrderNum")
	private int OrderNum = 0; // 排序号

	@Column(name = "Lat")
	private String Lat; // 地区中心经度

	@Column(name = "Lng")
	private String Lng; // 地区中心纬度

	@Column(name = "Active")
	private int Active = 1; // 是否有效

	@Column(name = "QueryStr")
	private String QueryStr; // 查询字符串

	@Column(name = "CreateTime", insertable = false, updateable = false)
	private Date CreateTime; // 创建时间

	@Column(name = "UpdateTime", insertable = false, updateable = false)
	private Date UpdateTime; // 更新时间

	/**
	 * Id
	 */
	public long getId() {
		return Id;
	}

	/**
	 * Id
	 */
	public void setId(long Id) {
		this.Id = Id;
	}

	/**
	 * 地区ID<br/>
	 * default: 0
	 */
	public long getRegionId() {
		return RegionId;
	}

	/**
	 * 地区ID<br/>
	 * default: 0
	 */
	public void setRegionId(long RegionId) {
		this.RegionId = RegionId;
	}

	/**
	 * 上级地区ID<br/>
	 * default: 0
	 */
	public long getParentRegionId() {
		return ParentRegionId;
	}

	/**
	 * 上级地区ID<br/>
	 * default: 0
	 */
	public void setParentRegionId(long ParentRegionId) {
		this.ParentRegionId = ParentRegionId;
	}

	/**
	 * 地区名称
	 */
	public String getRegionName() {
		return RegionName;
	}

	/**
	 * 地区名称
	 */
	public void setRegionName(String RegionName) {
		this.RegionName = RegionName;
	}

	/**
	 * 英文地区名称
	 */
	public String getRegionNameEN() {
		return RegionNameEN;
	}

	/**
	 * 英文地区名称
	 */
	public void setRegionNameEN(String RegionNameEN) {
		this.RegionNameEN = RegionNameEN;
	}

	/**
	 * 地区层级<br/>
	 * default: 0
	 */
	public int getDepth() {
		return Depth;
	}

	/**
	 * 地区层级<br/>
	 * default: 0
	 */
	public void setDepth(int Depth) {
		this.Depth = Depth;
	}

	/**
	 * 地区ID路径
	 */
	public String getIdPath() {
		return IdPath;
	}

	/**
	 * 地区ID路径
	 */
	public void setIdPath(String IdPath) {
		this.IdPath = IdPath;
	}

	/**
	 * 地区名称路径
	 */
	public String getNamePath() {
		return NamePath;
	}

	/**
	 * 地区名称路径
	 */
	public void setNamePath(String NamePath) {
		this.NamePath = NamePath;
	}

	/**
	 * 邮政编码
	 */
	public String getPostCode() {
		return PostCode;
	}

	/**
	 * 邮政编码
	 */
	public void setPostCode(String PostCode) {
		this.PostCode = PostCode;
	}

	/**
	 * 电话区号
	 */
	public String getTelephoneCode() {
		return TelephoneCode;
	}

	/**
	 * 电话区号
	 */
	public void setTelephoneCode(String TelephoneCode) {
		this.TelephoneCode = TelephoneCode;
	}

	/**
	 * 排序号<br/>
	 * default: 0
	 */
	public int getOrderNum() {
		return OrderNum;
	}

	/**
	 * 排序号<br/>
	 * default: 0
	 */
	public void setOrderNum(int OrderNum) {
		this.OrderNum = OrderNum;
	}

	/**
	 * 地区中心经度
	 */
	public String getLat() {
		return Lat;
	}

	/**
	 * 地区中心经度
	 */
	public void setLat(String Lat) {
		this.Lat = Lat;
	}

	/**
	 * 地区中心纬度
	 */
	public String getLng() {
		return Lng;
	}

	/**
	 * 地区中心纬度
	 */
	public void setLng(String Lng) {
		this.Lng = Lng;
	}

	/**
	 * 是否有效<br/>
	 * default: 1
	 */
	public int getActive() {
		return Active;
	}

	/**
	 * 是否有效<br/>
	 * default: 1
	 */
	public void setActive(int Active) {
		this.Active = Active;
	}

	/**
	 * 查询字符串
	 */
	public String getQueryStr() {
		return QueryStr;
	}

	/**
	 * 查询字符串
	 */
	public void setQueryStr(String QueryStr) {
		this.QueryStr = QueryStr;
	}

	/**
	 * 创建时间<br/>
	 * default: CURRENT_TIMESTAMP
	 */
	public Date getCreateTime() {
		return CreateTime;
	}

	/**
	 * 创建时间<br/>
	 * default: CURRENT_TIMESTAMP
	 */
	public void setCreateTime(Date CreateTime) {
		this.CreateTime = CreateTime;
	}

	/**
	 * 更新时间<br/>
	 * default: CURRENT_TIMESTAMP
	 */
	public Date getUpdateTime() {
		return UpdateTime;
	}

	/**
	 * 更新时间<br/>
	 * default: CURRENT_TIMESTAMP
	 */
	public void setUpdateTime(Date UpdateTime) {
		this.UpdateTime = UpdateTime;
	}

	@Override
	public Map<String, String> getFieldDictTypes() {
		return fieldDictTypes;
	}
}

3、同步代码

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import yb.common.util.Http;
import yb.common.util.ObjectTransfer;
import yb.common.util.Tester;
import yb.core.bundle.annotation.YB;
import yb.core.business.YBService;
import yb.core.dao.IDao;
import yb.core.loader.BundleLoader;
import yb.module.pinyin.PinyinFormat;
import yb.module.pinyin.PinyinHelper;
import yz.his.common.util.LocationPoint;
import yz.his.common.util.NameKeywordsUtil;
import yz.his.model.sys.HisSysPlatRegionModel;

@YB
public class SyncRegionServer extends YBService {
	private static final String PROV = "province";
	private static final String CITY = "city";
	private static final String REGION = "region";

	private String getData(String url) throws Exception {
		if (url == null || url.isEmpty())
			return "";
		String result = "";
		Http http = new Http();
		result = new String(http.get(url), "utf-8");
		return result;
	}

	private List<HisSysPlatRegionModel> getDataStructure(String str) throws Exception {
		// 想要获取字段部分的分割模式
		final Pattern provMatcher = Pattern.compile(
				"<td height=19 class=xl658256 style='height:14.25pt'></td>\\s+" + "<td class=xl708256>(.*)</td>\\s+"
						+ "<td class=xl708256>(.*)</td>\\s+" + "<td class=xl658256></td>\\s+"
						+ "<td class=xl698256></td>\\s+" + "<td class=xl698256></td>\\s+"
						+ "<td class=xl698256></td>\\s+" + "<td class=xl638256></td>\\s+" + "<td class=xl638256></td>",
				Pattern.DOTALL);
		/*
		 * final Pattern cityMatcher = Pattern.compile(
		 * "<td height=19 class=xl658256 style='height:14.25pt'></td>\\s+" +
		 * "<td class=xl708256>(.*)</td>\\s+" +
		 * "<td class=xl708256><span style='mso-spacerun:yes'> </span>(.*)</td>\\s+"
		 * + "<td class=xl658256></td>\\s+" + "<td class=xl698256></td>\\s+" +
		 * "<td class=xl698256></td>\\s+" + "<td class=xl698256></td>\\s+" +
		 * "<td class=xl638256></td>\\s+" + "<td class=xl638256></td>",
		 * Pattern.DOTALL);
		 */
		final Pattern regionMatcher = Pattern.compile("<td height=19 class=xl658256 style='height:14.25pt'></td>\\s+"
				+ "<td class=xl718256>(.*)</td>\\s+"
				+ "<td class=xl718256><span style='mso-spacerun:yes'>   </span>(.*)</td>\\s+"
				+ "<td class=xl658256></td>\\s+" + "<td class=xl698256></td>\\s+" + "<td class=xl698256></td>\\s+"
				+ "<td class=xl698256></td>\\s+" + "<td class=xl638256></td>\\s+" + "<td class=xl638256></td>",
				Pattern.DOTALL);
		final Pattern specialMatcher = Pattern.compile("<td height=19 class=xl698256 style='height:14.25pt'></td>\\s+"
				+ "<td class=xl708256>(.*)</td>\\s+" + "<td class=xl708256>(.*)</td>\\s+"
				+ "<td class=xl698256></td>\\s+" + "<td class=xl698256></td>\\s+" + "<td class=xl698256></td>\\s+"
				+ "<td class=xl698256></td>\\s+" + "<td class=xl638256></td>", Pattern.DOTALL);

		// 想要获取的整段数据的分割模式
		String[] info = str.split("</tr>");
		Map<Long, HisSysPlatRegionModel> entityMap = new HashMap<>();
		List<HisSysPlatRegionModel> list = new ArrayList<HisSysPlatRegionModel>();

		HisSysPlatRegionModel china = new HisSysPlatRegionModel();
		china.setRegionId(86L);
		china.setRegionName("中国");
		china.setDepth(1);
		china.setIdPath("|86|");
		china.setNamePath("中国");
		list.add(china);// 中国

		Matcher pm = null;
		Matcher cm = null;
		Matcher rm = null;
		Matcher sm = null;
		Matcher matcher = null;
		String belong = null;
		for (String s : info) {
			pm = provMatcher.matcher(s);
			rm = regionMatcher.matcher(s);
			sm = specialMatcher.matcher(s);
			if (pm.find()) {
				matcher = pm;
				if (!pm.group(2).contains("<")) {
					belong = PROV;
				} else {
					belong = CITY;
				}
			} else if (rm.find()) {
				belong = REGION;
				matcher = rm;
			} else if (sm.find()) {
				belong = PROV;
				matcher = sm;
			} else {
				continue;
			}

			HisSysPlatRegionModel entity = getEntity(matcher, belong, entityMap);
			list.add(entity);
		}
		return list;
	}

	private HisSysPlatRegionModel getEntity(Matcher matcher, String belong, Map<Long, HisSysPlatRegionModel> entityMap)
			throws Exception {
		HisSysPlatRegionModel entity = new HisSysPlatRegionModel();
		String regionId = matcher.group(1);
		String regionName = matcher.group(2);
		entity.setRegionId(ObjectTransfer.longValue(regionId));
		Long key = null;
		HisSysPlatRegionModel value = null;
		LocationPoint location = null;
		switch (belong) {
		case PROV:
			entity.setParentRegionId(86L);
			entity.setDepth(2);
			entity.setIdPath("|86|" + regionId + "|");
			entity.setNamePath("中国|" + regionName);
			if (regionName.endsWith("市")) {
				List<String> telephoneCodeAndPostCode = getTelephoneCodeAndPostCode(regionName);
				System.out.println(ObjectTransfer.print(telephoneCodeAndPostCode));
				if (telephoneCodeAndPostCode != null && telephoneCodeAndPostCode.size() == 2) {
					entity.setTelephoneCode(telephoneCodeAndPostCode.get(0));
					entity.setPostCode(telephoneCodeAndPostCode.get(1));
				}
			}
			break;

		case CITY:
			regionName = regionName.replace("<span style='mso-spacerun:yes'> </span>", "");
			key = ObjectTransfer.longValue(regionId.substring(0, 2).concat("0000"));
			if ((value = entityMap.get(key)) != null) {
				entity.setParentRegionId(key);
				entity.setIdPath(value.getIdPath() + regionId + "|");
				entity.setNamePath(value.getNamePath() + "|" + regionName);
				entity.setDepth(3);

				List<String> telephoneCodeAndPostCode = getTelephoneCodeAndPostCode(regionName);
				System.out.println(ObjectTransfer.print(telephoneCodeAndPostCode));
				if (telephoneCodeAndPostCode != null && telephoneCodeAndPostCode.size() == 2) {
					entity.setTelephoneCode(telephoneCodeAndPostCode.get(0));
					entity.setPostCode(telephoneCodeAndPostCode.get(1));
				}
			}
			// location =
			// BaiduUtil.getLocationByAddress(entity.getNamePath().replace("|",
			// ""));
			// if (location != null) {
			// entity.setLng(String.valueOf(location.getLongtitude()));
			// entity.setLat(String.valueOf(location.getLatitude()));
			// }
			break;
		case REGION:
			key = ObjectTransfer.longValue(regionId.substring(0, 4).concat("00"));
			if ((value = entityMap.get(key)) != null) {
				entity.setDepth(4);
			} else {
				key = ObjectTransfer.longValue(regionId.substring(0, 2).concat("0000"));
				value = entityMap.get(key);
				entity.setDepth(3);
			}
			entity.setParentRegionId(key);
			entity.setIdPath(value.getIdPath() + regionId + "|");
			entity.setNamePath(value.getNamePath() + "|" + regionName);
			// location =
			// BaiduUtil.getLocationByAddress(entity.getNamePath().replace("|",
			// ""));
			// if (location != null) {
			// entity.setLng(String.valueOf(location.getLongtitude()));
			// entity.setLat(String.valueOf(location.getLatitude()));
			// }
			break;

		default:
			break;
		}

		entity.setRegionName(regionName);
		String queryStr = getQueryStr(regionName);
		entity.setQueryStr(queryStr);

		entityMap.put(ObjectTransfer.longValue(regionId), entity);
		return entity;
	}

	private String getQueryStr(String word) {
		if (word == null || word.isEmpty()) {
			return "";
		}
		StringBuffer sb = new StringBuffer(word);
		Map<String, String> map = NameKeywordsUtil.pinyinOfName(word);
		for (Entry<String, String> entry : map.entrySet()) {
			if (entry == null || entry.getKey() == null)
				continue;
			sb.append("|" + entry.getKey());
		}
		return sb.toString();
	}

	private void start() throws Exception {
		IDao dao = dao("his");
		String url = "http://www.mca.gov.cn/article/sj/xzqh/2019/2019/201911250933.html";
		String data = getData(url);
		List<HisSysPlatRegionModel> resultList = getDataStructure(data);
		dao.insertList(resultList);
	}

	public List<String> getTelephoneCodeAndPostCode(String area) throws Exception {
		if (area == null || area.isEmpty()) {
			return null;
		}
		String search = "";
		if (area.endsWith("市"))
			search = area.replace("市", "");
		if (area.endsWith("盟"))
			search = area.replace("盟", "");
		if (area.endsWith("地区"))
			search = area.replace("地区", "");
		if (area.endsWith("州"))
			search = area.substring(0, 2);
		String convertToPinyinString = PinyinHelper.convertToPinyinString(search, "", PinyinFormat.WITHOUT_TONE);
		String url = "http://quhao.tianqi.com/" + convertToPinyinString + "/";
		Pattern tcPattern = Pattern
				.compile("<td><a title=\"(.*)是哪里的区号\" target=\"_blank\" href=\"/(.*)/\">(.*)</a></td>");
		Pattern pcPattern = Pattern
				.compile("<td><a title=\"(.*)是哪里的邮编\" target=\"_blank\" href=\"http://youbian.tianqi.com/"
						+ convertToPinyinString + "/\">(.*)</a>");
		Http http = new Http();
		byte[] bs = http.get(url);
		String result = new String(bs, "utf-8");
		Matcher tcMatcher = tcPattern.matcher(result);
		Matcher pcMatcher = pcPattern.matcher(result);
		List<String> list = new ArrayList<>();
		if (tcMatcher.find()) {
			list.add(tcMatcher.group(1));
		} else {
			String url2 = "https://www.chahaoba.com/%E4%B8%AD%E5%9B%BD%E7%94%B5%E8%AF%9D%E5%8C%BA%E5%8F%B7?search="
					+ area;
			byte[] bs2 = http.get(url2);
			String tc = new String(bs2, "utf-8");
			Pattern pattern = Pattern.compile("<ul><li> 国内拨打:<a href=\"/21\" title=\"21\">(.*)</a></li>");
			Matcher matcher = pattern.matcher(tc);
			if (matcher.find()) {
				list.add(matcher.group(1));
			}
			list.add("");
			return list;
		}
		if (pcMatcher.find()) {
			list.add(pcMatcher.group(1));
		}
		return list;
	}

	/*
	 * main test
	 * 
	 * 
	 */
	public static void main(String[] args) {
		try {
			Tester.init();
			BundleLoader loader = Tester.initContext(3000, 3000);
			SyncRegionServer server = loader.getBean(SyncRegionServer.class);

			long startTime = System.currentTimeMillis();
			server.start();

			System.out.println("共计用时:" + (System.currentTimeMillis() - startTime) / 1000);
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			Tester.close();
		}
	}
}

4、实际数据库效果

在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值