1. 目的:python3.6,调取百度邮编数据,存mysql
2. 环境:pycharm、python3.6、mysql
3. 准备:mysql建表sql,直接复制运行
a. province表:省份表,用省份关键字去查百度邮编
DROP TABLE IF EXISTS `province`;
CREATE TABLE `province` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`name` varchar(255) DEFAULT NULL,
`mark` int(11) DEFAULT '0',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=36 DEFAULT CHARSET=utf8;
-- ----------------------------
-- Records of province
-- ----------------------------
INSERT INTO `province` VALUES ('1', '北京市', '0');
INSERT INTO `province` VALUES ('2', '天津市', '0');
INSERT INTO `province` VALUES ('3', '上海市', '0');
INSERT INTO `province` VALUES ('4', '重庆市', '0');
INSERT INTO `province` VALUES ('5', '河北省', '0');
INSERT INTO `province` VALUES ('6', '山西省', '0');
INSERT INTO `province` VALUES ('7', '辽宁省', '0');
INSERT INTO `province` VALUES ('8', '吉林省', '0');
INSERT INTO `province` VALUES ('9', '黑龙江省', '0');
INSERT INTO `province` VALUES ('10', '江苏省', '0');
INSERT INTO `province` VALUES ('11', '浙江省', '0');
INSERT INTO `province` VALUES ('12', '安徽省', '0');
INSERT INTO `province` VALUES ('13', '福建省', '0');
INSERT INTO `province` VALUES ('14', '江西省', '0');
INSERT INTO `province` VALUES ('15', '山东省', '0');
INSERT INTO `province` VALUES ('16', '河南省', '0');
INSERT INTO `province` VALUES ('17', '湖北省', '0');
INSERT INTO `province` VALUES ('18', '湖南省', '0');
INSERT INTO `province` VALUES ('19', '广东省', '0');
INSERT INTO `province` VALUES ('20', '海南省', '0');
INSERT INTO `province` VALUES ('21', '四川省', '0');
INSERT INTO `province` VALUES ('22', '贵州省', '0');
INSERT INTO `province` VALUES ('23', '云南省', '0');
INSERT INTO `province` VALUES ('24', '陕西省', '0');
INSERT INTO `province` VALUES ('25', '甘肃省', '0');
INSERT INTO `province` VALUES ('26', '青海省', '0');
INSERT INTO `province` VALUES ('27', '台湾省', '0');
INSERT INTO `province` VALUES ('28', '内蒙古自治区', '0');
INSERT INTO `province` VALUES ('29', '广西壮族自治区', '0');
INSERT INTO `province` VALUES ('30', '西藏自治区', '0');
INSERT INTO `province` VALUES ('31', '宁夏回族自治区', '0');
INSERT INTO `province` VALUES ('32', '新疆维吾尔自治区', '0');
INSERT INTO `province` VALUES ('33', '香港特别行政区', '0');
INSERT INTO `province` VALUES ('34', '澳门特别行政区', '0');
b.邮编结果表:查到结果,存放此表
CREATE TABLE `postal_t` (
`id` int(5) NOT NULL AUTO_INCREMENT,
`country` varchar(100) NOT NULL DEFAULT 'china',
`address` varchar(200) NOT NULL COMMENT '邮编地址',
`postcode` varchar(100) NOT NULL,
`parentcode` varchar(100) DEFAULT NULL COMMENT '父级code',
`createTime` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
`mark` int(1) NOT NULL DEFAULT '0' COMMENT '0未查询 4查询失败 6查询成功 ',
PRIMARY KEY (`id`),
KEY `id` (`id`) USING BTREE,
KEY `parentid` (`parentcode`) USING BTREE,
KEY `address` (`address`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
3. python代码
import pymysql.cursors
import re
import urllib.request
from urllib import parse
from bs4 import BeautifulSoup
# # # # # # # # # # # # # # # # # # # # # # # # #
# 根据url获取网页数据
def getHtml(url):
    """Fetch *url* and return the response body decoded as GBK text.

    The Baidu postcode endpoint serves GBK-encoded pages, hence the
    explicit codec.  The response is closed via the context manager
    (the original version leaked the open connection).

    url -- fully-built request URL (query already percent-encoded)
    Returns the decoded HTML as a str; raises UnicodeDecodeError on
    bytes that are not valid GBK (errors='strict').
    """
    with urllib.request.urlopen(url) as page:
        return page.read().decode(encoding='gbk', errors='strict')
# # # # # # # # # # # # # # # # # # # # # # # # #
# 根据url 解析数据并返回List
def getAllDataByUrl(addressOne, flag):
    """Scrape Baidu's postcode search for *addressOne*.

    Returns a list of strings of the form
    '<postcode>-<addressOne><sub-address>', one per result row.

    addressOne -- region keyword (e.g. a province name) to search for
    flag       -- 1: single, unpaginated page (municipalities);
                  0: paginated results, 20 rows per page (provinces)
    """
    endList = []
    baseUrl = ('http://opendata.baidu.com/post/s?rn=20&p=mini&wd='
               + parse.quote(addressOne, encoding='gbk', errors='strict'))
    # Compile the markup clean-up patterns once, not on every page.
    rem = re.compile(r'<\s*em[^>]*>[^<]*<\s*/\s*em\s*>', re.I)  # strip <em> highlight tags
    rth = re.compile(r'<\s*th[^>]*>[^<]*<\s*/\s*th\s*>', re.I)  # strip <th> header cells
    rtr = re.compile(r'<tr></tr>', re.I)                        # drop rows emptied above
    for pageIdx in range(1000):
        endUrl = baseUrl
        if flag == 0:
            # Result offset for this page.  The original used
            # (pageIdx + 1) * 20, which skipped the first 20 results;
            # the adjacent progress print already assumed pageIdx * 20.
            print('正在处理的页数:', pageIdx * 20)
            endUrl = endUrl + '&pn=' + str(pageIdx * 20)
        print(endUrl)
        htmla = BeautifulSoup(getHtml(endUrl), 'html.parser')
        table = htmla.table
        if table is None:
            break  # no result table -> past the last page
        s = rem.sub('', str(table))
        s = rth.sub('', s)
        s = rtr.sub('', s)
        soup = BeautifulSoup(s, 'html.parser')
        for trtag in soup.find_all('tr'):
            tdlist = trtag.find_all('td')  # cells: [postcode, sub-address]
            # Guard against malformed rows / missing text nodes, which
            # previously raised IndexError or concatenated None.
            if len(tdlist) >= 2 and tdlist[0].string and tdlist[1].string:
                endList.append(tdlist[0].string + '-' + addressOne + tdlist[1].string)
        if flag == 1:
            break
    return endList
# # # # # # # # # # # # # # # # # # # # # # # # #
# 批量增加
def batchSomeData(endlist):
    """Bulk-insert scraped '<postcode>-<address>' strings into postal_t.

    endlist -- list of strings produced by getAllDataByUrl(); each entry
               is split on its FIRST '-' into (postcode, address).
    On any insert error the transaction is rolled back; the connection
    is always closed (the original skipped cleanup on some paths).
    """
    conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='fhone', charset='utf8')
    cursor = conn.cursor()
    sql_insert = "insert into postal_t(postcode,address) values(%s,%s)"
    spaces = re.compile(' ', re.S)  # compiled once, outside the loop
    resultList = []
    if endlist:
        for entry in endlist:
            cleaned = spaces.sub('', str(entry))
            # maxsplit=1 keeps addresses that themselves contain '-'
            # intact (the original split on every '-' and truncated them).
            parts = cleaned.split('-', 1)
            if len(parts) == 2:
                resultList.append((parts[0], parts[1]))
    else:
        print("数据为空")
    try:
        cursor.executemany(sql_insert, resultList)
        print(cursor.rowcount)
        conn.commit()
        print("插入数据长度:", len(resultList))
    except Exception as e:
        print('异常!!!!!!!!!!:数据长度:', len(resultList))
        print(e)
        conn.rollback()
    finally:
        # Release the connection even when the insert raised.
        cursor.close()
        conn.close()
# # # # # # # # # # # # # # # # # # # # # # # # #
# 根据id修改数据
def updateData(id):
    """Mark the province row with primary key *id* as processed (mark=1).

    Relies on the module-level `conn`/`cursor` created in the driver
    section below.  Uses a parameterized query — the original built SQL
    with `%` string formatting, which breaks quoting and invites
    injection.  (NOTE: parameter name shadows the builtin `id`; kept
    for interface compatibility.)
    """
    sql = "UPDATE province SET mark=1 WHERE id = %s "
    cursor.execute(sql, (id,))
    conn.commit()
# ****** 执行部分 ******
# # # # # # # # # # # # # # # # # # # # # # # # #
# 建立mysql连接
# Establish the MySQL connection (also used by updateData above).
conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='fhone', charset='utf8')
# Fetch provinces not yet processed: mark 0 = pending, 1 = done.
sql = "SELECT * FROM province WHERE MARK=%s"
cursor = conn.cursor()
# Parameterized execute instead of the original `sql % data` string
# formatting (quoting/injection hazard); returns the row count.
count = cursor.execute(sql, (0,))
print('结果长度:', count)
result = cursor.fetchall()
for dt in result:
    print('正在处理:', dt[1])
    # Municipalities ('…市') return a single unpaginated page, so
    # flag=1 tells the scraper not to append the &pn= page parameter.
    flag = 1 if str(dt[1]).endswith('市') else 0
    endli = getAllDataByUrl(dt[1], flag)  # scrape all postcodes for this region
    batchSomeData(endli)                  # persist them into postal_t
    endli.clear()
    updateData(dt[0])                     # flag this province as processed
# Close what the original leaked once all provinces are handled.
cursor.close()
conn.close()
4. 结果
5. 若想获得下一层邮编,则根据postal_t新表存放的行政区划再继续调