python3.6爬取百度邮编存mysql

最新推荐文章于 2022-11-24 08:28:39 发布

花与少年°

最新推荐文章于 2022-11-24 08:28:39 发布

阅读量280

点赞数

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/weixin_37630333/article/details/89552465

版权

python 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

1. 目的：python3.6，调取百度邮编数据，存mysql
在这里插入图片描述
2. 环境：pycharm、python3.6、mysql
3. 准备：mysql建表sql，直接复制运行
a. province表：省份表，用省份关键字去查百度邮编

DROP TABLE IF EXISTS `province`;
CREATE TABLE `province` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `name` varchar(255) DEFAULT NULL,
  `mark` int(11) DEFAULT '0',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=36 DEFAULT CHARSET=utf8;

-- ----------------------------
-- Records of province
-- ----------------------------
INSERT INTO `province` VALUES ('1', '北京市', '0');
INSERT INTO `province` VALUES ('2', '天津市', '0');
INSERT INTO `province` VALUES ('3', '上海市', '0');
INSERT INTO `province` VALUES ('4', '重庆市', '0');
INSERT INTO `province` VALUES ('5', '河北省', '0');
INSERT INTO `province` VALUES ('6', '山西省', '0');
INSERT INTO `province` VALUES ('7', '辽宁省', '0');
INSERT INTO `province` VALUES ('8', '吉林省', '0');
INSERT INTO `province` VALUES ('9', '黑龙江省', '0');
INSERT INTO `province` VALUES ('10', '江苏省', '0');
INSERT INTO `province` VALUES ('11', '浙江省', '0');
INSERT INTO `province` VALUES ('12', '安徽省', '0');
INSERT INTO `province` VALUES ('13', '福建省', '0');
INSERT INTO `province` VALUES ('14', '江西省', '0');
INSERT INTO `province` VALUES ('15', '山东省', '0');
INSERT INTO `province` VALUES ('16', '河南省', '0');
INSERT INTO `province` VALUES ('17', '湖北省', '0');
INSERT INTO `province` VALUES ('18', '湖南省', '0');
INSERT INTO `province` VALUES ('19', '广东省', '0');
INSERT INTO `province` VALUES ('20', '海南省', '0');
INSERT INTO `province` VALUES ('21', '四川省', '0');
INSERT INTO `province` VALUES ('22', '贵州省', '0');
INSERT INTO `province` VALUES ('23', '云南省', '0');
INSERT INTO `province` VALUES ('24', '陕西省', '0');
INSERT INTO `province` VALUES ('25', '甘肃省', '0');
INSERT INTO `province` VALUES ('26', '青海省', '0');
INSERT INTO `province` VALUES ('27', '台湾省', '0');
INSERT INTO `province` VALUES ('28', '内蒙古自治区', '0');
INSERT INTO `province` VALUES ('29', '广西壮族自治区', '0');
INSERT INTO `province` VALUES ('30', '西藏自治区', '0');
INSERT INTO `province` VALUES ('31', '宁夏回族自治区', '0');
INSERT INTO `province` VALUES ('32', '新疆维吾尔自治区', '0');
INSERT INTO `province` VALUES ('33', '香港特别行政区', '0');
INSERT INTO `province` VALUES ('34', '澳门特别行政区', '0');

b.邮编结果表：查到结果，存放此表

CREATE TABLE `postal_t` (
  `id` int(5) NOT NULL AUTO_INCREMENT,
  `country` varchar(100) NOT NULL DEFAULT 'china',
  `address` varchar(200) NOT NULL COMMENT '邮编地址',
  `postcode` varchar(100) NOT NULL,
  `parentcode` varchar(100) DEFAULT NULL COMMENT '父级code',
  `createTime` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  `mark` int(1) NOT NULL DEFAULT '0' COMMENT '0未查询 4查询失败 6查询成功 ',
  PRIMARY KEY (`id`),
  KEY `id` (`id`) USING BTREE,
  KEY `parentid` (`parentcode`) USING BTREE,
  KEY `address` (`address`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

3. phthon代码

import pymysql.cursors
import re
import urllib.request
from urllib import parse
from bs4 import BeautifulSoup


# # # # # # # # # # # # # # # # # # # # # # # # #
# 根据url获取网页数据
def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read().decode(encoding='gbk', errors='strict')
    return html


# # # # # # # # # # # # # # # # # # # # # # # # #
# 根据url 解析数据并返回List
def getAllDataByUrl(addressOne, flag):
    endList = []
    url = 'http://opendata.baidu.com/post/s?rn=20&p=mini&wd='
    str2 = parse.quote(addressOne, encoding='gbk', errors='strict')
   newUrl = url + str2;

    for allInt in range(1000):
        endUrl=newUrl
        if flag == 0:
            pageNum = '&pn=' + str((allInt + 1) * 20)
            print('正在处理的页数：', allInt * 20)
            endUrl = endUrl + str(pageNum)

        print(endUrl)
        a = getHtml(endUrl)
        htmla = BeautifulSoup(a, 'html.parser')
        # 获得 table标签
        table = htmla.table
        if table == None:
            break
        # 获得 li标签
        # print("=========")
        # li = htmla.li
        # print(li)
        # 去掉所有标签
        # dr = re.compile(r'<[^>]+>', re.S)
        # dd = dr.sub('', str(li))
        # print(dd)
        # 去掉所有em标签内容
        rem = re.compile('<\s*em[^>]*>[^<]*<\s*/\s*em\s*>', re.I)
        s = rem.sub('', str(table))
        # 去掉所有th标签内容
        rth = re.compile('<\s*th[^>]*>[^<]*<\s*/\s*th\s*>', re.I)  # 去掉所有th标签内容
        s = rth.sub('', str(s))
        # 去掉 空tr标签
        rtr = re.compile('<tr></tr>', re.I)  # 去掉所有th标签内容
        s = rtr.sub('', str(s))
        # print("====================")
        # 创建一个list 存放数据

        soup = BeautifulSoup(str(s), "html.parser")
        # 获得所有tr标签数据
        taglist = soup.find_all('tr')
        for trtag in taglist:
            tdlist = trtag.find_all('td')  # 在每个tr标签下,查找所有的td标签
            endList.append(tdlist[0].string + '-' + addressOne + tdlist[1].string)
        if flag == 1:
            break
    return endList


# # # # # # # # # # # # # # # # # # # # # # # # #
# 批量增加

def batchSomeData(endlist):
    conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='fhone', charset='utf8')
    cursor = conn.cursor()
    sql_insert = "insert into postal_t(postcode,address) values(%s,%s)"
    resultList = []
    if len(endlist) > 0:
        for a in endlist:
            dr = re.compile(' ', re.S)
            # print(a)
            dd = dr.sub('', str(a))
            sa = dd.split('-')
            data11 = (sa[0], sa[1])
            resultList.append(data11)
    else:
        print("数据为空")
    try:
        cursor.executemany(sql_insert, resultList)
        print(cursor.rowcount)
        conn.commit()
        print("插入数据长度：", len(resultList))
    except Exception as e:
        print('异常！！！！！！！！！！：数据长度：', len(resultList))
        print(e)
        conn.rollback()

    cursor.close()
    conn.close()

# # # # # # # # # # # # # # # # # # # # # # # # #
# 根据id修改数据
def updateData(id):
    sql = "UPDATE province SET mark=1 WHERE id = %s "
    data = (id)
    cursor.execute(sql % data)
    conn.commit()



# ****** 执行部分 ******
# # # # # # # # # # # # # # # # # # # # # # # # #

# 建立mysql连接
conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='fhone', charset='utf8')
# sql：获取mark为0的数据，mark0表示未处理，1表示已处理
sql = "SELECT * FROM province WHERE MARK=%s"
# 传递参数
data = (0)
# 建立游标
cursor = conn.cursor()
# 执行，返回count数
count = cursor.execute(sql % data)
print('结果长度：', count)
# 得到返回结果集
result = cursor.fetchall()
# 遍历结果集
for dt in result:
    print('正在处理：', dt[1])
    flag = 0  # 直辖市中没有分页，flag 1代表没有分页，url不拼接分页参数
    if str(dt[1]).endswith('市'):
        flag = 1
    endli = getAllDataByUrl(dt[1], flag)  # 返回结果集
    batchSomeData(endli)  # 将返回结果集插入数据库
    endli.clear()
    updateData(dt[0]) #将此id修改为已处理状态

4. 结果
在这里插入图片描述
5. 若想获得下一层邮编，则根据postal_t新表存放的行政区划再继续调