运行效果如下,邮编可以精确到地级市、县等各级行政区划:
从北京开始,一直到台湾全部的邮政编码都有
代码实现:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/11/21 10:05
# @Author : huni
# @File : 全国邮编.py
# @Software: PyCharm
import requests
from lxml import etree
import sqlite3
def getData(url):
    """Scrape three-field table records from the ip138.com postal-code pages.

    Downloads the index page at ``url``, follows every link found below the
    element whose id is ``quanguo``, and collects the text cells of each
    white-background (``bgcolor="#ffffff"``) table row on the linked pages.

    Args:
        url: Address of the index page that links to the regional pages.

    Returns:
        A list of 3-item lists of strings, one per record found. Presumably
        each record is (place name, postal code, area code), matching the
        columns ``saveDatadb`` writes - verify against the live page layout.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        UnicodeDecodeError: If a downloaded page is not valid GB18030.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }
    all_info_list = []
    index_tree = etree.HTML(_fetch_gb18030(url, headers))
    for href in index_tree.xpath('//*[@id="quanguo"]/tr//@href'):
        # The hrefs are joined onto the site root, as in the original code.
        page_tree = etree.HTML(_fetch_gb18030('https://www.ip138.com' + href, headers))
        rows = page_tree.xpath('/html/body/div[1]/div[2]/div[2]/div[2]/table/tr[@bgcolor="#ffffff"]')
        for tr in rows:
            # Non-breaking spaces mark empty cells; normalise them to ' '.
            cells = [text.replace('\xa0', ' ') for text in tr.xpath('.//text()')]
            all_info_list.extend(_split_row(cells))
    return all_info_list


def _fetch_gb18030(url, headers, timeout=30):
    """Download ``url`` and return its body decoded as GB18030 text."""
    # Without a timeout a stalled server would block the scraper forever.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly instead of silently parsing an error page into zero rows.
    response.raise_for_status()
    # Decode the raw bytes directly: round-tripping ``.text`` through
    # ISO-8859-1 only works when requests happened to guess that encoding.
    return response.content.decode('GB18030')


def _split_row(cells):
    """Split the text cells of one table row into 3-field records."""
    if len(cells) == 4:
        return [cells[:3]]
    if len(cells) == 6:
        # A 6-cell row holds two records side by side; the right-hand one
        # is skipped when all three of its cells are blank.
        records = [cells[:3]]
        if cells[3:6] != [' ', ' ', ' ']:
            records.append(cells[3:6])
        return records
    # Rows of any other width are ignored.
    return []
#数据保存到数据库
def saveDatadb(dbpath, all_info_list):
    """Insert scraped records into the ``邮编表`` table of a SQLite file.

    ``init_db(dbpath)`` is called first so the table exists. All rows are
    written in a single transaction: either every record is stored or, on
    error, none of them is.

    Args:
        dbpath: Path of the SQLite database file.
        all_info_list: Iterable of 3-item records whose fields go into the
            columns 市县区名, 邮政编码, 长途区号 in that order. The records
            are not modified.

    Raises:
        sqlite3.Error: If a record does not have exactly three fields or
            the insert fails for any other reason.
    """
    init_db(dbpath)
    # Values are bound as parameters rather than spliced into the SQL text:
    # the old '"value"' quoting broke on embedded quotes and relied on
    # SQLite's legacy treatment of double-quoted identifiers as strings.
    # Fields are still converted with str(), as before.
    rows = [tuple(str(field) for field in record) for record in all_info_list]
    conn = sqlite3.connect(dbpath)
    try:
        # ``with conn`` commits on success and rolls back on an exception.
        with conn:
            conn.executemany(
                'insert into 邮编表 (市县区名,邮政编码,长途区号) values (?,?,?)',
                rows,
            )
    finally:
        # The connection context manager does not close; do it explicitly.
        conn.close()
#初始化数据库,创建表
def init_db(dbpath):
    """Create the ``邮编表`` table in the SQLite file at ``dbpath`` if absent.

    The table has three columns: 市县区名, 邮政编码 and 长途区号. An existing
    table is left untouched (``create table if not exists``).

    Args:
        dbpath: Path of the SQLite database file; created if missing.

    Raises:
        sqlite3.Error: If the database cannot be opened or the statement
            fails.
    """
    # The two code columns are declared TEXT on purpose: with NUMERIC
    # affinity SQLite converts '010' to the integer 10, silently dropping
    # the leading zeros that postal and area codes depend on.
    sql = '''
        create table if not exists 邮编表
        (
            市县区名 varchar ,
            邮政编码 text ,
            长途区号 text
        )
    '''
    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        # Release the file handle even when table creation fails.
        conn.close()
def main():
    """Scrape the ip138.com postal-code index and store the rows in SQLite.

    Fetches every record reachable from the index page via ``getData`` and
    writes them to the database file ``全国邮编.db`` via ``saveDatadb``.
    """
    records = getData('https://www.ip138.com/post/')
    saveDatadb('全国邮编.db', records)
# Run the scraper only when this file is executed as a script, then report
# that the data was saved.
if __name__ == "__main__":
    main()
    print('保存成功')