#-*-coding:utf-8-*-
import re,json,requests
url = "http://www.ip138.com/post/"
headers = {  # Pretend to be a browser so basic anti-scraping checks pass.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5558.400 QQBrowser/10.1.1695.400'
}

# NOTE: parsing runs against local copies of the pages. They were originally
# produced by fetching ``url`` (and ``http://www.ip138.com/<province-id>`` for
# each province) with
# ``requests.get(url=url, headers=headers).content.decode("gbk")`` and
# writing the decoded text out as UTF-8 to the paths read below.

# Province links on the index page; findall yields (province-id, name) tuples.
_PROVINCE_LINK_RE = re.compile(r'<a href="/(.*?)/" target="_blank">(.*?)</a></td>')
# Splits a province page into one chunk of HTML per city/region.
_CITY_BLOCK_RE = re.compile(r'<tr bgcolor="#ffffff">(.*?)<tr><td colspan="6"></td></tr>', re.S)
# Bold header cell of a city chunk; groups are (city name, postal code).
_CITY_HEADER_RE = re.compile(r'<td><a href=".*?/"><b>(.*?)</a></b></td><td><a href="/.*?/">(.*?)</a></td>', re.S)
# One "name / postal code / third linked cell" triple; groups are (name, code).
_ROW_RE = re.compile(r'<td>(.*?)</td><td><a href="/.*?/">(.*?)</a></td><td><a href="/.*?/">.*?</a></td>', re.S)


def _read_text(path):
    """Return the whole UTF-8 text content of *path*."""
    with open(path, 'r', encoding='utf-8') as fq:
        return fq.read()


def _parse_province_page(html):
    """Extract the postal codes found in one province page.

    Args:
        html: Text of the province page.

    Returns:
        A ``(mapping, count)`` tuple. ``mapping`` maps a name to its postal
        code, or -- for a city that has sub-entries -- to a nested dict that
        holds the city itself plus every sub-entry. ``count`` is the number
        of entries recorded (duplicates that overwrite a key still count).
    """
    dictcity = {}
    count = 0

    cities = _CITY_BLOCK_RE.findall(html)
    if not cities:
        # Municipalities and Hong Kong / Macao: no per-city chunks, so the
        # page is read as a flat list of rows.
        for name, code in _ROW_RE.findall(html):
            dictcity[name] = code
            count += 1
        return dictcity, count

    print('cities', cities)
    for city in cities:
        print(city)
        header_matches = _CITY_HEADER_RE.findall(city)
        if not header_matches:
            # A chunk without a bold city header cannot be keyed; skip it
            # instead of aborting the whole run with an IndexError.
            print('no city header found, block skipped')
            continue
        area = header_matches[0]
        # The first row match is discarded -- presumably it is the header row
        # itself, whose name group would still contain markup; the city's own
        # entry comes from ``area`` instead.
        xians = _ROW_RE.findall(city)[1:]
        if not xians:
            dictcity[area[0]] = area[1]
            count += 1
        else:
            dictxian = {area[0]: area[1]}
            count += 1
            for xian in xians:
                dictxian[xian[0]] = xian[1]
                count += 1
            dictcity[area[0]] = dictxian
        print(area, xians)
    return dictcity, count


def main(index_path="中国邮编首页.html", province_dir="各省邮编",
         output_path='中国邮编正则7.json'):
    """Parse the saved pages and write the postal-code tree as JSON.

    Args:
        index_path: Saved index page that lists the provinces.
        province_dir: Directory holding one ``<province name>.html`` per
            province found on the index page.
        output_path: Destination of the resulting JSON document.

    Returns:
        The dict written to *output_path*, keyed by province name.

    Raises:
        OSError: If an input page cannot be read or the output cannot be
            written.
    """
    province = _PROVINCE_LINK_RE.findall(_read_text(index_path))
    print(province)
    print(len(province))

    dictall = {}
    m = 0  # running total of recorded entries
    for i in province:
        print(i)
        dictcity, count = _parse_province_page(
            _read_text(f"{province_dir}/{i[1]}.html"))
        dictall[i[1]] = dictcity
        m += count

    # Use a context manager so the output file is flushed and closed.
    with open(output_path, 'w', encoding="utf-8") as fq:
        json.dump(dictall, fq, ensure_ascii=False)
    print(m)
    return dictall


if __name__ == "__main__":
    main()
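# Page title: 正则中国邮政 ("China postal codes via regex")
# Page footer: 最新推荐文章于 2023-08-09 11:39:02 发布 ("latest recommended article published 2023-08-09 11:39:02")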