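# For testing only. You are responsible for any adverse consequences of fetching the page too frequently.
# Recommended: save the fetched HTML to a local file first, then generate the SQL file from that copy.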
import urllib3
import os
import sys
from urllib import request,parse
from bs4 import BeautifulSoup
# Suppress urllib3's HTTP warnings for the lifetime of this script.
urllib3.disable_warnings()
# Default page to scrape; it is passed to locationCode() at the bottom of the file.
URL='http://www.mca.gov.cn/article/sj/xzqh/2019/201901-06/201902031029.html'
# Column list shared by every generated INSERT statement.
_INSERT_PREFIX = "INSERT INTO region(`id`,`name`,`level`,`parent_id`) VALUES"

# Code prefixes of Beijing, Tianjin, Shanghai and Chongqing. Districts of these
# municipalities are attached directly to the province-level row instead of to
# an intermediate city row.
_MUNICIPALITY_PREFIXES = frozenset(['11', '12', '31', '50'])

_ASCII_DIGITS = frozenset("0123456789")


def _sql_quote(text):
    """Return text as a single-quoted SQL string literal with quotes doubled."""
    return "'" + text.replace("'", "''") + "'"


def _region_value_tuple(code, name):
    """Return the "(id,name,level,parent_id)" value tuple for one region row."""
    quoted = _sql_quote(name)
    if code[2:] == '0000':
        # Province level: no parent.
        print("省:" + code)
        return "({},{},1,0)".format(code, quoted)
    if code[4:] == '00':
        # City level: parent is the enclosing province.
        print("-市:" + code)
        return "({},{},2,{}0000)".format(code, quoted, code[:2])
    # County level: parent is the city, or the province for municipalities.
    print("--县:" + code)
    prefix = code[:2]
    if prefix in _MUNICIPALITY_PREFIXES:
        parent = prefix + "0000"
    else:
        parent = code[:4] + "00"
    return "({},{},3,{})".format(code, quoted, parent)


def _region_insert_statements(rows):
    """Group (code, name) rows by province into one INSERT statement each."""
    statements = []
    pending = []
    province = None

    def flush():
        # Skip empty batches so a bare "INSERT ... VALUES;" is never emitted.
        if pending:
            statements.append(_INSERT_PREFIX + ",".join(pending) + ";\n")
            del pending[:]

    for code, name in rows:
        # Codes are written unquoted into the SQL, so accept only 6 ASCII digits.
        if len(code) != 6 or not set(code) <= _ASCII_DIGITS:
            continue
        if code[2:] == '0000' and code != province:
            # A new province starts: emit everything collected for the previous one.
            flush()
            province = code
        pending.append(_region_value_tuple(code, name))

    # The last province has no successor to trigger a flush, so emit it here.
    flush()
    return statements


def locationCode(url, sql_path='./code.sql'):
    """Scrape the region-code table at url and append INSERT statements to a file.

    The page is fetched with urllib3 and parsed with BeautifulSoup. Every row of
    the first <table> that has at least two ``td.xl7028065`` cells yields a
    (code, name) pair. Rows are classified by their 6-digit code:

    * ``XX0000`` -> level 1, parent 0
    * ``XXXX00`` -> level 2, parent ``XX0000``
    * otherwise  -> level 3, parent ``XXXX00`` (``XX0000`` for the four
      municipalities listed in ``_MUNICIPALITY_PREFIXES``)

    One multi-row INSERT statement is written per province. A progress line is
    printed for every accepted row.

    Args:
        url: Address of the HTML page to fetch.
        sql_path: File the statements are appended to (opened in 'a' mode, so
            running the script twice duplicates the statements).

    Raises:
        IndexError: If the page contains no <table> element.
    """
    http = urllib3.PoolManager()
    response = http.request('GET', url)
    soup = BeautifulSoup(response.data, "html.parser")
    # To work offline, parse a saved copy instead:
    #   BeautifulSoup(open('./201902031029.html', mode='r', encoding='utf-8'), "html.parser")
    table = soup.select('table')[0]

    rows = []
    for tr in table.select('tr'):
        cells = tr.select("td.xl7028065")
        # Both the code cell and the name cell are required.
        if len(cells) >= 2:
            rows.append((cells[0].get_text(), cells[1].get_text()))

    statements = _region_insert_statements(rows)
    # The context manager guarantees the file is flushed and closed.
    with open(sql_path, 'a', encoding='utf-8') as sql_file:
        sql_file.writelines(statements)
# Script entry point: scrape URL and append the generated SQL to ./code.sql.
# NOTE: this call is unguarded, so it also runs when the module is imported.
locationCode(URL)