#coding=utf-8
import requests
from bs4 import BeautifulSoup
#import pymysql
import time
import sqlite3
import os
import _thread
import traceback
class Administrative(object):
    """Crawl administrative-division codes from stats.gov.cn into SQLite.

    Instantiating the class performs the whole job: it opens ``sqlite3.db``,
    recreates the ``china`` table, walks province -> city -> county ->
    township -> village pages and stores one row per village.
    """

    # Root page of the 2019 code listing.
    BASE_URL = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
    # Only these provinces are downloaded; set to None or () to crawl all.
    PROVINCES = ('海南省', '广东省')
    INSERT_SQL = "insert into china (province_name,city_code,city_name,county_code,county_name,xiang_code,xiang_name,cun_code,cun_name) values (?,?,?,?,?,?,?,?,?)"
    # Retry policy for a single page fetch.
    MAX_RETRIES = 10
    RETRY_DELAY = 5  # seconds between attempts
    REQUEST_TIMEOUT = 30  # seconds; without it a stalled socket blocks forever

    def __init__(self):
        """Open the database, recreate the ``china`` table and run the crawl."""
        self.db = sqlite3.connect('sqlite3.db', check_same_thread=False)
        try:
            self.db.execute('DROP TABLE IF EXISTS china;')
            self.db.execute('CREATE TABLE china (cid INTEGER PRIMARY KEY ,province_name varchar(255),city_code varchar(255),city_name varchar(255),county_code varchar(255),county_name varchar(255),xiang_code varchar(255),xiang_name varchar(255),cun_code varchar(255),cun_name varchar(255) )')
            self.main()
        finally:
            # Close the connection even when the crawl aborts part-way.
            self.db.close()

    def main(self):
        """Walk the province index page and crawl every selected province."""
        base_url = self.BASE_URL
        for province_row in self.get_response(base_url, 'provincetr') or []:
            # One table row holds several provinces, one per <td>.
            for cell in province_row.find_all('td'):
                link = cell.a
                if link is None:
                    continue
                province_name = link.get_text()
                if self.PROVINCES and province_name not in self.PROVINCES:
                    continue
                href = link.get('href')
                if not href:
                    continue
                print(province_name)
                self._crawl_province(
                    province_name, self._sibling_url(base_url, href))

    def _crawl_province(self, province_name, province_url):
        """Crawl one province and insert its villages, one batch per county."""
        for city_code, city_name, city_url in self._iter_rows(province_url):
            if city_url is None:
                continue
            for county_code, county_name, county_url in self._iter_rows(city_url):
                if county_url is None:
                    # Rows without a link have no township page to descend into.
                    continue
                prefix = (province_name, city_code, city_name,
                          county_code, county_name)
                datas = self._collect_county(prefix, county_url)
                self.connect_mysql(self.INSERT_SQL, datas)

    def _collect_county(self, prefix, county_url):
        """Return the village tuples found below one county page."""
        datas = []
        for xiang_code, xiang_name, xiang_url in self._iter_rows(county_url):
            if xiang_url is None:
                continue
            for village_row in self._data_rows(xiang_url):
                cells = village_row.find_all('td')
                if len(cells) < 3:
                    continue
                # Code is in column 0 and name in column 2; column 1 is not
                # stored (presumably a classification code -- confirm).
                datas.append(prefix + (xiang_code, xiang_name,
                                       self._text(cells[0]),
                                       self._text(cells[2])))
        return datas

    def _iter_rows(self, page_url):
        """Yield ``(code, name, child_url)`` for each data row of a page.

        ``child_url`` is None when the name cell carries no usable link.
        """
        for row in self._data_rows(page_url):
            cells = row.find_all('td')
            if len(cells) < 2:
                continue
            link = cells[1].a
            href = link.get('href') if link is not None else None
            child_url = self._sibling_url(page_url, href) if href else None
            yield self._text(cells[0]), self._text(cells[1]), child_url

    def _data_rows(self, page_url):
        """Return the rows of a page without its first (header) row."""
        rows = self.get_response(page_url, None)
        return rows[1:] if rows else []

    @staticmethod
    def _sibling_url(page_url, href):
        """Resolve *href* against the directory that contains *page_url*."""
        return page_url.rsplit('/', 1)[0] + '/' + href

    @staticmethod
    def _text(cell):
        """Return ``cell.string`` as a plain ``str`` (None stays None)."""
        value = cell.string
        # Converting to str stores a plain string instead of a
        # NavigableString tied to the parsed document.
        return None if value is None else str(value)

    def get_response(self, url, attr):
        """Fetch *url* and return the ``<tr>`` rows of its data table.

        Args:
            url: Page to download; the body is decoded as GBK.
            attr: CSS class the rows must carry, or a falsy value for all rows.

        Returns:
            The matching rows, or None when the page could not be fetched or
            parsed within ``MAX_RETRIES`` attempts.
        """
        for attempt in range(1, self.MAX_RETRIES + 1):
            try:
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
                response.encoding = 'gbk'
                soup = BeautifulSoup(response.text, 'lxml')
                tbodylist = soup.find_all('tbody')
                if len(tbodylist) > 1:
                    table = tbodylist[1].tbody.tbody.table
                    if attr:
                        return table.find_all('tr', attrs={'class': attr})
                    return table.find_all('tr')
            except Exception:
                # Deliberately broad: any network or page-structure failure
                # is treated as transient and retried.
                traceback.print_exc()
            if attempt < self.MAX_RETRIES:
                # Always pause before retrying so the server is not hammered.
                time.sleep(self.RETRY_DELAY)
        print('出现错误')
        print(url)
        return None

    def connect_mysql(self, sql, data):
        """Execute *sql* with *data* on the SQLite connection.

        Args:
            sql: Parameterised statement.
            data: One parameter set, or a list/tuple of tuples to run the
                statement once per tuple. Falsy data executes nothing.

        Returns:
            Always None. Database errors are printed and rolled back rather
            than raised.
        """
        cursor = self.db.cursor()
        try:
            if data:
                many = (isinstance(data, (list, tuple))
                        and isinstance(data[0], tuple))
                if many:
                    cursor.executemany(sql, data)
                else:
                    cursor.execute(sql, data)
        except sqlite3.Error:
            traceback.print_exc()
            self.db.rollback()
        else:
            # Commit only when the statement succeeded.
            self.db.commit()
        finally:
            cursor.close()
        return None
# Script entry point: constructing the object runs the whole crawl.
if __name__ == "__main__":
    Administrative()
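# This code is based on CoderYYN's article on fetching the latest national
# province/city/district administrative-region data from the National Bureau
# of Statistics, which only went down to the third (district/county) level.
# It has been modified here, with some bugs fixed, to cover all five levels:
# province, city, county, township and village. Crawling the five-level
# address codes for the whole country takes quite a while!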