中国现行的行政区划分为如下四级:
一级省级行政区:包括省、自治区、直辖市、特别行政区。
二级地级行政区:包括地级市、地区、自治州、盟。
三级县级行政区:包括市辖区、县级市、县、自治县、旗、自治旗、特区、林区。
四级乡级行政区:包括街道、镇、乡、民族乡、苏木、民族苏木、县辖区。
通过 tcmap.com.cn 网站可以简单地爬取到全国四级行政区域的名称。
from bs4 import BeautifulSoup
from urllib.request import urlopen,urlparse,urlsplit,Request
import urllib.request
import re
import codecs
import random
#
# Pool of browser User-Agent strings; getStrongItem picks one at random
# for every request it sends.
ua_list = [
    # Chrome
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36",
    # Firefox
    "Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0",
    # Internet Explorer
    "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
    # Opera
    "Opera/9.99 (Windows NT 5.1; U; zh-CN) Presto/9.9.9",
]

# Root of the crawled site; page paths are appended to it verbatim.
base_url = 'http://www.tcmap.com.cn/'

# Module-level result lists. Nothing in the visible code reads them
# (getAll binds local names with the same spelling).
citys, distrcts, streets = [], [], []
def getStrongItem(p):
    """Download one page of the site and list its bold "blue" links.

    The page at ``base_url + p`` is fetched with a randomly chosen
    User-Agent from ``ua_list`` and decoded as gb18030.  Every
    ``<strong>`` element that sits inside a ``<table>`` and contains an
    ``<a class="blue" href=...>`` link contributes one entry.

    Args:
        p: Path of the page, appended verbatim to ``base_url``
            (for example ``'zhejiangsheng'`` or an ``href`` value
            returned by a previous call).

    Returns:
        A tuple ``(items, item_htmls)`` of two parallel lists: the link
        texts and the matching ``href`` values, in document order.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the page
            cannot be fetched.
    """
    ua = random.choice(ua_list)
    # NOTE(review): an href that starts with '/' yields a double slash
    # after base_url; left unchanged here -- confirm the server accepts it.
    req = urllib.request.Request(base_url + p, headers={'User-agent': ua})
    # Close the HTTP response as soon as the body has been read instead
    # of leaving the connection open until garbage collection.
    with urlopen(req) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser', from_encoding="gb18030")

    items = []
    item_htmls = []
    # Walk the <strong> elements directly (each exactly once).  Scanning
    # per <table> would visit a <strong> once for every table that
    # encloses it, producing duplicate entries on nested-table layouts.
    for strong in soup.find_all('strong'):
        if strong.find_parent('table') is None:
            continue
        # Read the link through the parsed tree rather than slicing the
        # serialized tag text, so attribute order, nested markup and line
        # breaks inside the element cannot shift the extracted fields.
        link = strong.find('a', class_='blue', href=True)
        if link is None:
            continue
        items.append(link.get_text())
        item_htmls.append(link['href'])
    return items, item_htmls
def getAll():
    """Crawl each configured province down to its lowest listed level.

    For every province the province page is fetched with getStrongItem,
    then every city page it links to, then every district page those
    link to.

    Returns:
        A nested dict: province display name -> city name -> district
        name -> list of item names found on the district page.
    """
    # URL slug -> display name.  Further slugs, currently disabled:
    #   'jiangxi':'江西省','xinjiang':'新疆维吾尔自治区','gansusheng':'甘肃省',
    #   'neimenggu':'内蒙古自治区','heilongjiang':'黑龙江省','jilin':'吉林省',
    #   'liaoning':'辽宁省','hebei':'河北省','shandong':'山东省',
    #   'shanxisheng':'山西省','henan':'河南省','jiangsu':'江苏省','anhui':'安徽省'
    provinces = {'zhejiangsheng': '浙江省'}

    def crawl_city(city_page):
        # Map each district linked from the city page to the names
        # listed on that district's own page.
        district_names, district_pages = getStrongItem(city_page)
        return {
            name: getStrongItem(page)[0]
            for name, page in zip(district_names, district_pages)
        }

    result = {}
    for slug, display_name in provinces.items():
        city_names, city_pages = getStrongItem(slug)
        result[display_name] = {
            name: crawl_city(page)
            for name, page in zip(city_names, city_pages)
        }
    return result
# Script entry point: run the full crawl when executed directly.
# The mapping returned by getAll() is not stored or printed here.
if __name__ == '__main__':
    getAll()