Note: the data comes from the National Bureau of Statistics.
The script crawls the 2020 edition of the administrative division codes.
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Fetch the index page that lists all provinces
base_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'
url = 'index.html'
# Send the HTTP request; the site serves GBK-encoded pages
response = requests.get(base_url + url)
response.encoding = 'gbk'
html = response.text
soup = BeautifulSoup(html, 'html.parser')
provincetr = soup.find('table', {"class": "provincetable"})

f = open("result.txt", "w", encoding="utf-8")
for i in provincetr.find_all('a'):
    # Each <a> in the province table links to that province's city page;
    # its text is the province name
    print("Province: " + i.get_text())
    shiUrl = base_url + i['href']
    responseShi = requests.get(shiUrl)
    responseShi.encoding = 'gbk'
    htmlShi = responseShi.text
    soupShi = BeautifulSoup(htmlShi, 'html.parser')
    citytr = soupShi.find('table', {"class": "citytable"})
    for c in citytr.find_all('tr', {"class": "citytr"}):
        # Each city row has two cells: city code and city name
        ca = c.find_all("td")
        responseQu = requests.get(base_url + ca[0].a['href'])
        responseQu.encoding = 'gbk'
        htmlQu = responseQu.text
        soupQu = BeautifulSoup(htmlQu, 'html.parser')
        countytr = soupQu.find('table', {"class": "countytable"})
        if countytr is None:
            print(ca[0].get_text() + "," + ca[1].get_text() + "," + i.get_text() + ": no county-level divisions")
            continue
        for cq in countytr.find_all('tr', {"class": "countytr"}):
            # Each county row has two cells: county code and county name
            cqa = cq.find_all("td")
            rec = cqa[0].get_text() + "," + cqa[1].get_text() + "," + ca[0].get_text() + "," + ca[1].get_text() + "," + i.get_text()
            print(rec)
            f.write(rec + "\n")
            f.flush()
f.close()
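
Each line written to result.txt is a comma-separated record of county code, county name, city code, city name, province name. Below is a minimal sketch of reading the file back, assuming it was written with UTF-8 as in the script above (not part of the original script):

import csv

with open("result.txt", encoding="utf-8") as fh:
    for county_code, county_name, city_code, city_name, province in csv.reader(fh):
        # Print the hierarchy from province down to county
        print(province, city_name, county_name, county_code)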