Crawling National Bureau of Statistics data
import re
import time

import requests
from bs4 import BeautifulSoup

# Root of the 2017 statistical division-code directory; city_parser also needs
# it later to turn relative hrefs into absolute URLs
root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'

def geturl(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Mobile Safari/537.36 Edg/89.0.774.76'
    }
    html = requests.get(url, headers=headers)
    # The pages are not UTF-8; let requests detect the real encoding from the body
    html.encoding = html.apparent_encoding
    return html.text
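
# Why the apparent_encoding line matters: these pages are served as GB2312,
# while requests guesses ISO-8859-1 from the HTTP headers alone, which turns
# the Chinese names into mojibake. A quick, hedged check (the exact values
# depend on the server's response):
#   resp = requests.get(root_url)
#   resp.encoding            # likely 'ISO-8859-1', guessed from the headers
#   resp.apparent_encoding   # likely 'GB2312' or 'GBK', detected from the body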

def province_parser(url):
    html = geturl(url)
    if html is None:
        raise Exception('Html is None')
    soup = BeautifulSoup(html, 'html.parser')
    # Find the <a> tags for "北京市", "天津市", and the other provinces
    url_tds = soup.find_all('a', href=re.compile(r'\d+\.html'))
    time.sleep(0.5)
    # Build a list of tuples: (province name, next-level URL, province code);
    # the code is needed later when assembling county-level URLs
    urls = [(td.text, url + td['href'], td['href'].replace('.html', '')) for td in url_tds]
    return urls
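
# Illustrative shape of province_parser's return value (the order and exact
# values depend on the live page; this is an assumption, not captured output):
#   province_parser(root_url)[0]
#   -> ('北京市', 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/11.html', '11')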

def city_parser(url):
    html = geturl(url)
    if html is None:
        raise Exception('Html is None')
    soup = BeautifulSoup(html, 'html.parser')
    # Find the <tr class="citytr"> rows for "杭州市", "温州市", etc.
    time.sleep(0.2)
    url_trs = soup.find_all('tr', 'citytr')
    # Build a list of tuples: (city name, next-level URL, 12-digit city code)
    urls = [(tr.contents[1].text if tr.contents[1].a is None else tr.contents[1].a.text,
             None if tr.contents[0].a is None else root_url + tr.contents[0].a['href'],
             tr.contents[0].text if tr.contents[0].a is None else tr.contents[0].a.text)
            for tr in url_trs]
    return urls
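
# To make the contents[...] indexing concrete: a simplified citytr row as the
# parser expects it (markup inferred from the class names, so an assumption):
demo_city = BeautifulSoup('<tr class="citytr"><td><a href="33/3301.html">'
                          '330100000000</a></td><td><a href="33/3301.html">'
                          '杭州市</a></td></tr>', 'html.parser').tr
# demo_city.contents[0] is the 12-digit code cell, contents[1] the name cell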

def county_parser(url):
    html = geturl(url)
    if html is None:
        raise Exception('Html is None')
    soup = BeautifulSoup(html, 'html.parser')
    # Find the <tr class="countytr"> rows for "上城区", "下城区", etc.
    time.sleep(0.2)
    url_trs = soup.find_all('tr', 'countytr')
    # Build a list of tuples: (county name, next-level URL, 12-digit county code);
    # url[0:-9] strips the trailing "xxxx.html" from the city URL so the
    # relative county href found on the page can be appended to it
    urls = [(tr.contents[1].text if tr.contents[1].a is None else tr.contents[1].a.text,
             None if tr.contents[0].a is None else url[0:-9] + tr.contents[0].a['href'],
             tr.contents[0].text if tr.contents[0].a is None else tr.contents[0].a.text)
            for tr in url_trs]
    # The expressions above use Python's conditional expression ("A if cond else B").
    # The guard tr.contents[1].a is None is needed because some rows (for example
    # under the directly-administered municipalities) have no sub-page: their <tr>
    # contains no <a>, and calling tr.contents[1].a.get_text() there would raise an error.
    return urls
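
# A minimal, runnable illustration of that guard, using a hypothetical
# link-less row of the kind described above:
demo_tr = BeautifulSoup('<tr class="countytr"><td>110100000000</td>'
                        '<td>市辖区</td></tr>', 'html.parser').tr
demo_name = (demo_tr.contents[1].text if demo_tr.contents[1].a is None
             else demo_tr.contents[1].a.text)
# demo_name == '市辖区'; demo_tr.contents[1].a.text alone would raise AttributeError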

def town_parser(url):
    soup = BeautifulSoup(geturl(url), 'html.parser')
    # Find the <tr class="towntr"> rows for "西湖街道", "留下街道", etc.
    url_trs = soup.find_all('tr', 'towntr')
    # Build a list of tuples: (town/subdistrict name, next-level URL, 12-digit
    # town code); url[0:-11] strips the trailing "xxxxxx.html" from the
    # county URL so the relative town href can be appended to it, producing
    # the full next-level URL
    urls = [(tr.contents[1].text if tr.contents[1].a is None else tr.contents[1].a.text,
             None if tr.contents[0].a is None else url[0:-11] + tr.contents[0].a['href'],
             tr.contents[0].text if tr.contents[0].a is None else tr.contents[0].a.text)
            for tr in url_trs]
    return urls
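
# A worked example of the slicing (the codes are illustrative; the layout
# follows the 2017 directory structure):
#   county_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/33/01/330106.html'
#   town_href  = '06/330106001.html'   # relative href found on the county page
#   county_url[0:-11] + town_href
#   -> 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/33/01/06/330106001.html'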

def village_parser(url):
    soup = BeautifulSoup(geturl(url), 'html.parser')
    # Find the <tr class="villagetr"> rows; each has three cells:
    # 12-digit code, urban-rural classification code, and village name
    url_trs = soup.find_all('tr', 'villagetr')
    # contents[2] is the village (committee) name
    urls = [tr.contents[2].text for tr in url_trs]
    return urls
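
# An illustrative villagetr row showing why contents[2] holds the name; the
# middle cell is the 3-digit urban-rural classification code (markup inferred,
# names invented for the example):
demo_village = BeautifulSoup('<tr class="villagetr"><td>330106001001</td>'
                             '<td>111</td><td>东山弄社区</td></tr>',
                             'html.parser').tr
# demo_village.contents[2].text -> '东山弄社区'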

# Walk the whole hierarchy and write one text file per province (municipality)
for itema in province_parser(root_url):
    # itema[0] is the province (municipality) name and doubles as the file name
    with open(itema[0] + '.txt', 'w', encoding='utf-8') as f:
        f.write(itema[0] + '\n')
        for itemb in city_parser(itema[1]):  # itema[1] is the next-level URL
            f.write(itemb[0] + '\n')
            if itemb[1] is None:  # rows without a sub-page carry None as their URL
                continue
            for itemc in county_parser(itemb[1]):
                f.write(itemc[0] + '\n')
                time.sleep(0.5)
                if itemc[1] is None:
                    continue
                for itemd in town_parser(itemc[1]):
                    f.write(itemd[0] + '\n')
                    if itemd[1] is None:
                        continue
                    for iteme in village_parser(itemd[1]):
                        f.write(iteme + '\n')
    print('Finished crawling one province (municipality)!')
Run result: one .txt file per province (municipality) is saved, named after the province.
(Figure: contents of one of the crawled and saved files.)