import requests
import time
from lxml import etree
def analysis_html(address, pattern):
    """Fetch *address*, parse it as HTML, and select nodes matching *pattern*.

    Side effect: rebinds the module-level ``trs`` to the matched node list
    (the other crawler functions read that global). Also returns the list,
    which is a backward-compatible addition so callers can avoid the global.

    :param address: absolute URL to fetch (uses module-level ``headers``).
    :param pattern: XPath expression applied to the parsed document.
    :return: list of matched lxml nodes.
    """
    global trs
    response = requests.get(address, headers=headers)
    # Force decoding; the site does not always declare its charset reliably.
    response.encoding = 'utf-8'
    html = etree.HTML(response.text)
    trs = html.xpath(pattern)
    return trs
def make_url(url_before, pattern, node=None):
    """Build an absolute URL by appending the first XPath match to *url_before*.

    :param url_before: URL prefix (e.g. the site base address).
    :param pattern: XPath expression yielding at least one string (an @href).
    :param node: element to query; when omitted, falls back to the
        module-level ``tr`` set by the crawl loops (original behavior,
        kept for backward compatibility).
    :return: concatenated URL string.
    :raises IndexError: if *pattern* matches nothing.
    """
    target = tr if node is None else node
    return url_before + target.xpath(pattern)[0]
def province():
    """Crawl the 2021 index page and descend into every province.

    Iterates the province <td> cells, prints each province name, and hands
    control to ``city`` for each province page. ``tr`` is deliberately
    declared global because ``make_url`` reads it from module scope.
    """
    global tr
    analysis_html(f'{main_address}index.html', '//tr[@class="provincetr"]/td')
    # trs[:-1]: the last cell is skipped — NOTE(review): presumably it is not
    # a province link on the live page; confirm before changing.
    for tr in trs[:-1]:
        # Renamed from `province` to avoid shadowing this function.
        province_name = tr.xpath('./a/text()')[0]
        province_url = make_url(main_address, './a/@href')
        print(province_name)
        city(province_url, province_name)
        time.sleep(1)  # throttle requests to be polite to the server
def city(province_url, province):
    """Crawl one province page and descend into each of its cities.

    :param province_url: absolute URL of the province listing page.
    :param province: province name, forwarded to ``country`` for CSV output.
    """
    analysis_html(province_url, '//tr[@class="citytr"]')
    for tr in trs:
        # Renamed from `city` to avoid shadowing this function.
        city_name = tr.xpath('./td[2]/a/text()')[0]
        # Build the city URL from this row's own node. `make_url` cannot be
        # used here without passing the node explicitly: it reads the global
        # `tr` bound by province(), not this loop's local `tr`.
        page = tr.xpath('./td[1]/a/@href')[0]
        city_url = main_address + page
        city_id = tr.xpath('./td[1]/a/text()')[0]
        country(city_url, city_name, city_id, province, province_url)
        time.sleep(1)  # throttle requests to be polite to the server
def country(city_url, city, city_id, province, province_url):
    """Crawl one city page and append one CSV line per county to global ``f``.

    :param city_url: absolute URL of the city's county listing page.
    :param city: city name written to the CSV row.
    :param city_id: city statistical code written to the CSV row.
    :param province: province name written to the CSV row.
    :param province_url: unused; kept for interface compatibility.
    """
    analysis_html(city_url, '//tr[@class="countytr"]')
    for tr in trs:
        # Some rows (e.g. district headers) have no <a>, so the xpath result
        # is empty. Catch only that case instead of a bare `except: pass`,
        # which also silently swallowed write errors.
        try:
            country_name = tr.xpath('./td[2]/a/text()')[0]
            country_id = tr.xpath('./td[1]/a/text()')[0]
        except IndexError:
            continue
        f.write(f'{country_name},{country_id}\t,{city},{city_id}\t,{province}\n')
        time.sleep(1)  # throttle; only counts rows actually written
# Request headers: desktop browser UA plus the cookie values the
# stats.gov.cn site expects; used by analysis_html().
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.204 Safari/537.36',
'Cookie':'AD_RS_COOKIE=20080918; _trs_uv=kahvgie3_6_fc6v'
}
# Base URL of the 2021 administrative-division code listings.
main_address = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/"
# Open the output CSV in append mode (filename means "province-city-county");
# `f` is read as a global by country(). province() drives the whole crawl.
with open(r'省市区.csv', 'a',encoding='utf-8') as f:
    province()
# Results are saved to the CSV file above; this example uses the 2021 dataset.