实现目标:
使用python+playwright从国家统计局获取天津市(示例)所有行政区划数据
前置条件:
1.获取数据的网站:国家统计局统计用区划和城乡划分代码(https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/)
2.编辑器:
3.第三方库下载:
pip install playwright
playwright install
pip install pandas
实现步骤:
1.region_1.py:待运行的python文件
# 导入
from playwright.sync_api import Playwright, sync_playwright
import pandas as pd
from urllib.parse import urljoin
import os
# Source pages on the National Bureau of Statistics site (2023 division codes).
# BASE_URL_1: district-level index page for Tianjin (province 12 / city 1201);
# BASE_URL_2: a street-level page — used only as a base for resolving the
# relative hrefs found on town-level pages.
BASE_URL_1 = 'https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/12/1201.html'
BASE_URL_2 = 'https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/12/01/120101.html'
# Crawl queue of pages still to visit, seeded with the district index page.
url_list = [BASE_URL_1]
# Tallies per level: [districts/counties, streets/towns, communities/villages].
count = [0,0,0]
# Accumulated rows (acode, region, facode) across all crawled pages.
result = pd.DataFrame()
def scrape_page(page, url):
    """Navigate *page* to *url* and block until network activity settles."""
    page.goto(url)
    page.wait_for_load_state('networkidle')
def parse_index(page):
    """Queue child-page links found on the current index page, then parse it.

    Each anchor in the second table column links one administrative level
    down.  The relative hrefs are resolved against the page's own URL and
    appended to the global crawl queue *url_list*; finally the rows of the
    current page are handed to parse_detail().
    """
    # Anchors in the 2nd column lead to the next administrative level.
    elements = page.query_selector_all('tr td:nth-child(2) a')
    # Resolve relative hrefs against the URL actually loaded in the browser,
    # instead of guessing the base from which table class is present (the
    # original BASE_URL_1/BASE_URL_2 heuristic breaks on deeper levels).
    base = page.url
    for element in elements:
        href = element.get_attribute('href')
        url_list.append(urljoin(base, href))
    parse_detail(page)
def parse_detail(page):
    """Parse one division-listing page into (acode, region, facode) rows.

    Works for whichever level table the page carries (county, town or
    village); the first <tr> of the table is the header row and is skipped.
    Appends the rows to the global *result* DataFrame, bumps the matching
    *count* bucket and pops the just-processed URL off *url_list*.
    """
    global result
    global count
    rows = page.query_selector_all(
        '.villagetable tbody tr, .countytable tbody tr, .towntable tbody tr')
    records = []
    for row in rows[1:]:  # rows[0] is the table header
        row_class = row.get_attribute('class')
        acode = row.query_selector('td:nth-child(1)').text_content()
        if row_class == 'villagetr':
            # Village rows carry an extra urban/rural column, so the name
            # sits in column 3 rather than column 2.
            region = row.query_selector('td:nth-child(3)').text_content()
            facode = acode[:9]   # parent = 9-digit street/town code
            count[2] += 1
        elif row_class == 'towntr':
            region = row.query_selector('td:nth-child(2)').text_content()
            acode = acode[:9]    # streets use 9 significant digits
            facode = acode[:6]   # parent = 6-digit district code
            count[1] += 1
        else:
            # District/county row.
            region = row.query_selector('td:nth-child(2)').text_content()
            acode = acode[:6]
            facode = acode[:4]   # parent = 4-digit city code
            count[0] += 1
        records.append({'acode': acode, 'region': region, 'facode': facode})
    # One concat per page instead of one per row — the original rebuilt the
    # whole DataFrame for every row, which is O(n^2) overall.
    if records:
        result = pd.concat([result, pd.DataFrame(records)], ignore_index=True)
    url_list.pop(0)
def run(playwright: Playwright) -> None:
    """Crawl every queued page, then write the collected divisions to Excel.

    Drains the global *url_list* queue (parse_index keeps appending child
    pages, parse_detail pops finished ones), writes *result* to
    output_region_1.xlsx in the current working directory and prints a
    per-level summary.
    """
    print("---------------行政区划获取中(来源:国家统计局)-------------------")
    # headless=False shows the browser window; set True for silent runs.
    browser = playwright.chromium.launch(headless=False)
    context = browser.new_context()
    page = context.new_page()
    try:
        while url_list:
            scrape_page(page, url_list[0])
            parse_index(page)
        # Output goes next to the current working directory.
        output_path = os.path.join(os.getcwd(), "output_region_1.xlsx")
        try:
            result.to_excel(output_path, index=False)
        except PermissionError:
            # Typically the workbook is still open in Excel, so the file
            # cannot be replaced (see the PermissionError noted in the doc).
            print('无法写入 %s:请先关闭该文件后重试' % output_path)
            raise
        print('共统计出 %d 条数据:' % len(result))
        print('———————————————————————————')
        print(' 区(县) | %d 个 ' % count[0])
        print(' 街道(乡/镇) | %d 个 ' % count[1])
        print(' 社区(村) | %d 个 ' % count[2])
        print('———————————————————————————')
        print("---------------------------获取完成------------------------------")
    finally:
        # Release browser resources even when the crawl or the Excel write
        # fails part-way (the original leaked them on any exception).
        context.close()
        browser.close()
# Start the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    with sync_playwright() as playwright:
        run(playwright)
2.运行region_1.py
3.会弹出一个浏览器窗口,待查询结束后会关闭
4.终端
5.生成的excel表格:output_region_1.xlsx
acode:区划代码
region:区划名称
facode:父级区划代码
问题汇总:
1)PermissionError: [Errno 13] Permission denied: 'E:\\gis\\output_region_1.xlsx'
说明output_region_1.xlsx文件未关闭,查询到的内容无法写入
2)开启无头浏览器
将region_1.py文件中的browser = playwright.chromium.launch(headless=False)中的headless修改为True