Note: the data comes from the National Bureau of Statistics.
The script crawls the 2020 edition of the administrative division codes.
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Fetch the index page that lists all provinces
base_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'
url = 'index.html'
# Send the HTTP request; the site serves GBK-encoded pages
response = requests.get(base_url + url)
response.encoding = 'gbk'
html = response.text
soup = BeautifulSoup(html, 'html.parser')
provincetr = soup.find('table', {"class": "provincetable"})

f = open("result.txt", "w", encoding="utf-8")
for i in provincetr.find_all('a'):
    # Each <a> in the province table links to that province's city page;
    # its text is the province name
    print("Province: " + i.get_text())
    shiUrl = base_url + i['href']
    responseShi = requests.get(shiUrl)
    responseShi.encoding = 'gbk'
    htmlShi = responseShi.text
    soupShi = BeautifulSoup(htmlShi, 'html.parser')
    citytr = soupShi.find('table', {"class": "citytable"})
    for c in citytr.find_all('tr', {"class": "citytr"}):
        # Each city row has two cells: city code and city name
        ca = c.find_all("td")
        responseQu = requests.get(base_url + ca[0].a['href'])
        responseQu.encoding = 'gbk'
        htmlQu = responseQu.text
        soupQu = BeautifulSoup(htmlQu, 'html.parser')
        countytr = soupQu.find('table', {"class": "countytable"})
        if countytr is None:
            print(ca[0].get_text() + "," + ca[1].get_text() + "," + i.get_text() + ": no county-level divisions")
            continue
        for cq in countytr.find_all('tr', {"class": "countytr"}):
            # Each county row has two cells: county code and county name
            cqa = cq.find_all("td")
            rec = cqa[0].get_text() + "," + cqa[1].get_text() + "," + ca[0].get_text() + "," + ca[1].get_text() + "," + i.get_text()
            print(rec)
            f.write(rec + "\n")
            f.flush()
f.close()
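
Each line written to result.txt is a comma-separated record of county code, county name, city code, city name, province name. Below is a minimal sketch of reading the file back, assuming it was written with UTF-8 as in the script above (not part of the original script):

import csv

with open("result.txt", encoding="utf-8") as fh:
    for county_code, county_name, city_code, city_name, province in csv.reader(fh):
        # Print the hierarchy from province down to county
        print(province, city_name, county_name, county_code)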