Python + BeautifulSoup 抓取“全国行政区划信息查询平台”的省市区信息

全国行政区划信息查询平台地址:http://xzqh.mca.gov.cn/map

检查网页源码:在这里插入图片描述

检查网页源码可以发现:所有省级信息都保存在页面 JavaScript 的 json 变量中,页面加载时会读取这份 JSON 数据,并填充到页面的 option 中。
1.第一步:使用正则表达式抓取 JSON 数据并解析,组成一个 province 字典(区划代码 → 省名):

 # 获取省的集合
       def get_province(self):
           """Return a dict mapping each province's division code to its name.

           The province list is embedded in the page as ``var json = [...];``
           inside a <script> tag; it is extracted with a regular expression and
           parsed as JSON. Each entry supplies the code under ``quHuaDaiMa`` and
           the name under ``shengji``.
           """
           json_re = re.compile(r"var json =(.*?);", re.MULTILINE | re.DOTALL)
           # Locate the <script> tag whose content matches the pattern.
           script_tag = self.soup.find("script", text=json_re)
           raw_json = str(json_re.search(script_tag.text).group(1))
           return {
               entry['quHuaDaiMa']: entry['shengji']
               for entry in json.loads(raw_json)
           }

2.第二步:检查该网站实现级联查询的方式,找出查询市区的方式

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
根据这段源码可以看出,在选择省级之后,网页会调用 selectJson 接口发起一个 POST 请求,上图可以看到请求的 body 和 header 等信息。
于是事情就变得简单起来:代码可以这样写(如下)

# 获取市
    def get_city(self, shengji):
        """Return a dict mapping division code to city name for one province.

        Sends the same form POST the web page issues when a province is
        selected (``shengji=<province name>``) to the selectJson endpoint and
        parses the JSON array it returns.

        :param shengji: province name exactly as returned by get_province().
        :return: dict of ``quHuaDaiMa`` -> ``diji`` taken from each entry.
        :raises requests.RequestException: on timeout, connection failure or
            an HTTP error status.
        """
        # The body is sent as raw UTF-8 (not percent-encoded), mirroring the
        # request format the article verified against the site.
        body = ("shengji=" + shengji).encode('UTF-8')
        headers = {'Content-Type': "application/x-www-form-urlencoded; charset=utf-8",
                   'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, "
                                 "like Gecko) Chrome/77.0.3865.120 Safari/537.36"
                   }
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.post('http://xzqh.mca.gov.cn/selectJson', data=body, headers=headers,
                                 timeout=10)
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        return {item['quHuaDaiMa']: item['diji'] for item in json.loads(response.content)}

    # 获取区
    def get_area(self, shengji, diji):
        """Return a dict mapping division code to district name for one city.

        Sends the form POST ``shengji=<province>&diji=<city>`` to the
        selectJson endpoint and parses the JSON array it returns.

        :param shengji: province name exactly as returned by get_province().
        :param diji: city name exactly as returned by get_city().
        :return: dict of ``quHuaDaiMa`` -> ``xianji`` taken from each entry.
        :raises requests.RequestException: on timeout, connection failure or
            an HTTP error status.
        """
        # The body is sent as raw UTF-8 (not percent-encoded), mirroring the
        # request format the article verified against the site.
        body = ("shengji=" + shengji + "&diji=" + diji).encode('UTF-8')
        headers = {'Content-Type': "application/x-www-form-urlencoded; charset=utf-8",
                   'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, "
                                 "like Gecko) Chrome/77.0.3865.120 Safari/537.36"
                   }
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.post('http://xzqh.mca.gov.cn/selectJson', data=body, headers=headers,
                                 timeout=10)
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        return {item['quHuaDaiMa']: item['xianji'] for item in json.loads(response.content)}

3.第三步:main函数(遍历所有省市区+数据入库)
数据库表结构如下:
在这里插入图片描述
【全部代码如下】:

import requests
from bs4 import BeautifulSoup
import pymysql
import re
import json


class allAreaDataNew(object):
    """Scrape China's province / city / district list and reload it into MySQL.

    Provinces are read from JSON embedded in the map page; cities and
    districts are fetched through the site's ``selectJson`` POST endpoint.
    Instantiating the class runs the whole job: connect, scrape, replace the
    contents of ``area_config`` in one transaction, disconnect.

    NOTE(review): the insert statements supply five values per row, presumably
    (auto id, name, type, division code, parent division code) -- confirm
    against the actual ``area_config`` schema.
    """

    base_url = 'http://xzqh.mca.gov.cn/map'
    select_url = 'http://xzqh.mca.gov.cn/selectJson'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
    # Seconds to wait for the server; without it requests can block forever.
    request_timeout = 10
    # City names that have no division code of their own (see main()).
    direct_county_names = ('省直辖县级行政单位', '自治区直辖县级行政单位')

    # NOTE: the page is fetched when the class body is executed (i.e. when the
    # module is loaded); kept that way so the class attributes stay available
    # exactly as before.
    wb_data = requests.get(base_url, headers=headers, timeout=request_timeout)
    wb_data.encoding = 'GBK'
    soup = BeautifulSoup(wb_data.text, 'lxml')

    def __init__(self):
        """Open the MySQL connection, run the full import, then close it."""
        # PyMySQL >= 1.0 accepts keyword arguments only for connect().
        self.db = pymysql.connect(host="***", user="***", password="***",
                                  database="***", charset="utf8mb4")
        try:
            self.main()
        finally:
            # Release the connection even when scraping or loading fails.
            self.db.close()

    # Entry point
    def main(self):
        """Walk every province, city and district and reload ``area_config``."""
        insert_sql = "insert into area_config values (null,%s,%s,%s,%s)"
        # A dict is used as an insertion-ordered set: identical rows are
        # de-duplicated while provinces stay ahead of their cities/districts.
        statements = {}

        for province_code, province in self.get_province().items():
            city_dict = self.get_city(province)
            statements[(insert_sql, (province, 'PROVINCE', province_code, 0))] = None
            print(
                province_code + "----------------------------------省------------------------------------------" + province + "\n")
            for city_code, city in city_dict.items():
                area_dict = self.get_area(province, city)
                print(city_code + "*******************市****************" + city + "\n")
                # "Counties directly under the province / autonomous region"
                # are stored under the province's code, which then also
                # serves as the parent code of their districts.
                if city in self.direct_county_names:
                    row_code = province_code
                else:
                    row_code = city_code
                statements[(insert_sql, (city, 'CITY', row_code, province_code))] = None
                for area_code, area in area_dict.items():
                    print(area_code + "-区-" + area + "\n")
                    statements[(insert_sql, (area, 'DISTRICT', area_code, row_code))] = None

        sql_list = list(statements)
        print(str(sql_list))
        # Replace the table contents inside a single transaction.
        empty_sql = "delete from area_config"
        self.connect_mysql(empty_sql, sql_list)

    # Provinces
    def get_province(self):
        """Return a dict of division code -> province name from the map page.

        :raises ValueError: if the page has no <script> containing
            ``var json = ...;``.
        """
        pattern = re.compile(r"var json =(.*?);", re.MULTILINE | re.DOTALL)
        # ``string=`` is the current name of the legacy ``text=`` argument.
        script = self.soup.find("script", string=pattern)
        if script is None:
            raise ValueError("province JSON ('var json = ...;') not found in " + self.base_url)
        json_list = json.loads(pattern.search(script.text).group(1))
        return {item['quHuaDaiMa']: item['shengji'] for item in json_list}

    # Cities
    def get_city(self, shengji):
        """Return a dict of division code -> city name for province *shengji*."""
        rows = self._select_json("shengji=" + shengji)
        return {row['quHuaDaiMa']: row['diji'] for row in rows}

    # Districts
    def get_area(self, shengji, diji):
        """Return a dict of division code -> district name for city *diji*."""
        rows = self._select_json("shengji=" + shengji + "&diji=" + diji)
        return {row['quHuaDaiMa']: row['xianji'] for row in rows}

    def _select_json(self, form_body):
        """POST *form_body* to the selectJson endpoint and return the parsed JSON.

        :raises requests.RequestException: on timeout, connection failure or
            an HTTP error status.
        """
        post_headers = dict(self.headers)
        post_headers['Content-Type'] = "application/x-www-form-urlencoded; charset=utf-8"
        # The body is sent as raw UTF-8 (not percent-encoded), which is the
        # request format the original code used against this site.
        response = requests.post(self.select_url, data=form_body.encode('UTF-8'),
                                 headers=post_headers, timeout=self.request_timeout)
        response.raise_for_status()
        return json.loads(response.content)

    def connect_mysql(self, empty_sql, sql_list):
        """Run *empty_sql* followed by every statement in *sql_list* atomically.

        :param empty_sql: SQL executed first (used to clear the table).
        :param sql_list: iterable whose items are either a plain SQL string or
            a ``(sql, params)`` pair executed as a parameterized query.

        On any error the failure is printed and the transaction rolled back;
        the exception is not re-raised.
        """
        cursor = self.db.cursor()
        try:
            cursor.execute(empty_sql)
            for statement in sql_list:
                if isinstance(statement, str):
                    cursor.execute(statement)
                else:
                    sql, params = statement
                    # The driver escapes the values, so quotes in scraped
                    # names can no longer break or alter the statement.
                    cursor.execute(sql, params)
            # Commit only after every statement succeeded.
            self.db.commit()
            print('=================================更新所有数据完成!=================================')
        except Exception as e:
            print('=================================更新失败!=================================')
            print(e)
            self.db.rollback()
        finally:
            cursor.close()

# Script entry point: constructing the class runs the whole job, because
# __init__ connects to MySQL, calls main() and closes the connection.
if __name__ == '__main__':
    allAreaDataNew()

代码执行成功后就可以查到中国所有省市区啦!:
在这里插入图片描述
特殊情况:“省直辖县级行政单位”和“自治区直辖县级行政单位”
在这里插入图片描述
部分省有特殊的“直辖县级行政单位”或“自治区直辖县级行政单位”
参考:https://baike.baidu.com/item/省直辖县级行政单位/6903180?fr=aladdin
遇到这种情况有点懵:这类“市”没有自己的区划代码,所以无法直接关联父子关系。
但是无妨,数据库设计中有 type 字段:直接把省级区划代码作为这个“市”的区划代码,再把同一个值作为其下属区的父级区划代码,这样后台用 type + 区划代码就可以判断关联关系。

PS:后续
1.项目本身是 Java 项目,需要用到中国的省市区数据;2.中国的省市区变化比较频繁(市级、区级的调整时有发生)。因此后期可以用 Jython 把这段脚本集成到 Java 项目中,并写一个 job 定时任务,定时更新数据库表。

参考文档

https://jingyan.baidu.com/article/d169e1867bd27f436611d829.html

  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值