# -*- coding:utf-8 -*-
import re
import pandas as pd
from bs4 import BeautifulSoup
import requests
'''
Workflow:
Beijing Municipal Education Commission middle-school list: each school has a "详情" (details) link on the right.
The detail page gives the former school name, the founding date of the current site, the postcode, the phone
number, whether boarding is offered, the school category, and the school's scale and main characteristics.
step 1: collect the detail-page link URLs (look for a pattern; with 600+ schools, a regular URL structure means a loop can do the work)
step 2: open each detail page and extract the fields above (school name, former name, founding date of the current site, postcode, phone, boarding, category, scale and main characteristics)
step 3: store the data
step 4: write the data to Excel
'''
# Collect the placeId of each school from the category list endpoint
def find_placeId(url):
    # Add a User-Agent header and the browser cookies so the request is not rejected
    # (requests were blocked here at first; see https://blog.csdn.net/weixin_43507410/article/details/113913822)
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'
    # Session cookie copied from the browser; it expires, so it may need to be refreshed
    cookie = dict(cookies_are='BMAP_SECKEY2=001d68c88b4a89c55e674c88f5c5cd8e72663455c29e749ba01c3469b5874a56f46ea22690dc7c523d9ab04328f08d63641d3c9cdce65874d6b562d22b05cbe31e2fc185b29a7de8ed7ce8c6ea2a0834a17fd36dd5133d51bf1c9781a35d93c6b153f33ced5c43b3e11c139f7aa62168fc63351afff64683d00c64ecf9f697a02d8d3d9d29e6297dd237111adb12da0aec75f76d05c47d2f6ac029bcf8a1518b6b2693415237a10f997d988e16abdc4760a39cfa5b4a43288b100ccd5ae19d5d67d2121da519118a784aa895961fdd74440329ce44c037b94dfaa65c65d4dd29; _va_id=c7414a3a63766174.1632395467.6.1632918181.1632918181.; _va_ses=*; SECKEY_CID2=d1e5d9f0bc28c4c3da240a700c07e219562c1c13; __jsluid_s=096743c5ed6917320365a4ea610f3ce1; __jsluid_h=398619b3af25f29ad0e6cf8ba9aabb37; _trs_uv=ku5djmvl_1017_6atj; yfx_c_g_u_id_10008822=_ck21092918371319870262375735467; yfx_f_l_v_t_10008822=f_t_1632911833981__r_t_1632911833981__v_t_1632911833981__r_c_0')
    head = {"User-Agent": user_agent}
    # Fetch the response body
    contents = requests.get(url, cookies=cookie, headers=head).text
    pattern = re.compile('"placeId":"(.*?)","placeName":.*?')
    placeId = re.findall(pattern, contents)
    print(placeId)
    # Example detail page: https://map.beijing.gov.cn/place?placeId=5ba765b97e4e7316d93853a2&categoryId=zx
    # Each school's detail-page URL differs only in its placeId, so collecting the placeIds
    # lets a loop fetch the required information from every detail page.
    return placeId
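# Alternative sketch (not called by main below): the list endpoint appears to return JSON (the URL
# ends in .json and the regex above matches JSON-style keys), so the placeIds could also be collected
# with the json module instead of a regular expression. The exact nesting of the response is not
# assumed here; the helper walks the parsed structure and gathers every "placeId" value it encounters.
def find_placeId_json(url, cookie, head):
    import json
    data = json.loads(requests.get(url, cookies=cookie, headers=head).text)
    place_ids = []

    def walk(node):
        # Visit dicts and lists recursively and collect any "placeId" values
        if isinstance(node, dict):
            for key, value in node.items():
                if key == 'placeId':
                    place_ids.append(value)
                else:
                    walk(value)
        elif isinstance(node, list):
            for item in node:
                walk(item)

    walk(data)
    return place_ids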
# Visit each school's detail page and collect its fields
def get_detail(placeId):
    info = []
    for placeid in placeId:
        middle_school = {}
        middle_school['placeid'] = placeid
        url = "https://map.beijing.gov.cn/place?placeId={}&categoryId=zx".format(placeid)
        print(url)
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'
        # Session cookie copied from the browser; it expires, so it may need to be refreshed
        cookie = dict(cookies_are='_va_id=c7414a3a63766174.1632395467.7.1632933205.1632930730.; _va_ses=*; BMAP_SECKEY2=001d68c88b4a89c55e674c88f5c5cd8e72663455c29e749ba01c3469b5874a56f46ea22690dc7c523d9ab04328f08d63641d3c9cdce65874d6b562d22b05cbe333274af0015fe00d5c933791c5422aa7b0a4ac296420ca33db6485df5821057baea8c51f101567a770b8d2406e8818b11388bb4b46f1e1ceab4b8f8bf2bbcb581ab1af54b970b3a94e049cfae54889863322bc4d8c84b82ed71e2f5e1b5b86fe6fe3d9c2211432cd47612c58a01d6ab6077a22a3ca40735c2118a9c32d60f3f29e9a109b67063da2eefd0bdf5d7ee9d77d7d9e4bff214ac117b85307e57f1b25; SECKEY_CID2=75f2f1a7d5e47a775570dd885af66ab22a4c44d4; __jsluid_s=096743c5ed6917320365a4ea610f3ce1; __jsluid_h=398619b3af25f29ad0e6cf8ba9aabb37; _trs_uv=ku5djmvl_1017_6atj; yfx_c_g_u_id_10008822=_ck21092918371319870262375735467; yfx_f_l_v_t_10008822=f_t_1632911833981__r_t_1632911833981__v_t_1632911833981__r_c_0')
        head = {"User-Agent": user_agent}
        html = requests.get(url, cookies=cookie, headers=head).text
        soup = BeautifulSoup(html, 'html.parser')
        # School name
        name = soup.find('dt').text
        print(name)
        # School details: one <td> cell per field
        detail = soup.find_all('td')
        # Data update time, e.g. <dd style="width: 998px;margin: auto;">数据来源:市教委 更新时间:2018-09-23</dd>
        update_raw = soup.find_all('dd')[2].text
        print(update_raw)
        try:
            middle_school['学校名称'] = name
            middle_school['办公地址'] = detail[0].text
            middle_school['曾用名称'] = detail[1].text
            middle_school['本校址创办日期'] = detail[2].text
            middle_school['邮编'] = detail[3].text
            middle_school['电话'] = detail[4].text
            middle_school['是否有寄宿'] = detail[5].text
            middle_school['学校类别'] = detail[6].text
            middle_school['办学规模和主要特色'] = detail[7].text
            middle_school['数据更新时间'] = update_raw
        except IndexError:
            # Some detail pages have fewer <td> cells than expected; record the failure and keep going
            print('{} failed'.format(placeid))
        print(middle_school)
        info.append(middle_school)
    return info
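# Optional sketch: with 600+ detail pages the site may reject rapid-fire requests (the list request
# above was already blocked once until headers were added). Pausing briefly between requests is one
# way to be gentler on the server; the one-second default is an arbitrary choice, not a value the
# site is known to require.
def polite_get(url, cookie, head, delay=1.0):
    import time
    time.sleep(delay)
    return requests.get(url, cookies=cookie, headers=head)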
def main():
    url = 'https://map.beijing.gov.cn/api/place_list_for_category.json?categoryId=zx'
    placeid = find_placeId(url)
    info = get_detail(placeid)
    dataframe = pd.DataFrame(info)
    # Write the collected records to Excel (the openpyxl package is needed for .xlsx output)
    dataframe.to_excel('/Users/liusongyue/Onedrive_personal/OneDrive/RP/population/middle_school_Beijing.xlsx')


if __name__ == '__main__':
    main()
The full code is shown above.
The scraped fields are the school name, office address, former name, founding date of the current site, postcode, phone number, whether boarding is offered, school category, school scale and main characteristics, and the data update time.
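As a quick sanity check (a minimal sketch, assuming the output path used in main and that openpyxl is installed), the resulting file can be read back with pandas:

import pandas as pd

df = pd.read_excel('/Users/liusongyue/Onedrive_personal/OneDrive/RP/population/middle_school_Beijing.xlsx')
print(df.shape)             # (number of schools scraped, number of columns)
print(df.columns.tolist())  # the field names written by get_detail, plus the index column added by to_excel
print(df.head())            # first few records for a visual inspection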