高考志愿-各校专业成绩 (Gaokao applications: admission scores by school and major)
The data comes from 掌上高考. This program is for learning purposes only and must not be used in any commercial scheme.
Preface
In short, I was just bored.
Environment setup
- Python 3.8
- Install the required Python libraries:
  pip install xlsxwriter
  pip install requests
  pip install selenium
  pip install bs4
- Download the chromedriver.exe that matches the Chrome version installed on your machine; the first three segments of the two version numbers must be identical (see the sketch after this list for a quick way to verify the match).
  (Screenshot of the local Chrome version.) As shown in the figure, I need to download version 104.0.5112.x.
- Edit line 97 of getScore.py and set executable_path to the path of the downloaded chromedriver.exe.
- Run getScore.py and enter the major you want to query; when crawling finishes, an Excel workbook is exported automatically.
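If you are not sure the driver you downloaded matches, the minimal sketch below can help; it reuses the same placeholder driver path that appears later in getScore.py (adjust it to your own). Selenium reports both the browser version and the driver version, and their first three segments should agree.

from selenium import webdriver

# Quick version check; the chromedriver.exe path below is a placeholder
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_argument('--headless')
driver = webdriver.Chrome(executable_path=r'E:\Driver\chromedriver.exe', options=chromeOptions)

# 'browserVersion' is the installed Chrome, 'chromedriverVersion' is the driver build;
# their first three segments (e.g. 104.0.5112) should be identical
print(driver.capabilities['browserVersion'])
print(driver.capabilities['chrome']['chromedriverVersion'])
driver.quit()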
Implementation approach
Use selenium to drive a browser, read the information from the rendered pages, and simulate clicks to page through the results.
On 掌上高考 the URL of a major's page is built from the site's own special id, so the first step is to fetch the basic information (including that id) for all majors.
getJson.py
import json
import requests

subjectJson = {'subject': []}
# The list-of-majors API; the page number is spliced in between url1 and url2
url1 = 'https://api.eol.cn/web/api/?keyword=&level1=1&level2=&page='
url2 = '&size=30&sort=&uri=apidata/api/gkv3/special/lists&signsafe=111cd45f1f73e18c6d5cf51e3bcdf3d2'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
}
d = {
    'keyword': '',
    'level1': 1,
    'level2': '',
    'page': 26,
    'signsafe': 'f6a12f70ae234d735d2e19e1642c4724',
    'size': 30,
    'sort': '',
    'uri': 'apidata/api/gkv3/special/lists'
}

# The list spans 26 pages of 30 entries each
for i in range(26):
    url = url1 + str(i + 1) + url2
    result = requests.post(url, data=d, headers=header).json()['data']['item']
    for j in result:
        print(j['name'])
        subject = {
            'name': j['name'],
            'id': j['special_id'],
            'classify': j['level3_name'],
            'degree': j['degree']
        }
        subjectJson['subject'].append(subject)

# Persist the collected majors so getScore.py can look them up later
file = open('init.json', 'w+', encoding='utf-8')
file.write(json.dumps(subjectJson, indent=4, ensure_ascii=False))
file.close()
The results are written to a JSON file (init.json) so they can be read back later.
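For reference, a minimal sketch of reading the file back; it only touches the four fields that getJson.py stores for each major:

import json

# init.json has the shape {"subject": [{"name": ..., "id": ..., "classify": ..., "degree": ...}, ...]}
with open('init.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Print the first few majors to confirm the file was generated as expected
for entry in data['subject'][:5]:
    print(entry['name'], entry['id'], entry['classify'], entry['degree'])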
getScore.py
import json
import time
import xlsxwriter
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

# URL pattern: url1 + special id + url2 + year + url3
url1 = 'https://www.gaokao.cn/special/'
url2 = '?sort=2&special_type=3&year='
url3 = '&kelei=1'
year = ['2021', '2020', '2019', '2018', '2017']


def cookiesStrtoDir(cook: str):
    # Turn a raw cookie string into a dict (currently unused, kept for reference)
    cookiesDir = {}
    for i in cook.split('; '):
        cookiesDir[i.split('=')[0]] = i.split('=')[1]
    cookiesDir['name'] = ''
    cookiesDir['value'] = ''
    return cookiesDir


def getCode():
    # Return the fully rendered page source of the current browser tab
    html = driver.execute_script('return document.documentElement.outerHTML;')
    html = html.encode('utf-8').decode('utf-8')
    return html


def analysisCode(html):
    # Parse one results page; if the "no data" placeholder is visible, reload and retry
    htmlBs = BeautifulSoup(html, 'html.parser')
    if htmlBs.find(class_='noData')['style'] == 'display: block;':
        driver.refresh()
        time.sleep(5)
        html = getCode()
        analysisCode(html)
    else:
        schoolList = htmlBs.find_all(class_='name_des')
        for school in schoolList:
            schoolBs = BeautifulSoup(str(school), 'html.parser')
            schoolName = schoolBs.find(class_='float_l set_hoverl am_l').string
            schoolScore = schoolBs.find_all(class_='tag_item')
            # The last tag_item holds the lowest score / lowest rank
            schoolScore = schoolScore[len(schoolScore) - 1].string.split(':')[1]
            schoolDir[schoolName] = schoolScore
            print(schoolName + ':' + schoolScore)


def getWebNum(html):
    # Work out how many result pages there are for the current year
    htmlBs = BeautifulSoup(html, 'html.parser')
    if htmlBs.find(class_='noData')['style'] == 'display: block;' and len(
            htmlBs.find_all(class_='ant-pagination')) == 0:
        print('未找到页面,重新加载...')
        driver.refresh()
        time.sleep(7)
        html = getCode()
        return getWebNum(html)
    else:
        if 0 < len(htmlBs.find_all(class_='public_list_item public_tbl')) <= 10:
            return 1
        ui = htmlBs.find(class_='ant-pagination').find_all('li')
        # The second-to-last <li> of the pagination bar carries the last page number
        webNumber = int(ui[len(ui) - 2]['title'])
        return webNumber


def writeExcel():
    # One worksheet per year, two columns: school name and lowest score / lowest rank
    workbook = xlsxwriter.Workbook(subjectName + '.xlsx')
    for key, value in allSchoolDir.items():
        worksheet = workbook.add_worksheet(str(key))
        worksheet.activate()
        worksheet.write_row('A1', ('学校', '最低分/最低位次'))
        num = 2
        for i, j in value.items():
            worksheet.write_row('A' + str(num), (i, j))
            num += 1
    workbook.close()


if __name__ == '__main__':
    subjectName = str(input('请输入专业:'))
    subjectJson = json.loads(open('init.json', 'r', encoding='utf-8').read())
    isfind = False
    for subject in subjectJson['subject']:
        if subjectName == subject['name']:
            isfind = True
            print('初始化...')
            print('专业名称:' + subject['name'])
            print('专业分类:' + subject['classify'])
            print('专业学位:' + subject['degree'])
            # Configure Chrome
            chromeOptions = webdriver.ChromeOptions()
            # Suppress the driver's console logging
            chromeOptions.add_experimental_option('excludeSwitches', ['enable-logging'])
            chromeOptions.add_argument('--headless')
            chromeOptions.add_argument('--disable-gpu')
            driver = webdriver.Chrome(executable_path=r'E:\Driver\chromedriver.exe', options=chromeOptions)
            allSchoolDir = {}
            for y in year:
                schoolDir = {}
                yearUrl = url1 + str(subject['id']) + url2 + y + url3
                driver.get(url=yearUrl)
                html = getCode()
                webNum = getWebNum(html)
                # Retry until a page count has been obtained
                while webNum is None:
                    html = getCode()
                    webNum = getWebNum(html)
                print('开始爬取 ' + y + ' 信息...')
                for i in range(int(webNum)):
                    # Pagination buttons are numbered ant-pagination-item-1, -2, ...
                    classStr = 'ant-pagination-item.ant-pagination-item-' + str(i + 1)
                    if not (i == 0):
                        driver.find_element(By.CLASS_NAME, classStr).click()
                        time.sleep(1)
                    html = getCode()
                    analysisCode(html)
                allSchoolDir[y] = schoolDir
            writeExcel()
            break
    if not isfind:
        print('未找到该专业,请核对后重试...')
After the scores for the last five years have been collected, the Excel workbook is exported automatically: one worksheet per year, each listing the school and its lowest score / lowest rank.
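To spot-check the exported workbook, here is a minimal sketch; it assumes openpyxl is installed (it is not one of the libraries listed above) and uses a hypothetical major name for the file name:

from openpyxl import load_workbook

# Hypothetical file name: if the major entered was 计算机科学与技术, writeExcel()
# produces 计算机科学与技术.xlsx with one sheet per year
wb = load_workbook('计算机科学与技术.xlsx')
print(wb.sheetnames)

# Dump the school / lowest-score rows of the 2021 sheet
for row in wb['2021'].iter_rows(values_only=True):
    print(row)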
Final notes
- Chrome is used for the demonstration; you can use another browser instead, as long as you download the WebDriver that matches that browser and its version (see the sketch after this list).
- selenium drives the browser to visit 掌上高考.
- BeautifulSoup is used to parse the fetched pages.
- getJson.py generates init.json, which contains the basic information for every major.
- All data comes from 掌上高考; it may be incomplete, so please bear with me.
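As an example, a minimal sketch of switching the driver to Firefox; the geckodriver.exe path is a placeholder, and only the driver setup in getScore.py needs to change:

from selenium import webdriver

# Placeholder path: download the geckodriver release that matches your Firefox
firefoxOptions = webdriver.FirefoxOptions()
firefoxOptions.add_argument('--headless')
driver = webdriver.Firefox(executable_path=r'E:\Driver\geckodriver.exe', options=firefoxOptions)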