高考志愿-各校专业成绩 (Gaokao applications: admission scores by school and major)
The data comes from 掌上高考. This program is for learning purposes only and must not be used in any commercial scheme.
Preface
In short, I was just bored.
Environment setup
- Python 3.8
- Install the required Python libraries:
  pip install xlsxwriter
  pip install requests
  pip install selenium
  pip install bs4
- Download the chromedriver.exe that matches the Chrome version installed on your machine; the first three segments of the two version numbers must be identical (see the sketch after this list for a quick way to verify the match).
  (Screenshot of the local Chrome version.) As shown in the figure, I need to download version 104.0.5112.x.
- Edit line 97 of getScore.py and set executable_path to the path of the downloaded chromedriver.exe.
- Run getScore.py and enter the major you want to query; when crawling finishes, an Excel workbook is exported automatically.
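If you are not sure the driver you downloaded matches, the minimal sketch below can help; it reuses the same placeholder driver path that appears later in getScore.py (adjust it to your own). Selenium reports both the browser version and the driver version, and their first three segments should agree.

from selenium import webdriver

# Quick version check; the chromedriver.exe path below is a placeholder
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_argument('--headless')
driver = webdriver.Chrome(executable_path=r'E:\Driver\chromedriver.exe', options=chromeOptions)

# 'browserVersion' is the installed Chrome, 'chromedriverVersion' is the driver build;
# their first three segments (e.g. 104.0.5112) should be identical
print(driver.capabilities['browserVersion'])
print(driver.capabilities['chrome']['chromedriverVersion'])
driver.quit()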
Implementation approach
Use selenium to drive a browser, read the information from the rendered pages, and simulate clicks to page through the results.
On 掌上高考 the URL of a major's page is built from the site's own special id, so the first step is to fetch the basic information (including that id) for all majors.
getJson.py
import json
import requests

subjectJson = {'subject': []}
# The list-of-majors API; the page number is spliced in between url1 and url2
url1 = 'https://api.eol.cn/web/api/?keyword=&level1=1&level2=&page='
url2 = '&size=30&sort=&uri=apidata/api/gkv3/special/lists&signsafe=111cd45f1f73e18c6d5cf51e3bcdf3d2'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
}
d = {
    'keyword': '',
    'level1': 1,
    'level2': '',
    'page': 26,
    'signsafe': 'f6a12f70ae234d735d2e19e1642c4724',
    'size': 30,
    'sort': '',
    'uri': 'apidata/api/gkv3/special/lists'
}

# The list spans 26 pages of 30 entries each
for i in range(26):
    url = url1 + str(i + 1) + url2
    result = requests.post(url, data=d, headers=header).json()['data']['item']
    for j in result:
        print(j['name'])
        subject = {
            'name': j['name'],
            'id': j['special_id'],
            'classify': j['level3_name'],
            'degree': j['degree']
        }
        subjectJson['subject'].append(subject)

# Persist the collected majors so getScore.py can look them up later
file = open('init.json', 'w+', encoding='utf-8')
file.write(json.dumps(subjectJson, indent=4, ensure_ascii=False))
file.close()
The results are written to a JSON file (init.json) so they can be read back later.
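For reference, a minimal sketch of reading the file back; it only touches the four fields that getJson.py stores for each major:

import json

# init.json has the shape {"subject": [{"name": ..., "id": ..., "classify": ..., "degree": ...}, ...]}
with open('init.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Print the first few majors to confirm the file was generated as expected
for entry in data['subject'][:5]:
    print(entry['name'], entry['id'], entry['classify'], entry['degree'])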
getScore.py
import json
import time
import xlsxwriter
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

# URL pattern: url1 + special id + url2 + year + url3
url1 = 'https://www.gaokao.cn/special/'
url2 = '?sort=2&special_type=3&year='
url3 = '&kelei=1'
year = ['2021', '2020', '2019', '2018', '2017']


def cookiesStrtoDir(cook: str):
    # Turn a raw cookie string into a dict (currently unused, kept for reference)
    cookiesDir = {}
    for i in cook.split('; '):
        cookiesDir[i.split('=')[0]] = i.split('=')[1]
    cookiesDir['name'] = ''
    cookiesDir['value'] = ''
    return cookiesDir


def getCode():
    # Return the fully rendered page source of the current browser tab
    html = driver.execute_script('return document.documentElement.outerHTML;')
    html = html.encode('utf-8').decode('utf-8')
    return html


def analysisCode(html):
    # Parse one results page; if the "no data" placeholder is visible, reload and retry
    htmlBs = BeautifulSoup(html, 'html.parser')
    if htmlBs.find(class_='noData')['style'] == 'display: block;':
        driver.refresh()
        time.sleep(5)
        html = getCode()
        analysisCode(html)
    else:
        schoolList = htmlBs.find_all(class_='name_des')
        for school in schoolList:
            schoolBs = BeautifulSoup(str(school), 'html.parser')
            schoolName = schoolBs.find(class_='float_l set_hoverl am_l').string
            schoolScore = schoolBs.find_all(class_='tag_item')
            # The last tag_item holds the lowest score / lowest rank
            schoolScore = schoolScore[len(schoolScore) - 1].string.split(':')[1]
            schoolDir[schoolName] = schoolScore
            print(schoolName + ':' + schoolScore)


def getWebNum(html):
    # Work out how many result pages there are for the current year
    htmlBs = BeautifulSoup(html, 'html.parser')
    if htmlBs.find(class_='noData')['style'] == 'display: block;' and len(
            htmlBs.find_all(class_='ant-pagination')) == 0:
        print('未找到页面,重新加载...')
        driver.refresh()
        time.sleep(7)
        html = getCode()
        return getWebNum(html)
    else:
        if 0 < len(htmlBs.find_all(class_='public_list_item public_tbl')) <= 10:
            return 1
        ui = htmlBs.find(class_='ant-pagination').find_all('li')
        # The second-to-last <li> of the pagination bar carries the last page number
        webNumber = int(ui[len(ui) - 2]['title'])
        return webNumber


def writeExcel():
    # One worksheet per year, two columns: school name and lowest score / lowest rank
    workbook = xlsxwriter.Workbook(subjectName + '.xlsx')
    for key, value in allSchoolDir.items():
        worksheet = workbook.add_worksheet(str(key))
        worksheet.activate()
        worksheet.write_row('A1', ('学校', '最低分/最低位次'))
        num = 2
        for i, j in value.items():
            worksheet.write_row('A' + str(num), (i, j))
            num += 1
    workbook.close()


if __name__ == '__main__':
    subjectName = str(input('请输入专业:'))
    subjectJson = json.loads(open('init.json', 'r', encoding='utf-8').read())
    isfind = False
    for subject in subjectJson['subject']:
        if subjectName == subject['name']:
            isfind = True
            print('初始化...')
            print('专业名称:' + subject['name'])
            print('专业分类:' + subject['classify'])
            print('专业学位:' + subject['degree'])
            # Configure Chrome
            chromeOptions = webdriver.ChromeOptions()
            # Suppress the driver's console logging
            chromeOptions.add_experimental_option('excludeSwitches', ['enable-logging'])
            chromeOptions.add_argument('--headless')
            chromeOptions.add_argument('--disable-gpu')
            driver = webdriver.Chrome(executable_path=r'E:\Driver\chromedriver.exe', options=chromeOptions)
            allSchoolDir = {}
            for y in year:
                schoolDir = {}
                yearUrl = url1 + str(subject['id']) + url2 + y + url3
                driver.get(url=yearUrl)
                html = getCode()
                webNum = getWebNum(html)
                # Retry until a page count has been obtained
                while webNum is None:
                    html = getCode()
                    webNum = getWebNum(html)
                print('开始爬取 ' + y + ' 信息...')
                for i in range(int(webNum)):
                    # Pagination buttons are numbered ant-pagination-item-1, -2, ...
                    classStr = 'ant-pagination-item.ant-pagination-item-' + str(i + 1)
                    if not (i == 0):
                        driver.find_element(By.CLASS_NAME, classStr).click()
                        time.sleep(1)
                    html = getCode()
                    analysisCode(html)
                allSchoolDir[y] = schoolDir
            writeExcel()
            break
    if not isfind:
        print('未找到该专业,请核对后重试...')
After the scores for the last five years have been collected, the Excel workbook is exported automatically: one worksheet per year, each listing the school and its lowest score / lowest rank.
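To spot-check the exported workbook, here is a minimal sketch; it assumes openpyxl is installed (it is not one of the libraries listed above) and uses a hypothetical major name for the file name:

from openpyxl import load_workbook

# Hypothetical file name: if the major entered was 计算机科学与技术, writeExcel()
# produces 计算机科学与技术.xlsx with one sheet per year
wb = load_workbook('计算机科学与技术.xlsx')
print(wb.sheetnames)

# Dump the school / lowest-score rows of the 2021 sheet
for row in wb['2021'].iter_rows(values_only=True):
    print(row)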
Final notes
- Chrome is used for the demonstration; you can use another browser instead, as long as you download the WebDriver that matches that browser and its version (see the sketch after this list).
- selenium drives the browser to visit 掌上高考.
- BeautifulSoup is used to parse the fetched pages.
- getJson.py generates init.json, which contains the basic information for every major.
- All data comes from 掌上高考; it may be incomplete, so please bear with me.
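As an example, a minimal sketch of switching the driver to Firefox; the geckodriver.exe path is a placeholder, and only the driver setup in getScore.py needs to change:

from selenium import webdriver

# Placeholder path: download the geckodriver release that matches your Firefox
firefoxOptions = webdriver.FirefoxOptions()
firefoxOptions.add_argument('--headless')
driver = webdriver.Firefox(executable_path=r'E:\Driver\geckodriver.exe', options=firefoxOptions)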