Selenium获取PTA平台所有题集和答案存为json文件

摘星喵Pro
已于 2022-08-26 16:21:01 修改
阅读量5.7k
点赞数 6
分类专栏： PTA 文章标签： python
于 2021-06-07 08:27:52 首次发布
转载请注明原创链接
本文链接：https://blog.csdn.net/a2272062968/article/details/117647849
版权
PTA 专栏收录该内容
18 篇文章
订阅专栏
注意：代码中导入的 cv2 库对应的 pip 包名是 opencv-python（pip install opencv-python）。
在这里插入图片描述
使用方法：在代码最下方的 url 列表中填写要爬取的题集链接，且该题集必须已经提交完毕。
用于保存结果的 json 文件需要自己事先建好，文件内容为一对花括号 {}。
网站的网页结构变了，部分 css 类名也变了，代码已做了相应的小改动。
爬取代码 2021-12-12更新可用
import json
import os
import time
import traceback

import cv2
import numpy
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# 定义全局变量请求网页之前等待的时间，防止请求过快被拒绝
my_time = 3.5


def login_PTA(my_account, my_password):
    """Fill in the PTA login form, submit it and retry the slider captcha.

    Uses the module-level ``web`` driver, which must already be showing the
    login page, and the module-level ``login_url`` to detect success: the
    login is considered done as soon as the browser URL differs from it.
    The captcha is attempted at most five times; if every attempt fails the
    function returns silently while still on the login page.

    Args:
        my_account: Text typed into the first input of the login form.
        my_password: Text typed into the second input of the login form.
    """
    form_xpath = '/html/body/div[1]/div[2]/div/div[2]/form'

    # Type the account and password into the two form inputs.
    account = web.find_element(By.XPATH, form_xpath + '/div[1]/div/div/div[1]/div/div/div/input')
    password = web.find_element(By.XPATH, form_xpath + '/div[1]/div/div/div[2]/div/div/div/input')
    account.send_keys(my_account)
    password.send_keys(my_password)

    # Click the login button, then the element nested inside it.
    web.find_element(By.XPATH, form_xpath + '/div[2]/button').click()
    web.find_element(By.XPATH, form_xpath + '/div[2]/button/div/div').click()
    print("ok")

    for _ in range(5):
        # Give the captcha time to load; tune the delay to the network speed.
        time.sleep(3)
        # A changed URL means the login has gone through.
        if web.current_url != login_url:
            break
        cracking_captcha()


def cracking_captcha():
    """Try once to solve the slider captcha shown by the module-level ``web``.

    Downloads the background and the draggable-piece images to ``bg.jpg`` and
    ``front.jpg`` in the working directory, locates the piece inside the
    background with OpenCV template matching and drags the slider by the
    matching horizontal offset. The match is not always exact, so callers are
    expected to check the outcome and retry.

    Raises:
        requests.exceptions.RequestException: If an image cannot be downloaded
            within the timeout.
    """
    captcha_xpath = '/html/body/div[3]/div[2]/div/div/div[2]/div'

    # Image URLs: img[1] is the background, img[2] the draggable piece.
    bg_img_src = web.find_element(
        By.XPATH, captcha_xpath + '/div[1]/div/div[1]/img[1]').get_attribute('src')
    front_img_src = web.find_element(
        By.XPATH, captcha_xpath + '/div[1]/div/div[1]/img[2]').get_attribute('src')

    # Save both images to disk; the timeout keeps a stalled download from
    # hanging the whole login forever.
    for file_name, img_src in (("bg.jpg", bg_img_src), ("front.jpg", front_img_src)):
        with open(file_name, mode="wb") as f:
            f.write(requests.get(img_src, timeout=10).content)

    # Load both images and reduce them to single-channel grayscale.
    bg = cv2.cvtColor(cv2.imread("bg.jpg"), cv2.COLOR_BGR2GRAY)
    front = cv2.cvtColor(cv2.imread("front.jpg"), cv2.COLOR_BGR2GRAY)
    # Keep only the rows of the piece image that contain a non-zero pixel.
    front = front[front.any(1)]

    # Find the position where the piece matches the background best.
    result = cv2.matchTemplate(bg, front, cv2.TM_CCOEFF_NORMED)
    # unravel_index yields (row, column); the column is the horizontal position.
    _row, col = numpy.unravel_index(numpy.argmax(result), result.shape)

    # Drag the slider handle horizontally by the scaled column offset.
    # NOTE(review): 0.946 looks like an empirical ratio between the downloaded
    # image and its on-page size -- confirm before changing it.
    slider = web.find_element(By.XPATH, captcha_xpath + '/div[2]/div[2]')
    ActionChains(web).drag_and_drop_by_offset(slider, xoffset=col // 0.946, yoffset=0).perform()


def get_question_type_url(headers_collection_url):
    """Return the URL of each question-type tab of one problem set.

    Waits ``my_time`` seconds, opens ``headers_collection_url`` in the
    module-level ``web`` driver and reads the links of the question-type bar.

    Args:
        headers_collection_url: URL of a problem-set page.

    Returns:
        A dict with the keys ``single_choice_url``, ``judgment_url``,
        ``fill_in_the_blanks_url``, ``program_fill_in_the_blanks_url``,
        ``function_url`` and ``programming_url``. A value is the tab's
        ``href``, or ``""`` when the problem set has no tab of that type.
    """
    time.sleep(my_time)
    web.get(headers_collection_url)

    # Tab label as displayed on the page -> key in the returned dict.
    label_to_key = {
        '单选题': "single_choice_url",
        '判断题': "judgment_url",
        '填空题': "fill_in_the_blanks_url",
        '程序填空题': "program_fill_in_the_blanks_url",
        '函数题': "function_url",
        '编程题': "programming_url",
    }
    question_type_dict = dict.fromkeys(label_to_key.values(), "")

    type_bar = web.find_element(By.CSS_SELECTOR, "[class='pc-h container_3U5RB pc-gap-default']")
    for tab in type_bar.find_elements(By.CSS_SELECTOR, "a"):
        label = tab.find_element(By.CSS_SELECTOR, "[class='pc-text-raw']").text
        key = label_to_key.get(label)
        # Tabs with any other label are ignored.
        if key is not None:
            question_type_dict[key] = tab.get_attribute('href')

    return question_type_dict


def get_judgment(judgment_url):
    """Scrape the true/false questions of one problem set.

    Waits ``my_time`` seconds, opens ``judgment_url`` in the module-level
    ``web`` driver and reads every question on the page. The stored answer is
    corrected automatically: when the page does not report the submission as
    correct, the opposite of the submitted choice is recorded.

    Args:
        judgment_url: URL of the true/false tab of a problem set.

    Returns:
        A dict mapping question text to ``"T"`` or ``"F"``. Questions that
        could not be read are skipped (and reported on stdout).
    """
    time.sleep(my_time)
    web.get(judgment_url)
    judgment_question_list = web.find_elements(By.CSS_SELECTOR, "[class='pc-x pc-dtfd-ipt min-w-0']")
    questions_dict = {}
    success_num = 0
    fail_num = 0
    for judgment in judgment_question_list:
        try:
            question = judgment.find_element(
                By.CSS_SELECTOR, "[class='rendered-markdown']").find_element(By.XPATH, 'p').text
            # State of the first radio input of the question.
            # NOTE(review): assumed to be the "T" option -- confirm on the page.
            is_select_T = judgment.find_element(
                By.CSS_SELECTOR, "[class='mr-2 mt-1 focus:outline-none']").is_selected()
            verdict = judgment.find_element(By.CSS_SELECTOR, "[class='pc-text inline']").find_element(
                By.CSS_SELECTOR, "[class='pc-text-raw']").text

            # A correct submission keeps the submitted choice; anything else
            # flips it.
            submitted_correct = verdict == "答案正确"
            answer = "T" if submitted_correct == is_select_T else "F"

            print(question)
            print(answer)
            questions_dict[question] = answer
            success_num += 1
        except Exception:
            # Best effort: report the problem and carry on with the next question.
            traceback.print_exc()
            fail_num += 1
            print(f"当前题目获取失败, 上一题序号: {success_num}-序号, 当前题集:{judgment_url}, 程序跳过该题继续执行")
            continue
    print(f"判断题--题集: {judgment_url}获取题目数量--成功: {success_num} 失败: {fail_num}")
    return questions_dict


def get_single_choice(single_choice_url):
    """Scrape the single-choice questions of one problem set.

    Waits ``my_time`` seconds, opens ``single_choice_url`` in the module-level
    ``web`` driver and reads every question on the page.

    Args:
        single_choice_url: URL of the single-choice tab of a problem set.

    Returns:
        A dict mapping question text to ``[options, answer, verdict]`` where
        ``options`` is the list of option texts, ``answer`` is the text of the
        selected option (``""`` if none is selected) and ``verdict`` is the
        grading text shown by the page. Questions that could not be read are
        skipped (and reported on stdout).
    """
    time.sleep(my_time)
    web.get(single_choice_url)
    single_choice_list = web.find_elements(By.CSS_SELECTOR, "[class='pc-x pc-dtfd-ipt min-w-0']")
    questions_dict = {}
    success_num = 0
    fail_num = 0
    for single_choice in single_choice_list:
        try:
            question = single_choice.find_element(By.CSS_SELECTOR, "[class='pc-x min-w-0 shrink']").text

            option_labels = single_choice.find_elements(
                By.CSS_SELECTOR,
                "[class='flex items-start p-2 rounded hover:bg-gray-100 focus:bg-gray-200 focus-within:bg-gray-100 focus-within:ring focus-within:ring-blue-300 multiple-choice-label min-w-0 items-baseline']")
            option = []
            answer = ""
            for option_label in option_labels:
                this_answer = option_label.find_element(By.CSS_SELECTOR, "[class='rendered-markdown']").text
                option.append(this_answer)
                # The selected radio input marks the submitted answer.
                if option_label.find_element(
                        By.CSS_SELECTOR, "[class='mr-2 mt-1 focus:outline-none']").is_selected():
                    answer = this_answer
            is_true = single_choice.find_element(
                By.CSS_SELECTOR, "[class='pc-text inline']").find_element(
                By.CSS_SELECTOR, "[class='pc-text-raw']").text
            print(question)
            print(option)
            print(answer)
            print(is_true)

            questions_dict[question] = [option, answer, is_true]
            success_num += 1
        except Exception:
            # Best effort: report the problem and carry on with the next
            # question. KeyboardInterrupt/SystemExit are deliberately not caught.
            traceback.print_exc()
            fail_num += 1
            print(f"当前题目获取失败, 上一题序号: {success_num}-序号, 当前题集:{single_choice_url}, 程序跳过该题继续执行")
            continue

    print(f"选择题--题集: {single_choice_url}获取题目数量--成功: {success_num} 失败: {fail_num}")
    return questions_dict


def get_fill_or_program_in_the_blanks(fill_or_program_in_the_blanks_url):
    """Scrape the fill-in-the-blank questions of one problem set.

    Used for both the plain and the program fill-in-the-blank tabs. Waits
    ``my_time`` seconds, opens the URL in the module-level ``web`` driver and
    reads every question on the page.

    Args:
        fill_or_program_in_the_blanks_url: URL of a fill-in-the-blank tab.

    Returns:
        A dict mapping question text to ``[answers, verdict]`` where
        ``answers`` is the list of values of the question's input fields and
        ``verdict`` is the grading text shown by the page. Questions that
        could not be read are skipped (and reported on stdout).
    """
    time.sleep(my_time)
    web.get(fill_or_program_in_the_blanks_url)
    fill_or_program_in_the_blanks_list = web.find_elements(
        By.CSS_SELECTOR, "[class='pc-x pc-dtfd-ipt min-w-0']")
    questions_dict = {}
    success_num = 0
    fail_num = 0
    for fill_or_program_in_the_blanks in fill_or_program_in_the_blanks_list:
        try:
            # The rendered question holds both the text and the blank inputs.
            rendered = fill_or_program_in_the_blanks.find_element(
                By.CSS_SELECTOR, "[class='rendered-markdown']")
            question = rendered.text
            answer = [blank.get_attribute("value")
                      for blank in rendered.find_elements(By.CSS_SELECTOR, "input")]
            is_true = fill_or_program_in_the_blanks.find_element(
                By.CSS_SELECTOR, "[class='pc-text inline']").find_element(
                By.CSS_SELECTOR, "[class='pc-text-raw']").text

            print(question)
            print(answer)
            print(is_true)

            questions_dict[question] = [answer, is_true]
            success_num += 1
        except Exception:
            # Best effort: report the problem and carry on with the next
            # question. KeyboardInterrupt/SystemExit are deliberately not caught.
            traceback.print_exc()
            fail_num += 1
            print(f"当前题目获取失败, 上一题序号: {success_num}-序号, 当前题集:{fill_or_program_in_the_blanks_url}, 程序跳过该题继续执行")
            continue

    print(f"填空/程序填空题--题集: {fill_or_program_in_the_blanks_url}获取题目数量--成功: {success_num} 失败: {fail_num}")
    return questions_dict


def get_function_or_programming(function_or_programming_url):
    """Scrape the function or programming problems of one problem set.

    Waits ``my_time`` seconds, opens the problem list in the module-level
    ``web`` driver, collects the link of every listed problem and then visits
    each problem page to read its title, statement and the code currently in
    the editor.

    Args:
        function_or_programming_url: URL of a function/programming tab.

    Returns:
        A dict mapping problem title to ``[problem_content, answer]``.
        Problems that still fail after three attempts are skipped and counted
        as failures in the summary line printed at the end.
    """
    time.sleep(my_time)
    web.get(function_or_programming_url)
    questions_dict = {}

    # Collect every problem link first: navigating away would invalidate the
    # row elements of the list page.
    problem_rows = web.find_elements(
        By.XPATH, '/html/body/div/div[2]/div[1]/div/div[2]/div[2]/div/div[1]/table//tbody/tr')
    problems_href = [row.find_element(By.XPATH, 'td[3]/a').get_attribute('href')
                     for row in problem_rows]

    success_num = 0
    fail_num = 0
    for problem in problems_href:
        # Up to three attempts per problem, because requests that come too
        # fast may be rejected by the site.
        for _ in range(3):
            try:
                time.sleep(my_time)  # throttle; tune to the network speed
                web.get(problem)
                problem_title = web.find_element(
                    By.CSS_SELECTOR, "[class='text-center text-light text-base font-bold my-4']").text
                answer = web.find_element(
                    By.CSS_SELECTOR, "[class='codeEditor_2kCM6 grow shrink']").find_element(
                    By.CSS_SELECTOR, 'textarea').get_attribute('value')
                problem_content = web.find_element(By.CSS_SELECTOR, "[class='rendered-markdown']").text

                questions_dict[problem_title] = [problem_content, answer]

                print(problem_title)
                print(problem_content)
                print(answer)

                success_num += 1
                break
            except Exception:
                continue
        else:
            # Reached only when no attempt hit ``break``: the problem failed.
            fail_num += 1

    print(f"函数/编程题--题集: {function_or_programming_url}获取题目数量--成功: {success_num} 失败: {fail_num}")
    return questions_dict


def _merge_into_json_file(file_name, new_entries):
    """Merge ``new_entries`` into the JSON object stored in ``file_name``."""
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # A missing or empty file counts as an empty JSON object. The path is
    # checked as given, so the check works on every platform.
    file_dict = {}
    if os.path.exists(file_name):
        with open(file_name, 'r', encoding="utf-8") as f:
            content = f.read()
        if content.strip():
            file_dict = json.loads(content)

    file_dict.update(new_entries)

    with open(file_name, mode='w', encoding="utf-8") as f:
        f.write(json.dumps(file_dict, ensure_ascii=False))
    print("-----当前题记长度-----------------------------------------------------" + str(len(new_entries)))
    print("-----写入文件--总长度-------------------------------------------------" + str(len(file_dict)))


def write_question_file(url_list, judgment_file_name, single_choice_file_name, fill_in_the_blanks_name,
                        program_fill_in_the_blanks_name, function_name, programming_name):
    """Scrape every problem set in ``url_list`` and save it by question type.

    For each problem set the question-type tabs are looked up, every tab that
    exists is scraped, and the result is merged into that type's JSON file.
    Entries already in a file are kept; entries with the same question text
    are overwritten.

    Args:
        url_list: Iterable of problem-set URLs.
        judgment_file_name: JSON file for true/false questions.
        single_choice_file_name: JSON file for single-choice questions.
        fill_in_the_blanks_name: JSON file for fill-in-the-blank questions.
        program_fill_in_the_blanks_name: JSON file for program
            fill-in-the-blank questions.
        function_name: JSON file for function problems.
        programming_name: JSON file for programming problems.
    """
    for url in url_list:
        this_question_type_dict = get_question_type_url(url)

        # (key of the tab URL, scraper for that tab, output file), in the
        # order the tabs are processed.
        steps = (
            ('judgment_url', get_judgment, judgment_file_name),
            ('single_choice_url', get_single_choice, single_choice_file_name),
            ('fill_in_the_blanks_url', get_fill_or_program_in_the_blanks, fill_in_the_blanks_name),
            ('program_fill_in_the_blanks_url', get_fill_or_program_in_the_blanks,
             program_fill_in_the_blanks_name),
            ('function_url', get_function_or_programming, function_name),
            ('programming_url', get_function_or_programming, programming_name),
        )
        for url_key, scrape, file_name in steps:
            type_url = this_question_type_dict[url_key]
            # An empty URL means the problem set has no tab of this type.
            if type_url:
                _merge_into_json_file(file_name, scrape(type_url))




if __name__ == '__main__':
    # Script entry point: log in to PTA, scrape the problem sets listed in
    # java_url_list_lxf and merge the results into the JSON files under
    # dataSql\. `web` and `login_url` are module-level names used by the
    # functions above.

    # Create the WebDriver object, using the Chrome browser driver.
    web = webdriver.Chrome(service=Service(r'C:\Users\Cat\AppData\Local\Google\Chrome\Application\chromedriver.exe'))
    try:
        web.implicitly_wait(5)
        login_url = 'https://pintia.cn/auth/login'
        # Open the login page in the browser.
        web.get(login_url)

        login_PTA('zzz@qq.com', 'xxx')

        # # Problem-set types (database course)
        # # 1 true/false; single choice; multiple choice (!!!); fill-in-the-blank
        # # 2 true/false; single choice; fill-in-the-blank
        # # 3 true/false; single choice
        # # 4 single choice
        # # 5 single choice
        # # 6 true/false; single choice
        # # 7 true/false; single choice; fill-in-the-blank
        # # 8 true/false; single choice
        # # 9 true/false; single choice
        # sql_url_list_walking = [
        #     'https://pintia.cn/problem-sets/1343789975057166336/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343794588401487872/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343799990153117696/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343806731523719168/problems/type/2',
        #     'https://pintia.cn/problem-sets/1343807501140754432/problems/type/2',
        #     'https://pintia.cn/problem-sets/1343808640402018304/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343811518420176896/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343798231569530880/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343819242272718848/problems/type/1'
        # ]
        #
        # # Problem-set types (java course)
        # # 1 true/false; single choice
        # # 2 true/false; single choice; fill-in-the-blank; function; programming
        # # 3 true/false; single choice; fill-in-the-blank; program fill-in-the-blank; function; programming
        # # 4 true/false; single choice; fill-in-the-blank; program fill-in-the-blank; function; programming
        # # 5 true/false; single choice; fill-in-the-blank; function; programming
        # # 6 true/false; single choice; fill-in-the-blank; program fill-in-the-blank; programming
        # # 7 true/false; single choice; fill-in-the-blank; programming
        # # 8 true/false; single choice; fill-in-the-blank; program fill-in-the-blank; function; programming
        # java_url_list_lgr = [
        #     'https://pintia.cn/problem-sets/1368832382463172608/problems/type/1',
        #     'https://pintia.cn/problem-sets/1368833022220361728/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369164346714021888/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369165326734123008/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369165872660537344/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369166179822002176/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369166486127828992/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369166803779248128/problems/type/1'
        # ]

        # Problem sets to scrape; each one must already have been submitted.
        java_url_list_lxf = [
            'https://pintia.cn/problem-sets/1468315811752116224/problems/type/1'
        ]

        write_question_file(java_url_list_lxf, "dataSql\\judgment.json", "dataSql\\single_choice.json",
                            "dataSql\\fill_in_the_blanks.json", "dataSql\\program_fill_in_the_blanks.json",
                            "dataSql\\function.json", "dataSql\\programming.json")
    finally:
        # Always shut down the browser and the chromedriver process, even
        # when scraping fails part-way.
        web.quit()