# Part 1: build the question bank and resolve the image URLs.
#
# Flat script: loads the mock-exam page in a browser, captures the response
# that carries the `var ExamCnts = ...;` payload, downloads every question
# image, and writes one row per question to an Excel workbook.
import json
import os
import re

import pandas as pd
import requests
from DrissionPage._pages.chromium_page import ChromiumPage

# How many times the exam page is loaded; the questions captured on every
# load are appended to the same question bank.
ROUNDS = 1
EXAM_URL = 'https://www.jsyks.com/kms-fzks'
# Value of the 'tv' field when a question has no image attached.
NO_IMAGE_PATH = '/tkimg_files/source/'
IMAGE_HOST = 'https://tkimg.mnks.cn/i/'
IMAGE_DIR = './img'
# Seconds to wait for an image download before giving up on it.
REQUEST_TIMEOUT = 10


def _webp_url(image_path):
    """Return the image-host URL for a ``.jpg``/``.gif`` path, else ``None``.

    Only the trailing extension is swapped for ``.webp/jsyks``; a file name
    that happens to contain "jpg" or "gif" elsewhere is left untouched.
    """
    file_name = image_path.split('/')[-1]
    for ext in ('.jpg', '.gif'):
        if file_name.endswith(ext):
            return IMAGE_HOST + file_name[:-len(ext)] + '.webp/jsyks'
    return None


def _download(url, dest):
    """Save the content at *url* to *dest*; return ``True`` on success.

    A timeout, connection failure or HTTP error status is reported and
    skipped so that one bad image neither aborts the run nor leaves an
    error page saved under an image file name.
    """
    try:
        res = requests.get(url, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as err:
        print(f'image download failed: {url} ({err})')
        return False
    with open(dest, 'wb') as fh:
        fh.write(res.content)
    return True


page = ChromiumPage()
os.makedirs(IMAGE_DIR, exist_ok=True)

questions = []
for _ in range(ROUNDS):
    # Start capturing before navigating so the matching response is not
    # missed ('kaoshiti/' is presumably a URL fragment -- confirm against the
    # DrissionPage listener documentation).
    page.listen.start('kaoshiti/')
    page.get(EXAM_URL)
    resp = page.listen.wait()
    info = resp.response.body
    # The body embeds the questions as a JSON literal assigned to ExamCnts.
    kaoshitiku = re.findall(r'var ExamCnts = (.*?);', info)[0]
    for t in json.loads(kaoshitiku):
        tm = t['tm']
        # NOTE(review): the first 10 characters of 'tm' name the image file,
        # so two questions sharing that prefix overwrite each other's image,
        # and characters that are illegal in file names would make open()
        # fail. Kept as-is because consumers of the workbook may rely on
        # this naming -- confirm before changing.
        tm1 = tm[:10]
        print(tm1)

        row = {'tm': tm, 'dalist': t['da']}
        url = t['tv']
        if url == NO_IMAGE_PATH:
            row['newurl'] = '无'
        else:
            newurl = _webp_url(url)
            # Paths with any other extension get neither URL nor file path.
            if newurl is not None:
                row['newurl'] = newurl
                if _download(newurl, f'{IMAGE_DIR}/{tm1}.jpg'):
                    # Raw f-string: the backslashes are literal path
                    # separators, not escape sequences.
                    row['filepath'] = rf'file:///D:\PycharmProjects\pythonProject2024\PYTHON基础知识\DAY 15 自动答题系统\img\{tm1}.jpg'
        questions.append(row)

df = pd.DataFrame(questions)
# The default index column is written on purpose: it becomes the first column
# of the workbook.
df.to_excel("科目四考试题库03.xlsx")
二、数据处理:相同的题目只保留第一个
import pandas as pd

# Load the question bank workbook.
df = pd.read_excel("科目四考试题库03.xlsx")

# Rows count as duplicates when they agree on every column except the first
# one (presumably the saved row index, which differs on every row -- confirm);
# only the first occurrence of each group is kept.
df = df.drop_duplicates(subset=df.columns[1:], keep='first')

# Write the de-duplicated bank to a new workbook without an index column.
df.to_excel('科目四考试题库04.xlsx', index=False)