- 通过开发者工具找到试题链接地址;
- 对试题链接的url进行分析,发现index参数是试题id,r参数是一个随机数;可使用range或者excel批量生成全部试题的链接;
- 对json数据进行字段分析
- 我这里分开写了两个脚本,一个是获取数据一个是转成excel,本文主要为多进程获取数据
- 开发环境python3.9.1/windows10/vscode
-
# coding: utf-8
"""Download question JSON from a list of URLs using a process pool.

Each non-empty line of ``URL_LIST_PATH`` is a URL without its scheme.  Every
URL is fetched in a worker process; the parsed JSON of each successful
response is appended as one line to ``OUTPUT_PATH``.
"""
import json
from concurrent.futures import ProcessPoolExecutor

import requests

# Example of a full question URL (the list file stores it without "http://"):
# http://mnks.jxedt.com/get_question?r=0.5376675619396274&index=3
URL_LIST_PATH = 'D:/YYFX/ip.txt'
OUTPUT_PATH = 'data.json'
MAX_WORKERS = 20

# Request headers that imitate a desktop browser.
hea = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}


def load_urls(path=URL_LIST_PATH):
    """Return the non-empty, whitespace-stripped lines of *path*.

    Blank lines are skipped so that no request is issued for an empty URL.
    """
    with open(path, 'r') as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]


def get_page(url):
    """Fetch ``http://<url>`` and return the response body as text.

    Runs inside a worker process.

    Args:
        url: Address without the scheme, as stored in the URL list file.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: The request failed or the server answered
            with an HTTP error status.
        UnicodeDecodeError: The body is not valid UTF-8.
    """
    # verify=False is kept from the original script; it only matters for HTTPS.
    response = requests.get('http://%s' % url, headers=hea, timeout=30, verify=False)
    # Reject HTTP error pages here instead of trying to parse them as JSON.
    response.raise_for_status()
    # The body arrives as bytes; decode it to a string.
    return response.content.decode('utf-8')


def read_data(future, *args, **kwargs):
    """Done-callback: parse one downloaded page and append it to the output.

    Args:
        future: The finished future returned by ``pool.submit(get_page, url)``.
        *args, **kwargs: Accepted and ignored.
    """
    if future.cancelled():
        return
    # An exception raised in a done-callback is only logged by
    # concurrent.futures, so failures are reported explicitly here.
    error = future.exception()
    if error is not None:
        print('download failed: %r' % (error,))
        return
    try:
        state = json.loads(future.result())
    except ValueError as exc:
        print('response is not valid JSON: %s' % exc)
        return
    print(state)
    with open(OUTPUT_PATH, 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the saved file.
        f.write(json.dumps(state, ensure_ascii=False) + '\n')


def main():
    """Submit every listed URL to the pool and wait for all downloads."""
    urls = load_urls()
    # The pool is created here rather than at import time: with the spawn
    # start method each worker re-imports this module, and module-level
    # set-up would be repeated in every worker.
    with ProcessPoolExecutor(MAX_WORKERS) as pool:
        for url in urls:
            pool.submit(get_page, url).add_done_callback(read_data)
    # Leaving the ``with`` block waits for all pending downloads.


if __name__ == '__main__':
    main()
python3通过requests多进程获取驾校一点通试题库
最新推荐文章于 2021-06-18 20:29:34 发布