最近有个需求,在爬取数据时碰到了极验(Geetest)验证码。准备使用 YOLOv8 训练识别模型,但没有训练集,所以先用下面的脚本批量下载验证码图片作为训练数据。
代码基本来自这位大佬的项目,我只稍作了改动:
https://github.com/cycyup/crack_geetest
import requests
import time
import json
def _timestamp_ms():
    """Return the current Unix time in whole milliseconds, as a string."""
    return str(int(round(time.time() * 1000)))


def _parse_jsonp(text):
    """Decode the JSON payload wrapped in a JSONP response like ``cb({...})``.

    Raises:
        ValueError: if *text* has no ``(...)`` wrapper, or the wrapped
            payload is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    start = text.find('(')
    end = text.rfind(')')
    if start == -1 or end <= start:
        raise ValueError(f"unexpected JSONP response: {text[:100]!r}")
    return json.loads(text[start + 1:end])


def geetest_crack(*, timeout=10):
    """Download one Geetest click-captcha image into the working directory.

    Steps performed, in order:
      1. Fetch ``gt`` / ``challenge`` from the geetest.com register endpoint.
      2. Call ``get.php`` and ``ajax.php``; their responses are not used,
         the requests are only issued before the captcha data is asked for.
      3. Call ``get.php`` with ``is_next=true&type=click`` and read the
         ``sign`` and ``pic`` fields from the JSONP payload's ``data``.
      4. Download the picture from ``static.geetest.com`` and save it as
         ``<timestamp_ms>geetest_image.jpg``.

    Progress and failures of the image download are reported with ``print``.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: on network failure or timeout.
        ValueError: if a response is not the expected JSON / JSONP.
        KeyError: if an expected field is missing from a response.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh,zh-TW;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
    }

    # The context manager closes the session's pooled connections on exit.
    with requests.session() as session:
        register_url = (
            'https://www.geetest.com/api/user/show/register-space?t='
            + _timestamp_ms()
        )
        register_resp = session.get(
            url=register_url, headers=headers, timeout=timeout
        )
        registration = json.loads(register_resp.text)
        gt, challenge = registration['gt'], registration['challenge']

        # Every piece is an f-string so the callback timestamp is really
        # interpolated (a plain literal would send the braces verbatim).
        get_url = (
            f"https://api.geetest.com/get.php?gt={gt}&challenge={challenge}"
            f"&lang=zh-cn&pt=0&client_type=web&w="
            f"&callback=geetest_{_timestamp_ms()}"
        )
        session.get(url=get_url, headers=headers, timeout=timeout)

        ajax_url = (
            f"https://api.geetest.com/ajax.php?gt={gt}&challenge={challenge}"
            f"&lang=zh-cn&pt=0&client_type=web_mobile&w="
            f"&callback=geetest_{_timestamp_ms()}"
        )
        session.get(url=ajax_url, headers=headers, timeout=timeout)

        get_space_url = (
            f"https://api.geetest.com/get.php?is_next=true&type=click"
            f"&gt={gt}&challenge={challenge}"
            f"&lang=zh-cn&https=true&protocol=https%3A%2F%2F&offline=false"
            f"&product=popup&api_server=api.geetest.com&isPC=true"
            f"&autoReset=true&width=100%25"
            f"&callback=geetest_{_timestamp_ms()}"
        )
        jsonp_text = session.get(
            url=get_space_url, headers=headers, timeout=timeout
        ).text

    # Locate the JSONP parentheses instead of slicing at a fixed offset, so
    # parsing does not depend on the length of the callback name.
    data = _parse_jsonp(jsonp_text)['data']
    sign_value = data['sign']
    pic_value = data['pic']
    print("sign的值为:", sign_value)

    picurl = "https://static.geetest.com" + pic_value
    print(picurl)

    response = requests.get(picurl, timeout=timeout)
    if response.status_code == 200:
        # Build the name once so the reported name is the file written.
        filename = _timestamp_ms() + "geetest_image.jpg"
        with open(filename, "wb") as file:
            file.write(response.content)
        print("图片已保存为 " + filename)
    else:
        print("图片下载失败,HTTP状态码:", response.status_code)
if __name__ == '__main__':
    # Script entry point: run 99 download attempts, printing the attempt
    # number (1..99) before each one and sleeping 10 seconds after each.
    attempt = 0
    while attempt < 99:
        attempt += 1
        print(attempt)
        geetest_crack()
        time.sleep(10)