抓取2021百科知识竞赛题库和答案
刚学习 Python,希望路过的高手给个 JSON 方法。
from selenium import webdriver
import time
# 引入
# Scrape the question bank under BASE_URL: load every numbered list page,
# click each link on it, read the answer paragraph from the page that the
# click brings up, and append the results to OUTPUT_FILE.

BASE_URL = "https://www.jhq8.cn/daan/tiku/3/"
OUTPUT_FILE = "抓取的题库3774答案.txt"
FIRST_PAGE = 1
LAST_PAGE = 126
LINKS_PER_PAGE = 30
# XPath of the j-th link on a list page; "{}" is filled with the 1-based index.
LINK_XPATH = "/html/body/div[1]/div/div/div[4]/div/div[1]/div/ul/li[{}]/a"
# XPath of the paragraph whose text is saved for each clicked link.
ANSWER_XPATH = "/html/body/div[1]/div/div/div[3]/div/div/div[1]/div[2]/p"


def _switch_to_latest_window(driver):
    """Focus the last handle in ``driver.window_handles`` and return it."""
    handle = driver.window_handles[-1]
    driver.switch_to.window(handle)
    return handle


def _append_lines(*lines):
    """Append each string to OUTPUT_FILE, one per line.

    The file is reopened for every call so that everything scraped so far is
    already on disk if a later page fails. UTF-8 is forced because the text is
    Chinese and the platform default encoding may not be able to encode it.
    """
    with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)


# Use the Firefox browser.
driver = webdriver.Firefox()
try:
    driver.get(BASE_URL)
    time.sleep(10)
    _switch_to_latest_window(driver)
    # NOTE(review): long fixed pause before scraping starts; presumably it
    # leaves time for the site to settle or for manual interaction - confirm.
    time.sleep(35)

    # To re-scrape only pages that were missed on an earlier run, iterate over
    # an explicit tuple such as (8, 27, 39, 43) instead of the full range.
    for i in range(FIRST_PAGE, LAST_PAGE + 1):
        url = BASE_URL + str(i) + ".html"
        try:
            _switch_to_latest_window(driver)
            time.sleep(3)
            driver.get(url)
            time.sleep(20)
            # Page URL goes first so the entries below it can be traced back.
            _append_lines(url)

            for j in range(1, LINKS_PER_PAGE + 1):
                list_window = _switch_to_latest_window(driver)
                time.sleep(2)
                tigan = LINK_XPATH.format(j)
                # find_element("xpath", ...) is used instead of
                # find_element_by_xpath, which newer Selenium releases removed.
                driver.find_element("xpath", tigan).click()
                # Presumably the click opens the question in a new window;
                # whatever handle is last becomes the active one.
                detail_window = _switch_to_latest_window(driver)
                try:
                    time.sleep(6)
                    g1 = driver.find_element("xpath", ANSWER_XPATH).text.strip()
                    time.sleep(2)
                    # The link's XPath is written as the marker line that
                    # precedes the answer text.
                    _append_lines(tigan, g1)
                    time.sleep(2)
                finally:
                    # Close the window the click opened even when reading the
                    # answer failed, so stray windows do not accumulate. Never
                    # close the list window itself.
                    if detail_window != list_window:
                        driver.close()
                time.sleep(2)
        except Exception as e:
            # Best effort: report which page failed and continue with the next.
            print(url, e)
finally:
    # Always shut the browser down, including on Ctrl+C.
    driver.quit()