# 1. WebDriver (chromedriver) download page:
# https://registry.npmmirror.com/binary.html?path=chromedriver/
import time

from selenium.common.exceptions import WebDriverException
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
# Scrape the table of contents of the issues of one journal on jour.duxiu.com
# and append the entry texts to OUTPUT_PATH, one group of lines per issue.
#
# Page structure relied on (taken from the XPaths used below):
#   //*[@id="y<year>"]/a                link clicked to select a year
#   //*[@id="qihao_<year><index>"]/a    link clicked to open issue <index>
#   //*[@id="jourlist"]/ul/li[<i>]      i-th entry of the opened issue's list

JOURNAL_URL = "https://jour.duxiu.com/magDetail.jsp?magid=320910034129&d=3305845747873E9BCB4DD9EDA6053C20"
OUTPUT_PATH = 'content.txt'
YEARS = range(1985, 1990)        # an earlier run used range(1990, 2022)
ISSUE_INDEXES = range(0, 12)     # zero-based suffix of the "qihao_" element ids
ENTRY_POSITIONS = range(1, 16)   # only li[1] .. li[15] of each issue are read
YEAR_CLICK_DELAY = 2             # seconds to wait after selecting a year
ISSUE_CLICK_DELAY = 4            # seconds to wait before trying each issue link
ENTRY_DELAY = 2                  # seconds to wait before reading each entry


def _write_issue_titles(driver, label, out_file):
    """Print and append the text of each list entry of the open issue.

    Args:
        driver: Selenium driver whose current page shows the issue.
        label: Issue header ("<year>-<issue>:"), echoed to stdout before
            every entry so console output stays attributable.
        out_file: Writable text file that receives one line per entry.
    """
    for position in ENTRY_POSITIONS:
        time.sleep(ENTRY_DELAY)
        try:
            entries = driver.find_elements(
                By.XPATH, f'//*[@id="jourlist"]/ul/li[{position}]'
            )
        except WebDriverException:
            # Best effort: skip a position the browser failed to query.
            continue
        # find_elements returns an empty list when nothing matches, so
        # positions beyond the end of the list simply contribute nothing.
        for entry in entries:
            print(label)
            text = entry.text
            print(text)
            out_file.write(text + '\n')


driver = Chrome()
try:
    driver.maximize_window()
    driver.get(JOURNAL_URL)
    for year in YEARS:
        driver.find_element(By.XPATH, f'//*[@id="y{year}"]/a').click()
        time.sleep(YEAR_CLICK_DELAY)
        for issue_index in ISSUE_INDEXES:
            time.sleep(ISSUE_CLICK_DELAY)
            try:
                driver.find_element(
                    By.XPATH, f'//*[@id="qihao_{year}{issue_index}"]/a'
                ).click()
            except WebDriverException:
                # No such issue for this year (or it could not be clicked):
                # move on to the next index. Narrower than a bare except so
                # Ctrl-C still stops the run.
                continue
            # Issue ids are zero-based; the header shows a one-based number.
            label = f"{year}-{issue_index + 1}:"
            # Explicit UTF-8 because the scraped titles are Chinese text and
            # the platform default encoding may not be able to represent them.
            # The with-block closes (and flushes) the file after every issue.
            with open(OUTPUT_PATH, 'a', encoding='utf-8') as out_file:
                out_file.write(label + '\n')
                _write_issue_titles(driver, label, out_file)
finally:
    # Always shut the browser down, even if scraping aborts with an error.
    driver.quit()