文件下载的Selenium配置
由于打开PDF文件的网页时,Chrome可能会直接打开PDF预览页面而不是下载文件,所以需要用下面的配置让Chrome直接下载PDF:
from selenium import webdriver
# Folder that Chrome is told to save downloaded files into.
download_dir = r"C:\Users\xxx\Desktop"

# Chrome preferences that are handed to the browser through the 'prefs'
# experimental option below.
chrome_prefs = {
    # Change the default directory for downloads.
    "download.default_directory": download_dir,
    # Download automatically instead of prompting for a location.
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    # Do not show PDFs directly in Chrome.
    "plugins.always_open_pdf_externally": True,
}

options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', chrome_prefs)

# The first argument is the chromedriver location; it is optional, and when it
# is not specified the search path is used instead.
driver = webdriver.Chrome(
    r'C:\Users\HenryFox\Downloads\chromedriver.exe',
    options=options,
)
上述代码修改自 https://stackoverflow.com/a/54427220
文件名修改
需要去掉一些不能用于文件和路径名的字符,下面的代码摘录自 https://www.polarxiong.com/archives/Python-%E6%9B%BF%E6%8D%A2%E6%88%96%E5%8E%BB%E9%99%A4%E4%B8%8D%E8%83%BD%E7%94%A8%E4%BA%8E%E6%96%87%E4%BB%B6%E5%90%8D%E7%9A%84%E5%AD%97%E7%AC%A6.html
import re
def validateTitle(title):
    """Return *title* with characters that are invalid in file names replaced.

    Every slash, backslash, colon, asterisk, question mark, double quote,
    angle bracket and pipe is replaced by an underscore.  ASCII control
    characters (code points 0-31, e.g. a newline or tab picked up from
    scraped text) are replaced as well, because Windows rejects them in
    file and directory names too.

    Args:
        title: The text to turn into a file-name-safe string.

    Returns:
        A new string of the same length in which each offending character
        is an underscore; all other characters are unchanged.
    """
    # Reserved punctuation  / \ : * ? " < > |  plus control chars 0x00-0x1f.
    rstr = r"[\/\\\:\*\?\"\<\>\|\x00-\x1f]"
    new_title = re.sub(rstr, "_", title)
    return new_title
按钮点击出错
错误提示: ElementClickInterceptedException:element click intercepted
详见 https://www.cnblogs.com/xiaoguo-/p/12143912.html
处理方法:用webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
示例
例子:需要获取 https://dl.acm.org/doi/proceedings/10.1145/3448016 的所有PDF文件及相关信息
根据Xpath
获取内容
首先获取每个session标签的element(每个标签展开后包含该session的论文列表):
from selenium import webdriver
import time
# Start Chrome through the chromedriver executable at the given path.
chromedriver_path = r'C:\Users\xxx\Downloads\chromedriver.exe'
driver = webdriver.Chrome(chromedriver_path)

# Open the proceedings page whose papers are going to be collected.
proceedings_url = "https://dl.acm.org/doi/proceedings/10.1145/3448016"
driver.get(proceedings_url)

# Every element matched by this XPath is kept in `tabs`; the later steps
# treat each one as a (possibly collapsed) session section of the page.
tabs_xpath = '//*[@id="pb-page-content"]/div/main/div[4]/div/div[2]/div[1]/div/div[2]/div/div/div'
tabs = driver.find_elements_by_xpath(tabs_xpath)
打开所有折叠的session标签
# Expand every tab that is not open yet and wait for its content to arrive.
for tab in tabs:
    # Tabs whose class already contains 'js--open' need no action.
    if 'js--open' in tab.get_attribute('class'):
        continue

    # Clicking the tab's first <a> element expands it.
    tab.find_element_by_tag_name('a').click()
    print("GETTING", tab.find_element_by_tag_name('a').text)

    # Poll every 5 seconds until the tab reports data-ajaxloaded == 'true'.
    # There is no timeout: the loop only ends once the attribute flips.
    while True:
        if tab.get_attribute("data-ajaxloaded") == 'true':
            break
        time.sleep(5)

    # NOTE(review): the original indentation was lost; this message is assumed
    # to be printed once per expanded tab - confirm against the author's code.
    print("FINISHED!")
获取所有session的论文列表
from selenium.webdriver.common.action_chains import ActionChains
# One entry per paper, in this field order:
# [tab name, paper type, title, paper URL, authors, issue details,
#  abstract, PDF URL]
result = []
for tab in tabs:
    # The text of the tab's first link is used as the session name.
    TAB_NAME = tab.find_element_by_tag_name('a').text

    for paper in tab.find_elements_by_class_name('issue-item-container'):
        PAPER_TYPE = paper.find_element_by_class_name('issue-item__citation').text

        title_block = paper.find_element_by_class_name('issue-item__title')
        title_link = title_block.find_element_by_tag_name('a')
        PAPER_TITLE = title_link.text
        PAPER_URL = title_link.get_attribute('href')

        # Author information.  When a 'count-list' element is present it is
        # clicked first; a plain .click() on it does not work, so the click
        # is performed through ActionChains after moving to the element.
        expanders = paper.find_elements_by_class_name('count-list')
        if expanders:
            ActionChains(driver).move_to_element(expanders[0]).click().perform()
            time.sleep(0.5)

        # Each author is stored as [name, link]; the '(Less)' link is
        # skipped because it is not an author.
        AUTHORS = []
        for author in paper.find_element_by_tag_name('ul').find_elements_by_tag_name('a'):
            if author.text != '(Less)':
                AUTHORS.append([author.text, author.get_attribute('href')])

        # Month, page count, website.
        detail_block = paper.find_element_by_class_name('issue-item__detail')
        ISSUE_DETAIL = [span.text for span in detail_block.find_elements_by_tag_name('span')]

        # Abstract.  The abstract's own expand link is intentionally not
        # clicked (that step is disabled), so only a trailing "(Less)" /
        # "(More)" marker - six characters either way - is cut off.
        ABSTRACT = paper.find_element_by_class_name('issue-item__abstract').text
        if ABSTRACT.endswith(("(Less)", "(More)")):
            ABSTRACT = ABSTRACT[:-6]

        # href of the first link whose data-title is 'PDF'; empty string if
        # the paper has no such link.
        DOWNLOAD = next(
            (
                link.get_attribute('href')
                for link in paper.find_elements_by_tag_name('a')
                if link.get_attribute('data-title') == 'PDF'
            ),
            "",
        )

        result.append([
            TAB_NAME,
            PAPER_TYPE,
            PAPER_TITLE,
            PAPER_URL,
            AUTHORS,
            ISSUE_DETAIL,
            ABSTRACT,
            DOWNLOAD,
        ])
数据保存
用json和csv分别保存
import json
# Write the complete records (all fields of every paper) as one JSON document.
with open("sigmod2021.json", 'w') as json_file:
    json_file.write(json.dumps(result))

import pandas as pd

# The CSV only keeps four fields of each record; the mapping gives the
# position of each column's value inside a record.
column_positions = {
    'SESSION': 0,
    'TITLE': 2,
    'DOI': 3,
    'PDF_URL': 7,
}
dataframe = pd.DataFrame({
    column: [record[position] for record in result]
    for column, position in column_positions.items()
})
dataframe.to_csv("sigmod2021.csv", index=False)
下载文件&重命名
需要先按照"文件下载的Selenium配置"这一节完成配置。
在此需要完成三件事:下载文件、判断哪个是新下载的文件、将其重命名并放入对应的文件夹中。
# Download every paper's PDF into `download_dir`, then move it into a folder
# named after its session and rename it to "<NNN>-<title>.pdf", where NNN is
# the paper's running number inside that session.
import os
import shutil
from collections import defaultdict

# Suffixes of files the browser is still writing; such files must not be
# moved.  Chrome uses '.crdownload' (and briefly '.tmp') for in-progress
# downloads.
PARTIAL_SUFFIXES = ('.crdownload', '.tmp')
# After the fixed pause, how much longer (seconds) to wait for an unfinished
# download, and how often to re-check the folder.
EXTRA_WAIT_SECONDS = 120
POLL_SECONDS = 5

# Running paper counter per session; missing sessions start at 0.
session_collection = defaultdict(int)

for r in result:
    session_name, paper_title, pdf_url = r[0], r[2], r[7]

    # Count every paper, including ones without a PDF, so the number in the
    # file name keeps reflecting the paper's position inside its session.
    session_collection[session_name] += 1

    # The scraper stores "" when a paper has no PDF link; nothing to fetch.
    if not pdf_url:
        print("SKIP", paper_title)
        continue

    session_dir = os.path.join(download_dir, validateTitle(session_name))
    os.makedirs(session_dir, exist_ok=True)
    title = ('%03d-' % session_collection[session_name]) + validateTitle(paper_title) + '.pdf'

    # Snapshot of the folder before the download, used to spot the new file.
    now_files = set(os.listdir(download_dir))
    driver.get(pdf_url)
    time.sleep(30)

    # Look for a new, fully written file.  If only a partial file exists,
    # keep polling until it completes or the extra waiting time runs out.
    deadline = time.time() + EXTRA_WAIT_SECONDS
    while True:
        added = [name for name in os.listdir(download_dir) if name not in now_files]
        finished = [name for name in added if not name.endswith(PARTIAL_SUFFIXES)]
        if finished or not added or time.time() >= deadline:
            break
        time.sleep(POLL_SECONDS)

    if finished:
        shutil.move(os.path.join(download_dir, finished[0]), os.path.join(session_dir, title))
        print("OK", finished[0])
    else:
        # Nothing new appeared, or the download never completed in time.
        print("FAILED", paper_title)