# 授人以渔:上面的论文列表是通过爬取 dblp 获得的(对应 excel 文件),
# 现在把爬取脚本开源给大家,供参考复用。
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
# --- Scrape dblp search results for "open-vocabulary" papers (2022-2024) ---

# Tunable knobs, hoisted out of the logic so they are easy to adjust.
SEARCH_URL = "https://dblp.uni-trier.de/search?q=open-vocabulary"
SCROLL_PAUSE_TIME = 2        # seconds to wait after each scroll for lazy-loaded content
MAX_SCROLL_HEIGHT = 30000    # safety cap so an ever-growing page cannot loop forever
TARGET_YEARS = {"2022", "2023", "2024"}
OUTPUT_CSV = "open_vocabulary_papers_2022_2024.csv"


def fetch_page_source(url):
    """Open *url* in Chrome, scroll until the page stops growing, return the HTML.

    The driver is quit in a ``finally`` block so the browser process is not
    leaked even if navigation or scrolling raises.
    """
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # uncomment to run without a visible window
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll to the bottom to trigger dblp's lazy loading of more results.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = driver.execute_script("return document.body.scrollHeight")
            # Stop when the page no longer grows, or once past the safety cap.
            if new_height == last_height or new_height > MAX_SCROLL_HEIGHT:
                break
            last_height = new_height
        return driver.page_source
    finally:
        driver.quit()


def parse_papers(html):
    """Extract ``[year, title, authors, venue]`` rows whose year is in TARGET_YEARS."""
    soup = BeautifulSoup(html, "html.parser")
    papers = []
    for entry in soup.find_all('li', class_='entry'):
        title_tag = entry.find('span', class_='title')
        if title_tag is None:
            # Some <li class="entry"> items carry no title span; skip them
            # instead of crashing with AttributeError.
            continue
        title = title_tag.text.strip()
        authors = ', '.join(
            author.text.strip()
            for author in entry.find_all('span', itemprop='author')
        )
        year_tag = entry.find('span', itemprop='datePublished')
        year = year_tag.text.strip() if year_tag else "N/A"
        venue_tag = entry.find('span', class_='venue')
        venue = venue_tag.text.strip() if venue_tag else "N/A"
        if year in TARGET_YEARS:
            papers.append([year, title, authors, venue])
    return papers


def write_csv(papers, path):
    """Write a header row plus *papers* rows to *path* as UTF-8 CSV."""
    with open(path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Year", "Title", "Authors", "Venue"])
        writer.writerows(papers)


if __name__ == "__main__":
    page_html = fetch_page_source(SEARCH_URL)
    rows = parse_papers(page_html)
    write_csv(rows, OUTPUT_CSV)
    print(f"CSV文件已生成: {OUTPUT_CSV}")
# 祝大家学业有成,paper 多中!