# 授人以渔:上面的论文列表是通过爬取 dblp 获得的(对应 excel 文件),
# 现在把爬取脚本开源给大家,供参考复用。
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
# --- Scrape dblp search results for "open-vocabulary" papers (2022-2024) ---

# Tunable knobs, hoisted out of the logic so they are easy to adjust.
SEARCH_URL = "https://dblp.uni-trier.de/search?q=open-vocabulary"
SCROLL_PAUSE_TIME = 2        # seconds to wait after each scroll for lazy-loaded content
MAX_SCROLL_HEIGHT = 30000    # safety cap so an ever-growing page cannot loop forever
TARGET_YEARS = {"2022", "2023", "2024"}
OUTPUT_CSV = "open_vocabulary_papers_2022_2024.csv"


def fetch_page_source(url):
    """Open *url* in Chrome, scroll until the page stops growing, return the HTML.

    The driver is quit in a ``finally`` block so the browser process is not
    leaked even if navigation or scrolling raises.
    """
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # uncomment to run without a visible window
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll to the bottom to trigger dblp's lazy loading of more results.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = driver.execute_script("return document.body.scrollHeight")
            # Stop when the page no longer grows, or once past the safety cap.
            if new_height == last_height or new_height > MAX_SCROLL_HEIGHT:
                break
            last_height = new_height
        return driver.page_source
    finally:
        driver.quit()


def parse_papers(html):
    """Extract ``[year, title, authors, venue]`` rows whose year is in TARGET_YEARS."""
    soup = BeautifulSoup(html, "html.parser")
    papers = []
    for entry in soup.find_all('li', class_='entry'):
        title_tag = entry.find('span', class_='title')
        if title_tag is None:
            # Some <li class="entry"> items carry no title span; skip them
            # instead of crashing with AttributeError.
            continue
        title = title_tag.text.strip()
        authors = ', '.join(
            author.text.strip()
            for author in entry.find_all('span', itemprop='author')
        )
        year_tag = entry.find('span', itemprop='datePublished')
        year = year_tag.text.strip() if year_tag else "N/A"
        venue_tag = entry.find('span', class_='venue')
        venue = venue_tag.text.strip() if venue_tag else "N/A"
        if year in TARGET_YEARS:
            papers.append([year, title, authors, venue])
    return papers


def write_csv(papers, path):
    """Write a header row plus *papers* rows to *path* as UTF-8 CSV."""
    with open(path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Year", "Title", "Authors", "Venue"])
        writer.writerows(papers)


if __name__ == "__main__":
    page_html = fetch_page_source(SEARCH_URL)
    rows = parse_papers(page_html)
    write_csv(rows, OUTPUT_CSV)
    print(f"CSV文件已生成: {OUTPUT_CSV}")
# 祝大家学业有成,paper 多中!